Skip to content

vllm.model_executor.models.config

Gemma4Config

Bases: VerifyAndUpdateConfig

Source code in vllm/model_executor/models/config.py
class Gemma4Config(VerifyAndUpdateConfig):
    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        """Pick one attention kernel path for Gemma4's mixed head sizes.

        Gemma4 sliding-window layers use ``head_dim`` while full-attention
        layers use ``global_head_dim``. The default FA3 on Hopper cannot
        handle head_dim > 256, which causes mixed backend selection and
        numerical divergence.

        Behavior:
            * If either size is missing from the HF text config, or both
              sizes are equal, nothing is changed.
            * If FA4 is supported and the larger size is at most 512,
              ``flash_attn_version`` is set to 4 unless it is already set.
            * Otherwise the backend is set to TRITON_ATTN unless a backend
              is already set.

        Args:
            vllm_config: vLLM Config; its ``attention_config`` may be
                updated in place.
        """
        text_cfg = vllm_config.model_config.hf_text_config
        sliding_dim = getattr(text_cfg, "head_dim", None)
        full_dim = getattr(text_cfg, "global_head_dim", None)

        # Unknown or homogeneous head sizes need no special handling.
        if sliding_dim is None or full_dim is None or sliding_dim == full_dim:
            return

        from vllm.v1.attention.backends.fa_utils import is_fa_version_supported

        largest_dim = max(sliding_dim, full_dim)
        fa4_usable = is_fa_version_supported(4) and largest_dim <= 512
        attn_cfg = vllm_config.attention_config

        if fa4_usable:
            # Leave an already-set FA version untouched.
            if attn_cfg.flash_attn_version is None:
                attn_cfg.flash_attn_version = 4
                logger.info(
                    "Gemma4 model has heterogeneous head dimensions "
                    "(head_dim=%d, global_head_dim=%d). Using FA4 for "
                    "all layers to avoid mixed FA3/FA4 penalty.",
                    sliding_dim,
                    full_dim,
                )
            return

        # Leave an already-set backend untouched.
        if attn_cfg.backend is not None:
            return

        from vllm.v1.attention.backends.registry import AttentionBackendEnum

        attn_cfg.backend = AttentionBackendEnum.TRITON_ATTN
        logger.info(
            "Gemma4 model has heterogeneous head dimensions "
            "(head_dim=%d, global_head_dim=%d). FA4 not available, "
            "forcing TRITON_ATTN backend.",
            sliding_dim,
            full_dim,
        )

verify_and_update_config staticmethod

verify_and_update_config(vllm_config: VllmConfig) -> None

Configure attention for heterogeneous head dimensions.

Gemma4 uses different head dimensions for sliding window (head_dim) vs full attention (global_head_dim) layers. The default FA3 on Hopper cannot handle head_dim > 256, which causes mixed backend selection and numerical divergence.

When FA4 is available we force it for ALL layers, giving a uniform kernel path and avoiding the mixed FA3+FA4 penalty. When FA4 is not available we fall back to Triton.

Source code in vllm/model_executor/models/config.py
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
    """Pick one attention kernel path for Gemma4's mixed head sizes.

    Gemma4 sliding-window layers use ``head_dim`` while full-attention
    layers use ``global_head_dim``. The default FA3 on Hopper cannot
    handle head_dim > 256, which causes mixed backend selection and
    numerical divergence.

    Behavior:
        * If either size is missing from the HF text config, or both
          sizes are equal, nothing is changed.
        * If FA4 is supported and the larger size is at most 512,
          ``flash_attn_version`` is set to 4 unless it is already set.
        * Otherwise the backend is set to TRITON_ATTN unless a backend
          is already set.

    Args:
        vllm_config: vLLM Config; its ``attention_config`` may be
            updated in place.
    """
    text_cfg = vllm_config.model_config.hf_text_config
    sliding_dim = getattr(text_cfg, "head_dim", None)
    full_dim = getattr(text_cfg, "global_head_dim", None)

    # Unknown or homogeneous head sizes need no special handling.
    if sliding_dim is None or full_dim is None or sliding_dim == full_dim:
        return

    from vllm.v1.attention.backends.fa_utils import is_fa_version_supported

    largest_dim = max(sliding_dim, full_dim)
    fa4_usable = is_fa_version_supported(4) and largest_dim <= 512
    attn_cfg = vllm_config.attention_config

    if fa4_usable:
        # Leave an already-set FA version untouched.
        if attn_cfg.flash_attn_version is None:
            attn_cfg.flash_attn_version = 4
            logger.info(
                "Gemma4 model has heterogeneous head dimensions "
                "(head_dim=%d, global_head_dim=%d). Using FA4 for "
                "all layers to avoid mixed FA3/FA4 penalty.",
                sliding_dim,
                full_dim,
            )
        return

    # Leave an already-set backend untouched.
    if attn_cfg.backend is not None:
        return

    from vllm.v1.attention.backends.registry import AttentionBackendEnum

    attn_cfg.backend = AttentionBackendEnum.TRITON_ATTN
    logger.info(
        "Gemma4 model has heterogeneous head dimensions "
        "(head_dim=%d, global_head_dim=%d). FA4 not available, "
        "forcing TRITON_ATTN backend.",
        sliding_dim,
        full_dim,
    )

HybridAttentionMambaModelConfig

Bases: VerifyAndUpdateConfig

Source code in vllm/model_executor/models/config.py
class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """
        Early validation and setup for hybrid attention/mamba models.

        Turns off ``calculate_kv_scales`` (with a warning) when it is
        enabled, then applies the MambaModelConfig checks.

        Block size alignment with mamba page sizes is not done here; it is
        handled later by Platform.update_block_size_for_backend(), which
        runs after model layers are constructed and the attention backend
        is known.

        Args:
            vllm_config: vLLM Config
        """
        kv_cache_cfg = vllm_config.cache_config

        # KV-scale calibration is switched off for hybrid models because
        # uninitialized recurrent state corrupts the scales during the
        # calibration pass.
        # See issue: https://github.com/vllm-project/vllm/issues/37554
        calibration_requested = kv_cache_cfg.calculate_kv_scales
        if calibration_requested:
            logger.warning(
                "Disabling calculate_kv_scales for hybrid model '%s'. "
                "Hybrid models with recurrent layers (GDN, Mamba, SSM) "
                "produce unreliable KV cache scales during the "
                "calibration pass because recurrent state is "
                "uninitialized. Using default scale of 1.0 instead.",
                vllm_config.model_config.model,
            )
            kv_cache_cfg.calculate_kv_scales = False

        # Hand over to MambaModelConfig for the mamba cache mode and
        # mamba block size defaults.
        MambaModelConfig.verify_and_update_config(vllm_config)

verify_and_update_config classmethod

verify_and_update_config(vllm_config: VllmConfig) -> None

Perform early validation and setup for hybrid attention/mamba models.

Block size alignment with mamba page sizes is handled later by Platform.update_block_size_for_backend(), which runs after model layers are constructed and the attention backend is known.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| vllm_config | VllmConfig | vLLM Config | required |

Source code in vllm/model_executor/models/config.py
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
    """
    Early validation and setup for hybrid attention/mamba models.

    Turns off ``calculate_kv_scales`` (with a warning) when it is
    enabled, then applies the MambaModelConfig checks.

    Block size alignment with mamba page sizes is not done here; it is
    handled later by Platform.update_block_size_for_backend(), which
    runs after model layers are constructed and the attention backend
    is known.

    Args:
        vllm_config: vLLM Config
    """
    kv_cache_cfg = vllm_config.cache_config

    # KV-scale calibration is switched off for hybrid models because
    # uninitialized recurrent state corrupts the scales during the
    # calibration pass.
    # See issue: https://github.com/vllm-project/vllm/issues/37554
    calibration_requested = kv_cache_cfg.calculate_kv_scales
    if calibration_requested:
        logger.warning(
            "Disabling calculate_kv_scales for hybrid model '%s'. "
            "Hybrid models with recurrent layers (GDN, Mamba, SSM) "
            "produce unreliable KV cache scales during the "
            "calibration pass because recurrent state is "
            "uninitialized. Using default scale of 1.0 instead.",
            vllm_config.model_config.model,
        )
        kv_cache_cfg.calculate_kv_scales = False

    # Hand over to MambaModelConfig for the mamba cache mode and
    # mamba block size defaults.
    MambaModelConfig.verify_and_update_config(vllm_config)

LlamaNemotronVLConfig

Bases: VerifyAndUpdateConfig

Config handler for LlamaNemotronVL embedding models.

Source code in vllm/model_executor/models/config.py
class LlamaNemotronVLConfig(VerifyAndUpdateConfig):
    """Config handler for LlamaNemotronVL embedding models."""

    @staticmethod
    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
        """Prepare the HF config and pooler config for embedding use.

        Marks the HF config (and its ``llm_config``, if present) as
        non-causal, copies ``vision_config.patch_size`` to the top-level
        config when a vision config exists, and translates the configured
        pooling name into a sequence pooling type.

        Args:
            model_config: model config whose ``hf_config`` and
                ``pooler_config`` are updated in place.

        Raises:
            ValueError: if the resolved pooling name is not one of
                "avg", "cls" or "last".
        """
        from vllm.config.pooler import SequencePoolingType

        hf_cfg = model_config.hf_config

        # Bidirectional attention: clear the causal flag at the top level
        # and on the nested language-model config when there is one.
        hf_cfg.is_causal = False
        if hasattr(hf_cfg, "llm_config"):
            hf_cfg.llm_config.is_causal = False

        # Expose the vision patch size on the top-level config.
        if hasattr(hf_cfg, "vision_config"):
            hf_cfg.patch_size = hf_cfg.vision_config.patch_size

        # HF pooling name -> sequence pooling type.
        name_to_pooling_type: dict[str, SequencePoolingType] = {
            "avg": "MEAN",
            "cls": "CLS",
            "last": "LAST",
        }

        # The top-level setting wins; otherwise consult llm_config, where a
        # missing attribute means "avg".
        pooling = getattr(hf_cfg, "pooling", None)
        if pooling is None and hasattr(hf_cfg, "llm_config"):
            pooling = getattr(hf_cfg.llm_config, "pooling", "avg")

        resolved_type = name_to_pooling_type.get(pooling)
        if resolved_type is None:
            raise ValueError(f"pool_type {pooling!r} not supported")

        model_config.pooler_config.seq_pooling_type = resolved_type

MambaModelConfig

Bases: VerifyAndUpdateConfig

Source code in vllm/model_executor/models/config.py
class MambaModelConfig(VerifyAndUpdateConfig):
    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """
        Reconcile the mamba cache settings with the prefix-caching setting.

        With prefix caching enabled:
            * a mamba cache mode of "none" becomes "all" when the model
              supports mamba prefix caching and "align" otherwise;
            * "all" is downgraded to "align" for models without that
              support;
            * "align" requires chunked prefill to be enabled;
            * an unset mamba block size defaults to the cache block size.

        With prefix caching disabled, the mamba cache mode is forced to
        "none" and an unset mamba block size defaults to max_model_len.

        Args:
            vllm_config: vLLM Config

        Raises:
            AssertionError: if mode "align" is in effect while chunked
                prefill is disabled.
        """
        model_cfg = vllm_config.model_config
        cache_cfg = vllm_config.cache_config

        if not cache_cfg.enable_prefix_caching:
            # No prefix caching: the mamba cache mode must be "none".
            if cache_cfg.mamba_cache_mode != "none":
                cache_cfg.mamba_cache_mode = "none"
                logger.warning(
                    "Mamba cache mode is set to 'none' when prefix caching is disabled"
                )
            # Default the mamba block size to the full model length.
            if cache_cfg.mamba_block_size is None:
                cache_cfg.mamba_block_size = model_cfg.max_model_len
            return

        # Prefix caching is on: choose a default mode if none was given.
        if cache_cfg.mamba_cache_mode == "none":
            if model_cfg.supports_mamba_prefix_caching:
                cache_cfg.mamba_cache_mode = "all"
            else:
                cache_cfg.mamba_cache_mode = "align"
            logger.warning(
                "Mamba cache mode is set to '%s' for %s by default "
                "when prefix caching is enabled",
                cache_cfg.mamba_cache_mode,
                model_cfg.architecture,
            )

        # "all" is only valid for models that support it.
        if cache_cfg.mamba_cache_mode == "all":
            if not model_cfg.supports_mamba_prefix_caching:
                cache_cfg.mamba_cache_mode = "align"
                logger.warning(
                    "Hybrid or mamba-based model detected without support "
                    "for prefix caching with Mamba cache 'all' mode: "
                    "falling back to 'align' mode."
                )

        if cache_cfg.mamba_cache_mode == "align":
            assert vllm_config.scheduler_config.enable_chunked_prefill, (
                "Chunked prefill is required for mamba cache mode 'align'."
            )

        logger.info(
            "Warning: Prefix caching in Mamba cache '%s' "
            "mode is currently enabled. "
            "Its support for Mamba layers is experimental. "
            "Please report any issues you may observe.",
            cache_cfg.mamba_cache_mode,
        )

        # With prefix caching the mamba block size follows the cache block
        # size, which is the basic granularity for prefix caching.
        if cache_cfg.mamba_block_size is None:
            cache_cfg.mamba_block_size = cache_cfg.block_size

verify_and_update_config classmethod

verify_and_update_config(vllm_config: VllmConfig) -> None

Enable FULL_AND_PIECEWISE cuda graph mode by default (required to get good performance for mamba layers in V1).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| vllm_config | VllmConfig | vLLM Config | required |

Source code in vllm/model_executor/models/config.py
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
    """
    Reconcile the mamba cache settings with the prefix-caching setting.

    With prefix caching enabled:
        * a mamba cache mode of "none" becomes "all" when the model
          supports mamba prefix caching and "align" otherwise;
        * "all" is downgraded to "align" for models without that
          support;
        * "align" requires chunked prefill to be enabled;
        * an unset mamba block size defaults to the cache block size.

    With prefix caching disabled, the mamba cache mode is forced to
    "none" and an unset mamba block size defaults to max_model_len.

    Args:
        vllm_config: vLLM Config

    Raises:
        AssertionError: if mode "align" is in effect while chunked
            prefill is disabled.
    """
    model_cfg = vllm_config.model_config
    cache_cfg = vllm_config.cache_config

    if not cache_cfg.enable_prefix_caching:
        # No prefix caching: the mamba cache mode must be "none".
        if cache_cfg.mamba_cache_mode != "none":
            cache_cfg.mamba_cache_mode = "none"
            logger.warning(
                "Mamba cache mode is set to 'none' when prefix caching is disabled"
            )
        # Default the mamba block size to the full model length.
        if cache_cfg.mamba_block_size is None:
            cache_cfg.mamba_block_size = model_cfg.max_model_len
        return

    # Prefix caching is on: choose a default mode if none was given.
    if cache_cfg.mamba_cache_mode == "none":
        if model_cfg.supports_mamba_prefix_caching:
            cache_cfg.mamba_cache_mode = "all"
        else:
            cache_cfg.mamba_cache_mode = "align"
        logger.warning(
            "Mamba cache mode is set to '%s' for %s by default "
            "when prefix caching is enabled",
            cache_cfg.mamba_cache_mode,
            model_cfg.architecture,
        )

    # "all" is only valid for models that support it.
    if cache_cfg.mamba_cache_mode == "all":
        if not model_cfg.supports_mamba_prefix_caching:
            cache_cfg.mamba_cache_mode = "align"
            logger.warning(
                "Hybrid or mamba-based model detected without support "
                "for prefix caching with Mamba cache 'all' mode: "
                "falling back to 'align' mode."
            )

    if cache_cfg.mamba_cache_mode == "align":
        assert vllm_config.scheduler_config.enable_chunked_prefill, (
            "Chunked prefill is required for mamba cache mode 'align'."
        )

    logger.info(
        "Warning: Prefix caching in Mamba cache '%s' "
        "mode is currently enabled. "
        "Its support for Mamba layers is experimental. "
        "Please report any issues you may observe.",
        cache_cfg.mamba_cache_mode,
    )

    # With prefix caching the mamba block size follows the cache block
    # size, which is the basic granularity for prefix caching.
    if cache_cfg.mamba_block_size is None:
        cache_cfg.mamba_block_size = cache_cfg.block_size

NemotronHForCausalLMConfig

Bases: VerifyAndUpdateConfig

Source code in vllm/model_executor/models/config.py
class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
    DEFAULT_MAMBA_SSM_CACHE_DTYPE = "float32"
    """Only `float32` is known to have no accuracy issues by default."""

    @classmethod
    def update_mamba_ssm_cache_dtype(
        cls, *, cache_config: "CacheConfig", hf_config: "PretrainedConfig"
    ) -> None:
        """Resolve an 'auto' mamba_ssm_cache_dtype for NemotronH models.

        When ``cache_config.mamba_ssm_cache_dtype`` is 'auto', it is
        replaced by the ``mamba_ssm_cache_dtype`` attribute of the HF
        config, or by `float32` if the HF config has no such attribute.
        Any other value is left untouched.

        Args:
            cache_config: cache config updated in place.
            hf_config: HF config consulted for ``mamba_ssm_cache_dtype``.
        """
        is_auto = cache_config.mamba_ssm_cache_dtype == "auto"
        if not is_auto:
            return

        fallback_dtype = cls.DEFAULT_MAMBA_SSM_CACHE_DTYPE
        resolved_dtype = getattr(hf_config, "mamba_ssm_cache_dtype", fallback_dtype)
        logger.info(
            "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
            resolved_dtype,
        )
        cache_config.mamba_ssm_cache_dtype = resolved_dtype

    @classmethod
    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """Resolve the mamba SSM cache dtype from ``vllm_config``.

        Args:
            vllm_config: vLLM Config; its ``cache_config`` may be updated
                in place.
        """
        cache_cfg = vllm_config.cache_config
        hf_cfg = vllm_config.model_config.hf_config
        cls.update_mamba_ssm_cache_dtype(cache_config=cache_cfg, hf_config=hf_cfg)

DEFAULT_MAMBA_SSM_CACHE_DTYPE class-attribute instance-attribute

DEFAULT_MAMBA_SSM_CACHE_DTYPE = 'float32'

Only float32 is known to have no accuracy issues by default.

update_mamba_ssm_cache_dtype classmethod

update_mamba_ssm_cache_dtype(
    *,
    cache_config: CacheConfig,
    hf_config: PretrainedConfig,
) -> None

Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto' (or not explicitly set), to the value specified in the HF config, or to float32 if not specified.

Source code in vllm/model_executor/models/config.py
@classmethod
def update_mamba_ssm_cache_dtype(
    cls, *, cache_config: "CacheConfig", hf_config: "PretrainedConfig"
) -> None:
    """Resolve an 'auto' mamba_ssm_cache_dtype for NemotronH models.

    When ``cache_config.mamba_ssm_cache_dtype`` is 'auto', it is
    replaced by the ``mamba_ssm_cache_dtype`` attribute of the HF
    config, or by ``cls.DEFAULT_MAMBA_SSM_CACHE_DTYPE`` if the HF config
    has no such attribute. Any other value is left untouched.

    Args:
        cache_config: cache config updated in place.
        hf_config: HF config consulted for ``mamba_ssm_cache_dtype``.
    """
    is_auto = cache_config.mamba_ssm_cache_dtype == "auto"
    if not is_auto:
        return

    fallback_dtype = cls.DEFAULT_MAMBA_SSM_CACHE_DTYPE
    resolved_dtype = getattr(hf_config, "mamba_ssm_cache_dtype", fallback_dtype)
    logger.info(
        "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
        resolved_dtype,
    )
    cache_config.mamba_ssm_cache_dtype = resolved_dtype

Qwen3_5ForConditionalGenerationConfig

Bases: VerifyAndUpdateConfig

Source code in vllm/model_executor/models/config.py
class Qwen3_5ForConditionalGenerationConfig(VerifyAndUpdateConfig):
    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        """Align mamba_ssm_cache_dtype with the Qwen3.5 HF config.

        If the HF text config defines ``mamba_ssm_dtype``:
            * a cache dtype of 'auto' is replaced by that value;
            * any other cache dtype that differs from it is kept, and a
              warning is logged.

        If the HF text config does not define it, nothing is changed.

        Args:
            vllm_config: vLLM Config; its ``cache_config`` may be updated
                in place.
        """
        cache_cfg = vllm_config.cache_config
        text_cfg = vllm_config.model_config.hf_text_config
        hf_dtype = getattr(text_cfg, "mamba_ssm_dtype", None)
        requested_dtype = cache_cfg.mamba_ssm_cache_dtype

        # Nothing to reconcile when the HF config is silent on the dtype.
        if hf_dtype is None:
            return

        if requested_dtype == "auto":
            cache_cfg.mamba_ssm_cache_dtype = hf_dtype
            return

        if requested_dtype != hf_dtype:
            logger.warning(
                "Qwen3.5 model specifies mamba_ssm_dtype='%s' in its config, "
                "but --mamba-ssm-cache-dtype='%s' was passed. "
                "Using the user-specified value.",
                hf_dtype,
                requested_dtype,
            )

verify_and_update_config staticmethod

verify_and_update_config(vllm_config: VllmConfig) -> None

Update mamba_ssm_cache_dtype for Qwen3.5 models when set to 'auto' (or not explicitly set), to the value specified in the HF config's mamba_ssm_dtype field. Warn if the user explicitly overrides it to a different value.

Source code in vllm/model_executor/models/config.py
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
    """Align mamba_ssm_cache_dtype with the Qwen3.5 HF config.

    If the HF text config defines ``mamba_ssm_dtype``:
        * a cache dtype of 'auto' is replaced by that value;
        * any other cache dtype that differs from it is kept, and a
          warning is logged.

    If the HF text config does not define it, nothing is changed.

    Args:
        vllm_config: vLLM Config; its ``cache_config`` may be updated
            in place.
    """
    cache_cfg = vllm_config.cache_config
    text_cfg = vllm_config.model_config.hf_text_config
    hf_dtype = getattr(text_cfg, "mamba_ssm_dtype", None)
    requested_dtype = cache_cfg.mamba_ssm_cache_dtype

    # Nothing to reconcile when the HF config is silent on the dtype.
    if hf_dtype is None:
        return

    if requested_dtype == "auto":
        cache_cfg.mamba_ssm_cache_dtype = hf_dtype
        return

    if requested_dtype != hf_dtype:
        logger.warning(
            "Qwen3.5 model specifies mamba_ssm_dtype='%s' in its config, "
            "but --mamba-ssm-cache-dtype='%s' was passed. "
            "Using the user-specified value.",
            hf_dtype,
            requested_dtype,
        )