class DeepseekCompressor(nn.Module):
    """DeepSeek V4 KV/score compressor.

    Owns the linear / norm / state-cache / ape state and the shared forward
    prologue (kv/score split, save_partial_states launch). The
    compress → norm → RoPE → store step is dispatched to a triton kernel
    (``compress_norm_rope_store_triton``) by default, except for the NVIDIA
    (CUDA) head_dim=512 main compressor path, which uses the cutedsl kernel
    (``compress_norm_rope_store_cutedsl``) for better performance.

    Args:
        vllm_config: Source of the HF model config (``qk_rope_head_dim``,
            ``rms_norm_eps``), scheduler/model limits and the
            ``static_forward_context`` used for the KV cache lookup.
        compress_ratio: Compression ratio. A ratio of 4 enables the
            overlapped layout (``coff == 2``); any other value gives
            ``coff == 1``.
        hidden_size: Input width of the fused kv/gate projection.
        head_dim: Head dimension. Only 512 and 128 are accepted.
        rotate: Stored on the instance; not read anywhere in this class.
        prefix: Name prefix for the owned sub-layers
            (``.fused_wkv_wgate``, ``.state_cache``).
        k_cache_prefix: Key used in ``forward`` to look up both the
            attention metadata and the KV cache layer this compressor
            writes into.
        use_fp4_cache: Select the MXFP4 cache layout. Only valid together
            with ``head_dim == 128``.

    Raises:
        AssertionError: If ``use_fp4_cache`` is set with ``head_dim == 512``.
        ValueError: If ``head_dim`` is neither 512 nor 128.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        compress_ratio: int,
        hidden_size: int,
        head_dim: int,
        rotate: bool = False,
        prefix: str = "",
        k_cache_prefix: str = "",
        use_fp4_cache: bool = False,
    ):
        super().__init__()
        self.compress_ratio = compress_ratio
        self.hidden_size = hidden_size
        self.head_dim = head_dim
        self.rotate = rotate
        self.prefix = prefix
        self.k_cache_prefix = k_cache_prefix
        self.use_fp4_cache = use_fp4_cache

        config = vllm_config.model_config.hf_config
        self.rope_head_dim = config.qk_rope_head_dim
        self.nope_head_dim = self.head_dim - self.rope_head_dim
        self.rms_norm_eps = config.rms_norm_eps

        self.device = current_platform.device_type
        self.max_num_reqs = vllm_config.scheduler_config.max_num_seqs
        self.max_model_len = vllm_config.model_config.max_model_len

        # Overlap is only enabled for compress_ratio == 4. ``coff`` is the
        # resulting width multiplier: 2 with overlap, 1 without.
        self.overlap = compress_ratio == 4
        self.coff = 1 + self.overlap

        state_dtype = torch.float32
        self.ape = nn.Parameter(
            torch.empty(
                (compress_ratio, self.coff * self.head_dim),
                dtype=state_dtype,
                device=self.device,
            ),
            requires_grad=False,
        )
        # One fused projection producing the kv half and the gate/score half,
        # each ``coff * head_dim`` wide.
        self.fused_wkv_wgate = MergedColumnParallelLinear(
            self.hidden_size,
            [self.coff * self.head_dim, self.coff * self.head_dim],
            bias=False,
            return_bias=False,
            quant_config=None,
            disable_tp=True,
            prefix=f"{prefix}.fused_wkv_wgate",
        )
        self.norm = RMSNorm(self.head_dim, self.rms_norm_eps)
        self.state_cache = CompressorStateCache(
            state_dim=2 * self.coff * self.head_dim,  # kv_state + score_state
            dtype=state_dtype,
            compress_ratio=compress_ratio,
            prefix=f"{prefix}.state_cache",
        )

        # Save reference to static_forward_context for forward-time KV cache
        # lookup. get_current_vllm_config() is only available during
        # __init__, not forward.
        self._static_forward_context = (
            vllm_config.compilation_config.static_forward_context
        )

        # Layout constants handed to the fused quant + cache-store kernel.
        if self.head_dim == 512:
            assert not use_fp4_cache, (
                "MXFP4 cache is only supported for indexer (head=128)"
            )
            self._quant_block = 64
            self._token_stride = self.nope_head_dim + self.rope_head_dim * 2
            self._scale_dim = self.nope_head_dim // 64 + 1  # 7 real + 1 pad
        elif self.head_dim == 128:
            if use_fp4_cache:
                self._quant_block = MXFP4_BLOCK_SIZE
                self._token_stride = self.head_dim // 2
                self._scale_dim = self.head_dim // MXFP4_BLOCK_SIZE
            else:
                self._quant_block = 128
                self._token_stride = self.head_dim
                self._scale_dim = 4  # single float32 scale
        else:
            raise ValueError(
                f"Unsupported head_dim for fused quant+cache: {self.head_dim}"
            )

    def forward(
        self,
        # [num_tokens, 2 * self.coff * self.head_dim]
        kv_score: torch.Tensor,
        # [num_tokens]
        positions: torch.Tensor,
        rotary_emb,
    ) -> None:
        """Save partial kv/score state, then compress and store into the KV cache.

        Returns ``None``; all results are written into the state cache and
        the KV cache registered under ``k_cache_prefix``. When the forward
        context carries no per-layer metadata dict (dummy profiling run),
        the call returns without launching any kernel.

        Args:
            kv_score: Concatenated kv and score projections; the first
                ``coff * head_dim`` columns are kv, the rest are score.
            positions: Per-token positions.
            rotary_emb: Object exposing ``cos_sin_cache`` for the RoPE step.
        """
        # Get the metadata and handle dummy profiling run. Checked first so
        # the profiling run does no work at all.
        attn_metadata = get_forward_context().attn_metadata
        if not isinstance(attn_metadata, dict):
            return

        # Each of shape [num_tokens, coff * self.head_dim]
        # input bf16, output are fp32
        kv, score = kv_score.split(
            [self.coff * self.head_dim, self.coff * self.head_dim], dim=-1
        )

        state_metadata = cast(
            CompressorMetadata, attn_metadata[self.state_cache.prefix]
        )
        token_to_req_indices = state_metadata.token_to_req_indices
        slot_mapping = state_metadata.slot_mapping
        num_actual = slot_mapping.shape[0]
        block_table = state_metadata.block_table
        block_size = state_metadata.block_size

        # [num_blocks, block_size, kv_dim+score_dim], where kv_dim == score_dim
        state_cache = self.state_cache.kv_cache
        # kv_state stored in first half, score_state stored in second half
        state_width = state_cache.shape[-1] // 2

        # ROCm / XPU receive no launch_pdl kwarg at all; every other platform
        # gets it passed explicitly as False (see NOTE below).
        pdl_kwargs = (
            {}
            if current_platform.is_rocm() or current_platform.is_xpu()
            else {"launch_pdl": False}
        )

        # Store the KV and score (with fused APE addition) in the state.
        # NOTE: PDL is disabled — both this kernel and the compress kernels
        # below depend on preceding kernel outputs (kv/score from the cublas
        # GEMM; state_cache from this kernel) but neither emits/waits on PDL
        # grid dependency primitives, so launch_pdl=True caused a
        # read-after-write race and non-deterministic output.
        save_partial_states(
            kv=kv,
            score=score,
            ape=self.ape,
            positions=positions,
            state_cache=state_cache,
            slot_mapping=slot_mapping,
            block_size=block_size,
            state_width=state_width,
            compress_ratio=self.compress_ratio,
            pdl_kwargs=pdl_kwargs,
        )

        # Fused: compress → RMSNorm → RoPE → FP8 quant → KV cache write.
        # RoPE requirements (kernel applies forward GPT-J style rotation):
        # - is_neox_style=False (interleaved pairs, NOT split-half)
        # - cos_sin_cache layout: [max_pos, rope_head_dim] with first half cos,
        #   second half sin (per-pair, length rope_head_dim // 2 each)
        # - applied to LAST rope_head_dim elements of head_dim
        # - position used: (positions // compress_ratio) * compress_ratio
        cos_sin_cache = rotary_emb.cos_sin_cache
        k_cache_metadata = cast(Any, attn_metadata[self.k_cache_prefix])
        kv_cache = self._static_forward_context[self.k_cache_prefix].kv_cache

        # Kernel selection: triton is the default for every platform and for
        # the indexer path (head_dim == 128). Only the main compressor path
        # (head_dim == 512) on NVIDIA/CUDA switches to the cutedsl kernel for
        # better performance. The import stays local so the cutedsl module is
        # only loaded when that path is actually taken.
        compress_norm_rope_store_fn = compress_norm_rope_store_triton
        if current_platform.is_cuda() and self.head_dim == 512:
            from .nvidia.ops.sparse_attn_compress_cutedsl import (
                compress_norm_rope_store_cutedsl,
            )

            compress_norm_rope_store_fn = compress_norm_rope_store_cutedsl

        compress_norm_rope_store_fn(
            state_cache=state_cache,
            num_actual=num_actual,
            token_to_req_indices=token_to_req_indices,
            positions=positions,
            slot_mapping=slot_mapping,
            block_table=block_table,
            block_size=block_size,
            state_width=state_width,
            cos_sin_cache=cos_sin_cache,
            kv_cache=kv_cache,
            k_cache_metadata=k_cache_metadata,
            pdl_kwargs=pdl_kwargs,
            head_dim=self.head_dim,
            rope_head_dim=self.rope_head_dim,
            compress_ratio=self.compress_ratio,
            overlap=self.overlap,
            use_fp4_cache=self.use_fp4_cache,
            rms_norm_weight=self.norm.weight,
            rms_norm_eps=self.rms_norm_eps,
            quant_block=self._quant_block,
            token_stride=self._token_stride,
            scale_dim=self._scale_dim,
        )