vllm.v1.attention.kv_dequant

KV dequantization dispatch scaffold for attention backends.

Modules:

| Name | Description |
| --- | --- |
| `flashinfer_tile` | FlashInfer KV dequant tile helpers. |
| `hadamard` | Hadamard helpers for INT4 KV cache quantization. |
| `triton_tile` | Triton KV dequantization helpers used by unified attention kernels. |

assert_backend_supports_kv_quant_mode

```python
assert_backend_supports_kv_quant_mode(
    backend_name: str, quant_mode: KVQuantMode
) -> None
```

Raise when a backend has not declared support for the kv quant mode.

Source code in `vllm/v1/attention/kv_dequant/__init__.py`

```python
def assert_backend_supports_kv_quant_mode(
    backend_name: str,
    quant_mode: KVQuantMode,
) -> None:
    """Raise when a backend has not declared support for the kv quant mode."""
    if quant_mode in (KVQuantMode.NONE, KVQuantMode.FP8_PER_TENSOR):
        return

    module = _BACKEND_TILE_MODULE.get(backend_name)
    if module is None or quant_mode not in module.SUPPORTED_MODES:
        raise RuntimeError(
            f"kv-cache quantization mode '{quant_mode.name.lower()}' is not yet "
            f"supported by '{backend_name}'."
        )
```