vllm.v1.attention.backends.mla.indexer

logger module-attribute

logger = init_logger(__name__)

DeepSeekV32IndexerDecodeMetadata dataclass

Source code in vllm/v1/attention/backends/mla/indexer.py
@dataclass
class DeepSeekV32IndexerDecodeMetadata:
    block_table: torch.Tensor
    seq_lens: torch.Tensor
    decode_lens: torch.Tensor
    requires_padding: bool
    schedule_metadata: torch.Tensor

block_table instance-attribute

block_table: Tensor

decode_lens instance-attribute

decode_lens: Tensor

requires_padding instance-attribute

requires_padding: bool

schedule_metadata instance-attribute

schedule_metadata: Tensor

seq_lens instance-attribute

seq_lens: Tensor

__init__

__init__(
    block_table: Tensor,
    seq_lens: Tensor,
    decode_lens: Tensor,
    requires_padding: bool,
    schedule_metadata: Tensor,
) -> None
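
A minimal construction sketch, not taken from vLLM itself: the sizes below are hypothetical, and the shapes mirror how the metadata builder fills these fields (one block-table row and one sequence length per decode request, plus a (num_sms + 1, 2) scheduling buffer used by DeepGEMM's paged MQA logits kernel).

import torch

from vllm.v1.attention.backends.mla.indexer import DeepSeekV32IndexerDecodeMetadata

# Hypothetical sizes, for illustration only; requires a CUDA device.
num_decodes, max_blocks_per_req, num_sms = 2, 8, 132
device = torch.device("cuda")

decode_meta = DeepSeekV32IndexerDecodeMetadata(
    block_table=torch.zeros(
        (num_decodes, max_blocks_per_req), dtype=torch.int32, device=device
    ),
    seq_lens=torch.tensor([17, 42], dtype=torch.int32, device=device),
    decode_lens=torch.ones(num_decodes, dtype=torch.int32, device=device),
    requires_padding=False,  # every decode request has the same query length
    schedule_metadata=torch.empty(
        (num_sms + 1, 2), dtype=torch.int32, device=device
    ),
)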

DeepseekV32IndexerBackend

Bases: AttentionBackend

Source code in vllm/v1/attention/backends/mla/indexer.py
class DeepseekV32IndexerBackend(AttentionBackend):
    @staticmethod
    def get_metadata_cls() -> type["AttentionMetadata"]:
        return DeepseekV32IndexerMetadata

    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        return [32, 64, 128]

    @staticmethod
    def get_builder_cls() -> type["DeepseekV32IndexerMetadataBuilder"]:
        return DeepseekV32IndexerMetadataBuilder

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
        cache_dtype_str: str = "auto",
    ) -> tuple[int, ...]:
        assert num_kv_heads == 1
        return (num_blocks, block_size, head_size)

    @staticmethod
    def get_kv_cache_stride_order() -> tuple[int, ...]:
        return (0, 1, 2)

get_builder_cls staticmethod

get_builder_cls() -> type[
    DeepseekV32IndexerMetadataBuilder
]
Source code in vllm/v1/attention/backends/mla/indexer.py
@staticmethod
def get_builder_cls() -> type["DeepseekV32IndexerMetadataBuilder"]:
    return DeepseekV32IndexerMetadataBuilder

get_kv_cache_shape staticmethod

get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
    cache_dtype_str: str = "auto",
) -> tuple[int, ...]
Source code in vllm/v1/attention/backends/mla/indexer.py
@staticmethod
def get_kv_cache_shape(
    num_blocks: int,
    block_size: int,
    num_kv_heads: int,
    head_size: int,
    cache_dtype_str: str = "auto",
) -> tuple[int, ...]:
    assert num_kv_heads == 1
    return (num_blocks, block_size, head_size)
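
For example (illustrative numbers): the indexer asserts a single KV head, so the head dimension is folded away and the cache shape is simply (num_blocks, block_size, head_size).

from vllm.v1.attention.backends.mla.indexer import DeepseekV32IndexerBackend

shape = DeepseekV32IndexerBackend.get_kv_cache_shape(
    num_blocks=1024, block_size=64, num_kv_heads=1, head_size=128
)
assert shape == (1024, 64, 128)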

get_kv_cache_stride_order staticmethod

get_kv_cache_stride_order() -> tuple[int, ...]
Source code in vllm/v1/attention/backends/mla/indexer.py
@staticmethod
def get_kv_cache_stride_order() -> tuple[int, ...]:
    return (0, 1, 2)

get_metadata_cls staticmethod

get_metadata_cls() -> type[AttentionMetadata]
Source code in vllm/v1/attention/backends/mla/indexer.py
@staticmethod
def get_metadata_cls() -> type["AttentionMetadata"]:
    return DeepseekV32IndexerMetadata

get_supported_head_sizes classmethod

get_supported_head_sizes() -> list[int]
Source code in vllm/v1/attention/backends/mla/indexer.py
@classmethod
def get_supported_head_sizes(cls) -> list[int]:
    return [32, 64, 128]

DeepseekV32IndexerMetadata dataclass

Source code in vllm/v1/attention/backends/mla/indexer.py
@dataclass
class DeepseekV32IndexerMetadata:
    # FIXME (zyongye)
    # hacky way to access the data now, need to be in chunked meta
    seq_lens: torch.Tensor

    num_reqs: int
    max_query_len: int
    max_seq_len: int

    num_actual_tokens: int  # Number of tokens excluding padding.
    query_start_loc: torch.Tensor
    slot_mapping: torch.Tensor
    # The dimension of the attention heads
    head_dim: int

    # New for MLA (compared to FlashAttention)
    # For handling prefill decode split
    num_decodes: int
    num_decode_tokens: int
    num_prefills: int
    num_prefill_tokens: int

    decode: Optional[DeepSeekV32IndexerDecodeMetadata] = None
    prefill: Optional[DeepseekV32IndexerPrefillMetadata] = None

decode class-attribute instance-attribute

decode: Optional[DeepSeekV32IndexerDecodeMetadata] = None

head_dim instance-attribute

head_dim: int

max_query_len instance-attribute

max_query_len: int

max_seq_len instance-attribute

max_seq_len: int

num_actual_tokens instance-attribute

num_actual_tokens: int

num_decode_tokens instance-attribute

num_decode_tokens: int

num_decodes instance-attribute

num_decodes: int

num_prefill_tokens instance-attribute

num_prefill_tokens: int

num_prefills instance-attribute

num_prefills: int

num_reqs instance-attribute

num_reqs: int

prefill class-attribute instance-attribute

prefill: Optional[DeepseekV32IndexerPrefillMetadata] = None

query_start_loc instance-attribute

query_start_loc: Tensor

seq_lens instance-attribute

seq_lens: Tensor

slot_mapping instance-attribute

slot_mapping: Tensor

__init__

__init__(
    seq_lens: Tensor,
    num_reqs: int,
    max_query_len: int,
    max_seq_len: int,
    num_actual_tokens: int,
    query_start_loc: Tensor,
    slot_mapping: Tensor,
    head_dim: int,
    num_decodes: int,
    num_decode_tokens: int,
    num_prefills: int,
    num_prefill_tokens: int,
    decode: Optional[
        DeepSeekV32IndexerDecodeMetadata
    ] = None,
    prefill: Optional[
        DeepseekV32IndexerPrefillMetadata
    ] = None,
) -> None

DeepseekV32IndexerMetadataBuilder

Bases: AttentionMetadataBuilder

Source code in vllm/v1/attention/backends/mla/indexer.py
class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder):
    cudagraph_support: ClassVar[AttentionCGSupport] = (
        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
    )

    reorder_batch_threshold: int = 1

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        scheduler_config = self.vllm_config.scheduler_config
        # NOTE(Chen): an estimated max size of flattened_kv. Needs to be double-checked.
        self.max_prefill_buffer_size = get_max_prefill_buffer_size(self.vllm_config)
        self.num_speculative_tokens = (
            self.vllm_config.speculative_config.num_speculative_tokens
            if self.vllm_config.speculative_config
            else 0
        )
        # Currently DeepGEMM's fp8_paged_mqa_logits does not support next_n > 2
        self.reorder_batch_threshold += min(self.num_speculative_tokens, 1)

        props = torch.cuda.get_device_properties(self.device)
        sm_count = props.multi_processor_count
        self.num_sms = sm_count

        self.decode_lens_buffer = torch.empty(
            (scheduler_config.max_num_seqs,), dtype=torch.int32, device=self.device
        )

        # See: DeepGEMM/csrc/apis/attention.hpp
        self.scheduler_metadata_buffer = torch.empty(
            (self.num_sms + 1, 2), dtype=torch.int32, device=self.device
        )

    def build_one_prefill_chunk(
        self, reqs_start, reqs_end, query_start_loc_cpu, seq_lens_cpu, block_table
    ):
        prefill_query_start_loc = (
            query_start_loc_cpu[reqs_start : reqs_end + 1]
            - query_start_loc_cpu[reqs_start]
        )
        cu_seqlen_ks, cu_seqlen_ke = kv_spans_from_batches(
            prefill_query_start_loc, seq_lens_cpu[reqs_start:reqs_end], self.device
        )
        token_start = query_start_loc_cpu[reqs_start].item()
        token_end = query_start_loc_cpu[reqs_end].item()
        total_seq_lens = seq_lens_cpu[reqs_start:reqs_end].sum()
        assert total_seq_lens <= self.max_prefill_buffer_size
        cu_seq_lens = (
            torch.cat(
                [
                    torch.zeros(1, dtype=torch.int32),
                    seq_lens_cpu[reqs_start:reqs_end].cumsum(dim=0),
                ]
            )
            .to(torch.int32)
            .to(self.device)
        )
        return DeepseekV32IndexerPrefillChunkMetadata(
            cu_seqlen_ks=cu_seqlen_ks,
            cu_seqlen_ke=cu_seqlen_ke,
            cu_seq_lens=cu_seq_lens,
            total_seq_lens=total_seq_lens,
            block_table=block_table[reqs_start:reqs_end],
            token_start=token_start,
            token_end=token_end,
            num_reqs=reqs_end - reqs_start,
        )

    def build(
        self,
        common_prefix_len: int,
        common_attn_metadata: CommonAttentionMetadata,
        fast_build: bool = False,
    ) -> DeepseekV32IndexerMetadata:
        num_reqs = common_attn_metadata.num_reqs
        num_tokens = common_attn_metadata.num_actual_tokens

        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
            split_decodes_and_prefills(
                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
            )
        )

        assert num_decodes + num_prefills == num_reqs
        assert num_decode_tokens + num_prefill_tokens == num_tokens

        prefill_metadata = None
        if num_prefills > 0:
            chunk_seq_ids = split_prefill_chunks(
                common_attn_metadata.seq_lens_cpu,
                self.max_prefill_buffer_size,
                num_decodes,
            )
            chunks = [
                self.build_one_prefill_chunk(
                    reqs_start,
                    reqs_end,
                    query_start_loc_cpu,
                    common_attn_metadata.seq_lens_cpu,
                    common_attn_metadata.block_table_tensor,
                )
                for reqs_start, reqs_end in chunk_seq_ids
            ]
            prefill_metadata = DeepseekV32IndexerPrefillMetadata(
                chunks=chunks,
            )

        decode_metadata = None
        if num_decodes > 0:
            torch.diff(
                common_attn_metadata.query_start_loc[: num_decodes + 1],
                out=self.decode_lens_buffer[:num_decodes],
            )
            decode_lens = self.decode_lens_buffer[:num_decodes]
            decode_lens_cpu = torch.diff(
                common_attn_metadata.query_start_loc_cpu[: num_decodes + 1]
            )

            # Use the CPU values to avoid a GPU sync that would break async scheduling
            requires_padding = (decode_lens_cpu.max() > decode_lens_cpu.min()).item()

            seq_lens = common_attn_metadata.seq_lens[:num_decodes]

            self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata(
                seq_lens, self.kv_cache_spec.block_size, self.num_sms
            )
            decode_metadata = DeepSeekV32IndexerDecodeMetadata(
                block_table=common_attn_metadata.block_table_tensor[:num_decodes, ...],
                seq_lens=common_attn_metadata.seq_lens[:num_decodes],
                decode_lens=decode_lens,
                requires_padding=requires_padding,
                schedule_metadata=self.scheduler_metadata_buffer,
            )

        attn_metadata = DeepseekV32IndexerMetadata(
            seq_lens=common_attn_metadata.seq_lens,
            num_reqs=common_attn_metadata.num_reqs,
            max_query_len=common_attn_metadata.max_query_len,
            max_seq_len=common_attn_metadata.max_seq_len,
            num_actual_tokens=common_attn_metadata.num_actual_tokens,
            query_start_loc=common_attn_metadata.query_start_loc,
            slot_mapping=common_attn_metadata.slot_mapping,
            head_dim=128,
            num_decodes=num_decodes,
            num_decode_tokens=num_decode_tokens,
            num_prefills=num_prefills,
            num_prefill_tokens=num_prefill_tokens,
            prefill=prefill_metadata,
            decode=decode_metadata,
        )

        # if get_tensor_model_parallel_rank() == 0:
        #     logger.info(f"attn_metadata: {attn_metadata}")
        return attn_metadata

cudagraph_support class-attribute

cudagraph_support: ClassVar[AttentionCGSupport] = UNIFORM_SINGLE_TOKEN_DECODE

decode_lens_buffer instance-attribute

decode_lens_buffer = empty(
    (max_num_seqs,), dtype=int32, device=device
)

max_prefill_buffer_size instance-attribute

max_prefill_buffer_size = get_max_prefill_buffer_size(
    vllm_config
)

num_sms instance-attribute

num_sms = sm_count

num_speculative_tokens instance-attribute

num_speculative_tokens = (
    num_speculative_tokens if speculative_config else 0
)

reorder_batch_threshold class-attribute instance-attribute

reorder_batch_threshold: int = 1

scheduler_metadata_buffer instance-attribute

scheduler_metadata_buffer = empty(
    (num_sms + 1, 2), dtype=int32, device=device
)

__init__

__init__(*args, **kwargs)
Source code in vllm/v1/attention/backends/mla/indexer.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    scheduler_config = self.vllm_config.scheduler_config
    # NOTE(Chen): an estimated max size of flattened_kv. Needs to be double-checked.
    self.max_prefill_buffer_size = get_max_prefill_buffer_size(self.vllm_config)
    self.num_speculative_tokens = (
        self.vllm_config.speculative_config.num_speculative_tokens
        if self.vllm_config.speculative_config
        else 0
    )
    # Currently DeepGEMM's fp8_paged_mqa_logits does not support next_n > 2
    self.reorder_batch_threshold += min(self.num_speculative_tokens, 1)

    props = torch.cuda.get_device_properties(self.device)
    sm_count = props.multi_processor_count
    self.num_sms = sm_count

    self.decode_lens_buffer = torch.empty(
        (scheduler_config.max_num_seqs,), dtype=torch.int32, device=self.device
    )

    # See: DeepGEMM/csrc/apis/attention.hpp
    self.scheduler_metadata_buffer = torch.empty(
        (self.num_sms + 1, 2), dtype=torch.int32, device=self.device
    )
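
As a rough numeric sketch of the setup above (the device properties and speculative config are hypothetical): with 132 SMs the DeepGEMM scheduling buffer is allocated as (133, 2), and enabling any speculative decoding bumps the reorder threshold from 1 to 2, matching the next_n <= 2 limit noted in the comment.

# Illustrative only: assumed 132 SMs and 3 configured speculative tokens.
num_sms = 132
num_speculative_tokens = 3
scheduler_metadata_shape = (num_sms + 1, 2)                   # (133, 2)
reorder_batch_threshold = 1 + min(num_speculative_tokens, 1)  # 2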

build

build(
    common_prefix_len: int,
    common_attn_metadata: CommonAttentionMetadata,
    fast_build: bool = False,
) -> DeepseekV32IndexerMetadata
Source code in vllm/v1/attention/backends/mla/indexer.py
def build(
    self,
    common_prefix_len: int,
    common_attn_metadata: CommonAttentionMetadata,
    fast_build: bool = False,
) -> DeepseekV32IndexerMetadata:
    num_reqs = common_attn_metadata.num_reqs
    num_tokens = common_attn_metadata.num_actual_tokens

    query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
    num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
        split_decodes_and_prefills(
            common_attn_metadata, decode_threshold=self.reorder_batch_threshold
        )
    )

    assert num_decodes + num_prefills == num_reqs
    assert num_decode_tokens + num_prefill_tokens == num_tokens

    prefill_metadata = None
    if num_prefills > 0:
        chunk_seq_ids = split_prefill_chunks(
            common_attn_metadata.seq_lens_cpu,
            self.max_prefill_buffer_size,
            num_decodes,
        )
        chunks = [
            self.build_one_prefill_chunk(
                reqs_start,
                reqs_end,
                query_start_loc_cpu,
                common_attn_metadata.seq_lens_cpu,
                common_attn_metadata.block_table_tensor,
            )
            for reqs_start, reqs_end in chunk_seq_ids
        ]
        prefill_metadata = DeepseekV32IndexerPrefillMetadata(
            chunks=chunks,
        )

    decode_metadata = None
    if num_decodes > 0:
        torch.diff(
            common_attn_metadata.query_start_loc[: num_decodes + 1],
            out=self.decode_lens_buffer[:num_decodes],
        )
        decode_lens = self.decode_lens_buffer[:num_decodes]
        decode_lens_cpu = torch.diff(
            common_attn_metadata.query_start_loc_cpu[: num_decodes + 1]
        )

        # Use the CPU values to avoid a GPU sync that would break async scheduling
        requires_padding = (decode_lens_cpu.max() > decode_lens_cpu.min()).item()

        seq_lens = common_attn_metadata.seq_lens[:num_decodes]

        self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata(
            seq_lens, self.kv_cache_spec.block_size, self.num_sms
        )
        decode_metadata = DeepSeekV32IndexerDecodeMetadata(
            block_table=common_attn_metadata.block_table_tensor[:num_decodes, ...],
            seq_lens=common_attn_metadata.seq_lens[:num_decodes],
            decode_lens=decode_lens,
            requires_padding=requires_padding,
            schedule_metadata=self.scheduler_metadata_buffer,
        )

    attn_metadata = DeepseekV32IndexerMetadata(
        seq_lens=common_attn_metadata.seq_lens,
        num_reqs=common_attn_metadata.num_reqs,
        max_query_len=common_attn_metadata.max_query_len,
        max_seq_len=common_attn_metadata.max_seq_len,
        num_actual_tokens=common_attn_metadata.num_actual_tokens,
        query_start_loc=common_attn_metadata.query_start_loc,
        slot_mapping=common_attn_metadata.slot_mapping,
        head_dim=128,
        num_decodes=num_decodes,
        num_decode_tokens=num_decode_tokens,
        num_prefills=num_prefills,
        num_prefill_tokens=num_prefill_tokens,
        prefill=prefill_metadata,
        decode=decode_metadata,
    )

    # if get_tensor_model_parallel_rank() == 0:
    #     logger.info(f"attn_metadata: {attn_metadata}")
    return attn_metadata

build_one_prefill_chunk

build_one_prefill_chunk(
    reqs_start,
    reqs_end,
    query_start_loc_cpu,
    seq_lens_cpu,
    block_table,
)
Source code in vllm/v1/attention/backends/mla/indexer.py
def build_one_prefill_chunk(
    self, reqs_start, reqs_end, query_start_loc_cpu, seq_lens_cpu, block_table
):
    prefill_query_start_loc = (
        query_start_loc_cpu[reqs_start : reqs_end + 1]
        - query_start_loc_cpu[reqs_start]
    )
    cu_seqlen_ks, cu_seqlen_ke = kv_spans_from_batches(
        prefill_query_start_loc, seq_lens_cpu[reqs_start:reqs_end], self.device
    )
    token_start = query_start_loc_cpu[reqs_start].item()
    token_end = query_start_loc_cpu[reqs_end].item()
    total_seq_lens = seq_lens_cpu[reqs_start:reqs_end].sum()
    assert total_seq_lens <= self.max_prefill_buffer_size
    cu_seq_lens = (
        torch.cat(
            [
                torch.zeros(1, dtype=torch.int32),
                seq_lens_cpu[reqs_start:reqs_end].cumsum(dim=0),
            ]
        )
        .to(torch.int32)
        .to(self.device)
    )
    return DeepseekV32IndexerPrefillChunkMetadata(
        cu_seqlen_ks=cu_seqlen_ks,
        cu_seqlen_ke=cu_seqlen_ke,
        cu_seq_lens=cu_seq_lens,
        total_seq_lens=total_seq_lens,
        block_table=block_table[reqs_start:reqs_end],
        token_start=token_start,
        token_end=token_end,
        num_reqs=reqs_end - reqs_start,
    )
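
A small worked example of the bookkeeping above, using hypothetical values rather than a real builder call: two prefill requests whose full sequences are scheduled in this step.

import torch

query_start_loc_cpu = torch.tensor([0, 5, 14])  # per-request query offsets
seq_lens_cpu = torch.tensor([5, 9])             # full KV length per request

# For reqs_start=0 and reqs_end=2 the chunk spans tokens [0, 14), so
# token_start=0, token_end=14, total_seq_lens=14, num_reqs=2, and
# cu_seq_lens = [0, 5, 14] (moved to the device as int32).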

DeepseekV32IndexerPrefillChunkMetadata dataclass

Source code in vllm/v1/attention/backends/mla/indexer.py
@dataclass
class DeepseekV32IndexerPrefillChunkMetadata:
    block_table: torch.Tensor
    cu_seqlen_ks: torch.Tensor
    cu_seqlen_ke: torch.Tensor
    cu_seq_lens: torch.Tensor
    total_seq_lens: int
    token_start: int
    token_end: int
    num_reqs: int

block_table instance-attribute

block_table: Tensor

cu_seq_lens instance-attribute

cu_seq_lens: Tensor

cu_seqlen_ke instance-attribute

cu_seqlen_ke: Tensor

cu_seqlen_ks instance-attribute

cu_seqlen_ks: Tensor

num_reqs instance-attribute

num_reqs: int

token_end instance-attribute

token_end: int

token_start instance-attribute

token_start: int

total_seq_lens instance-attribute

total_seq_lens: int

__init__

__init__(
    block_table: Tensor,
    cu_seqlen_ks: Tensor,
    cu_seqlen_ke: Tensor,
    cu_seq_lens: Tensor,
    total_seq_lens: int,
    token_start: int,
    token_end: int,
    num_reqs: int,
) -> None

DeepseekV32IndexerPrefillMetadata dataclass

Source code in vllm/v1/attention/backends/mla/indexer.py
@dataclass
class DeepseekV32IndexerPrefillMetadata:
    chunks: list[DeepseekV32IndexerPrefillChunkMetadata]

chunks instance-attribute

chunks: list[DeepseekV32IndexerPrefillChunkMetadata]

__init__

__init__(
    chunks: list[DeepseekV32IndexerPrefillChunkMetadata],
) -> None

get_max_prefill_buffer_size

get_max_prefill_buffer_size(vllm_config: VllmConfig)
Source code in vllm/v1/attention/backends/mla/indexer.py
def get_max_prefill_buffer_size(vllm_config: VllmConfig):
    max_model_len = vllm_config.model_config.max_model_len
    # NOTE(Chen): 2 is a magic number for controlling the prefill buffer size.
    # May be tuned later.
    return max_model_len * 2
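
For instance, with a hypothetical max_model_len of 4096 the flattened-KV prefill buffer is capped at 8192 tokens:

# max_model_len = 4096  ->  max_prefill_buffer_size = 4096 * 2 = 8192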

kv_spans_from_batches

kv_spans_from_batches(
    start_seq_loc: Tensor,
    seq_len_per_batch: Tensor,
    device: device,
) -> tuple[Tensor, Tensor]

Parameters:

start_seq_loc (Tensor, required)
    1D long tensor [B+1], cumulative counts of selected tokens per batch.
    Example: [0, 2, 4, 7] -> batch sizes (selected) [2, 2, 3], N=7 tokens total.

seq_len_per_batch (Tensor, required)
    1D long tensor [B], full sequence length (KV length) of each batch.
    Example: [5, 9, 4].

Returns:

start_tensor (Tensor)
    1D long tensor [N], start offset in the concatenated KV cache for each
    token's batch.

end_location (Tensor)
    1D long tensor [N], exclusive end = start + token's local position
    (so the attended KV slice is kv[start:end]).

Assumes each batch contributes its full seq_len_per_batch[i] keys to the KV cache, and the selected tokens within a batch are the last counts[i] positions of that sequence.

Source code in vllm/v1/attention/backends/mla/indexer.py
def kv_spans_from_batches(
    start_seq_loc: torch.Tensor, seq_len_per_batch: torch.Tensor, device: torch.device
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Args:
      start_seq_loc: 1D long tensor [B+1], cumulative counts of
                     selected tokens per batch.
            Example: [0, 2, 4, 7] ->
                     batch sizes (selected) [2, 2, 3], N=7 tokens total.
      seq_len_per_batch: 1D long tensor [B],
                         full sequence length (KV length) of each batch.
                         Example: [5, 9, 4].

    Returns:
      start_tensor: 1D long tensor [N], start offset in the
                    concatenated KV cache for each token's batch.
      end_location: 1D long tensor [N],
                    **exclusive** end = start + token's local position.
                    (So the attended KV slice is kv[start:end].)

    Assumes each batch contributes its full `seq_len_per_batch[i]`
    keys to the KV cache, and the selected tokens within a batch
    are the **last** `counts[i]` positions of that sequence.
    """
    q = start_seq_loc.to(dtype=torch.long)
    L = seq_len_per_batch.to(dtype=torch.long)
    assert q.dim() == 1 and L.dim() == 1
    assert q.numel() == L.numel() + 1, "start_seq_loc must have length B+1"

    # Selected tokens per batch and totals
    counts = q[1:] - q[:-1]  # [B]
    N = int(q[-1].item())  # total selected tokens
    B = L.numel()

    if N == 0:
        return (
            torch.empty(0, dtype=torch.long, device=device),
            torch.empty(0, dtype=torch.long, device=device),
        )

    # KV start offsets per batch in the concatenated KV cache
    kv_starts_per_batch = torch.cumsum(L, dim=0) - L  # [B]

    # For each selected token, which batch does it belong to?
    batch_id = torch.repeat_interleave(torch.arange(B), counts)  # [N]

    # Map batch KV start to each token
    start_tensor = kv_starts_per_batch[batch_id]  # [N]

    # End-align local positions inside each batch:
    # local_pos = L[b] - counts[b] + (1..counts[b])  for each batch b
    L_expand = torch.repeat_interleave(L, counts)  # [N]
    m_expand = torch.repeat_interleave(counts, counts)  # [N]
    # position within the selected block: 1..counts[b]
    pos_within = (
        torch.arange(N, dtype=torch.long) - torch.repeat_interleave(q[:-1], counts) + 1
    )

    local_pos = L_expand - m_expand + pos_within  # [N], 1-based
    end_location = start_tensor + local_pos  # exclusive end

    return start_tensor.int().to(device), end_location.int().to(device)
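
A worked example using the inputs from the docstring, run on CPU purely for illustration:

import torch

from vllm.v1.attention.backends.mla.indexer import kv_spans_from_batches

starts, ends = kv_spans_from_batches(
    start_seq_loc=torch.tensor([0, 2, 4, 7]),
    seq_len_per_batch=torch.tensor([5, 9, 4]),
    device=torch.device("cpu"),
)
# The batches start at KV offsets [0, 5, 14] and the selected tokens are the
# last 2, 2 and 3 positions of each sequence, so:
# starts = [0, 0,  5,  5, 14, 14, 14]
# ends   = [4, 5, 13, 14, 16, 17, 18]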

split_prefill_chunks

split_prefill_chunks(
    seq_lens_cpu: Tensor,
    max_prefill_buffer_size: int,
    reqs_start: int,
) -> list[tuple[int, int]]

Split the prefill requests into chunks, given as (reqs_start, reqs_end) tuples, such that the total sequence length of each chunk does not exceed the maximum prefill buffer size.

Parameters:

seq_lens_cpu (Tensor, required)
    The sequence lengths of the prefill requests.

max_prefill_buffer_size (int, required)
    The maximum prefill buffer size.

reqs_start (int, required)
    The start index of the prefill requests.

Returns:

list[tuple[int, int]]
    A list of tuples of (reqs_start, reqs_end).

Source code in vllm/v1/attention/backends/mla/indexer.py
def split_prefill_chunks(
    seq_lens_cpu: torch.Tensor, max_prefill_buffer_size: int, reqs_start: int
) -> list[tuple[int, int]]:
    """
    Split the prefill requests into chunks, given as (reqs_start, reqs_end)
    tuples, such that the total sequence length of each chunk does not
    exceed the maximum prefill buffer size.

    Args:
        seq_lens_cpu: The sequence lengths of the prefill requests.
        max_prefill_buffer_size: The maximum prefill buffer size.
        reqs_start: The start index of the prefill requests.

    Returns:
        A list of tuples of (reqs_start, reqs_end).
    """
    chunk_seq_ids = []
    total_seq_lens = 0
    for i in range(reqs_start, len(seq_lens_cpu)):
        cur_seq_len = seq_lens_cpu[i].item()
        assert cur_seq_len <= max_prefill_buffer_size
        total_seq_lens += cur_seq_len
        if total_seq_lens > max_prefill_buffer_size:
            chunk_seq_ids.append((reqs_start, i))
            reqs_start = i
            total_seq_lens = cur_seq_len
    if total_seq_lens > 0:
        chunk_seq_ids.append((reqs_start, len(seq_lens_cpu)))
    return chunk_seq_ids
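
A worked example with hypothetical lengths: given a buffer budget of 8 tokens, requests with sequence lengths [4, 3, 5, 2] starting at index 0 are split where the running total would overflow.

import torch

from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks

chunks = split_prefill_chunks(
    seq_lens_cpu=torch.tensor([4, 3, 5, 2]),
    max_prefill_buffer_size=8,
    reqs_start=0,
)
# 4 + 3 fits, adding 5 would exceed 8, so the first chunk closes at request 2;
# 5 + 2 then fits, so the second chunk covers the rest.
# chunks == [(0, 2), (2, 4)]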