Skip to content

vllm.entrypoints.pooling.utils

MetadataItem dataclass

Source code in vllm/entrypoints/pooling/utils.py
@dataclass
class MetadataItem:
    """Describe where one encoded tensor sits inside a shared byte payload.

    ``decode_pooling_output`` slices the payload with ``start``/``end`` and
    hands the slice, together with ``shape``, ``embed_dtype`` and
    ``endianness``, to ``binary2tensor``.
    """

    # Ordinal of this item; decode_pooling_output sorts items by this value.
    index: int
    # Element dtype identifier; used as a key into EMBED_DTYPES and forwarded
    # to the binary <-> tensor helpers.
    embed_dtype: EmbedDType
    # Byte-order identifier forwarded to the binary <-> tensor helpers.
    endianness: Endianness
    # Byte offset (inclusive) of this item's data within the payload.
    start: int
    # Byte offset (exclusive) marking the end of this item's data.
    end: int
    # Shape handed to binary2tensor when the bytes are decoded.
    shape: tuple[int, ...]

embed_dtype instance-attribute

embed_dtype: EmbedDType

end instance-attribute

end: int

endianness instance-attribute

endianness: Endianness

index instance-attribute

index: int

shape instance-attribute

shape: tuple[int, ...]

start instance-attribute

start: int

__init__

__init__(
    index: int,
    embed_dtype: EmbedDType,
    endianness: Endianness,
    start: int,
    end: int,
    shape: tuple[int, ...],
) -> None

build_metadata_items

build_metadata_items(
    embed_dtype: EmbedDType,
    endianness: Endianness,
    shape: tuple[int, ...],
    n_request: int,
) -> list[MetadataItem]
Source code in vllm/entrypoints/pooling/utils.py
def build_metadata_items(
    embed_dtype: EmbedDType,
    endianness: Endianness,
    shape: tuple[int, ...],
    n_request: int,
) -> list[MetadataItem]:
    """Build metadata for ``n_request`` equally sized, back-to-back tensors.

    Every item shares the same ``shape``, ``embed_dtype`` and ``endianness``;
    only ``index`` and the ``start``/``end`` byte offsets differ.

    Args:
        embed_dtype: Key into ``EMBED_DTYPES``; its ``nbytes`` gives the size
            of a single element.
        endianness: Byte order recorded on every item.
        shape: Shape recorded on every item; the product of its entries is
            the element count per item.
        n_request: Number of items to generate.

    Returns:
        A list of ``n_request`` ``MetadataItem`` objects with indices
        ``0 .. n_request - 1`` and contiguous, non-overlapping byte ranges.
    """
    # Bytes occupied by one item: element width times element count.
    stride = EMBED_DTYPES[embed_dtype].nbytes * math.prod(shape)

    layout: list[MetadataItem] = []
    for position in range(n_request):
        begin = position * stride
        layout.append(
            MetadataItem(
                index=position,
                embed_dtype=embed_dtype,
                endianness=endianness,
                start=begin,
                end=begin + stride,
                shape=shape,
            )
        )
    return layout

decode_pooling_output

decode_pooling_output(
    items: list[MetadataItem], body: bytes
) -> list[Tensor]
Source code in vllm/entrypoints/pooling/utils.py
def decode_pooling_output(items: list[MetadataItem], body: bytes) -> list[torch.Tensor]:
    """Rebuild tensors from a byte payload using per-item metadata.

    Args:
        items: Metadata entries giving each tensor's byte range, shape,
            dtype and endianness. They may be supplied in any order.
        body: The payload that the ``start``/``end`` offsets index into.

    Returns:
        One tensor per metadata entry, ordered by ascending ``index``.
    """
    # Process entries in index order regardless of how they were supplied.
    ordered = sorted(items, key=lambda entry: entry.index)

    tensors: list[torch.Tensor] = []
    for entry in ordered:
        chunk = body[entry.start : entry.end]
        tensors.append(
            binary2tensor(chunk, entry.shape, entry.embed_dtype, entry.endianness)
        )
    return tensors

encode_pooling_bytes

encode_pooling_bytes(
    pooling_outputs: list[PoolingRequestOutput],
    embed_dtype: EmbedDType,
    endianness: Endianness,
) -> tuple[
    list[bytes], list[dict[str, Any]], dict[str, Any]
]
Source code in vllm/entrypoints/pooling/utils.py
def encode_pooling_bytes(
    pooling_outputs: list[PoolingRequestOutput],
    embed_dtype: EmbedDType,
    endianness: Endianness,
) -> tuple[list[bytes], list[dict[str, Any]], dict[str, Any]]:
    """Serialize pooling outputs to byte chunks plus describing metadata.

    Args:
        pooling_outputs: Outputs whose ``outputs.data`` tensors are encoded.
        embed_dtype: Dtype identifier passed to ``tensor2binary`` and recorded
            on every metadata entry.
        endianness: Byte-order identifier passed to ``tensor2binary`` and
            recorded on every metadata entry.

    Returns:
        A 3-tuple ``(body, items, usage)``:

        * ``body``: one ``bytes`` chunk per output, in input order.
        * ``items``: one dict per output carrying the ``MetadataItem`` fields;
          ``start``/``end`` are offsets into the concatenation of ``body``.
        * ``usage``: dict with ``prompt_tokens`` and ``total_tokens``, both
          set to the summed length of ``prompt_token_ids``.
    """
    chunks: list[bytes] = []
    metadata: list[dict[str, Any]] = []
    token_total = 0
    cursor = 0  # running byte offset into the concatenated payload

    for position, result in enumerate(pooling_outputs):
        tensor = result.outputs.data
        payload = tensor2binary(
            tensor=tensor,
            embed_dtype=embed_dtype,
            endianness=endianness,
        )
        next_cursor = cursor + len(payload)

        chunks.append(payload)
        # Dictionary form of MetadataItem
        metadata.append(
            {
                "index": position,
                "embed_dtype": embed_dtype,
                "endianness": endianness,
                "start": cursor,
                "end": next_cursor,
                "shape": tensor.shape,
            }
        )

        token_total += len(result.prompt_token_ids)
        cursor = next_cursor

    # Dictionary form of UsageInfo
    usage = {
        "prompt_tokens": token_total,
        "total_tokens": token_total,
    }

    return chunks, metadata, usage

encode_pooling_output_base64

encode_pooling_output_base64(
    output: PoolingRequestOutput,
    embed_dtype: EmbedDType,
    endianness: Endianness,
) -> str
Source code in vllm/entrypoints/pooling/utils.py
def encode_pooling_output_base64(
    output: PoolingRequestOutput,
    embed_dtype: EmbedDType,
    endianness: Endianness,
) -> str:
    """Encode a pooling output's tensor as a base64 text string.

    Args:
        output: Pooling result whose ``outputs.data`` tensor is encoded.
        embed_dtype: Dtype identifier forwarded to ``tensor2binary``.
        endianness: Byte-order identifier forwarded to ``tensor2binary``.

    Returns:
        The base64 encoding of the tensor's binary form, as a ``str``.
    """
    raw = tensor2binary(output.outputs.data, embed_dtype, endianness)
    encoded = pybase64.b64encode(raw)
    return encoded.decode("utf-8")

encode_pooling_output_binary

encode_pooling_output_binary(
    output: PoolingRequestOutput,
    embed_dtype: EmbedDType,
    endianness: Endianness,
) -> bytes
Source code in vllm/entrypoints/pooling/utils.py
def encode_pooling_output_binary(
    output: PoolingRequestOutput,
    embed_dtype: EmbedDType,
    endianness: Endianness,
) -> bytes:
    """Encode a pooling output's tensor as raw bytes.

    Args:
        output: Pooling result whose ``outputs.data`` tensor is encoded.
        embed_dtype: Dtype identifier forwarded to ``tensor2binary``.
        endianness: Byte-order identifier forwarded to ``tensor2binary``.

    Returns:
        The bytes produced by ``tensor2binary`` for the tensor.
    """
    payload = tensor2binary(
        tensor=output.outputs.data,
        embed_dtype=embed_dtype,
        endianness=endianness,
    )
    return payload

encode_pooling_output_float

encode_pooling_output_float(
    output: PoolingRequestOutput,
) -> list[float]
Source code in vllm/entrypoints/pooling/utils.py
def encode_pooling_output_float(output: PoolingRequestOutput) -> list[float]:
    """Return the pooling output's data as the result of its ``tolist()``.

    Args:
        output: Pooling result whose ``outputs.data`` is converted.

    Returns:
        Whatever ``output.outputs.data.tolist()`` yields.
    """
    data = output.outputs.data
    return data.tolist()