vllm.entrypoints.serve.disagg.protocol

GenerateRequest

Bases: BaseModel

Source code in vllm/entrypoints/serve/disagg/protocol.py
class GenerateRequest(BaseModel):
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    token_ids: list[int]
    """The token ids to generate text from."""

    # features: MultiModalFeatureSpec
    # TODO (NickLucche): implement once Renderer work is completed
    features: str | None = None
    """The processed MM inputs for the model."""

    sampling_params: SamplingParams
    """The sampling parameters for the model."""

    model: str | None = None

    stream: bool | None = False
    stream_options: StreamOptions | None = None
    cache_salt: str | None = Field(
        default=None,
        description=(
            "If specified, the prefix cache will be salted with the provided "
            "string to prevent an attacker to guess prompts in multi-user "
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit)."
        ),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    kv_transfer_params: dict[str, Any] | None = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )

cache_salt class-attribute instance-attribute

cache_salt: str | None = Field(
    default=None,
    description="If specified, the prefix cache will be salted with the provided string to prevent an attacker to guess prompts in multi-user environments. The salt should be random, protected from access by 3rd parties, and long enough to be unpredictable (e.g., 43 characters base64-encoded, corresponding to 256 bit).",
)
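
A short sketch of generating a salt with the properties described above, using Python's standard secrets module. This is only an assumption about how a caller might produce the value; the protocol itself just accepts the string.

import secrets

# 32 random bytes -> a 43-character URL-safe base64 string (256 bits of entropy),
# matching the recommendation in the field description.
cache_salt = secrets.token_urlsafe(32)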

features class-attribute instance-attribute

features: str | None = None

The processed MM inputs for the model.

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: dict[str, Any] | None = Field(
    default=None,
    description="KVTransfer parameters used for disaggregated serving.",
)

model class-attribute instance-attribute

model: str | None = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

sampling_params instance-attribute

sampling_params: SamplingParams

The sampling parameters for the model.

stream class-attribute instance-attribute

stream: bool | None = False

stream_options class-attribute instance-attribute

stream_options: StreamOptions | None = None

token_ids instance-attribute

token_ids: list[int]

The token ids to generate text from.
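
For illustration, a minimal sketch of building a GenerateRequest from pre-tokenized input. It assumes the module path shown above is importable and that SamplingParams is re-exported from the top-level vllm package; the token ids and sampling settings are placeholders.

from vllm import SamplingParams
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest

# request_id is generated automatically when not supplied.
request = GenerateRequest(
    token_ids=[1, 15043, 3186],  # placeholder token ids
    sampling_params=SamplingParams(max_tokens=16, temperature=0.0),
    stream=False,
    priority=0,
)

print(request.request_id)  # auto-generated UUID string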

GenerateResponse

Bases: BaseModel

Source code in vllm/entrypoints/serve/disagg/protocol.py
class GenerateResponse(BaseModel):
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    choices: list[GenerateResponseChoice]

    prompt_logprobs: list[dict[int, Logprob] | None] | None = None

    kv_transfer_params: dict[str, Any] | None = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )

choices instance-attribute

choices: list[GenerateResponseChoice]

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: dict[str, Any] | None = Field(
    default=None,
    description="KVTransfer parameters used for disaggregated serving.",
)

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: list[dict[int, Logprob] | None] | None = None

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)
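
A hedged sketch of validating a response payload on the client side. model_validate is standard Pydantic v2 machinery rather than an API of this module, and the payload values are made up for illustration.

from vllm.entrypoints.serve.disagg.protocol import GenerateResponse

raw = {
    "request_id": "req-123",  # illustrative value
    "choices": [
        {"index": 0, "token_ids": [29871, 13, 2], "finish_reason": "stop"},
    ],
    "kv_transfer_params": None,
}

response = GenerateResponse.model_validate(raw)
for choice in response.choices:
    print(choice.index, choice.finish_reason, choice.token_ids)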

GenerateResponseChoice

Bases: BaseModel

Source code in vllm/entrypoints/serve/disagg/protocol.py
class GenerateResponseChoice(BaseModel):
    index: int
    logprobs: ChatCompletionLogProbs | None = None
    # per OpenAI spec this is the default
    finish_reason: str | None = "stop"
    token_ids: list[int] | None = None

finish_reason class-attribute instance-attribute

finish_reason: str | None = 'stop'

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: ChatCompletionLogProbs | None = None

token_ids class-attribute instance-attribute

token_ids: list[int] | None = None
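
As a small sketch of the defaults documented above (assuming the module path is importable): only index is required; finish_reason falls back to "stop" per the OpenAI spec, while logprobs and token_ids default to None.

from vllm.entrypoints.serve.disagg.protocol import GenerateResponseChoice

choice = GenerateResponseChoice(index=0)
assert choice.finish_reason == "stop"  # OpenAI-style default
assert choice.logprobs is None
assert choice.token_ids is None

# A decode-side choice carrying generated token ids that hit the length limit.
truncated = GenerateResponseChoice(
    index=0,
    token_ids=[29871, 13, 2],
    finish_reason="length",
)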