vllm.entrypoints.serve.disagg.protocol

GenerateRequest

Bases: BaseModel

Source code in vllm/entrypoints/serve/disagg/protocol.py
class GenerateRequest(BaseModel):
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    token_ids: list[int]
    """The token ids to generate text from."""

    # features: MultiModalFeatureSpec
    # TODO (NickLucche): implement once Renderer work is completed
    features: str | None = None
    """The processed MM inputs for the model."""

    sampling_params: SamplingParams
    """The sampling parameters for the model."""

    model: str | None = None

    stream: bool | None = False
    stream_options: StreamOptions | None = None
    cache_salt: str | None = Field(
        default=None,
        description=(
            "If specified, the prefix cache will be salted with the provided "
            "string to prevent an attacker to guess prompts in multi-user "
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit)."
        ),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    kv_transfer_params: dict[str, Any] | None = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )

cache_salt class-attribute instance-attribute

cache_salt: str | None = Field(
    default=None,
    description="If specified, the prefix cache will be salted with the provided string to prevent an attacker to guess prompts in multi-user environments. The salt should be random, protected from access by 3rd parties, and long enough to be unpredictable (e.g., 43 characters base64-encoded, corresponding to 256 bit).",
)
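
A short sketch of generating a salt with the properties described above, using Python's standard secrets module. This is only an assumption about how a caller might produce the value; the protocol itself just accepts the string.

import secrets

# 32 random bytes -> a 43-character URL-safe base64 string (256 bits of entropy),
# matching the recommendation in the field description.
cache_salt = secrets.token_urlsafe(32)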

features class-attribute instance-attribute

features: str | None = None

The processed MM inputs for the model.

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: dict[str, Any] | None = Field(
    default=None,
    description="KVTransfer parameters used for disaggregated serving.",
)

model class-attribute instance-attribute

model: str | None = None

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

sampling_params instance-attribute

sampling_params: SamplingParams

The sampling parameters for the model.

stream class-attribute instance-attribute

stream: bool | None = False

stream_options class-attribute instance-attribute

stream_options: StreamOptions | None = None

token_ids instance-attribute

token_ids: list[int]

The token ids to generate text from.
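
For illustration, a minimal sketch of building a GenerateRequest from pre-tokenized input. It assumes the module path shown above is importable and that SamplingParams is re-exported from the top-level vllm package; the token ids and sampling settings are placeholders.

from vllm import SamplingParams
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest

# request_id is generated automatically when not supplied.
request = GenerateRequest(
    token_ids=[1, 15043, 3186],  # placeholder token ids
    sampling_params=SamplingParams(max_tokens=16, temperature=0.0),
    stream=False,
    priority=0,
)

print(request.request_id)  # auto-generated UUID string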

GenerateResponse

Bases: BaseModel

Source code in vllm/entrypoints/serve/disagg/protocol.py
class GenerateResponse(BaseModel):
    request_id: str = Field(
        default_factory=lambda: f"{random_uuid()}",
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    choices: list[GenerateResponseChoice]

    prompt_logprobs: list[dict[int, Logprob] | None] | None = None

    kv_transfer_params: dict[str, Any] | None = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )

choices instance-attribute

choices: list[GenerateResponseChoice]

kv_transfer_params class-attribute instance-attribute

kv_transfer_params: dict[str, Any] | None = Field(
    default=None,
    description="KVTransfer parameters used for disaggregated serving.",
)

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: list[dict[int, Logprob] | None] | None = None

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=lambda: f"{random_uuid()}",
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)
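
A hedged sketch of validating a response payload on the client side. model_validate is standard Pydantic v2 machinery rather than an API of this module, and the payload values are made up for illustration.

from vllm.entrypoints.serve.disagg.protocol import GenerateResponse

raw = {
    "request_id": "req-123",  # illustrative value
    "choices": [
        {"index": 0, "token_ids": [29871, 13, 2], "finish_reason": "stop"},
    ],
    "kv_transfer_params": None,
}

response = GenerateResponse.model_validate(raw)
for choice in response.choices:
    print(choice.index, choice.finish_reason, choice.token_ids)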

GenerateResponseChoice

Bases: BaseModel

Source code in vllm/entrypoints/serve/disagg/protocol.py
class GenerateResponseChoice(BaseModel):
    index: int
    logprobs: ChatCompletionLogProbs | None = None
    # per OpenAI spec this is the default
    finish_reason: str | None = "stop"
    token_ids: list[int] | None = None

finish_reason class-attribute instance-attribute

finish_reason: str | None = 'stop'

index instance-attribute

index: int

logprobs class-attribute instance-attribute

logprobs: ChatCompletionLogProbs | None = None

token_ids class-attribute instance-attribute

token_ids: list[int] | None = None
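
As a small sketch of the defaults documented above (assuming the module path is importable): only index is required; finish_reason falls back to "stop" per the OpenAI spec, while logprobs and token_ids default to None.

from vllm.entrypoints.serve.disagg.protocol import GenerateResponseChoice

choice = GenerateResponseChoice(index=0)
assert choice.finish_reason == "stop"  # OpenAI-style default
assert choice.logprobs is None
assert choice.token_ids is None

# A decode-side choice carrying generated token ids that hit the length limit.
truncated = GenerateResponseChoice(
    index=0,
    token_ids=[29871, 13, 2],
    finish_reason="length",
)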