vllm.renderers

Modules:

Name Description
deepseek_v32
embed_utils
grok2
hf
mistral
params
protocol
registry
terratorch

__all__ module-attribute

__all__ = [
    "BaseRenderer",
    "RendererRegistry",
    "renderer_from_config",
    "ChatParams",
    "TokenizeParams",
    "merge_kwargs",
]
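
The names listed in __all__ are importable from the package root. A minimal import sketch, assuming vLLM is installed:

from vllm.renderers import (
    BaseRenderer,
    ChatParams,
    RendererRegistry,
    TokenizeParams,
    merge_kwargs,
    renderer_from_config,
)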

BaseRenderer

Bases: ABC

Source code in vllm/renderers/protocol.py
class BaseRenderer(ABC):
    @classmethod
    @abstractmethod
    def from_config(
        cls,
        config: "ModelConfig",
        tokenizer_kwargs: dict[str, Any],
    ) -> "BaseRenderer":
        raise NotImplementedError

    def __init__(self, config: "ModelConfig") -> None:
        super().__init__()

        self.config = config

        # Lazy initialization since offline LLM doesn't use async
        self._async_tokenizer: AsyncMicrobatchTokenizer | None = None

    @property
    @abstractmethod
    def tokenizer(self) -> TokenizerLike | None:
        raise NotImplementedError

    def get_tokenizer(self) -> TokenizerLike:
        tokenizer = self.tokenizer
        if tokenizer is None:
            raise ValueError("Tokenizer not available when `skip_tokenizer_init=True`")

        return tokenizer

    def get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
        if self._async_tokenizer is None:
            self._async_tokenizer = AsyncMicrobatchTokenizer(self.get_tokenizer())

        return self._async_tokenizer

    # Step 1: Convert raw inputs to prompts
    def render_completion(
        self,
        prompt_raw: str | list[int] | bytes,
    ) -> TextPrompt | TokensPrompt | EmbedsPrompt:
        error_msg = "Each prompt must be a string or an array of tokens"

        if isinstance(prompt_raw, str):
            return TextPrompt(prompt=prompt_raw)

        if isinstance(prompt_raw, list):
            if not is_list_of(prompt_raw, int):
                raise TypeError(error_msg)

            return TokensPrompt(prompt_token_ids=prompt_raw)

        if isinstance(prompt_raw, bytes):
            embeds = safe_load_prompt_embeds(self.config, prompt_raw)
            return EmbedsPrompt(prompt_embeds=embeds)

        raise TypeError(error_msg)

    def render_completions(
        self,
        prompt_input: str | list[str] | list[int] | list[list[int]] | None = None,
        prompt_embeds: bytes | list[bytes] | None = None,
    ) -> list[TextPrompt | TokensPrompt | EmbedsPrompt]:
        prompts_raw = list[str | list[int] | bytes]()

        if prompt_embeds is not None:  # embeds take higher priority
            if isinstance(prompt_embeds, bytes):
                prompts_raw.append(prompt_embeds)
            else:
                prompts_raw.extend(prompt_embeds)

        if prompt_input is not None:
            if isinstance(prompt_input, str) or (
                len(prompt_input) > 0 and is_list_of(prompt_input, int)
            ):
                prompts_raw.append(prompt_input)  # type: ignore[arg-type]
            else:
                prompts_raw.extend(prompt_input)  # type: ignore[arg-type]

        if len(prompts_raw) == 0:
            raise ValueError("You must pass at least one prompt")

        return [self.render_completion(prompt) for prompt in prompts_raw]

    async def render_completions_async(
        self,
        prompt_input: str | list[str] | list[int] | list[list[int]] | None = None,
        prompt_embeds: bytes | list[bytes] | None = None,
    ) -> list[TextPrompt | TokensPrompt | EmbedsPrompt]:
        return self.render_completions(prompt_input, prompt_embeds)

    @abstractmethod
    def render_messages(
        self,
        messages: list["ChatCompletionMessageParam"],
        params: ChatParams,
    ) -> tuple[list["ConversationMessage"], TextPrompt | TokensPrompt | EmbedsPrompt]:
        raise NotImplementedError

    async def render_messages_async(
        self,
        messages: list["ChatCompletionMessageParam"],
        params: ChatParams,
    ) -> tuple[list["ConversationMessage"], TextPrompt | TokensPrompt | EmbedsPrompt]:
        return self.render_messages(messages, params)

    # Step 2: Tokenize prompts if necessary
    def tokenize_prompt(
        self,
        prompt: TextPrompt | TokensPrompt | EmbedsPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt | EmbedsPrompt:
        if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
            prompt = params.apply_pre_tokenization(self.tokenizer, prompt)

            tokenizer = self.get_tokenizer()
            prompt_token_ids = tokenizer.encode(
                prompt["prompt"],
                **params.get_encode_kwargs(),
            )

            prompt = TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

        if params.needs_detokenization and "prompt" not in prompt:
            if "prompt_token_ids" not in prompt:
                raise RuntimeError("Cannot run detokenization on embeddings")

            tokenizer = self.get_tokenizer()
            prompt_text = tokenizer.decode(prompt["prompt_token_ids"])  # type: ignore[typeddict-item]
            prompt["prompt"] = prompt_text  # type: ignore[typeddict-unknown-key]

        return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]

    def tokenize_prompts(
        self,
        prompts: list[TextPrompt | TokensPrompt | EmbedsPrompt],
        params: TokenizeParams,
    ) -> list[TokensPrompt | EmbedsPrompt]:
        return [self.tokenize_prompt(prompt, params) for prompt in prompts]

    async def tokenize_prompt_async(
        self,
        prompt: TextPrompt | TokensPrompt | EmbedsPrompt,
        params: TokenizeParams,
    ) -> TokensPrompt | EmbedsPrompt:
        if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
            prompt = params.apply_pre_tokenization(self.tokenizer, prompt)

            tokenizer = self.get_async_tokenizer()
            prompt_token_ids = await tokenizer.encode(
                prompt["prompt"],
                **params.get_encode_kwargs(),
            )

            prompt = TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

        if params.needs_detokenization and "prompt" not in prompt:
            if "prompt_token_ids" not in prompt:
                raise RuntimeError("Cannot run detokenization on embeddings")

            tokenizer = self.get_async_tokenizer()
            prompt_text = await tokenizer.decode(prompt["prompt_token_ids"])  # type: ignore[typeddict-item]
            prompt["prompt"] = prompt_text  # type: ignore[typeddict-unknown-key]

        return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]

    async def tokenize_prompts_async(
        self,
        prompts: list[TextPrompt | TokensPrompt | EmbedsPrompt],
        params: TokenizeParams,
    ) -> list[TokensPrompt | EmbedsPrompt]:
        return await asyncio.gather(
            *(self.tokenize_prompt_async(prompt, params) for prompt in prompts)
        )
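
To illustrate the contract, the sketch below defines a hypothetical EchoRenderer that joins message contents into a plain text prompt and skips tokenization entirely. It assumes TextPrompt is importable from vllm.inputs; real subclasses (such as the hf renderer) return an actual tokenizer and apply a chat template instead.

from typing import Any

from vllm.inputs import TextPrompt  # assumed import path for the prompt TypedDicts
from vllm.renderers import BaseRenderer, ChatParams


class EchoRenderer(BaseRenderer):
    """Hypothetical renderer that joins message contents into one text prompt."""

    @classmethod
    def from_config(cls, config, tokenizer_kwargs: dict[str, Any]) -> "EchoRenderer":
        # This sketch ignores tokenizer_kwargs; real renderers build a tokenizer here.
        return cls(config)

    @property
    def tokenizer(self):
        # Returning None mimics `skip_tokenizer_init=True`;
        # get_tokenizer() would raise if called on this renderer.
        return None

    def render_messages(self, messages, params: ChatParams):
        # A real renderer converts messages to ConversationMessage objects
        # and applies the chat template configured in `params`.
        conversation = list(messages)
        text = "\n".join(str(m.get("content", "")) for m in messages)
        return conversation, TextPrompt(prompt=text)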

_async_tokenizer instance-attribute

_async_tokenizer: AsyncMicrobatchTokenizer | None = None

config instance-attribute

config = config

tokenizer abstractmethod property

tokenizer: TokenizerLike | None

__init__

__init__(config: ModelConfig) -> None
Source code in vllm/renderers/protocol.py
def __init__(self, config: "ModelConfig") -> None:
    super().__init__()

    self.config = config

    # Lazy initialization since offline LLM doesn't use async
    self._async_tokenizer: AsyncMicrobatchTokenizer | None = None

from_config abstractmethod classmethod

from_config(
    config: ModelConfig, tokenizer_kwargs: dict[str, Any]
) -> BaseRenderer
Source code in vllm/renderers/protocol.py
@classmethod
@abstractmethod
def from_config(
    cls,
    config: "ModelConfig",
    tokenizer_kwargs: dict[str, Any],
) -> "BaseRenderer":
    raise NotImplementedError

get_async_tokenizer

get_async_tokenizer() -> AsyncMicrobatchTokenizer
Source code in vllm/renderers/protocol.py
def get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
    if self._async_tokenizer is None:
        self._async_tokenizer = AsyncMicrobatchTokenizer(self.get_tokenizer())

    return self._async_tokenizer

get_tokenizer

get_tokenizer() -> TokenizerLike
Source code in vllm/renderers/protocol.py
def get_tokenizer(self) -> TokenizerLike:
    tokenizer = self.tokenizer
    if tokenizer is None:
        raise ValueError("Tokenizer not available when `skip_tokenizer_init=True`")

    return tokenizer

render_completion

render_completion(
    prompt_raw: str | list[int] | bytes,
) -> TextPrompt | TokensPrompt | EmbedsPrompt
Source code in vllm/renderers/protocol.py
def render_completion(
    self,
    prompt_raw: str | list[int] | bytes,
) -> TextPrompt | TokensPrompt | EmbedsPrompt:
    error_msg = "Each prompt must be a string or an array of tokens"

    if isinstance(prompt_raw, str):
        return TextPrompt(prompt=prompt_raw)

    if isinstance(prompt_raw, list):
        if not is_list_of(prompt_raw, int):
            raise TypeError(error_msg)

        return TokensPrompt(prompt_token_ids=prompt_raw)

    if isinstance(prompt_raw, bytes):
        embeds = safe_load_prompt_embeds(self.config, prompt_raw)
        return EmbedsPrompt(prompt_embeds=embeds)

    raise TypeError(error_msg)
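
The dispatch follows the three accepted input shapes. A sketch, where renderer and embeds_bytes are placeholders:

renderer.render_completion("Hello, world")    # str       -> TextPrompt
renderer.render_completion([101, 2023, 102])  # list[int] -> TokensPrompt
renderer.render_completion(embeds_bytes)      # bytes     -> EmbedsPrompt via safe_load_prompt_embeds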

render_completions

render_completions(
    prompt_input: str
    | list[str]
    | list[int]
    | list[list[int]]
    | None = None,
    prompt_embeds: bytes | list[bytes] | None = None,
) -> list[TextPrompt | TokensPrompt | EmbedsPrompt]
Source code in vllm/renderers/protocol.py
def render_completions(
    self,
    prompt_input: str | list[str] | list[int] | list[list[int]] | None = None,
    prompt_embeds: bytes | list[bytes] | None = None,
) -> list[TextPrompt | TokensPrompt | EmbedsPrompt]:
    prompts_raw = list[str | list[int] | bytes]()

    if prompt_embeds is not None:  # embeds take higher priority
        if isinstance(prompt_embeds, bytes):
            prompts_raw.append(prompt_embeds)
        else:
            prompts_raw.extend(prompt_embeds)

    if prompt_input is not None:
        if isinstance(prompt_input, str) or (
            len(prompt_input) > 0 and is_list_of(prompt_input, int)
        ):
            prompts_raw.append(prompt_input)  # type: ignore[arg-type]
        else:
            prompts_raw.extend(prompt_input)  # type: ignore[arg-type]

    if len(prompts_raw) == 0:
        raise ValueError("You must pass at least one prompt")

    return [self.render_completion(prompt) for prompt in prompts_raw]
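
A sketch of the batching behaviour, with renderer and blob as placeholders. Embeds prompts are placed first in the returned list, and a single string or flat token list counts as one prompt rather than a batch:

renderer.render_completions(prompt_input="Hello")                   # one TextPrompt
renderer.render_completions(prompt_input=["Hello", "Bonjour"])      # one TextPrompt per string
renderer.render_completions(prompt_input=[1, 2, 3])                 # a single TokensPrompt
renderer.render_completions(prompt_input=[[1, 2, 3], [4, 5]])       # one TokensPrompt per list
renderer.render_completions(prompt_input="Hi", prompt_embeds=blob)  # EmbedsPrompt first, then TextPrompt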

render_completions_async async

render_completions_async(
    prompt_input: str
    | list[str]
    | list[int]
    | list[list[int]]
    | None = None,
    prompt_embeds: bytes | list[bytes] | None = None,
) -> list[TextPrompt | TokensPrompt | EmbedsPrompt]
Source code in vllm/renderers/protocol.py
async def render_completions_async(
    self,
    prompt_input: str | list[str] | list[int] | list[list[int]] | None = None,
    prompt_embeds: bytes | list[bytes] | None = None,
) -> list[TextPrompt | TokensPrompt | EmbedsPrompt]:
    return self.render_completions(prompt_input, prompt_embeds)

render_messages abstractmethod

Source code in vllm/renderers/protocol.py
@abstractmethod
def render_messages(
    self,
    messages: list["ChatCompletionMessageParam"],
    params: ChatParams,
) -> tuple[list["ConversationMessage"], TextPrompt | TokensPrompt | EmbedsPrompt]:
    raise NotImplementedError

render_messages_async async

render_messages_async(
    messages: list[ChatCompletionMessageParam],
    params: ChatParams,
) -> tuple[
    list[ConversationMessage],
    TextPrompt | TokensPrompt | EmbedsPrompt,
]
Source code in vllm/renderers/protocol.py
async def render_messages_async(
    self,
    messages: list["ChatCompletionMessageParam"],
    params: ChatParams,
) -> tuple[list["ConversationMessage"], TextPrompt | TokensPrompt | EmbedsPrompt]:
    return self.render_messages(messages, params)

tokenize_prompt

tokenize_prompt(
    prompt: TextPrompt | TokensPrompt | EmbedsPrompt,
    params: TokenizeParams,
) -> TokensPrompt | EmbedsPrompt
Source code in vllm/renderers/protocol.py
def tokenize_prompt(
    self,
    prompt: TextPrompt | TokensPrompt | EmbedsPrompt,
    params: TokenizeParams,
) -> TokensPrompt | EmbedsPrompt:
    if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
        prompt = params.apply_pre_tokenization(self.tokenizer, prompt)

        tokenizer = self.get_tokenizer()
        prompt_token_ids = tokenizer.encode(
            prompt["prompt"],
            **params.get_encode_kwargs(),
        )

        prompt = TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

    if params.needs_detokenization and "prompt" not in prompt:
        if "prompt_token_ids" not in prompt:
            raise RuntimeError("Cannot run detokenization on embeddings")

        tokenizer = self.get_tokenizer()
        prompt_text = tokenizer.decode(prompt["prompt_token_ids"])  # type: ignore[typeddict-item]
        prompt["prompt"] = prompt_text  # type: ignore[typeddict-unknown-key]

    return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]
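
Putting both steps together, a hedged sketch of the render-then-tokenize flow (renderer and the token budgets are placeholders):

from vllm.renderers import TokenizeParams

params = TokenizeParams(
    max_total_tokens=4096,  # e.g. the model's context length
    max_output_tokens=256,  # budget reserved for generation
)

prompts = renderer.render_completions(prompt_input=["Hello", "How are you?"])
tokenized = renderer.tokenize_prompts(prompts, params)
# Each element is now a TokensPrompt whose length has been checked against
# max_total_tokens - max_output_tokens (3840 input tokens here).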

tokenize_prompt_async async

tokenize_prompt_async(
    prompt: TextPrompt | TokensPrompt | EmbedsPrompt,
    params: TokenizeParams,
) -> TokensPrompt | EmbedsPrompt
Source code in vllm/renderers/protocol.py
async def tokenize_prompt_async(
    self,
    prompt: TextPrompt | TokensPrompt | EmbedsPrompt,
    params: TokenizeParams,
) -> TokensPrompt | EmbedsPrompt:
    if "prompt_token_ids" not in prompt and "prompt_embeds" not in prompt:
        prompt = params.apply_pre_tokenization(self.tokenizer, prompt)

        tokenizer = self.get_async_tokenizer()
        prompt_token_ids = await tokenizer.encode(
            prompt["prompt"],
            **params.get_encode_kwargs(),
        )

        prompt = TokensPrompt(prompt_token_ids=prompt_token_ids, **prompt)

    if params.needs_detokenization and "prompt" not in prompt:
        if "prompt_token_ids" not in prompt:
            raise RuntimeError("Cannot run detokenization on embeddings")

        tokenizer = self.get_async_tokenizer()
        prompt_text = await tokenizer.decode(prompt["prompt_token_ids"])  # type: ignore[typeddict-item]
        prompt["prompt"] = prompt_text  # type: ignore[typeddict-unknown-key]

    return params.apply_post_tokenization(self.tokenizer, prompt)  # type: ignore[arg-type]

tokenize_prompts

tokenize_prompts(
    prompts: list[TextPrompt | TokensPrompt | EmbedsPrompt],
    params: TokenizeParams,
) -> list[TokensPrompt | EmbedsPrompt]
Source code in vllm/renderers/protocol.py
def tokenize_prompts(
    self,
    prompts: list[TextPrompt | TokensPrompt | EmbedsPrompt],
    params: TokenizeParams,
) -> list[TokensPrompt | EmbedsPrompt]:
    return [self.tokenize_prompt(prompt, params) for prompt in prompts]

tokenize_prompts_async async

tokenize_prompts_async(
    prompts: list[TextPrompt | TokensPrompt | EmbedsPrompt],
    params: TokenizeParams,
) -> list[TokensPrompt | EmbedsPrompt]
Source code in vllm/renderers/protocol.py
async def tokenize_prompts_async(
    self,
    prompts: list[TextPrompt | TokensPrompt | EmbedsPrompt],
    params: TokenizeParams,
) -> list[TokensPrompt | EmbedsPrompt]:
    return await asyncio.gather(
        *(self.tokenize_prompt_async(prompt, params) for prompt in prompts)
    )

ChatParams dataclass

Configuration to control how to parse chat messages.

Source code in vllm/renderers/params.py
@dataclass(frozen=True)
class ChatParams:
    """Configuration to control how to parse chat messages."""

    chat_template: str | None = None
    """The chat template to apply."""

    chat_template_content_format: ChatTemplateContentFormatOption = "auto"
    """The format of the chat template."""

    chat_template_kwargs: dict[str, Any] = field(default_factory=dict)
    """The kwargs to pass to the chat template."""

    def with_defaults(self, default_chat_template_kwargs: dict[str, Any] | None):
        if not default_chat_template_kwargs:
            return self

        return ChatParams(
            chat_template=self.chat_template,
            chat_template_content_format=self.chat_template_content_format,
            chat_template_kwargs=merge_kwargs(
                default_chat_template_kwargs,
                self.chat_template_kwargs,
            ),
        )

    def get_apply_chat_template_kwargs(self) -> dict[str, Any]:
        """The arguments to pass to `tokenizer.apply_chat_template`."""
        return merge_kwargs(
            self.chat_template_kwargs,
            dict(chat_template=self.chat_template, return_dict=False),
        )
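
For illustration, a sketch of how per-request parameters combine with server defaults; the kwarg names are hypothetical, and overrides set to None or "auto" fall back to the defaults (see merge_kwargs below):

from vllm.renderers import ChatParams

request_params = ChatParams(chat_template_kwargs={"enable_thinking": False})
server_defaults = {"enable_thinking": True, "add_generation_prompt": True}

merged = request_params.with_defaults(server_defaults)
# merged.chat_template_kwargs == {"enable_thinking": False, "add_generation_prompt": True}

merged.get_apply_chat_template_kwargs()
# {"enable_thinking": False, "add_generation_prompt": True, "return_dict": False}
# (chat_template=None is treated as unset and therefore dropped by merge_kwargs)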

chat_template class-attribute instance-attribute

chat_template: str | None = None

The chat template to apply.

chat_template_content_format class-attribute instance-attribute

chat_template_content_format: ChatTemplateContentFormatOption = "auto"

The format of the chat template.

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: dict[str, Any] = field(
    default_factory=dict
)

The kwargs to pass to the chat template.

__init__

__init__(
    chat_template: str | None = None,
    chat_template_content_format: ChatTemplateContentFormatOption = "auto",
    chat_template_kwargs: dict[str, Any] = dict(),
) -> None

get_apply_chat_template_kwargs

get_apply_chat_template_kwargs() -> dict[str, Any]

The arguments to pass to tokenizer.apply_chat_template.

Source code in vllm/renderers/params.py
def get_apply_chat_template_kwargs(self) -> dict[str, Any]:
    """The arguments to pass to `tokenizer.apply_chat_template`."""
    return merge_kwargs(
        self.chat_template_kwargs,
        dict(chat_template=self.chat_template, return_dict=False),
    )

with_defaults

with_defaults(
    default_chat_template_kwargs: dict[str, Any] | None,
)
Source code in vllm/renderers/params.py
def with_defaults(self, default_chat_template_kwargs: dict[str, Any] | None):
    if not default_chat_template_kwargs:
        return self

    return ChatParams(
        chat_template=self.chat_template,
        chat_template_content_format=self.chat_template_content_format,
        chat_template_kwargs=merge_kwargs(
            default_chat_template_kwargs,
            self.chat_template_kwargs,
        ),
    )

RendererRegistry dataclass

Source code in vllm/renderers/registry.py
@dataclass
class RendererRegistry:
    # Renderer mode ->  (renderer module, renderer class)
    renderers: dict[str, tuple[str, str]] = field(default_factory=dict)

    def register(self, renderer_mode: str, module: str, class_name: str) -> None:
        if renderer_mode in self.renderers:
            logger.warning(
                "%s.%s is already registered for renderer_mode=%r. "
                "It is overwritten by the new one.",
                module,
                class_name,
                renderer_mode,
            )

        self.renderers[renderer_mode] = (module, class_name)

        return None

    def load_renderer_cls(self, renderer_mode: str) -> type[BaseRenderer]:
        if renderer_mode not in self.renderers:
            raise ValueError(f"No renderer registered for {renderer_mode=!r}.")

        module, class_name = self.renderers[renderer_mode]
        logger.debug_once(f"Loading {class_name} for {renderer_mode=!r}")

        return resolve_obj_by_qualname(f"{module}.{class_name}")

    def load_renderer(
        self,
        renderer_mode: str,
        config: "ModelConfig",
        tokenizer_kwargs: dict[str, Any],
    ) -> BaseRenderer:
        renderer_cls = self.load_renderer_cls(renderer_mode)
        return renderer_cls.from_config(config, tokenizer_kwargs)
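
A sketch of registering and resolving a custom renderer mode; the module path and class name are hypothetical, and the package-level RENDERER_REGISTRY used by renderer_from_config behaves the same way:

from vllm.renderers import RendererRegistry

registry = RendererRegistry()
registry.register("my_mode", "my_package.renderers", "MyRenderer")

renderer_cls = registry.load_renderer_cls("my_mode")
# resolves and imports my_package.renderers.MyRenderer (hypothetical)
# renderer = registry.load_renderer("my_mode", model_config, tokenizer_kwargs={})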

renderers class-attribute instance-attribute

renderers: dict[str, tuple[str, str]] = field(
    default_factory=dict
)

__init__

__init__(
    renderers: dict[str, tuple[str, str]] = dict(),
) -> None

load_renderer

load_renderer(
    renderer_mode: str,
    config: ModelConfig,
    tokenizer_kwargs: dict[str, Any],
) -> BaseRenderer
Source code in vllm/renderers/registry.py
def load_renderer(
    self,
    renderer_mode: str,
    config: "ModelConfig",
    tokenizer_kwargs: dict[str, Any],
) -> BaseRenderer:
    renderer_cls = self.load_renderer_cls(renderer_mode)
    return renderer_cls.from_config(config, tokenizer_kwargs)

load_renderer_cls

load_renderer_cls(renderer_mode: str) -> type[BaseRenderer]
Source code in vllm/renderers/registry.py
def load_renderer_cls(self, renderer_mode: str) -> type[BaseRenderer]:
    if renderer_mode not in self.renderers:
        raise ValueError(f"No renderer registered for {renderer_mode=!r}.")

    module, class_name = self.renderers[renderer_mode]
    logger.debug_once(f"Loading {class_name} for {renderer_mode=!r}")

    return resolve_obj_by_qualname(f"{module}.{class_name}")

register

register(
    renderer_mode: str, module: str, class_name: str
) -> None
Source code in vllm/renderers/registry.py
def register(self, renderer_mode: str, module: str, class_name: str) -> None:
    if renderer_mode in self.renderers:
        logger.warning(
            "%s.%s is already registered for renderer_mode=%r. "
            "It is overwritten by the new one.",
            module,
            class_name,
            renderer_mode,
        )

    self.renderers[renderer_mode] = (module, class_name)

    return None

TokenizeParams dataclass

Configuration to control how prompts are tokenized.

Source code in vllm/renderers/params.py
@dataclass(frozen=True)
class TokenizeParams:
    """Configuration to control how prompts are tokenized."""

    max_total_tokens: int | None
    """
    Maximum allowed number of input + output tokens.

    Usually, this refers to the model's context length.
    """

    max_output_tokens: int = 0
    """Maximum requested number of output tokens."""

    pad_prompt_tokens: int | None = None
    """
    Number of tokens to pad to:
    - `None` means no padding.
    - `-1` maps to `max_input_tokens`.
    """

    truncate_prompt_tokens: int | None = None
    """
    Number of tokens to keep:
    - `None` means no truncation.
    - `-1` maps to `max_input_tokens`.
    """

    do_lower_case: bool = False
    """Whether to normalize text to lower case before tokenization."""

    add_special_tokens: bool = True
    """Whether to add special tokens."""

    needs_detokenization: bool = False
    """
    Whether the tokenized prompt needs to contain the original text.

    Not to be confused with `SamplingParams.detokenize` which deals
    with the output generated by the model.
    """

    max_total_tokens_param: str = "max_total_tokens"
    """Override this to edit the message for validation errors."""

    max_output_tokens_param: str = "max_output_tokens"
    """Override this to edit the message for validation errors."""

    truncate_prompt_tokens_param: str = "truncate_prompt_tokens"
    """Override this to edit the message for validation errors."""

    @property
    def max_input_tokens(self) -> int | None:
        """Maximum allowed number of input tokens."""
        if self.max_total_tokens is None:
            return None

        return self.max_total_tokens - self.max_output_tokens

    def __post_init__(self) -> None:
        max_total_tokens = self.max_total_tokens
        max_output_tokens = self.max_output_tokens
        max_input_tokens = self.max_input_tokens
        truncate_prompt_tokens = self.truncate_prompt_tokens

        if (
            max_output_tokens is not None
            and max_total_tokens is not None
            and max_output_tokens > max_total_tokens
        ):
            raise VLLMValidationError(
                f"{self.max_output_tokens_param}={max_output_tokens}"
                f"cannot be greater than "
                f"{self.max_total_tokens_param}={max_total_tokens=}. "
                f"Please request fewer output tokens.",
                parameter=self.max_output_tokens_param,
                value=max_output_tokens,
            )

        if (
            max_input_tokens is not None
            and truncate_prompt_tokens is not None
            and truncate_prompt_tokens > max_input_tokens
        ):
            raise VLLMValidationError(
                f"{self.truncate_prompt_tokens_param}={truncate_prompt_tokens} "
                f"cannot be greater than {self.max_total_tokens_param} - "
                f"{self.max_output_tokens_param} = {max_input_tokens}. "
                f"Please request a smaller truncation size.",
                parameter=self.truncate_prompt_tokens_param,
                value=truncate_prompt_tokens,
            )

    def with_kwargs(self, tokenization_kwargs: dict[str, Any] | None):
        if tokenization_kwargs is None:
            tokenization_kwargs = {}

        max_length = tokenization_kwargs.pop("max_length", self.max_input_tokens)
        pad_prompt_tokens = tokenization_kwargs.pop(
            "pad_prompt_tokens", self.pad_prompt_tokens
        )
        truncate_prompt_tokens = tokenization_kwargs.pop(
            "truncate_prompt_tokens", self.truncate_prompt_tokens
        )
        do_lower_case = tokenization_kwargs.pop("do_lower_case", self.do_lower_case)
        add_special_tokens = tokenization_kwargs.pop(
            "add_special_tokens", self.add_special_tokens
        )
        needs_detokenization = tokenization_kwargs.pop(
            "needs_detokenization", self.needs_detokenization
        )

        # https://huggingface.co/docs/transformers/en/pad_truncation
        if padding := tokenization_kwargs.pop("padding", None):
            if padding == "max_length":
                pad_prompt_tokens = max_length
            elif padding in (False, "do_not_pad"):
                pad_prompt_tokens = None
            else:
                # To emit the below warning
                tokenization_kwargs["padding"] = padding

        if truncation := tokenization_kwargs.pop("truncation", None):
            if truncation in (True, "longest_first"):
                truncate_prompt_tokens = max_length
            elif truncation in (False, "do_not_truncate"):
                truncate_prompt_tokens = None
            else:
                # To emit the below warning
                tokenization_kwargs["truncation"] = truncation

        if tokenization_kwargs:
            logger.warning(
                "The following tokenization arguments are not supported "
                "by vLLM Renderer and will be ignored: %s",
                tokenization_kwargs,
            )

        max_total_tokens = self.max_total_tokens

        return TokenizeParams(
            max_total_tokens=max_total_tokens,
            max_output_tokens=(
                0
                if max_total_tokens is None or max_length is None
                else max_total_tokens - max_length
            ),
            pad_prompt_tokens=pad_prompt_tokens,
            truncate_prompt_tokens=truncate_prompt_tokens,
            do_lower_case=do_lower_case,
            add_special_tokens=add_special_tokens,
            needs_detokenization=needs_detokenization,
        )

    def get_encode_kwargs(self) -> dict[str, Any]:
        """The arguments to pass to `tokenizer.encode`."""
        max_length = self.truncate_prompt_tokens
        if max_length is not None and max_length < 0:
            max_length = self.max_input_tokens

        return dict(
            truncation=self.truncate_prompt_tokens is not None,
            max_length=max_length,
            add_special_tokens=self.add_special_tokens,
        )

    def _apply_lowercase(self, tokenizer: TokenizerLike | None, text: str) -> str:
        if self.do_lower_case:
            text = text.lower()

        return text

    def _validate_text(self, tokenizer: TokenizerLike | None, text: str) -> str:
        """Apply all validators to prompt text."""
        # TODO: Implement https://github.com/vllm-project/vllm/pull/31366
        for validator in (self._apply_lowercase,):
            text = validator(tokenizer, text)

        return text

    def apply_pre_tokenization(
        self,
        tokenizer: TokenizerLike | None,
        prompt: TextPrompt,
    ) -> TextPrompt:
        """
        Ensure that the prompt meets the requirements set out by this config.
        If that is not possible, raise a `VLLMValidationError`.

        This method is run before tokenization occurs.
        """
        prompt["prompt"] = self._validate_text(tokenizer, prompt["prompt"])

        return prompt

    def _apply_padding(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
        """Apply padding to a token sequence."""
        pad_length = self.pad_prompt_tokens
        if pad_length is not None and pad_length < 0:
            pad_length = self.max_input_tokens

        if pad_length is None or pad_length <= len(tokens):
            return tokens

        if tokenizer is None:
            raise ValueError("Cannot pad tokens when `skip_tokenizer_init=True`")
        if not isinstance(tokens, list):
            raise ValueError("Cannot pad tokens for embedding inputs")

        return tokens + [tokenizer.pad_token_id] * (pad_length - len(tokens))

    def _apply_truncation(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
        """Apply truncation to a token sequence."""
        max_length = self.truncate_prompt_tokens
        if max_length is not None and max_length < 0:
            max_length = self.max_input_tokens

        if max_length is None or max_length >= len(tokens):
            return tokens
        if max_length == 0:
            return tokens[:0]

        if getattr(tokenizer, "truncation_side", "left") == "left":
            return tokens[-max_length:]

        return tokens[:max_length]

    def _apply_length_check(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
        """Apply length checks to a token sequence."""
        max_input_tokens = self.max_input_tokens

        if max_input_tokens is not None and len(tokens) > max_input_tokens:
            raise VLLMValidationError(
                f"You passed {len(tokens)} input tokens and "
                f"requested {self.max_output_tokens} output tokens. "
                f"However, the model's context length is only "
                f"{self.max_total_tokens}, resulting in a maximum "
                f"input length of {max_input_tokens}. "
                f"Please reduce the length of the input messages.",
                parameter="input_tokens",
                value=len(tokens),
            )

        return tokens

    def _validate_tokens(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
        """Apply all validators to a token sequence."""
        for validator in (
            self._apply_padding,
            self._apply_truncation,
            self._apply_length_check,
        ):
            tokens = validator(tokenizer, tokens)

        return tokens

    def apply_post_tokenization(
        self,
        tokenizer: TokenizerLike | None,
        prompt: TokensPrompt | EmbedsPrompt,
    ) -> TokensPrompt | EmbedsPrompt:
        """
        Ensure that the prompt meets the requirements set out by this config.
        If that is not possible, raise a `VLLMValidationError`.

        This method is run after tokenization occurs.
        """
        if "prompt_token_ids" in prompt:
            prompt["prompt_token_ids"] = self._validate_tokens(  # type: ignore[typeddict-unknown-key]
                tokenizer,
                prompt["prompt_token_ids"],  # type: ignore[typeddict-item]
            )
        if "prompt_embeds" in prompt:
            prompt["prompt_embeds"] = self._validate_tokens(  # type: ignore[typeddict-unknown-key]
                tokenizer,
                prompt["prompt_embeds"],  # type: ignore[typeddict-item]
            )

        return prompt
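
A small sketch of the token-budget arithmetic and validation, using placeholder numbers:

from vllm.renderers import TokenizeParams

params = TokenizeParams(
    max_total_tokens=8192,      # e.g. the model context length
    max_output_tokens=1024,
    truncate_prompt_tokens=-1,  # -1 resolves to max_input_tokens
)
params.max_input_tokens     # 8192 - 1024 == 7168
params.get_encode_kwargs()  # {"truncation": True, "max_length": 7168, "add_special_tokens": True}

# Requesting more output tokens than the total budget fails in __post_init__:
# TokenizeParams(max_total_tokens=1024, max_output_tokens=2048)  -> VLLMValidationError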

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = True

Whether to add special tokens.

do_lower_case class-attribute instance-attribute

do_lower_case: bool = False

Whether to normalize text to lower case before tokenization.

max_input_tokens property

max_input_tokens: int | None

Maximum allowed number of input tokens.

max_output_tokens class-attribute instance-attribute

max_output_tokens: int = 0

Maximum requested number of output tokens.

max_output_tokens_param class-attribute instance-attribute

max_output_tokens_param: str = 'max_output_tokens'

Override this to edit the message for validation errors.

max_total_tokens instance-attribute

max_total_tokens: int | None

Maximum allowed number of input + output tokens.

Usually, this refers to the model's context length.

max_total_tokens_param class-attribute instance-attribute

max_total_tokens_param: str = 'max_total_tokens'

Override this to edit the message for validation errors.

needs_detokenization class-attribute instance-attribute

needs_detokenization: bool = False

Whether the tokenized prompt needs to contain the original text.

Not to be confused with SamplingParams.detokenize which deals with the output generated by the model.

pad_prompt_tokens class-attribute instance-attribute

pad_prompt_tokens: int | None = None

Number of tokens to pad to:
- None means no padding.
- -1 maps to max_input_tokens.

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: int | None = None

Number of tokens to keep:
- None means no truncation.
- -1 maps to max_input_tokens.

truncate_prompt_tokens_param class-attribute instance-attribute

truncate_prompt_tokens_param: str = "truncate_prompt_tokens"

Override this to edit the message for validation errors.

__init__

__init__(
    max_total_tokens: int | None,
    max_output_tokens: int = 0,
    pad_prompt_tokens: int | None = None,
    truncate_prompt_tokens: int | None = None,
    do_lower_case: bool = False,
    add_special_tokens: bool = True,
    needs_detokenization: bool = False,
    max_total_tokens_param: str = "max_total_tokens",
    max_output_tokens_param: str = "max_output_tokens",
    truncate_prompt_tokens_param: str = "truncate_prompt_tokens",
) -> None

__post_init__

__post_init__() -> None
Source code in vllm/renderers/params.py
def __post_init__(self) -> None:
    max_total_tokens = self.max_total_tokens
    max_output_tokens = self.max_output_tokens
    max_input_tokens = self.max_input_tokens
    truncate_prompt_tokens = self.truncate_prompt_tokens

    if (
        max_output_tokens is not None
        and max_total_tokens is not None
        and max_output_tokens > max_total_tokens
    ):
        raise VLLMValidationError(
            f"{self.max_output_tokens_param}={max_output_tokens}"
            f"cannot be greater than "
            f"{self.max_total_tokens_param}={max_total_tokens=}. "
            f"Please request fewer output tokens.",
            parameter=self.max_output_tokens_param,
            value=max_output_tokens,
        )

    if (
        max_input_tokens is not None
        and truncate_prompt_tokens is not None
        and truncate_prompt_tokens > max_input_tokens
    ):
        raise VLLMValidationError(
            f"{self.truncate_prompt_tokens_param}={truncate_prompt_tokens} "
            f"cannot be greater than {self.max_total_tokens_param} - "
            f"{self.max_output_tokens_param} = {max_input_tokens}. "
            f"Please request a smaller truncation size.",
            parameter=self.truncate_prompt_tokens_param,
            value=truncate_prompt_tokens,
        )

_apply_length_check

_apply_length_check(
    tokenizer: TokenizerLike | None, tokens: _S
) -> _S

Apply length checks to a token sequence.

Source code in vllm/renderers/params.py
def _apply_length_check(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
    """Apply length checks to a token sequence."""
    max_input_tokens = self.max_input_tokens

    if max_input_tokens is not None and len(tokens) > max_input_tokens:
        raise VLLMValidationError(
            f"You passed {len(tokens)} input tokens and "
            f"requested {self.max_output_tokens} output tokens. "
            f"However, the model's context length is only "
            f"{self.max_total_tokens}, resulting in a maximum "
            f"input length of {max_input_tokens}. "
            f"Please reduce the length of the input messages.",
            parameter="input_tokens",
            value=len(tokens),
        )

    return tokens

_apply_lowercase

_apply_lowercase(
    tokenizer: TokenizerLike | None, text: str
) -> str
Source code in vllm/renderers/params.py
def _apply_lowercase(self, tokenizer: TokenizerLike | None, text: str) -> str:
    if self.do_lower_case:
        text = text.lower()

    return text

_apply_padding

_apply_padding(
    tokenizer: TokenizerLike | None, tokens: _S
) -> _S

Apply padding to a token sequence.

Source code in vllm/renderers/params.py
def _apply_padding(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
    """Apply padding to a token sequence."""
    pad_length = self.pad_prompt_tokens
    if pad_length is not None and pad_length < 0:
        pad_length = self.max_input_tokens

    if pad_length is None or pad_length <= len(tokens):
        return tokens

    if tokenizer is None:
        raise ValueError("Cannot pad tokens when `skip_tokenizer_init=True`")
    if not isinstance(tokens, list):
        raise ValueError("Cannot pad tokens for embedding inputs")

    return tokens + [tokenizer.pad_token_id] * (pad_length - len(tokens))

_apply_truncation

_apply_truncation(
    tokenizer: TokenizerLike | None, tokens: _S
) -> _S

Apply truncation to a token sequence.

Source code in vllm/renderers/params.py
def _apply_truncation(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
    """Apply truncation to a token sequence."""
    max_length = self.truncate_prompt_tokens
    if max_length is not None and max_length < 0:
        max_length = self.max_input_tokens

    if max_length is None or max_length >= len(tokens):
        return tokens
    if max_length == 0:
        return tokens[:0]

    if getattr(tokenizer, "truncation_side", "left") == "left":
        return tokens[-max_length:]

    return tokens[:max_length]

_validate_text

_validate_text(
    tokenizer: TokenizerLike | None, text: str
) -> str

Apply all validators to prompt text.

Source code in vllm/renderers/params.py
def _validate_text(self, tokenizer: TokenizerLike | None, text: str) -> str:
    """Apply all validators to prompt text."""
    # TODO: Implement https://github.com/vllm-project/vllm/pull/31366
    for validator in (self._apply_lowercase,):
        text = validator(tokenizer, text)

    return text

_validate_tokens

_validate_tokens(
    tokenizer: TokenizerLike | None, tokens: _S
) -> _S

Apply all validators to a token sequence.

Source code in vllm/renderers/params.py
def _validate_tokens(self, tokenizer: TokenizerLike | None, tokens: _S) -> _S:
    """Apply all validators to a token sequence."""
    for validator in (
        self._apply_padding,
        self._apply_truncation,
        self._apply_length_check,
    ):
        tokens = validator(tokenizer, tokens)

    return tokens

apply_post_tokenization

apply_post_tokenization(
    tokenizer: TokenizerLike | None,
    prompt: TokensPrompt | EmbedsPrompt,
) -> TokensPrompt | EmbedsPrompt

Ensure that the prompt meets the requirements set out by this config. If that is not possible, raise a VLLMValidationError.

This method is run after tokenization occurs.

Source code in vllm/renderers/params.py
def apply_post_tokenization(
    self,
    tokenizer: TokenizerLike | None,
    prompt: TokensPrompt | EmbedsPrompt,
) -> TokensPrompt | EmbedsPrompt:
    """
    Ensure that the prompt meets the requirements set out by this config.
    If that is not possible, raise a `VLLMValidationError`.

    This method is run after tokenization occurs.
    """
    if "prompt_token_ids" in prompt:
        prompt["prompt_token_ids"] = self._validate_tokens(  # type: ignore[typeddict-unknown-key]
            tokenizer,
            prompt["prompt_token_ids"],  # type: ignore[typeddict-item]
        )
    if "prompt_embeds" in prompt:
        prompt["prompt_embeds"] = self._validate_tokens(  # type: ignore[typeddict-unknown-key]
            tokenizer,
            prompt["prompt_embeds"],  # type: ignore[typeddict-item]
        )

    return prompt

apply_pre_tokenization

apply_pre_tokenization(
    tokenizer: TokenizerLike | None, prompt: TextPrompt
) -> TextPrompt

Ensure that the prompt meets the requirements set out by this config. If that is not possible, raise a VLLMValidationError.

This method is run before tokenization occurs.

Source code in vllm/renderers/params.py
def apply_pre_tokenization(
    self,
    tokenizer: TokenizerLike | None,
    prompt: TextPrompt,
) -> TextPrompt:
    """
    Ensure that the prompt meets the requirements set out by this config.
    If that is not possible, raise a `VLLMValidationError`.

    This method is run before tokenization occurs.
    """
    prompt["prompt"] = self._validate_text(tokenizer, prompt["prompt"])

    return prompt

get_encode_kwargs

get_encode_kwargs() -> dict[str, Any]

The arguments to pass to tokenizer.encode.

Source code in vllm/renderers/params.py
def get_encode_kwargs(self) -> dict[str, Any]:
    """The arguments to pass to `tokenizer.encode`."""
    max_length = self.truncate_prompt_tokens
    if max_length is not None and max_length < 0:
        max_length = self.max_input_tokens

    return dict(
        truncation=self.truncate_prompt_tokens is not None,
        max_length=max_length,
        add_special_tokens=self.add_special_tokens,
    )

with_kwargs

with_kwargs(tokenization_kwargs: dict[str, Any] | None)
Source code in vllm/renderers/params.py
def with_kwargs(self, tokenization_kwargs: dict[str, Any] | None):
    if tokenization_kwargs is None:
        tokenization_kwargs = {}

    max_length = tokenization_kwargs.pop("max_length", self.max_input_tokens)
    pad_prompt_tokens = tokenization_kwargs.pop(
        "pad_prompt_tokens", self.pad_prompt_tokens
    )
    truncate_prompt_tokens = tokenization_kwargs.pop(
        "truncate_prompt_tokens", self.truncate_prompt_tokens
    )
    do_lower_case = tokenization_kwargs.pop("do_lower_case", self.do_lower_case)
    add_special_tokens = tokenization_kwargs.pop(
        "add_special_tokens", self.add_special_tokens
    )
    needs_detokenization = tokenization_kwargs.pop(
        "needs_detokenization", self.needs_detokenization
    )

    # https://huggingface.co/docs/transformers/en/pad_truncation
    if padding := tokenization_kwargs.pop("padding", None):
        if padding == "max_length":
            pad_prompt_tokens = max_length
        elif padding in (False, "do_not_pad"):
            pad_prompt_tokens = None
        else:
            # To emit the below warning
            tokenization_kwargs["padding"] = padding

    if truncation := tokenization_kwargs.pop("truncation", None):
        if truncation in (True, "longest_first"):
            truncate_prompt_tokens = max_length
        elif truncation in (False, "do_not_truncate"):
            truncate_prompt_tokens = None
        else:
            # To emit the below warning
            tokenization_kwargs["truncation"] = truncation

    if tokenization_kwargs:
        logger.warning(
            "The following tokenization arguments are not supported "
            "by vLLM Renderer and will be ignored: %s",
            tokenization_kwargs,
        )

    max_total_tokens = self.max_total_tokens

    return TokenizeParams(
        max_total_tokens=max_total_tokens,
        max_output_tokens=(
            0
            if max_total_tokens is None or max_length is None
            else max_total_tokens - max_length
        ),
        pad_prompt_tokens=pad_prompt_tokens,
        truncate_prompt_tokens=truncate_prompt_tokens,
        do_lower_case=do_lower_case,
        add_special_tokens=add_special_tokens,
        needs_detokenization=needs_detokenization,
    )
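
A sketch of how Hugging Face-style tokenization kwargs map onto these fields (the numbers are placeholders):

from vllm.renderers import TokenizeParams

base = TokenizeParams(max_total_tokens=512)

derived = base.with_kwargs({
    "truncation": True,        # -> truncate_prompt_tokens = max_length
    "padding": "max_length",   # -> pad_prompt_tokens = max_length
    "max_length": 128,
    "add_special_tokens": False,
})
# derived.truncate_prompt_tokens == 128
# derived.pad_prompt_tokens == 128
# derived.max_output_tokens == 512 - 128 == 384
# Unrecognized keys would be dropped with a warning instead of being forwarded.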

merge_kwargs

merge_kwargs(
    defaults: dict[str, Any] | None,
    overrides: dict[str, Any] | None,
    /,
    *,
    unset_values: tuple[object, ...] = (None, "auto"),
) -> dict[str, Any]
Source code in vllm/renderers/params.py
def merge_kwargs(
    defaults: dict[str, Any] | None,
    overrides: dict[str, Any] | None,
    /,
    *,
    unset_values: tuple[object, ...] = (None, "auto"),
) -> dict[str, Any]:
    if defaults is None:
        defaults = {}
    if overrides is None:
        overrides = {}

    return defaults | {k: v for k, v in overrides.items() if v not in unset_values}
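
For instance, override values of None or "auto" are treated as unset and do not clobber the defaults (the keys below are purely illustrative):

from vllm.renderers import merge_kwargs

merge_kwargs(
    {"temperature": 0.7, "format": "json"},
    {"temperature": None, "format": "auto", "top_p": 0.9},
)
# -> {"temperature": 0.7, "format": "json", "top_p": 0.9}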

renderer_from_config

renderer_from_config(config: ModelConfig, **kwargs)
Source code in vllm/renderers/registry.py
def renderer_from_config(config: "ModelConfig", **kwargs):
    tokenizer_mode, tokenizer_name, args, kwargs = tokenizer_args_from_config(
        config, **kwargs
    )

    if config.tokenizer_mode == "auto" and config.model_impl == "terratorch":
        renderer_mode = "terratorch"
    else:
        renderer_mode = tokenizer_mode

    return RENDERER_REGISTRY.load_renderer(
        renderer_mode,
        config,
        tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
    )
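
In typical use, the serving layer builds the renderer from the engine's ModelConfig. A hedged sketch, where model_config stands in for an existing vLLM ModelConfig instance obtained elsewhere:

from vllm.renderers import TokenizeParams, renderer_from_config

# model_config is a placeholder for an existing ModelConfig.
renderer = renderer_from_config(model_config)

prompts = renderer.render_completions(prompt_input="Hello, my name is")
tokenized = renderer.tokenize_prompts(
    prompts,
    TokenizeParams(max_total_tokens=model_config.max_model_len),
)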