Skip to content

vllm.parser.abstract_parser

DelegatingParser

Bases: Parser

A Parser implementation that delegates to separate ReasoningParser and ToolParser instances.

This is the recommended base class for creating model-specific parsers that combine existing reasoning and tool parser implementations. Subclasses should set self._reasoning_parser and self._tool_parser in their __init__ method.

If either parser is None, the corresponding methods will return default values (no reasoning extraction, no tool calls).

Source code in vllm/parser/abstract_parser.py
class DelegatingParser(Parser):
    """
    Parser that forwards its work to a ReasoningParser and a ToolParser.

    Model-specific parsers that want to reuse existing reasoning and tool
    parser implementations are expected to derive from this class and assign
    `self._reasoning_parser` and `self._tool_parser` inside their `__init__`.

    Either delegate may be None. In that case the matching methods skip the
    delegation and hand back a default result (no reasoning extracted, no
    tool calls reported).
    """

    # NOTE(review): as listed here, only the four extract_* methods are
    # overridden; `is_reasoning_end`, `extract_content_ids` and
    # `adjust_request` are inherited from Parser unchanged - confirm that
    # this is intended.

    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """
        Split a complete model output into reasoning and content.

        Args:
            model_output: The complete model-generated string.
            request: The request object used to generate the output.

        Returns:
            The reasoning parser's result when one is set; otherwise
            `(None, model_output)`.
        """
        delegate = self._reasoning_parser
        if delegate is not None:
            return delegate.extract_reasoning(model_output, request)
        # Without a reasoning parser the whole output is plain content.
        return None, model_output

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Process one streaming delta for reasoning content.

        Returns:
            The reasoning parser's result when one is set; otherwise a
            DeltaMessage whose `content` is `delta_text`.
        """
        delegate = self._reasoning_parser
        if delegate is not None:
            # Arguments are forwarded positionally, in the received order.
            return delegate.extract_reasoning_streaming(
                previous_text,
                current_text,
                delta_text,
                previous_token_ids,
                current_token_ids,
                delta_token_ids,
            )
        return DeltaMessage(content=delta_text)

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract tool calls from a complete model output.

        Args:
            model_output: The complete model-generated string.
            request: The request object used to generate the output.

        Returns:
            The tool parser's result when one is set; otherwise an
            ExtractedToolCallInformation reporting no tool calls and
            carrying `model_output` as its content.
        """
        delegate = self._tool_parser
        if delegate is not None:
            return delegate.extract_tool_calls(model_output, request)
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Process one streaming delta for tool calls.

        Returns:
            The tool parser's result when one is set; otherwise None.
        """
        delegate = self._tool_parser
        if delegate is not None:
            # Arguments are forwarded positionally, in the received order.
            return delegate.extract_tool_calls_streaming(
                previous_text,
                current_text,
                delta_text,
                previous_token_ids,
                current_token_ids,
                delta_token_ids,
                request,
            )
        return None

extract_reasoning

extract_reasoning(
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]
Source code in vllm/parser/abstract_parser.py
def extract_reasoning(
    self,
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
    """
    Split a complete model output into reasoning and content.

    Args:
        model_output: The complete model-generated string.
        request: The request object used to generate the output.

    Returns:
        The reasoning parser's result when one is set; otherwise
        `(None, model_output)`.
    """
    delegate = self._reasoning_parser
    if delegate is not None:
        return delegate.extract_reasoning(model_output, request)
    # Without a reasoning parser the whole output is plain content.
    return None, model_output

extract_reasoning_streaming

extract_reasoning_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None
Source code in vllm/parser/abstract_parser.py
def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    """
    Process one streaming delta for reasoning content.

    Returns:
        The reasoning parser's result when one is set; otherwise a
        DeltaMessage whose `content` is `delta_text`.
    """
    delegate = self._reasoning_parser
    if delegate is not None:
        # Arguments are forwarded positionally, in the received order.
        return delegate.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
    return DeltaMessage(content=delta_text)

extract_tool_calls

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation
Source code in vllm/parser/abstract_parser.py
def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """
    Extract tool calls from a complete model output.

    Args:
        model_output: The complete model-generated string.
        request: The request object used to generate the output.

    Returns:
        The tool parser's result when one is set; otherwise an
        ExtractedToolCallInformation reporting no tool calls and carrying
        `model_output` as its content.
    """
    delegate = self._tool_parser
    if delegate is not None:
        return delegate.extract_tool_calls(model_output, request)
    return ExtractedToolCallInformation(
        tools_called=False, tool_calls=[], content=model_output
    )

extract_tool_calls_streaming

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> DeltaMessage | None
Source code in vllm/parser/abstract_parser.py
def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> DeltaMessage | None:
    """
    Process one streaming delta for tool calls.

    Returns:
        The tool parser's result when one is set; otherwise None.
    """
    delegate = self._tool_parser
    if delegate is not None:
        # Arguments are forwarded positionally, in the received order.
        return delegate.extract_tool_calls_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
            request,
        )
    return None

Parser

Abstract Parser class that unifies ReasoningParser and ToolParser into a single interface for parsing model output.

This class provides a unified way to handle both reasoning extraction (e.g., chain-of-thought content in `<think>` tags) and tool call extraction (e.g., function calls in XML/JSON format) from model outputs.

Subclasses can either:

1. Override the abstract methods directly for custom parsing logic, or
2. Set the reasoning_parser and tool_parser properties to delegate to existing parser implementations.

Class Attributes

reasoning_parser_cls: The ReasoningParser class to use (for compatibility with code that needs the class, not an instance).

tool_parser_cls: The ToolParser class to use (for compatibility with code that needs the class, not an instance).

Source code in vllm/parser/abstract_parser.py
class Parser:
    """
    Single interface over reasoning extraction and tool-call extraction.

    A Parser combines what a ReasoningParser and a ToolParser do, so that
    model output can be processed for reasoning content (e.g.,
    chain-of-thought content in <think> tags) and for tool calls (e.g.,
    function calls in XML/JSON format) through one object.

    A subclass has two options:
    1. Implement the methods marked abstract with its own parsing logic.
    2. Assign the `reasoning_parser` and `tool_parser` properties in order
       to delegate to existing parser implementations.

    Class Attributes:
        reasoning_parser_cls: The ReasoningParser class (not an instance),
            kept for code that needs the class itself.
        tool_parser_cls: The ToolParser class (not an instance), kept for
            code that needs the class itself.
    """

    # NOTE(review): the class statement lists no base class, so unless a
    # metaclass is applied elsewhere the @abstractmethod markers below are
    # not enforced at instantiation time - confirm whether that is intended.

    # Parser classes exposed at class level for compatibility with existing
    # patterns; subclasses that use specific parser classes override these.
    reasoning_parser_cls: type[ReasoningParser] | None = None
    tool_parser_cls: type[ToolParser] | None = None

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        """
        Set up the parser state.

        Args:
            tokenizer: The tokenizer used by the model; needed for
                token-based parsing operations.
            *args: Accepted but not used by this initializer.
            **kwargs: Accepted but not used by this initializer.
        """
        # Both delegates start unset; subclasses or callers fill them in.
        self._reasoning_parser: ReasoningParser | None = None
        self._tool_parser: ToolParser | None = None
        self.model_tokenizer = tokenizer

    @cached_property
    def vocab(self) -> dict[str, int]:
        """Token-to-ID mapping obtained from the tokenizer's `get_vocab()`."""
        tokenizer = self.model_tokenizer
        return tokenizer.get_vocab()

    @property
    def reasoning_parser(self) -> ReasoningParser | None:
        """The reasoning parser currently attached, or None."""
        return self._reasoning_parser

    @reasoning_parser.setter
    def reasoning_parser(self, parser: ReasoningParser | None) -> None:
        self._reasoning_parser = parser

    @property
    def tool_parser(self) -> ToolParser | None:
        """The tool parser currently attached, or None."""
        return self._tool_parser

    @tool_parser.setter
    def tool_parser(self, parser: ToolParser | None) -> None:
        self._tool_parser = parser

    # ---------- Reasoning parser interface ----------

    @abstractmethod
    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """
        Report whether the reasoning content ends within `input_ids`.

        Structured engines such as `xgrammar` use this to find out whether
        the reasoning part of the model output has finished.

        Args:
            input_ids: The token IDs of the model output.

        Returns:
            True if the reasoning content ends in the input_ids.
        """

    def is_reasoning_end_streaming(
        self, input_ids: list[int], delta_ids: list[int]
    ) -> bool:
        """
        Report whether the reasoning content ends during a decode step.

        This default implementation does not look at `delta_ids`; it
        returns whatever `is_reasoning_end(input_ids)` returns.

        Args:
            input_ids: The entire model output token IDs.
            delta_ids: The last few computed tokens at the current decode
                step.

        Returns:
            The result of `is_reasoning_end` for `input_ids`.
        """
        ended = self.is_reasoning_end(input_ids)
        return ended

    @abstractmethod
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Pull the content token IDs out of `input_ids`.

        The content is the non-reasoning part of the output (e.g.,
        everything after the </think> tag).

        Args:
            input_ids: The token IDs of the model output.

        Returns:
            The extracted content token IDs.
        """

    @abstractmethod
    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """
        Pull the reasoning content out of a complete model output.

        Meant for non-streaming responses, where the entire model response
        is available before anything is sent to the client.

        Args:
            model_output: The complete model-generated string.
            request: The request object used to generate the output.

        Returns:
            A tuple of (reasoning_content, response_content).
        """

    @abstractmethod
    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Pull the reasoning content out of one streaming delta.

        Args:
            previous_text: Text from all previous tokens.
            current_text: Text including the current delta.
            delta_text: The new text in this delta.
            previous_token_ids: Token IDs from previous generation.
            current_token_ids: All token IDs including current.
            delta_token_ids: The new token IDs in this delta.

        Returns:
            A DeltaMessage with reasoning and/or content fields, or None.
        """

    # ---------- Tool parser interface ----------

    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """
        Hook for adapting request parameters to tool calling.

        The base implementation returns `request` as received. Subclasses
        may override it to modify request parameters (e.g., setting
        structured output schemas for tool calling).

        Args:
            request: The original request.

        Returns:
            The adjusted request.
        """
        return request

    @abstractmethod
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Pull the tool calls out of a complete model output.

        Meant for non-streaming responses.

        Args:
            model_output: The complete model-generated string.
            request: The request object used to generate the output.

        Returns:
            ExtractedToolCallInformation containing the tool calls.
        """

    @abstractmethod
    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Pull the tool calls out of one streaming delta.

        Args:
            previous_text: Text from all previous tokens.
            current_text: Text including the current delta.
            delta_text: The new text in this delta.
            previous_token_ids: Token IDs from previous generation.
            current_token_ids: All token IDs including current.
            delta_token_ids: The new token IDs in this delta.
            request: The request object.

        Returns:
            A DeltaMessage with tool_calls field, or None.
        """

_reasoning_parser instance-attribute

_reasoning_parser: ReasoningParser | None = None

_tool_parser instance-attribute

_tool_parser: ToolParser | None = None

model_tokenizer instance-attribute

model_tokenizer = tokenizer

reasoning_parser property writable

reasoning_parser: ReasoningParser | None

The underlying reasoning parser, if any.

reasoning_parser_cls class-attribute instance-attribute

reasoning_parser_cls: type[ReasoningParser] | None = None

tool_parser property writable

tool_parser: ToolParser | None

The underlying tool parser, if any.

tool_parser_cls class-attribute instance-attribute

tool_parser_cls: type[ToolParser] | None = None

vocab cached property

vocab: dict[str, int]

Get the vocabulary mapping from tokens to IDs.

__init__

__init__(tokenizer: TokenizerLike, *args, **kwargs)

Initialize the Parser.

Parameters:

Name Type Description Default
tokenizer TokenizerLike

The tokenizer used by the model. This is required for token-based parsing operations.

required
Source code in vllm/parser/abstract_parser.py
def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
    """
    Set up the parser state.

    Args:
        tokenizer: The tokenizer used by the model; needed for token-based
            parsing operations.
        *args: Accepted but not used by this initializer.
        **kwargs: Accepted but not used by this initializer.
    """
    # Both delegates start unset; subclasses or callers fill them in.
    self._reasoning_parser: ReasoningParser | None = None
    self._tool_parser: ToolParser | None = None
    self.model_tokenizer = tokenizer

adjust_request

adjust_request(
    request: ChatCompletionRequest,
) -> ChatCompletionRequest

Adjust the request parameters for tool calling.

Can be overridden by subclasses to modify request parameters (e.g., setting structured output schemas for tool calling).

Parameters:

Name Type Description Default
request ChatCompletionRequest

The original request.

required

Returns:

Type Description
ChatCompletionRequest

The adjusted request.

Source code in vllm/parser/abstract_parser.py
def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
    """
    Hook for adapting request parameters to tool calling.

    The base implementation returns `request` as received. Subclasses may
    override it to modify request parameters (e.g., setting structured
    output schemas for tool calling).

    Args:
        request: The original request.

    Returns:
        The adjusted request.
    """
    return request

extract_content_ids abstractmethod

extract_content_ids(input_ids: list[int]) -> list[int]

Extract content token IDs from the input_ids.

This extracts the non-reasoning content (e.g., everything after the `</think>` tag).

Parameters:

Name Type Description Default
input_ids list[int]

The token IDs of the model output.

required

Returns:

Type Description
list[int]

The extracted content token IDs.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
    """
    Pull the content token IDs out of `input_ids`.

    The content is the non-reasoning part of the output (e.g., everything
    after the </think> tag).

    Args:
        input_ids: The token IDs of the model output.

    Returns:
        The extracted content token IDs.
    """

extract_reasoning abstractmethod

extract_reasoning(
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]

Extract reasoning content from a complete model-generated string.

Used for non-streaming responses where we have the entire model response available before sending to the client.

Parameters:

Name Type Description Default
model_output str

The complete model-generated string.

required
request ChatCompletionRequest | ResponsesRequest

The request object used to generate the output.

required

Returns:

Type Description
tuple[str | None, str | None]

A tuple of (reasoning_content, response_content).

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_reasoning(
    self,
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
    """
    Pull the reasoning content out of a complete model output.

    Meant for non-streaming responses, where the entire model response is
    available before anything is sent to the client.

    Args:
        model_output: The complete model-generated string.
        request: The request object used to generate the output.

    Returns:
        A tuple of (reasoning_content, response_content).
    """

extract_reasoning_streaming abstractmethod

extract_reasoning_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None

Extract reasoning content from a streaming delta message.

Parameters:

Name Type Description Default
previous_text str

Text from all previous tokens.

required
current_text str

Text including the current delta.

required
delta_text str

The new text in this delta.

required
previous_token_ids Sequence[int]

Token IDs from previous generation.

required
current_token_ids Sequence[int]

All token IDs including current.

required
delta_token_ids Sequence[int]

The new token IDs in this delta.

required

Returns:

Type Description
DeltaMessage | None

A DeltaMessage with reasoning and/or content fields, or None.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    """
    Pull the reasoning content out of one streaming delta.

    Args:
        previous_text: Text from all previous tokens.
        current_text: Text including the current delta.
        delta_text: The new text in this delta.
        previous_token_ids: Token IDs from previous generation.
        current_token_ids: All token IDs including current.
        delta_token_ids: The new token IDs in this delta.

    Returns:
        A DeltaMessage with reasoning and/or content fields, or None.
    """

extract_tool_calls abstractmethod

extract_tool_calls(
    model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation

Extract tool calls from a complete model-generated string.

Used for non-streaming responses.

Parameters:

Name Type Description Default
model_output str

The complete model-generated string.

required
request ChatCompletionRequest

The request object used to generate the output.

required

Returns:

Type Description
ExtractedToolCallInformation

ExtractedToolCallInformation containing the tool calls.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """
    Pull the tool calls out of a complete model output.

    Meant for non-streaming responses.

    Args:
        model_output: The complete model-generated string.
        request: The request object used to generate the output.

    Returns:
        ExtractedToolCallInformation containing the tool calls.
    """

extract_tool_calls_streaming abstractmethod

extract_tool_calls_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> DeltaMessage | None

Extract tool calls from a streaming delta message.

Parameters:

Name Type Description Default
previous_text str

Text from all previous tokens.

required
current_text str

Text including the current delta.

required
delta_text str

The new text in this delta.

required
previous_token_ids Sequence[int]

Token IDs from previous generation.

required
current_token_ids Sequence[int]

All token IDs including current.

required
delta_token_ids Sequence[int]

The new token IDs in this delta.

required
request ChatCompletionRequest

The request object.

required

Returns:

Type Description
DeltaMessage | None

A DeltaMessage with tool_calls field, or None.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> DeltaMessage | None:
    """
    Pull the tool calls out of one streaming delta.

    Args:
        previous_text: Text from all previous tokens.
        current_text: Text including the current delta.
        delta_text: The new text in this delta.
        previous_token_ids: Token IDs from previous generation.
        current_token_ids: All token IDs including current.
        delta_token_ids: The new token IDs in this delta.
        request: The request object.

    Returns:
        A DeltaMessage with tool_calls field, or None.
    """

is_reasoning_end abstractmethod

is_reasoning_end(input_ids: list[int]) -> bool

Check if the reasoning content ends in the input_ids.

Used by structured engines like xgrammar to check if the reasoning content ends in the model output.

Parameters:

Name Type Description Default
input_ids list[int]

The token IDs of the model output.

required

Returns:

Type Description
bool

True if the reasoning content ends in the input_ids.

Source code in vllm/parser/abstract_parser.py
@abstractmethod
def is_reasoning_end(self, input_ids: list[int]) -> bool:
    """
    Report whether the reasoning content ends within `input_ids`.

    Structured engines such as `xgrammar` use this to find out whether the
    reasoning part of the model output has finished.

    Args:
        input_ids: The token IDs of the model output.

    Returns:
        True if the reasoning content ends in the input_ids.
    """

is_reasoning_end_streaming

is_reasoning_end_streaming(
    input_ids: list[int], delta_ids: list[int]
) -> bool

Check if the reasoning content ends during a decode step.

Parameters:

Name Type Description Default
input_ids list[int]

The entire model output token IDs.

required
delta_ids list[int]

The last few computed tokens at the current decode step.

required

Returns:

Type Description
bool

True if the reasoning content ends in the delta_ids.

Source code in vllm/parser/abstract_parser.py
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """
    Report whether the reasoning content ends during a decode step.

    This default implementation does not look at `delta_ids`; it returns
    whatever `is_reasoning_end(input_ids)` returns.

    Args:
        input_ids: The entire model output token IDs.
        delta_ids: The last few computed tokens at the current decode step.

    Returns:
        The result of `is_reasoning_end` for `input_ids`.
    """
    ended = self.is_reasoning_end(input_ids)
    return ended

_WrappedParser

Bases: DelegatingParser

A DelegatingParser subclass that instantiates parsers from class attributes.

This class is used to dynamically create a parser that wraps individual ReasoningParser and ToolParser classes. The class attributes reasoning_parser_cls and tool_parser_cls should be set before instantiation.

Usage

_WrappedParser.reasoning_parser_cls = MyReasoningParser
_WrappedParser.tool_parser_cls = MyToolParser
parser = _WrappedParser(tokenizer)

Source code in vllm/parser/abstract_parser.py
class _WrappedParser(DelegatingParser):
    """
    DelegatingParser whose delegates are built from class attributes.

    The class exists so that a parser wrapping individual ReasoningParser
    and ToolParser classes can be created dynamically. Assign the class
    attributes `reasoning_parser_cls` and `tool_parser_cls` before creating
    an instance; each one that is not None is instantiated with the
    tokenizer.

    Usage:
        _WrappedParser.reasoning_parser_cls = MyReasoningParser
        _WrappedParser.tool_parser_cls = MyToolParser
        parser = _WrappedParser(tokenizer)
    """

    reasoning_parser_cls: type[ReasoningParser] | None = None
    tool_parser_cls: type[ToolParser] | None = None

    def __init__(self, tokenizer: TokenizerLike):
        """
        Build the delegates configured on the class.

        Args:
            tokenizer: Passed to the parent initializer and to each
                configured parser class.
        """
        super().__init__(tokenizer)
        owner = self.__class__

        # A delegate is only created when its class attribute is set;
        # otherwise the value left by the parent initializer stays.
        reasoning_cls = owner.reasoning_parser_cls
        if reasoning_cls is not None:
            self._reasoning_parser = reasoning_cls(tokenizer)

        tool_cls = owner.tool_parser_cls
        if tool_cls is not None:
            self._tool_parser = tool_cls(tokenizer)

_reasoning_parser instance-attribute

_reasoning_parser = reasoning_parser_cls(tokenizer)

_tool_parser instance-attribute

_tool_parser = tool_parser_cls(tokenizer)

reasoning_parser_cls class-attribute instance-attribute

reasoning_parser_cls: type[ReasoningParser] | None = None

tool_parser_cls class-attribute instance-attribute

tool_parser_cls: type[ToolParser] | None = None

__init__

__init__(tokenizer: TokenizerLike)
Source code in vllm/parser/abstract_parser.py
def __init__(self, tokenizer: TokenizerLike):
    """
    Build the delegates configured on the class.

    Args:
        tokenizer: Passed to the parent initializer and to each configured
            parser class.
    """
    super().__init__(tokenizer)
    owner = self.__class__

    # A delegate is only created when its class attribute is set.
    reasoning_cls = owner.reasoning_parser_cls
    if reasoning_cls is not None:
        self._reasoning_parser = reasoning_cls(tokenizer)

    tool_cls = owner.tool_parser_cls
    if tool_cls is not None:
        self._tool_parser = tool_cls(tokenizer)