
vllm.reasoning.step3p5_reasoning_parser

Step3p5ReasoningParser

Bases: BaseThinkingReasoningParser

Reasoning parser for the Step3p5 model.

Step3p5 uses the <think>...</think> format, but it tends to emit an extra newline immediately before and/or after the </think> token. This parser trims:

- the newline right before </think>
- the newline right after </think>
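
For illustration, a raw completion carrying both extra newlines splits as follows (a hand-written sketch, not captured model output):

raw = "<think>Let me think.\n</think>\nThe answer is 4."
# reasoning -> "Let me think."       (the "\n" before </think> is dropped)
# content   -> "The answer is 4."    (the "\n" after </think> is dropped)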

Source code in vllm/reasoning/step3p5_reasoning_parser.py
class Step3p5ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the Step3p5 model.

    Step3p5 uses the <think>...</think> format, but it tends to emit an extra
    newline immediately before and/or after the </think> token. This parser trims:
      - the newline right before </think>
      - the newline right after </think>
    """

    @property
    def start_token(self) -> str:
        return "<think>"

    @property
    def end_token(self) -> str:
        return "</think>"

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        # Used to hold a trailing "\n" from reasoning content so we can decide
        # whether it is immediately before </think>.
        self._pending_reasoning_newline = False

        # Used to delay reasoning-end detection.
        # This is necessary to remove the newline that appears immediately
        # after </think>, at the cost of delaying end detection by one round.
        self.end_offset = 1

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        if self.end_token_id in input_ids and self.end_offset > 0:
            self.end_offset -= 1
            return False
        return self.end_offset < 1

    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Sequence[int]
    ) -> bool:
        if self.end_token_id in input_ids and self.end_offset > 0:
            self.end_offset -= 1
            return False
        return self.end_offset < 1

    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        reasoning, content = super().extract_reasoning(model_output, request)
        if reasoning is not None:
            reasoning = reasoning.removesuffix("\n")
        if content is not None:
            content = content.removeprefix("\n")
        return reasoning or None, content or None

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        # Drop the immediate newline that models often emit after </think>.
        if previous_text.endswith(self.end_token) and delta_text:
            if delta_text == "\n":
                return None
            elif delta_text.startswith("\n"):
                remaining = delta_text.removeprefix("\n")
                return DeltaMessage(content=remaining) if remaining else None

        ret = super().extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )

        if ret is None:
            return None

        # Compatibility path for models that don't generate the start token:
        # treat everything before </think> as reasoning and everything after
        # as content.
        if (
            self.start_token_id not in previous_token_ids
            and self.start_token_id not in delta_token_ids
        ):
            if self.end_token_id in delta_token_ids:
                end_index = delta_text.find(self.end_token)
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                ret = DeltaMessage(reasoning=reasoning, content=content or None)
            elif self.end_token_id in previous_token_ids:
                ret = DeltaMessage(content=delta_text)
            else:
                ret = DeltaMessage(reasoning=delta_text)

        reasoning_to_output = ret.reasoning
        content_to_output = ret.content

        # Reasoning: handle the newline immediately before </think>.
        if reasoning_to_output is not None:
            if self._pending_reasoning_newline:
                reasoning_to_output = "\n" + reasoning_to_output
                self._pending_reasoning_newline = False

            if reasoning_to_output.endswith("\n"):
                reasoning_to_output = reasoning_to_output.removesuffix("\n")
                if self.end_token in delta_text:
                    # Trailing "\n" is right before </think>, drop it.
                    self._pending_reasoning_newline = False
                else:
                    # Hold the trailing "\n" until we know whether </think> follows.
                    self._pending_reasoning_newline = True

        # Content: handle the newline immediately after </think>.
        if content_to_output is not None:
            # The newline after </think> (if any) was handled in this delta,
            # so stop delaying end detection in is_reasoning_end.
            self.end_offset -= 1

            # If we have content, reasoning must have ended.
            self._pending_reasoning_newline = False

            if self.end_token in delta_text and content_to_output.startswith("\n"):
                content_to_output = content_to_output.removeprefix("\n")

        reasoning_to_output = reasoning_to_output or None
        content_to_output = content_to_output or None
        if reasoning_to_output is None and content_to_output is None:
            return None

        return DeltaMessage(reasoning=reasoning_to_output, content=content_to_output)

_pending_reasoning_newline instance-attribute

_pending_reasoning_newline = False

end_offset instance-attribute

end_offset = 1

end_token property

end_token: str

start_token property

start_token: str

__init__

__init__(tokenizer: TokenizerLike, *args, **kwargs)
Source code in vllm/reasoning/step3p5_reasoning_parser.py
def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
    super().__init__(tokenizer, *args, **kwargs)

    # Used to hold a trailing "\n" from reasoning content so we can decide
    # whether it is immediately before </think>.
    self._pending_reasoning_newline = False

    # Used to delay reasoning-end detection.
    # This is necessary to remove the newline that appears immediately
    # after </think>, at the cost of delaying end detection by one round.
    self.end_offset = 1

extract_reasoning

extract_reasoning(
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]
Source code in vllm/reasoning/step3p5_reasoning_parser.py
def extract_reasoning(
    self,
    model_output: str,
    request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
    reasoning, content = super().extract_reasoning(model_output, request)
    if reasoning is not None:
        reasoning = reasoning.removesuffix("\n")
    if content is not None:
        content = content.removeprefix("\n")
    return reasoning or None, content or None
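
A minimal non-streaming usage sketch. The checkpoint name and the request object are placeholders, not names confirmed by this module: any tokenizer that defines the <think>/</think> tokens should work, and the request is only forwarded to the base class here.

from transformers import AutoTokenizer

from vllm.reasoning.step3p5_reasoning_parser import Step3p5ReasoningParser

# "step3p5-checkpoint" is a placeholder; substitute the real Step3p5 repo id.
# The tokenizer must know the <think> and </think> tokens.
tokenizer = AutoTokenizer.from_pretrained("step3p5-checkpoint")
parser = Step3p5ReasoningParser(tokenizer)

# `request` stands in for the incoming ChatCompletionRequest; the code
# above only passes it through to the base class.
raw = "<think>Add 2 and 2.\n</think>\nThe answer is 4."
reasoning, content = parser.extract_reasoning(raw, request)
assert reasoning == "Add 2 and 2."      # trailing "\n" trimmed
assert content == "The answer is 4."    # leading "\n" trimmed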

extract_reasoning_streaming

extract_reasoning_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None
Source code in vllm/reasoning/step3p5_reasoning_parser.py
def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    # Drop the immediate newline that models often emit after </think>.
    if previous_text.endswith(self.end_token) and delta_text:
        if delta_text == "\n":
            return None
        elif delta_text.startswith("\n"):
            remaining = delta_text.removeprefix("\n")
            return DeltaMessage(content=remaining) if remaining else None

    ret = super().extract_reasoning_streaming(
        previous_text,
        current_text,
        delta_text,
        previous_token_ids,
        current_token_ids,
        delta_token_ids,
    )

    if ret is None:
        return None

    # Compatibility path for models that don't generate the start token:
    # treat everything before </think> as reasoning and everything after
    # as content.
    if (
        self.start_token_id not in previous_token_ids
        and self.start_token_id not in delta_token_ids
    ):
        if self.end_token_id in delta_token_ids:
            end_index = delta_text.find(self.end_token)
            reasoning = delta_text[:end_index]
            content = delta_text[end_index + len(self.end_token) :]
            ret = DeltaMessage(reasoning=reasoning, content=content or None)
        elif self.end_token_id in previous_token_ids:
            ret = DeltaMessage(content=delta_text)
        else:
            ret = DeltaMessage(reasoning=delta_text)

    reasoning_to_output = ret.reasoning
    content_to_output = ret.content

    # Reasoning: handle the newline immediately before </think>.
    if reasoning_to_output is not None:
        if self._pending_reasoning_newline:
            reasoning_to_output = "\n" + reasoning_to_output
            self._pending_reasoning_newline = False

        if reasoning_to_output.endswith("\n"):
            reasoning_to_output = reasoning_to_output.removesuffix("\n")
            if self.end_token in delta_text:
                # Trailing "\n" is right before </think>, drop it.
                self._pending_reasoning_newline = False
            else:
                # Hold the trailing "\n" until we know whether </think> follows.
                self._pending_reasoning_newline = True

    # Content: handle the newline immediately after </think>.
    if content_to_output is not None:
        # The newline after </think> (if any) was handled in this delta,
        # so stop delaying end detection in is_reasoning_end.
        self.end_offset -= 1

        # If we have content, reasoning must have ended.
        self._pending_reasoning_newline = False

        if self.end_token in delta_text and content_to_output.startswith("\n"):
            content_to_output = content_to_output.removeprefix("\n")

    reasoning_to_output = reasoning_to_output or None
    content_to_output = content_to_output or None
    if reasoning_to_output is None and content_to_output is None:
        return None

    return DeltaMessage(reasoning=reasoning_to_output, content=content_to_output)
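
A hedged sketch of driving the streaming path chunk by chunk, reusing the parser and tokenizer from the sketch above. The chunk boundaries are illustrative (real deltas depend on tokenization and sampling), and the expected output assumes the base class splits reasoning and content at the end token as its name suggests.

chunks = ["<think>", "Let me think.", "\n", "</think>", "\n", "Answer."]
prev_text, prev_ids = "", []
for chunk in chunks:
    delta_ids = tokenizer.encode(chunk, add_special_tokens=False)
    cur_text, cur_ids = prev_text + chunk, prev_ids + delta_ids
    delta = parser.extract_reasoning_streaming(
        prev_text, cur_text, chunk, prev_ids, cur_ids, delta_ids
    )
    if delta is not None:
        print(repr(delta.reasoning), repr(delta.content))
    prev_text, prev_ids = cur_text, cur_ids

# The lone "\n" before </think> is held back and then discarded once
# content starts, and the lone "\n" after </think> is swallowed, so the
# concatenated stream should be reasoning "Let me think." and content
# "Answer.".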

is_reasoning_end

is_reasoning_end(input_ids: Sequence[int]) -> bool
Source code in vllm/reasoning/step3p5_reasoning_parser.py
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
    if self.end_token_id in input_ids and self.end_offset > 0:
        self.end_offset -= 1
        return False
    return self.end_offset < 1

is_reasoning_end_streaming

is_reasoning_end_streaming(
    input_ids: Sequence[int], delta_ids: Sequence[int]
) -> bool
Source code in vllm/reasoning/step3p5_reasoning_parser.py
def is_reasoning_end_streaming(
    self, input_ids: Sequence[int], delta_ids: Sequence[int]
) -> bool:
    if self.end_token_id in input_ids and self.end_offset > 0:
        self.end_offset -= 1
        return False
    return self.end_offset < 1
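
The one-round delay in practice, with placeholder token ids (END stands for the id of </think>; the other ids are arbitrary). A freshly constructed parser is assumed, since end_offset is per-instance state that starts at 1.

END = parser.end_token_id

parser.is_reasoning_end([1, 2, END])      # False: end seen, but delayed one round
parser.is_reasoning_end([1, 2, END, 3])   # True: end_offset is now exhausted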