
vllm.reasoning.olmo3_reasoning_parser

logger module-attribute

logger = init_logger(__name__)

Indices dataclass

Source code in vllm/reasoning/olmo3_reasoning_parser.py
@dt.dataclass(frozen=True)
class Indices:
    start: int
    end: int

    def __len__(self):
        return self.end - self.start
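
For example (a standalone sketch), an Indices span is half-open, so its length is end - start:

span = Indices(start=3, end=8)
assert len(span) == 5  # covers positions 3..7, i.e. [start, end)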

end instance-attribute

end: int

start instance-attribute

start: int

__init__

__init__(start: int, end: int) -> None

__len__

__len__()
Source code in vllm/reasoning/olmo3_reasoning_parser.py
def __len__(self):
    return self.end - self.start

Olmo3ReasoningBuffer dataclass

Source code in vllm/reasoning/olmo3_reasoning_parser.py
@dt.dataclass
class Olmo3ReasoningBuffer:
    think_start: str = "<think>"
    think_end: str = "</think>"
    buffer: str = ""

    # we start in reasoning state to support cases where we hardcode
    # <think> as the start of the reasoning block.
    # In those cases, the only token we will see is </think>, which
    # is when we switch to content state.
    state: Olmo3ReasoningState = Olmo3ReasoningState.REASONING

    def process_buffer(self) -> Optional[DeltaMessage]:
        start_think_idx = self.buffer.find(self.think_start)

        if start_think_idx >= 0:
            self.state = Olmo3ReasoningState.REASONING
            pretext, self.buffer = (
                self.buffer[:start_think_idx],
                self.buffer[start_think_idx + len(self.think_start) :],
            )
            if start_think_idx > 0:
                # this covers the case where there's content
                # before the start of the reasoning block
                return DeltaMessage(content=pretext)

        end_think_idx = self.buffer.rfind(self.think_end)

        if end_think_idx >= 0:
            self.state = Olmo3ReasoningState.CONTENT
            pretext, self.buffer = (
                self.buffer[:end_think_idx],
                self.buffer[end_think_idx + len(self.think_end) :],
            )
            if end_think_idx > 0:
                # this covers the case where there's content
                # before the end of the reasoning block
                return DeltaMessage(reasoning_content=pretext)

        if self.state == Olmo3ReasoningState.REASONING:
            # we are inside the reasoning block: return the
            # buffered text and empty the buffer
            text_buffer, self.buffer = self.buffer, ""
            return DeltaMessage(reasoning_content=text_buffer)

        if self.state == Olmo3ReasoningState.CONTENT:
            # we are outside the reasoning block: return the
            # buffered text and empty the buffer
            text_buffer, self.buffer = self.buffer, ""
            return DeltaMessage(content=text_buffer)

        # nothing to return unless we are in reasoning or content state
        return None

    def __len__(self):
        # the length of the text buffer
        return len(self.buffer)

    def add_text(self, delta_text: str) -> Optional[DeltaMessage]:
        # we start by adding the delta text to the buffer
        self.buffer += delta_text

        # no delta message to emit yet
        delta_message: Optional[DeltaMessage] = None

        # next, compute the overlap between the delta_text
        # and the start/end think tokens.
        _, overlap_think_start = string_overlap(delta_text, self.think_start)
        _, overlap_think_end = string_overlap(delta_text, self.think_end)

        partial_overlap_start = overlap_think_start is not None and len(
            overlap_think_start
        ) < len(self.think_start)
        partial_overlap_end = overlap_think_end is not None and len(
            overlap_think_end
        ) < len(self.think_end)

        if (
            partial_overlap_start
            and self.think_start in self.buffer
            and not partial_overlap_end
        ):
            # process the buffer only if, despite the partial
            # overlap, the buffer already contains a complete
            # start think token, and there is no partial overlap
            # with the end think token
            delta_message = self.process_buffer()

        elif partial_overlap_end and self.think_end in self.buffer:
            # same as above: a partial overlap is only allowed
            # if the buffer already contains a complete end think
            # token; no need to check for partial overlap with
            # the start think token, since that case is handled
            # by the previous condition
            delta_message = self.process_buffer()

        elif partial_overlap_start or partial_overlap_end:
            # in general, if there are overlaps, we don't
            # process the buffer because we want to wait until
            # the think token is fully completed.
            return None
        else:
            # we process the buffer as normal
            delta_message = self.process_buffer()

        return delta_message
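
A minimal streaming sketch with a hypothetical chunking of the model output; it assumes the buffer's dependencies (DeltaMessage, Olmo3ReasoningState, string_overlap) resolve as they do inside vllm/reasoning/olmo3_reasoning_parser.py:

buf = Olmo3ReasoningBuffer()  # starts in the REASONING state

# deliberately split "</think>" across two deltas
for chunk in ["plan the answer", "</th", "ink>", "final answer"]:
    print(buf.add_text(chunk))
# 1. DeltaMessage(reasoning_content="plan the answer")
# 2. None: "</th" partially overlaps "</think>", so the buffer waits
# 3. DeltaMessage(content=""): the tag completes and state flips to CONTENT
# 4. DeltaMessage(content="final answer")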

buffer class-attribute instance-attribute

buffer: str = ''

state class-attribute instance-attribute

state: Olmo3ReasoningState = REASONING

think_end class-attribute instance-attribute

think_end: str = '</think>'

think_start class-attribute instance-attribute

think_start: str = '<think>'

__init__

__init__(
    think_start: str = "<think>",
    think_end: str = "</think>",
    buffer: str = "",
    state: Olmo3ReasoningState = REASONING,
) -> None

__len__

__len__()
Source code in vllm/reasoning/olmo3_reasoning_parser.py
def __len__(self):
    # the length of the text buffer
    return len(self.buffer)

add_text

add_text(delta_text: str) -> Optional[DeltaMessage]
Source code in vllm/reasoning/olmo3_reasoning_parser.py
def add_text(self, delta_text: str) -> Optional[DeltaMessage]:
    # we start by adding the delta text to the buffer
    self.buffer += delta_text

    # no delta message to emit yet
    delta_message: Optional[DeltaMessage] = None

    # next, compute the overlap between the delta_text
    # and the start/end think tokens.
    _, overlap_think_start = string_overlap(delta_text, self.think_start)
    _, overlap_think_end = string_overlap(delta_text, self.think_end)

    partial_overlap_start = overlap_think_start is not None and len(
        overlap_think_start
    ) < len(self.think_start)
    partial_overlap_end = overlap_think_end is not None and len(
        overlap_think_end
    ) < len(self.think_end)

    if (
        partial_overlap_start
        and self.think_start in self.buffer
        and not partial_overlap_end
    ):
        # process the buffer only if, despite the partial
        # overlap, the buffer already contains a complete
        # start think token, and there is no partial overlap
        # with the end think token
        delta_message = self.process_buffer()

    elif partial_overlap_end and self.think_end in self.buffer:
        # same as above: a partial overlap is only allowed
        # if the buffer already contains a complete end think
        # token; no need to check for partial overlap with
        # the start think token, since that case is handled
        # by the previous condition
        delta_message = self.process_buffer()

    elif partial_overlap_start or partial_overlap_end:
        # in general, if there are overlaps, we don't
        # process the buffer because we want to wait until
        # the think token is fully completed.
        return None
    else:
        # we process the buffer as normal
        delta_message = self.process_buffer()

    return delta_message

process_buffer

process_buffer() -> Optional[DeltaMessage]
Source code in vllm/reasoning/olmo3_reasoning_parser.py
def process_buffer(self) -> Optional[DeltaMessage]:
    start_think_idx = self.buffer.find(self.think_start)

    if start_think_idx >= 0:
        self.state = Olmo3ReasoningState.REASONING
        pretext, self.buffer = (
            self.buffer[:start_think_idx],
            self.buffer[start_think_idx + len(self.think_start) :],
        )
        if start_think_idx > 0:
            # this covers the case where there's content
            # before the start of the reasoning block
            return DeltaMessage(content=pretext)

    end_think_idx = self.buffer.rfind(self.think_end)

    if end_think_idx >= 0:
        self.state = Olmo3ReasoningState.CONTENT
        pretext, self.buffer = (
            self.buffer[:end_think_idx],
            self.buffer[end_think_idx + len(self.think_end) :],
        )
        if end_think_idx > 0:
            # this covers the case where there's content
            # before the end of the reasoning block
            return DeltaMessage(reasoning_content=pretext)

    if self.state == Olmo3ReasoningState.REASONING:
        # we are inside the reasoning block: return the
        # buffered text and empty the buffer
        text_buffer, self.buffer = self.buffer, ""
        return DeltaMessage(reasoning_content=text_buffer)

    if self.state == Olmo3ReasoningState.CONTENT:
        # we are outside the reasoning block: return the
        # buffered text and empty the buffer
        text_buffer, self.buffer = self.buffer, ""
        return DeltaMessage(content=text_buffer)

    # nothing to return unless we are in reasoning or content state
    return None
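
For instance (a standalone sketch, fields as defined above), each call flushes at most one segment:

buf = Olmo3ReasoningBuffer(buffer="preamble<think>first thought")
print(buf.process_buffer())  # DeltaMessage(content="preamble")
print(buf.process_buffer())  # DeltaMessage(reasoning_content="first thought")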

Olmo3ReasoningParser

Bases: ReasoningParser

Reasoning parser for Olmo 3 models

This class implements a reasoning parser specifically designed for the Olmo 3 family of models. Olmo 3 models do not use special tokens to indicate reasoning; rather, the reasoning trace is wrapped in <think> and </think>, which are tokenized using standard vocabulary entries. Because of this, the parser operates in string space, accumulating characters in a buffer until it sees a <think> or </think> tag and switches modes accordingly.

Key Features
  • For non-streaming output, it recognizes and extracts reasoning (text bracketed by <think> and </think>) and content (everything after the first </think>), as shown in the sketch after this list.
  • For streaming, it uses a buffer to accumulate delta text and emits progressive delta messages as soon as thinking starts or ends.
  • For reliability with variants that hardcode the first <think> token in the input text (similar to DeepSeek R1 or reasoning-only Qwen models), the parser also works when the first <think> token is missing from the generation.
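
The non-streaming behavior can be reproduced with the extraction regex alone; a minimal standalone sketch using the tag strings defined by the parser:

import re

think_start, think_end = r"<think>", r"</think>"
# the leading <think> is optional so hardcoded-prefix variants still match
reasoning_regex = re.compile(
    rf"^(?:{think_start})?(?P<reasoning>.*?){think_end}(?P<content>.*)$",
    re.DOTALL,
)

m = reasoning_regex.match("<think>add 2 and 2</think>The answer is 4.")
assert m is not None
assert m.group("reasoning") == "add 2 and 2"
assert m.group("content") == "The answer is 4."

# variant with the opening tag hardcoded in the prompt (absent from generation)
m2 = reasoning_regex.match("add 2 and 2</think>The answer is 4.")
assert m2 is not None and m2.group("reasoning") == "add 2 and 2"
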
Source code in vllm/reasoning/olmo3_reasoning_parser.py
@ReasoningParserManager.register_module("olmo3")
class Olmo3ReasoningParser(ReasoningParser):
    """
    Reasoning parser for Olmo 3 model

    Olmo3ReasoningParser

    This class implements a reasoning parser specifically designed for the
    Olmo 3 family of models. Olmo 3 models do not use special tokens to
    indicate reasoning; rather, reasoning trace is wrapped in `<think>` and
    `</think>`, which are tokenized using standard vocabulary entries.
    Because of this, the parser operates in string space, accumulating the
    characters in a buffer until it sees `<think>` or `</think>`. tokens
    to switch modes.

    Key Features:
        - For non-stream output, Recognizes and extracts reasoning (text
          bracketed by `<think>` and `</think>`) and content (everything
          after the first `</think>`).
        - For stream process, it uses a buffer to accumulate delta text,
          and output progressive delta messages as soon as thinking starts
          or ends.
        - For reliability, some Olmo 3 models may hardcode the first
          `<think>` token is the input text (similar to Deepseek R1,
          or reasoning-only Qwen models). To support such variants, the
          parser can optionally work in cases where the first `<think>`
          token is missing from generation.
    """

    def __init__(self, tokenizer: "AnyTokenizer", *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        self.think_start = r"<think>"
        self.think_end = r"</think>"

        # note that the first <think> is optional; this allows the
        # parser to work in cases where a <think> is hardcoded at the
        # beginning of the reasoning template.
        reasoning_expr = (
            rf"^(?:{self.think_start})?(?P<reasoning>.*?)"
            + rf"{self.think_end}(?P<content>.*)$"
        )
        self.reasoning_regex = re.compile(reasoning_expr, re.DOTALL)

        self.buffer = Olmo3ReasoningBuffer(
            think_start=self.think_start, think_end=self.think_end
        )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        text = self.model_tokenizer.decode(input_ids)
        return self.think_end in text

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # For Olmo 3 streaming reasoning parsing, the streaming parser
        # runs first, and the same tokens are then passed to
        # is_reasoning_end and extract_content_ids; these ids are not
        # part of the content, so just return [] here.
        return []

    def extract_reasoning_content(
        self,
        model_output: str,
        request: Union[ChatCompletionRequest, ResponsesRequest],
    ) -> tuple[Optional[str], Optional[str]]:
        """Extract the reasoning content & content sections, respectively.
        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        Args:
            model_output (str): Output of the model to be parsed.
            request (ChatCompletionRequest | ResponsesRequest): Request being
                processed.

        Returns:
            tuple[Optional[str], Optional[str]]: Tuple containing the
            reasoning content and non-reasoning content.
        """

        re_match = self.reasoning_regex.match(model_output)
        if re_match:
            reasoning_content = re_match.group("reasoning") or None
            content = re_match.group("content") or None
            return reasoning_content, content

        # no reasoning content
        return None, model_output

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        """Extract content using token ID sequence state machine"""

        delta_message = self.buffer.add_text(delta_text)
        if delta_message is None and self.buffer.think_end in self.buffer.buffer:
            # This is a bit hacky but, because of how the buffer is
            # constructed, if the last delta_text contains the characters
            # that mark the end of the thinking block, text left in the
            # buffer would never be processed because we get no further
            # turn. To get around that, we check whether the buffer
            # contains the end-of-thinking token and, if so, reprocess it.
            delta_message = self.buffer.process_buffer()

        return delta_message

buffer instance-attribute

buffer = Olmo3ReasoningBuffer(
    think_start=think_start, think_end=think_end
)

reasoning_regex instance-attribute

reasoning_regex = compile(reasoning_expr, DOTALL)

think_end instance-attribute

think_end = '</think>'

think_start instance-attribute

think_start = '<think>'

__init__

__init__(tokenizer: AnyTokenizer, *args, **kwargs)
Source code in vllm/reasoning/olmo3_reasoning_parser.py
def __init__(self, tokenizer: "AnyTokenizer", *args, **kwargs):
    super().__init__(tokenizer, *args, **kwargs)

    self.think_start = r"<think>"
    self.think_end = r"</think>"

    # note that the first <think> is optional; this allows the
    # parser to work in cases where a <think> is hardcoded at the
    # beginning of the reasoning template.
    reasoning_expr = (
        rf"^(?:{self.think_start})?(?P<reasoning>.*?)"
        + rf"{self.think_end}(?P<content>.*)$"
    )
    self.reasoning_regex = re.compile(reasoning_expr, re.DOTALL)

    self.buffer = Olmo3ReasoningBuffer(
        think_start=self.think_start, think_end=self.think_end
    )

extract_content_ids

extract_content_ids(input_ids: list[int]) -> list[int]
Source code in vllm/reasoning/olmo3_reasoning_parser.py
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
    # For Olmo 3 streaming reasoning parsing, the streaming parser
    # runs first, and the same tokens are then passed to
    # is_reasoning_end and extract_content_ids; these ids are not
    # part of the content, so just return [] here.
    return []

extract_reasoning_content

extract_reasoning_content(
    model_output: str,
    request: Union[ChatCompletionRequest, ResponsesRequest],
) -> tuple[Optional[str], Optional[str]]

Extract the reasoning content & content sections, respectively. If the sequence doesn't match what we expect, i.e., the model generates something else, all content is considered non-reasoning content.

Parameters:

    model_output (str): Output of the model to be parsed. [required]
    request (ChatCompletionRequest | ResponsesRequest): Request being
        processed. [required]

Returns:

    tuple[Optional[str], Optional[str]]: Tuple containing the reasoning
        content and non-reasoning content.
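
A hypothetical call, assuming parser was already constructed with a real tokenizer (e.g., via ReasoningParserManager) and request is the in-flight ChatCompletionRequest:

reasoning, content = parser.extract_reasoning_content(
    "<think>check the units first</think>42 m/s",
    request=request,
)
# reasoning == "check the units first"
# content == "42 m/s"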

Source code in vllm/reasoning/olmo3_reasoning_parser.py
def extract_reasoning_content(
    self,
    model_output: str,
    request: Union[ChatCompletionRequest, ResponsesRequest],
) -> tuple[Optional[str], Optional[str]]:
    """Extract the reasoning content & content sections, respectively.
    If the sequence doesn't match what we expect, i.e., the model generates
    something else, all content is considered non-reasoning content.

    Args:
        model_output (str): Output of the model to be parsed.
        request (ChatCompletionRequest | ResponsesRequest): Request being
            processed.

    Returns:
        tuple[Optional[str], Optional[str]]: Tuple containing the
        reasoning content and non-reasoning content.
    """

    re_match = self.reasoning_regex.match(model_output)
    if re_match:
        reasoning_content = re_match.group("reasoning") or None
        content = re_match.group("content") or None
        return reasoning_content, content

    # no reasoning content
    return None, model_output

extract_reasoning_content_streaming

extract_reasoning_content_streaming(
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]

Extract reasoning and content deltas from the streaming text.

Source code in vllm/reasoning/olmo3_reasoning_parser.py
def extract_reasoning_content_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]:
    """Extract content using token ID sequence state machine"""

    delta_message = self.buffer.add_text(delta_text)
    if delta_message is None and self.buffer.think_end in self.buffer.buffer:
        # This is a bit hacky but, because of how the buffer is
        # constructed, if the last delta_text contains the characters
        # that mark the end of the thinking block, text left in the
        # buffer would never be processed because we get no further
        # turn. To get around that, we check whether the buffer
        # contains the end-of-thinking token and, if so, reprocess it.
        delta_message = self.buffer.process_buffer()

    return delta_message

is_reasoning_end

is_reasoning_end(input_ids: list[int]) -> bool
Source code in vllm/reasoning/olmo3_reasoning_parser.py
def is_reasoning_end(self, input_ids: list[int]) -> bool:
    text = self.model_tokenizer.decode(input_ids)
    return self.think_end in text

Olmo3ReasoningState

Bases: Enum

Source code in vllm/reasoning/olmo3_reasoning_parser.py
class Olmo3ReasoningState(enum.Enum):
    REASONING = 1
    CONTENT = 2

CONTENT class-attribute instance-attribute

CONTENT = 2

REASONING class-attribute instance-attribute

REASONING = 1

string_overlap

string_overlap(
    a: str, b: str
) -> tuple[Optional[Indices], Optional[Indices]]

Find the longest overlap where the end of string a matches the start of string b.

Parameters:

    a (str): First string. [required]
    b (str): Second string. [required]

Returns:

    tuple[Optional[Indices], Optional[Indices]]: Indices of the
        overlapping portion in each string, or (None, None) if no
        overlap exists.

Source code in vllm/reasoning/olmo3_reasoning_parser.py
def string_overlap(a: str, b: str) -> tuple[Optional[Indices], Optional[Indices]]:
    """
    Find the longest overlap where the end of string a matches the start
    of string b.

    Args:
        a: First string
        b: Second string

    Returns:
        Tuple of Indices representing the overlapping portions in each
        string, or a tuple of None if no overlap exists
    """

    # swap so a is always the shorter string
    a, b, swap = (a, b, False) if len(a) < len(b) else (b, a, True)

    # first check: is a fully contained in b?
    if a in b:
        ind_a = Indices(0, len(a))
        ind_b = Indices(b.index(a), b.index(a) + len(a))
        return (ind_b, ind_a) if swap else (ind_a, ind_b)

    # second check: does the end of a overlap with the
    #               beginning of b?
    for i in range(len(a) - 1, 0, -1):
        if a[-i:] == b[:i]:
            ind_a = Indices(len(a) - i, len(a))
            ind_b = Indices(0, i)
            return (ind_b, ind_a) if swap else (ind_a, ind_b)

    # third check: does the beginning of a overlap with
    #              the end of b?
    for i in range(len(a) - 1, 0, -1):
        if b[-i:] == a[:i]:
            ind_a = Indices(0, i)
            ind_b = Indices(len(b) - i, len(b))
            return (ind_b, ind_a) if swap else (ind_a, ind_b)

    return None, None
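
A few illustrative calls (a standalone sketch; the values follow from the three checks above):

# suffix of `a` overlaps the prefix of `b` (the partial-tag case the
# streaming buffer relies on)
a_idx, b_idx = string_overlap("ends with </th", "</think>")
# a_idx == Indices(start=10, end=14), b_idx == Indices(start=0, end=4)
assert len(b_idx) == 4  # four characters of "</think>" have been seen

# full containment of the shorter string
a_idx, b_idx = string_overlap("<think>", "a<think>b")
# a_idx == Indices(start=0, end=7), b_idx == Indices(start=1, end=8)

# no overlap at all
assert string_overlap("abc", "xyz") == (None, None)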