vllm.v1.engine.output_processor

EMPTY_CPU_TENSOR module-attribute

EMPTY_CPU_TENSOR = empty(0, device='cpu')

OutputProcessor

Process EngineCoreOutputs into RequestOutputs.
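
For orientation, a minimal usage sketch of the synchronous (LLMEngine-style) flow. The engine-core polling and abort calls are illustrative placeholders, not APIs of this module, and `tokenizer` and `request` are assumed to exist:

from vllm.v1.engine.output_processor import OutputProcessor

processor = OutputProcessor(tokenizer=tokenizer, log_stats=False)
processor.add_request(request, prompt=None)  # request: EngineCoreRequest

while processor.has_unfinished_requests():
    outputs = engine_core.poll()  # hypothetical engine-core call
    result = processor.process_outputs(outputs)
    for request_output in result.request_outputs:
        if request_output.finished:
            print(request_output)
    if result.reqs_to_abort:
        # Stop-string hits finish here first; the engine core must
        # still be told to abort those requests.
        engine_core.abort(result.reqs_to_abort)  # hypothetical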

Source code in vllm/v1/engine/output_processor.py
class OutputProcessor:
    """Process EngineCoreOutputs into RequestOutputs."""

    def __init__(
        self,
        tokenizer: TokenizerLike | None,
        log_stats: bool,
        stream_interval: int = 1,
    ):
        self.log_stats = log_stats
        self.tokenizer = tokenizer
        self.stream_interval = stream_interval
        self.request_states: dict[str, RequestState] = {}
        self.parent_requests: dict[str, ParentRequest] = {}
        self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list)
        self.lora_states = LoRARequestStates(log_stats)
        self.tracer: Tracer | None = None
        self._requests_drained = asyncio.Event()
        self._requests_drained.set()

    def get_num_unfinished_requests(self):
        return len(self.request_states)

    def has_unfinished_requests(self) -> bool:
        return len(self.request_states) > 0

    async def wait_for_requests_to_drain(self) -> None:
        if not self.request_states:
            return
        await self._requests_drained.wait()

    def propagate_error(self, e: Exception):
        """Propagate error to all generate() tasks."""

        for _, state in self.request_states.items():
            assert state.queue is not None
            state.queue.put(e)

    def abort_requests(self, request_ids: Iterable[str], internal: bool) -> list[str]:
        """Abort a list of requests.

        The request_ids may be either external request IDs (those passed to
        InputProcessor.process_inputs()) or internal request IDs (those randomly
        generated when creating the EngineCoreRequest).

        If an external request ID is provided, and that external request ID
        was used for multiple requests, all requests associated with that external
        request ID are aborted.

        In the case of parallel sampling, a request ID may be used to identify
        a parent request, in which case the associated child requests are aborted
        also.
        """
        internal_req_ids = []
        for request_id in request_ids:
            if internal:
                # Internal ID - this may be a parent request
                internal_req_ids.append(request_id)

                # Remove internal ID from the external->internal mapping
                if req_state := self.request_states.get(request_id):
                    external_req_id = req_state.external_req_id
                    internal_ids = self.external_req_ids[external_req_id]
                    internal_ids.remove(request_id)
                    if not internal_ids:
                        del self.external_req_ids[external_req_id]
            elif internal_ids := self.external_req_ids.pop(request_id, []):
                # External ID - abort all requests in the external->internal mapping
                internal_req_ids.extend(internal_ids)

        request_ids_to_abort = []
        for request_id in internal_req_ids:
            req_state = self.request_states.pop(request_id, None)
            if req_state is not None:
                self.lora_states.request_finished(request_id, req_state.lora_name)
                request_ids_to_abort.append(request_id)
                # Produce final abort output.
                if req_state.queue is not None and (
                    request_output := req_state.make_request_output(
                        new_token_ids=[],
                        # Set pooling_output to a non-None value so the
                        # abort correctly takes the pooling branch.
                        pooling_output=EMPTY_CPU_TENSOR
                        if req_state.detokenizer is None
                        else None,
                        finish_reason=FinishReason.ABORT,
                        stop_reason=None,
                        kv_transfer_params=None,
                    )
                ):
                    req_state.queue.put(request_output)
            elif parent := self.parent_requests.get(request_id):
                # Abort children prior to removing the parent.
                if parent.child_requests:
                    child_reqs = list(parent.child_requests)
                    child_reqs = self.abort_requests(child_reqs, internal=True)
                    request_ids_to_abort.extend(child_reqs)
                self.parent_requests.pop(request_id, None)
        if not self.request_states:
            self._requests_drained.set()
        return request_ids_to_abort

    def add_request(
        self,
        request: EngineCoreRequest,
        prompt: str | None,
        parent_req: ParentRequest | None = None,
        request_index: int = 0,
        queue: RequestOutputCollector | None = None,
    ) -> None:
        request_id = request.request_id
        req_state = self.request_states.get(request_id)
        if req_state is not None:
            self._update_streaming_request_state(req_state, request, prompt)
            return

        req_state = RequestState.from_new_request(
            tokenizer=self.tokenizer,
            request=request,
            prompt=prompt,
            parent_req=parent_req,
            request_index=request_index,
            queue=queue,
            log_stats=self.log_stats,
            stream_interval=self.stream_interval,
        )
        if self._requests_drained.is_set():
            self._requests_drained.clear()
        self.request_states[request_id] = req_state
        if parent_req:
            self.parent_requests[parent_req.request_id] = parent_req

        # Track the external_req_id -> [internal_req_id, ...] mapping
        self.external_req_ids[req_state.external_req_id].append(request_id)

    def _update_streaming_request_state(
        self, req_state: RequestState, request: EngineCoreRequest, prompt: str | None
    ) -> None:
        """Queue a streaming update instead of immediately applying it."""
        if not request.resumable:
            # Final request - just mark completion, don't add its dummy tokens.
            if req_state.input_chunk_queue is None:
                # Engine already finished - emit final output and clean up.
                self._finish_request(req_state)
                if req_state.queue is not None:
                    # Emit a final output with finished=True
                    # to unblock the generate() loop.
                    req_state.queue.put(STREAM_FINISHED)
            elif req_state.input_chunk_queue:
                req_state.input_chunk_queue[-1].final = True
            else:
                req_state.streaming_input = False
            return

        update = StreamingUpdate(
            prompt=prompt,
            prompt_token_ids=request.prompt_token_ids,
            arrival_time=request.arrival_time,
        )

        # Apply request updates now if the last input already completed.
        if req_state.input_chunk_queue is None:
            req_state.apply_streaming_update(update)
            req_state.input_chunk_queue = deque()
        else:
            # Queue the streaming update otherwise.
            req_state.input_chunk_queue.append(update)

    def process_outputs(
        self,
        engine_core_outputs: list[EngineCoreOutput],
        engine_core_timestamp: float | None = None,
        iteration_stats: IterationStats | None = None,
    ) -> OutputProcessorOutput:
        """
        Process the EngineCoreOutputs:
        1) Compute stats for logging
        2) Detokenize
        3) Create and handle RequestOutput objects:
            * If there is a queue (for usage with AsyncLLM),
              put the RequestOutput objects into the queue for
              handling by the per-request generate() tasks.

            * If there is no queue (for usage with LLMEngine),
              return a list of RequestOutput objects.

        NOTE FOR DEVELOPERS

        vLLM V1 minimizes the number of Python loops over the full
        batch to keep system overheads low. This is the only
        function that should loop over EngineCoreOutputs.

        If you need to touch every element of the batch, do it from
        within the loop below.
        """

        request_outputs: list[RequestOutput | PoolingRequestOutput] = []
        reqs_to_abort: list[str] = []
        for engine_core_output in engine_core_outputs:
            req_id = engine_core_output.request_id
            req_state = self.request_states.get(req_id)
            if req_state is None:
                # Ignore output for already-aborted request.
                continue

            # 1) Compute stats for this iteration.
            self._update_stats_from_output(
                req_state, engine_core_output, engine_core_timestamp, iteration_stats
            )

            new_token_ids = engine_core_output.new_token_ids
            pooling_output = engine_core_output.pooling_output
            finish_reason = engine_core_output.finish_reason
            stop_reason = engine_core_output.stop_reason
            kv_transfer_params = engine_core_output.kv_transfer_params
            routed_experts = engine_core_output.routed_experts
            req_state.num_cached_tokens = engine_core_output.num_cached_tokens
            req_state.is_prefilling = False

            if pooling_output is None:
                assert req_state.detokenizer is not None
                assert req_state.logprobs_processor is not None
                # 2) Detokenize the token ids into text and perform stop checks.
                stop_string = req_state.detokenizer.update(
                    new_token_ids, finish_reason == FinishReason.STOP
                )
                if stop_string:
                    finish_reason = FinishReason.STOP
                    stop_reason = stop_string

                # 3) Compute sample and prompt logprobs for request,
                # if required.
                req_state.logprobs_processor.update_from_output(engine_core_output)

            # 4) Create and handle RequestOutput objects.
            if request_output := req_state.make_request_output(
                new_token_ids,
                pooling_output,
                finish_reason,
                stop_reason,
                kv_transfer_params,
                routed_experts,
            ):
                if req_state.streaming_input:
                    request_output.finished = False

                if req_state.queue is not None:
                    # AsyncLLM: put into queue for handling by generate().
                    req_state.queue.put(request_output)
                else:
                    # LLMEngine: return list of RequestOutputs.
                    request_outputs.append(request_output)

            # Free completed requests.
            if finish_reason is not None:
                if req_state.streaming_input:
                    if req_state.input_chunk_queue:
                        update = req_state.input_chunk_queue.popleft()
                        req_state.apply_streaming_update(update)
                    else:
                        req_state.input_chunk_queue = None
                else:
                    self._finish_request(req_state)
                    if not engine_core_output.finished:
                        # If req not finished in EngineCore, but Detokenizer
                        # detected stop string, abort needed in EngineCore.
                        reqs_to_abort.append(req_id)

                    # Track per-request stats
                    self._update_stats_from_finished(
                        req_state, finish_reason, iteration_stats
                    )
                    if self.tracer:
                        self.do_tracing(engine_core_output, req_state, iteration_stats)

        return OutputProcessorOutput(
            request_outputs=request_outputs,
            reqs_to_abort=reqs_to_abort,
        )

    def _finish_request(self, req_state: RequestState) -> None:
        req_id = req_state.request_id
        self.request_states.pop(req_id)

        internal_ids = self.external_req_ids[req_state.external_req_id]
        internal_ids.remove(req_id)
        if not internal_ids:
            del self.external_req_ids[req_state.external_req_id]

        # Remove parent request if applicable.
        parent_req = req_state.parent_req
        if parent_req and not parent_req.child_requests:
            self.parent_requests.pop(parent_req.request_id, None)

        if not self.request_states:
            self._requests_drained.set()

    def update_scheduler_stats(self, scheduler_stats: SchedulerStats | None):
        self.lora_states.update_scheduler_stats(scheduler_stats)

    def do_tracing(
        self,
        engine_core_output: EngineCoreOutput,
        req_state: RequestState,
        iteration_stats: IterationStats | None,
    ) -> None:
        assert req_state.stats is not None
        assert iteration_stats is not None
        assert self.tracer is not None

        arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
        trace_context = extract_trace_context(engine_core_output.trace_headers)
        prompt_length = length_from_prompt_token_ids_or_embeds(
            req_state.prompt_token_ids, req_state.prompt_embeds
        )
        with self.tracer.start_as_current_span(
            "llm_request",
            kind=SpanKind.SERVER,
            context=trace_context,
            start_time=arrival_time_nano_seconds,
        ) as span:
            metrics = req_state.stats
            e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
            queued_time = metrics.scheduled_ts - metrics.queued_ts
            prefill_time = metrics.first_token_ts - metrics.scheduled_ts
            decode_time = metrics.last_token_ts - metrics.first_token_ts
            inference_time = metrics.last_token_ts - metrics.scheduled_ts
            span.set_attribute(
                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
                metrics.first_token_latency,
            )
            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, queued_time)
            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, prompt_length)
            span.set_attribute(
                SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
                metrics.num_generation_tokens,
            )
            span.set_attribute(
                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL, prefill_time
            )
            span.set_attribute(
                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE, decode_time
            )
            span.set_attribute(
                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE, inference_time
            )

            # meta
            span.set_attribute(
                SpanAttributes.GEN_AI_REQUEST_ID, req_state.external_req_id
            )
            if req_state.top_p:
                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p)
            if req_state.max_tokens_param:
                span.set_attribute(
                    SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, req_state.max_tokens_param
                )
            if req_state.temperature:
                span.set_attribute(
                    SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, req_state.temperature
                )
            if req_state.n:
                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.n)

    def _update_stats_from_output(
        self,
        req_state: RequestState,
        engine_core_output: EngineCoreOutput,
        engine_core_timestamp: float | None,
        iteration_stats: IterationStats | None,
    ):
        if iteration_stats is None:
            return

        assert engine_core_timestamp is not None
        assert req_state.stats is not None
        iteration_stats.update_from_output(
            engine_core_output,
            engine_core_timestamp,
            req_state.is_prefilling,
            req_state.prompt_len,
            req_state.stats,
            self.lora_states,
            req_state.lora_name,
        )

    def _update_stats_from_finished(
        self,
        req_state: RequestState,
        finish_reason: FinishReason | None,
        iteration_stats: IterationStats | None,
    ):
        if iteration_stats is None:
            return

        assert finish_reason is not None
        assert req_state.stats is not None
        iteration_stats.update_from_finished_request(
            finish_reason=finish_reason,
            num_prompt_tokens=req_state.prompt_len,
            max_tokens_param=req_state.max_tokens_param,
            req_stats=req_state.stats,
            num_cached_tokens=req_state.num_cached_tokens,
        )
        self.lora_states.request_finished(req_state.request_id, req_state.lora_name)

        ParentRequest.observe_finished_request(
            req_state.parent_req, iteration_stats, req_state.stats.num_generation_tokens
        )

_requests_drained instance-attribute

_requests_drained = Event()

external_req_ids instance-attribute

external_req_ids: defaultdict[str, list[str]] = defaultdict(
    list
)

log_stats instance-attribute

log_stats = log_stats

lora_states instance-attribute

lora_states = LoRARequestStates(log_stats)

parent_requests instance-attribute

parent_requests: dict[str, ParentRequest] = {}

request_states instance-attribute

request_states: dict[str, RequestState] = {}

stream_interval instance-attribute

stream_interval = stream_interval

tokenizer instance-attribute

tokenizer = tokenizer

tracer instance-attribute

tracer: Tracer | None = None

__init__

__init__(
    tokenizer: TokenizerLike | None,
    log_stats: bool,
    stream_interval: int = 1,
)
Source code in vllm/v1/engine/output_processor.py
def __init__(
    self,
    tokenizer: TokenizerLike | None,
    log_stats: bool,
    stream_interval: int = 1,
):
    self.log_stats = log_stats
    self.tokenizer = tokenizer
    self.stream_interval = stream_interval
    self.request_states: dict[str, RequestState] = {}
    self.parent_requests: dict[str, ParentRequest] = {}
    self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list)
    self.lora_states = LoRARequestStates(log_stats)
    self.tracer: Tracer | None = None
    self._requests_drained = asyncio.Event()
    self._requests_drained.set()

_finish_request

_finish_request(req_state: RequestState) -> None
Source code in vllm/v1/engine/output_processor.py
def _finish_request(self, req_state: RequestState) -> None:
    req_id = req_state.request_id
    self.request_states.pop(req_id)

    internal_ids = self.external_req_ids[req_state.external_req_id]
    internal_ids.remove(req_id)
    if not internal_ids:
        del self.external_req_ids[req_state.external_req_id]

    # Remove parent request if applicable.
    parent_req = req_state.parent_req
    if parent_req and not parent_req.child_requests:
        self.parent_requests.pop(parent_req.request_id, None)

    if not self.request_states:
        self._requests_drained.set()

_update_stats_from_finished

_update_stats_from_finished(
    req_state: RequestState,
    finish_reason: FinishReason | None,
    iteration_stats: IterationStats | None,
)
Source code in vllm/v1/engine/output_processor.py
def _update_stats_from_finished(
    self,
    req_state: RequestState,
    finish_reason: FinishReason | None,
    iteration_stats: IterationStats | None,
):
    if iteration_stats is None:
        return

    assert finish_reason is not None
    assert req_state.stats is not None
    iteration_stats.update_from_finished_request(
        finish_reason=finish_reason,
        num_prompt_tokens=req_state.prompt_len,
        max_tokens_param=req_state.max_tokens_param,
        req_stats=req_state.stats,
        num_cached_tokens=req_state.num_cached_tokens,
    )
    self.lora_states.request_finished(req_state.request_id, req_state.lora_name)

    ParentRequest.observe_finished_request(
        req_state.parent_req, iteration_stats, req_state.stats.num_generation_tokens
    )

_update_stats_from_output

_update_stats_from_output(
    req_state: RequestState,
    engine_core_output: EngineCoreOutput,
    engine_core_timestamp: float | None,
    iteration_stats: IterationStats | None,
)
Source code in vllm/v1/engine/output_processor.py
def _update_stats_from_output(
    self,
    req_state: RequestState,
    engine_core_output: EngineCoreOutput,
    engine_core_timestamp: float | None,
    iteration_stats: IterationStats | None,
):
    if iteration_stats is None:
        return

    assert engine_core_timestamp is not None
    assert req_state.stats is not None
    iteration_stats.update_from_output(
        engine_core_output,
        engine_core_timestamp,
        req_state.is_prefilling,
        req_state.prompt_len,
        req_state.stats,
        self.lora_states,
        req_state.lora_name,
    )

_update_streaming_request_state

_update_streaming_request_state(
    req_state: RequestState,
    request: EngineCoreRequest,
    prompt: str | None,
) -> None

Queue a streaming update instead of immediately applying it.

Source code in vllm/v1/engine/output_processor.py
def _update_streaming_request_state(
    self, req_state: RequestState, request: EngineCoreRequest, prompt: str | None
) -> None:
    """Queue a streaming update instead of immediately applying it."""
    if not request.resumable:
        # Final request - just mark completion, don't add its dummy tokens.
        if req_state.input_chunk_queue is None:
            # Engine already finished - emit final output and clean up.
            self._finish_request(req_state)
            if req_state.queue is not None:
                # Emit a final output with finished=True
                # to unblock the generate() loop.
                req_state.queue.put(STREAM_FINISHED)
        elif req_state.input_chunk_queue:
            req_state.input_chunk_queue[-1].final = True
        else:
            req_state.streaming_input = False
        return

    update = StreamingUpdate(
        prompt=prompt,
        prompt_token_ids=request.prompt_token_ids,
        arrival_time=request.arrival_time,
    )

    # Apply request updates now if the last input already completed.
    if req_state.input_chunk_queue is None:
        req_state.apply_streaming_update(update)
        req_state.input_chunk_queue = deque()
    else:
        # Queue the streaming update otherwise.
        req_state.input_chunk_queue.append(update)

abort_requests

abort_requests(
    request_ids: Iterable[str], internal: bool
) -> list[str]

Abort a list of requests.

The request_ids may be either external request IDs (those passed to InputProcessor.process_inputs()) or internal request IDs (those randomly generated when creating the EngineCoreRequest).

If an external request ID is provided, and that external request ID was used for multiple requests, all requests associated with that external request ID are aborted.

In the case of parallel sampling, a request ID may be used to identify a parent request, in which case the associated child requests are aborted also.
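
A hedged sketch of the two calling modes, given an OutputProcessor instance `processor` (the request IDs here are hypothetical; the returned internal IDs are typically forwarded to the engine core so it aborts them as well):

# External ID: aborts every internal request mapped to "req-abc",
# including parallel-sampling children of any parent request.
ids_to_abort = processor.abort_requests(["req-abc"], internal=False)

# Internal IDs: aborts only the named EngineCoreRequests (a parent
# request ID also aborts its child requests recursively).
ids_to_abort = processor.abort_requests(["internal-id-0"], internal=True)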

Source code in vllm/v1/engine/output_processor.py
def abort_requests(self, request_ids: Iterable[str], internal: bool) -> list[str]:
    """Abort a list of requests.

    The request_ids may be either external request IDs (those passed to
    InputProcessor.process_inputs()) or internal request IDs (those randomly
    generated when creating the EngineCoreRequest).

    If an external request ID is provided, and that external request ID
    was used for multiple requests, all requests associated with that external
    request ID are aborted.

    In the case of parallel sampling, a request ID may be used to identify
    a parent request, in which case the associated child requests are aborted
    also.
    """
    internal_req_ids = []
    for request_id in request_ids:
        if internal:
            # Internal ID - this may be a parent request
            internal_req_ids.append(request_id)

            # Remove internal ID from the external->internal mapping
            if req_state := self.request_states.get(request_id):
                external_req_id = req_state.external_req_id
                internal_ids = self.external_req_ids[external_req_id]
                internal_ids.remove(request_id)
                if not internal_ids:
                    del self.external_req_ids[external_req_id]
        elif internal_ids := self.external_req_ids.pop(request_id, []):
            # External ID - abort all requests in the external->internal mapping
            internal_req_ids.extend(internal_ids)

    request_ids_to_abort = []
    for request_id in internal_req_ids:
        req_state = self.request_states.pop(request_id, None)
        if req_state is not None:
            self.lora_states.request_finished(request_id, req_state.lora_name)
            request_ids_to_abort.append(request_id)
            # Produce final abort output.
            if req_state.queue is not None and (
                request_output := req_state.make_request_output(
                    new_token_ids=[],
                    # Set pooling_output to a non-None value so the
                    # abort correctly takes the pooling branch.
                    pooling_output=EMPTY_CPU_TENSOR
                    if req_state.detokenizer is None
                    else None,
                    finish_reason=FinishReason.ABORT,
                    stop_reason=None,
                    kv_transfer_params=None,
                )
            ):
                req_state.queue.put(request_output)
        elif parent := self.parent_requests.get(request_id):
            # Abort children prior to removing the parent.
            if parent.child_requests:
                child_reqs = list(parent.child_requests)
                child_reqs = self.abort_requests(child_reqs, internal=True)
                request_ids_to_abort.extend(child_reqs)
            self.parent_requests.pop(request_id, None)
    if not self.request_states:
        self._requests_drained.set()
    return request_ids_to_abort

add_request

add_request(
    request: EngineCoreRequest,
    prompt: str | None,
    parent_req: ParentRequest | None = None,
    request_index: int = 0,
    queue: RequestOutputCollector | None = None,
) -> None
Source code in vllm/v1/engine/output_processor.py
def add_request(
    self,
    request: EngineCoreRequest,
    prompt: str | None,
    parent_req: ParentRequest | None = None,
    request_index: int = 0,
    queue: RequestOutputCollector | None = None,
) -> None:
    request_id = request.request_id
    req_state = self.request_states.get(request_id)
    if req_state is not None:
        self._update_streaming_request_state(req_state, request, prompt)
        return

    req_state = RequestState.from_new_request(
        tokenizer=self.tokenizer,
        request=request,
        prompt=prompt,
        parent_req=parent_req,
        request_index=request_index,
        queue=queue,
        log_stats=self.log_stats,
        stream_interval=self.stream_interval,
    )
    if self._requests_drained.is_set():
        self._requests_drained.clear()
    self.request_states[request_id] = req_state
    if parent_req:
        self.parent_requests[parent_req.request_id] = parent_req

    # Track the external_req_id -> [internal_req_id, ...] mapping
    self.external_req_ids[req_state.external_req_id].append(request_id)

do_tracing

do_tracing(
    engine_core_output: EngineCoreOutput,
    req_state: RequestState,
    iteration_stats: IterationStats | None,
) -> None
Source code in vllm/v1/engine/output_processor.py
def do_tracing(
    self,
    engine_core_output: EngineCoreOutput,
    req_state: RequestState,
    iteration_stats: IterationStats | None,
) -> None:
    assert req_state.stats is not None
    assert iteration_stats is not None
    assert self.tracer is not None

    arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
    trace_context = extract_trace_context(engine_core_output.trace_headers)
    prompt_length = length_from_prompt_token_ids_or_embeds(
        req_state.prompt_token_ids, req_state.prompt_embeds
    )
    with self.tracer.start_as_current_span(
        "llm_request",
        kind=SpanKind.SERVER,
        context=trace_context,
        start_time=arrival_time_nano_seconds,
    ) as span:
        metrics = req_state.stats
        e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
        queued_time = metrics.scheduled_ts - metrics.queued_ts
        prefill_time = metrics.first_token_ts - metrics.scheduled_ts
        decode_time = metrics.last_token_ts - metrics.first_token_ts
        inference_time = metrics.last_token_ts - metrics.scheduled_ts
        span.set_attribute(
            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
            metrics.first_token_latency,
        )
        span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
        span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, queued_time)
        span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, prompt_length)
        span.set_attribute(
            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
            metrics.num_generation_tokens,
        )
        span.set_attribute(
            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL, prefill_time
        )
        span.set_attribute(
            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE, decode_time
        )
        span.set_attribute(
            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE, inference_time
        )

        # meta
        span.set_attribute(
            SpanAttributes.GEN_AI_REQUEST_ID, req_state.external_req_id
        )
        if req_state.top_p:
            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p)
        if req_state.max_tokens_param:
            span.set_attribute(
                SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, req_state.max_tokens_param
            )
        if req_state.temperature:
            span.set_attribute(
                SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, req_state.temperature
            )
        if req_state.n:
            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, req_state.n)

get_num_unfinished_requests

get_num_unfinished_requests()
Source code in vllm/v1/engine/output_processor.py
def get_num_unfinished_requests(self):
    return len(self.request_states)

has_unfinished_requests

has_unfinished_requests() -> bool
Source code in vllm/v1/engine/output_processor.py
def has_unfinished_requests(self) -> bool:
    return len(self.request_states) > 0

process_outputs

process_outputs(
    engine_core_outputs: list[EngineCoreOutput],
    engine_core_timestamp: float | None = None,
    iteration_stats: IterationStats | None = None,
) -> OutputProcessorOutput

Process the EngineCoreOutputs:

1) Compute stats for logging
2) Detokenize
3) Create and handle RequestOutput objects:

* If there is a queue (for usage with AsyncLLM),
  put the RequestOutput objects into the queue for
  handling by the per-request generate() tasks.

* If there is no queue (for usage with LLMEngine),
  return a list of RequestOutput objects.

NOTE FOR DEVELOPERS

vLLM V1 minimizes the number of Python loops over the full batch to keep system overheads low. This is the only function that should loop over EngineCoreOutputs.

If you need to touch every element of the batch, do it from within the loop below.
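
A hedged sketch of the queue-based (AsyncLLM) path; the import paths and surrounding task structure are assumptions, and `processor`, `request`, and `handle` are illustrative:

from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.output_processor import RequestOutputCollector

collector = RequestOutputCollector(RequestOutputKind.DELTA, "req-abc")
processor.add_request(request, prompt=None, queue=collector)

async def consume():
    # process_outputs() runs in a background loop and puts outputs
    # (or a propagated Exception, which get() re-raises) into the
    # collector; this per-request task just drains it.
    while True:
        out = await collector.get()
        handle(out)  # hypothetical consumer
        if out.finished:
            break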

Source code in vllm/v1/engine/output_processor.py
def process_outputs(
    self,
    engine_core_outputs: list[EngineCoreOutput],
    engine_core_timestamp: float | None = None,
    iteration_stats: IterationStats | None = None,
) -> OutputProcessorOutput:
    """
    Process the EngineCoreOutputs:
    1) Compute stats for logging
    2) Detokenize
    3) Create and handle RequestOutput objects:
        * If there is a queue (for usage with AsyncLLM),
          put the RequestOutput objects into the queue for
          handling by the per-request generate() tasks.

        * If there is no queue (for usage with LLMEngine),
          return a list of RequestOutput objects.

    NOTE FOR DEVELOPERS

    vLLM V1 minimizes the number of Python loops over the full
    batch to keep system overheads low. This is the only
    function that should loop over EngineCoreOutputs.

    If you need to touch every element of the batch, do it from
    within the loop below.
    """

    request_outputs: list[RequestOutput | PoolingRequestOutput] = []
    reqs_to_abort: list[str] = []
    for engine_core_output in engine_core_outputs:
        req_id = engine_core_output.request_id
        req_state = self.request_states.get(req_id)
        if req_state is None:
            # Ignore output for already-aborted request.
            continue

        # 1) Compute stats for this iteration.
        self._update_stats_from_output(
            req_state, engine_core_output, engine_core_timestamp, iteration_stats
        )

        new_token_ids = engine_core_output.new_token_ids
        pooling_output = engine_core_output.pooling_output
        finish_reason = engine_core_output.finish_reason
        stop_reason = engine_core_output.stop_reason
        kv_transfer_params = engine_core_output.kv_transfer_params
        routed_experts = engine_core_output.routed_experts
        req_state.num_cached_tokens = engine_core_output.num_cached_tokens
        req_state.is_prefilling = False

        if pooling_output is None:
            assert req_state.detokenizer is not None
            assert req_state.logprobs_processor is not None
            # 2) Detokenize the token ids into text and perform stop checks.
            stop_string = req_state.detokenizer.update(
                new_token_ids, finish_reason == FinishReason.STOP
            )
            if stop_string:
                finish_reason = FinishReason.STOP
                stop_reason = stop_string

            # 3) Compute sample and prompt logprobs for request,
            # if required.
            req_state.logprobs_processor.update_from_output(engine_core_output)

        # 4) Create and handle RequestOutput objects.
        if request_output := req_state.make_request_output(
            new_token_ids,
            pooling_output,
            finish_reason,
            stop_reason,
            kv_transfer_params,
            routed_experts,
        ):
            if req_state.streaming_input:
                request_output.finished = False

            if req_state.queue is not None:
                # AsyncLLM: put into queue for handling by generate().
                req_state.queue.put(request_output)
            else:
                # LLMEngine: return list of RequestOutputs.
                request_outputs.append(request_output)

        # Free completed requests.
        if finish_reason is not None:
            if req_state.streaming_input:
                if req_state.input_chunk_queue:
                    update = req_state.input_chunk_queue.popleft()
                    req_state.apply_streaming_update(update)
                else:
                    req_state.input_chunk_queue = None
            else:
                self._finish_request(req_state)
                if not engine_core_output.finished:
                    # If req not finished in EngineCore, but Detokenizer
                    # detected stop string, abort needed in EngineCore.
                    reqs_to_abort.append(req_id)

                # Track per-request stats
                self._update_stats_from_finished(
                    req_state, finish_reason, iteration_stats
                )
                if self.tracer:
                    self.do_tracing(engine_core_output, req_state, iteration_stats)

    return OutputProcessorOutput(
        request_outputs=request_outputs,
        reqs_to_abort=reqs_to_abort,
    )

propagate_error

propagate_error(e: Exception)

Propagate error to all generate() tasks.
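
A hedged sketch of where this is typically called from: a background output-handling loop that fans a fatal error out to every waiting generate() task (each collector's get() re-raises the exception). The output source is a hypothetical placeholder:

try:
    while True:
        outputs = await next_engine_outputs()  # hypothetical
        processor.process_outputs(outputs)
except Exception as e:
    # Each pending RequestOutputCollector.get() will re-raise e.
    processor.propagate_error(e)
    raise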

Source code in vllm/v1/engine/output_processor.py
def propagate_error(self, e: Exception):
    """Propagate error to all generate() tasks."""

    for _, state in self.request_states.items():
        assert state.queue is not None
        state.queue.put(e)

update_scheduler_stats

update_scheduler_stats(
    scheduler_stats: SchedulerStats | None,
)
Source code in vllm/v1/engine/output_processor.py
def update_scheduler_stats(self, scheduler_stats: SchedulerStats | None):
    self.lora_states.update_scheduler_stats(scheduler_stats)

wait_for_requests_to_drain async

wait_for_requests_to_drain() -> None
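
A minimal sketch of a shutdown path, assuming the caller has already stopped admitting new requests:

async def drain_and_stop(processor: OutputProcessor) -> None:
    # Returns immediately if nothing is in flight; otherwise waits
    # on the internal event set when the last request finishes.
    await processor.wait_for_requests_to_drain()
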
Source code in vllm/v1/engine/output_processor.py
async def wait_for_requests_to_drain(self) -> None:
    if not self.request_states:
        return
    await self._requests_drained.wait()

OutputProcessorOutput dataclass

Source code in vllm/v1/engine/output_processor.py
@dataclass
class OutputProcessorOutput:
    request_outputs: list[RequestOutput | PoolingRequestOutput]
    reqs_to_abort: list[str]

reqs_to_abort instance-attribute

reqs_to_abort: list[str]

request_outputs instance-attribute

request_outputs: list[
    RequestOutput | PoolingRequestOutput
]

__init__

__init__(
    request_outputs: list[
        RequestOutput | PoolingRequestOutput
    ],
    reqs_to_abort: list[str],
) -> None

RequestOutputCollector

Collects streamed RequestOutputs per individual request, for hand-off to the consuming asyncio generate task.

When streaming deltas, RequestOutputs are merged if the producer gets ahead of the consumer.
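
A hedged sketch of the merge behavior. The helper below is hypothetical; its keyword names follow the RequestOutput construction visible in this module's source, and the import paths are assumed from the public vllm package:

from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.output_processor import RequestOutputCollector

collector = RequestOutputCollector(RequestOutputKind.DELTA, "req-abc")

def delta(text, token_ids, finished=False):
    # Hypothetical helper producing a single-completion delta output.
    return RequestOutput(
        request_id="req-abc",
        prompt=None,
        prompt_token_ids=[1, 2],
        prompt_logprobs=None,
        outputs=[CompletionOutput(index=0, text=text, token_ids=token_ids,
                                  cumulative_logprob=None, logprobs=None)],
        finished=finished,
    )

collector.put(delta("Hel", [10]))
collector.put(delta("lo", [11]))  # producer ran ahead: merged in place

out = collector.get_nowait()
# out.outputs[0].text == "Hello"; token_ids == [10, 11]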

Source code in vllm/v1/engine/output_processor.py
class RequestOutputCollector:
    """
    Collects streamed RequestOutputs per individual request,
    for hand-off to the consuming asyncio generate task.

    When streaming deltas, RequestOutputs are merged if the
    producer gets ahead of the consumer.
    """

    def __init__(self, output_kind: RequestOutputKind, request_id: str):
        self.aggregate = output_kind == RequestOutputKind.DELTA
        self.request_id = request_id
        self.output: RequestOutput | PoolingRequestOutput | Exception | None = None
        self.ready = asyncio.Event()

        self._input_stream_task: asyncio.Task | None = None

    def put(self, output: RequestOutput | PoolingRequestOutput | Exception) -> None:
        """Non-blocking put operation."""
        if self.output is None or isinstance(output, Exception):
            self.output = output
            self.ready.set()
        elif isinstance(self.output, RequestOutput) and isinstance(
            output, RequestOutput
        ):
            # This ensures that request outputs with different request indexes
            # (if n > 1) do not overwrite each other.
            self.output.add(output, aggregate=self.aggregate)
        elif isinstance(self.output, PoolingRequestOutput) and isinstance(
            output, PoolingRequestOutput
        ):
            self.output = output

    async def get(self) -> RequestOutput | PoolingRequestOutput:
        """Get operation blocks on put event."""
        while (output := self.output) is None:
            await self.ready.wait()
        self.output = None
        self.ready.clear()
        if isinstance(output, Exception):
            raise output
        return output

    def get_nowait(self) -> RequestOutput | PoolingRequestOutput | None:
        """Non-blocking get operation."""
        output = self.output
        if output is not None:
            self.output = None
            self.ready.clear()
        if isinstance(output, Exception):
            raise output
        return output

    def close(self):
        if self._input_stream_task is not None:
            self._input_stream_task.cancel()
        self._input_stream_task = None

    def __del__(self):
        if (task := self._input_stream_task) is not None:
            task.get_loop().call_soon_threadsafe(task.cancel)
            self._input_stream_task = None

_input_stream_task instance-attribute

_input_stream_task: Task | None = None

aggregate instance-attribute

aggregate = output_kind == DELTA

output instance-attribute

output: (
    RequestOutput | PoolingRequestOutput | Exception | None
) = None

ready instance-attribute

ready = Event()

request_id instance-attribute

request_id = request_id

__del__

__del__()
Source code in vllm/v1/engine/output_processor.py
def __del__(self):
    if (task := self._input_stream_task) is not None:
        task.get_loop().call_soon_threadsafe(task.cancel)
        self._input_stream_task = None

__init__

__init__(output_kind: RequestOutputKind, request_id: str)
Source code in vllm/v1/engine/output_processor.py
def __init__(self, output_kind: RequestOutputKind, request_id: str):
    self.aggregate = output_kind == RequestOutputKind.DELTA
    self.request_id = request_id
    self.output: RequestOutput | PoolingRequestOutput | Exception | None = None
    self.ready = asyncio.Event()

    self._input_stream_task: asyncio.Task | None = None

close

close()
Source code in vllm/v1/engine/output_processor.py
def close(self):
    if self._input_stream_task is not None:
        self._input_stream_task.cancel()
    self._input_stream_task = None

get async

get() -> RequestOutput | PoolingRequestOutput

Get operation blocks on put event.

Source code in vllm/v1/engine/output_processor.py
async def get(self) -> RequestOutput | PoolingRequestOutput:
    """Get operation blocks on put event."""
    while (output := self.output) is None:
        await self.ready.wait()
    self.output = None
    self.ready.clear()
    if isinstance(output, Exception):
        raise output
    return output

get_nowait

get_nowait() -> RequestOutput | PoolingRequestOutput | None

Non-blocking get operation.

Source code in vllm/v1/engine/output_processor.py
def get_nowait(self) -> RequestOutput | PoolingRequestOutput | None:
    """Non-blocking get operation."""
    output = self.output
    if output is not None:
        self.output = None
        self.ready.clear()
    if isinstance(output, Exception):
        raise output
    return output

put

put(
    output: RequestOutput
    | PoolingRequestOutput
    | Exception,
) -> None

Non-blocking put operation.

Source code in vllm/v1/engine/output_processor.py
def put(self, output: RequestOutput | PoolingRequestOutput | Exception) -> None:
    """Non-blocking put operation."""
    if self.output is None or isinstance(output, Exception):
        self.output = output
        self.ready.set()
    elif isinstance(self.output, RequestOutput) and isinstance(
        output, RequestOutput
    ):
        # This ensures that request outputs with different request indexes
        # (if n > 1) do not overwrite each other.
        self.output.add(output, aggregate=self.aggregate)
    elif isinstance(self.output, PoolingRequestOutput) and isinstance(
        output, PoolingRequestOutput
    ):
        self.output = output

RequestState

Source code in vllm/v1/engine/output_processor.py
class RequestState:
    def __init__(
        self,
        request_id: str,
        external_req_id: str,
        parent_req: ParentRequest | None,
        request_index: int,
        lora_request: LoRARequest | None,
        output_kind: RequestOutputKind,
        prompt: str | None,
        prompt_token_ids: list[int] | None,
        prompt_embeds: torch.Tensor | None,
        logprobs_processor: LogprobsProcessor | None,
        detokenizer: IncrementalDetokenizer | None,
        max_tokens_param: int | None,
        arrival_time: float,
        queue: RequestOutputCollector | None,
        log_stats: bool,
        stream_interval: int,
        top_p: float | None = None,
        n: int | None = None,
        temperature: float | None = None,
        stream_input: bool = False,
    ):
        self.request_id = request_id
        self.external_req_id = external_req_id
        self.parent_req = parent_req
        self.request_index = request_index
        self.lora_request = lora_request
        self.lora_name = lora_request.lora_name if lora_request is not None else None
        self.output_kind = output_kind
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.prompt_embeds = prompt_embeds
        self.prompt_len = length_from_prompt_token_ids_or_embeds(
            self.prompt_token_ids, self.prompt_embeds
        )
        self.logprobs_processor = logprobs_processor
        self.detokenizer = detokenizer
        self.max_tokens_param = max_tokens_param
        self.top_p = top_p
        self.n = n
        self.temperature = temperature
        self.is_prefilling = True
        self.queue = queue
        self.num_cached_tokens = 0

        self.stats = RequestStateStats(arrival_time=arrival_time) if log_stats else None

        # Stream Interval
        self.stream_interval = stream_interval
        self.sent_tokens_offset = 0  # Offset of sent tokens

        # Streaming input queue
        self.streaming_input = stream_input
        self.input_chunk_queue: deque[StreamingUpdate] | None = (
            deque() if stream_input else None
        )

    def apply_streaming_update(self, update: StreamingUpdate) -> None:
        # Apply the update to the request state.
        self.streaming_input = not update.final
        # TODO also include relevant output tokens in new prompt here
        #     (match scheduler behavior).
        if update.prompt:
            self.prompt = (
                (self.prompt + update.prompt) if self.prompt else update.prompt
            )
        if self.prompt_token_ids:
            self.prompt_token_ids.extend(update.prompt_token_ids or ())
        else:
            self.prompt_token_ids = update.prompt_token_ids or []
        assert self.prompt_token_ids is not None
        self.prompt_len = len(self.prompt_token_ids)
        if self.stats is not None:
            self.stats.arrival_time = update.arrival_time
        self.is_prefilling = True

    @classmethod
    def from_new_request(
        cls,
        tokenizer: TokenizerLike | None,
        request: EngineCoreRequest,
        prompt: str | None,
        parent_req: ParentRequest | None,
        request_index: int,
        queue: RequestOutputCollector | None,
        log_stats: bool,
        stream_interval: int,
    ) -> "RequestState":
        if sampling_params := request.sampling_params:
            if not sampling_params.detokenize:
                tokenizer = None
            output_kind = sampling_params.output_kind
            logprobs_processor = LogprobsProcessor.from_new_request(
                tokenizer=tokenizer,
                request=request,
            )
            detokenizer = IncrementalDetokenizer.from_new_request(
                tokenizer=tokenizer,
                request=request,
            )
            max_tokens_param = sampling_params.max_tokens
            top_p = sampling_params.top_p
            n = sampling_params.n
            temperature = sampling_params.temperature
        else:
            logprobs_processor = None
            detokenizer = None
            max_tokens_param = None
            top_p = None
            n = None
            temperature = None
            assert request.pooling_params is not None
            output_kind = request.pooling_params.output_kind

        assert request.external_req_id is not None
        return cls(
            request_id=request.request_id,
            external_req_id=request.external_req_id,
            parent_req=parent_req,
            request_index=request_index,
            lora_request=request.lora_request,
            output_kind=output_kind,
            prompt=prompt,
            prompt_token_ids=request.prompt_token_ids,
            prompt_embeds=request.prompt_embeds,
            logprobs_processor=logprobs_processor,
            detokenizer=detokenizer,
            max_tokens_param=max_tokens_param,
            top_p=top_p,
            n=n,
            temperature=temperature,
            arrival_time=request.arrival_time,
            queue=queue,
            log_stats=log_stats,
            stream_interval=stream_interval,
            stream_input=request.resumable,
        )

    def make_request_output(
        self,
        new_token_ids: list[int],
        pooling_output: torch.Tensor | None,
        finish_reason: FinishReason | None,
        stop_reason: int | str | None,
        kv_transfer_params: dict[str, Any] | None = None,
        routed_experts: np.ndarray | None = None,
    ) -> RequestOutput | PoolingRequestOutput | None:
        finished = finish_reason is not None
        final_only = self.output_kind == RequestOutputKind.FINAL_ONLY

        if not finished and final_only:
            # Only the final output is required in FINAL_ONLY mode.
            return None

        if self.stream_interval > 1:
            assert self.detokenizer is not None

            # Send a request output only when
            # 1. the request has finished, or
            # 2. it is the first token, or
            # 3. stream_interval tokens have accumulated since the last send
            if not (
                finished
                or self.sent_tokens_offset == 0
                or len(self.detokenizer.output_token_ids) - self.sent_tokens_offset
                >= self.stream_interval
            ):
                return None

            if self.output_kind == RequestOutputKind.DELTA:
                # Send tokens from the offset in DELTA mode, otherwise all
                # tokens are sent.
                new_token_ids = self.detokenizer.output_token_ids[
                    self.sent_tokens_offset :
                ]
                self.sent_tokens_offset = len(self.detokenizer.output_token_ids)

        external_req_id = self.external_req_id

        if pooling_output is not None:
            return self._new_request_output(
                external_req_id,
                [self._new_pooling_output(pooling_output)],
                finished,
            )

        output = self._new_completion_output(
            new_token_ids, finish_reason, stop_reason, routed_experts
        )

        if self.parent_req is None:
            outputs = [output]
        else:
            outputs, finished = self.parent_req.get_outputs(self.request_id, output)
            if not outputs:
                return None
            external_req_id = self.parent_req.external_req_id

        return self._new_request_output(
            external_req_id, outputs, finished, kv_transfer_params
        )

    def _new_request_output(
        self,
        external_req_id: str,
        outputs: list[CompletionOutput] | list[PoolingOutput],
        finished: bool,
        kv_transfer_params: dict[str, Any] | None = None,
    ) -> RequestOutput | PoolingRequestOutput:
        first_output = outputs[0]
        if isinstance(first_output, PoolingOutput):
            assert len(outputs) == 1
            # Prompt embeddings are currently not supported by pooling requests.
            assert self.prompt_token_ids is not None
            return PoolingRequestOutput(
                request_id=external_req_id,
                outputs=first_output,
                num_cached_tokens=self.num_cached_tokens,
                prompt_token_ids=self.prompt_token_ids,
                finished=finished,
            )
        assert self.logprobs_processor is not None
        if self.output_kind == RequestOutputKind.DELTA:
            # Side effect: logprobs processor forgets prompt logprobs
            prompt_logprobs = self.logprobs_processor.pop_prompt_logprobs()
        else:
            prompt_logprobs = self.logprobs_processor.prompt_logprobs

        # If prompt embeds were used, put placeholder prompt token ids
        prompt_token_ids = self.prompt_token_ids
        if prompt_token_ids is None and self.prompt_embeds is not None:
            prompt_token_ids = [0] * len(self.prompt_embeds)

        return RequestOutput(
            request_id=external_req_id,  # request_id is what was provided externally
            lora_request=self.lora_request,
            prompt=self.prompt,
            prompt_token_ids=prompt_token_ids,
            prompt_logprobs=prompt_logprobs,
            outputs=cast(list[CompletionOutput], outputs),
            finished=finished,
            kv_transfer_params=kv_transfer_params,
            num_cached_tokens=self.num_cached_tokens,
            metrics=self.stats,
        )

    def _new_completion_output(
        self,
        token_ids: list[int],
        finish_reason: FinishReason | None,
        stop_reason: int | str | None,
        routed_experts: np.ndarray | None = None,
    ) -> CompletionOutput:
        assert self.detokenizer is not None
        assert self.logprobs_processor is not None
        finished = finish_reason is not None
        delta = self.output_kind == RequestOutputKind.DELTA

        # Prepare text and token_ids, based on delta mode
        text = self.detokenizer.get_next_output_text(finished, delta)
        if not delta:
            token_ids = self.detokenizer.output_token_ids

        # Prepare logprobs, based on delta mode
        logprobs = self.logprobs_processor.logprobs
        if delta and logprobs:
            logprobs = logprobs[-len(token_ids) :]

        return CompletionOutput(
            index=self.request_index,
            text=text,
            token_ids=token_ids,
            routed_experts=routed_experts,
            logprobs=logprobs,
            cumulative_logprob=self.logprobs_processor.cumulative_logprob,
            finish_reason=str(finish_reason) if finished else None,
            stop_reason=stop_reason if finished else None,
        )

    def _new_pooling_output(self, pooling_output: torch.Tensor) -> PoolingOutput:
        return PoolingOutput(data=pooling_output)

detokenizer instance-attribute

detokenizer = detokenizer

external_req_id instance-attribute

external_req_id = external_req_id

input_chunk_queue instance-attribute

input_chunk_queue: deque[StreamingUpdate] | None = (
    deque() if stream_input else None
)

is_prefilling instance-attribute

is_prefilling = True

logprobs_processor instance-attribute

logprobs_processor = logprobs_processor

lora_name instance-attribute

lora_name = (
    lora_request.lora_name if lora_request is not None else None
)

lora_request instance-attribute

lora_request = lora_request

max_tokens_param instance-attribute

max_tokens_param = max_tokens_param

n instance-attribute

n = n

num_cached_tokens instance-attribute

num_cached_tokens = 0

output_kind instance-attribute

output_kind = output_kind

parent_req instance-attribute

parent_req = parent_req

prompt instance-attribute

prompt = prompt

prompt_embeds instance-attribute

prompt_embeds = prompt_embeds

prompt_len instance-attribute

prompt_len = length_from_prompt_token_ids_or_embeds(
    prompt_token_ids, prompt_embeds
)

prompt_token_ids instance-attribute

prompt_token_ids = prompt_token_ids

queue instance-attribute

queue = queue

request_id instance-attribute

request_id = request_id

request_index instance-attribute

request_index = request_index

sent_tokens_offset instance-attribute

sent_tokens_offset = 0

stats instance-attribute

stats = (
    RequestStateStats(arrival_time=arrival_time)
    if log_stats
    else None
)

stream_interval instance-attribute

stream_interval = stream_interval

streaming_input instance-attribute

streaming_input = stream_input

temperature instance-attribute

temperature = temperature

top_p instance-attribute

top_p = top_p

__init__

__init__(
    request_id: str,
    external_req_id: str,
    parent_req: ParentRequest | None,
    request_index: int,
    lora_request: LoRARequest | None,
    output_kind: RequestOutputKind,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_embeds: Tensor | None,
    logprobs_processor: LogprobsProcessor | None,
    detokenizer: IncrementalDetokenizer | None,
    max_tokens_param: int | None,
    arrival_time: float,
    queue: RequestOutputCollector | None,
    log_stats: bool,
    stream_interval: int,
    top_p: float | None = None,
    n: int | None = None,
    temperature: float | None = None,
    stream_input: bool = False,
)
Source code in vllm/v1/engine/output_processor.py
def __init__(
    self,
    request_id: str,
    external_req_id: str,
    parent_req: ParentRequest | None,
    request_index: int,
    lora_request: LoRARequest | None,
    output_kind: RequestOutputKind,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_embeds: torch.Tensor | None,
    logprobs_processor: LogprobsProcessor | None,
    detokenizer: IncrementalDetokenizer | None,
    max_tokens_param: int | None,
    arrival_time: float,
    queue: RequestOutputCollector | None,
    log_stats: bool,
    stream_interval: int,
    top_p: float | None = None,
    n: int | None = None,
    temperature: float | None = None,
    stream_input: bool = False,
):
    self.request_id = request_id
    self.external_req_id = external_req_id
    self.parent_req = parent_req
    self.request_index = request_index
    self.lora_request = lora_request
    self.lora_name = lora_request.lora_name if lora_request is not None else None
    self.output_kind = output_kind
    self.prompt = prompt
    self.prompt_token_ids = prompt_token_ids
    self.prompt_embeds = prompt_embeds
    self.prompt_len = length_from_prompt_token_ids_or_embeds(
        self.prompt_token_ids, self.prompt_embeds
    )
    self.logprobs_processor = logprobs_processor
    self.detokenizer = detokenizer
    self.max_tokens_param = max_tokens_param
    self.top_p = top_p
    self.n = n
    self.temperature = temperature
    self.is_prefilling = True
    self.queue = queue
    self.num_cached_tokens = 0

    self.stats = RequestStateStats(arrival_time=arrival_time) if log_stats else None

    # Stream Interval
    self.stream_interval = stream_interval
    self.sent_tokens_offset = 0  # Number of output tokens already sent

    # Streaming input queue
    self.streaming_input = stream_input
    self.input_chunk_queue: deque[StreamingUpdate] | None = (
        deque() if stream_input else None
    )

_new_completion_output

_new_completion_output(
    token_ids: list[int],
    finish_reason: FinishReason | None,
    stop_reason: int | str | None,
    routed_experts: ndarray | None = None,
) -> CompletionOutput
Source code in vllm/v1/engine/output_processor.py
def _new_completion_output(
    self,
    token_ids: list[int],
    finish_reason: FinishReason | None,
    stop_reason: int | str | None,
    routed_experts: np.ndarray | None = None,
) -> CompletionOutput:
    assert self.detokenizer is not None
    assert self.logprobs_processor is not None
    finished = finish_reason is not None
    delta = self.output_kind == RequestOutputKind.DELTA

    # Prepare text and token_ids, based on delta mode
    text = self.detokenizer.get_next_output_text(finished, delta)
    if not delta:
        token_ids = self.detokenizer.output_token_ids

    # Prepare logprobs, based on delta mode
    logprobs = self.logprobs_processor.logprobs
    if delta and logprobs:
        logprobs = logprobs[-len(token_ids) :]

    return CompletionOutput(
        index=self.request_index,
        text=text,
        token_ids=token_ids,
        routed_experts=routed_experts,
        logprobs=logprobs,
        cumulative_logprob=self.logprobs_processor.cumulative_logprob,
        finish_reason=str(finish_reason) if finished else None,
        stop_reason=stop_reason if finished else None,
    )
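
In DELTA mode only the tokens generated since the previous output are returned, and the logprobs list is sliced from the tail so it stays aligned with those token ids; in cumulative mode the detokenizer's full output token list is used instead. A minimal standalone sketch of that slicing, with plain lists standing in for the detokenizer and logprobs-processor state (all values hypothetical):

# Hypothetical state: everything generated so far vs. this step's delta.
all_token_ids = [11, 42, 7, 99]
all_logprobs = [-0.1, -0.5, -0.3, -0.9]
new_token_ids = [7, 99]

delta = True  # i.e. output_kind == RequestOutputKind.DELTA
token_ids = new_token_ids if delta else all_token_ids
logprobs = all_logprobs[-len(token_ids):] if delta and all_logprobs else all_logprobs

assert token_ids == [7, 99]
assert logprobs == [-0.3, -0.9]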

_new_pooling_output

_new_pooling_output(
    pooling_output: Tensor,
) -> PoolingOutput
Source code in vllm/v1/engine/output_processor.py
def _new_pooling_output(self, pooling_output: torch.Tensor) -> PoolingOutput:
    return PoolingOutput(data=pooling_output)

_new_request_output

_new_request_output(
    external_req_id: str,
    outputs: list[CompletionOutput] | list[PoolingOutput],
    finished: bool,
    kv_transfer_params: dict[str, Any] | None = None,
) -> RequestOutput | PoolingRequestOutput
Source code in vllm/v1/engine/output_processor.py
def _new_request_output(
    self,
    external_req_id: str,
    outputs: list[CompletionOutput] | list[PoolingOutput],
    finished: bool,
    kv_transfer_params: dict[str, Any] | None = None,
) -> RequestOutput | PoolingRequestOutput:
    first_output = outputs[0]
    if isinstance(first_output, PoolingOutput):
        assert len(outputs) == 1
        # Prompt embeddings are currently not supported by pooling requests.
        assert self.prompt_token_ids is not None
        return PoolingRequestOutput(
            request_id=external_req_id,
            outputs=first_output,
            num_cached_tokens=self.num_cached_tokens,
            prompt_token_ids=self.prompt_token_ids,
            finished=finished,
        )
    assert self.logprobs_processor is not None
    if self.output_kind == RequestOutputKind.DELTA:
        # Side effect: logprobs processor forgets prompt logprobs
        prompt_logprobs = self.logprobs_processor.pop_prompt_logprobs()
    else:
        prompt_logprobs = self.logprobs_processor.prompt_logprobs

    # If prompt embeds were used, put placeholder prompt token ids
    prompt_token_ids = self.prompt_token_ids
    if prompt_token_ids is None and self.prompt_embeds is not None:
        prompt_token_ids = [0] * len(self.prompt_embeds)

    return RequestOutput(
        request_id=external_req_id,  # request_id is what was provided externally
        lora_request=self.lora_request,
        prompt=self.prompt,
        prompt_token_ids=prompt_token_ids,
        prompt_logprobs=prompt_logprobs,
        outputs=cast(list[CompletionOutput], outputs),
        finished=finished,
        kv_transfer_params=kv_transfer_params,
        num_cached_tokens=self.num_cached_tokens,
        metrics=self.stats,
    )
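
When a request was submitted with prompt embeddings instead of token ids, there are no real prompt token ids to report, so the output carries a placeholder list of zeros with one entry per embedded position. A short sketch of that substitution (the tensor shape here is hypothetical):

import torch

prompt_token_ids = None
prompt_embeds = torch.zeros(5, 16)  # hypothetical: 5 prompt positions, dim 16

if prompt_token_ids is None and prompt_embeds is not None:
    # len() of a 2-D tensor is its first dimension: one placeholder id
    # per prompt position.
    prompt_token_ids = [0] * len(prompt_embeds)

assert prompt_token_ids == [0, 0, 0, 0, 0]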

apply_streaming_update

apply_streaming_update(update: StreamingUpdate) -> None
Source code in vllm/v1/engine/output_processor.py
def apply_streaming_update(self, update: StreamingUpdate) -> None:
    # Apply the update to the request state.
    self.streaming_input = not update.final
    # TODO also include relevant output tokens in new prompt here
    #     (match scheduler behavior).
    if update.prompt:
        self.prompt = (
            (self.prompt + update.prompt) if self.prompt else update.prompt
        )
    if self.prompt_token_ids:
        self.prompt_token_ids.extend(update.prompt_token_ids or ())
    else:
        self.prompt_token_ids = update.prompt_token_ids or []
    assert self.prompt_token_ids is not None
    self.prompt_len = len(self.prompt_token_ids)
    if self.stats is not None:
        self.stats.arrival_time = update.arrival_time
    self.is_prefilling = True
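
The update mutates the request state in place: prompt text and token ids are appended, the prompt length is recomputed, and the request re-enters prefill. The accumulation itself reduces to ordinary string and list concatenation, as in this self-contained sketch (values hypothetical):

prompt: str | None = "Hello"
prompt_token_ids: list[int] | None = [15496]

update_prompt = ", world"      # hypothetical incremental chunk
update_token_ids = [11, 995]   # hypothetical token ids for that chunk

prompt = (prompt + update_prompt) if prompt else update_prompt
if prompt_token_ids:
    prompt_token_ids.extend(update_token_ids or ())
else:
    prompt_token_ids = update_token_ids or []

assert prompt == "Hello, world"
assert prompt_token_ids == [15496, 11, 995]
assert len(prompt_token_ids) == 3  # becomes the new prompt_len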

from_new_request classmethod

from_new_request(
    tokenizer: TokenizerLike | None,
    request: EngineCoreRequest,
    prompt: str | None,
    parent_req: ParentRequest | None,
    request_index: int,
    queue: RequestOutputCollector | None,
    log_stats: bool,
    stream_interval: int,
) -> RequestState
Source code in vllm/v1/engine/output_processor.py
@classmethod
def from_new_request(
    cls,
    tokenizer: TokenizerLike | None,
    request: EngineCoreRequest,
    prompt: str | None,
    parent_req: ParentRequest | None,
    request_index: int,
    queue: RequestOutputCollector | None,
    log_stats: bool,
    stream_interval: int,
) -> "RequestState":
    if sampling_params := request.sampling_params:
        if not sampling_params.detokenize:
            tokenizer = None
        output_kind = sampling_params.output_kind
        logprobs_processor = LogprobsProcessor.from_new_request(
            tokenizer=tokenizer,
            request=request,
        )
        detokenizer = IncrementalDetokenizer.from_new_request(
            tokenizer=tokenizer,
            request=request,
        )
        max_tokens_param = sampling_params.max_tokens
        top_p = sampling_params.top_p
        n = sampling_params.n
        temperature = sampling_params.temperature
    else:
        logprobs_processor = None
        detokenizer = None
        max_tokens_param = None
        top_p = None
        n = None
        temperature = None
        assert request.pooling_params is not None
        output_kind = request.pooling_params.output_kind

    assert request.external_req_id is not None
    return cls(
        request_id=request.request_id,
        external_req_id=request.external_req_id,
        parent_req=parent_req,
        request_index=request_index,
        lora_request=request.lora_request,
        output_kind=output_kind,
        prompt=prompt,
        prompt_token_ids=request.prompt_token_ids,
        prompt_embeds=request.prompt_embeds,
        logprobs_processor=logprobs_processor,
        detokenizer=detokenizer,
        max_tokens_param=max_tokens_param,
        top_p=top_p,
        n=n,
        temperature=temperature,
        arrival_time=request.arrival_time,
        queue=queue,
        log_stats=log_stats,
        stream_interval=stream_interval,
        stream_input=request.resumable,
    )
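
The factory branches on the request type: sampling requests get a logprobs processor and an incremental detokenizer (with the tokenizer dropped when detokenize=False), while pooling requests get neither and take their output kind from the pooling params. A toy version of just the tokenizer-handling part of that dispatch, using stand-in types rather than the real vLLM classes (all names hypothetical):

from dataclasses import dataclass

@dataclass
class FakeSamplingParams:  # hypothetical stand-in for SamplingParams
    detokenize: bool = True

def tokenizer_for_detokenizer(
    sampling_params: FakeSamplingParams | None,
    tokenizer: object | None,
) -> object | None:
    # Pooling requests (no sampling params) build no detokenizer at all;
    # sampling requests may opt out of detokenization via detokenize=False.
    if sampling_params is None or not sampling_params.detokenize:
        return None
    return tokenizer

tok = object()  # placeholder tokenizer
assert tokenizer_for_detokenizer(FakeSamplingParams(), tok) is tok
assert tokenizer_for_detokenizer(FakeSamplingParams(detokenize=False), tok) is None
assert tokenizer_for_detokenizer(None, tok) is None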

make_request_output

make_request_output(
    new_token_ids: list[int],
    pooling_output: Tensor | None,
    finish_reason: FinishReason | None,
    stop_reason: int | str | None,
    kv_transfer_params: dict[str, Any] | None = None,
    routed_experts: ndarray | None = None,
) -> RequestOutput | PoolingRequestOutput | None
Source code in vllm/v1/engine/output_processor.py
def make_request_output(
    self,
    new_token_ids: list[int],
    pooling_output: torch.Tensor | None,
    finish_reason: FinishReason | None,
    stop_reason: int | str | None,
    kv_transfer_params: dict[str, Any] | None = None,
    routed_experts: np.ndarray | None = None,
) -> RequestOutput | PoolingRequestOutput | None:
    finished = finish_reason is not None
    final_only = self.output_kind == RequestOutputKind.FINAL_ONLY

    if not finished and final_only:
        # Only the final output is required in FINAL_ONLY mode.
        return None

    if self.stream_interval > 1:
        assert self.detokenizer is not None

        # Only send an output when:
        # 1. The request has finished, or
        # 2. This is the first token, or
        # 3. At least stream_interval new tokens have accumulated
        if not (
            finished
            or self.sent_tokens_offset == 0
            or len(self.detokenizer.output_token_ids) - self.sent_tokens_offset
            >= self.stream_interval
        ):
            return None

        if self.output_kind == RequestOutputKind.DELTA:
            # Send tokens from the offset in DELTA mode, otherwise all
            # tokens are sent.
            new_token_ids = self.detokenizer.output_token_ids[
                self.sent_tokens_offset :
            ]
            self.sent_tokens_offset = len(self.detokenizer.output_token_ids)

    external_req_id = self.external_req_id

    if pooling_output is not None:
        return self._new_request_output(
            external_req_id,
            [self._new_pooling_output(pooling_output)],
            finished,
        )

    output = self._new_completion_output(
        new_token_ids, finish_reason, stop_reason, routed_experts
    )

    if self.parent_req is None:
        outputs = [output]
    else:
        outputs, finished = self.parent_req.get_outputs(self.request_id, output)
        if not outputs:
            return None
        external_req_id = self.parent_req.external_req_id

    return self._new_request_output(
        external_req_id, outputs, finished, kv_transfer_params
    )
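
When stream_interval is greater than 1, outputs are throttled: one is emitted for the first token, then only once stream_interval new tokens have accumulated since the last send, and always on finish. The gate in isolation, as a runnable sketch (the function name is hypothetical):

def should_emit(total_tokens: int, sent_offset: int,
                stream_interval: int, finished: bool) -> bool:
    # Mirrors the condition guarding the early return above.
    return (
        finished
        or sent_offset == 0
        or total_tokens - sent_offset >= stream_interval
    )

# With stream_interval=4: emit on the first token, then every 4th,
# and unconditionally on the final output.
assert should_emit(1, 0, 4, finished=False)      # first token
assert not should_emit(3, 1, 4, finished=False)  # only 2 new tokens
assert should_emit(5, 1, 4, finished=False)      # 4 new tokens ready
assert should_emit(6, 5, 4, finished=True)       # finished always emits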

StreamingUpdate dataclass

Streaming input update data for output processor.

Contains the incremental prompt data to be applied to a request state when the current sub-request completes.

Source code in vllm/v1/engine/output_processor.py
@dataclass
class StreamingUpdate:
    """Streaming input update data for output processor.

    Contains the incremental prompt data to be applied to a request state
    when the current sub-request completes.
    """

    prompt: str | None
    prompt_token_ids: list[int] | None
    arrival_time: float
    final: bool = False

arrival_time instance-attribute

arrival_time: float

final class-attribute instance-attribute

final: bool = False

prompt instance-attribute

prompt: str | None

prompt_token_ids instance-attribute

prompt_token_ids: list[int] | None

__init__

__init__(
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    arrival_time: float,
    final: bool = False,
) -> None
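
Since StreamingUpdate is a plain dataclass, queueing an incremental prompt chunk for a resumable request is just construction; final=True marks the last chunk. A brief usage sketch (the prompt text and token ids are hypothetical):

import time
from collections import deque

from vllm.v1.engine.output_processor import StreamingUpdate

updates: deque[StreamingUpdate] = deque()
updates.append(StreamingUpdate(
    prompt=" and then",           # hypothetical incremental prompt text
    prompt_token_ids=[290, 788],  # hypothetical ids for that text
    arrival_time=time.time(),
))
updates.append(StreamingUpdate(
    prompt=None,
    prompt_token_ids=None,
    arrival_time=time.time(),
    final=True,                   # no more chunks after this one
))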