vllm.multimodal.utils ¶

logger `module-attribute` ¶

logger = init_logger(__name__)

__getattr__ ¶

__getattr__(name: str)

Source code in vllm/multimodal/utils.py

def __getattr__(name: str):
    """Resolve deprecated module attributes on access.

    Only `MEDIA_CONNECTOR_REGISTRY` is handled: it is imported from `.media`,
    a `DeprecationWarning` is emitted, and the imported object is returned.

    Raises:
        AttributeError: For any other attribute name.
    """
    # Guard clause: every name except the relocated one is simply unknown.
    if name != "MEDIA_CONNECTOR_REGISTRY":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported inside the function so it only runs when the old name is used.
    from .media import MEDIA_CONNECTOR_REGISTRY

    message = (
        "`vllm.multimodal.utils.MEDIA_CONNECTOR_REGISTRY` "
        "has been moved to `vllm.multimodal.media.MEDIA_CONNECTOR_REGISTRY`. "
        "The old name will be removed in v0.17."
    )
    # stacklevel=2 attributes the warning to the code that accessed the name.
    warnings.warn(message, DeprecationWarning, stacklevel=2)

    return MEDIA_CONNECTOR_REGISTRY

argsort_mm_positions ¶

argsort_mm_positions(
    mm_positions: MultiModalPlaceholderDict,
) -> list[tuple[str, int]]

Given a MultiModalPlaceholderDict, output a sequence of keys to sort the dictionary by offset (starting index in the input sequence) in ascending order.

Returns:

Type	Description
`list[tuple[str, int]]`	A list of `(modality, idx)`, which can be used to access an item by `mm_positions[modality][idx]`.

Source code in vllm/multimodal/utils.py

def argsort_mm_positions(
    mm_positions: MultiModalPlaceholderDict,
) -> list[tuple[str, int]]:
    """
    Compute the order in which to visit the items of a
    `MultiModalPlaceholderDict` so that their `offset` (starting index in
    the input sequence) is ascending.

    Items that share the same `offset` keep the order in which they are
    encountered while iterating `mm_positions`, because the sort is stable.

    Returns:
        A list of `(modality, idx)`, which can be used to access an item
        by `mm_positions[modality][idx]`.
    """
    # Flatten to (offset, modality, idx) so the item itself is not carried
    # around after its offset has been read.
    keyed_entries = [
        (item.offset, modality, idx)
        for modality, items in mm_positions.items()
        for idx, item in enumerate(items)
    ]

    # Sort on the offset alone; comparing the whole tuple would reorder ties.
    keyed_entries.sort(key=lambda entry: entry[0])

    return [(modality, idx) for _, modality, idx in keyed_entries]

encode_audio_base64 ¶

encode_audio_base64(
    audio: ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str

Encode audio as base64.

Source code in vllm/multimodal/utils.py

def encode_audio_base64(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str:
    """Encode audio as base64.

    Args:
        audio: Audio data to encode.
        sampling_rate: Sampling rate paired with `audio`.
        format: Forwarded to `AudioMediaIO.encode_base64` as `audio_format`.

    Returns:
        The string returned by `AudioMediaIO.encode_base64`.
    """
    # The encoder takes the audio data and its sampling rate as a single pair.
    media = (audio, sampling_rate)
    return AudioMediaIO().encode_base64(media, audio_format=format)

encode_audio_url ¶

encode_audio_url(
    audio: ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str

Encode audio as a data URL.

Source code in vllm/multimodal/utils.py

def encode_audio_url(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str:
    """Encode audio as a data URL.

    The payload comes from `encode_audio_base64`. The media type is looked up
    in `mimetypes.types_map` by the lower-cased `format` used as a file
    extension, and falls back to the literal `"audio"` when it is not found.

    Args:
        audio: Audio data to encode.
        sampling_rate: Sampling rate paired with `audio`.
        format: Audio format name; also used to pick the media type.

    Returns:
        A string of the form `data:<mimetype>;base64,<payload>`.
    """
    payload = encode_audio_base64(audio, sampling_rate, format=format)

    extension = "." + format.lower()
    media_type = mimetypes.types_map.get(extension, "audio")

    return f"data:{media_type};base64,{payload}"

encode_image_base64 ¶

encode_image_base64(
    image: Image,
    *,
    image_mode: str = "RGB",
    format: str | None = None,
) -> str

Encode a pillow image to base64 format.

By default, the image is converted into RGB format before being encoded.

Source code in vllm/multimodal/utils.py

def encode_image_base64(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str | None = None,
) -> str:
    """
    Encode a pillow image to base64 format.

    By default, the image is converted into RGB format before being encoded.

    Args:
        image: The pillow image to encode.
        image_mode: Passed to `ImageMediaIO` as `image_mode`.
        format: Forwarded to `ImageMediaIO.encode_base64` as `image_format`.

    Returns:
        The string returned by `ImageMediaIO.encode_base64`.
    """
    return ImageMediaIO(image_mode=image_mode).encode_base64(
        image,
        image_format=format,
    )

encode_image_url ¶

encode_image_url(
    image: Image,
    *,
    image_mode: str = "RGB",
    format: str = "PNG",
) -> str

Encode a pillow image as a data URL.

By default, the image is converted into RGB format before being encoded.

Source code in vllm/multimodal/utils.py

def encode_image_url(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "PNG",
) -> str:
    """
    Encode a pillow image as a data URL.

    By default, the image is converted into RGB format before being encoded.

    The media type is looked up in `mimetypes.types_map` by the lower-cased
    `format` used as a file extension, and falls back to the literal
    `"image"` when it is not found.

    Args:
        image: The pillow image to encode.
        image_mode: Forwarded to `encode_image_base64`.
        format: Image format name; also used to pick the media type.

    Returns:
        A string of the form `data:<mimetype>;base64,<payload>`.
    """
    payload = encode_image_base64(image, image_mode=image_mode, format=format)

    extension = "." + format.lower()
    media_type = mimetypes.types_map.get(extension, "image")

    return f"data:{media_type};base64,{payload}"

encode_video_base64 ¶

encode_video_base64(
    frames: NDArray, *, format: str = "JPEG"
) -> str

Source code in vllm/multimodal/utils.py

def encode_video_base64(
    frames: npt.NDArray,
    *,
    format: str = "JPEG",
) -> str:
    """Encode video frames as base64.

    Args:
        frames: Array of video frames to encode.
        format: Forwarded to `VideoMediaIO.encode_base64` as `video_format`.

    Returns:
        The string returned by `VideoMediaIO.encode_base64`.
    """
    # The video encoder wraps an image IO built with its default settings.
    encoder = VideoMediaIO(ImageMediaIO())
    return encoder.encode_base64(frames, video_format=format)

encode_video_url ¶

encode_video_url(
    frames: NDArray, *, format: str = "JPEG"
) -> str

Source code in vllm/multimodal/utils.py

def encode_video_url(
    frames: npt.NDArray,
    *,
    format: str = "JPEG",
) -> str:
    """Encode video frames as a data URL.

    The payload comes from `encode_video_base64`. For `format` equal to
    `"jpeg"` (case-insensitive) the media type is fixed to `video/jpeg`;
    otherwise it is looked up in `mimetypes.types_map` by the lower-cased
    `format` used as a file extension, falling back to the literal `"video"`.

    Args:
        frames: Array of video frames to encode.
        format: Video format name; also used to pick the media type.

    Returns:
        A string of the form `data:<mimetype>;base64,<payload>`.
    """
    payload = encode_video_base64(frames, format=format)

    suffix = format.lower()
    # Special-cased because the stdlib table maps ".jpeg" to an image type.
    media_type = (
        "video/jpeg"
        if suffix == "jpeg"
        else mimetypes.types_map.get("." + suffix, "video")
    )

    return f"data:{media_type};base64,{payload}"

fetch_audio ¶

fetch_audio(
    audio_url: str,
    audio_io_kwargs: dict[str, Any] | None = None,
) -> tuple[ndarray, int | float]

Parameters:

Name	Type	Description	Default
`audio_url`	`str`	URL of the audio file to fetch.	required
`audio_io_kwargs`	`dict[str, Any] \| None`	Additional kwargs passed to handle audio IO.	`None`

Warning

This method has direct access to local files and is only intended to be called by user code. Never call this from the online server!

Source code in vllm/multimodal/utils.py

def fetch_audio(
    audio_url: str,
    audio_io_kwargs: dict[str, Any] | None = None,
) -> tuple[np.ndarray, int | float]:
    """
    Fetch audio from a URL through a `MediaConnector`.

    Args:
        audio_url: URL of the audio file to fetch.
        audio_io_kwargs: Additional kwargs passed to handle audio IO.

    Returns:
        Whatever `MediaConnector.fetch_audio` returns for `audio_url`
        (presumably the audio data and its sampling rate - confirm against
        `MediaConnector`).

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    """
    # An empty or missing kwargs dict is passed on as "no overrides" (None).
    io_overrides = {"audio": audio_io_kwargs} if audio_io_kwargs else None

    # The allowed local media path is "/", which is why the warning above
    # applies.
    connector = MediaConnector(
        media_io_kwargs=io_overrides,
        allowed_local_media_path="/",
    )
    return connector.fetch_audio(audio_url)

fetch_image ¶

fetch_image(
    image_url: str,
    image_io_kwargs: dict[str, Any] | None = None,
) -> Image

Parameters:

Name	Type	Description	Default
`image_url`	`str`	URL of the image file to fetch.	required
`image_io_kwargs`	`dict[str, Any] \| None`	Additional kwargs passed to handle image IO.	`None`

Warning

This method has direct access to local files and is only intended to be called by user code. Never call this from the online server!

Source code in vllm/multimodal/utils.py

def fetch_image(
    image_url: str,
    image_io_kwargs: dict[str, Any] | None = None,
) -> Image.Image:
    """
    Fetch an image from a URL through a `MediaConnector`.

    Args:
        image_url: URL of the image file to fetch.
        image_io_kwargs: Additional kwargs passed to handle image IO.

    Returns:
        Whatever `MediaConnector.fetch_image` returns for `image_url`.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    """
    # An empty or missing kwargs dict is passed on as "no overrides" (None).
    io_overrides = {"image": image_io_kwargs} if image_io_kwargs else None

    # The allowed local media path is "/", which is why the warning above
    # applies.
    connector = MediaConnector(
        media_io_kwargs=io_overrides,
        allowed_local_media_path="/",
    )
    return connector.fetch_image(image_url)

fetch_video ¶

fetch_video(
    video_url: str,
    video_io_kwargs: dict[str, Any] | None = None,
) -> tuple[NDArray, dict[str, Any]]

Parameters:

Name	Type	Description	Default
`video_url`	`str`	URL of the video file to fetch.	required
`video_io_kwargs`	`dict[str, Any] \| None`	Additional kwargs passed to handle video IO.	`None`

Warning

This method has direct access to local files and is only intended to be called by user code. Never call this from the online server!

Source code in vllm/multimodal/utils.py

def fetch_video(
    video_url: str,
    video_io_kwargs: dict[str, Any] | None = None,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """
    Fetch a video from a URL through a `MediaConnector`.

    Args:
        video_url: URL of the video file to fetch.
        video_io_kwargs: Additional kwargs passed to handle video IO.

    Returns:
        Whatever `MediaConnector.fetch_video` returns for `video_url`
        (presumably the frames and a metadata dict - confirm against
        `MediaConnector`).

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    """
    # An empty or missing kwargs dict is passed on as "no overrides" (None).
    io_overrides = {"video": video_io_kwargs} if video_io_kwargs else None

    # The allowed local media path is "/", which is why the warning above
    # applies.
    connector = MediaConnector(
        media_io_kwargs=io_overrides,
        allowed_local_media_path="/",
    )
    return connector.fetch_video(video_url)

group_mm_kwargs_by_modality ¶

group_mm_kwargs_by_modality(
    mm_kwargs: list[tuple[str, MultiModalKwargsItem]],
    *,
    device: Device = None,
    pin_memory: bool = False,
) -> Generator[
    tuple[str, int, BatchedTensorInputs], None, None
]

Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same modality together into a single batch of tensor inputs (`BatchedTensorInputs`).

Parameters:

Name	Type	Description	Default
`mm_kwargs`	`list[tuple[str, MultiModalKwargsItem]]`	List of `(modality, MultiModalKwargsItem)` pairs.	required
`device`	`Device`	The device to place the grouped tensors on.	`None`
`pin_memory`	`bool`	Whether to pin memory for faster host-to-device transfer.	`False`

Yields:

Type	Description
`tuple[str, int, BatchedTensorInputs]`	A tuple `(modality, num_items, grouped_kwargs)`.

Source code in vllm/multimodal/utils.py

def group_mm_kwargs_by_modality(
    mm_kwargs: list[tuple[str, MultiModalKwargsItem]],
    *,
    device: torch.types.Device = None,
    pin_memory: bool = False,
) -> Generator[tuple[str, int, BatchedTensorInputs], None, None]:
    """Batch runs of adjacent same-modality items from `mm_kwargs`.

    Only *consecutive* entries sharing a modality are merged; a modality that
    reappears later in the list starts a new group. Each group is wrapped in
    a `MultiModalKwargsItems` and converted with its `get_data` method.

    Args:
        mm_kwargs: List of `(modality, MultiModalKwargsItem)` pairs.
        device: The device to place the grouped tensors on.
        pin_memory: Whether to pin memory for faster host-to-device transfer.

    Yields:
        A tuple `(modality, num_items, grouped_kwargs)`.
    """
    runs = groupby(mm_kwargs, key=lambda pair: pair[0])

    for modality, run in runs:
        # Drop the modality tag; it is already the group key.
        run_items = [item for _modality, item in run]

        batched = MultiModalKwargsItems({modality: run_items}).get_data(
            device=device,
            pin_memory=pin_memory,
        )

        yield modality, len(run_items), batched

vllm.multimodal.utils ¶

logger module-attribute ¶

__getattr__ ¶

argsort_mm_positions ¶

encode_audio_base64 ¶

encode_audio_url ¶

encode_image_base64 ¶

encode_image_url ¶

encode_video_base64 ¶

encode_video_url ¶

fetch_audio ¶

fetch_image ¶

fetch_video ¶

group_mm_kwargs_by_modality ¶

logger `module-attribute` ¶

__getattr__ ¶