Skip to content

vllm.multimodal.media

Modules:

Name Description
audio
base
connector
image
video

MEDIA_CONNECTOR_REGISTRY module-attribute

MEDIA_CONNECTOR_REGISTRY = ExtensionManager()

VIDEO_LOADER_REGISTRY module-attribute

VIDEO_LOADER_REGISTRY = ExtensionManager()

__all__ module-attribute

__all__ = [
    "MediaIO",
    "MediaWithBytes",
    "AudioEmbeddingMediaIO",
    "AudioMediaIO",
    "ImageEmbeddingMediaIO",
    "ImageMediaIO",
    "VIDEO_LOADER_REGISTRY",
    "VideoMediaIO",
    "MEDIA_CONNECTOR_REGISTRY",
    "MediaConnector",
]

AudioEmbeddingMediaIO

Bases: MediaIO[Tensor]

Source code in vllm/multimodal/media/audio.py
class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load and encode audio embeddings represented as `torch.Tensor`s."""

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize `data` with `torch.load` and return a dense tensor."""
        # Sparse tensor integrity checks stay enabled for the whole load so
        # that maliciously crafted tensors cannot cause out-of-bounds writes.
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(BytesIO(data), weights_only=True)
            return loaded.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Decode strictly validated base64 and load it via `load_bytes`.

        `media_type` is accepted for interface compatibility but not used.
        """
        raw = pybase64.b64decode(data, validate=True)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Load a serialized tensor from `filepath` and return it dense."""
        # Same sparse-invariant protection as `load_bytes`.
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(filepath, weights_only=True)
            return loaded.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        """Return `media` encoded as a base64 string via `tensor2base64`."""
        encoded = tensor2base64(media)
        return encoded

__init__

__init__() -> None
Source code in vllm/multimodal/media/audio.py
def __init__(self) -> None:
    """Initialize the IO object; only the base-class initializer runs."""
    super().__init__()

encode_base64

encode_base64(media: Tensor) -> str
Source code in vllm/multimodal/media/audio.py
def encode_base64(self, media: torch.Tensor) -> str:
    """Return `media` encoded as a base64 string via `tensor2base64`."""
    encoded = tensor2base64(media)
    return encoded

load_base64

load_base64(media_type: str, data: str) -> Tensor
Source code in vllm/multimodal/media/audio.py
def load_base64(self, media_type: str, data: str) -> torch.Tensor:
    """Decode strictly validated base64 and load it via `load_bytes`.

    `media_type` is accepted for interface compatibility but not used.
    """
    raw = pybase64.b64decode(data, validate=True)
    return self.load_bytes(raw)

load_bytes

load_bytes(data: bytes) -> Tensor
Source code in vllm/multimodal/media/audio.py
def load_bytes(self, data: bytes) -> torch.Tensor:
    """Deserialize `data` with `torch.load` and return a dense tensor."""
    # Sparse tensor integrity checks stay enabled for the whole load so
    # that maliciously crafted tensors cannot cause out-of-bounds writes.
    with torch.sparse.check_sparse_tensor_invariants():
        loaded = torch.load(BytesIO(data), weights_only=True)
        dense = loaded.to_dense()
    return dense

load_file

load_file(filepath: Path) -> Tensor
Source code in vllm/multimodal/media/audio.py
def load_file(self, filepath: Path) -> torch.Tensor:
    """Load a serialized tensor from `filepath` and return it dense."""
    # Sparse tensor integrity checks stay enabled for the whole load so
    # that maliciously crafted tensors cannot cause out-of-bounds writes.
    with torch.sparse.check_sparse_tensor_invariants():
        loaded = torch.load(filepath, weights_only=True)
        dense = loaded.to_dense()
    return dense

AudioMediaIO

Bases: MediaIO[tuple[NDArray, float]]

Source code in vllm/multimodal/media/audio.py
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Load audio with `librosa` and encode it with `soundfile`."""

    def __init__(self, **kwargs) -> None:
        """Store the custom per-modality arguments.

        `kwargs` carries the custom arguments given through
        `--media-io-kwargs` for this modality, so that underlying media
        loaders (e.g. custom implementations) can consume them.
        """
        super().__init__()
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode in-memory audio bytes; `sr=None` is passed to librosa."""
        stream = BytesIO(data)
        return librosa.load(stream, sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode base64 `data` and load it via `load_bytes`.

        `media_type` is accepted for interface compatibility but not used.
        """
        raw = base64.b64decode(data)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Load audio from `filepath`; `sr=None` is passed to librosa."""
        return librosa.load(filepath, sr=None)

    def encode_base64(
        self,
        media: tuple[npt.NDArray, int],
        *,
        audio_format: str = "WAV",
    ) -> str:
        """Write `media` in `audio_format` and return it base64-encoded."""
        samples, sample_rate = media

        buffer = BytesIO()
        try:
            soundfile.write(buffer, samples, sample_rate, format=audio_format)
            raw = buffer.getvalue()
        finally:
            buffer.close()

        return base64.b64encode(raw).decode("utf-8")

kwargs instance-attribute

kwargs = kwargs

__init__

__init__(**kwargs) -> None
Source code in vllm/multimodal/media/audio.py
def __init__(self, **kwargs) -> None:
    """Store the custom per-modality arguments.

    `kwargs` carries the custom arguments given through
    `--media-io-kwargs` for this modality, so that underlying media
    loaders (e.g. custom implementations) can consume them.
    """
    super().__init__()
    self.kwargs = kwargs

encode_base64

encode_base64(
    media: tuple[NDArray, int], *, audio_format: str = "WAV"
) -> str
Source code in vllm/multimodal/media/audio.py
def encode_base64(
    self,
    media: tuple[npt.NDArray, int],
    *,
    audio_format: str = "WAV",
) -> str:
    """Write `media` in `audio_format` and return it base64-encoded."""
    samples, sample_rate = media

    buffer = BytesIO()
    try:
        soundfile.write(buffer, samples, sample_rate, format=audio_format)
        raw = buffer.getvalue()
    finally:
        buffer.close()

    return base64.b64encode(raw).decode("utf-8")

load_base64

load_base64(
    media_type: str, data: str
) -> tuple[NDArray, float]
Source code in vllm/multimodal/media/audio.py
def load_base64(
    self,
    media_type: str,
    data: str,
) -> tuple[npt.NDArray, float]:
    """Decode base64 `data` and load it via `load_bytes`.

    `media_type` is accepted for interface compatibility but not used.
    """
    raw = base64.b64decode(data)
    return self.load_bytes(raw)

load_bytes

load_bytes(data: bytes) -> tuple[NDArray, float]
Source code in vllm/multimodal/media/audio.py
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
    """Decode in-memory audio bytes; `sr=None` is passed to librosa."""
    stream = BytesIO(data)
    decoded = librosa.load(stream, sr=None)
    return decoded

load_file

load_file(filepath: Path) -> tuple[NDArray, float]
Source code in vllm/multimodal/media/audio.py
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
    """Load audio from `filepath`; `sr=None` is passed to librosa."""
    decoded = librosa.load(filepath, sr=None)
    return decoded

ImageEmbeddingMediaIO

Bases: MediaIO[Tensor]

Source code in vllm/multimodal/media/image.py
class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load and encode image embeddings represented as `torch.Tensor`s.

    Loading goes through `torch.load(..., weights_only=True)`, and
    `encode_base64` serializes with `torch.save`, so an encoded embedding can
    be read back by `load_base64`.
    """

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize `data` with `torch.load` and return a dense tensor."""
        buffer = BytesIO(data)
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(buffer, weights_only=True)
            return tensor.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Decode strictly validated base64 and load it via `load_bytes`.

        `media_type` is accepted for interface compatibility but not used.
        """
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Load a serialized tensor from `filepath` and return it dense."""
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(filepath, weights_only=True)
            return tensor.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        """Serialize `media` with `torch.save` and return it base64-encoded.

        The previous implementation encoded the raw `media.numpy()` buffer,
        which `load_base64` (a `torch.load` reader) could not decode.
        Serializing with `torch.save` makes encode/load symmetric.
        """
        with BytesIO() as buffer:
            torch.save(media, buffer)
            payload = buffer.getvalue()

        return pybase64.b64encode(payload).decode("utf-8")

__init__

__init__() -> None
Source code in vllm/multimodal/media/image.py
def __init__(self) -> None:
    """Initialize the IO object; only the base-class initializer runs."""
    super().__init__()

encode_base64

encode_base64(media: Tensor) -> str
Source code in vllm/multimodal/media/image.py
def encode_base64(self, media: torch.Tensor) -> str:
    """Serialize `media` with `torch.save` and return it base64-encoded.

    The previous implementation encoded the raw `media.numpy()` buffer,
    which this class's `load_base64` (a `torch.load` reader) could not
    decode. Serializing with `torch.save` makes encode/load symmetric.
    """
    with BytesIO() as buffer:
        torch.save(media, buffer)
        payload = buffer.getvalue()

    return pybase64.b64encode(payload).decode("utf-8")

load_base64

load_base64(media_type: str, data: str) -> Tensor
Source code in vllm/multimodal/media/image.py
def load_base64(self, media_type: str, data: str) -> torch.Tensor:
    """Decode strictly validated base64 and load it via `load_bytes`.

    `media_type` is accepted for interface compatibility but not used.
    """
    raw = pybase64.b64decode(data, validate=True)
    return self.load_bytes(raw)

load_bytes

load_bytes(data: bytes) -> Tensor
Source code in vllm/multimodal/media/image.py
def load_bytes(self, data: bytes) -> torch.Tensor:
    """Deserialize `data` with `torch.load` and return a dense tensor."""
    # Sparse tensor integrity checks stay enabled for the whole load so
    # that maliciously crafted tensors cannot cause out-of-bounds writes.
    with torch.sparse.check_sparse_tensor_invariants():
        loaded = torch.load(BytesIO(data), weights_only=True)
        dense = loaded.to_dense()
    return dense

load_file

load_file(filepath: Path) -> Tensor
Source code in vllm/multimodal/media/image.py
def load_file(self, filepath: Path) -> torch.Tensor:
    """Load a serialized tensor from `filepath` and return it dense."""
    # Sparse tensor integrity checks stay enabled for the whole load so
    # that maliciously crafted tensors cannot cause out-of-bounds writes.
    with torch.sparse.check_sparse_tensor_invariants():
        loaded = torch.load(filepath, weights_only=True)
        dense = loaded.to_dense()
    return dense

ImageMediaIO

Bases: MediaIO[Image]

Source code in vllm/multimodal/media/image.py
class ImageMediaIO(MediaIO[Image.Image]):
    """Load PIL images, convert them to a target mode, and encode them."""

    def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
        """
        Args:
            image_mode: PIL mode that loaded/encoded images are converted to.
            **kwargs: Custom arguments from `--media-io-kwargs` for this
                modality. `rgba_background_color` (list or tuple of three
                ints in [0, 255]) sets the background used when converting
                RGBA images to RGB; it defaults to white.

        Raises:
            ValueError: If `rgba_background_color` has an invalid format.
        """
        super().__init__()

        self.image_mode = image_mode
        # `kwargs` contains custom arguments from
        # --media-io-kwargs for this modality.
        # They can be passed to the underlying
        # media loaders (e.g. custom implementations)
        # for flexible control.
        self.kwargs = kwargs

        # Extract RGBA background color from kwargs if provided
        # Default to white background for backward compatibility
        rgba_bg = kwargs.get("rgba_background_color", (255, 255, 255))
        # Convert list to tuple for consistency
        if isinstance(rgba_bg, list):
            rgba_bg = tuple(rgba_bg)

        # Validate rgba_background_color format
        if not (
            isinstance(rgba_bg, tuple)
            and len(rgba_bg) == 3
            and all(isinstance(c, int) and 0 <= c <= 255 for c in rgba_bg)
        ):
            raise ValueError(
                "rgba_background_color must be a list or tuple of 3 integers "
                "in the range [0, 255]."
            )
        self.rgba_background_color = rgba_bg

    def _convert_image_mode(
        self, image: Image.Image | MediaWithBytes[Image.Image]
    ) -> Image.Image:
        """Convert image mode with custom background color."""
        if isinstance(image, MediaWithBytes):
            image = image.media
        if image.mode == self.image_mode:
            return image
        elif image.mode == "RGBA" and self.image_mode == "RGB":
            return rgba_to_rgb(image, self.rgba_background_color)
        else:
            return convert_image_mode(image, self.image_mode)

    def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
        """Open `data` as an image and pair the converted image with `data`."""
        image = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(image), data)

    def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
        """Decode strictly validated base64 and load it via `load_bytes`.

        `media_type` is accepted for interface compatibility but not used.
        """
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
        """Read `filepath` fully, then load it like `load_bytes`."""
        with open(filepath, "rb") as f:
            data = f.read()
        image = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(image), data)

    def encode_base64(
        self,
        media: Image.Image,
        *,
        image_format: str | None = None,
    ) -> str:
        """Convert `media` to `self.image_mode`, save it, and base64-encode it.

        Args:
            media: Image to encode.
            image_format: Format passed to `Image.save`. When `None`, a
                one-time warning is logged and "JPEG" is used.
        """
        if image_format is None:
            # The hint must name the real keyword (`image_format`); the old
            # text said `format=`, which this method does not accept.
            logger.warning_once(
                "The default format of `ImageMediaIO.encode_base64` will be changed "
                'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
                "To continue using the old default, "
                'pass `image_format="JPEG"` explicitly to silence this warning.'
            )
            image_format = "JPEG"

        image = media

        with BytesIO() as buffer:
            image = self._convert_image_mode(image)
            image.save(buffer, image_format)
            data = buffer.getvalue()

        return pybase64.b64encode(data).decode("utf-8")

image_mode instance-attribute

image_mode = image_mode

kwargs instance-attribute

kwargs = kwargs

rgba_background_color instance-attribute

rgba_background_color = rgba_bg

__init__

__init__(image_mode: str = 'RGB', **kwargs) -> None
Source code in vllm/multimodal/media/image.py
def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
    """Store the target mode, custom kwargs, and RGBA background color.

    `kwargs` holds the custom arguments from `--media-io-kwargs` for this
    modality. `rgba_background_color` (list or tuple of three ints in
    [0, 255]) is read from it and defaults to white.

    Raises:
        ValueError: If `rgba_background_color` has an invalid format.
    """
    super().__init__()

    self.image_mode = image_mode
    self.kwargs = kwargs

    # White is the default for backward compatibility; lists are
    # normalized to tuples before validation.
    background = kwargs.get("rgba_background_color", (255, 255, 255))
    if isinstance(background, list):
        background = tuple(background)

    is_rgb_triple = (
        isinstance(background, tuple)
        and len(background) == 3
        and all(isinstance(c, int) and 0 <= c <= 255 for c in background)
    )
    if not is_rgb_triple:
        raise ValueError(
            "rgba_background_color must be a list or tuple of 3 integers "
            "in the range [0, 255]."
        )

    self.rgba_background_color = background

_convert_image_mode

_convert_image_mode(
    image: Image | MediaWithBytes[Image],
) -> Image

Convert image mode with custom background color.

Source code in vllm/multimodal/media/image.py
def _convert_image_mode(
    self, image: Image.Image | MediaWithBytes[Image.Image]
) -> Image.Image:
    """Convert image mode with custom background color."""
    # Unwrap the PIL image when it arrives paired with its raw bytes.
    pil_image = image.media if isinstance(image, MediaWithBytes) else image
    source_mode, target_mode = pil_image.mode, self.image_mode

    if source_mode == target_mode:
        return pil_image
    if source_mode == "RGBA" and target_mode == "RGB":
        # Only this conversion uses the configured background color.
        return rgba_to_rgb(pil_image, self.rgba_background_color)
    return convert_image_mode(pil_image, target_mode)

encode_base64

encode_base64(
    media: Image, *, image_format: str | None = None
) -> str
Source code in vllm/multimodal/media/image.py
def encode_base64(
    self,
    media: Image.Image,
    *,
    image_format: str | None = None,
) -> str:
    """Convert `media` to `self.image_mode`, save it, and base64-encode it.

    Args:
        media: Image to encode.
        image_format: Format passed to `Image.save`. When `None`, a
            one-time warning is logged and "JPEG" is used.
    """
    if image_format is None:
        # The hint must name the real keyword (`image_format`); the old
        # text said `format=`, which this method does not accept.
        logger.warning_once(
            "The default format of `ImageMediaIO.encode_base64` will be changed "
            'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
            "To continue using the old default, "
            'pass `image_format="JPEG"` explicitly to silence this warning.'
        )
        image_format = "JPEG"

    image = media

    with BytesIO() as buffer:
        image = self._convert_image_mode(image)
        image.save(buffer, image_format)
        data = buffer.getvalue()

    return pybase64.b64encode(data).decode("utf-8")

load_base64

load_base64(
    media_type: str, data: str
) -> MediaWithBytes[Image]
Source code in vllm/multimodal/media/image.py
def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
    """Decode strictly validated base64 and load it via `load_bytes`.

    `media_type` is accepted for interface compatibility but not used.
    """
    raw = pybase64.b64decode(data, validate=True)
    return self.load_bytes(raw)

load_bytes

load_bytes(data: bytes) -> MediaWithBytes[Image]
Source code in vllm/multimodal/media/image.py
def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
    """Open `data` as an image and pair the converted image with `data`."""
    opened = Image.open(BytesIO(data))
    converted = self._convert_image_mode(opened)
    return MediaWithBytes(converted, data)

load_file

load_file(filepath: Path) -> MediaWithBytes[Image]
Source code in vllm/multimodal/media/image.py
def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
    """Read `filepath` fully and pair the converted image with its bytes."""
    with open(filepath, "rb") as stream:
        raw = stream.read()

    opened = Image.open(BytesIO(raw))
    converted = self._convert_image_mode(opened)
    return MediaWithBytes(converted, raw)

MediaConnector

Source code in vllm/multimodal/media/connector.py
@MEDIA_CONNECTOR_REGISTRY.register("http")
class MediaConnector:
    """Load multi-modal media from HTTP(S), `data:` and `file:` URLs.

    The class is registered in `MEDIA_CONNECTOR_REGISTRY` under the key
    "http". URL loading is dispatched on the URL scheme; the `fetch_*`
    helpers build the matching `MediaIO` object and apply the per-modality
    fetch timeout from `envs`.
    """

    def __init__(
        self,
        media_io_kwargs: dict[str, dict[str, Any]] | None = None,
        connection: HTTPConnection = global_http_connection,
        *,
        allowed_local_media_path: str = "",
        allowed_media_domains: list[str] | None = None,
    ) -> None:
        """
        Args:
            media_io_kwargs: Additional args passed to process media
                             inputs, keyed by modalities. For example,
                             to set num_frames for video, set
                             `--media-io-kwargs '{"video":{"num_frames":40}}'`
            connection: HTTP connection client to download media contents.
            allowed_local_media_path: A local directory to load media files from.
            allowed_media_domains: If set, only HTTP(S) media URLs whose
                                   hostname is one of these domains can be
                                   used for multi-modal inputs.

        Raises:
            ValueError: If `allowed_local_media_path` is set but does not
                exist or is not a directory.
        """
        super().__init__()

        self.media_io_kwargs: dict[str, dict[str, Any]] = (
            media_io_kwargs if media_io_kwargs else {}
        )
        self.connection = connection

        if allowed_local_media_path:
            allowed_local_media_path_ = Path(allowed_local_media_path)

            if not allowed_local_media_path_.exists():
                raise ValueError(
                    "Invalid `--allowed-local-media-path`: The path "
                    f"{allowed_local_media_path_} does not exist."
                )
            if not allowed_local_media_path_.is_dir():
                raise ValueError(
                    "Invalid `--allowed-local-media-path`: The path "
                    f"{allowed_local_media_path_} must be a directory."
                )
        else:
            # `None` disables loading from `file:` URLs entirely.
            allowed_local_media_path_ = None

        self.allowed_local_media_path = allowed_local_media_path_
        if allowed_media_domains is None:
            allowed_media_domains = []
        self.allowed_media_domains = allowed_media_domains

    def _load_data_url(
        self,
        url_spec: Url,
        media_io: MediaIO[_M],
    ) -> _M:  # type: ignore[type-var]
        """Load media from a parsed `data:` URL via `media_io.load_base64`.

        Raises:
            ValueError: If the path lacks the `,` or `;` separators (raised
                by tuple unpacking).
            NotImplementedError: If the data type is not "base64".
        """
        url_spec_path = url_spec.path or ""
        data_spec, data = url_spec_path.split(",", 1)
        media_type, data_type = data_spec.split(";", 1)
        # media_type starts with a leading "/" (e.g., "/video/jpeg")
        media_type = media_type.lstrip("/")

        if data_type != "base64":
            msg = "Only base64 data URLs are supported for now."
            raise NotImplementedError(msg)

        return media_io.load_base64(media_type, data)

    def _load_file_url(
        self,
        url_spec: Url,
        media_io: MediaIO[_M],
    ) -> _M:  # type: ignore[type-var]
        """Load media from a parsed `file:` URL via `media_io.load_file`.

        Raises:
            RuntimeError: If no allowed local media path is configured.
            ValueError: If the file is not located under the allowed path.
        """
        allowed_local_media_path = self.allowed_local_media_path
        if allowed_local_media_path is None:
            raise RuntimeError(
                "Cannot load local files without `--allowed-local-media-path`."
            )

        url_spec_path = url_spec.path or ""
        url_spec_netloc = url_spec.netloc or ""
        filepath = Path(url2pathname(url_spec_netloc + url_spec_path))

        # Resolve BOTH sides before comparing. The configured root is stored
        # as given, so a relative or symlinked root would otherwise never
        # appear among the parents of the resolved (absolute) file path and
        # every file would be rejected.
        allowed_root = allowed_local_media_path.resolve()
        if allowed_root not in filepath.resolve().parents:
            raise ValueError(
                f"The file path {filepath} must be a subpath "
                f"of `--allowed-local-media-path {allowed_local_media_path}`."
            )

        return media_io.load_file(filepath)

    def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
        """Raise `ValueError` if the URL hostname is not an allowed domain.

        An empty `allowed_media_domains` list disables the check.
        """
        if (
            self.allowed_media_domains
            and url_spec.hostname not in self.allowed_media_domains
        ):
            raise ValueError(
                f"The URL must be from one of the allowed domains: "
                f"{self.allowed_media_domains}. Input URL domain: "
                f"{url_spec.hostname}"
            )

    def load_from_url(
        self,
        url: str,
        media_io: MediaIO[_M],
        *,
        fetch_timeout: int | None = None,
    ) -> _M:  # type: ignore[type-var]
        """Load media from `url`, dispatching on the URL scheme.

        Schemes starting with "http" are downloaded through
        `self.connection` (after the allowed-domain check); "data" and
        "file" URLs are handled locally.

        Raises:
            ValueError: If the scheme is none of the supported ones.
        """
        url_spec = parse_url(url)

        if url_spec.scheme and url_spec.scheme.startswith("http"):
            self._assert_url_in_allowed_media_domains(url_spec)

            connection = self.connection
            data = connection.get_bytes(
                url,
                timeout=fetch_timeout,
                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
            )

            return media_io.load_bytes(data)

        if url_spec.scheme == "data":
            return self._load_data_url(url_spec, media_io)

        if url_spec.scheme == "file":
            return self._load_file_url(url_spec, media_io)

        msg = "The URL must be either a HTTP, data or file URL."
        raise ValueError(msg)

    async def load_from_url_async(
        self,
        url: str,
        media_io: MediaIO[_M],
        *,
        fetch_timeout: int | None = None,
    ) -> _M:
        """Asynchronous variant of `load_from_url`.

        The download is awaited on the connection; decoding and the local
        `data`/`file` loaders run in `global_thread_pool` via the running
        event loop's `run_in_executor`.

        Raises:
            ValueError: If the scheme is none of the supported ones.
        """
        url_spec = parse_url(url)
        loop = asyncio.get_running_loop()

        if url_spec.scheme and url_spec.scheme.startswith("http"):
            self._assert_url_in_allowed_media_domains(url_spec)

            connection = self.connection
            data = await connection.async_get_bytes(
                url,
                timeout=fetch_timeout,
                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
            )
            future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data)
            return await future

        if url_spec.scheme == "data":
            future = loop.run_in_executor(
                global_thread_pool, self._load_data_url, url_spec, media_io
            )
            return await future

        if url_spec.scheme == "file":
            future = loop.run_in_executor(
                global_thread_pool, self._load_file_url, url_spec, media_io
            )
            return await future
        msg = "The URL must be either a HTTP, data or file URL."
        raise ValueError(msg)

    def fetch_audio(
        self,
        audio_url: str,
    ) -> tuple[np.ndarray, int | float]:
        """
        Load audio from a URL.
        """
        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))

        return self.load_from_url(
            audio_url,
            audio_io,
            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
        )

    async def fetch_audio_async(
        self,
        audio_url: str,
    ) -> tuple[np.ndarray, int | float]:
        """
        Asynchronously fetch audio from a URL.
        """
        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))

        return await self.load_from_url_async(
            audio_url,
            audio_io,
            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
        )

    def fetch_image(
        self,
        image_url: str,
        *,
        image_mode: str = "RGB",
    ) -> Image.Image:
        """
        Load a PIL image from an HTTP(S), base64 data, or file URL.

        By default, the image is converted into RGB format.

        Raises:
            ValueError: If the URL is unsupported or the image cannot be
                identified by PIL.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )

        try:
            return self.load_from_url(
                image_url,
                image_io,
                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
            )
        except UnidentifiedImageError as e:
            # convert to ValueError to be properly caught upstream
            raise ValueError(str(e)) from e

    async def fetch_image_async(
        self,
        image_url: str,
        *,
        image_mode: str = "RGB",
    ) -> Image.Image:
        """
        Asynchronously load a PIL image from an HTTP(S), base64 data, or
        file URL.

        By default, the image is converted into RGB format.

        Raises:
            ValueError: If the URL is unsupported or the image cannot be
                identified by PIL.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )

        try:
            return await self.load_from_url_async(
                image_url,
                image_io,
                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
            )
        except UnidentifiedImageError as e:
            # convert to ValueError to be properly caught upstream
            raise ValueError(str(e)) from e

    def fetch_video(
        self,
        video_url: str,
        *,
        image_mode: str = "RGB",
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video from an HTTP(S), base64 data, or file URL.

        `image_mode` configures the `ImageMediaIO` handed to `VideoMediaIO`.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )
        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))

        return self.load_from_url(
            video_url,
            video_io,
            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
        )

    async def fetch_video_async(
        self,
        video_url: str,
        *,
        image_mode: str = "RGB",
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Asynchronously load video from an HTTP(S), base64 data, or file URL.

        `image_mode` configures the `ImageMediaIO` handed to `VideoMediaIO`.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )
        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))

        return await self.load_from_url_async(
            video_url,
            video_io,
            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
        )

    def fetch_image_embedding(
        self,
        data: str,
    ) -> torch.Tensor:
        """
        Load an image embedding from a base64-encoded string.

        `data` is the base64 payload itself; no URL is fetched.
        """
        image_embedding_io = ImageEmbeddingMediaIO()

        return image_embedding_io.load_base64("", data)

    def fetch_audio_embedding(
        self,
        data: str,
    ) -> torch.Tensor:
        """
        Load an audio embedding from a base64-encoded string.

        `data` is the base64 payload itself; no URL is fetched.
        """
        audio_embedding_io = AudioEmbeddingMediaIO()

        return audio_embedding_io.load_base64("", data)

allowed_local_media_path instance-attribute

allowed_local_media_path = allowed_local_media_path_

allowed_media_domains instance-attribute

allowed_media_domains = allowed_media_domains

connection instance-attribute

connection = connection

media_io_kwargs instance-attribute

media_io_kwargs: dict[str, dict[str, Any]] = (
    media_io_kwargs if media_io_kwargs else {}
)

__init__

__init__(
    media_io_kwargs: dict[str, dict[str, Any]]
    | None = None,
    connection: HTTPConnection = global_http_connection,
    *,
    allowed_local_media_path: str = "",
    allowed_media_domains: list[str] | None = None,
) -> None

Parameters:

Name Type Description Default
media_io_kwargs dict[str, dict[str, Any]] | None

Additional args passed to process media inputs, keyed by modalities. For example, to set num_frames for video, set --media-io-kwargs '{"video":{"num_frames":40}}'

None
connection HTTPConnection

HTTP connection client to download media contents.

global_http_connection
allowed_local_media_path str

A local directory to load media files from.

''
allowed_media_domains list[str] | None

If set, only HTTP(S) media URLs whose hostname is one of these domains can be used for multi-modal inputs.

None
Source code in vllm/multimodal/media/connector.py
def __init__(
    self,
    media_io_kwargs: dict[str, dict[str, Any]] | None = None,
    connection: HTTPConnection = global_http_connection,
    *,
    allowed_local_media_path: str = "",
    allowed_media_domains: list[str] | None = None,
) -> None:
    """
    Args:
        media_io_kwargs: Additional args passed to process media
                         inputs, keyed by modalities. For example,
                         to set num_frames for video, set
                         `--media-io-kwargs '{"video":{"num_frames":40}}'`
        connection: HTTP connection client to download media contents.
        allowed_local_media_path: A local directory to load media files from.
        allowed_media_domains: If set, only media URLs whose hostname is
                               one of these domains can be used for
                               multi-modal inputs.

    Raises:
        ValueError: If `allowed_local_media_path` is set but does not
            exist or is not a directory.
    """
    super().__init__()

    self.media_io_kwargs: dict[str, dict[str, Any]] = media_io_kwargs or {}
    self.connection = connection

    # Stays `None` when no local directory is configured.
    local_root: Path | None = None
    if allowed_local_media_path:
        local_root = Path(allowed_local_media_path)

        if not local_root.exists():
            raise ValueError(
                "Invalid `--allowed-local-media-path`: The path "
                f"{local_root} does not exist."
            )
        if not local_root.is_dir():
            raise ValueError(
                "Invalid `--allowed-local-media-path`: The path "
                f"{local_root} must be a directory."
            )

    self.allowed_local_media_path = local_root
    self.allowed_media_domains = (
        [] if allowed_media_domains is None else allowed_media_domains
    )

_assert_url_in_allowed_media_domains

_assert_url_in_allowed_media_domains(url_spec: Url) -> None
Source code in vllm/multimodal/media/connector.py
def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
    """Raise `ValueError` if the URL hostname is not an allowed domain.

    An empty `allowed_media_domains` list disables the check.
    """
    if not self.allowed_media_domains:
        return
    if url_spec.hostname in self.allowed_media_domains:
        return

    raise ValueError(
        f"The URL must be from one of the allowed domains: "
        f"{self.allowed_media_domains}. Input URL domain: "
        f"{url_spec.hostname}"
    )

_load_data_url

_load_data_url(url_spec: Url, media_io: MediaIO[_M]) -> _M
Source code in vllm/multimodal/media/connector.py
def _load_data_url(
    self,
    url_spec: Url,
    media_io: MediaIO[_M],
) -> _M:  # type: ignore[type-var]
    """Load media from a parsed `data:` URL via `media_io.load_base64`.

    Raises:
        ValueError: If the path lacks the `,` or `;` separators (raised
            by tuple unpacking).
        NotImplementedError: If the data type is not "base64".
    """
    path = url_spec.path or ""
    header, payload = path.split(",", 1)
    mime, encoding = header.split(";", 1)
    # The parsed media type carries a leading "/" (e.g., "/video/jpeg").
    mime = mime.lstrip("/")

    if encoding == "base64":
        return media_io.load_base64(mime, payload)

    msg = "Only base64 data URLs are supported for now."
    raise NotImplementedError(msg)

_load_file_url

_load_file_url(url_spec: Url, media_io: MediaIO[_M]) -> _M
Source code in vllm/multimodal/media/connector.py
def _load_file_url(
    self,
    url_spec: Url,
    media_io: MediaIO[_M],
) -> _M:  # type: ignore[type-var]
    """Load media from a parsed `file:` URL via `media_io.load_file`.

    Raises:
        RuntimeError: If no allowed local media path is configured.
        ValueError: If the file is not located under the allowed path.
    """
    allowed_local_media_path = self.allowed_local_media_path
    if allowed_local_media_path is None:
        raise RuntimeError(
            "Cannot load local files without `--allowed-local-media-path`."
        )

    url_spec_path = url_spec.path or ""
    url_spec_netloc = url_spec.netloc or ""
    filepath = Path(url2pathname(url_spec_netloc + url_spec_path))

    # Resolve BOTH sides before comparing. The configured root is stored as
    # given, so a relative or symlinked root would otherwise never appear
    # among the parents of the resolved (absolute) file path and every file
    # would be rejected.
    allowed_root = allowed_local_media_path.resolve()
    if allowed_root not in filepath.resolve().parents:
        raise ValueError(
            f"The file path {filepath} must be a subpath "
            f"of `--allowed-local-media-path {allowed_local_media_path}`."
        )

    return media_io.load_file(filepath)

fetch_audio

fetch_audio(audio_url: str) -> tuple[ndarray, int | float]

Load audio from a URL.

Source code in vllm/multimodal/media/connector.py
def fetch_audio(
    self,
    audio_url: str,
) -> tuple[np.ndarray, int | float]:
    """
    Load audio from a URL.

    The audio IO object receives the "audio" entry of `media_io_kwargs`.
    """
    io_options = self.media_io_kwargs.get("audio", {})
    audio_io = AudioMediaIO(**io_options)
    timeout = envs.VLLM_AUDIO_FETCH_TIMEOUT

    return self.load_from_url(audio_url, audio_io, fetch_timeout=timeout)

fetch_audio_async async

fetch_audio_async(
    audio_url: str,
) -> tuple[ndarray, int | float]

Asynchronously fetch audio from a URL.

Source code in vllm/multimodal/media/connector.py
async def fetch_audio_async(
    self,
    audio_url: str,
) -> tuple[np.ndarray, int | float]:
    """
    Asynchronously fetch audio from a URL.

    The audio IO object receives the "audio" entry of `media_io_kwargs`.
    """
    io_options = self.media_io_kwargs.get("audio", {})
    audio_io = AudioMediaIO(**io_options)
    timeout = envs.VLLM_AUDIO_FETCH_TIMEOUT

    return await self.load_from_url_async(
        audio_url, audio_io, fetch_timeout=timeout
    )

fetch_audio_embedding

fetch_audio_embedding(data: str) -> Tensor

Load an audio embedding from a base64-encoded string (`data`); no URL is fetched.

Source code in vllm/multimodal/media/connector.py
def fetch_audio_embedding(
    self,
    data: str,
) -> torch.Tensor:
    """
    Load an audio embedding from base64-encoded data.

    `data` is handed to `AudioEmbeddingMediaIO.load_base64` together with
    an empty media type; no URL is fetched here.
    """
    return AudioEmbeddingMediaIO().load_base64("", data)

fetch_image

fetch_image(
    image_url: str, *, image_mode: str = "RGB"
) -> Image

Load a PIL image from an HTTP or base64 data URL.

By default, the image is converted into RGB format.

Source code in vllm/multimodal/media/connector.py
def fetch_image(
    self,
    image_url: str,
    *,
    image_mode: str = "RGB",
) -> Image.Image:
    """
    Load a PIL image from an HTTP or base64 data URL.

    By default, the image is converted into RGB format.

    Raises:
        ValueError: If loading raises `UnidentifiedImageError`; the
            original exception is chained as the cause.
    """
    image_kwargs = self.media_io_kwargs.get("image", {})
    loader = ImageMediaIO(image_mode=image_mode, **image_kwargs)

    try:
        image = self.load_from_url(
            image_url, loader, fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT
        )
    except UnidentifiedImageError as exc:
        # Re-raised as ValueError so that it is properly caught upstream.
        raise ValueError(str(exc)) from exc

    return image

fetch_image_async async

fetch_image_async(
    image_url: str, *, image_mode: str = "RGB"
) -> Image

Asynchronously load a PIL image from an HTTP or base64 data URL.

By default, the image is converted into RGB format.

Source code in vllm/multimodal/media/connector.py
async def fetch_image_async(
    self,
    image_url: str,
    *,
    image_mode: str = "RGB",
) -> Image.Image:
    """
    Asynchronously load a PIL image from an HTTP or base64 data URL.

    By default, the image is converted into RGB format.

    Raises:
        ValueError: If loading raises `UnidentifiedImageError`; the
            original exception is chained as the cause.
    """
    image_kwargs = self.media_io_kwargs.get("image", {})
    loader = ImageMediaIO(image_mode=image_mode, **image_kwargs)

    try:
        image = await self.load_from_url_async(
            image_url, loader, fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT
        )
    except UnidentifiedImageError as exc:
        # Re-raised as ValueError so that it is properly caught upstream.
        raise ValueError(str(exc)) from exc

    return image

fetch_image_embedding

fetch_image_embedding(data: str) -> Tensor

Load an image embedding from base64-encoded data.

Source code in vllm/multimodal/media/connector.py
def fetch_image_embedding(
    self,
    data: str,
) -> torch.Tensor:
    """
    Load an image embedding from base64-encoded data.

    `data` is handed to `ImageEmbeddingMediaIO.load_base64` together with
    an empty media type; no URL is fetched here.
    """
    return ImageEmbeddingMediaIO().load_base64("", data)

fetch_video

fetch_video(
    video_url: str, *, image_mode: str = "RGB"
) -> tuple[NDArray, dict[str, Any]]

Load video from an HTTP or base64 data URL.

Source code in vllm/multimodal/media/connector.py
def fetch_video(
    self,
    video_url: str,
    *,
    image_mode: str = "RGB",
) -> tuple[npt.NDArray, dict[str, Any]]:
    """
    Load video from an HTTP or base64 data URL.

    An `ImageMediaIO` built with `image_mode` and the `"image"` entry of
    `self.media_io_kwargs` is given to `VideoMediaIO`, which additionally
    receives the `"video"` entry of `self.media_io_kwargs`.
    """
    image_kwargs = self.media_io_kwargs.get("image", {})
    frame_io = ImageMediaIO(image_mode=image_mode, **image_kwargs)

    video_kwargs = self.media_io_kwargs.get("video", {})
    loader = VideoMediaIO(frame_io, **video_kwargs)

    return self.load_from_url(
        video_url, loader, fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT
    )

fetch_video_async async

fetch_video_async(
    video_url: str, *, image_mode: str = "RGB"
) -> tuple[NDArray, dict[str, Any]]

Asynchronously load video from an HTTP or base64 data URL.

By default, the image is converted into RGB format.

Source code in vllm/multimodal/media/connector.py
async def fetch_video_async(
    self,
    video_url: str,
    *,
    image_mode: str = "RGB",
) -> tuple[npt.NDArray, dict[str, Any]]:
    """
    Asynchronously load video from an HTTP or base64 data URL.

    By default, the image is converted into RGB format.

    An `ImageMediaIO` built with `image_mode` and the `"image"` entry of
    `self.media_io_kwargs` is given to `VideoMediaIO`, which additionally
    receives the `"video"` entry of `self.media_io_kwargs`.
    """
    image_kwargs = self.media_io_kwargs.get("image", {})
    frame_io = ImageMediaIO(image_mode=image_mode, **image_kwargs)

    video_kwargs = self.media_io_kwargs.get("video", {})
    loader = VideoMediaIO(frame_io, **video_kwargs)

    return await self.load_from_url_async(
        video_url, loader, fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT
    )

load_from_url

load_from_url(
    url: str,
    media_io: MediaIO[_M],
    *,
    fetch_timeout: int | None = None,
) -> _M
Source code in vllm/multimodal/media/connector.py
def load_from_url(
    self,
    url: str,
    media_io: MediaIO[_M],
    *,
    fetch_timeout: int | None = None,
) -> _M:  # type: ignore[type-var]
    """
    Load media from `url`, dispatching on the URL scheme.

    - Schemes starting with `http`: the URL is checked with
      `_assert_url_in_allowed_media_domains`, its bytes are fetched through
      `self.connection`, and the result is decoded by
      `media_io.load_bytes`.
    - `data`: handled by `_load_data_url`.
    - `file`: handled by `_load_file_url`.

    `fetch_timeout` is only used for the HTTP branch.

    Raises:
        ValueError: If the scheme is none of the above.
    """
    url_spec = parse_url(url)
    scheme = url_spec.scheme

    if scheme and scheme.startswith("http"):
        # The domain check must pass before any request is issued.
        self._assert_url_in_allowed_media_domains(url_spec)

        payload = self.connection.get_bytes(
            url,
            timeout=fetch_timeout,
            allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
        )
        return media_io.load_bytes(payload)

    if scheme == "data":
        return self._load_data_url(url_spec, media_io)

    if scheme == "file":
        return self._load_file_url(url_spec, media_io)

    msg = "The URL must be either a HTTP, data or file URL."
    raise ValueError(msg)

load_from_url_async async

load_from_url_async(
    url: str,
    media_io: MediaIO[_M],
    *,
    fetch_timeout: int | None = None,
) -> _M
Source code in vllm/multimodal/media/connector.py
async def load_from_url_async(
    self,
    url: str,
    media_io: MediaIO[_M],
    *,
    fetch_timeout: int | None = None,
) -> _M:
    """
    Asynchronously load media from `url`, dispatching on the URL scheme.

    - Schemes starting with `http`: the URL is checked with
      `_assert_url_in_allowed_media_domains`, its bytes are awaited from
      `self.connection`, and `media_io.load_bytes` runs in
      `global_thread_pool`.
    - `data`: `_load_data_url` runs in `global_thread_pool`.
    - `file`: `_load_file_url` runs in `global_thread_pool`.

    `fetch_timeout` is only used for the HTTP branch.

    Raises:
        ValueError: If the scheme is none of the above.
    """
    url_spec = parse_url(url)
    loop = asyncio.get_running_loop()
    scheme = url_spec.scheme

    if scheme and scheme.startswith("http"):
        # The domain check must pass before any request is issued.
        self._assert_url_in_allowed_media_domains(url_spec)

        payload = await self.connection.async_get_bytes(
            url,
            timeout=fetch_timeout,
            allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
        )
        # Decoding is delegated to the executor rather than run inline.
        return await loop.run_in_executor(
            global_thread_pool, media_io.load_bytes, payload
        )

    if scheme == "data":
        return await loop.run_in_executor(
            global_thread_pool, self._load_data_url, url_spec, media_io
        )

    if scheme == "file":
        return await loop.run_in_executor(
            global_thread_pool, self._load_file_url, url_spec, media_io
        )

    msg = "The URL must be either a HTTP, data or file URL."
    raise ValueError(msg)

MediaIO

Bases: ABC, Generic[_T]

Source code in vllm/multimodal/media/base.py
class MediaIO(ABC, Generic[_T]):
    """
    Abstract interface for loading a media object of type `_T`.

    Subclasses must implement all three loaders, one per input form:
    raw bytes, a base64 string tagged with a media type, and a file path.
    """

    @abstractmethod
    def load_bytes(self, data: bytes) -> _T:
        """Load media from raw bytes."""
        raise NotImplementedError

    @abstractmethod
    def load_base64(self, media_type: str, data: str) -> _T:
        """
        Load media of the given media type from a base64-encoded string.

        List of media types:
        https://www.iana.org/assignments/media-types/media-types.xhtml
        """
        raise NotImplementedError

    @abstractmethod
    def load_file(self, filepath: Path) -> _T:
        """Load media from the file at `filepath`."""
        raise NotImplementedError

load_base64 abstractmethod

load_base64(media_type: str, data: str) -> _T

List of media types: https://www.iana.org/assignments/media-types/media-types.xhtml

Source code in vllm/multimodal/media/base.py
@abstractmethod
def load_base64(self, media_type: str, data: str) -> _T:
    """
    Load media of the given media type from a base64-encoded string.

    List of media types:
    https://www.iana.org/assignments/media-types/media-types.xhtml
    """
    raise NotImplementedError

load_bytes abstractmethod

load_bytes(data: bytes) -> _T
Source code in vllm/multimodal/media/base.py
@abstractmethod
def load_bytes(self, data: bytes) -> _T:
    """Load media from raw bytes; must be implemented by subclasses."""
    raise NotImplementedError

load_file abstractmethod

load_file(filepath: Path) -> _T
Source code in vllm/multimodal/media/base.py
@abstractmethod
def load_file(self, filepath: Path) -> _T:
    """Load media from the file at `filepath`; must be implemented by subclasses."""
    raise NotImplementedError

MediaWithBytes dataclass

Bases: Generic[_T]

Wrapper that couples a media object with its original encoded bytes.

This ensures the raw bytes and media object remain synchronized, preventing cache corruption from in-place modifications.

The wrapper delegates attribute access to the underlying media object, making it behave transparently like the wrapped type (e.g., PIL.Image).

NOTE: Currently, this wrapper is used only for the image modality.

Source code in vllm/multimodal/media/base.py
@dataclass
class MediaWithBytes(Generic[_T]):
    """
    Wrapper that couples a media object with its original encoded bytes.

    This ensures the raw bytes and media object remain synchronized,
    preventing cache corruption from in-place modifications.

    The wrapper delegates attribute access to the underlying media object,
    making it behave transparently like the wrapped type (e.g., PIL.Image).

    NOTE: Currently, this wrapper is used only for the image modality.
    """

    # The wrapped media object; unknown attributes are forwarded to it.
    media: _T
    # The encoded bytes kept alongside `media`.
    original_bytes: bytes

    def __array__(self, *args, **kwargs) -> np.ndarray:
        """Allow np.array(obj) to return np.array(obj.media)."""
        return np.array(self.media, *args, **kwargs)

    def __getstate__(self):
        """Return a shallow copy of the instance `__dict__` as the state."""
        return self.__dict__.copy()

    def __setstate__(self, state: dict[str, Any]):
        """Restore the instance by merging `state` into its `__dict__`."""
        # Being defined on the class, this method is found by normal lookup
        # and never falls through to `__getattr__`.
        self.__dict__.update(state)

    def __getattr__(self, name: str):
        """
        Delegate attribute access to the underlying media object.

        Python only calls this hook when normal lookup fails, so a request
        for `media` itself means the field has not been set yet.

        Raises:
            AttributeError: If `media` is not set on the instance, or if
                the wrapped media object has no attribute `name`.
        """
        if name == "media":
            # Reading `self.media` below would re-enter this method without
            # bound when the field is missing (e.g. on an instance created
            # via `__new__`); fail with a regular AttributeError instead.
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self.media, name)

media instance-attribute

media: _T

original_bytes instance-attribute

original_bytes: bytes

__array__

__array__(*args, **kwargs) -> ndarray

Allow np.array(obj) to return np.array(obj.media).

Source code in vllm/multimodal/media/base.py
def __array__(self, *args, **kwargs) -> np.ndarray:
    """
    Allow np.array(obj) to return np.array(obj.media).

    Positional and keyword arguments are forwarded unchanged to `np.array`
    together with the wrapped `self.media`.
    """
    return np.array(self.media, *args, **kwargs)

__getattr__

__getattr__(name: str)

Delegate attribute access to the underlying media object.

Source code in vllm/multimodal/media/base.py
def __getattr__(self, name: str):
    """
    Delegate attribute access to the underlying media object.

    Python only calls this hook when normal lookup fails, so a request for
    `media` itself means the field has not been set yet.

    Raises:
        AttributeError: If `media` is not set on the instance, or if the
            wrapped media object has no attribute `name`.
    """
    if name == "media":
        # Reading `self.media` below would re-enter this method without
        # bound when the field is missing (e.g. on an instance created via
        # `__new__`); fail with a regular AttributeError instead.
        raise AttributeError(
            f"{type(self).__name__!r} object has no attribute {name!r}"
        )
    return getattr(self.media, name)

__getstate__

__getstate__()
Source code in vllm/multimodal/media/base.py
def __getstate__(self):
    """Return a shallow copy of the instance `__dict__` as the state."""
    return self.__dict__.copy()

__init__

__init__(media: _T, original_bytes: bytes) -> None

__setstate__

__setstate__(state: dict[str, Any])
Source code in vllm/multimodal/media/base.py
def __setstate__(self, state: dict[str, Any]):
    """Restore the instance by merging `state` into its `__dict__`."""
    self.__dict__.update(state)

VideoMediaIO

Bases: MediaIO[tuple[NDArray, dict[str, Any]]]

Source code in vllm/multimodal/media/video.py
class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
    """
    `MediaIO` implementation for video inputs.

    Every loader returns a tuple of an array and a `dict[str, Any]`. Raw
    bytes are handed to a loader obtained from `VIDEO_LOADER_REGISTRY`,
    while base64 payloads of type `video/jpeg` are decoded frame by frame
    with the supplied `ImageMediaIO`.
    """

    def __init__(
        self,
        image_io: ImageMediaIO,
        num_frames: int = 32,
        **kwargs,
    ) -> None:
        """
        Store the frame decoder and pick the video loader backend.

        Args:
            image_io: Used to decode and encode individual frames.
            num_frames: Forwarded to the video loader as `num_frames`.
            **kwargs: Custom arguments from `--media-io-kwargs` for this
                modality. `video_backend` is consumed here; everything
                else is forwarded to the video loader on `load_bytes`.
        """
        super().__init__()

        self.image_io = image_io
        self.num_frames = num_frames

        # A truthy `video_backend` entry takes precedence over the global
        # VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
        #   --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
        # It is popped so that it is not forwarded to the loader.
        backend_override = kwargs.pop("video_backend", None)
        video_loader_backend = backend_override or envs.VLLM_VIDEO_LOADER_BACKEND

        self.kwargs = kwargs
        self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend)

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode `data` with the selected video loader."""
        loader = self.video_loader
        return loader.load_bytes(data, num_frames=self.num_frames, **self.kwargs)

    def load_base64(
        self, media_type: str, data: str
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load a video from base64 text.

        For media type `video/jpeg` (case-insensitive), `data` is split on
        commas and each piece is decoded as an `image/jpeg` frame; the
        frames are stacked and returned with an empty dict. Any other media
        type is base64-decoded as a whole and passed to `load_bytes`.
        """
        if media_type.lower() != "video/jpeg":
            return self.load_bytes(base64.b64decode(data))

        decode_frame = self.image_io.load_base64
        frames = [
            np.asarray(decode_frame("image/jpeg", frame_data))
            for frame_data in data.split(",")
        ]
        return np.stack(frames), {}

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
        """Read the whole file in binary mode and pass it to `load_bytes`."""
        return self.load_bytes(filepath.read_bytes())

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        """
        Encode every frame of `media` and join the results with commas.

        Raises:
            NotImplementedError: If `video_format` is not `"JPEG"`.
        """
        if video_format != "JPEG":
            msg = "Only JPEG format is supported for now."
            raise NotImplementedError(msg)

        encode_frame = self.image_io.encode_base64
        return ",".join(
            encode_frame(Image.fromarray(frame), image_format=video_format)
            for frame in media
        )

image_io instance-attribute

image_io = image_io

kwargs instance-attribute

kwargs = kwargs

num_frames instance-attribute

num_frames = num_frames

video_loader instance-attribute

video_loader = load(video_loader_backend)

__init__

__init__(
    image_io: ImageMediaIO, num_frames: int = 32, **kwargs
) -> None
Source code in vllm/multimodal/media/video.py
def __init__(
    self,
    image_io: ImageMediaIO,
    num_frames: int = 32,
    **kwargs,
) -> None:
    """
    Store the frame decoder and pick the video loader backend.

    Args:
        image_io: Used to decode and encode individual frames.
        num_frames: Forwarded to the video loader as `num_frames`.
        **kwargs: Custom arguments from `--media-io-kwargs` for this
            modality. `video_backend` is consumed here; everything else is
            kept in `self.kwargs` for the underlying media loader.
    """
    super().__init__()

    self.image_io = image_io
    self.num_frames = num_frames

    # A truthy `video_backend` entry takes precedence over the global
    # VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
    #   --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
    # It is popped so that it does not end up in `self.kwargs`.
    backend_override = kwargs.pop("video_backend", None)
    video_loader_backend = backend_override or envs.VLLM_VIDEO_LOADER_BACKEND

    self.kwargs = kwargs
    self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend)

encode_base64

encode_base64(
    media: NDArray, *, video_format: str = "JPEG"
) -> str
Source code in vllm/multimodal/media/video.py
def encode_base64(
    self,
    media: npt.NDArray,
    *,
    video_format: str = "JPEG",
) -> str:
    """
    Encode every frame of `media` and join the results with commas.

    Each frame is converted with `Image.fromarray` and encoded through
    `self.image_io.encode_base64` using `video_format` as the image format.

    Raises:
        NotImplementedError: If `video_format` is not `"JPEG"`.
    """
    if video_format != "JPEG":
        msg = "Only JPEG format is supported for now."
        raise NotImplementedError(msg)

    encode_frame = self.image_io.encode_base64
    return ",".join(
        encode_frame(Image.fromarray(frame), image_format=video_format)
        for frame in media
    )

load_base64

load_base64(
    media_type: str, data: str
) -> tuple[NDArray, dict[str, Any]]
Source code in vllm/multimodal/media/video.py
def load_base64(
    self, media_type: str, data: str
) -> tuple[npt.NDArray, dict[str, Any]]:
    """
    Load a video from base64 text.

    For media type `video/jpeg` (case-insensitive), `data` is split on
    commas and each piece is decoded as an `image/jpeg` frame through
    `self.image_io`; the frames are stacked and returned with an empty
    dict. Any other media type is base64-decoded as a whole and passed to
    `load_bytes`.
    """
    if media_type.lower() != "video/jpeg":
        return self.load_bytes(base64.b64decode(data))

    decode_frame = self.image_io.load_base64
    frames = [
        np.asarray(decode_frame("image/jpeg", frame_data))
        for frame_data in data.split(",")
    ]
    return np.stack(frames), {}

load_bytes

load_bytes(data: bytes) -> tuple[NDArray, dict[str, Any]]
Source code in vllm/multimodal/media/video.py
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
    """
    Decode `data` with `self.video_loader`.

    `self.num_frames` is passed as `num_frames`, followed by the extra
    keyword arguments stored in `self.kwargs`.
    """
    loader = self.video_loader
    return loader.load_bytes(data, num_frames=self.num_frames, **self.kwargs)

load_file

load_file(filepath: Path) -> tuple[NDArray, dict[str, Any]]
Source code in vllm/multimodal/media/video.py
def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
    """Read the whole file in binary mode and pass its bytes to `load_bytes`."""
    return self.load_bytes(filepath.read_bytes())