vllm.transformers_utils.configs.funaudiochat

FunAudioChatAudioEncoderConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/funaudiochat.py
class FunAudioChatAudioEncoderConfig(PretrainedConfig):
    model_type = "funaudiochat_audio_encoder"

    def __init__(
        self,
        _attn_implementation: str | None = None,
        num_mel_bins: int = 128,
        encoder_layers: int = 32,
        encoder_attention_heads: int = 20,
        encoder_ffn_dim: int = 5120,
        d_model: int = 1280,
        dropout: float = 0.0,
        attention_dropout: float = 0.0,
        activation_function: str = "gelu",
        activation_dropout: float = 0.0,
        scale_embedding: bool = False,
        initializer_range: float = 0.02,
        max_source_positions: int = 1500,
        n_window: int = 100,
        output_dim: int = 3584,
        bos_token_id: int | None = None,
        codebook_size: int | None = None,
        continuous_features_mode: str = "replace",
        crq_transformer_config: dict | None = None,
        eos_token_id: int | None = None,
        group_size: int = 5,
        enable_audio_invert_tower: bool = True,
        pad_token_id: int | None = None,
        **kwargs,
    ) -> None:
        attn_impl = kwargs.pop("_attn_implementation", None) or _attn_implementation
        super().__init__(**kwargs)
        # Match HF default for attention implementation selection.
        self._attn_implementation = attn_impl or "sdpa"

        self.num_mel_bins = num_mel_bins
        self.d_model = d_model
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_function = activation_function
        self.activation_dropout = activation_dropout
        self.num_hidden_layers = encoder_layers
        self.initializer_range = initializer_range
        self.scale_embedding = scale_embedding
        self.max_source_positions = max_source_positions
        self.n_window = n_window
        self.output_dim = output_dim

        self.bos_token_id = bos_token_id
        self.codebook_size = codebook_size
        self.continuous_features_mode = continuous_features_mode
        self.crq_transformer_config = crq_transformer_config
        self.eos_token_id = eos_token_id
        self.group_size = group_size
        self.enable_audio_invert_tower = enable_audio_invert_tower
        self.pad_token_id = pad_token_id
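
For orientation, a minimal usage sketch (not from the source) constructing the encoder config with its defaults. Note that num_hidden_layers is kept in sync with encoder_layers, and the defaults (128 mel bins, 32 layers, d_model 1280) mirror a Whisper-large-style encoder:

from vllm.transformers_utils.configs.funaudiochat import (
    FunAudioChatAudioEncoderConfig,
)

# Default construction; every value checked below comes from the
# signature defaults above.
cfg = FunAudioChatAudioEncoderConfig()
assert cfg.model_type == "funaudiochat_audio_encoder"
assert cfg.num_hidden_layers == cfg.encoder_layers == 32
assert cfg.d_model == 1280 and cfg.output_dim == 3584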

_attn_implementation instance-attribute

_attn_implementation = attn_impl or 'sdpa'

activation_dropout instance-attribute

activation_dropout = activation_dropout

activation_function instance-attribute

activation_function = activation_function

attention_dropout instance-attribute

attention_dropout = attention_dropout

bos_token_id instance-attribute

bos_token_id = bos_token_id

codebook_size instance-attribute

codebook_size = codebook_size

continuous_features_mode instance-attribute

continuous_features_mode = continuous_features_mode

crq_transformer_config instance-attribute

crq_transformer_config = crq_transformer_config

d_model instance-attribute

d_model = d_model

dropout instance-attribute

dropout = dropout

enable_audio_invert_tower instance-attribute

enable_audio_invert_tower = enable_audio_invert_tower

encoder_attention_heads instance-attribute

encoder_attention_heads = encoder_attention_heads

encoder_ffn_dim instance-attribute

encoder_ffn_dim = encoder_ffn_dim

encoder_layers instance-attribute

encoder_layers = encoder_layers

eos_token_id instance-attribute

eos_token_id = eos_token_id

group_size instance-attribute

group_size = group_size

initializer_range instance-attribute

initializer_range = initializer_range

max_source_positions instance-attribute

max_source_positions = max_source_positions

model_type class-attribute instance-attribute

model_type = 'funaudiochat_audio_encoder'

n_window instance-attribute

n_window = n_window

num_hidden_layers instance-attribute

num_hidden_layers = encoder_layers

num_mel_bins instance-attribute

num_mel_bins = num_mel_bins

output_dim instance-attribute

output_dim = output_dim

pad_token_id instance-attribute

pad_token_id = pad_token_id

scale_embedding instance-attribute

scale_embedding = scale_embedding

__init__

__init__(
    _attn_implementation: str | None = None,
    num_mel_bins: int = 128,
    encoder_layers: int = 32,
    encoder_attention_heads: int = 20,
    encoder_ffn_dim: int = 5120,
    d_model: int = 1280,
    dropout: float = 0.0,
    attention_dropout: float = 0.0,
    activation_function: str = "gelu",
    activation_dropout: float = 0.0,
    scale_embedding: bool = False,
    initializer_range: float = 0.02,
    max_source_positions: int = 1500,
    n_window: int = 100,
    output_dim: int = 3584,
    bos_token_id: int | None = None,
    codebook_size: int | None = None,
    continuous_features_mode: str = "replace",
    crq_transformer_config: dict | None = None,
    eos_token_id: int | None = None,
    group_size: int = 5,
    enable_audio_invert_tower: bool = True,
    pad_token_id: int | None = None,
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/funaudiochat.py
def __init__(
    self,
    _attn_implementation: str | None = None,
    num_mel_bins: int = 128,
    encoder_layers: int = 32,
    encoder_attention_heads: int = 20,
    encoder_ffn_dim: int = 5120,
    d_model: int = 1280,
    dropout: float = 0.0,
    attention_dropout: float = 0.0,
    activation_function: str = "gelu",
    activation_dropout: float = 0.0,
    scale_embedding: bool = False,
    initializer_range: float = 0.02,
    max_source_positions: int = 1500,
    n_window: int = 100,
    output_dim: int = 3584,
    bos_token_id: int | None = None,
    codebook_size: int | None = None,
    continuous_features_mode: str = "replace",
    crq_transformer_config: dict | None = None,
    eos_token_id: int | None = None,
    group_size: int = 5,
    enable_audio_invert_tower: bool = True,
    pad_token_id: int | None = None,
    **kwargs,
) -> None:
    attn_impl = kwargs.pop("_attn_implementation", None) or _attn_implementation
    super().__init__(**kwargs)
    # Match HF default for attention implementation selection.
    self._attn_implementation = attn_impl or "sdpa"

    self.num_mel_bins = num_mel_bins
    self.d_model = d_model
    self.encoder_layers = encoder_layers
    self.encoder_attention_heads = encoder_attention_heads
    self.encoder_ffn_dim = encoder_ffn_dim
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.activation_function = activation_function
    self.activation_dropout = activation_dropout
    self.num_hidden_layers = encoder_layers
    self.initializer_range = initializer_range
    self.scale_embedding = scale_embedding
    self.max_source_positions = max_source_positions
    self.n_window = n_window
    self.output_dim = output_dim

    self.bos_token_id = bos_token_id
    self.codebook_size = codebook_size
    self.continuous_features_mode = continuous_features_mode
    self.crq_transformer_config = crq_transformer_config
    self.eos_token_id = eos_token_id
    self.group_size = group_size
    self.enable_audio_invert_tower = enable_audio_invert_tower
    self.pad_token_id = pad_token_id
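
A short sketch of the attention-implementation fallback in the constructor above: an explicitly supplied value is kept, and a missing one falls back to "sdpa" (the HF default). Illustrative only, assuming a transformers version where _attn_implementation is settable on PretrainedConfig:

from vllm.transformers_utils.configs.funaudiochat import (
    FunAudioChatAudioEncoderConfig,
)

cfg = FunAudioChatAudioEncoderConfig()
assert cfg._attn_implementation == "sdpa"    # fallback default

cfg = FunAudioChatAudioEncoderConfig(_attn_implementation="eager")
assert cfg._attn_implementation == "eager"   # explicit value respected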

FunAudioChatConfig

Bases: PretrainedConfig

Source code in vllm/transformers_utils/configs/funaudiochat.py
class FunAudioChatConfig(PretrainedConfig):
    model_type = "funaudiochat"
    attribute_map = {
        "audio_token_id": "audio_token_index",
    }

    def __init__(
        self,
        audio_config: PretrainedConfig | dict | None = None,
        text_config: PretrainedConfig | dict | None = None,
        audio_token_index: int = 151646,
        ignore_index: int = -100,
        hidden_size: int | None = None,
        **kwargs,
    ) -> None:
        self.audio_token_index = audio_token_index
        self.ignore_index = ignore_index

        if isinstance(audio_config, dict):
            audio_config.setdefault(
                "model_type", FunAudioChatAudioEncoderConfig.model_type
            )
            audio_config = FunAudioChatAudioEncoderConfig(**audio_config)
        elif audio_config is None:
            audio_config = FunAudioChatAudioEncoderConfig()
        self.audio_config = audio_config

        if isinstance(text_config, dict):
            # Default to qwen2 for backwards compatibility; FunAudioChat uses
            # qwen3 in practice for recent checkpoints.
            text_config.setdefault("model_type", "qwen2")
            import transformers

            text_cls = transformers.CONFIG_MAPPING[text_config["model_type"]]
            text_config = text_cls(**text_config)
        elif text_config is None:
            import transformers

            text_config = transformers.CONFIG_MAPPING["qwen2"]()
        self.text_config = text_config

        self.hidden_size = (
            int(self.text_config.hidden_size)
            if hidden_size is None
            else int(hidden_size)
        )

        super().__init__(**kwargs)
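
A minimal sketch of the coercion logic above: dict sub-configs are promoted to config objects, text_config resolves through transformers.CONFIG_MAPPING, and hidden_size is derived from the text config when not given. Assumes a transformers version that registers the qwen3 model type:

import transformers
from vllm.transformers_utils.configs.funaudiochat import FunAudioChatConfig

cfg = FunAudioChatConfig(
    audio_config={"encoder_layers": 24},   # dict -> FunAudioChatAudioEncoderConfig
    text_config={"model_type": "qwen3"},   # resolved via CONFIG_MAPPING
)
assert cfg.audio_config.encoder_layers == 24
assert isinstance(cfg.text_config, transformers.Qwen3Config)
assert cfg.hidden_size == cfg.text_config.hidden_size  # derived default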

attribute_map class-attribute instance-attribute

attribute_map = {'audio_token_id': 'audio_token_index'}

audio_config instance-attribute

audio_config = audio_config

audio_token_index instance-attribute

audio_token_index = audio_token_index

hidden_size instance-attribute

hidden_size = (
    int(self.text_config.hidden_size)
    if hidden_size is None
    else int(hidden_size)
)

ignore_index instance-attribute

ignore_index = ignore_index

model_type class-attribute instance-attribute

model_type = 'funaudiochat'

text_config instance-attribute

text_config = text_config

__init__

__init__(
    audio_config: PretrainedConfig | dict | None = None,
    text_config: PretrainedConfig | dict | None = None,
    audio_token_index: int = 151646,
    ignore_index: int = -100,
    hidden_size: int | None = None,
    **kwargs,
) -> None
Source code in vllm/transformers_utils/configs/funaudiochat.py
def __init__(
    self,
    audio_config: PretrainedConfig | dict | None = None,
    text_config: PretrainedConfig | dict | None = None,
    audio_token_index: int = 151646,
    ignore_index: int = -100,
    hidden_size: int | None = None,
    **kwargs,
) -> None:
    self.audio_token_index = audio_token_index
    self.ignore_index = ignore_index

    if isinstance(audio_config, dict):
        audio_config.setdefault(
            "model_type", FunAudioChatAudioEncoderConfig.model_type
        )
        audio_config = FunAudioChatAudioEncoderConfig(**audio_config)
    elif audio_config is None:
        audio_config = FunAudioChatAudioEncoderConfig()
    self.audio_config = audio_config

    if isinstance(text_config, dict):
        # Default to qwen2 for backwards compatibility; FunAudioChat uses
        # qwen3 in practice for recent checkpoints.
        text_config.setdefault("model_type", "qwen2")
        import transformers

        text_cls = transformers.CONFIG_MAPPING[text_config["model_type"]]
        text_config = text_cls(**text_config)
    elif text_config is None:
        import transformers

        text_config = transformers.CONFIG_MAPPING["qwen2"]()
    self.text_config = text_config

    self.hidden_size = (
        int(self.text_config.hidden_size)
        if hidden_size is None
        else int(hidden_size)
    )

    super().__init__(**kwargs)
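
Finally, a note on attribute_map: PretrainedConfig treats mapped names as aliases, so audio_token_id reads and writes through audio_token_index. A small illustrative check:

from vllm.transformers_utils.configs.funaudiochat import FunAudioChatConfig

cfg = FunAudioChatConfig()
# attribute_map = {"audio_token_id": "audio_token_index"} makes these aliases.
assert cfg.audio_token_id == cfg.audio_token_index == 151646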