vllm.model_executor.models.interns1_pro

Inference-only InternS1Pro model compatible with HuggingFace weights.
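
A minimal text-only serving sketch, assuming a placeholder checkpoint path (image and video inputs would go through vLLM's multimodal prompt format):

from vllm import LLM, SamplingParams

# Hypothetical checkpoint path; replace with a real InternS1Pro checkpoint.
llm = LLM(model="path/to/interns1-pro", trust_remote_code=True)
outputs = llm.generate(
    ["Summarize the InternS1Pro architecture."],
    SamplingParams(max_tokens=32),
)
print(outputs[0].outputs[0].text)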

logger module-attribute

logger = init_logger(__name__)

InternS1ProForConditionalGeneration

Bases: Qwen3VLForConditionalGeneration, Qwen3VLMoeMixtureOfExperts

Source code in vllm/model_executor/models/interns1_pro.py
@MULTIMODAL_REGISTRY.register_processor(
    Qwen3VLMultiModalProcessor,
    info=InternS1ProProcessingInfo,
    dummy_inputs=Qwen3VLDummyInputsBuilder,
)
class InternS1ProForConditionalGeneration(
    Qwen3VLForConditionalGeneration, Qwen3VLMoeMixtureOfExperts
):
    is_3d_moe_weight: bool = True
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
    }

    # To ensure correct weight loading and mapping.
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.visual.": "visual.",
            "lm_head.": "language_model.lm_head.",
            "model.language_model.": "language_model.model.",
        },
        orig_to_new_suffix={
            # Handle FOPE rotary embeddings
            ".rotary_emb.sin_coef": ".layers.0.self_attn.rotary_emb.sin_coef",
            ".rotary_emb.cos_coef": ".layers.0.self_attn.rotary_emb.cos_coef",
        },
    )

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config: PretrainedConfig = vllm_config.model_config.hf_config
        multimodal_config = vllm_config.model_config.multimodal_config

        self.config = config
        self.multimodal_config = multimodal_config
        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
        self.video_pruning_rate = multimodal_config.video_pruning_rate
        self.is_multimodal_pruning_enabled = (
            multimodal_config.is_multimodal_pruning_enabled()
        )

        if not multimodal_config.get_limit_per_prompt(
            "image"
        ) and not multimodal_config.get_limit_per_prompt("video"):
            self.visual = None
        else:
            self.visual = Qwen3_VisionTransformer(
                config.vision_config,
                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
                multimodal_config=multimodal_config,
                prefix=maybe_prefix(prefix, "visual"),
            )

        self.language_model = InternS1ProMoeLLMForCausalLM(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model")
        )
        # Whether to include the gate_up_proj mapping is determined by
        # the language model.
        self.packed_modules_mapping = (
            self.packed_modules_mapping | self.language_model.packed_modules_mapping
        )

        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )

        self.use_deepstack = hasattr(config.vision_config, "deepstack_visual_indexes")
        self.deepstack_num_level = (
            len(config.vision_config.deepstack_visual_indexes)
            if self.use_deepstack
            else 0
        )
        self.visual_dim = config.vision_config.out_hidden_size
        self.multiscale_dim = self.visual_dim * self.deepstack_num_level

        # Set MoE hyperparameters
        self.set_moe_parameters()

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """load weights"""
        skip_prefixes = ["model.time_series."]
        if self.visual is None:
            skip_prefixes.append("visual.")
        loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

config instance-attribute

config = config

deepstack_num_level instance-attribute

deepstack_num_level = (
    len(deepstack_visual_indexes) if use_deepstack else 0
)

hf_to_vllm_mapper class-attribute instance-attribute

hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={
        "model.visual.": "visual.",
        "lm_head.": "language_model.lm_head.",
        "model.language_model.": "language_model.model.",
    },
    orig_to_new_suffix={
        ".rotary_emb.sin_coef": ".layers.0.self_attn.rotary_emb.sin_coef",
        ".rotary_emb.cos_coef": ".layers.0.self_attn.rotary_emb.cos_coef",
    },
)
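
The prefix and suffix maps above can be read as plain string substitutions on checkpoint parameter names. A simplified sketch of that renaming (an illustration of the rules, not the WeightsMapper implementation; the FOPE example name is assumed):

HF_TO_VLLM_PREFIXES = {
    "model.visual.": "visual.",
    "lm_head.": "language_model.lm_head.",
    "model.language_model.": "language_model.model.",
}
HF_TO_VLLM_SUFFIXES = {
    ".rotary_emb.sin_coef": ".layers.0.self_attn.rotary_emb.sin_coef",
    ".rotary_emb.cos_coef": ".layers.0.self_attn.rotary_emb.cos_coef",
}

def rename(name: str) -> str:
    for old, new in HF_TO_VLLM_PREFIXES.items():
        if name.startswith(old):
            name = new + name[len(old):]
            break
    for old, new in HF_TO_VLLM_SUFFIXES.items():
        if name.endswith(old):
            name = name[:-len(old)] + new
            break
    return name

# "model.visual.patch_embed.proj.weight" -> "visual.patch_embed.proj.weight"
# A hypothetical "model.language_model.rotary_emb.sin_coef"
#   -> "language_model.model.layers.0.self_attn.rotary_emb.sin_coef"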

is_3d_moe_weight class-attribute instance-attribute

is_3d_moe_weight: bool = True

is_multimodal_pruning_enabled instance-attribute

is_multimodal_pruning_enabled = (
    is_multimodal_pruning_enabled()
)

language_model instance-attribute

language_model = InternS1ProMoeLLMForCausalLM(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "language_model"),
)

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors
)

multimodal_config instance-attribute

multimodal_config = multimodal_config

multiscale_dim instance-attribute

multiscale_dim = visual_dim * deepstack_num_level

packed_modules_mapping class-attribute instance-attribute

packed_modules_mapping = (
    packed_modules_mapping | packed_modules_mapping
)
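
The `|` here is a dict union: the class-level qkv mapping is merged with whatever packed mapping the language model provides (typically a gate_up_proj entry). A small sketch of the merge, with an assumed language-model contribution:

class_level = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
# Assumed example of what the language model might contribute.
from_language_model = {"gate_up_proj": ["gate_proj", "up_proj"]}

merged = class_level | from_language_model
# {"qkv_proj": ["q_proj", "k_proj", "v_proj"],
#  "gate_up_proj": ["gate_proj", "up_proj"]}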

use_data_parallel instance-attribute

use_data_parallel = mm_encoder_tp_mode == 'data'

use_deepstack instance-attribute

use_deepstack = hasattr(
    vision_config, "deepstack_visual_indexes"
)

video_pruning_rate instance-attribute

video_pruning_rate = video_pruning_rate

visual instance-attribute

visual = None

visual_dim instance-attribute

visual_dim = out_hidden_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/interns1_pro.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    config: PretrainedConfig = vllm_config.model_config.hf_config
    multimodal_config = vllm_config.model_config.multimodal_config

    self.config = config
    self.multimodal_config = multimodal_config
    self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
    self.video_pruning_rate = multimodal_config.video_pruning_rate
    self.is_multimodal_pruning_enabled = (
        multimodal_config.is_multimodal_pruning_enabled()
    )

    if not multimodal_config.get_limit_per_prompt(
        "image"
    ) and not multimodal_config.get_limit_per_prompt("video"):
        self.visual = None
    else:
        self.visual = Qwen3_VisionTransformer(
            config.vision_config,
            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
            multimodal_config=multimodal_config,
            prefix=maybe_prefix(prefix, "visual"),
        )

    self.language_model = InternS1ProMoeLLMForCausalLM(
        vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model")
    )
    # Whether to include the gate_up_proj mapping is determined by
    # the language model.
    self.packed_modules_mapping = (
        self.packed_modules_mapping | self.language_model.packed_modules_mapping
    )

    self.make_empty_intermediate_tensors = (
        self.language_model.make_empty_intermediate_tensors
    )

    self.use_deepstack = hasattr(config.vision_config, "deepstack_visual_indexes")
    self.deepstack_num_level = (
        len(config.vision_config.deepstack_visual_indexes)
        if self.use_deepstack
        else 0
    )
    self.visual_dim = config.vision_config.out_hidden_size
    self.multiscale_dim = self.visual_dim * self.deepstack_num_level

    # Set MoE hyperparameters
    self.set_moe_parameters()

load_weights

load_weights(weights: Iterable[tuple[str, Tensor]])

load weights

Source code in vllm/model_executor/models/interns1_pro.py
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
    """load weights"""
    skip_prefixes = ["model.time_series."]
    if self.visual is None:
        skip_prefixes.append("visual.")
    loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
    return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
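
The skip_prefixes list drops whole weight groups before loading: time-series weights are always skipped, and visual weights are skipped when the vision encoder is disabled. A sketch of that filtering intent (not the AutoWeightsLoader internals):

def filter_skipped(weights, skip_prefixes):
    for name, tensor in weights:
        # e.g. "model.time_series.encoder.weight" is dropped here
        if any(name.startswith(p) for p in skip_prefixes):
            continue
        yield name, tensor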

InternS1ProMoeAttention

Bases: Module

Source code in vllm/model_executor/models/interns1_pro.py
class InternS1ProMoeAttention(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_parameters: dict[str, Any],
        max_position_embeddings: int = 32768,
        head_dim: int | None = None,
        rms_norm_eps: float = 1e-06,
        qkv_bias: bool = False,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
        dual_chunk_attention_config: dict[str, Any] | None = None,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= tp_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.head_dim = head_dim or (hidden_size // self.total_num_heads)
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.max_position_embeddings = max_position_embeddings
        self.dual_chunk_attention_config = dual_chunk_attention_config

        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=qkv_bias,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )

        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.o_proj",
        )

        rope_parameters["num_key_value_heads"] = self.num_kv_heads
        self.rotary_emb = get_rope(
            self.head_dim,
            max_position=max_position_embeddings,
            rope_parameters=rope_parameters,
            dual_chunk_attention_config=dual_chunk_attention_config,
        )

        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            **{
                "layer_idx": extract_layer_index(prefix),
                "dual_chunk_attention_config": dual_chunk_attention_config,
            }
            if dual_chunk_attention_config
            else {},
        )

        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        # Add qk-norm
        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
        q_by_head = self.q_norm(q_by_head)
        q = q_by_head.view(q.shape)

        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
        k_by_head = self.k_norm(k_by_head)
        k = k_by_head.view(k.shape)
        q, k = self.rotary_emb.forward_native(positions, q, k)
        attn_output = self.attn(q, k, v)
        output, _ = self.o_proj(attn_output)
        return output
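
The head bookkeeping in the constructor is plain integer arithmetic over the tensor-parallel size. A worked example with assumed sizes (64 query heads, 4 KV heads, TP size 8, head_dim 128):

num_heads, num_kv_heads, tp_size, head_dim = 64, 4, 8, 128

per_rank_heads = num_heads // tp_size                 # 8 query heads per rank
# 4 KV heads < 8 ranks, so KV heads are replicated (tp_size % num_kv_heads == 0).
per_rank_kv_heads = max(1, num_kv_heads // tp_size)   # 1 KV head per rank
q_size = per_rank_heads * head_dim                    # 1024
kv_size = per_rank_kv_heads * head_dim                # 128
scaling = head_dim ** -0.5                            # ~0.088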

attn instance-attribute

attn = Attention(
    num_heads,
    head_dim,
    scaling,
    num_kv_heads=num_kv_heads,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.attn",
    **(
        {
            "layer_idx": extract_layer_index(prefix),
            "dual_chunk_attention_config": dual_chunk_attention_config,
        }
        if dual_chunk_attention_config
        else {}
    ),
)

dual_chunk_attention_config instance-attribute

dual_chunk_attention_config = dual_chunk_attention_config

head_dim instance-attribute

head_dim = head_dim or hidden_size // total_num_heads

hidden_size instance-attribute

hidden_size = hidden_size

k_norm instance-attribute

k_norm = RMSNorm(head_dim, eps=rms_norm_eps)

kv_size instance-attribute

kv_size = num_kv_heads * head_dim

max_position_embeddings instance-attribute

max_position_embeddings = max_position_embeddings

num_heads instance-attribute

num_heads = total_num_heads // tp_size

num_kv_heads instance-attribute

num_kv_heads = max(1, total_num_kv_heads // tp_size)

o_proj instance-attribute

o_proj = RowParallelLinear(
    total_num_heads * head_dim,
    hidden_size,
    bias=False,
    quant_config=quant_config,
    prefix=f"{prefix}.o_proj",
)

q_norm instance-attribute

q_norm = RMSNorm(head_dim, eps=rms_norm_eps)

q_size instance-attribute

q_size = num_heads * head_dim

qkv_proj instance-attribute

qkv_proj = QKVParallelLinear(
    hidden_size,
    head_dim,
    total_num_heads,
    total_num_kv_heads,
    bias=qkv_bias,
    quant_config=quant_config,
    prefix=f"{prefix}.qkv_proj",
)

rotary_emb instance-attribute

rotary_emb = get_rope(
    head_dim,
    max_position=max_position_embeddings,
    rope_parameters=rope_parameters,
    dual_chunk_attention_config=dual_chunk_attention_config,
)

scaling instance-attribute

scaling = head_dim ** -0.5

total_num_heads instance-attribute

total_num_heads = num_heads

total_num_kv_heads instance-attribute

total_num_kv_heads = num_kv_heads

__init__

__init__(
    hidden_size: int,
    num_heads: int,
    num_kv_heads: int,
    rope_parameters: dict[str, Any],
    max_position_embeddings: int = 32768,
    head_dim: int | None = None,
    rms_norm_eps: float = 1e-06,
    qkv_bias: bool = False,
    cache_config: CacheConfig | None = None,
    quant_config: QuantizationConfig | None = None,
    prefix: str = "",
    dual_chunk_attention_config: dict[str, Any]
    | None = None,
) -> None
Source code in vllm/model_executor/models/interns1_pro.py
def __init__(
    self,
    hidden_size: int,
    num_heads: int,
    num_kv_heads: int,
    rope_parameters: dict[str, Any],
    max_position_embeddings: int = 32768,
    head_dim: int | None = None,
    rms_norm_eps: float = 1e-06,
    qkv_bias: bool = False,
    cache_config: CacheConfig | None = None,
    quant_config: QuantizationConfig | None = None,
    prefix: str = "",
    dual_chunk_attention_config: dict[str, Any] | None = None,
) -> None:
    super().__init__()
    self.hidden_size = hidden_size
    tp_size = get_tensor_model_parallel_world_size()
    self.total_num_heads = num_heads
    assert self.total_num_heads % tp_size == 0
    self.num_heads = self.total_num_heads // tp_size
    self.total_num_kv_heads = num_kv_heads
    if self.total_num_kv_heads >= tp_size:
        # Number of KV heads is greater than TP size, so we partition
        # the KV heads across multiple tensor parallel GPUs.
        assert self.total_num_kv_heads % tp_size == 0
    else:
        # Number of KV heads is less than TP size, so we replicate
        # the KV heads across multiple tensor parallel GPUs.
        assert tp_size % self.total_num_kv_heads == 0
    self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
    self.head_dim = head_dim or (hidden_size // self.total_num_heads)
    self.q_size = self.num_heads * self.head_dim
    self.kv_size = self.num_kv_heads * self.head_dim
    self.scaling = self.head_dim**-0.5
    self.max_position_embeddings = max_position_embeddings
    self.dual_chunk_attention_config = dual_chunk_attention_config

    self.qkv_proj = QKVParallelLinear(
        hidden_size,
        self.head_dim,
        self.total_num_heads,
        self.total_num_kv_heads,
        bias=qkv_bias,
        quant_config=quant_config,
        prefix=f"{prefix}.qkv_proj",
    )

    self.o_proj = RowParallelLinear(
        self.total_num_heads * self.head_dim,
        hidden_size,
        bias=False,
        quant_config=quant_config,
        prefix=f"{prefix}.o_proj",
    )

    rope_parameters["num_key_value_heads"] = self.num_kv_heads
    self.rotary_emb = get_rope(
        self.head_dim,
        max_position=max_position_embeddings,
        rope_parameters=rope_parameters,
        dual_chunk_attention_config=dual_chunk_attention_config,
    )

    self.attn = Attention(
        self.num_heads,
        self.head_dim,
        self.scaling,
        num_kv_heads=self.num_kv_heads,
        cache_config=cache_config,
        quant_config=quant_config,
        prefix=f"{prefix}.attn",
        **{
            "layer_idx": extract_layer_index(prefix),
            "dual_chunk_attention_config": dual_chunk_attention_config,
        }
        if dual_chunk_attention_config
        else {},
    )

    self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
    self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)

forward

forward(positions: Tensor, hidden_states: Tensor) -> Tensor
Source code in vllm/model_executor/models/interns1_pro.py
def forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
) -> torch.Tensor:
    qkv, _ = self.qkv_proj(hidden_states)
    q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
    # Add qk-norm
    q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim)
    q_by_head = self.q_norm(q_by_head)
    q = q_by_head.view(q.shape)

    k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim)
    k_by_head = self.k_norm(k_by_head)
    k = k_by_head.view(k.shape)
    q, k = self.rotary_emb.forward_native(positions, q, k)
    attn_output = self.attn(q, k, v)
    output, _ = self.o_proj(attn_output)
    return output
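
The qk-norm step normalizes each head independently by viewing the packed projection as (..., num_heads, head_dim) and flattening it back afterwards. A standalone torch sketch of the same reshape, with a plain RMS normalization standing in for vLLM's RMSNorm module:

import torch

num_tokens, num_heads, head_dim = 3, 8, 128
q = torch.randn(num_tokens, num_heads * head_dim)

q_by_head = q.view(num_tokens, num_heads, head_dim)
# Per-head RMS normalization over head_dim (learned weight omitted for brevity).
rms = q_by_head.pow(2).mean(dim=-1, keepdim=True).add(1e-6).rsqrt()
q_normed = (q_by_head * rms).view_as(q)
assert q_normed.shape == q.shape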

InternS1ProMoeDecoderLayer

Bases: Module

Source code in vllm/model_executor/models/interns1_pro.py
class InternS1ProMoeDecoderLayer(nn.Module):
    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
        super().__init__()

        config = vllm_config.model_config.hf_text_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.hidden_size = config.hidden_size
        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
        dual_chunk_attention_config = getattr(
            config, "dual_chunk_attention_config", None
        )

        # update rope related parameters
        rope_scaling = config.rope_scaling
        fope_keys = {"fope_init_factor", "fope_sep_head", "num_inv_freq"}
        use_fope = any(rope_scaling.get(key) is not None for key in fope_keys)
        fope_init_factor = rope_scaling.get("fope_init_factor", None)
        fope_sep_head = rope_scaling.get("fope_sep_head", None)
        num_inv_freq = rope_scaling.get("num_inv_freq", None)

        config.rope_parameters["use_fope"] = use_fope
        config.rope_parameters["fope_init_factor"] = fope_init_factor
        config.rope_parameters["fope_sep_head"] = fope_sep_head
        config.rope_parameters["num_inv_freq"] = num_inv_freq

        assert use_fope, "should use FOPE for InternS1Pro model"
        self.self_attn = InternS1ProMoeAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=config.num_key_value_heads,
            rope_parameters=config.rope_parameters,
            max_position_embeddings=max_position_embeddings,
            rms_norm_eps=config.rms_norm_eps,
            qkv_bias=getattr(config, "attention_bias", False),
            head_dim=getattr(config, "head_dim", None),
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
            dual_chunk_attention_config=dual_chunk_attention_config,
        )

        # Layers listed in `mlp_only_layers` in the config always use a dense MLP.
        layer_idx = extract_layer_index(prefix)
        mlp_only_layers = (
            [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
        )
        if (layer_idx not in mlp_only_layers) and (
            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
        ):
            self.mlp = InternS1ProMoeSparseMoeBlock(
                vllm_config=vllm_config, prefix=f"{prefix}.mlp"
            )
        else:
            self.mlp = InternS1ProMoeMLP(
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                hidden_act=config.hidden_act,
                quant_config=quant_config,
                prefix=f"{prefix}.mlp",
            )
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: torch.Tensor | None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Self Attention
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(hidden_states, residual)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
        )

        # Fully Connected
        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
        hidden_states = self.mlp(hidden_states)
        return hidden_states, residual

hidden_size instance-attribute

hidden_size = hidden_size

input_layernorm instance-attribute

input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)

mlp instance-attribute

mlp = InternS1ProMoeSparseMoeBlock(
    vllm_config=vllm_config, prefix=f"{prefix}.mlp"
)

post_attention_layernorm instance-attribute

post_attention_layernorm = RMSNorm(
    hidden_size, eps=rms_norm_eps
)

self_attn instance-attribute

self_attn = InternS1ProMoeAttention(
    hidden_size=hidden_size,
    num_heads=num_attention_heads,
    num_kv_heads=num_key_value_heads,
    rope_parameters=rope_parameters,
    max_position_embeddings=max_position_embeddings,
    rms_norm_eps=rms_norm_eps,
    qkv_bias=getattr(config, "attention_bias", False),
    head_dim=getattr(config, "head_dim", None),
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.self_attn",
    dual_chunk_attention_config=dual_chunk_attention_config,
)

__init__

__init__(vllm_config: VllmConfig, prefix: str = '') -> None
Source code in vllm/model_executor/models/interns1_pro.py
def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
    super().__init__()

    config = vllm_config.model_config.hf_text_config
    cache_config = vllm_config.cache_config
    quant_config = vllm_config.quant_config

    self.hidden_size = config.hidden_size
    max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
    dual_chunk_attention_config = getattr(
        config, "dual_chunk_attention_config", None
    )

    # update rope related parameters
    rope_scaling = config.rope_scaling
    fope_keys = {"fope_init_factor", "fope_sep_head", "num_inv_freq"}
    use_fope = any(rope_scaling.get(key) is not None for key in fope_keys)
    fope_init_factor = rope_scaling.get("fope_init_factor", None)
    fope_sep_head = rope_scaling.get("fope_sep_head", None)
    num_inv_freq = rope_scaling.get("num_inv_freq", None)

    config.rope_parameters["use_fope"] = use_fope
    config.rope_parameters["fope_init_factor"] = fope_init_factor
    config.rope_parameters["fope_sep_head"] = fope_sep_head
    config.rope_parameters["num_inv_freq"] = num_inv_freq

    assert use_fope, "should use FOPE for InternS1Pro model"
    self.self_attn = InternS1ProMoeAttention(
        hidden_size=self.hidden_size,
        num_heads=config.num_attention_heads,
        num_kv_heads=config.num_key_value_heads,
        rope_parameters=config.rope_parameters,
        max_position_embeddings=max_position_embeddings,
        rms_norm_eps=config.rms_norm_eps,
        qkv_bias=getattr(config, "attention_bias", False),
        head_dim=getattr(config, "head_dim", None),
        cache_config=cache_config,
        quant_config=quant_config,
        prefix=f"{prefix}.self_attn",
        dual_chunk_attention_config=dual_chunk_attention_config,
    )

    # Layers listed in `mlp_only_layers` in the config always use a dense MLP.
    layer_idx = extract_layer_index(prefix)
    mlp_only_layers = (
        [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
    )
    if (layer_idx not in mlp_only_layers) and (
        config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
    ):
        self.mlp = InternS1ProMoeSparseMoeBlock(
            vllm_config=vllm_config, prefix=f"{prefix}.mlp"
        )
    else:
        self.mlp = InternS1ProMoeMLP(
            hidden_size=config.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            quant_config=quant_config,
            prefix=f"{prefix}.mlp",
        )
    self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    self.post_attention_layernorm = RMSNorm(
        config.hidden_size, eps=config.rms_norm_eps
    )
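
Whether a layer gets a sparse MoE block or a dense MLP depends only on layer_idx, mlp_only_layers, num_experts, and decoder_sparse_step. A small sketch with assumed config values showing which layer indices become MoE layers:

# Assumed values for illustration only.
num_layers, num_experts, decoder_sparse_step = 8, 64, 2
mlp_only_layers = [5]

moe_layer_ids = [
    i for i in range(num_layers)
    if i not in mlp_only_layers
    and num_experts > 0
    and (i + 1) % decoder_sparse_step == 0
]
print(moe_layer_ids)  # [1, 3, 7]: layer 5 is forced dense, even indices miss the step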

forward

forward(
    positions: Tensor,
    hidden_states: Tensor,
    residual: Tensor | None,
) -> tuple[Tensor, Tensor]
Source code in vllm/model_executor/models/interns1_pro.py
def forward(
    self,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    residual: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor]:
    # Self Attention
    if residual is None:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
    else:
        hidden_states, residual = self.input_layernorm(hidden_states, residual)
    hidden_states = self.self_attn(
        positions=positions,
        hidden_states=hidden_states,
    )

    # Fully Connected
    hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
    hidden_states = self.mlp(hidden_states)
    return hidden_states, residual

InternS1ProMoeLLMForCausalLM

Bases: Qwen3MoeForCausalLM

Source code in vllm/model_executor/models/interns1_pro.py
class InternS1ProMoeLLMForCausalLM(Qwen3MoeForCausalLM):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.config = vllm_config.model_config.hf_config.text_config
        self.quant_config = vllm_config.quant_config
        self.model = InternS1ProMoeLLMModel(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
        )
        self.lm_head = ParallelLMHead(
            self.config.vocab_size,
            self.config.hidden_size,
            quant_config=self.quant_config,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
        if self.config.tie_word_embeddings:
            self.lm_head.weight = self.model.embed_tokens.weight
        self.logits_processor = LogitsProcessor(self.config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors
        )

config instance-attribute

config = text_config

lm_head instance-attribute

lm_head = ParallelLMHead(
    vocab_size,
    hidden_size,
    quant_config=quant_config,
    prefix=maybe_prefix(prefix, "lm_head"),
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(vocab_size)

make_empty_intermediate_tensors instance-attribute

make_empty_intermediate_tensors = (
    make_empty_intermediate_tensors
)

model instance-attribute

model = InternS1ProMoeLLMModel(
    vllm_config=vllm_config,
    prefix=maybe_prefix(prefix, "model"),
)

quant_config instance-attribute

quant_config = quant_config

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/interns1_pro.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    self.config = vllm_config.model_config.hf_config.text_config
    self.quant_config = vllm_config.quant_config
    self.model = InternS1ProMoeLLMModel(
        vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
    )
    self.lm_head = ParallelLMHead(
        self.config.vocab_size,
        self.config.hidden_size,
        quant_config=self.quant_config,
        prefix=maybe_prefix(prefix, "lm_head"),
    )
    if self.config.tie_word_embeddings:
        self.lm_head.weight = self.model.embed_tokens.weight
    self.logits_processor = LogitsProcessor(self.config.vocab_size)
    self.make_empty_intermediate_tensors = (
        self.model.make_empty_intermediate_tensors
    )

InternS1ProMoeLLMModel

Bases: Qwen3MoeLLMModel

Source code in vllm/model_executor/models/interns1_pro.py
class InternS1ProMoeLLMModel(Qwen3MoeLLMModel):
    def __init__(
        self,
        *,
        vllm_config: VllmConfig,
        prefix: str = "",
        decoder_layer_type: type[torch.nn.Module] = InternS1ProMoeDecoderLayer,
    ):
        super().__init__(
            vllm_config=vllm_config,
            prefix=prefix,
            decoder_layer_type=decoder_layer_type,
        )

__init__

__init__(
    *,
    vllm_config: VllmConfig,
    prefix: str = "",
    decoder_layer_type: type[
        Module
    ] = InternS1ProMoeDecoderLayer,
)
Source code in vllm/model_executor/models/interns1_pro.py
def __init__(
    self,
    *,
    vllm_config: VllmConfig,
    prefix: str = "",
    decoder_layer_type: type[torch.nn.Module] = InternS1ProMoeDecoderLayer,
):
    super().__init__(
        vllm_config=vllm_config,
        prefix=prefix,
        decoder_layer_type=decoder_layer_type,
    )

InternS1ProMoeMLP

Bases: Module

Source code in vllm/model_executor/models/interns1_pro.py
class InternS1ProMoeMLP(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: QuantizationConfig | None = None,
        reduce_results: bool = True,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size,
            [intermediate_size] * 2,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.gate_up_proj",
        )
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            reduce_results=reduce_results,
            prefix=f"{prefix}.down_proj",
        )
        if hidden_act != "silu":
            raise ValueError(
                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
            )
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x

act_fn instance-attribute

act_fn = SiluAndMul()

down_proj instance-attribute

down_proj = RowParallelLinear(
    intermediate_size,
    hidden_size,
    bias=False,
    quant_config=quant_config,
    reduce_results=reduce_results,
    prefix=f"{prefix}.down_proj",
)

gate_up_proj instance-attribute

gate_up_proj = MergedColumnParallelLinear(
    hidden_size,
    [intermediate_size] * 2,
    bias=False,
    quant_config=quant_config,
    prefix=f"{prefix}.gate_up_proj",
)

__init__

__init__(
    hidden_size: int,
    intermediate_size: int,
    hidden_act: str,
    quant_config: QuantizationConfig | None = None,
    reduce_results: bool = True,
    prefix: str = "",
) -> None
Source code in vllm/model_executor/models/interns1_pro.py
def __init__(
    self,
    hidden_size: int,
    intermediate_size: int,
    hidden_act: str,
    quant_config: QuantizationConfig | None = None,
    reduce_results: bool = True,
    prefix: str = "",
) -> None:
    super().__init__()
    self.gate_up_proj = MergedColumnParallelLinear(
        hidden_size,
        [intermediate_size] * 2,
        bias=False,
        quant_config=quant_config,
        prefix=f"{prefix}.gate_up_proj",
    )
    self.down_proj = RowParallelLinear(
        intermediate_size,
        hidden_size,
        bias=False,
        quant_config=quant_config,
        reduce_results=reduce_results,
        prefix=f"{prefix}.down_proj",
    )
    if hidden_act != "silu":
        raise ValueError(
            f"Unsupported activation: {hidden_act}. Only silu is supported for now."
        )
    self.act_fn = SiluAndMul()

forward

forward(x)
Source code in vllm/model_executor/models/interns1_pro.py
def forward(self, x):
    gate_up, _ = self.gate_up_proj(x)
    x = self.act_fn(gate_up)
    x, _ = self.down_proj(x)
    return x
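
gate_up_proj produces the gate and up projections concatenated along the last dimension, and SiluAndMul splits them and computes silu(gate) * up. A functional torch sketch of that activation (a stand-in for the fused vLLM op, with an assumed tensor in place of gate_up_proj(x)):

import torch
import torch.nn.functional as F

intermediate_size = 64
gate_up = torch.randn(4, 2 * intermediate_size)   # stand-in for gate_up_proj(x)

gate, up = gate_up.chunk(2, dim=-1)
activated = F.silu(gate) * up                     # what SiluAndMul computes
assert activated.shape == (4, intermediate_size)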

InternS1ProMoeSparseMoeBlock

Bases: Module

Source code in vllm/model_executor/models/interns1_pro.py
class InternS1ProMoeSparseMoeBlock(nn.Module):
    def __init__(
        self,
        vllm_config: VllmConfig,
        prefix: str = "",
    ):
        super().__init__()

        config = vllm_config.model_config.hf_text_config
        parallel_config = vllm_config.parallel_config
        quant_config = vllm_config.quant_config

        self.tp_size = get_tensor_model_parallel_world_size()

        self.ep_group = get_ep_group().device_group
        self.ep_rank = get_ep_group().rank_in_group
        self.ep_size = self.ep_group.size()
        self.n_routed_experts = config.num_experts

        self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe

        if self.tp_size > config.num_experts:
            raise ValueError(
                f"Tensor parallel size {self.tp_size} is greater than "
                f"the number of experts {config.num_experts}."
            )

        # Load balancing settings.
        eplb_config = vllm_config.parallel_config.eplb_config
        self.enable_eplb = parallel_config.enable_eplb

        self.n_logical_experts = self.n_routed_experts
        self.n_redundant_experts = eplb_config.num_redundant_experts
        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
        self.n_local_physical_experts = self.n_physical_experts // self.ep_size

        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
        self.physical_expert_end = (
            self.physical_expert_start + self.n_local_physical_experts
        )

        # For custom routing function
        self.n_groups = getattr(config, "router_n_groups", -1)

        self.experts = FusedMoE(
            num_experts=self.n_routed_experts,
            top_k=config.num_experts_per_tok,
            hidden_size=config.hidden_size,
            intermediate_size=config.moe_intermediate_size,
            reduce_results=True,
            renormalize=config.norm_topk_prob,
            quant_config=quant_config,
            prefix=f"{prefix}.experts",
            enable_eplb=self.enable_eplb,
            num_redundant_experts=self.n_redundant_experts,
            is_sequence_parallel=self.is_sequence_parallel,
            routing_method_type=RoutingMethodType.Renormalize,
            custom_routing_function=self._custom_routing_function,
        )

        self.gate = ReplicatedLinear(
            config.hidden_size,
            config.num_experts,
            bias=False,
            prefix=f"{prefix}.gate",
        )

    @staticmethod
    @functools.lru_cache
    def get_group_offsets(n_groups: int, group_size: int, device: str):
        group_offsets = (torch.arange(n_groups, device=device) * group_size).view(
            1, -1, 1
        )  # [1, n_groups, 1]
        return group_offsets

    # TODO: zhouxinyu, use vllm routing functions
    def _custom_routing_function(
        self,
        hidden_states: torch.Tensor,
        gating_output: torch.Tensor,
        topk: int,
        renormalize: bool,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        routing_weights = torch.softmax(gating_output, dim=-1, dtype=torch.float32)

        if self.n_groups > 0:
            assert routing_weights.shape[-1] % self.n_groups == 0, (
                f"{routing_weights.shape[-1]} cannot be divided by {self.n_groups}"
            )
            per_group_top_k = topk // self.n_groups
            group_size = routing_weights.shape[-1] // self.n_groups
            group_offsets = self.get_group_offsets(
                self.n_groups, group_size, routing_weights.device
            )
            routing_weights = routing_weights.unflatten(-1, (self.n_groups, group_size))
            topk_weights, topk_ids = torch.topk(
                routing_weights, per_group_top_k, dim=-1
            )
            topk_ids = (topk_ids + group_offsets).flatten(-2, -1)
            topk_weights = topk_weights.flatten(-2, -1)
        else:
            topk_weights, topk_ids = torch.topk(routing_weights, topk, dim=-1)

        if renormalize:
            topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

        return topk_weights, topk_ids

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        assert hidden_states.dim() <= 2, (
            "InternS1ProMoeSparseMoeBlock only supports 1D or 2D inputs"
        )
        is_input_1d = hidden_states.dim() == 1
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)

        if self.is_sequence_parallel:
            hidden_states = sequence_parallel_chunk(hidden_states)

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        final_hidden_states = self.experts(
            hidden_states=hidden_states, router_logits=router_logits
        )

        if self.is_sequence_parallel:
            final_hidden_states = tensor_model_parallel_all_gather(
                final_hidden_states, 0
            )
            final_hidden_states = final_hidden_states[:num_tokens]

        # return to 1d if input is 1d
        return final_hidden_states.squeeze(0) if is_input_1d else final_hidden_states

enable_eplb instance-attribute

enable_eplb = enable_eplb

ep_group instance-attribute

ep_group = device_group

ep_rank instance-attribute

ep_rank = rank_in_group

ep_size instance-attribute

ep_size = size()

experts instance-attribute

experts = FusedMoE(
    num_experts=n_routed_experts,
    top_k=num_experts_per_tok,
    hidden_size=hidden_size,
    intermediate_size=moe_intermediate_size,
    reduce_results=True,
    renormalize=norm_topk_prob,
    quant_config=quant_config,
    prefix=f"{prefix}.experts",
    enable_eplb=enable_eplb,
    num_redundant_experts=n_redundant_experts,
    is_sequence_parallel=is_sequence_parallel,
    routing_method_type=Renormalize,
    custom_routing_function=_custom_routing_function,
)

gate instance-attribute

gate = ReplicatedLinear(
    hidden_size,
    num_experts,
    bias=False,
    prefix=f"{prefix}.gate",
)

is_sequence_parallel instance-attribute

is_sequence_parallel = use_sequence_parallel_moe

n_groups instance-attribute

n_groups = getattr(config, 'router_n_groups', -1)

n_local_physical_experts instance-attribute

n_local_physical_experts = n_physical_experts // ep_size

n_logical_experts instance-attribute

n_logical_experts = n_routed_experts

n_physical_experts instance-attribute

n_physical_experts = n_logical_experts + n_redundant_experts

n_redundant_experts instance-attribute

n_redundant_experts = num_redundant_experts

n_routed_experts instance-attribute

n_routed_experts = num_experts

physical_expert_end instance-attribute

physical_expert_end = (
    physical_expert_start + n_local_physical_experts
)

physical_expert_start instance-attribute

physical_expert_start = ep_rank * n_local_physical_experts

tp_size instance-attribute

tp_size = get_tensor_model_parallel_world_size()

__init__

__init__(vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/interns1_pro.py
def __init__(
    self,
    vllm_config: VllmConfig,
    prefix: str = "",
):
    super().__init__()

    config = vllm_config.model_config.hf_text_config
    parallel_config = vllm_config.parallel_config
    quant_config = vllm_config.quant_config

    self.tp_size = get_tensor_model_parallel_world_size()

    self.ep_group = get_ep_group().device_group
    self.ep_rank = get_ep_group().rank_in_group
    self.ep_size = self.ep_group.size()
    self.n_routed_experts = config.num_experts

    self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe

    if self.tp_size > config.num_experts:
        raise ValueError(
            f"Tensor parallel size {self.tp_size} is greater than "
            f"the number of experts {config.num_experts}."
        )

    # Load balancing settings.
    eplb_config = vllm_config.parallel_config.eplb_config
    self.enable_eplb = parallel_config.enable_eplb

    self.n_logical_experts = self.n_routed_experts
    self.n_redundant_experts = eplb_config.num_redundant_experts
    self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
    self.n_local_physical_experts = self.n_physical_experts // self.ep_size

    self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
    self.physical_expert_end = (
        self.physical_expert_start + self.n_local_physical_experts
    )

    # For custom routing function
    self.n_groups = getattr(config, "router_n_groups", -1)

    self.experts = FusedMoE(
        num_experts=self.n_routed_experts,
        top_k=config.num_experts_per_tok,
        hidden_size=config.hidden_size,
        intermediate_size=config.moe_intermediate_size,
        reduce_results=True,
        renormalize=config.norm_topk_prob,
        quant_config=quant_config,
        prefix=f"{prefix}.experts",
        enable_eplb=self.enable_eplb,
        num_redundant_experts=self.n_redundant_experts,
        is_sequence_parallel=self.is_sequence_parallel,
        routing_method_type=RoutingMethodType.Renormalize,
        custom_routing_function=self._custom_routing_function,
    )

    self.gate = ReplicatedLinear(
        config.hidden_size,
        config.num_experts,
        bias=False,
        prefix=f"{prefix}.gate",
    )
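
The expert-parallel bookkeeping is integer arithmetic over logical, redundant, and physical experts. A worked example with assumed values (64 logical experts, 8 redundant experts, EP size 8, rank 3):

n_logical, n_redundant, ep_size, ep_rank = 64, 8, 8, 3   # assumed values

n_physical = n_logical + n_redundant   # 72
n_local = n_physical // ep_size        # 9 physical experts hosted per EP rank
start = ep_rank * n_local              # 27
end = start + n_local                  # 36 -> rank 3 owns physical experts 27..35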

_custom_routing_function

_custom_routing_function(
    hidden_states: Tensor,
    gating_output: Tensor,
    topk: int,
    renormalize: bool,
) -> tuple[Tensor, Tensor]
Source code in vllm/model_executor/models/interns1_pro.py
def _custom_routing_function(
    self,
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
    routing_weights = torch.softmax(gating_output, dim=-1, dtype=torch.float32)

    if self.n_groups > 0:
        assert routing_weights.shape[-1] % self.n_groups == 0, (
            f"{routing_weights.shape[-1]} cannot be divided by {self.n_groups}"
        )
        per_group_top_k = topk // self.n_groups
        group_size = routing_weights.shape[-1] // self.n_groups
        group_offsets = self.get_group_offsets(
            self.n_groups, group_size, routing_weights.device
        )
        routing_weights = routing_weights.unflatten(-1, (self.n_groups, group_size))
        topk_weights, topk_ids = torch.topk(
            routing_weights, per_group_top_k, dim=-1
        )
        topk_ids = (topk_ids + group_offsets).flatten(-2, -1)
        topk_weights = topk_weights.flatten(-2, -1)
    else:
        topk_weights, topk_ids = torch.topk(routing_weights, topk, dim=-1)

    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

    return topk_weights, topk_ids
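
With router_n_groups set, the experts are split into equal-sized groups, top-k is taken independently inside each group, and the per-group indices are shifted back into global expert numbering. A standalone torch sketch with assumed sizes (16 experts, 4 groups, top-8 overall, so top-2 per group):

import torch

num_experts, n_groups, topk = 16, 4, 8
group_size = num_experts // n_groups            # 4
per_group_top_k = topk // n_groups              # 2

logits = torch.randn(2, num_experts)            # router logits for 2 tokens
weights = torch.softmax(logits, dim=-1, dtype=torch.float32)

grouped = weights.unflatten(-1, (n_groups, group_size))       # [2, 4, 4]
topk_w, topk_ids = torch.topk(grouped, per_group_top_k, dim=-1)
offsets = (torch.arange(n_groups) * group_size).view(1, -1, 1)
topk_ids = (topk_ids + offsets).flatten(-2, -1)               # global ids, [2, 8]
topk_w = topk_w.flatten(-2, -1)
topk_w = topk_w / topk_w.sum(dim=-1, keepdim=True)            # renormalize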

forward

forward(hidden_states: Tensor) -> Tensor
Source code in vllm/model_executor/models/interns1_pro.py
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    assert hidden_states.dim() <= 2, (
        "InternS1ProMoeSparseMoeBlock only supports 1D or 2D inputs"
    )
    is_input_1d = hidden_states.dim() == 1
    num_tokens, hidden_dim = hidden_states.shape
    hidden_states = hidden_states.view(-1, hidden_dim)

    if self.is_sequence_parallel:
        hidden_states = sequence_parallel_chunk(hidden_states)

    # router_logits: (num_tokens, n_experts)
    router_logits, _ = self.gate(hidden_states)
    final_hidden_states = self.experts(
        hidden_states=hidden_states, router_logits=router_logits
    )

    if self.is_sequence_parallel:
        final_hidden_states = tensor_model_parallel_all_gather(
            final_hidden_states, 0
        )
        final_hidden_states = final_hidden_states[:num_tokens]

    # return to 1d if input is 1d
    return final_hidden_states.squeeze(0) if is_input_1d else final_hidden_states

get_group_offsets cached staticmethod

get_group_offsets(
    n_groups: int, group_size: int, device: str
)
Source code in vllm/model_executor/models/interns1_pro.py
@staticmethod
@functools.lru_cache
def get_group_offsets(n_groups: int, group_size: int, device: str):
    group_offsets = (torch.arange(n_groups, device=device) * group_size).view(
        1, -1, 1
    )  # [1, n_groups, 1]
    return group_offsets

InternS1ProProcessingInfo

Bases: Qwen3VLProcessingInfo

Source code in vllm/model_executor/models/interns1_pro.py
class InternS1ProProcessingInfo(Qwen3VLProcessingInfo):
    def get_hf_config(self):
        return self.ctx.get_hf_config()

    def get_hf_processor(self, **kwargs: object) -> AutoProcessor:
        return AutoProcessor.from_pretrained(
            self.ctx.model_config.model,
            trust_remote_code=True,
            **kwargs,
        )

get_hf_config

get_hf_config()
Source code in vllm/model_executor/models/interns1_pro.py
def get_hf_config(self):
    return self.ctx.get_hf_config()

get_hf_processor

get_hf_processor(**kwargs: object) -> AutoProcessor
Source code in vllm/model_executor/models/interns1_pro.py
def get_hf_processor(self, **kwargs: object) -> AutoProcessor:
    return AutoProcessor.from_pretrained(
        self.ctx.model_config.model,
        trust_remote_code=True,
        **kwargs,
    )

Qwen3VLMoeMixtureOfExperts

Bases: MixtureOfExperts

Source code in vllm/model_executor/models/interns1_pro.py
class Qwen3VLMoeMixtureOfExperts(MixtureOfExperts):
    def update_physical_experts_metadata(
        self,
        num_physical_experts: int,
        num_local_physical_experts: int,
    ) -> None:
        assert self.num_local_physical_experts == num_local_physical_experts
        self.num_physical_experts = num_physical_experts
        self.num_local_physical_experts = num_local_physical_experts
        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
        for layer in self.language_model.model.layers:
            if isinstance(layer.mlp, InternS1ProMoeSparseMoeBlock):
                moe = layer.mlp
                moe.n_local_physical_experts = num_local_physical_experts
                moe.n_physical_experts = num_physical_experts
                moe.n_redundant_experts = self.num_redundant_experts
                moe.experts.update_expert_map()

    def set_moe_parameters(self):
        self.expert_weights = []

        self.moe_layers = []
        example_moe = None
        for layer in self.language_model.model.layers:
            if hasattr(layer, "mlp") and isinstance(
                layer.mlp, InternS1ProMoeSparseMoeBlock
            ):
                example_moe = layer.mlp
                self.moe_layers.append(layer.mlp.experts)

        if example_moe is None:
            raise RuntimeError("No InternS1ProMoe layer found in the language_model.")

        # Set MoE hyperparameters
        self.num_moe_layers = len(self.moe_layers)
        self.num_expert_groups = 1
        self.num_shared_experts = 0
        self.num_logical_experts = example_moe.n_logical_experts
        self.num_physical_experts = example_moe.n_physical_experts
        self.num_local_physical_experts = example_moe.n_local_physical_experts
        self.num_routed_experts = example_moe.n_routed_experts
        self.num_redundant_experts = example_moe.n_redundant_experts

set_moe_parameters

set_moe_parameters()
Source code in vllm/model_executor/models/interns1_pro.py
def set_moe_parameters(self):
    self.expert_weights = []

    self.moe_layers = []
    example_moe = None
    for layer in self.language_model.model.layers:
        if hasattr(layer, "mlp") and isinstance(
            layer.mlp, InternS1ProMoeSparseMoeBlock
        ):
            example_moe = layer.mlp
            self.moe_layers.append(layer.mlp.experts)

    if example_moe is None:
        raise RuntimeError("No InternS1ProMoe layer found in the language_model.")

    # Set MoE hyperparameters
    self.num_moe_layers = len(self.moe_layers)
    self.num_expert_groups = 1
    self.num_shared_experts = 0
    self.num_logical_experts = example_moe.n_logical_experts
    self.num_physical_experts = example_moe.n_physical_experts
    self.num_local_physical_experts = example_moe.n_local_physical_experts
    self.num_routed_experts = example_moe.n_routed_experts
    self.num_redundant_experts = example_moe.n_redundant_experts

update_physical_experts_metadata

update_physical_experts_metadata(
    num_physical_experts: int,
    num_local_physical_experts: int,
) -> None
Source code in vllm/model_executor/models/interns1_pro.py
def update_physical_experts_metadata(
    self,
    num_physical_experts: int,
    num_local_physical_experts: int,
) -> None:
    assert self.num_local_physical_experts == num_local_physical_experts
    self.num_physical_experts = num_physical_experts
    self.num_local_physical_experts = num_local_physical_experts
    self.num_redundant_experts = num_physical_experts - self.num_logical_experts
    for layer in self.language_model.model.layers:
        if isinstance(layer.mlp, InternS1ProMoeSparseMoeBlock):
            moe = layer.mlp
            moe.n_local_physical_experts = num_local_physical_experts
            moe.n_physical_experts = num_physical_experts
            moe.n_redundant_experts = self.num_redundant_experts
            moe.experts.update_expert_map()