
vllm.model_executor.layers.fused_moe.xpu_fused_moe

XPUExperts

Bases: FusedMoEPermuteExpertsUnpermute

XPU implementation of the expert-compute stage of vLLM's modular fused-MoE kernel. It takes unquantized activations, optionally paired with FP8 static per-tensor weights, and runs the whole expert computation in a single xpu_fused_moe call, with no intermediate workspaces and no chunking support.

Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute):
    @property
    def expects_unquantized_inputs(self) -> bool:
        return True

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    @staticmethod
    def _supports_current_device() -> bool:
        return current_platform.is_xpu()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        return False

    @staticmethod
    def _supports_activation(activation: str) -> bool:
        return activation in ["silu", "gelu", "swigluoai"]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        return True

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        # TODO: dispatch based on device.
        SUPPORTED_W_A = [
            (None, None),
            (kFp8StaticTensorSym, None),
        ]
        return (weight_key, activation_key) in SUPPORTED_W_A

    def supports_chunking(self) -> bool:
        return False

    def supports_expert_map(self) -> bool:
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        return TopKWeightAndReduceNoOP()

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: str,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        workspace1 = (0,)
        workspace2 = (0,)
        output = (M, K)
        return (workspace1, workspace2, output)

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        topk = topk_ids.size(-1)
        xpu_fused_moe(
            hidden_states=hidden_states,
            w13=w1,
            w13_scales=a1q_scale,
            w13_bias=self.w1_bias,
            w2=w2,
            w2_scales=a2_scale,
            w2_bias=self.w2_bias,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            n_experts_per_token=topk,
            activation=activation,
            num_experts=self.moe_config.num_local_experts,
            ep_rank=self.moe_config.ep_rank,
            ep_size=self.moe_config.ep_size,
            output=output,
        )
        return
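
Usage note: the _supports_* hooks are static, so backend capability can be probed without constructing the class. A minimal sketch (it assumes an XPU build of vLLM so the import resolves):

from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExperts

# Activation support mirrors the whitelist in _supports_activation.
assert XPUExperts._supports_activation("silu")
assert XPUExperts._supports_activation("swigluoai")
assert not XPUExperts._supports_activation("relu")

# The backend requires fused act-and-mul and only runs on XPU devices.
assert not XPUExperts._supports_no_act_and_mul()
print(XPUExperts._supports_current_device())  # True only on an XPU platform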

expects_unquantized_inputs property

expects_unquantized_inputs: bool

_supports_activation staticmethod

_supports_activation(activation: str) -> bool
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
@staticmethod
def _supports_activation(activation: str) -> bool:
    return activation in ["silu", "gelu", "swigluoai"]

_supports_current_device staticmethod

_supports_current_device() -> bool
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
@staticmethod
def _supports_current_device() -> bool:
    return current_platform.is_xpu()

_supports_no_act_and_mul staticmethod

_supports_no_act_and_mul() -> bool
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
@staticmethod
def _supports_no_act_and_mul() -> bool:
    return False

_supports_parallel_config staticmethod

_supports_parallel_config(
    moe_parallel_config: FusedMoEParallelConfig,
) -> bool
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
@staticmethod
def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
    return True

_supports_quant_scheme staticmethod

_supports_quant_scheme(
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
) -> bool
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
@staticmethod
def _supports_quant_scheme(
    weight_key: QuantKey | None,
    activation_key: QuantKey | None,
) -> bool:
    # TODO: dispatch based on device.
    SUPPORTED_W_A = [
        (None, None),
        (kFp8StaticTensorSym, None),
    ]
    return (weight_key, activation_key) in SUPPORTED_W_A
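
For illustration, only the two listed (weight, activation) pairs pass. A hedged sketch; the import path for kFp8StaticTensorSym is an assumption about where vLLM keeps its QuantKey constants:

# Assumed import location; adjust if the constant lives elsewhere.
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    kFp8StaticTensorSym,
)

XPUExperts._supports_quant_scheme(None, None)                 # True: fully unquantized
XPUExperts._supports_quant_scheme(kFp8StaticTensorSym, None)  # True: FP8 static per-tensor weights
XPUExperts._supports_quant_scheme(None, kFp8StaticTensorSym)  # False: quantized activations rejected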

activation_format staticmethod

activation_format() -> FusedMoEActivationFormat
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
@staticmethod
def activation_format() -> mk.FusedMoEActivationFormat:
    return mk.FusedMoEActivationFormat.Standard

apply

apply(
    output: Tensor,
    hidden_states: Tensor,
    w1: Tensor,
    w2: Tensor,
    topk_weights: Tensor,
    topk_ids: Tensor,
    activation: str,
    global_num_experts: int,
    expert_map: Tensor | None,
    a1q_scale: Tensor | None,
    a2_scale: Tensor | None,
    workspace13: Tensor,
    workspace2: Tensor,
    expert_tokens_meta: ExpertTokensMetadata | None,
    apply_router_weight_on_input: bool,
)
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
def apply(
    self,
    output: torch.Tensor,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str,
    global_num_experts: int,
    expert_map: torch.Tensor | None,
    a1q_scale: torch.Tensor | None,
    a2_scale: torch.Tensor | None,
    workspace13: torch.Tensor,
    workspace2: torch.Tensor,
    expert_tokens_meta: mk.ExpertTokensMetadata | None,
    apply_router_weight_on_input: bool,
):
    topk = topk_ids.size(-1)
    xpu_fused_moe(
        hidden_states=hidden_states,
        w13=w1,
        w13_scales=a1q_scale,
        w13_bias=self.w1_bias,
        w2=w2,
        w2_scales=a2_scale,
        w2_bias=self.w2_bias,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        n_experts_per_token=topk,
        activation=activation,
        num_experts=self.moe_config.num_local_experts,
        ep_rank=self.moe_config.ep_rank,
        ep_size=self.moe_config.ep_size,
        output=output,
    )
    return
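
Note that apply delegates the entire expert computation to the xpu_fused_moe kernel in one call: workspace13 and workspace2 are accepted only for interface compatibility (workspace_shapes below reports zero-sized buffers), expert placement appears to be derived inside the kernel from ep_rank and ep_size, since the expert_map tensor is not forwarded, and the result is written into output in place, which is why the method returns nothing.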

finalize_weight_and_reduce_impl

finalize_weight_and_reduce_impl() -> TopKWeightAndReduce
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
    return TopKWeightAndReduceNoOP()
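
Returning TopKWeightAndReduceNoOP signals that no separate weight-and-reduce pass is needed: topk_weights is handed to xpu_fused_moe, and the fully reduced (M, K) output shape reported by workspace_shapes suggests the kernel applies the router weights and sums over the selected experts itself.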

supports_chunking

supports_chunking() -> bool
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
def supports_chunking(self) -> bool:
    return False

supports_expert_map

supports_expert_map() -> bool
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
def supports_expert_map(self) -> bool:
    return True

workspace_shapes

workspace_shapes(
    M: int,
    N: int,
    K: int,
    topk: int,
    global_num_experts: int,
    local_num_experts: int,
    expert_tokens_meta: ExpertTokensMetadata | None,
    activation: str,
) -> tuple[
    tuple[int, ...], tuple[int, ...], tuple[int, ...]
]
Source code in vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
def workspace_shapes(
    self,
    M: int,
    N: int,
    K: int,
    topk: int,
    global_num_experts: int,
    local_num_experts: int,
    expert_tokens_meta: mk.ExpertTokensMetadata | None,
    activation: str,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
    workspace1 = (0,)
    workspace2 = (0,)
    output = (M, K)
    return (workspace1, workspace2, output)
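
Because the XPU kernel needs no scratch space, this method always reports empty workspaces and an (M, K) output buffer. A minimal illustration with arbitrary sizes; since the implementation never touches self, the sketch calls it unbound for brevity:

# Hypothetical sizes, purely for illustration.
ws1, ws2, out = XPUExperts.workspace_shapes(
    None,  # self is unused by this implementation
    M=128,
    N=4096,
    K=2048,
    topk=4,
    global_num_experts=64,
    local_num_experts=8,
    expert_tokens_meta=None,
    activation="silu",
)
assert ws1 == (0,) and ws2 == (0,) and out == (128, 2048)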