vllm.model_executor.layers.activation ¶

Custom activation functions.

_ACTIVATION_AND_MUL_REGISTRY `module-attribute` ¶

# Registry of fused "activation and multiply" ops, keyed by activation name.
# Every value is a factory callable; "gelu" and "geglu" both build GeluAndMul.
# NOTE(review): LazyDict is defined elsewhere - presumably it defers calling
# the factory until the key is looked up; confirm against its definition.
_ACTIVATION_AND_MUL_REGISTRY = LazyDict(
    {
        "gelu": lambda: GeluAndMul(),
        "silu": lambda: SiluAndMul(),
        "geglu": lambda: GeluAndMul(),
        # The only factory that forwards constructor arguments.
        "swigluoai": lambda *args,
        **kwargs: SwigluOAIAndMul(*args, **kwargs),
    }
)

_ACTIVATION_REGISTRY `module-attribute` ¶

# Registry of element-wise activation modules, keyed by activation name.
# Every value is a zero-argument factory callable.
# NOTE(review): LazyDict is defined elsewhere - presumably it defers calling
# the factory until the key is looked up; confirm against its definition.
_ACTIVATION_REGISTRY = LazyDict(
    {
        "gelu": lambda: GELU(),
        "gelu_fast": lambda: FastGELU(),
        "gelu_new": lambda: NewGELU(),
        # On ROCm the (warning, module)[1] tuple trick emits the warning for
        # its side effect and yields GELU(approximate="none"); on every other
        # platform the factory yields GELU(approximate="tanh").
        "gelu_pytorch_tanh": lambda: (
            warning_once(
                "[ROCm] PyTorch's native GELU with tanh approximation is unstable. Falling back to GELU(approximate='none')."
            ),
            GELU(approximate="none"),
        )[1]
        if is_rocm()
        else GELU(approximate="tanh"),
        "relu": lambda: ReLU(),
        "relu2": lambda: ReLUSquaredActivation(),
        "silu": lambda: SiLU(),
        "quick_gelu": lambda: QuickGELU(),
        "tanh": lambda: Tanh(),
        "sigmoid": lambda: Sigmoid(),
        "xielu": lambda: XIELU(),
    }
)

logger `module-attribute` ¶

# Module-level logger, created from this module's dotted name.
logger = init_logger(__name__)

FastGELU ¶

Bases: CustomOp

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("gelu_fast")
class FastGELU(CustomOp):
    # --8<-- [end:gelu_fast]
    """Tanh-based GELU approximation registered as ``gelu_fast``."""

    def __init__(self):
        """Bind ``self.op`` to the kernel matching the current platform."""
        super().__init__()
        has_compiled_op = (
            current_platform.is_cuda_alike() or current_platform.is_cpu()
        )
        if has_compiled_op:
            self.op = torch.ops._C.gelu_fast
            return
        if current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            self.op = ipex_ops.gelu_fast

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        poly = 1.0 + 0.044715 * x * x
        tanh_arg = x * 0.7978845608 * poly
        return 0.5 * x * (1.0 + torch.tanh(tanh_arg))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh buffer shaped like ``x``."""
        result = torch.empty_like(x)
        self.op(result, x)
        return result

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        """Return whatever ``self.op`` produces for ``x``."""
        return self.op(x)

op `instance-attribute` ¶

op = gelu_fast

__init__ ¶

__init__()

Source code in vllm/model_executor/layers/activation.py

def __init__(self):
    """Bind ``self.op`` to the kernel matching the current platform."""
    super().__init__()
    has_compiled_op = current_platform.is_cuda_alike() or current_platform.is_cpu()
    if has_compiled_op:
        self.op = torch.ops._C.gelu_fast
        return
    if current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops

        self.op = ipex_ops.gelu_fast

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``self.op`` into a fresh buffer shaped like ``x`` and return it."""
    result = torch.empty_like(x)
    self.op(result, x)
    return result

forward_native ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    poly = 1.0 + 0.044715 * x * x
    tanh_arg = x * 0.7978845608 * poly
    return 0.5 * x * (1.0 + torch.tanh(tanh_arg))

forward_xpu ¶

forward_xpu(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    """Return whatever ``self.op`` produces for ``x``."""
    result = self.op(x)
    return result

FatreluAndMul ¶

Bases: CustomOp

An activation function for FATReLU.

The function computes x -> FATReLU(x[:d]) * x[d:] where d = x.shape[-1] // 2. This is used in openbmb/MiniCPM-S-1B-sft.

Shapes

x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) return: (num_tokens, d) or (batch_size, seq_len, d)

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("fatrelu_and_mul")
class FatreluAndMul(CustomOp):
    """Gated FATReLU activation.

    Splits the last dimension in half (d = x.shape[-1] // 2) and returns
    FATReLU(x[:d]) * x[d:].
    This is used in openbmb/MiniCPM-S-1B-sft.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    # --8<-- [end:fatrelu_and_mul]

    def __init__(self, threshold: float = 0.0):
        """Store ``threshold`` and pick the platform-specific execution path."""
        super().__init__()
        self.threshold = threshold
        if current_platform.is_cuda_alike():
            self.op = torch.ops._C.fatrelu_and_mul
            return
        if current_platform.is_cpu():
            # Route dispatch to the PyTorch-native path on CPU.
            self._forward_method = self.forward_native

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """Threshold the first half of the last dim, multiply by the second."""
        half = x.shape[-1] // 2
        gate, up = x[..., :half], x[..., half:]
        return F.threshold(gate, self.threshold, 0.0) * up

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh half-width output buffer."""
        half = x.shape[-1] // 2
        result = torch.empty(
            (*x.shape[:-1], half), dtype=x.dtype, device=x.device
        )
        self.op(result, x, self.threshold)
        return result

_forward_method `instance-attribute` ¶

_forward_method = forward_native

op `instance-attribute` ¶

op = fatrelu_and_mul

threshold `instance-attribute` ¶

threshold = threshold

__init__ ¶

__init__(threshold: float = 0.0)

Source code in vllm/model_executor/layers/activation.py

def __init__(self, threshold: float = 0.0):
    """Store ``threshold`` and pick the platform-specific execution path."""
    super().__init__()
    self.threshold = threshold
    if current_platform.is_cuda_alike():
        self.op = torch.ops._C.fatrelu_and_mul
        return
    if current_platform.is_cpu():
        # Route dispatch to the PyTorch-native path on CPU.
        self._forward_method = self.forward_native

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``self.op`` into a fresh half-width output buffer and return it."""
    half = x.shape[-1] // 2
    result = torch.empty((*x.shape[:-1], half), dtype=x.dtype, device=x.device)
    self.op(result, x, self.threshold)
    return result

forward_native ¶

forward_native(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """Threshold the first half of the last dim and multiply by the second."""
    half = x.shape[-1] // 2
    gate, up = x[..., :half], x[..., half:]
    return F.threshold(gate, self.threshold, 0.0) * up

GeluAndMul ¶

Bases: CustomOp

An activation function for GeGLU.

The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.

Shapes

x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) return: (batch_size, seq_len, d) or (num_tokens, d)

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("gelu_and_mul")
class GeluAndMul(CustomOp):
    """Gated GELU (GeGLU) activation.

    Splits the last dimension in half (d = x.shape[-1] // 2) and returns
    GELU(x[:d]) * x[d:].

    Shapes:
        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
        return: (batch_size, seq_len, d) or (num_tokens, d)
    """

    # --8<-- [end:gelu_and_mul]

    def __init__(self, approximate: str = "none"):
        """Validate ``approximate`` and bind the platform kernel to ``self.op``.

        Raises:
            ValueError: if ``approximate`` is neither "none" nor "tanh".
        """
        super().__init__()
        self.approximate = approximate
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")
        wants_tanh = approximate == "tanh"
        if current_platform.is_cuda_alike() or current_platform.is_cpu():
            self.op = (
                torch.ops._C.gelu_tanh_and_mul
                if wants_tanh
                else torch.ops._C.gelu_and_mul
            )
        # Separate chain: the ROCm warning is checked first, XPU only otherwise.
        if current_platform.is_rocm() and wants_tanh:
            logger.warning_once(
                "[ROCm] PyTorch's native GELU with tanh approximation is unstable "
                "with torch.compile. For native implementation, fallback to 'none' "
                "approximation. The custom kernel implementation is unaffected."
            )
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            if wants_tanh:
                self.op = ipex_ops.gelu_tanh_and_mul
            else:
                self.op = ipex_ops.gelu_and_mul

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        # TODO: [ROCm] PyTorch's native GELU with tanh is unstable with torch.compile
        mode = self.approximate
        if current_platform.is_rocm() and mode == "tanh":
            mode = "none"
        half = x.shape[-1] // 2
        gate, up = x[..., :half], x[..., half:]
        return F.gelu(gate, approximate=mode) * up

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh half-width output buffer."""
        half = x.shape[-1] // 2
        result = torch.empty(
            (*x.shape[:-1], half), dtype=x.dtype, device=x.device
        )
        self.op(result, x)
        return result

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh half-width output buffer."""
        half = x.shape[-1] // 2
        result = torch.empty(
            (*x.shape[:-1], half), dtype=x.dtype, device=x.device
        )
        self.op(result, x)
        return result

    def extra_repr(self) -> str:
        """Return the ``approximate=...`` fragment shown in the module repr."""
        return f"approximate={self.approximate!r}"

approximate `instance-attribute` ¶

approximate = approximate

op `instance-attribute` ¶

op = gelu_and_mul

__init__ ¶

__init__(approximate: str = 'none')

Source code in vllm/model_executor/layers/activation.py

def __init__(self, approximate: str = "none"):
    """Validate ``approximate`` and bind the platform kernel to ``self.op``.

    Raises:
        ValueError: if ``approximate`` is neither "none" nor "tanh".
    """
    super().__init__()
    self.approximate = approximate
    if approximate not in ("none", "tanh"):
        raise ValueError(f"Unknown approximate mode: {approximate}")
    wants_tanh = approximate == "tanh"
    if current_platform.is_cuda_alike() or current_platform.is_cpu():
        self.op = (
            torch.ops._C.gelu_tanh_and_mul
            if wants_tanh
            else torch.ops._C.gelu_and_mul
        )
    # Separate chain: the ROCm warning is checked first, XPU only otherwise.
    if current_platform.is_rocm() and wants_tanh:
        logger.warning_once(
            "[ROCm] PyTorch's native GELU with tanh approximation is unstable "
            "with torch.compile. For native implementation, fallback to 'none' "
            "approximation. The custom kernel implementation is unaffected."
        )
    elif current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops

        if wants_tanh:
            self.op = ipex_ops.gelu_tanh_and_mul
        else:
            self.op = ipex_ops.gelu_and_mul

extra_repr ¶

extra_repr() -> str

Source code in vllm/model_executor/layers/activation.py

def extra_repr(self) -> str:
    """Return the ``approximate=...`` fragment describing this instance."""
    return f"approximate={self.approximate!r}"

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``self.op`` into a fresh half-width output buffer and return it."""
    half = x.shape[-1] // 2
    result = torch.empty((*x.shape[:-1], half), dtype=x.dtype, device=x.device)
    self.op(result, x)
    return result

forward_native ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    # TODO: [ROCm] PyTorch's native GELU with tanh is unstable with torch.compile
    mode = self.approximate
    if current_platform.is_rocm() and mode == "tanh":
        mode = "none"
    half = x.shape[-1] // 2
    gate, up = x[..., :half], x[..., half:]
    return F.gelu(gate, approximate=mode) * up

forward_xpu ¶

forward_xpu(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``self.op`` into a fresh half-width output buffer and return it."""
    half = x.shape[-1] // 2
    result = torch.empty((*x.shape[:-1], half), dtype=x.dtype, device=x.device)
    self.op(result, x)
    return result

GeluAndMulSparse ¶

Bases: CustomOp

An activation function for GeluAndMulSparse. This activation function is used in Gemma3n. It computes:

    up_proj = self.up_proj(x)
    gate_proj = self.gate_proj(x)
    gate_proj = self._gaussian_topk(gate_proj)  # sparsity
    activations = self.act_fn(gate_proj)  # gelu
    down_proj = self.down_proj(activations * up_proj)

Shapes

x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) return: (num_tokens, d) or (batch_size, seq_len, d)

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("gelu_and_mul_sparse")
class GeluAndMulSparse(CustomOp):
    """Sparse gated GELU activation, used in Gemma3n.

    The surrounding MLP computes:
        up_proj = self.up_proj(x)
        gate_proj = self.gate_proj(x)
        gate_proj = self._gaussian_topk(gate_proj) # sparsity
        activations = self.act_fn(gate_proj) # gelu
        down_proj = self.down_proj(activations * up_proj)
    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    # --8<-- [end:gelu_and_mul_sparse]

    def __init__(self, activation_sparsity: float, approximate: str = "none"):
        """Validate the arguments and precompute the sparsity cutoff multiplier.

        Raises:
            ValueError: if ``approximate`` is neither "none" nor "tanh", or
                if ``activation_sparsity`` equals 0.0.
        """
        super().__init__()

        # GELU configuration.
        self.approximate = approximate
        if approximate not in ("none", "tanh"):
            raise ValueError(f"Unknown approximate mode: {approximate}")
        if current_platform.is_rocm() and approximate == "tanh":
            # TODO:[ROCm] PyTorch native GELU with tanh is unstable with torch.compile
            logger.warning_once(
                "[ROCm] Pytorch's native GELU with tanh approximation is currently "
                "unstable and produces garbage. Fallback to 'none' approximation."
            )
            self.approximate = "none"

        # Sparsity configuration.
        if activation_sparsity == 0.0:
            raise ValueError("activation_sparsity is 0.0. Please use GeluAndMul.")
        sparsity = torch.tensor(activation_sparsity, dtype=torch.float32)
        standard_normal = torch.distributions.normal.Normal(0, 1)
        # Inverse CDF of N(0, 1) evaluated at the target sparsity.
        self.std_multiplier = standard_normal.icdf(sparsity)

    def _gaussian_topk(self, x: torch.Tensor) -> torch.Tensor:
        """Get % sparse percentile of the Gaussian distribution."""
        # NOTE(rob): for TP>1, we could all-gather to get the means/std.
        # But we do not do this because in expectation they are the same
        # and in practice the eval scores are good without gathering.
        row_mean = x.mean(dim=-1, keepdim=True)
        row_std = x.std(dim=-1, keepdim=True, unbiased=False)
        threshold = row_mean + row_std * self.std_multiplier
        return nn.functional.relu(x - threshold)

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        half = x.shape[-1] // 2
        gate = self._gaussian_topk(x[..., :half])
        gate = F.gelu(gate, approximate=self.approximate)
        return gate * x[..., half:]

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Delegate to the PyTorch-native implementation."""
        return self.forward_native(x)

approximate `instance-attribute` ¶

approximate = approximate

std_multiplier `instance-attribute` ¶

std_multiplier = icdf(target_sparsity_tensor)

__init__ ¶

__init__(
    activation_sparsity: float, approximate: str = "none"
)

Source code in vllm/model_executor/layers/activation.py

def __init__(self, activation_sparsity: float, approximate: str = "none"):
    """Validate the arguments and precompute the sparsity cutoff multiplier.

    Raises:
        ValueError: if ``approximate`` is neither "none" nor "tanh", or if
            ``activation_sparsity`` equals 0.0.
    """
    super().__init__()

    # GELU configuration.
    self.approximate = approximate
    if approximate not in ("none", "tanh"):
        raise ValueError(f"Unknown approximate mode: {approximate}")
    if current_platform.is_rocm() and approximate == "tanh":
        # TODO:[ROCm] PyTorch native GELU with tanh is unstable with torch.compile
        logger.warning_once(
            "[ROCm] Pytorch's native GELU with tanh approximation is currently "
            "unstable and produces garbage. Fallback to 'none' approximation."
        )
        self.approximate = "none"

    # Sparsity configuration.
    if activation_sparsity == 0.0:
        raise ValueError("activation_sparsity is 0.0. Please use GeluAndMul.")
    sparsity = torch.tensor(activation_sparsity, dtype=torch.float32)
    standard_normal = torch.distributions.normal.Normal(0, 1)
    # Inverse CDF of N(0, 1) evaluated at the target sparsity.
    self.std_multiplier = standard_normal.icdf(sparsity)

_gaussian_topk ¶

_gaussian_topk(x: Tensor) -> Tensor

Get % sparse percentile of the Gaussian distribution.

Source code in vllm/model_executor/layers/activation.py

def _gaussian_topk(self, x: torch.Tensor) -> torch.Tensor:
    """Get % sparse percentile of the Gaussian distribution."""
    # NOTE(rob): for TP>1, we could all-gather to get the means/std.
    # But we do not do this because in expectation they are the same
    # and in practice the eval scores are good without gathering.
    mean = torch.mean(x, dim=-1, keepdim=True)
    std = torch.std(x, dim=-1, keepdim=True, unbiased=False)
    cutoff_x = mean + std * self.std_multiplier
    return nn.functional.relu(x - cutoff_x)

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Delegate to the PyTorch-native implementation."""
    result = self.forward_native(x)
    return result

forward_native ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    half = x.shape[-1] // 2
    gate = self._gaussian_topk(x[..., :half])
    gate = F.gelu(gate, approximate=self.approximate)
    return gate * x[..., half:]

MulAndSilu ¶

Bases: CustomOp

An activation function for SwiGLU.

The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.

Shapes

x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) return: (num_tokens, d) or (batch_size, seq_len, d)

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("mul_and_silu")
class MulAndSilu(CustomOp):
    """SwiGLU variant that applies SiLU to the second half of the input.

    Splits the last dimension in half (d = x.shape[-1] // 2) and returns
    x[:d] * silu(x[d:]).

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    # --8<-- [end:mul_and_silu]

    def __init__(self):
        """Pick the platform-specific kernel or execution path."""
        super().__init__()
        if current_platform.is_cuda_alike():
            self.op = torch.ops._C.mul_and_silu
            return
        if current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            # NOTE(review): this binds `silu_and_mul`, whose name suggests the
            # opposite operand order from this class, and no forward_xpu is
            # defined here - confirm whether `op` is ever used on XPU.
            self.op = ipex_ops.silu_and_mul
            return
        if current_platform.is_cpu():
            # Route dispatch to the PyTorch-native path on CPU.
            self._forward_method = self.forward_native

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        half = x.shape[-1] // 2
        left, right = x[..., :half], x[..., half:]
        return left * F.silu(right)

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh half-width output buffer."""
        half = x.shape[-1] // 2
        result = torch.empty(
            (*x.shape[:-1], half), dtype=x.dtype, device=x.device
        )
        self.op(result, x)
        return result

_forward_method `instance-attribute` ¶

_forward_method = forward_native

op `instance-attribute` ¶

op = mul_and_silu

__init__ ¶

__init__()

Source code in vllm/model_executor/layers/activation.py

def __init__(self):
    """Pick the platform-specific kernel or execution path."""
    super().__init__()
    if current_platform.is_cuda_alike():
        self.op = torch.ops._C.mul_and_silu
        return
    if current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops

        # NOTE(review): this binds `silu_and_mul`, whose name suggests the
        # opposite operand order from mul-and-silu - confirm this is intended.
        self.op = ipex_ops.silu_and_mul
        return
    if current_platform.is_cpu():
        # Route dispatch to the PyTorch-native path on CPU.
        self._forward_method = self.forward_native

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``self.op`` into a fresh half-width output buffer and return it."""
    half = x.shape[-1] // 2
    result = torch.empty((*x.shape[:-1], half), dtype=x.dtype, device=x.device)
    self.op(result, x)
    return result

forward_native ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    half = x.shape[-1] // 2
    left, right = x[..., :half], x[..., half:]
    return left * F.silu(right)

NewGELU ¶

Bases: CustomOp

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("gelu_new")
class NewGELU(CustomOp):
    # --8<-- [end:gelu_new]
    """Tanh-based GELU approximation registered as ``gelu_new``."""

    def __init__(self):
        """Bind ``self.op`` to the kernel matching the current platform."""
        super().__init__()
        has_compiled_op = (
            current_platform.is_cuda_alike() or current_platform.is_cpu()
        )
        if has_compiled_op:
            self.op = torch.ops._C.gelu_new
            return
        if current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            self.op = ipex_ops.gelu_new

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        scale = math.sqrt(2.0 / math.pi)
        cubed = torch.pow(x, 3.0)
        tanh_arg = scale * (x + 0.044715 * cubed)
        return 0.5 * x * (1.0 + torch.tanh(tanh_arg))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh buffer shaped like ``x``."""
        result = torch.empty_like(x)
        self.op(result, x)
        return result

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        """Return whatever ``self.op`` produces for ``x``."""
        return self.op(x)

op `instance-attribute` ¶

op = gelu_new

__init__ ¶

__init__()

Source code in vllm/model_executor/layers/activation.py

def __init__(self):
    """Bind ``self.op`` to the kernel matching the current platform."""
    super().__init__()
    has_compiled_op = current_platform.is_cuda_alike() or current_platform.is_cpu()
    if has_compiled_op:
        self.op = torch.ops._C.gelu_new
        return
    if current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops

        self.op = ipex_ops.gelu_new

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``self.op`` into a fresh buffer shaped like ``x`` and return it."""
    result = torch.empty_like(x)
    self.op(result, x)
    return result

forward_native ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    scale = math.sqrt(2.0 / math.pi)
    cubed = torch.pow(x, 3.0)
    tanh_arg = scale * (x + 0.044715 * cubed)
    return 0.5 * x * (1.0 + torch.tanh(tanh_arg))

forward_xpu ¶

forward_xpu(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    """Return whatever ``self.op`` produces for ``x``."""
    result = self.op(x)
    return result

QuickGELU ¶

Bases: CustomOp

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("quick_gelu")
class QuickGELU(CustomOp):
    # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
    # --8<-- [end:quick_gelu]
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""

    def __init__(self):
        """Bind ``self.op`` to the kernel matching the current platform."""
        super().__init__()
        has_compiled_op = (
            current_platform.is_cuda_alike() or current_platform.is_cpu()
        )
        if has_compiled_op:
            self.op = torch.ops._C.gelu_quick
            return
        if current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            self.op = ipex_ops.gelu_quick

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        gate = torch.sigmoid(1.702 * x)
        return x * gate

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh buffer shaped like ``x``."""
        result = torch.empty_like(x)
        self.op(result, x)
        return result

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh buffer shaped like ``x``."""
        result = torch.empty_like(x)
        self.op(result, x)
        return result

op `instance-attribute` ¶

op = gelu_quick

__init__ ¶

__init__()

Source code in vllm/model_executor/layers/activation.py

def __init__(self):
    """Bind ``self.op`` to the kernel matching the current platform."""
    super().__init__()
    has_compiled_op = current_platform.is_cuda_alike() or current_platform.is_cpu()
    if has_compiled_op:
        self.op = torch.ops._C.gelu_quick
        return
    if current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops

        self.op = ipex_ops.gelu_quick

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``self.op`` into a fresh buffer shaped like ``x`` and return it."""
    result = torch.empty_like(x)
    self.op(result, x)
    return result

forward_native ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    gate = torch.sigmoid(1.702 * x)
    return x * gate

forward_xpu ¶

forward_xpu(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``self.op`` into a fresh buffer shaped like ``x`` and return it."""
    result = torch.empty_like(x)
    self.op(result, x)
    return result

ReLUSquaredActivation ¶

Bases: CustomOp

Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("relu2")
class ReLUSquaredActivation(CustomOp):
    """
    Squared ReLU (relu^2) activation, introduced in
    https://arxiv.org/abs/2109.08668v2
    """

    # --8<-- [end:relu2]

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        rectified = F.relu(x)
        return torch.square(rectified)

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Delegate to the PyTorch-native implementation."""
        # TODO : implement cuda kernels
        return self.forward_native(x)

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Delegate to the PyTorch-native implementation."""
    # TODO : implement cuda kernels
    result = self.forward_native(x)
    return result

forward_native ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    rectified = F.relu(x)
    return torch.square(rectified)

ScaledActivation ¶

Bases: Module

An activation function with post-scale parameters.

This is used for some quantization methods like AWQ.

Source code in vllm/model_executor/layers/activation.py

class ScaledActivation(nn.Module):
    """Wrap an activation module and divide its output by learned scales.

    This is used for some quantization methods like AWQ.
    """

    def __init__(
        self,
        act_module: nn.Module,
        intermediate_size: int,
        input_is_parallel: bool = True,
        params_dtype: torch.dtype | None = None,
    ):
        """Create the ``scales`` parameter and attach its weight loader.

        Args:
            act_module: activation module applied before scaling.
            intermediate_size: full (unsharded) length of the scale vector.
            input_is_parallel: when True, the scale vector length is
                ``intermediate_size`` divided by the tensor-parallel world size.
            params_dtype: dtype of ``scales``; falls back to torch's default
                dtype when None.
        """
        super().__init__()
        self.act = act_module
        self.input_is_parallel = input_is_parallel
        shard_len = (
            divide(intermediate_size, get_tensor_model_parallel_world_size())
            if input_is_parallel
            else intermediate_size
        )
        dtype = torch.get_default_dtype() if params_dtype is None else params_dtype
        self.scales = nn.Parameter(torch.empty(shard_len, dtype=dtype))
        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the wrapped activation, then divide by ``scales``."""
        activated = self.act(x)
        return activated / self.scales

    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
        """Copy ``loaded_weight`` into ``param``.

        When ``input_is_parallel`` is set, only this rank's slice along
        dim 0 is copied.
        """
        target = param.data
        if self.input_is_parallel:
            shard_len = target.shape[0]
            offset = get_tensor_model_parallel_rank() * shard_len
            loaded_weight = loaded_weight.narrow(0, offset, shard_len)
        assert target.shape == loaded_weight.shape
        target.copy_(loaded_weight)

act `instance-attribute` ¶

act = act_module

input_is_parallel `instance-attribute` ¶

input_is_parallel = input_is_parallel

scales `instance-attribute` ¶

scales = Parameter(
    empty(
        intermediate_size_per_partition, dtype=params_dtype
    )
)

__init__ ¶

__init__(
    act_module: Module,
    intermediate_size: int,
    input_is_parallel: bool = True,
    params_dtype: dtype | None = None,
)

Source code in vllm/model_executor/layers/activation.py

def __init__(
    self,
    act_module: nn.Module,
    intermediate_size: int,
    input_is_parallel: bool = True,
    params_dtype: torch.dtype | None = None,
):
    """Create the ``scales`` parameter and attach its weight loader.

    Args:
        act_module: activation module applied before scaling.
        intermediate_size: full (unsharded) length of the scale vector.
        input_is_parallel: when True, the scale vector length is
            ``intermediate_size`` divided by the tensor-parallel world size.
        params_dtype: dtype of ``scales``; falls back to torch's default
            dtype when None.
    """
    super().__init__()
    self.act = act_module
    self.input_is_parallel = input_is_parallel
    shard_len = (
        divide(intermediate_size, get_tensor_model_parallel_world_size())
        if input_is_parallel
        else intermediate_size
    )
    dtype = torch.get_default_dtype() if params_dtype is None else params_dtype
    self.scales = nn.Parameter(torch.empty(shard_len, dtype=dtype))
    set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

forward ¶

forward(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Apply the wrapped activation, then divide by ``scales``."""
    activated = self.act(x)
    return activated / self.scales

weight_loader ¶

weight_loader(param: Parameter, loaded_weight: Tensor)

Source code in vllm/model_executor/layers/activation.py

def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
    """Copy ``loaded_weight`` into ``param``.

    When ``input_is_parallel`` is set, only this rank's slice along dim 0
    is copied.
    """
    target = param.data
    if self.input_is_parallel:
        shard_len = target.shape[0]
        offset = get_tensor_model_parallel_rank() * shard_len
        loaded_weight = loaded_weight.narrow(0, offset, shard_len)
    assert target.shape == loaded_weight.shape
    target.copy_(loaded_weight)

SiluAndMul ¶

Bases: CustomOp

An activation function for SwiGLU.

The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

Shapes

x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) return: (num_tokens, d) or (batch_size, seq_len, d)

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("silu_and_mul")
class SiluAndMul(CustomOp):
    """Gated SiLU (SwiGLU) activation.

    Splits the last dimension in half (d = x.shape[-1] // 2) and returns
    silu(x[:d]) * x[d:].

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    # --8<-- [end:silu_and_mul]

    def __init__(self, *, compile_native: bool = True):
        """Forward ``compile_native`` to the base class and pick the kernel."""
        super().__init__(compile_native=compile_native)
        if current_platform.is_cuda_alike():
            self.op = torch.ops._C.silu_and_mul
            return
        if current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            self.op = ipex_ops.silu_and_mul
            return
        if current_platform.is_cpu():
            # Route dispatch to the PyTorch-native path on CPU.
            self._forward_method = self.forward_native

    @staticmethod
    def forward_native(x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        half = x.shape[-1] // 2
        gate, up = x[..., :half], x[..., half:]
        return F.silu(gate) * up

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh half-width output buffer."""
        half = x.shape[-1] // 2
        result = torch.empty(
            (*x.shape[:-1], half), dtype=x.dtype, device=x.device
        )
        self.op(result, x)
        return result

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``self.op`` into a fresh half-width output buffer."""
        half = x.shape[-1] // 2
        result = torch.empty(
            (*x.shape[:-1], half), dtype=x.dtype, device=x.device
        )
        self.op(result, x)
        return result

_forward_method `instance-attribute` ¶

_forward_method = forward_native

op `instance-attribute` ¶

op = silu_and_mul

__init__ ¶

__init__(*, compile_native: bool = True)

Source code in vllm/model_executor/layers/activation.py

def __init__(self, *, compile_native: bool = True):
    """Forward ``compile_native`` to the base class and pick the kernel."""
    super().__init__(compile_native=compile_native)
    if current_platform.is_cuda_alike():
        self.op = torch.ops._C.silu_and_mul
        return
    if current_platform.is_xpu():
        from vllm._ipex_ops import ipex_ops

        self.op = ipex_ops.silu_and_mul
        return
    if current_platform.is_cpu():
        # Route dispatch to the PyTorch-native path on CPU.
        self._forward_method = self.forward_native

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Invoke ``self.op`` on ``x`` with a freshly allocated output buffer.

    The returned tensor has the shape of ``x`` with the last dimension
    halved (integer division) and the same dtype and device as ``x``.
    """
    half = x.shape[-1] // 2
    result = torch.empty((*x.shape[:-1], half), dtype=x.dtype, device=x.device)
    # The op is called as op(out, x); the buffer passed in is returned.
    self.op(result, x)
    return result

forward_native `staticmethod` ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

@staticmethod
def forward_native(x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

forward_xpu ¶

forward_xpu(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
    """Invoke ``self.op`` on ``x`` with a freshly allocated output buffer.

    The returned tensor has the shape of ``x`` with the last dimension
    halved (integer division) and the same dtype and device as ``x``.
    """
    half = x.shape[-1] // 2
    result = torch.empty((*x.shape[:-1], half), dtype=x.dtype, device=x.device)
    # The op is called as op(out, x); the buffer passed in is returned.
    self.op(result, x)
    return result

SwigluOAIAndMul ¶

Bases: CustomOp

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("swigluoai_and_mul")
class SwigluOAIAndMul(CustomOp):
    """Clamped SwiGLU variant over interleaved gate/up channels.

    In the native path, even indices of the last dimension form the gate and
    odd indices form the up projection. The gate is clamped from above at
    ``limit``, the up projection is clamped to ``[-limit, limit]``, and the
    result is ``(up + 1) * gate * sigmoid(alpha * gate)``.
    """

    # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110
    # --8<-- [end:swigluoai_and_mul]

    def __init__(self, alpha: float = 1.702, limit: float = 7.0):
        """Store the sigmoid scale ``alpha`` and the clamp bound ``limit``."""
        super().__init__()
        self.alpha = alpha
        self.limit = limit

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        bound = self.limit
        # Even positions are the gate, odd positions the up projection.
        gate = x[..., ::2].clamp(min=None, max=bound)
        up = x[..., 1::2].clamp(min=-bound, max=bound)
        activated = gate * torch.sigmoid(gate * self.alpha)
        return (up + 1) * activated

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run the fused custom op into a newly allocated output tensor."""
        half = x.shape[-1] // 2
        result = torch.empty(
            (*x.shape[:-1], half), dtype=x.dtype, device=x.device
        )
        torch.ops._C.swigluoai_and_mul(result, x, self.alpha, self.limit)
        return result

    def extra_repr(self) -> str:
        """Return the hyper-parameters shown in the module's repr."""
        return f"alpha={self.alpha!r}, limit={self.limit!r}"

alpha `instance-attribute` ¶

alpha = alpha

limit `instance-attribute` ¶

limit = limit

__init__ ¶

__init__(alpha: float = 1.702, limit: float = 7.0)

Source code in vllm/model_executor/layers/activation.py

def __init__(self, alpha: float = 1.702, limit: float = 7.0):
    """Store the sigmoid scale ``alpha`` and the clamp bound ``limit``."""
    super().__init__()
    self.alpha, self.limit = alpha, limit

extra_repr ¶

extra_repr() -> str

Source code in vllm/model_executor/layers/activation.py

def extra_repr(self) -> str:
    """Return the hyper-parameters shown in the module's repr."""
    return f"alpha={self.alpha!r}, limit={self.limit!r}"

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Run the fused custom op into a newly allocated output tensor.

    The output has the shape of ``x`` with the last dimension halved
    (integer division) and the same dtype and device as ``x``.
    """
    half = x.shape[-1] // 2
    result = torch.empty((*x.shape[:-1], half), dtype=x.dtype, device=x.device)
    torch.ops._C.swigluoai_and_mul(result, x, self.alpha, self.limit)
    return result

forward_native ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    bound = self.limit
    # Even positions are the gate, odd positions the up projection.
    gate = x[..., ::2].clamp(min=None, max=bound)
    up = x[..., 1::2].clamp(min=-bound, max=bound)
    activated = gate * torch.sigmoid(gate * self.alpha)
    return (up + 1) * activated

SwigluStepAndMul ¶

Bases: CustomOp

An activation function for SwiGLU with clamping.

Computes x -> silu(x[:d]).clamp(max=limit) * x[d:].clamp(-limit, limit) where d = x.shape[-1] // 2.

Shapes

x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d); return: (num_tokens, d) or (batch_size, seq_len, d)

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("swiglustep_and_mul")
class SwigluStepAndMul(CustomOp):
    """An activation function for SwiGLU with clamping.

    Computes x -> silu(x[:d]).clamp(max=limit) * x[d:].clamp(-limit, limit)
    where d = x.shape[-1] // 2.

    Shapes:
        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self, limit: float = 7.0):
        """Store the clamp bound.

        Args:
            limit: Upper bound for the activated gate and symmetric bound for
                the up projection.

        Raises:
            ValueError: If ``limit`` is ``None``.
        """
        super().__init__()
        if limit is None:
            raise ValueError("SwigluStepAndMul requires limit to be set.")
        self.limit = limit

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        gate, up = x.chunk(2, dim=-1)
        gate = F.silu(gate)
        gate = gate.clamp(max=self.limit)
        up = up.clamp(min=-self.limit, max=self.limit)
        return gate * up

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Run the Triton implementation into a newly allocated output.

        The Triton launcher only accepts 2D tensors, so inputs of any other
        rank (e.g. ``(batch_size, seq_len, 2 * d)``) are flattened to rows
        before the launch. The returned tensor keeps the leading dimensions
        of ``x``.
        """
        d = x.shape[-1] // 2
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        if x.ndim == 2:
            swiglustep_and_mul_triton(out, x, self.limit)
        else:
            # `out` is freshly allocated, so `view` aliases its storage and
            # the kernel writes land directly in the tensor we return.
            swiglustep_and_mul_triton(
                out.view(-1, d), x.reshape(-1, x.shape[-1]), self.limit
            )
        return out

    def extra_repr(self) -> str:
        """Return the hyper-parameters shown in the module's repr."""
        return f"limit={repr(self.limit)}"

limit `instance-attribute` ¶

limit = limit

__init__ ¶

__init__(limit: float = 7.0)

Source code in vllm/model_executor/layers/activation.py

def __init__(self, limit: float = 7.0):
    """Store the clamp bound, rejecting ``None``.

    Raises:
        ValueError: If ``limit`` is ``None``.
    """
    super().__init__()
    limit_missing = limit is None
    if limit_missing:
        raise ValueError("SwigluStepAndMul requires limit to be set.")
    self.limit = limit

extra_repr ¶

extra_repr() -> str

Source code in vllm/model_executor/layers/activation.py

def extra_repr(self) -> str:
    """Return the hyper-parameters shown in the module's repr."""
    return f"limit={self.limit!r}"

forward_cuda ¶

forward_cuda(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Run the Triton implementation into a newly allocated output.

    The Triton launcher only accepts 2D tensors, so inputs of any other
    rank (e.g. ``(batch_size, seq_len, 2 * d)``) are flattened to rows
    before the launch. The returned tensor keeps the leading dimensions
    of ``x``.
    """
    d = x.shape[-1] // 2
    output_shape = x.shape[:-1] + (d,)
    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
    if x.ndim == 2:
        swiglustep_and_mul_triton(out, x, self.limit)
    else:
        # `out` is freshly allocated, so `view` aliases its storage and
        # the kernel writes land directly in the tensor we return.
        swiglustep_and_mul_triton(
            out.view(-1, d), x.reshape(-1, x.shape[-1]), self.limit
        )
    return out

forward_native ¶

forward_native(x: Tensor) -> Tensor

PyTorch-native implementation equivalent to forward().

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, x: torch.Tensor) -> torch.Tensor:
    """PyTorch-native implementation equivalent to forward()."""
    bound = self.limit
    gate_half, up_half = x.chunk(2, dim=-1)
    # Only the activated gate is capped from above; the up half is clamped
    # symmetrically.
    activated = F.silu(gate_half).clamp(max=bound)
    clipped_up = up_half.clamp(min=-bound, max=bound)
    return activated * clipped_up

XIELU ¶

Bases: CustomOp

Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010. If the user has installed nickjbrowning/XIELU, we import the xIELU CUDA implementation. Otherwise, we emit a single warning and use the xIELU Python implementation.

Source code in vllm/model_executor/layers/activation.py

@CustomOp.register("xielu")
class XIELU(CustomOp):
    """
    Applies the xIELU activation function introduced in
    https://arxiv.org/abs/2411.13010.

    If the user has installed nickjbrowning/XIELU, the xIELU CUDA
    implementation is imported. Otherwise, a single warning is emitted and
    the pure-Python implementation is used.
    """

    # --8<-- [end:xielu]

    def __init__(
        self,
        alpha_p_init: float = 0.8,
        alpha_n_init: float = 0.8,
        beta: float = 0.5,
        eps: float = -1e-6,
        dtype: torch.dtype = torch.bfloat16,
        with_vector_loads: bool = False,
    ):
        """Create the parameters and probe for the optional CUDA kernel.

        Args:
            alpha_p_init: Initial value of the positive-branch coefficient
                after softplus is applied.
            alpha_n_init: Initial value of the negative-branch coefficient
                after ``beta + softplus(...)`` is applied.
            beta: Coefficient of the linear term; stored as a buffer.
            eps: Upper bound applied to ``x`` inside ``expm1`` on the
                negative branch; stored as a buffer.
            dtype: dtype of the created parameters and buffers.
            with_vector_loads: Flag forwarded to the CUDA kernel.
        """
        super().__init__()
        # Parameters are stored as log(exp(v) - 1), the inverse of softplus,
        # so applying softplus in the forward pass yields the initial value.
        self.alpha_p = nn.Parameter(
            torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze(
                0
            )
        )
        # `beta` is subtracted here because the forward pass adds it back
        # (alpha_n = beta + softplus(self.alpha_n)).
        self.alpha_n = nn.Parameter(
            torch.log(
                torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1
            ).unsqueeze(0)
        )
        self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
        self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
        self.with_vector_loads = with_vector_loads
        # Temporary until xIELU CUDA fully implemented
        # Python-float copies of the buffers, so the CUDA path can pass
        # scalars without calling .item() at forward time.
        self._beta_scalar = float(self.beta.detach().cpu().float().item())
        self._eps_scalar = float(self.eps.detach().cpu().float().item())

        # Stays None unless the optional extension imports and constructs
        # successfully; forward_native checks this to pick the code path.
        self._xielu_cuda_obj = None
        try:
            import xielu.ops  # noqa: F401

            self._xielu_cuda_obj = torch.classes.xielu.XIELU()
            msg = "Using experimental xIELU CUDA."
            try:
                from torch._dynamo import allow_in_graph

                self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
                msg += " Enabled torch._dynamo for xIELU CUDA."
            except Exception as err:
                # Fall back to calling the method directly, unwrapped.
                msg += (
                    f" Could not enable torch._dynamo for xIELU ({err}) - "
                    "this may result in slower performance."
                )
                self._xielu_cuda_fn = self._xielu_cuda
            logger.warning_once(msg)
        except Exception as err:
            # Any failure while loading the extension is reported once and
            # otherwise ignored; the Python implementation is used instead.
            logger.warning_once(
                "CUDA-fused xIELU not available (%s) –"
                " falling back to a Python version.\n"
                "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
                str(err),
            )

    def _xielu_python(self, x: torch.Tensor) -> torch.Tensor:
        """Evaluate xIELU elementwise with plain PyTorch ops."""
        alpha_p = nn.functional.softplus(self.alpha_p)
        alpha_n = self.beta + nn.functional.softplus(self.alpha_n)
        # Positive inputs: quadratic plus linear term. Otherwise: an expm1
        # term (with x capped at eps) scaled by alpha_n, plus the linear term.
        return torch.where(
            x > 0,
            alpha_p * x * x + self.beta * x,
            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
        )

    def _xielu_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Firewall function to prevent torch.compile from seeing .item()"""
        assert self._xielu_cuda_obj is not None, "XIELU CUDA object must not be None"
        original_shape = x.shape
        # CUDA kernel expects 3D tensors, reshape if needed
        while x.dim() < 3:
            x = x.unsqueeze(0)
        if x.dim() > 3:
            # Collapse all leading dimensions into the first one.
            x = x.view(-1, 1, x.size(-1))
        if original_shape != x.shape:
            logger.warning_once(
                "Warning: xIELU input tensor expects 3 dimensions"
                " but got (shape: %s). Reshaping to (shape: %s).",
                original_shape,
                x.shape,
            )
        result = self._xielu_cuda_obj.forward(
            x,
            self.alpha_p,
            self.alpha_n,
            # Temporary until xIELU CUDA fully implemented ->
            # self.{beta,eps}.item()
            self._beta_scalar,
            self._eps_scalar,
            self.with_vector_loads,
        )
        # Restore the caller's shape.
        return result.view(original_shape)

    def forward_native(self, input: torch.Tensor) -> torch.Tensor:
        """Dispatch to the CUDA kernel when usable, else the Python version.

        The CUDA path is taken only when the extension was loaded, the input
        is a CUDA tensor, and torch._dynamo is not currently compiling.
        """
        if self._xielu_cuda_obj is not None and input.is_cuda:
            if not torch._dynamo.is_compiling():
                return self._xielu_cuda_fn(input)
            else:
                logger.warning_once(
                    "torch._dynamo is compiling, using Python version of xIELU."
                )
        return self._xielu_python(input)

    def forward_cuda(self, input: torch.Tensor) -> torch.Tensor:
        """Delegate to forward_native, which selects the code path itself."""
        return self.forward_native(input)

_beta_scalar `instance-attribute` ¶

_beta_scalar = float(item())

_eps_scalar `instance-attribute` ¶

_eps_scalar = float(item())

_xielu_cuda_fn `instance-attribute` ¶

_xielu_cuda_fn = allow_in_graph(_xielu_cuda)

_xielu_cuda_obj `instance-attribute` ¶

_xielu_cuda_obj = XIELU()

alpha_n `instance-attribute` ¶

alpha_n = Parameter(unsqueeze(0))

alpha_p `instance-attribute` ¶

alpha_p = Parameter(unsqueeze(0))

with_vector_loads `instance-attribute` ¶

with_vector_loads = with_vector_loads

__init__ ¶

__init__(
    alpha_p_init: float = 0.8,
    alpha_n_init: float = 0.8,
    beta: float = 0.5,
    eps: float = -1e-06,
    dtype: dtype = bfloat16,
    with_vector_loads: bool = False,
)

Source code in vllm/model_executor/layers/activation.py

def __init__(
    self,
    alpha_p_init: float = 0.8,
    alpha_n_init: float = 0.8,
    beta: float = 0.5,
    eps: float = -1e-6,
    dtype: torch.dtype = torch.bfloat16,
    with_vector_loads: bool = False,
):
    """Create the parameters and probe for the optional CUDA kernel.

    Args:
        alpha_p_init: Value used to initialize the ``alpha_p`` parameter,
            stored as ``log(exp(alpha_p_init) - 1)``.
        alpha_n_init: Value used to initialize the ``alpha_n`` parameter,
            stored as ``log(exp(alpha_n_init - beta) - 1)``.
        beta: Stored as the ``beta`` buffer.
        eps: Stored as the ``eps`` buffer.
        dtype: dtype of the created parameters and buffers.
        with_vector_loads: Stored on the instance unchanged.
    """
    super().__init__()
    # Both parameters are stored in log(exp(v) - 1) form (inverse softplus).
    self.alpha_p = nn.Parameter(
        torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze(
            0
        )
    )
    self.alpha_n = nn.Parameter(
        torch.log(
            torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1
        ).unsqueeze(0)
    )
    self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
    self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
    self.with_vector_loads = with_vector_loads
    # Temporary until xIELU CUDA fully implemented
    # Python-float copies of the two buffers.
    self._beta_scalar = float(self.beta.detach().cpu().float().item())
    self._eps_scalar = float(self.eps.detach().cpu().float().item())

    # Stays None unless the optional extension imports and constructs
    # successfully.
    self._xielu_cuda_obj = None
    try:
        import xielu.ops  # noqa: F401

        self._xielu_cuda_obj = torch.classes.xielu.XIELU()
        msg = "Using experimental xIELU CUDA."
        try:
            from torch._dynamo import allow_in_graph

            self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
            msg += " Enabled torch._dynamo for xIELU CUDA."
        except Exception as err:
            # Fall back to calling the method directly, unwrapped.
            msg += (
                f" Could not enable torch._dynamo for xIELU ({err}) - "
                "this may result in slower performance."
            )
            self._xielu_cuda_fn = self._xielu_cuda
        logger.warning_once(msg)
    except Exception as err:
        # Any failure while loading the extension is reported once and
        # otherwise ignored.
        logger.warning_once(
            "CUDA-fused xIELU not available (%s) –"
            " falling back to a Python version.\n"
            "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
            str(err),
        )

_xielu_cuda ¶

_xielu_cuda(x: Tensor) -> Tensor

Firewall function to prevent torch.compile from seeing .item()

Source code in vllm/model_executor/layers/activation.py

def _xielu_cuda(self, x: torch.Tensor) -> torch.Tensor:
    """Firewall function to prevent torch.compile from seeing .item()"""
    assert self._xielu_cuda_obj is not None, "XIELU CUDA object must not be None"
    original_shape = x.shape
    # CUDA kernel expects 3D tensors, reshape if needed
    while x.dim() < 3:
        x = x.unsqueeze(0)
    if x.dim() > 3:
        x = x.view(-1, 1, x.size(-1))
    if original_shape != x.shape:
        logger.warning_once(
            "Warning: xIELU input tensor expects 3 dimensions"
            " but got (shape: %s). Reshaping to (shape: %s).",
            original_shape,
            x.shape,
        )
    result = self._xielu_cuda_obj.forward(
        x,
        self.alpha_p,
        self.alpha_n,
        # Temporary until xIELU CUDA fully implemented ->
        # self.{beta,eps}.item()
        self._beta_scalar,
        self._eps_scalar,
        self.with_vector_loads,
    )
    return result.view(original_shape)

_xielu_python ¶

_xielu_python(x: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def _xielu_python(self, x: torch.Tensor) -> torch.Tensor:
    alpha_p = nn.functional.softplus(self.alpha_p)
    alpha_n = self.beta + nn.functional.softplus(self.alpha_n)
    return torch.where(
        x > 0,
        alpha_p * x * x + self.beta * x,
        (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
    )

forward_cuda ¶

forward_cuda(input: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_cuda(self, input: torch.Tensor) -> torch.Tensor:
    """Delegate to ``forward_native`` and return its result unchanged."""
    result = self.forward_native(input)
    return result

forward_native ¶

forward_native(input: Tensor) -> Tensor

Source code in vllm/model_executor/layers/activation.py

def forward_native(self, input: torch.Tensor) -> torch.Tensor:
    """Dispatch to the CUDA kernel when usable, else the Python version.

    The CUDA path is taken only when the extension object exists, the input
    is a CUDA tensor, and torch._dynamo is not currently compiling.
    """
    cuda_usable = self._xielu_cuda_obj is not None and input.is_cuda
    if not cuda_usable:
        return self._xielu_python(input)
    if not torch._dynamo.is_compiling():
        return self._xielu_cuda_fn(input)
    logger.warning_once(
        "torch._dynamo is compiling, using Python version of xIELU."
    )
    return self._xielu_python(input)

_swiglustep_and_mul_kernel ¶

_swiglustep_and_mul_kernel(
    o_ptr,
    o_stride,
    x_ptr,
    x_stride,
    limit: constexpr,
    d: constexpr,
    BLOCK_SIZE: constexpr,
) -> None

Source code in vllm/model_executor/layers/activation.py

# Triton kernel for the clamped SwiGLU-and-multiply: each program handles one
# BLOCK_SIZE-wide slice of one row, reading the gate from the first `d`
# elements of the row and the up projection from the following `d` elements.
@triton.jit
def _swiglustep_and_mul_kernel(
    o_ptr,
    o_stride,
    x_ptr,
    x_stride,
    limit: tl.constexpr,
    d: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
) -> None:
    # Grid axis 0 selects the row, axis 1 the column block within the row.
    # The row index is widened to int64 before being multiplied by the stride
    # (presumably to avoid overflow in the offset for large tensors).
    i = tl.program_id(axis=0).to(tl.int64)
    j = tl.program_id(axis=1)
    o_row_ptr = o_ptr + o_stride * i
    x_row_ptr = x_ptr + x_stride * i
    offsets = j * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Lanes past the end of the half-row are masked out of loads and the store.
    mask = offsets < d

    # Arithmetic is done in float32 regardless of the input element type.
    gate = tl.load(x_row_ptr + offsets, mask=mask).to(tl.float32)
    up = tl.load(x_row_ptr + offsets + d, mask=mask).to(tl.float32)

    # silu(gate), capped from above only; `up` is clamped to [-limit, limit].
    gate_silu = tl.sigmoid(gate) * gate
    gate_clamped = tl.minimum(gate_silu, limit)
    up_clamped = tl.minimum(tl.maximum(up, -limit), limit)

    result = gate_clamped * up_clamped
    # Cast back to the input's element type before storing to the output.
    result = result.to(x_ptr.dtype.element_ty)
    tl.store(o_row_ptr + offsets, result, mask=mask)

get_act_and_mul_fn ¶

get_act_and_mul_fn(act_fn_name: str) -> Module

Get an activation-and-mul (e.g. SiluAndMul) function by name.

Source code in vllm/model_executor/layers/activation.py

def get_act_and_mul_fn(act_fn_name: str) -> nn.Module:
    """Look up an activation-and-mul module (e.g. SiluAndMul) by name.

    The lookup is case-insensitive.

    Raises:
        ValueError: If the lowercased name is not in the registry.
    """
    key = act_fn_name.lower()
    if key in _ACTIVATION_AND_MUL_REGISTRY:
        return _ACTIVATION_AND_MUL_REGISTRY[key]
    raise ValueError(f"Activation function {key!r} is not supported.")

get_act_fn ¶

get_act_fn(act_fn_name: str) -> Module

Get an activation function by name.

Source code in vllm/model_executor/layers/activation.py

def get_act_fn(act_fn_name: str) -> nn.Module:
    """Look up an activation module by name.

    The lookup is case-insensitive. Names starting with
    ``torch.nn.modules.`` are reduced to their final dotted component, and
    ``identity`` in that form maps to ``nn.Identity``.

    Raises:
        ValueError: If the resolved name is not in the registry.
    """
    key = act_fn_name.lower()

    if key.startswith("torch.nn.modules."):
        # Keep only the part after the last dot.
        key = key.rpartition(".")[2]
        if key == "identity":
            return nn.Identity()

    if key in _ACTIVATION_REGISTRY:
        return _ACTIVATION_REGISTRY[key]
    raise ValueError(f"Activation function {key!r} is not supported.")

swiglustep_and_mul_triton ¶

swiglustep_and_mul_triton(
    output: Tensor, input: Tensor, limit: float = 7.0
)

Source code in vllm/model_executor/layers/activation.py

def swiglustep_and_mul_triton(
    output: torch.Tensor, input: torch.Tensor, limit: float = 7.0
) -> None:
    """Launch the clamped SwiGLU-and-multiply Triton kernel.

    Args:
        output: Tensor the kernel writes into; its row stride is passed to
            the kernel.
        input: 2D tensor whose last dimension has even length ``2 * d``.
        limit: Clamp bound forwarded to the kernel.

    Raises:
        AssertionError: If ``input`` is not 2D or its last dimension is odd.
    """
    # Check the rank before unpacking the shape; otherwise a non-2D input
    # fails with an unrelated unpacking error and this check never runs.
    assert input.ndim == 2, "input must be 2D"
    b, n = input.shape
    assert n % 2 == 0, "last dimension of input must be even"
    d = n // 2

    def grid(meta):
        # One program per (row, BLOCK_SIZE-wide column block).
        return (b, triton.cdiv(d, meta["BLOCK_SIZE"]))

    _swiglustep_and_mul_kernel[grid](
        output,
        output.stride(0),
        input,
        input.stride(0),
        limit=limit,
        d=d,
        BLOCK_SIZE=1024,
    )

vllm.model_executor.layers.activation ¶

_ACTIVATION_AND_MUL_REGISTRY module-attribute ¶

_ACTIVATION_REGISTRY module-attribute ¶

logger module-attribute ¶

FastGELU ¶

op instance-attribute ¶

__init__ ¶

forward_cuda ¶

forward_native ¶

forward_xpu ¶

FatreluAndMul ¶

_forward_method instance-attribute ¶

op instance-attribute ¶

threshold instance-attribute ¶

__init__ ¶

forward_cuda ¶

forward_native ¶

GeluAndMul ¶

approximate instance-attribute ¶

op instance-attribute ¶

__init__ ¶

extra_repr ¶

forward_cuda ¶

forward_native ¶

forward_xpu ¶

GeluAndMulSparse ¶

approximate instance-attribute ¶

std_multiplier instance-attribute ¶

__init__ ¶

_gaussian_topk ¶

forward_cuda ¶

forward_native ¶

MulAndSilu ¶

_forward_method instance-attribute ¶

op instance-attribute ¶

__init__ ¶

forward_cuda ¶

forward_native ¶

NewGELU ¶

op instance-attribute ¶

__init__ ¶

forward_cuda ¶

forward_native ¶

forward_xpu ¶

QuickGELU ¶

op instance-attribute ¶

__init__ ¶

forward_cuda ¶

forward_native ¶

forward_xpu ¶

ReLUSquaredActivation ¶

forward_cuda ¶

forward_native ¶

ScaledActivation ¶

act instance-attribute ¶

input_is_parallel instance-attribute ¶

scales instance-attribute ¶

__init__ ¶

forward ¶

weight_loader ¶

SiluAndMul ¶

_forward_method instance-attribute ¶

op instance-attribute ¶

__init__ ¶

forward_cuda ¶

forward_native staticmethod ¶

forward_xpu ¶

SwigluOAIAndMul ¶

alpha instance-attribute ¶

limit instance-attribute ¶

__init__ ¶

extra_repr ¶

forward_cuda ¶

forward_native ¶

SwigluStepAndMul ¶

limit instance-attribute ¶

__init__ ¶

extra_repr ¶

forward_cuda ¶

forward_native ¶

_ACTIVATION_AND_MUL_REGISTRY `module-attribute` ¶

_ACTIVATION_REGISTRY `module-attribute` ¶

logger `module-attribute` ¶

op `instance-attribute` ¶

init ¶

_forward_method `instance-attribute` ¶

op `instance-attribute` ¶

threshold `instance-attribute` ¶

init ¶

approximate `instance-attribute` ¶

op `instance-attribute` ¶

init ¶

approximate `instance-attribute` ¶

std_multiplier `instance-attribute` ¶

init ¶

_forward_method `instance-attribute` ¶

op `instance-attribute` ¶

init ¶

op `instance-attribute` ¶

init ¶

op `instance-attribute` ¶

init ¶

act `instance-attribute` ¶

input_is_parallel `instance-attribute` ¶

scales `instance-attribute` ¶

init ¶

_forward_method `instance-attribute` ¶

op `instance-attribute` ¶

init ¶

forward_native `staticmethod` ¶

alpha `instance-attribute` ¶

limit `instance-attribute` ¶

init ¶

limit `instance-attribute` ¶

init ¶

_beta_scalar `instance-attribute` ¶

_eps_scalar `instance-attribute` ¶

_xielu_cuda_fn `instance-attribute` ¶

_xielu_cuda_obj `instance-attribute` ¶

alpha_n `instance-attribute` ¶

alpha_p `instance-attribute` ¶

with_vector_loads `instance-attribute` ¶

init ¶