From c14876fdf2d2f2a9de358b5e62612e72db3eea3a Mon Sep 17 00:00:00 2001 From: wooway777 Date: Wed, 10 Jun 2026 14:53:29 +0800 Subject: [PATCH] feat: add cuda silu_and_mul infinilm --- src/base/silu_and_mul_infinilm.h | 70 +++++++++++++++++++ .../ops/silu_and_mul_infinilm/kernel.h | 22 ++++++ .../metax/ops/silu_and_mul_infinilm/kernel.h | 22 ++++++ .../moore/ops/silu_and_mul_infinilm/kernel.h | 23 ++++++ .../nvidia/ops/silu_and_mul_infinilm/kernel.h | 22 ++++++ .../cuda/ops/silu_and_mul_infinilm/kernel.cuh | 46 ++++++++++++ .../cuda/ops/silu_and_mul_infinilm/kernel.h | 47 +++++++++++++ tests/test_silu_and_mul_infinilm.py | 54 ++++++++++++++ 8 files changed, 306 insertions(+) create mode 100644 src/base/silu_and_mul_infinilm.h create mode 100644 src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h create mode 100644 src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h create mode 100644 src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h create mode 100644 src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h create mode 100644 src/native/cuda/ops/silu_and_mul_infinilm/kernel.cuh create mode 100644 src/native/cuda/ops/silu_and_mul_infinilm/kernel.h create mode 100644 tests/test_silu_and_mul_infinilm.py diff --git a/src/base/silu_and_mul_infinilm.h b/src/base/silu_and_mul_infinilm.h new file mode 100644 index 000000000..3f20d5048 --- /dev/null +++ b/src/base/silu_and_mul_infinilm.h @@ -0,0 +1,70 @@ +#ifndef INFINI_OPS_BASE_SILU_AND_MUL_INFINILM_H_ +#define INFINI_OPS_BASE_SILU_AND_MUL_INFINILM_H_ + +#include + +#include "operator.h" + +namespace infini::ops { + +class SiluAndMulInfinilm : public Operator { + public: + SiluAndMulInfinilm(const Tensor input, Tensor out) + : input_shape_{input.shape()}, + input_strides_{input.strides()}, + input_type_{input.dtype()}, + out_shape_{out.shape()}, + out_strides_{out.strides()}, + out_type_{out.dtype()}, + output_size_{out.numel()}, + ndim_{out.ndim()}, + hidden_size_{out.size(out.ndim() - 1)}, + row_count_{out.numel() / hidden_size_}, + device_index_{out.device().index()} { + assert(input.ndim() == out.ndim() && + "`SiluAndMulInfinilm` input and output ranks must match"); + assert(input_type_ == out_type_ && + "`SiluAndMulInfinilm` input and output dtypes must match"); + assert( + input.size(input.ndim() - 1) == 2 * hidden_size_ && + "`SiluAndMulInfinilm` input last dimension must be twice output last " + "dimension"); + for (Tensor::Size i = 0; i + 1 < ndim_; ++i) { + assert(input.size(i) == out.size(i) && + "`SiluAndMulInfinilm` leading dimensions must match"); + } + assert(input.IsContiguous() && out.IsContiguous() && + "`SiluAndMulInfinilm` only supports contiguous tensors"); + assert(!out.HasBroadcastDim() && + "`SiluAndMulInfinilm` output must not have broadcasted dimensions"); + } + + virtual void operator()(const Tensor input, Tensor out) const = 0; + + protected: + Tensor::Shape input_shape_; + + Tensor::Strides input_strides_; + + DataType input_type_; + + Tensor::Shape out_shape_; + + Tensor::Strides out_strides_; + + DataType out_type_; + + Tensor::Size output_size_{0}; + + Tensor::Size ndim_{0}; + + Tensor::Size hidden_size_{0}; + + Tensor::Size row_count_{0}; + + int device_index_{0}; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h new file mode 100644 index 000000000..b29469062 --- /dev/null +++ b/src/native/cuda/iluvatar/ops/silu_and_mul_infinilm/kernel.h @@ -0,0 +1,22 @@ +#ifndef INFINI_OPS_ILUVATAR_SILU_AND_MUL_INFINILM_KERNEL_H_ +#define INFINI_OPS_ILUVATAR_SILU_AND_MUL_INFINILM_KERNEL_H_ + +#include + +#include "native/cuda/iluvatar/caster.cuh" +#include "native/cuda/iluvatar/runtime_.h" +#include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaSiluAndMulInfinilm> { + public: + using CudaSiluAndMulInfinilm< + Runtime>::CudaSiluAndMulInfinilm; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h new file mode 100644 index 000000000..beaafe030 --- /dev/null +++ b/src/native/cuda/metax/ops/silu_and_mul_infinilm/kernel.h @@ -0,0 +1,22 @@ +#ifndef INFINI_OPS_METAX_SILU_AND_MUL_INFINILM_KERNEL_H_ +#define INFINI_OPS_METAX_SILU_AND_MUL_INFINILM_KERNEL_H_ + +#include + +#include "native/cuda/metax/caster.cuh" +#include "native/cuda/metax/runtime_.h" +#include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaSiluAndMulInfinilm> { + public: + using CudaSiluAndMulInfinilm< + Runtime>::CudaSiluAndMulInfinilm; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h new file mode 100644 index 000000000..fd0f36209 --- /dev/null +++ b/src/native/cuda/moore/ops/silu_and_mul_infinilm/kernel.h @@ -0,0 +1,23 @@ +#ifndef INFINI_OPS_MOORE_SILU_AND_MUL_INFINILM_KERNEL_H_ +#define INFINI_OPS_MOORE_SILU_AND_MUL_INFINILM_KERNEL_H_ + +#include + +#include "native/cuda/moore/caster.cuh" +#include "native/cuda/moore/polyfills.cuh" +#include "native/cuda/moore/runtime_.h" +#include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaSiluAndMulInfinilm> { + public: + using CudaSiluAndMulInfinilm< + Runtime>::CudaSiluAndMulInfinilm; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h new file mode 100644 index 000000000..9f908b1fd --- /dev/null +++ b/src/native/cuda/nvidia/ops/silu_and_mul_infinilm/kernel.h @@ -0,0 +1,22 @@ +#ifndef INFINI_OPS_NVIDIA_SILU_AND_MUL_INFINILM_KERNEL_H_ +#define INFINI_OPS_NVIDIA_SILU_AND_MUL_INFINILM_KERNEL_H_ + +#include + +#include "native/cuda/nvidia/caster.cuh" +#include "native/cuda/nvidia/runtime_.h" +#include "native/cuda/ops/silu_and_mul_infinilm/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaSiluAndMulInfinilm> { + public: + using CudaSiluAndMulInfinilm< + Runtime>::CudaSiluAndMulInfinilm; +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/ops/silu_and_mul_infinilm/kernel.cuh b/src/native/cuda/ops/silu_and_mul_infinilm/kernel.cuh new file mode 100644 index 000000000..140d5d43d --- /dev/null +++ b/src/native/cuda/ops/silu_and_mul_infinilm/kernel.cuh @@ -0,0 +1,46 @@ +#ifndef INFINI_OPS_CUDA_SILU_AND_MUL_INFINILM_KERNEL_CUH_ +#define INFINI_OPS_CUDA_SILU_AND_MUL_INFINILM_KERNEL_CUH_ + +#include +#include +#include + +#include "native/cuda/caster.cuh" + +namespace infini::ops { + +namespace { + +template +__device__ __forceinline__ T SiluAndMulInfinilmValue(T gate, T up) { + if constexpr (std::is_same_v) { + return (gate / (1.0 + exp(-gate))) * up; + } else { + const float g = Caster::template Cast(gate); + const float u = Caster::template Cast(up); + const float y = (g / (1.0f + expf(-g))) * u; + return Caster::template Cast(y); + } +} + +} // namespace + +template +__global__ void SiluAndMulInfinilmKernel(T* __restrict__ out, + const T* __restrict__ input, + size_t output_size, + size_t hidden_size) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < output_size) { + size_t col = idx % hidden_size; + size_t row = idx / hidden_size; + size_t input_base = row * hidden_size * 2; + out[idx] = SiluAndMulInfinilmValue( + input[input_base + col], input[input_base + hidden_size + col]); + } +} + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/ops/silu_and_mul_infinilm/kernel.h b/src/native/cuda/ops/silu_and_mul_infinilm/kernel.h new file mode 100644 index 000000000..3f3b049fb --- /dev/null +++ b/src/native/cuda/ops/silu_and_mul_infinilm/kernel.h @@ -0,0 +1,47 @@ +#ifndef INFINI_OPS_CUDA_SILU_AND_MUL_INFINILM_KERNEL_H_ +#define INFINI_OPS_CUDA_SILU_AND_MUL_INFINILM_KERNEL_H_ + +#include +#include + +#include "base/silu_and_mul_infinilm.h" +#include "common/generic_utils.h" +#include "data_type.h" +#include "dispatcher.h" +#include "native/cuda/ops/silu_and_mul_infinilm/kernel.cuh" +#include "native/cuda/runtime_utils.h" + +namespace infini::ops { + +template +class CudaSiluAndMulInfinilm : public SiluAndMulInfinilm { + public: + using SiluAndMulInfinilm::SiluAndMulInfinilm; + + void operator()(const Tensor input, Tensor out) const override { + auto cuda_stream = + static_cast(stream_ ? stream_ : 0); + int block_size = std::min( + RuntimeUtils::GetOptimalBlockSize(), 1024); + dim3 block(std::min(static_cast(block_size), output_size_)); + dim3 grid(utils::CeilDiv(output_size_, block.x)); + + DispatchFunc>( + {static_cast(out_type_), block_size}, + [&](auto list_tag) { + using T = TypeMapType(list_tag)>; + constexpr int kBlockSize = ListGet<1>(list_tag); + + SiluAndMulInfinilmKernel + <<>>( + reinterpret_cast(out.data()), + reinterpret_cast(input.data()), output_size_, + hidden_size_); + }, + "CudaSiluAndMulInfinilm::operator()"); + } +}; + +} // namespace infini::ops + +#endif diff --git a/tests/test_silu_and_mul_infinilm.py b/tests/test_silu_and_mul_infinilm.py new file mode 100644 index 000000000..8eab051fa --- /dev/null +++ b/tests/test_silu_and_mul_infinilm.py @@ -0,0 +1,54 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, get_stream, randn_strided + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "input_shape, output_shape", + ( + ((2, 8), (2, 4)), + ((1024, 1024), (1024, 512)), + ((16, 8192), (16, 4096)), + ((2, 128, 2048), (2, 128, 1024)), + ((8, 1, 4096), (8, 1, 2048)), + ((2, 4, 16, 256), (2, 4, 16, 128)), + ), +) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 1e-6, 1e-6), + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-2, 1e-2), + ), +) +def test_silu_and_mul_infinilm(input_shape, output_shape, dtype, device, rtol, atol): + input = randn_strided(input_shape, None, dtype=dtype, device=device) + out = empty_strided(output_shape, None, dtype=dtype, device=device) + + return Payload( + _silu_and_mul_infinilm, + _torch_silu_and_mul_infinilm, + (input, out), + {}, + rtol=rtol, + atol=atol, + ) + + +def _silu_and_mul_infinilm(input, out): + infini.ops.silu_and_mul_infinilm(input, out, stream=get_stream(input.device)) + + return out + + +def _torch_silu_and_mul_infinilm(input, out): + hidden = input.shape[-1] // 2 + gate = input[..., :hidden] + up = input[..., hidden:] + out.copy_(torch.nn.functional.silu(gate) * up) + + return out