InfiniTensor · wooway777 · Jun 10, 2026
diff --git a/src/base/gelu_infinilm.h b/src/base/gelu_infinilm.h
@@ -0,0 +1,67 @@
+#ifndef INFINI_OPS_BASE_GELU_INFINILM_H_
+#define INFINI_OPS_BASE_GELU_INFINILM_H_
+
+#include <cassert>
+#include <string>
+
+#include "operator.h"
+
+namespace infini::ops {
+
+// Backend-independent description of the `GeluInfinilm` operator.
+//
+// The constructor snapshots the shape, strides and dtype of both tensors,
+// plus a few properties derived from them, into the protected members below
+// so that backend implementations can read them when they run.
+class GeluInfinilm : public Operator<GeluInfinilm> {
+ public:
+  // Records the metadata of `input` and `out` and checks the preconditions
+  // with `assert` (the checks therefore disappear when `NDEBUG` is defined):
+  //   - `input` and `out` have the same shape and the same dtype;
+  //   - `approximate` is either empty or "none";
+  //   - `out` has no broadcasted dimension.
+  GeluInfinilm(const Tensor input, const std::string approximate, Tensor out)
+      : input_shape_{input.shape()},
+        input_strides_{input.strides()},
+        input_type_{input.dtype()},
+        out_shape_{out.shape()},
+        out_strides_{out.strides()},
+        out_type_{out.dtype()},
+        approximate_{approximate},
+        output_size_{out.numel()},
+        ndim_{out.ndim()},
+        is_input_contiguous_{input.IsContiguous()},
+        is_out_contiguous_{out.IsContiguous()},
+        device_index_{out.device().index()} {
+    assert(out_shape_ == input_shape_ &&
+           "`GeluInfinilm` input and output shapes must match");
+    assert(out_type_ == input_type_ &&
+           "`GeluInfinilm` input and output dtypes must match");
+    assert((approximate_ == "none" || approximate_.empty()) &&
+           "`GeluInfinilm` only supports exact approximation");
+    assert(!out.HasBroadcastDim() &&
+           "`GeluInfinilm` output must not have broadcasted dimensions");
+  }
+
+  // Runs the operator on `input`, writing into `out`. Implemented by each
+  // backend.
+  virtual void operator()(const Tensor input, const std::string approximate,
+                          Tensor out) const = 0;
+
+ protected:
+  // Metadata of `input` as seen at construction time.
+  Tensor::Shape input_shape_;
+  Tensor::Strides input_strides_;
+  DataType input_type_;
+
+  // Metadata of `out` as seen at construction time.
+  Tensor::Shape out_shape_;
+  Tensor::Strides out_strides_;
+  DataType out_type_;
+
+  // Approximation mode passed to the constructor.
+  std::string approximate_{};
+
+  // `out.numel()` and `out.ndim()`.
+  Tensor::Size output_size_{0};
+  Tensor::Size ndim_{0};
+
+  // Results of `IsContiguous()` on the respective tensor.
+  bool is_input_contiguous_{false};
+  bool is_out_contiguous_{false};
+
+  // Index of the device that holds `out`.
+  int device_index_{0};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/gelu_infinilm/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_ILUVATAR_GELU_INFINILM_KERNEL_H_
+#define INFINI_OPS_ILUVATAR_GELU_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/iluvatar/caster.cuh"
+#include "native/cuda/iluvatar/runtime_.h"
+#include "native/cuda/ops/gelu_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// Iluvatar backend of `GeluInfinilm`: the shared CUDA-style implementation
+// instantiated with the Iluvatar runtime. Adds nothing beyond the inherited
+// constructors.
+template <>
+class Operator<GeluInfinilm, Device::Type::kIluvatar>
+    : public CudaGeluInfinilm<Runtime<Device::Type::kIluvatar>> {
+ public:
+  // The injected class name refers to the concrete base specialization.
+  using CudaGeluInfinilm::CudaGeluInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/metax/ops/gelu_infinilm/kernel.h b/src/native/cuda/metax/ops/gelu_infinilm/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_METAX_GELU_INFINILM_KERNEL_H_
+#define INFINI_OPS_METAX_GELU_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/metax/caster.cuh"
+#include "native/cuda/metax/runtime_.h"
+#include "native/cuda/ops/gelu_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// Metax backend of `GeluInfinilm`: the shared CUDA-style implementation
+// instantiated with the Metax runtime. Adds nothing beyond the inherited
+// constructors.
+template <>
+class Operator<GeluInfinilm, Device::Type::kMetax>
+    : public CudaGeluInfinilm<Runtime<Device::Type::kMetax>> {
+ public:
+  // The injected class name refers to the concrete base specialization.
+  using CudaGeluInfinilm::CudaGeluInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/moore/ops/gelu_infinilm/kernel.h b/src/native/cuda/moore/ops/gelu_infinilm/kernel.h
@@ -0,0 +1,22 @@
+#ifndef INFINI_OPS_MOORE_GELU_INFINILM_KERNEL_H_
+#define INFINI_OPS_MOORE_GELU_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/moore/caster.cuh"
+#include "native/cuda/moore/polyfills.cuh"
+#include "native/cuda/moore/runtime_.h"
+#include "native/cuda/ops/gelu_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// Moore backend of `GeluInfinilm`: the shared CUDA-style implementation
+// instantiated with the Moore runtime. Adds nothing beyond the inherited
+// constructors.
+template <>
+class Operator<GeluInfinilm, Device::Type::kMoore>
+    : public CudaGeluInfinilm<Runtime<Device::Type::kMoore>> {
+ public:
+  // The injected class name refers to the concrete base specialization.
+  using CudaGeluInfinilm::CudaGeluInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h b/src/native/cuda/nvidia/ops/gelu_infinilm/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_NVIDIA_GELU_INFINILM_KERNEL_H_
+#define INFINI_OPS_NVIDIA_GELU_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/nvidia/caster.cuh"
+#include "native/cuda/nvidia/runtime_.h"
+#include "native/cuda/ops/gelu_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// NVIDIA backend of `GeluInfinilm`: the shared CUDA-style implementation
+// instantiated with the NVIDIA runtime. Adds nothing beyond the inherited
+// constructors.
+template <>
+class Operator<GeluInfinilm, Device::Type::kNvidia>
+    : public CudaGeluInfinilm<Runtime<Device::Type::kNvidia>> {
+ public:
+  // The injected class name refers to the concrete base specialization.
+  using CudaGeluInfinilm::CudaGeluInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/ops/gelu_infinilm/kernel.cuh b/src/native/cuda/ops/gelu_infinilm/kernel.cuh
@@ -0,0 +1,51 @@
+#ifndef INFINI_OPS_CUDA_GELU_INFINILM_KERNEL_CUH_
+#define INFINI_OPS_CUDA_GELU_INFINILM_KERNEL_CUH_
+
+#include <cmath>
+#include <cstddef>
+#include <type_traits>
+
+#include "native/cuda/caster.cuh"
+#include "native/cuda/kernel_commons.cuh"
+
+namespace infini::ops {
+
+namespace {
+
+// Evaluates the exact (erf-based) GELU, 0.5 * x * (1 + erf(x / sqrt(2))), for
+// one element. A `double` input is evaluated in double precision; any other
+// `T` is converted to `float` through `Caster<kDev>`, evaluated in single
+// precision, and converted back to `T`.
+template <Device::Type kDev, typename T>
+__device__ __forceinline__ T GeluInfinilmExact(T x) {
+  if constexpr (!std::is_same_v<T, double>) {
+    // 1 / sqrt(2), rounded to `float`.
+    constexpr float kInvSqrt2 = 0.70710678118654752440f;
+    const float value = Caster<kDev>::template Cast<float>(x);
+    const float cdf_term = 1.0f + erff(value * kInvSqrt2);
+    return Caster<kDev>::template Cast<T>(0.5f * value * cdf_term);
+  } else {
+    // 1 / sqrt(2), rounded to `double`.
+    constexpr double kInvSqrt2 = 0.70710678118654752440;
+    const double value = x;
+    const double cdf_term = 1.0 + erf(value * kInvSqrt2);
+    return 0.5 * value * cdf_term;
+  }
+}
+
+}  // namespace
+
+// Element-wise exact GELU: each thread handles at most one logical element.
+//
+// Launch layout: 1-D grid of 1-D blocks with at least `output_size` threads in
+// total; surplus threads do nothing. No shared memory is used.
+//
+// Parameters:
+//   out, input        - element buffers. They are intentionally NOT marked
+//                       `__restrict__`: the operator is also run in place,
+//                       with `out` and `input` pointing at the same storage,
+//                       which would break the no-alias promise.
+//   out_shape, input_shape, out_strides, input_strides
+//                     - `ndim` entries each; only read when the matching
+//                       `*_contiguous` flag is false.
+//   output_size       - number of logical elements to process.
+//   out_contiguous, input_contiguous
+//                     - when true, the flat index is used directly as the
+//                       element offset instead of calling `IndexToOffset`.
+//
+// `block_size` is not used by the body; it is kept so existing instantiations
+// keep compiling.
+template <Device::Type kDev, typename T, unsigned int block_size>
+__global__ void GeluInfinilmKernel(T* out, const T* input,
+                                   const size_t* __restrict__ out_shape,
+                                   const size_t* __restrict__ input_shape,
+                                   const ptrdiff_t* __restrict__ out_strides,
+                                   const ptrdiff_t* __restrict__ input_strides,
+                                   size_t output_size, size_t ndim,
+                                   bool out_contiguous, bool input_contiguous) {
+  // Widen before multiplying: `blockIdx.x * blockDim.x` is otherwise computed
+  // in 32-bit `unsigned int` and wraps once the product reaches 2^32.
+  const size_t idx =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+  // Threads past the end of the tensor (grid tail) have nothing to do.
+  if (idx >= output_size) {
+    return;
+  }
+
+  const size_t out_idx =
+      out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides);
+  const size_t input_idx =
+      input_contiguous ? idx
+                       : IndexToOffset(idx, ndim, input_shape, input_strides);
+
+  out[out_idx] = GeluInfinilmExact<kDev>(input[input_idx]);
+}
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/ops/gelu_infinilm/kernel.h b/src/native/cuda/ops/gelu_infinilm/kernel.h
@@ -0,0 +1,95 @@
+#ifndef INFINI_OPS_CUDA_GELU_INFINILM_KERNEL_H_
+#define INFINI_OPS_CUDA_GELU_INFINILM_KERNEL_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <vector>
+
+#include "base/gelu_infinilm.h"
+#include "common/generic_utils.h"
+#include "data_type.h"
+#include "dispatcher.h"
+#include "native/cuda/kernel_commons.cuh"
+#include "native/cuda/ops/gelu_infinilm/kernel.cuh"
+#include "native/cuda/runtime_utils.h"
+
+namespace infini::ops {
+
+// CUDA-style implementation of `GeluInfinilm`, parameterized over a `Backend`
+// that supplies `Malloc`/`Memcpy`/`Free`, the stream type and `kDeviceType`.
+//
+// The shapes and strides captured by the base class are packed into a single
+// buffer obtained from `Backend::Malloc`, laid out as
+//   [input shape | out shape | input strides | out strides],
+// uploaded once in the constructor and passed to the kernel on every call.
+template <typename Backend>
+class CudaGeluInfinilm : public GeluInfinilm {
+ public:
+  // Validates the arguments through the base class, then stages the
+  // shape/stride metadata on the host and uploads it with one host-to-device
+  // copy.
+  CudaGeluInfinilm(const Tensor input, const std::string approximate,
+                   Tensor out)
+      : GeluInfinilm{input, approximate, out} {
+    size_t shape_size = ndim_ * sizeof(*d_input_shape_);
+    size_t strides_size = ndim_ * sizeof(*d_input_strides_);
+    const size_t metadata_size = 2 * (shape_size + strides_size);
+    std::vector<std::byte> metadata(metadata_size);
+
+    Backend::Malloc((void**)&d_metadata_, metadata_size);
+
+    // Each section pointer aliases a slice of `d_metadata_`; the matching
+    // slice of the host staging buffer is filled at the same offset.
+    size_t offset = 0;
+    d_input_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, input_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_out_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_input_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, input_strides_.data(), strides_size);
+    offset += strides_size;
+
+    d_out_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_strides_.data(), strides_size);
+
+    Backend::Memcpy(d_metadata_, metadata.data(), metadata_size,
+                    Backend::MemcpyHostToDevice);
+  }
+
+  // Releases the metadata buffer allocated in the constructor.
+  ~CudaGeluInfinilm() { Backend::Free(d_metadata_); }
+
+  // This object owns `d_metadata_`. A member-wise copy would make two objects
+  // free the same buffer, so copying is disabled.
+  CudaGeluInfinilm(const CudaGeluInfinilm&) = delete;
+
+  CudaGeluInfinilm& operator=(const CudaGeluInfinilm&) = delete;
+
+  // Launches `GeluInfinilmKernel` on `stream_` (or stream 0 when `stream_` is
+  // null) with one thread per output element. `approximate` is ignored here;
+  // it is only checked by the base-class constructor. The launch is not
+  // synchronized in this function.
+  void operator()(const Tensor input, const std::string approximate,
+                  Tensor out) const override {
+    (void)approximate;
+
+    // Nothing to compute for an empty tensor. Without this guard `block.x`
+    // below would be 0, which is not a valid launch configuration and would
+    // also be the divisor handed to `utils::CeilDiv`.
+    if (output_size_ == 0) {
+      return;
+    }
+
+    auto cuda_stream =
+        static_cast<typename Backend::Stream>(stream_ ? stream_ : 0);
+    int block_size = std::min(
+        RuntimeUtils<Backend::kDeviceType>::GetOptimalBlockSize(), 1024);
+    // Never launch more threads per block than there are elements.
+    dim3 block(std::min(static_cast<Tensor::Size>(block_size), output_size_));
+    dim3 grid(utils::CeilDiv(output_size_, block.x));
+
+    // Dispatch on the runtime dtype and on `block_size` to pick the kernel
+    // instantiation.
+    DispatchFunc<AllFloatTypes, List<128, 256, 512, 1024>>(
+        {static_cast<int64_t>(out_type_), block_size},
+        [&](auto list_tag) {
+          using T = TypeMapType<Backend::kDeviceType, ListGet<0>(list_tag)>;
+          constexpr int kBlockSize = ListGet<1>(list_tag);
+
+          GeluInfinilmKernel<Backend::kDeviceType, T, kBlockSize>
+              <<<grid, block, 0, cuda_stream>>>(
+                  reinterpret_cast<T*>(out.data()),
+                  reinterpret_cast<const T*>(input.data()), d_out_shape_,
+                  d_input_shape_, d_out_strides_, d_input_strides_,
+                  output_size_, ndim_, is_out_contiguous_,
+                  is_input_contiguous_);
+        },
+        "CudaGeluInfinilm::operator()");
+  }
+
+ private:
+  // Owning pointer to the packed metadata buffer.
+  std::byte* d_metadata_{nullptr};
+
+  // Non-owning views into `d_metadata_`, one per section.
+  Tensor::Size* d_input_shape_{nullptr};
+
+  Tensor::Size* d_out_shape_{nullptr};
+
+  Tensor::Stride* d_input_strides_{nullptr};
+
+  Tensor::Stride* d_out_strides_{nullptr};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/tests/test_gelu_infinilm.py b/tests/test_gelu_infinilm.py
@@ -0,0 +1,72 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import Payload, empty_strided, get_stream, randn_strided
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "shape, input_strides, out_strides, inplace",
+    (
+        ((13, 4), None, None, False),
+        ((13, 4), None, None, True),
+        ((13, 4), (10, 1), (10, 1), False),
+        ((13, 4), (10, 1), (10, 1), True),
+        ((13, 4, 4), None, None, False),
+        ((13, 4, 4), None, None, True),
+        ((13, 4, 4), (20, 4, 1), (20, 4, 1), False),
+        ((13, 4, 4), (20, 4, 1), (20, 4, 1), True),
+        ((16, 5632), None, None, False),
+        ((16, 5632), None, None, True),
+        ((16, 5632), (13312, 1), (13312, 1), False),
+        ((16, 5632), (13312, 1), (13312, 1), True),
+        ((4, 4, 5632), None, None, False),
+        ((4, 4, 5632), None, None, True),
+        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), False),
+        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), True),
+    ),
+)
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float64, 1e-6, 1e-6),
+        (torch.float32, 1e-5, 1e-5),
+        (torch.float16, 1e-3, 1e-3),
+        (torch.bfloat16, 1e-2, 1e-2),
+    ),
+)
+def test_gelu_infinilm(
+    shape, input_strides, out_strides, inplace, dtype, device, rtol, atol
+):
+    if device == "musa" and dtype == torch.float64:
+        pytest.skip("MUSA does not support float64 GELU_INFINILM")
+
+    input = randn_strided(shape, input_strides, dtype=dtype, device=device)
+    out = (
+        input
+        if inplace
+        else empty_strided(shape, out_strides, dtype=dtype, device=device)
+    )
+
+    return Payload(
+        _gelu_infinilm,
+        _torch_gelu_infinilm,
+        (input, out),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+def _gelu_infinilm(input, out):
+    infini.ops.gelu_infinilm(input, "none", out, stream=get_stream(input.device))
+
+    return out
+
+
+def _torch_gelu_infinilm(input, out):
+    result = torch.nn.functional.gelu(input, approximate="none")
+    out.copy_(result)
+
+    return out