InfiniTensor · wooway777 · Jun 10, 2026
diff --git a/src/base/zeros_infinilm.h b/src/base/zeros_infinilm.h
@@ -0,0 +1,57 @@
+#ifndef INFINI_OPS_BASE_ZEROS_INFINILM_H_
+#define INFINI_OPS_BASE_ZEROS_INFINILM_H_
+
+#include <cassert>
+
+#include "operator.h"
+
+namespace infini::ops {
+
+// Backend-independent part of the `zeros_infinilm` operator.
+//
+// The constructor snapshots the metadata of `input` and `out` that backends
+// read later, and checks the operator's preconditions. Backends derive from
+// this class and implement `operator()`.
+class ZerosInfinilm : public Operator<ZerosInfinilm> {
+ public:
+  // Captures shape, strides and dtype of both tensors, plus the element count,
+  // rank, contiguity flag and device index of `out`.
+  //
+  // The checks in the body use `assert`, so they only run in builds where
+  // `assert` is enabled.
+  ZerosInfinilm(const Tensor input, Tensor out)
+      : input_shape_{input.shape()},
+        input_strides_{input.strides()},
+        input_type_{input.dtype()},
+        out_shape_{out.shape()},
+        out_strides_{out.strides()},
+        out_type_{out.dtype()},
+        output_size_{out.numel()},
+        ndim_{out.ndim()},
+        is_out_contiguous_{out.IsContiguous()},
+        device_index_{out.device().index()} {
+    // `input` has to agree with `out` in shape ...
+    assert(input_shape_ == out_shape_ &&
+           "`ZerosInfinilm` input and output shapes must match");
+    // ... and in dtype.
+    assert(input_type_ == out_type_ &&
+           "`ZerosInfinilm` input and output dtypes must match");
+    // `out` itself must not report a broadcast dimension.
+    assert(!out.HasBroadcastDim() &&
+           "`ZerosInfinilm` output must not have broadcasted dimensions");
+  }
+
+  // Runs the operator on `input` and `out`; implemented by each backend.
+  virtual void operator()(const Tensor input, Tensor out) const = 0;
+
+ protected:
+  // Metadata copied from `input` at construction.
+  Tensor::Shape input_shape_;
+  Tensor::Strides input_strides_;
+  DataType input_type_;
+
+  // Metadata copied from `out` at construction.
+  Tensor::Shape out_shape_;
+  Tensor::Strides out_strides_;
+  DataType out_type_;
+
+  // Values read from `out` at construction.
+  Tensor::Size output_size_{0};    // `out.numel()`
+  Tensor::Size ndim_{0};           // `out.ndim()`
+  bool is_out_contiguous_{false};  // `out.IsContiguous()`
+  int device_index_{0};            // `out.device().index()`
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_ILUVATAR_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_ILUVATAR_ZEROS_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/iluvatar/caster.cuh"
+#include "native/cuda/iluvatar/runtime_.h"
+#include "native/cuda/ops/zeros_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// Iluvatar backend of `ZerosInfinilm`: the shared CUDA implementation
+// instantiated with the Iluvatar runtime.
+template <>
+class Operator<ZerosInfinilm, Device::Type::kIluvatar>
+    : public CudaZerosInfinilm<Runtime<Device::Type::kIluvatar>> {
+ public:
+  // Inherit the base constructors. The injected class name refers to the
+  // `CudaZerosInfinilm` specialization named in the base clause.
+  using CudaZerosInfinilm::CudaZerosInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/metax/ops/zeros_infinilm/kernel.h b/src/native/cuda/metax/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_METAX_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_METAX_ZEROS_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/metax/caster.cuh"
+#include "native/cuda/metax/runtime_.h"
+#include "native/cuda/ops/zeros_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// MetaX backend of `ZerosInfinilm`: the shared CUDA implementation
+// instantiated with the MetaX runtime.
+template <>
+class Operator<ZerosInfinilm, Device::Type::kMetax>
+    : public CudaZerosInfinilm<Runtime<Device::Type::kMetax>> {
+ public:
+  // Inherit the base constructors. The injected class name refers to the
+  // `CudaZerosInfinilm` specialization named in the base clause.
+  using CudaZerosInfinilm::CudaZerosInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/moore/ops/zeros_infinilm/kernel.h b/src/native/cuda/moore/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,22 @@
+#ifndef INFINI_OPS_MOORE_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_MOORE_ZEROS_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/moore/caster.cuh"
+#include "native/cuda/moore/polyfills.cuh"
+#include "native/cuda/moore/runtime_.h"
+#include "native/cuda/ops/zeros_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// Moore backend of `ZerosInfinilm`: the shared CUDA implementation
+// instantiated with the Moore runtime.
+template <>
+class Operator<ZerosInfinilm, Device::Type::kMoore>
+    : public CudaZerosInfinilm<Runtime<Device::Type::kMoore>> {
+ public:
+  // Inherit the base constructors. The injected class name refers to the
+  // `CudaZerosInfinilm` specialization named in the base clause.
+  using CudaZerosInfinilm::CudaZerosInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h b/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_NVIDIA_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_NVIDIA_ZEROS_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/nvidia/caster.cuh"
+#include "native/cuda/nvidia/runtime_.h"
+#include "native/cuda/ops/zeros_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// NVIDIA backend of `ZerosInfinilm`: the shared CUDA implementation
+// instantiated with the NVIDIA runtime.
+template <>
+class Operator<ZerosInfinilm, Device::Type::kNvidia>
+    : public CudaZerosInfinilm<Runtime<Device::Type::kNvidia>> {
+ public:
+  // Inherit the base constructors. The injected class name refers to the
+  // `CudaZerosInfinilm` specialization named in the base clause.
+  using CudaZerosInfinilm::CudaZerosInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/ops/zeros_infinilm/kernel.cuh b/src/native/cuda/ops/zeros_infinilm/kernel.cuh
@@ -0,0 +1,28 @@
+#ifndef INFINI_OPS_CUDA_ZEROS_INFINILM_KERNEL_CUH_
+#define INFINI_OPS_CUDA_ZEROS_INFINILM_KERNEL_CUH_
+
+#include <cstddef>
+
+#include "native/cuda/caster.cuh"
+#include "native/cuda/kernel_commons.cuh"
+
+namespace infini::ops {
+
+// Writes zero to the logical elements of `out`, one element per thread.
+//
+// Each thread derives a flat index from `blockIdx.x`, `blockDim.x` and
+// `threadIdx.x`; threads whose index is not below `output_size` do nothing.
+// When `out_contiguous` is true the flat index is used directly as the
+// storage offset; otherwise it is translated with `IndexToOffset` using
+// `ndim`, `out_shape` and `out_strides`.
+//
+// `block_size` is not referenced in the body.
+template <Device::Type kDev, typename T, unsigned int block_size>
+__global__ void ZerosInfinilmKernel(T* __restrict__ out,
+                                    const size_t* __restrict__ out_shape,
+                                    const ptrdiff_t* __restrict__ out_strides,
+                                    size_t output_size, size_t ndim,
+                                    bool out_contiguous) {
+  // Widen before multiplying: `blockIdx.x` and `blockDim.x` are 32-bit
+  // unsigned values, so their product would wrap around for launches that
+  // cover 2^32 or more elements if it were computed before the conversion.
+  const size_t idx =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+  // Tail guard: the grid is rounded up to whole blocks by the launcher.
+  if (idx >= output_size) {
+    return;
+  }
+
+  const size_t out_idx =
+      out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides);
+  out[out_idx] = Caster<kDev>::template Cast<T>(0);
+}
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/ops/zeros_infinilm/kernel.h b/src/native/cuda/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,78 @@
+#ifndef INFINI_OPS_CUDA_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_CUDA_ZEROS_INFINILM_KERNEL_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <vector>
+
+#include "base/zeros_infinilm.h"
+#include "common/generic_utils.h"
+#include "data_type.h"
+#include "dispatcher.h"
+#include "native/cuda/kernel_commons.cuh"
+#include "native/cuda/ops/zeros_infinilm/kernel.cuh"
+#include "native/cuda/runtime_utils.h"
+
+namespace infini::ops {
+
+// CUDA-family implementation of `ZerosInfinilm`.
+//
+// `Backend` supplies the runtime pieces used here: `Malloc`, `Memcpy`, `Free`,
+// `MemcpyHostToDevice`, the `Stream` type and `kDeviceType`. The constructor
+// uploads the output shape and strides to the device once; `operator()`
+// launches `ZerosInfinilmKernel` with that metadata.
+template <typename Backend>
+class CudaZerosInfinilm : public ZerosInfinilm {
+ public:
+  // Stages `out_shape_` followed by `out_strides_` in one host buffer and
+  // copies it into a single device allocation (`d_metadata_`). The typed
+  // pointers `d_out_shape_` and `d_out_strides_` point into that allocation.
+  CudaZerosInfinilm(const Tensor input, Tensor out)
+      : ZerosInfinilm{input, out} {
+    const size_t shape_size = ndim_ * sizeof(*d_out_shape_);
+    const size_t strides_size = ndim_ * sizeof(*d_out_strides_);
+    const size_t metadata_size = shape_size + strides_size;
+    std::vector<std::byte> metadata(metadata_size);
+
+    Backend::Malloc(reinterpret_cast<void**>(&d_metadata_), metadata_size);
+
+    // Layout: shape in the first `shape_size` bytes, strides right after.
+    size_t offset = 0;
+    d_out_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_out_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_strides_.data(), strides_size);
+
+    Backend::Memcpy(d_metadata_, metadata.data(), metadata_size,
+                    Backend::MemcpyHostToDevice);
+  }
+
+  // Releases the device metadata buffer.
+  //
+  // NOTE(review): the class keeps its implicitly generated copy operations, so
+  // a copy would share `d_metadata_` and free it a second time. Confirm that
+  // operators are never copied, or delete the copy operations.
+  ~CudaZerosInfinilm() { Backend::Free(d_metadata_); }
+
+  // Launches `ZerosInfinilmKernel` with one thread per output element to
+  // write zeros into `out`. `input` is not read.
+  void operator()(const Tensor input, Tensor out) const override {
+    (void)input;
+
+    // An empty output needs no launch. Without this guard `block.x` would be
+    // 0, which is then used as the divisor in `CeilDiv` and is not a valid
+    // block dimension for a kernel launch.
+    if (output_size_ == 0) {
+      return;
+    }
+
+    auto cuda_stream =
+        static_cast<typename Backend::Stream>(stream_ ? stream_ : 0);
+    int block_size = std::min(
+        RuntimeUtils<Backend::kDeviceType>::GetOptimalBlockSize(), 1024);
+    // Never launch more threads per block than there are elements.
+    dim3 block(std::min(static_cast<Tensor::Size>(block_size), output_size_));
+    dim3 grid(utils::CeilDiv(output_size_, block.x));
+
+    // Select the element type from `out_type_` and the block-size template
+    // argument from `block_size`.
+    DispatchFunc<AllTypes, List<128, 256, 512, 1024>>(
+        {static_cast<int64_t>(out_type_), block_size},
+        [&](auto list_tag) {
+          using T = TypeMapType<Backend::kDeviceType, ListGet<0>(list_tag)>;
+          constexpr int kBlockSize = ListGet<1>(list_tag);
+
+          ZerosInfinilmKernel<Backend::kDeviceType, T, kBlockSize>
+              <<<grid, block, 0, cuda_stream>>>(
+                  reinterpret_cast<T*>(out.data()), d_out_shape_,
+                  d_out_strides_, output_size_, ndim_, is_out_contiguous_);
+        },
+        "CudaZerosInfinilm::operator()");
+  }
+
+ private:
+  // Owning pointer to the device allocation holding shape then strides.
+  std::byte* d_metadata_{nullptr};
+
+  // Non-owning view of the shape part of `d_metadata_`.
+  Tensor::Size* d_out_shape_{nullptr};
+
+  // Non-owning view of the strides part of `d_metadata_`.
+  Tensor::Stride* d_out_strides_{nullptr};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/tests/test_zeros_infinilm.py b/tests/test_zeros_infinilm.py
@@ -0,0 +1,91 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import (
+    Payload,
+    empty_strided,
+    get_stream,
+    randint_strided,
+    randn_strided,
+)
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "shape, input_strides, out_strides, inplace",
+    (
+        ((13, 4), None, None, False),
+        ((13, 4), None, None, True),
+        ((13, 4), (10, 1), (10, 1), False),
+        ((13, 4), (0, 1), None, False),
+        ((13, 4, 4), None, None, False),
+        ((13, 4, 4), None, None, True),
+        ((13, 4, 4), (20, 4, 1), (20, 4, 1), False),
+        ((16, 5632), None, None, False),
+        ((16, 5632), None, None, True),
+        ((16, 5632), (13312, 1), (13312, 1), False),
+        ((4, 4, 5632), None, None, False),
+        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), False),
+    ),
+)
+@pytest.mark.parametrize(
+    "dtype",
+    (
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.int64,
+        torch.float64,
+        torch.float32,
+        torch.float16,
+        torch.bfloat16,
+    ),
+)
+def test_zeros_infinilm(shape, input_strides, out_strides, inplace, dtype, device):
+    if device == "musa" and dtype == torch.float64:
+        pytest.skip("MUSA does not support float64 zeros_infinilm")
+
+    input = _make_input(shape, input_strides, dtype=dtype, device=device)
+    out = (
+        input
+        if inplace
+        else empty_strided(shape, out_strides, dtype=dtype, device=device)
+    )
+    if not inplace:
+        _fill_nonzero(out)
+
+    return Payload(
+        _zeros_infinilm,
+        _torch_zeros_infinilm,
+        (input, out),
+        {},
+        rtol=0,
+        atol=0,
+    )
+
+
+def _make_input(shape, strides, *, dtype, device):
+    if dtype.is_floating_point:
+        return randn_strided(shape, strides, dtype=dtype, device=device)
+    return randint_strided(1, 16, shape, strides, dtype=dtype, device=device)
+
+
+def _fill_nonzero(tensor):
+    if tensor.dtype.is_floating_point:
+        tensor.fill_(1)
+    else:
+        tensor.fill_(1)
+
+
+def _zeros_infinilm(input, out):
+    infini.ops.zeros_infinilm(input, out, stream=get_stream(input.device))
+
+    return out
+
+
+def _torch_zeros_infinilm(input, out):
+    out.zero_()
+
+    return out