InfiniTensor · wooway777 · Jun 10, 2026
diff --git a/src/base/rearrange_infinilm.h b/src/base/rearrange_infinilm.h
@@ -0,0 +1,60 @@
+#ifndef INFINI_OPS_BASE_REARRANGE_INFINILM_H_
+#define INFINI_OPS_BASE_REARRANGE_INFINILM_H_
+
+#include <cassert>
+
+#include "operator.h"
+
+namespace infini::ops {
+
+// Device-independent base of the `RearrangeInfinilm` operator. The constructor
+// snapshots the layout metadata of `input` and `out` (shape, strides, dtype,
+// contiguity, device index) and checks the operator's preconditions; concrete
+// backends supply `operator()`.
+class RearrangeInfinilm : public Operator<RearrangeInfinilm> {
+ public:
+  // Records the metadata of both tensors. When assertions are enabled this
+  // requires identical shapes, identical dtypes, and an `out` for which
+  // `HasBroadcastDim()` is false. No such check is made for `input`.
+  RearrangeInfinilm(const Tensor input, Tensor out)
+      : input_shape_{input.shape()},
+        input_strides_{input.strides()},
+        input_type_{input.dtype()},
+        out_shape_{out.shape()},
+        out_strides_{out.strides()},
+        out_type_{out.dtype()},
+        output_size_{out.numel()},
+        ndim_{out.ndim()},
+        is_input_contiguous_{input.IsContiguous()},
+        is_out_contiguous_{out.IsContiguous()},
+        device_index_{out.device().index()} {
+    assert(input_shape_ == out_shape_ &&
+           "`RearrangeInfinilm` input and output shapes must match");
+    assert(input_type_ == out_type_ &&
+           "`RearrangeInfinilm` input and output dtypes must match");
+    assert(!out.HasBroadcastDim() &&
+           "`RearrangeInfinilm` output must not have broadcasted dimensions");
+  }
+
+  // Runs the operator on `input` and `out`; implemented by each backend.
+  virtual void operator()(const Tensor input, Tensor out) const = 0;
+
+ protected:
+  // Shape of `input` as seen at construction.
+  Tensor::Shape input_shape_;
+
+  // Strides of `input` as seen at construction.
+  Tensor::Strides input_strides_;
+
+  // Element type of `input`.
+  DataType input_type_;
+
+  // Shape of `out` as seen at construction.
+  Tensor::Shape out_shape_;
+
+  // Strides of `out` as seen at construction.
+  Tensor::Strides out_strides_;
+
+  // Element type of `out`.
+  DataType out_type_;
+
+  // Element count of `out` (`out.numel()`).
+  Tensor::Size output_size_{0};
+
+  // Number of dimensions of `out` (`out.ndim()`).
+  Tensor::Size ndim_{0};
+
+  // Result of `input.IsContiguous()` at construction.
+  bool is_input_contiguous_{false};
+
+  // Result of `out.IsContiguous()` at construction.
+  bool is_out_contiguous_{false};
+
+  // Index of the device that holds `out`.
+  int device_index_{0};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h
@@ -0,0 +1,22 @@
+#ifndef INFINI_OPS_ILUVATAR_REARRANGE_INFINILM_KERNEL_H_
+#define INFINI_OPS_ILUVATAR_REARRANGE_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/iluvatar/caster.cuh"
+#include "native/cuda/iluvatar/runtime_.h"
+#include "native/cuda/ops/rearrange_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// Iluvatar backend of `RearrangeInfinilm`: the shared CUDA implementation
+// instantiated with the Iluvatar runtime.
+template <>
+class Operator<RearrangeInfinilm, Device::Type::kIluvatar>
+    : public CudaRearrangeInfinilm<Runtime<Device::Type::kIluvatar>> {
+ public:
+  // Inherit every base constructor; inside this class the base is reachable
+  // through its injected-class-name, so the template arguments need not be
+  // repeated.
+  using CudaRearrangeInfinilm::CudaRearrangeInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h b/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h
@@ -0,0 +1,22 @@
+#ifndef INFINI_OPS_METAX_REARRANGE_INFINILM_KERNEL_H_
+#define INFINI_OPS_METAX_REARRANGE_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/metax/caster.cuh"
+#include "native/cuda/metax/runtime_.h"
+#include "native/cuda/ops/rearrange_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// Metax backend of `RearrangeInfinilm`: the shared CUDA implementation
+// instantiated with the Metax runtime.
+template <>
+class Operator<RearrangeInfinilm, Device::Type::kMetax>
+    : public CudaRearrangeInfinilm<Runtime<Device::Type::kMetax>> {
+ public:
+  // Inherit every base constructor; inside this class the base is reachable
+  // through its injected-class-name, so the template arguments need not be
+  // repeated.
+  using CudaRearrangeInfinilm::CudaRearrangeInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h b/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h
@@ -0,0 +1,23 @@
+#ifndef INFINI_OPS_MOORE_REARRANGE_INFINILM_KERNEL_H_
+#define INFINI_OPS_MOORE_REARRANGE_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/moore/caster.cuh"
+#include "native/cuda/moore/polyfills.cuh"
+#include "native/cuda/moore/runtime_.h"
+#include "native/cuda/ops/rearrange_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// Moore backend of `RearrangeInfinilm`: the shared CUDA implementation
+// instantiated with the Moore runtime.
+template <>
+class Operator<RearrangeInfinilm, Device::Type::kMoore>
+    : public CudaRearrangeInfinilm<Runtime<Device::Type::kMoore>> {
+ public:
+  // Inherit every base constructor; inside this class the base is reachable
+  // through its injected-class-name, so the template arguments need not be
+  // repeated.
+  using CudaRearrangeInfinilm::CudaRearrangeInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h b/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h
@@ -0,0 +1,22 @@
+#ifndef INFINI_OPS_NVIDIA_REARRANGE_INFINILM_KERNEL_H_
+#define INFINI_OPS_NVIDIA_REARRANGE_INFINILM_KERNEL_H_
+
+#include <utility>
+
+#include "native/cuda/nvidia/caster.cuh"
+#include "native/cuda/nvidia/runtime_.h"
+#include "native/cuda/ops/rearrange_infinilm/kernel.h"
+
+namespace infini::ops {
+
+// NVIDIA backend of `RearrangeInfinilm`: the shared CUDA implementation
+// instantiated with the NVIDIA runtime.
+template <>
+class Operator<RearrangeInfinilm, Device::Type::kNvidia>
+    : public CudaRearrangeInfinilm<Runtime<Device::Type::kNvidia>> {
+ public:
+  // Inherit every base constructor; inside this class the base is reachable
+  // through its injected-class-name, so the template arguments need not be
+  // repeated.
+  using CudaRearrangeInfinilm::CudaRearrangeInfinilm;
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/ops/rearrange_infinilm/kernel.cuh b/src/native/cuda/ops/rearrange_infinilm/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef INFINI_OPS_CUDA_REARRANGE_INFINILM_KERNEL_CUH_
+#define INFINI_OPS_CUDA_REARRANGE_INFINILM_KERNEL_CUH_
+
+#include <cstddef>
+
+#include "native/cuda/kernel_commons.cuh"
+
+namespace infini::ops {
+
+// Copies `output_size` elements from `input` to `out`, where each side may use
+// its own strided layout.
+//
+// Launch layout: 1-D grid of 1-D blocks; each thread handles at most one
+// element and threads whose flat index is past `output_size` do nothing. No
+// shared memory is used.
+//
+// Parameters:
+//   out, input                 - destination and source element buffers.
+//   out_shape, input_shape     - `ndim` extents for each side.
+//   out_strides, input_strides - `ndim` strides for each side.
+//   output_size                - number of logical elements to copy.
+//   ndim                       - number of dimensions.
+//   out_contiguous, input_contiguous
+//                              - when true, the flat index is used directly as
+//                                the element offset for that side; otherwise
+//                                the offset comes from `IndexToOffset`.
+//
+// The `block_size` template parameter is not referenced by the body.
+template <typename T, unsigned int block_size>
+__global__ void RearrangeInfinilmKernel(
+    T* __restrict__ out, const T* __restrict__ input,
+    const size_t* __restrict__ out_shape,
+    const size_t* __restrict__ input_shape,
+    const ptrdiff_t* __restrict__ out_strides,
+    const ptrdiff_t* __restrict__ input_strides, size_t output_size,
+    size_t ndim, bool out_contiguous, bool input_contiguous) {
+  // Widen before multiplying: `blockIdx.x` and `blockDim.x` are 32-bit
+  // unsigned, so their product would wrap around before being stored in a
+  // `size_t` once the tensor holds 2^32 or more elements.
+  const size_t idx =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+  if (idx < output_size) {
+    size_t out_idx =
+        out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides);
+    size_t input_idx =
+        input_contiguous ? idx
+                         : IndexToOffset(idx, ndim, input_shape, input_strides);
+    out[out_idx] = input[input_idx];
+  }
+}
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/ops/rearrange_infinilm/kernel.h b/src/native/cuda/ops/rearrange_infinilm/kernel.h
@@ -0,0 +1,92 @@
+#ifndef INFINI_OPS_CUDA_REARRANGE_INFINILM_KERNEL_H_
+#define INFINI_OPS_CUDA_REARRANGE_INFINILM_KERNEL_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <vector>
+
+#include "base/rearrange_infinilm.h"
+#include "common/generic_utils.h"
+#include "data_type.h"
+#include "dispatcher.h"
+#include "native/cuda/kernel_commons.cuh"
+#include "native/cuda/ops/rearrange_infinilm/kernel.cuh"
+#include "native/cuda/runtime_utils.h"
+
+namespace infini::ops {
+
+// CUDA-family implementation of `RearrangeInfinilm`, parameterized on the
+// vendor runtime `Backend` (used here for `Malloc`, `Memcpy`, `Free`,
+// `Stream`, `MemcpyHostToDevice` and `kDeviceType`).
+//
+// The constructor uploads both tensors' shapes and strides to the device once;
+// `operator()` then launches `RearrangeInfinilmKernel` with enough threads to
+// give every output element its own thread.
+//
+// NOTE(review): the device buffer is owned through a raw pointer and released
+// in the destructor, so copying an instance would free it twice — confirm that
+// operator objects are never copied.
+template <typename Backend>
+class CudaRearrangeInfinilm : public RearrangeInfinilm {
+ public:
+  // Packs [input shape | out shape | input strides | out strides] into one
+  // host staging buffer, mirrors it in one device allocation, and transfers it
+  // with a single host-to-device copy.
+  CudaRearrangeInfinilm(const Tensor input, Tensor out)
+      : RearrangeInfinilm{input, out} {
+    size_t shape_size = ndim_ * sizeof(*d_input_shape_);
+    size_t strides_size = ndim_ * sizeof(*d_input_strides_);
+    const size_t metadata_size = 2 * (shape_size + strides_size);
+    std::vector<std::byte> metadata(metadata_size);
+
+    Backend::Malloc((void**)&d_metadata_, metadata_size);
+
+    // Each section is staged on the host at the same byte offset at which its
+    // device pointer is carved out of `d_metadata_`.
+    size_t offset = 0;
+    d_input_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, input_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_out_shape_ = reinterpret_cast<Tensor::Size*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_input_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, input_strides_.data(), strides_size);
+    offset += strides_size;
+
+    d_out_strides_ = reinterpret_cast<Tensor::Stride*>(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_strides_.data(), strides_size);
+
+    Backend::Memcpy(d_metadata_, metadata.data(), metadata_size,
+                    Backend::MemcpyHostToDevice);
+  }
+
+  // Releases the device-side metadata buffer.
+  ~CudaRearrangeInfinilm() { Backend::Free(d_metadata_); }
+
+  // Launches the copy kernel on the operator's stream (or stream 0 when none
+  // is set). The element type and block size are resolved through
+  // `DispatchFunc`; the layout metadata is the one captured at construction.
+  void operator()(const Tensor input, Tensor out) const override {
+    // An empty tensor has nothing to copy. Returning early also avoids a
+    // zero-sized block below, which would make the grid computation divide by
+    // zero and the launch configuration invalid.
+    if (output_size_ == 0) {
+      return;
+    }
+
+    auto cuda_stream =
+        static_cast<typename Backend::Stream>(stream_ ? stream_ : 0);
+    int block_size = std::min(
+        RuntimeUtils<Backend::kDeviceType>::GetOptimalBlockSize(), 1024);
+    // Never launch more threads per block than there are elements.
+    dim3 block(std::min(static_cast<Tensor::Size>(block_size), output_size_));
+    dim3 grid(utils::CeilDiv(output_size_, block.x));
+
+    DispatchFunc<AllTypes, List<128, 256, 512, 1024>>(
+        {static_cast<int64_t>(out_type_), block_size},
+        [&](auto list_tag) {
+          using T = TypeMapType<Backend::kDeviceType, ListGet<0>(list_tag)>;
+          constexpr int kBlockSize = ListGet<1>(list_tag);
+
+          RearrangeInfinilmKernel<T, kBlockSize>
+              <<<grid, block, 0, cuda_stream>>>(
+                  reinterpret_cast<T*>(out.data()),
+                  reinterpret_cast<const T*>(input.data()), d_out_shape_,
+                  d_input_shape_, d_out_strides_, d_input_strides_,
+                  output_size_, ndim_, is_out_contiguous_,
+                  is_input_contiguous_);
+        },
+        "CudaRearrangeInfinilm::operator()");
+  }
+
+ private:
+  // Single device allocation that backs the four pointers below.
+  std::byte* d_metadata_{nullptr};
+
+  // Device copy of `input_shape_` (points into `d_metadata_`).
+  Tensor::Size* d_input_shape_{nullptr};
+
+  // Device copy of `out_shape_` (points into `d_metadata_`).
+  Tensor::Size* d_out_shape_{nullptr};
+
+  // Device copy of `input_strides_` (points into `d_metadata_`).
+  Tensor::Stride* d_input_strides_{nullptr};
+
+  // Device copy of `out_strides_` (points into `d_metadata_`).
+  Tensor::Stride* d_out_strides_{nullptr};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/tests/test_rearrange_infinilm.py b/tests/test_rearrange_infinilm.py
@@ -0,0 +1,86 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import Payload, empty_strided, get_stream, randn_strided
+
+
+def _row_major_strides(shape):
+    """Return the element strides of a densely packed row-major layout.
+
+    The last dimension gets stride 1 and each earlier dimension gets the
+    product of all later extents. The result always contains the trailing 1,
+    so an empty `shape` yields `(1,)`.
+    """
+    reversed_strides = [1]
+    # Walk the extents from the last down to the second; the first extent never
+    # contributes to any stride.
+    for extent in shape[:0:-1]:
+        reversed_strides.append(reversed_strides[-1] * extent)
+
+    return tuple(reversed(reversed_strides))
+
+
+def _column_major_strides(shape):
+    """Return the element strides of a densely packed column-major layout.
+
+    The first dimension gets stride 1 and each later dimension gets the
+    product of all earlier extents. The result always contains the leading 1,
+    so an empty `shape` yields `(1,)`.
+    """
+    strides = [1]
+    # The last extent never contributes to any stride.
+    for extent in shape[:-1]:
+        strides.append(strides[-1] * extent)
+
+    return tuple(strides)
+
+
+# Each case gives a logical shape plus explicit element strides for the input
+# and the output, so the copy has to translate between two layouts.
+# NOTE(review): `auto_act_and_assert` is a project marker; presumably the
+# harness runs both `Payload` callables and compares their results — confirm.
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "shape, input_strides, out_strides",
+    (
+        ((100, 100), (1, 100), (100, 1)),
+        ((4, 4), (1, 4), (4, 1)),
+        ((4, 6, 64), (64, 4 * 64, 1), (6 * 64, 64, 1)),
+        ((2000, 2000), (1, 2000), (2000, 1)),
+        ((2001, 2001), (1, 2001), (2001, 1)),
+        ((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)),
+        (
+            (3, 4, 7, 53, 9),
+            _row_major_strides((3, 4, 7, 53, 9)),
+            _column_major_strides((3, 4, 7, 53, 9)),
+        ),
+        (
+            (3, 4, 50, 50, 5, 7),
+            _row_major_strides((3, 4, 50, 50, 5, 7)),
+            _column_major_strides((3, 4, 50, 50, 5, 7)),
+        ),
+        # Input stride 0 on the first dimension: every row reads the same data.
+        ((15, 10752), (0, 1), (10752, 1)),
+        ((2, 2, 2, 2, 2, 2), (4, 8, 16, 32, 64, 128), (64, 32, 16, 8, 4, 2)),
+        # `out_strides` is None here; presumably `empty_strided` then picks its
+        # default layout — confirm.
+        ((8, 4, 20, 64), (5120, 64, 256, 1), None),
+        # Output strides far larger than a packed layout of this shape needs.
+        ((8, 4, 20, 64), (5120, 64, 256, 1), (1048576, 262144, 64, 1)),
+    ),
+)
+# Both tolerances are zero for every dtype.
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float32, 0, 0),
+        (torch.float16, 0, 0),
+    ),
+)
+def test_rearrange_infinilm(
+    shape, input_strides, out_strides, dtype, device, rtol, atol
+):
+    """Build a strided input/output pair and describe the comparison to run.
+
+    Returns a `Payload` pairing the operator under test
+    (`_rearrange_infinilm`) with the PyTorch reference
+    (`_torch_rearrange_infinilm`), both called with `(input, out)` and no
+    keyword arguments.
+    """
+    input = randn_strided(shape, input_strides, dtype=dtype, device=device)
+    out = empty_strided(shape, out_strides, dtype=dtype, device=device)
+
+    return Payload(
+        _rearrange_infinilm,
+        _torch_rearrange_infinilm,
+        (input, out),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+def _rearrange_infinilm(input, out):
+    """Run the operator under test on `input`/`out` and return `out`."""
+    # The stream is looked up from the device that holds `input`.
+    stream = get_stream(input.device)
+    infini.ops.rearrange_infinilm(input, out, stream=stream)
+
+    return out
+
+
+def _torch_rearrange_infinilm(input, out):
+    """Reference implementation: copy `input` into `out` with PyTorch."""
+    # View `input` at `out`'s shape before copying in place.
+    expanded = input.expand_as(out)
+    out.copy_(expanded)
+
+    return out