diff --git a/src/base/rearrange_infinilm.h b/src/base/rearrange_infinilm.h new file mode 100644 index 000000000..1131b0bf5 --- /dev/null +++ b/src/base/rearrange_infinilm.h @@ -0,0 +1,60 @@ +#ifndef INFINI_OPS_BASE_REARRANGE_INFINILM_H_ +#define INFINI_OPS_BASE_REARRANGE_INFINILM_H_ + +#include + +#include "operator.h" + +namespace infini::ops { + +class RearrangeInfinilm : public Operator { // Backend-independent base; caches `input`/`out` metadata at construction. + public: + RearrangeInfinilm(const Tensor input, Tensor out) + : input_shape_{input.shape()}, + input_strides_{input.strides()}, + input_type_{input.dtype()}, + out_shape_{out.shape()}, + out_strides_{out.strides()}, + out_type_{out.dtype()}, + output_size_{out.numel()}, + ndim_{out.ndim()}, + is_input_contiguous_{input.IsContiguous()}, + is_out_contiguous_{out.IsContiguous()}, + device_index_{out.device().index()} { + assert(input_shape_ == out_shape_ && + "`RearrangeInfinilm` input and output shapes must match"); + assert(input_type_ == out_type_ && + "`RearrangeInfinilm` input and output dtypes must match"); + assert(!out.HasBroadcastDim() && + "`RearrangeInfinilm` output must not have broadcasted dimensions"); + } + + virtual void operator()(const Tensor input, Tensor out) const = 0; // Implemented by each backend. + + protected: + Tensor::Shape input_shape_; + + Tensor::Strides input_strides_; + + DataType input_type_; + + Tensor::Shape out_shape_; + + Tensor::Strides out_strides_; + + DataType out_type_; + + Tensor::Size output_size_{0}; // `out.numel()`. + + Tensor::Size ndim_{0}; // `out.ndim()`. + + bool is_input_contiguous_{false}; + + bool is_out_contiguous_{false}; + + int device_index_{0}; // `out.device().index()`. +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h new file mode 100644 index 000000000..5ba9a9198 --- /dev/null +++ b/src/native/cuda/iluvatar/ops/rearrange_infinilm/kernel.h @@ -0,0 +1,22 @@ +#ifndef INFINI_OPS_ILUVATAR_REARRANGE_INFINILM_KERNEL_H_ +#define INFINI_OPS_ILUVATAR_REARRANGE_INFINILM_KERNEL_H_ + +#include + 
+#include "native/cuda/iluvatar/caster.cuh" +#include "native/cuda/iluvatar/runtime_.h" +#include "native/cuda/ops/rearrange_infinilm/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaRearrangeInfinilm> { + public: + using CudaRearrangeInfinilm< + Runtime>::CudaRearrangeInfinilm; // Inherit the base-class constructors. +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h b/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h new file mode 100644 index 000000000..4a92bf678 --- /dev/null +++ b/src/native/cuda/metax/ops/rearrange_infinilm/kernel.h @@ -0,0 +1,22 @@ +#ifndef INFINI_OPS_METAX_REARRANGE_INFINILM_KERNEL_H_ +#define INFINI_OPS_METAX_REARRANGE_INFINILM_KERNEL_H_ + +#include + +#include "native/cuda/metax/caster.cuh" +#include "native/cuda/metax/runtime_.h" +#include "native/cuda/ops/rearrange_infinilm/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaRearrangeInfinilm> { + public: + using CudaRearrangeInfinilm< + Runtime>::CudaRearrangeInfinilm; // Inherit the base-class constructors. +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h b/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h new file mode 100644 index 000000000..bd8e69896 --- /dev/null +++ b/src/native/cuda/moore/ops/rearrange_infinilm/kernel.h @@ -0,0 +1,23 @@ +#ifndef INFINI_OPS_MOORE_REARRANGE_INFINILM_KERNEL_H_ +#define INFINI_OPS_MOORE_REARRANGE_INFINILM_KERNEL_H_ + +#include + +#include "native/cuda/moore/caster.cuh" +#include "native/cuda/moore/polyfills.cuh" +#include "native/cuda/moore/runtime_.h" +#include "native/cuda/ops/rearrange_infinilm/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaRearrangeInfinilm> { + public: + using CudaRearrangeInfinilm< + Runtime>::CudaRearrangeInfinilm; // Inherit the base-class constructors. +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h 
b/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h new file mode 100644 index 000000000..8ab25c69a --- /dev/null +++ b/src/native/cuda/nvidia/ops/rearrange_infinilm/kernel.h @@ -0,0 +1,22 @@ +#ifndef INFINI_OPS_NVIDIA_REARRANGE_INFINILM_KERNEL_H_ +#define INFINI_OPS_NVIDIA_REARRANGE_INFINILM_KERNEL_H_ + +#include + +#include "native/cuda/nvidia/caster.cuh" +#include "native/cuda/nvidia/runtime_.h" +#include "native/cuda/ops/rearrange_infinilm/kernel.h" + +namespace infini::ops { + +template <> +class Operator + : public CudaRearrangeInfinilm> { + public: + using CudaRearrangeInfinilm< + Runtime>::CudaRearrangeInfinilm; // Inherit the base-class constructors. +}; + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/ops/rearrange_infinilm/kernel.cuh b/src/native/cuda/ops/rearrange_infinilm/kernel.cuh new file mode 100644 index 000000000..143434449 --- /dev/null +++ b/src/native/cuda/ops/rearrange_infinilm/kernel.cuh @@ -0,0 +1,36 @@ +#ifndef INFINI_OPS_CUDA_REARRANGE_INFINILM_KERNEL_CUH_ +#define INFINI_OPS_CUDA_REARRANGE_INFINILM_KERNEL_CUH_ + +#include + +#include "native/cuda/kernel_commons.cuh" + +namespace infini::ops { + +// Copies `input` into `out` one element per thread; each flat index is mapped +// through `IndexToOffset` unless the corresponding tensor is contiguous. +template +__global__ void RearrangeInfinilmKernel( + T* __restrict__ out, const T* __restrict__ input, + const size_t* __restrict__ out_shape, + const size_t* __restrict__ input_shape, + const ptrdiff_t* __restrict__ out_strides, + const ptrdiff_t* __restrict__ input_strides, size_t output_size, + size_t ndim, bool out_contiguous, bool input_contiguous) { + // Widen before multiplying: `blockIdx.x * blockDim.x` is evaluated in 32-bit + // unsigned arithmetic and wraps once `output_size` reaches 2^32 elements. + size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; + + if (idx < output_size) { + size_t out_idx = + out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides); + size_t input_idx = + input_contiguous ? 
idx + : IndexToOffset(idx, ndim, input_shape, input_strides); + out[out_idx] = input[input_idx]; + } +} + +} // namespace infini::ops + +#endif diff --git a/src/native/cuda/ops/rearrange_infinilm/kernel.h b/src/native/cuda/ops/rearrange_infinilm/kernel.h new file mode 100644 index 000000000..433636612 --- /dev/null +++ b/src/native/cuda/ops/rearrange_infinilm/kernel.h @@ -0,0 +1,104 @@ +#ifndef INFINI_OPS_CUDA_REARRANGE_INFINILM_KERNEL_H_ +#define INFINI_OPS_CUDA_REARRANGE_INFINILM_KERNEL_H_ + +#include +#include +#include +#include + +#include "base/rearrange_infinilm.h" +#include "common/generic_utils.h" +#include "data_type.h" +#include "dispatcher.h" +#include "native/cuda/kernel_commons.cuh" +#include "native/cuda/ops/rearrange_infinilm/kernel.cuh" +#include "native/cuda/runtime_utils.h" + +namespace infini::ops { + +// Implementation shared by the CUDA-style backends. Shape and stride arrays +// are uploaded to the device once, at construction, and reused by every call. +template +class CudaRearrangeInfinilm : public RearrangeInfinilm { + public: + CudaRearrangeInfinilm(const Tensor input, Tensor out) + : RearrangeInfinilm{input, out} { + // Stage both shapes and both stride arrays in one host buffer so that a + // single allocation and a single host-to-device copy carry all metadata. + size_t shape_size = ndim_ * sizeof(*d_input_shape_); + size_t strides_size = ndim_ * sizeof(*d_input_strides_); + const size_t metadata_size = 2 * (shape_size + strides_size); + std::vector metadata(metadata_size); + + Backend::Malloc(reinterpret_cast<void**>(&d_metadata_), metadata_size); + + size_t offset = 0; + d_input_shape_ = reinterpret_cast(d_metadata_ + offset); + std::memcpy(metadata.data() + offset, input_shape_.data(), shape_size); + offset += shape_size; + + d_out_shape_ = reinterpret_cast(d_metadata_ + offset); + std::memcpy(metadata.data() + offset, out_shape_.data(), shape_size); + offset += shape_size; + + d_input_strides_ = reinterpret_cast(d_metadata_ + offset); + std::memcpy(metadata.data() + offset, input_strides_.data(), strides_size); + offset += strides_size; + + d_out_strides_ = reinterpret_cast(d_metadata_ + offset); + std::memcpy(metadata.data() + offset, out_strides_.data(), strides_size); + + Backend::Memcpy(d_metadata_, metadata.data(), metadata_size, + 
Backend::MemcpyHostToDevice); + } + + // NOTE(review): this class owns `d_metadata_`; a copy would free it twice + // unless `Operator` already forbids copying. Confirm against `operator.h`. + ~CudaRearrangeInfinilm() { Backend::Free(d_metadata_); } + + void operator()(const Tensor input, Tensor out) const override { + // An empty tensor has nothing to copy. Returning here also avoids a + // zero-sized block and the division by `block.x == 0` in `CeilDiv` below. + if (output_size_ == 0) { + return; + } + + auto cuda_stream = + static_cast(stream_ ? stream_ : 0); + int block_size = std::min( + RuntimeUtils::GetOptimalBlockSize(), 1024); + dim3 block(std::min(static_cast(block_size), output_size_)); + dim3 grid(utils::CeilDiv(output_size_, block.x)); + + DispatchFunc>( + {static_cast(out_type_), block_size}, + [&](auto list_tag) { + using T = TypeMapType(list_tag)>; + constexpr int kBlockSize = ListGet<1>(list_tag); + + RearrangeInfinilmKernel + <<>>( + reinterpret_cast(out.data()), + reinterpret_cast(input.data()), d_out_shape_, + d_input_shape_, d_out_strides_, d_input_strides_, + output_size_, ndim_, is_out_contiguous_, + is_input_contiguous_); + }, + "CudaRearrangeInfinilm::operator()"); + } + + private: + std::byte* d_metadata_{nullptr}; + + Tensor::Size* d_input_shape_{nullptr}; + + Tensor::Size* d_out_shape_{nullptr}; + + Tensor::Stride* d_input_strides_{nullptr}; + + Tensor::Stride* d_out_strides_{nullptr}; +}; + +} // namespace infini::ops + +#endif diff --git a/tests/test_rearrange_infinilm.py b/tests/test_rearrange_infinilm.py new file mode 100644 index 000000000..d9ad54fa7 --- /dev/null +++ b/tests/test_rearrange_infinilm.py @@ -0,0 +1,88 @@ +import infini.ops +import pytest +import torch + +from tests.utils import Payload, empty_strided, get_stream, randn_strided + + +# Strides of a row-major (C-contiguous) layout for `shape`. +def _row_major_strides(shape): + stride = 1 + strides = [1] + for dim in reversed(shape[1:]): + stride *= dim + strides.insert(0, stride) + + return tuple(strides) + + +# Strides of a column-major layout for `shape`. +def _column_major_strides(shape): + stride = 1 + strides = [stride] + for dim in shape[:-1]: + stride *= dim + strides.append(stride) + + return tuple(strides) + + +@pytest.mark.auto_act_and_assert +@pytest.mark.parametrize( + "shape, input_strides, out_strides", + ( + ((100, 100), (1, 100), (100, 1)), + ((4, 4), (1, 4), (4, 1)), + ((4, 6, 64), (64, 4 * 64, 1), (6 
* 64, 64, 1)), + ((2000, 2000), (1, 2000), (2000, 1)), + ((2001, 2001), (1, 2001), (2001, 1)), + ((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)), + ( + (3, 4, 7, 53, 9), + _row_major_strides((3, 4, 7, 53, 9)), + _column_major_strides((3, 4, 7, 53, 9)), + ), + ( + (3, 4, 50, 50, 5, 7), + _row_major_strides((3, 4, 50, 50, 5, 7)), + _column_major_strides((3, 4, 50, 50, 5, 7)), + ), + ((15, 10752), (0, 1), (10752, 1)),  # Input has stride 0 along dim 0. + ((2, 2, 2, 2, 2, 2), (4, 8, 16, 32, 64, 128), (64, 32, 16, 8, 4, 2)), + ((8, 4, 20, 64), (5120, 64, 256, 1), None), + ((8, 4, 20, 64), (5120, 64, 256, 1), (1048576, 262144, 64, 1)), + ), +) +@pytest.mark.parametrize( + ("dtype", "rtol", "atol"), + ( + (torch.float32, 0, 0), + (torch.float16, 0, 0), + ), +) +def test_rearrange_infinilm( + shape, input_strides, out_strides, dtype, device, rtol, atol +): + input = randn_strided(shape, input_strides, dtype=dtype, device=device) + out = empty_strided(shape, out_strides, dtype=dtype, device=device) + + return Payload( + _rearrange_infinilm, + _torch_rearrange_infinilm, + (input, out), + {}, + rtol=rtol, + atol=atol, + ) + + +def _rearrange_infinilm(input, out): + infini.ops.rearrange_infinilm(input, out, stream=get_stream(input.device)) + + return out + + +def _torch_rearrange_infinilm(input, out): + out.copy_(input.expand_as(out))  # The same copy, performed by PyTorch. + + return out