diff --git a/src/base/zeros_infinilm.h b/src/base/zeros_infinilm.h
new file mode 100644
index 000000000..280f51a2b
--- /dev/null
+++ b/src/base/zeros_infinilm.h
@@ -0,0 +1,57 @@
+#ifndef INFINI_OPS_BASE_ZEROS_INFINILM_H_
+#define INFINI_OPS_BASE_ZEROS_INFINILM_H_
+
+#include  // NOTE(review): header name is missing in this view; the `assert` calls below need <cassert> - confirm.
+
+#include "operator.h"
+
+namespace infini::ops {
+// Base class of the `ZerosInfinilm` op: snapshots tensor metadata for backends.
+class ZerosInfinilm : public Operator {
+ public:
+  ZerosInfinilm(const Tensor input, Tensor out)  // Copies metadata only.
+      : input_shape_{input.shape()},
+        input_strides_{input.strides()},
+        input_type_{input.dtype()},
+        out_shape_{out.shape()},
+        out_strides_{out.strides()},
+        out_type_{out.dtype()},
+        output_size_{out.numel()},
+        ndim_{out.ndim()},
+        is_out_contiguous_{out.IsContiguous()},
+        device_index_{out.device().index()} {  // Preconditions are asserted below.
+    assert(input_shape_ == out_shape_ &&
+           "`ZerosInfinilm` input and output shapes must match");
+    assert(input_type_ == out_type_ &&
+           "`ZerosInfinilm` input and output dtypes must match");
+    assert(!out.HasBroadcastDim() &&
+           "`ZerosInfinilm` output must not have broadcasted dimensions");
+  }
+  // Performs the op on `input`/`out`; pure virtual, so subclasses supply it.
+  virtual void operator()(const Tensor input, Tensor out) const = 0;
+
+ protected:
+  Tensor::Shape input_shape_;  // `input.shape()` at construction.
+
+  Tensor::Strides input_strides_;  // `input.strides()` at construction.
+
+  DataType input_type_;  // `input.dtype()`; asserted equal to `out_type_`.
+
+  Tensor::Shape out_shape_;  // `out.shape()`; asserted equal to `input_shape_`.
+
+  Tensor::Strides out_strides_;  // `out.strides()` at construction.
+
+  DataType out_type_;  // `out.dtype()` at construction.
+
+  Tensor::Size output_size_{0};  // `out.numel()`.
+
+  Tensor::Size ndim_{0};  // `out.ndim()`.
+
+  bool is_out_contiguous_{false};  // `out.IsContiguous()`.
+
+  int device_index_{0};  // `out.device().index()`.
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h b/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h
new file mode 100644
index 000000000..0360063ea
--- /dev/null
+++ b/src/native/cuda/iluvatar/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_ILUVATAR_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_ILUVATAR_ZEROS_INFINILM_KERNEL_H_
+
+#include  // NOTE(review): header name is missing in this view.
+
+#include "native/cuda/iluvatar/caster.cuh"
+#include "native/cuda/iluvatar/runtime_.h"
+#include "native/cuda/ops/zeros_infinilm/kernel.h"
+
+namespace infini::ops {
+// Explicit `Operator` specialization declared by the Iluvatar backend header.
+template <>
+class Operator  // NOTE(review): template arguments are not visible in this view.
+    : public CudaZerosInfinilm> {
+ public:
+  using CudaZerosInfinilm>::CudaZerosInfinilm;  // Inherits the base constructors.
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/metax/ops/zeros_infinilm/kernel.h b/src/native/cuda/metax/ops/zeros_infinilm/kernel.h
new file mode 100644
index 000000000..a6ff877e7
--- /dev/null
+++ b/src/native/cuda/metax/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_METAX_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_METAX_ZEROS_INFINILM_KERNEL_H_
+
+#include  // NOTE(review): header name is missing in this view.
+
+#include "native/cuda/metax/caster.cuh"
+#include "native/cuda/metax/runtime_.h"
+#include "native/cuda/ops/zeros_infinilm/kernel.h"
+
+namespace infini::ops {
+// Explicit `Operator` specialization declared by the Metax backend header.
+template <>
+class Operator  // NOTE(review): template arguments are not visible in this view.
+    : public CudaZerosInfinilm> {
+ public:
+  using CudaZerosInfinilm>::CudaZerosInfinilm;  // Inherits the base constructors.
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/moore/ops/zeros_infinilm/kernel.h b/src/native/cuda/moore/ops/zeros_infinilm/kernel.h
new file mode 100644
index 000000000..1e929a78e
--- /dev/null
+++ b/src/native/cuda/moore/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,22 @@
+#ifndef INFINI_OPS_MOORE_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_MOORE_ZEROS_INFINILM_KERNEL_H_
+
+#include  // NOTE(review): header name is missing in this view.
+
+#include "native/cuda/moore/caster.cuh"
+#include "native/cuda/moore/polyfills.cuh"
+#include "native/cuda/moore/runtime_.h"
+#include "native/cuda/ops/zeros_infinilm/kernel.h"
+
+namespace infini::ops {
+// Explicit `Operator` specialization declared by the Moore backend header.
+template <>
+class Operator  // NOTE(review): template arguments are not visible in this view.
+    : public CudaZerosInfinilm> {
+ public:
+  using CudaZerosInfinilm>::CudaZerosInfinilm;  // Inherits the base constructors.
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h b/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h
new file mode 100644
index 000000000..a6092c5dc
--- /dev/null
+++ b/src/native/cuda/nvidia/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,21 @@
+#ifndef INFINI_OPS_NVIDIA_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_NVIDIA_ZEROS_INFINILM_KERNEL_H_
+
+#include
+
+#include
"native/cuda/nvidia/caster.cuh"
+#include "native/cuda/nvidia/runtime_.h"
+#include "native/cuda/ops/zeros_infinilm/kernel.h"
+
+namespace infini::ops {
+// Explicit `Operator` specialization declared by the NVIDIA backend header.
+template <>
+class Operator  // NOTE(review): template arguments are not visible in this view.
+    : public CudaZerosInfinilm> {
+ public:
+  using CudaZerosInfinilm>::CudaZerosInfinilm;  // Inherits the base constructors.
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/ops/zeros_infinilm/kernel.cuh b/src/native/cuda/ops/zeros_infinilm/kernel.cuh
new file mode 100644
index 000000000..7f94d2a70
--- /dev/null
+++ b/src/native/cuda/ops/zeros_infinilm/kernel.cuh
@@ -0,0 +1,37 @@
+#ifndef INFINI_OPS_CUDA_ZEROS_INFINILM_KERNEL_CUH_
+#define INFINI_OPS_CUDA_ZEROS_INFINILM_KERNEL_CUH_
+
+#include
+
+#include "native/cuda/caster.cuh"
+#include "native/cuda/kernel_commons.cuh"
+
+namespace infini::ops {
+
+// Stores `Caster`'s conversion of 0 into each of the `output_size` elements of
+// `out`.
+//
+// Each thread handles at most one element. Thread `idx` writes `out[idx]` when
+// `out_contiguous` is true; otherwise the element offset is obtained from
+// `IndexToOffset(idx, ndim, out_shape, out_strides)`. Threads with
+// `idx >= output_size` do nothing.
+template
+__global__ void ZerosInfinilmKernel(T* __restrict__ out,
+                                    const size_t* __restrict__ out_shape,
+                                    const ptrdiff_t* __restrict__ out_strides,
+                                    size_t output_size, size_t ndim,
+                                    bool out_contiguous) {
+  // Widen before multiplying: `blockIdx.x * blockDim.x` is 32-bit unsigned
+  // arithmetic and wraps once the product reaches 2^32.
+  size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+  if (idx < output_size) {
+    size_t out_idx =
+        out_contiguous ? idx : IndexToOffset(idx, ndim, out_shape, out_strides);
+    out[out_idx] = Caster::template Cast(0);
+  }
+}
+
+}  // namespace infini::ops
+
+#endif
diff --git a/src/native/cuda/ops/zeros_infinilm/kernel.h b/src/native/cuda/ops/zeros_infinilm/kernel.h
new file mode 100644
index 000000000..4b8db9277
--- /dev/null
+++ b/src/native/cuda/ops/zeros_infinilm/kernel.h
@@ -0,0 +1,108 @@
+#ifndef INFINI_OPS_CUDA_ZEROS_INFINILM_KERNEL_H_
+#define INFINI_OPS_CUDA_ZEROS_INFINILM_KERNEL_H_
+
+#include
+#include
+#include
+#include
+
+#include "base/zeros_infinilm.h"
+#include "common/generic_utils.h"
+#include "data_type.h"
+#include "dispatcher.h"
+#include "native/cuda/kernel_commons.cuh"
+#include "native/cuda/ops/zeros_infinilm/kernel.cuh"
+#include "native/cuda/runtime_utils.h"
+
+namespace infini::ops {
+
+// `ZerosInfinilm` implementation that launches `ZerosInfinilmKernel`.
+//
+// The constructor uploads the output shape and strides into one device buffer
+// (`d_metadata_`), `operator()` passes pointers into that buffer to the kernel,
+// and the destructor frees it.
+//
+// NOTE(review): the destructor frees `d_metadata_` but no copy/move operations
+// are declared here, so a copied instance would free the same buffer twice.
+// Confirm instances are never copied, or delete the copy operations.
+template
+class CudaZerosInfinilm : public ZerosInfinilm {
+ public:
+  // Records tensor metadata through the base class, then uploads `out`'s shape
+  // and strides to the device, packed back to back as [shape | strides].
+  CudaZerosInfinilm(const Tensor input, Tensor out)
+      : ZerosInfinilm{input, out} {
+    size_t shape_size = ndim_ * sizeof(*d_out_shape_);
+    size_t strides_size = ndim_ * sizeof(*d_out_strides_);
+    const size_t metadata_size = shape_size + strides_size;
+    // Host staging buffer so both arrays are uploaded with a single `Memcpy`.
+    std::vector metadata(metadata_size);
+
+    Backend::Malloc(reinterpret_cast<void**>(&d_metadata_), metadata_size);
+
+    size_t offset = 0;
+    d_out_shape_ = reinterpret_cast(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_shape_.data(), shape_size);
+    offset += shape_size;
+
+    d_out_strides_ = reinterpret_cast(d_metadata_ + offset);
+    std::memcpy(metadata.data() + offset, out_strides_.data(), strides_size);
+
+    Backend::Memcpy(d_metadata_, metadata.data(), metadata_size,
+                    Backend::MemcpyHostToDevice);
+  }
+
+  // Frees the device buffer allocated by the constructor.
+  ~CudaZerosInfinilm() { Backend::Free(d_metadata_); }
+
+  // Launches `ZerosInfinilmKernel` on `out`; `input` is not read.
+  void operator()(const Tensor input, Tensor out) const override {
+    (void)input;
+
+    // An empty output needs no launch. Returning early also avoids building a
+    // block with `block.x == 0`, which is not a valid launch configuration and
+    // would be handed to `CeilDiv` as its second argument (presumably the
+    // divisor).
+    if (output_size_ == 0) {
+      return;
+    }
+
+    auto cuda_stream =
+        static_cast(stream_ ? stream_ : 0);
+    // Cap the block size reported by `GetOptimalBlockSize()` at 1024, and use
+    // no more threads per block than there are elements.
+    int block_size = std::min(
+        RuntimeUtils::GetOptimalBlockSize(), 1024);
+    dim3 block(std::min(static_cast(block_size), output_size_));
+    dim3 grid(utils::CeilDiv(output_size_, block.x));
+
+    // Dispatch on the output dtype and block size; the lambda reads both from
+    // `list_tag` and launches the matching kernel instantiation.
+    DispatchFunc>(
+        {static_cast(out_type_), block_size},
+        [&](auto list_tag) {
+          using T = TypeMapType(list_tag)>;
+          constexpr int kBlockSize = ListGet<1>(list_tag);
+
+          ZerosInfinilmKernel
+              <<>>(
+                  reinterpret_cast(out.data()), d_out_shape_,
+                  d_out_strides_, output_size_, ndim_, is_out_contiguous_);
+        },
+        "CudaZerosInfinilm::operator()");
+  }
+
+ private:
+  // One device allocation holding the shape entries followed by the strides.
+  std::byte* d_metadata_{nullptr};
+
+  // Points at the shape entries at the start of `d_metadata_`.
+  Tensor::Size* d_out_shape_{nullptr};
+
+  // Points at the stride entries that follow the shape entries.
+  Tensor::Stride* d_out_strides_{nullptr};
+};
+
+}  // namespace infini::ops
+
+#endif
diff --git a/tests/test_zeros_infinilm.py b/tests/test_zeros_infinilm.py
new file mode 100644
index 000000000..38026503c
--- /dev/null
+++ b/tests/test_zeros_infinilm.py
@@ -0,0 +1,99 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import (
+    Payload,
+    empty_strided,
+    get_stream,
+    randint_strided,
+    randn_strided,
+)
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "shape, input_strides, out_strides, inplace",
+    (
+        ((13, 4), None, None, False),
+        ((13, 4), None, None, True),
+        ((13, 4), (10, 1), (10, 1), False),
+        ((13, 4), (0, 1), None, False),
+        ((13, 4, 4), None, None, False),
+        ((13, 4, 4), None, None, True),
+        ((13, 4, 4), (20, 4, 1), (20, 4, 1), False),
+        ((16, 5632), None, None, False),
+        ((16, 5632), None, None, True),
+        ((16, 5632), (13312, 1), (13312, 1), False),
+        ((4, 4, 5632), None, None, False),
+        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), False),
+    ),
+)
+@pytest.mark.parametrize(
+    "dtype",
+    (
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.int64,
+        torch.float64,
+        torch.float32,
+        torch.float16,
+        torch.bfloat16,
+    ),
+)
+def test_zeros_infinilm(shape, input_strides, out_strides, inplace, dtype, device):
+    """Pairs `infini.ops.zeros_infinilm` with `Tensor.zero_` for comparison.
+
+    The returned `Payload` requests exact equality (`rtol=0`, `atol=0`). It is
+    presumably executed by the `auto_act_and_assert` machinery, which is not
+    visible in this file.
+    """
+    if device == "musa" and dtype == torch.float64:
+        pytest.skip("MUSA does not support float64 zeros_infinilm")
+
+    input = _make_input(shape, input_strides, dtype=dtype, device=device)
+    out = (
+        input
+        if inplace
+        else empty_strided(shape, out_strides, dtype=dtype, device=device)
+    )
+    if not inplace:
+        # Presumably so that an op which writes nothing cannot pass by accident.
+        _fill_nonzero(out)
+
+    return Payload(
+        _zeros_infinilm,
+        _torch_zeros_infinilm,
+        (input, out),
+        {},
+        rtol=0,
+        atol=0,
+    )
+
+
+def _make_input(shape, strides, *, dtype, device):
+    """Returns random input: `randn_strided` for floats, else `randint_strided`."""
+    if dtype.is_floating_point:
+        return randn_strided(shape, strides, dtype=dtype, device=device)
+    return randint_strided(1, 16, shape, strides, dtype=dtype, device=device)
+
+
+def _fill_nonzero(tensor):
+    """Fills `tensor` in place with 1, for every dtype."""
+    tensor.fill_(1)
+
+
+def _zeros_infinilm(input, out):
+    """Runs the op under test with `get_stream(input.device)` and returns `out`."""
+    infini.ops.zeros_infinilm(input, out, stream=get_stream(input.device))
+
+    return out
+
+
+def _torch_zeros_infinilm(input, out):
+    """Reference implementation: zeroes `out` in place and returns it."""
+    out.zero_()
+
+    return out