diff --git a/src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp b/src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp index a18d60f4a..91c6b29da 100644 --- a/src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp +++ b/src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp @@ -362,6 +362,7 @@ class RoPEKernelNeox { size_t _tile_len; size_t _copy_len; size_t _half_len; + size_t _half_copy_len; size_t _batch; size_t _nhead; @@ -399,6 +400,7 @@ __aicore__ inline void RoPEKernelNeox::init(GM_ADDR y, this->_st_xnh = st_xnh; this->_st_xbatch = st_xbatch; _copy_len = alignTileLen(dh, BYTE_ALIGN); + _half_copy_len = alignTileLen(_half_len, BYTE_ALIGN); _block_idx = GetBlockIdx(); @@ -410,27 +412,28 @@ __aicore__ inline void RoPEKernelNeox::init(GM_ADDR y, pipe.InitBuffer(_in_que, BUFFER_NUM, _copy_len * sizeof(T)); pipe.InitBuffer(_out_que, BUFFER_NUM, _copy_len * sizeof(T)); - pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_len * sizeof(T)); - pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_len * sizeof(T)); + pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_copy_len * sizeof(T)); + pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_copy_len * sizeof(T)); if constexpr (std::is_same::value) { + size_t half_float_copy_len = alignTileLen(_half_len, BYTE_ALIGN); pipe.InitBuffer(_tmp_float_input, _copy_len * sizeof(float)); - pipe.InitBuffer(_tmp_float_sin, _half_len * sizeof(float)); - pipe.InitBuffer(_tmp_float_cos, _half_len * sizeof(float)); + pipe.InitBuffer(_tmp_float_sin, half_float_copy_len * sizeof(float)); + pipe.InitBuffer(_tmp_float_cos, half_float_copy_len * sizeof(float)); pipe.InitBuffer(_tmp_float_output, _tile_len * sizeof(float)); - pipe.InitBuffer(_tmp_first_half, _half_len * sizeof(float)); - pipe.InitBuffer(_tmp_second_half, _half_len * sizeof(float)); - pipe.InitBuffer(_tmp_result1, _half_len * sizeof(float)); - pipe.InitBuffer(_tmp_result2, _half_len * sizeof(float)); - pipe.InitBuffer(_tmp_result3, _half_len * sizeof(float)); - pipe.InitBuffer(_tmp_result4, _half_len * sizeof(float)); + pipe.InitBuffer(_tmp_first_half, half_float_copy_len * sizeof(float)); + pipe.InitBuffer(_tmp_second_half, half_float_copy_len * sizeof(float)); + pipe.InitBuffer(_tmp_result1, half_float_copy_len * sizeof(float)); + pipe.InitBuffer(_tmp_result2, half_float_copy_len * sizeof(float)); + pipe.InitBuffer(_tmp_result3, half_float_copy_len * sizeof(float)); + pipe.InitBuffer(_tmp_result4, half_float_copy_len * sizeof(float)); } else { - pipe.InitBuffer(_tmp_first_half, _half_len * sizeof(T)); - pipe.InitBuffer(_tmp_second_half, _half_len * sizeof(T)); - pipe.InitBuffer(_tmp_result1, _half_len * sizeof(T)); - pipe.InitBuffer(_tmp_result2, _half_len * sizeof(T)); - pipe.InitBuffer(_tmp_result3, _half_len * sizeof(T)); - pipe.InitBuffer(_tmp_result4, _half_len * sizeof(T)); + pipe.InitBuffer(_tmp_first_half, _half_copy_len * sizeof(T)); + pipe.InitBuffer(_tmp_second_half, _half_copy_len * sizeof(T)); + pipe.InitBuffer(_tmp_result1, _half_copy_len * sizeof(T)); + pipe.InitBuffer(_tmp_result2, _half_copy_len * sizeof(T)); + pipe.InitBuffer(_tmp_result3, _half_copy_len * sizeof(T)); + pipe.InitBuffer(_tmp_result4, _half_copy_len * sizeof(T)); } } @@ -446,8 +449,10 @@ __aicore__ inline void RoPEKernelNeox::copyIn(size_t i) { auto idx = batch_idx * _st_xbatch + i * _st_xnt + head_idx * _st_xnh; DataCopy(input_ub, _x_gm[idx], _copy_len); auto pos_idx = _p_gm(i); - DataCopy(sin_ub, _sin_gm[pos_idx * _half_len], _half_len); - DataCopy(cos_ub, _cos_gm[pos_idx * _half_len], _half_len); + DataCopyExtParams halfCopyParams = {1, static_cast(_half_len * sizeof(T)), 0, 0, 0}; + DataCopyPadExtParams halfPadParams{true, 0, 0, 0}; + DataCopyPad(sin_ub, _sin_gm[pos_idx * _half_len], halfCopyParams, halfPadParams); + DataCopyPad(cos_ub, _cos_gm[pos_idx * _half_len], halfCopyParams, halfPadParams); _in_que.EnQue(input_ub); _sin_que.EnQue(sin_ub); _cos_que.EnQue(cos_ub); @@ -470,26 +475,32 @@ __aicore__ inline void RoPEKernelNeox::compute(size_t i) { LocalTensor result2_f = _tmp_result2.Get(); LocalTensor result3_f = _tmp_result3.Get(); LocalTensor result4_f = _tmp_result4.Get(); + size_t half_float_copy_len = alignTileLen(_half_len, BYTE_ALIGN); Cast(input_f, input_ub, AscendC::RoundMode::CAST_NONE, _copy_len); - Cast(sin_f, sin_ub, AscendC::RoundMode::CAST_NONE, _half_len); - Cast(cos_f, cos_ub, AscendC::RoundMode::CAST_NONE, _half_len); - - for (size_t j = 0; j < _half_len; j++) { - first_half_f(j) = input_f(j); - second_half_f(j) = input_f(_half_len + j); + Cast(sin_f, sin_ub, AscendC::RoundMode::CAST_NONE, half_float_copy_len); + Cast(cos_f, cos_ub, AscendC::RoundMode::CAST_NONE, half_float_copy_len); + + for (size_t j = 0; j < half_float_copy_len; j++) { + if (j < _half_len) { + first_half_f(j) = input_f(j); + second_half_f(j) = input_f(_half_len + j); + } else { + first_half_f(j) = 0.0f; + second_half_f(j) = 0.0f; + } } PipeBarrier(); - Mul(result1_f, first_half_f, cos_f, _half_len); - Mul(result2_f, second_half_f, sin_f, _half_len); + Mul(result1_f, first_half_f, cos_f, half_float_copy_len); + Mul(result2_f, second_half_f, sin_f, half_float_copy_len); PipeBarrier(); - Sub(result3_f, result1_f, result2_f, _half_len); + Sub(result3_f, result1_f, result2_f, half_float_copy_len); - Mul(result1_f, first_half_f, sin_f, _half_len); - Mul(result2_f, second_half_f, cos_f, _half_len); + Mul(result1_f, first_half_f, sin_f, half_float_copy_len); + Mul(result2_f, second_half_f, cos_f, half_float_copy_len); PipeBarrier(); - Add(result4_f, result1_f, result2_f, _half_len); + Add(result4_f, result1_f, result2_f, half_float_copy_len); LocalTensor output_f = _tmp_float_output.Get(); for (size_t j = 0; j < _half_len; j++) { @@ -507,21 +518,26 @@ __aicore__ inline void RoPEKernelNeox::compute(size_t i) { LocalTensor result3 = _tmp_result3.Get(); LocalTensor result4 = _tmp_result4.Get(); - for (size_t j = 0; j < _half_len; j++) { - first_half(j) = input_ub(j); - second_half(j) = input_ub(_half_len + j); + for (size_t j = 0; j < _half_copy_len; j++) { + if (j < _half_len) { + first_half(j) = input_ub(j); + second_half(j) = input_ub(_half_len + j); + } else { + first_half(j) = static_cast(0); + second_half(j) = static_cast(0); + } } PipeBarrier(); - Mul(result1, first_half, cos_ub, _half_len); - Mul(result2, second_half, sin_ub, _half_len); + Mul(result1, first_half, cos_ub, _half_copy_len); + Mul(result2, second_half, sin_ub, _half_copy_len); PipeBarrier(); - Sub(result3, result1, result2, _half_len); + Sub(result3, result1, result2, _half_copy_len); - Mul(result1, first_half, sin_ub, _half_len); - Mul(result2, second_half, cos_ub, _half_len); + Mul(result1, first_half, sin_ub, _half_copy_len); + Mul(result2, second_half, cos_ub, _half_copy_len); PipeBarrier(); - Add(result4, result1, result2, _half_len); + Add(result4, result1, result2, _half_copy_len); for (size_t j = 0; j < _half_len; j++) { output_ub(j) = result3(j);