diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index d07c3a16a84..097bd0cb9de 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -926,7 +926,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) { return true; } - break; + break; } case GGML_OP_FLASH_ATTN_EXT: { if (op->src[4] != nullptr) { diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index de8bcdb38de..44aca2f4aba 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -35,7 +35,7 @@ OutputVector translate_rope(const NodeContext & context) { ov::Output res; - auto data_node = context.get_input(0).get_node_shared_ptr(); + auto data_node = process_view_input_new(context, 0).get_node_shared_ptr(); auto output_shape = context.get_output_shape().to_shape(); int32_t * op_params = context.get_output_op_params(); const int mode = op_case; @@ -55,7 +55,16 @@ OutputVector translate_rope(const NodeContext & context) { if (context.get_input_size() == 3) { rope_freqs_weight = context.get_input(2).get_node_shared_ptr(); } - auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE); + std::shared_ptr token_len_per_seq; + if (context.has_input("token_len_per_seq")) { + token_len_per_seq = context.get_input("token_len_per_seq").get_node_shared_ptr(); + } + auto sin_cos = make_sin_cos(op_params, + inp_pos, + rope_freqs_weight, + mode == TYPE_IMROPE, + false, + token_len_per_seq); sin_theta_node = sin_cos.first; cos_theta_node = sin_cos.second; } diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 189de0fc37f..c22d95e05a8 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -124,6 +124,12 @@ void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) if (ggml_model_decoder.has_mixed_rope_params()) { return; } + // Dynamic active-sequence slicing is reconstructed per ROPE node. Reusing a + // single shared rope_sin/rope_cos across the whole graph is unsafe here, + // because the graph-level inp_pos does not necessarily match each ROPE use. + if (tensor_map.find("seq_active_start") != tensor_map.end() && tensor_map.find("seq_active_end") != tensor_map.end()) { + return; + } int32_t * rope_params = ggml_model_decoder.get_rope_params(); if (tensor_map.find("inp_pos") == tensor_map.end() || rope_params == nullptr) { return; diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index e0344aee3b8..c4082e071ee 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -121,7 +121,8 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params std::shared_ptr inp_pos, std::shared_ptr rope_freqs_weight, bool imrope, - bool stateful) { + bool stateful, + std::shared_ptr token_len_per_seq) { if (stateful) { inp_pos = std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); inp_pos = std::make_shared(inp_pos, ov::element::f32); @@ -140,6 +141,13 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params auto pos_perm = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 3, 1, 2}); inp_pos = std::make_shared(inp_pos, pos_perm); + + if (!imrope && token_len_per_seq) { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + inp_pos = std::make_shared(inp_pos, zero, token_len_per_seq, one, axis); + } } float freq_base; diff --git a/ggml/src/ggml-openvino/openvino/utils.h b/ggml/src/ggml-openvino/openvino/utils.h index 53f793b57d7..343491e0f2c 100644 --- a/ggml/src/ggml-openvino/openvino/utils.h +++ b/ggml/src/ggml-openvino/openvino/utils.h @@ -68,7 +68,8 @@ std::pair, ov::Output> make_sin_cos(int32_t* rope_params, std::shared_ptr inp_pos, std::shared_ptr rope_freqs_weight = nullptr, bool imrope = false, - bool stateful = false); + bool stateful = false, + std::shared_ptr token_len_per_seq = nullptr); ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len = 0);