From c7c2a442d7ef6b6ce55c2be9cf44426553946883 Mon Sep 17 00:00:00 2001 From: wangpengcheng Date: Thu, 11 Jun 2026 02:26:27 +0000 Subject: [PATCH] issue/424 -Clean up unused code. --- csrc/cache/kv_cache.cpp | 225 +-------- csrc/cache/kv_cache.hpp | 173 +------ csrc/config/config_factory.cpp | 9 +- csrc/{models => }/debug_utils/hooks.cpp | 0 csrc/{models => }/debug_utils/hooks.hpp | 0 .../{models => }/debug_utils/tensor_utils.hpp | 0 csrc/engine/rank_worker.cpp | 32 +- .../layers/causal_lm_templates/text_model.hpp | 3 - .../rotary_embedding_factory.cpp | 1 - .../baichuan/baichuan_for_causal_lm.hpp | 1 - csrc/models/fm9g/fm9g_for_causal_lm.cpp | 4 - csrc/models/fm9g/fm9g_for_causal_lm.hpp | 1 - csrc/models/glm4/glm4_decoder_layer.hpp | 2 - csrc/models/infinilm_model.cpp | 1 - csrc/models/infinilm_model.hpp | 1 - .../internlm3/internlm3_for_causal_lm.hpp | 1 - csrc/models/llama/llama_for_causal_lm.cpp | 6 +- .../llama_legacy/legacy_fused_linear.cpp | 396 ---------------- .../llama_legacy/legacy_fused_linear.hpp | 328 ------------- csrc/models/llama_legacy/llama.hpp | 24 - csrc/models/llama_legacy/llama_attention.cpp | 439 ------------------ csrc/models/llama_legacy/llama_attention.hpp | 142 ------ csrc/models/llama_legacy/llama_config.hpp | 95 ---- .../llama_legacy/llama_decoder_layer.cpp | 50 -- .../llama_legacy/llama_decoder_layer.hpp | 89 ---- .../llama_legacy/llama_for_causal_lm.cpp | 51 -- .../llama_legacy/llama_for_causal_lm.hpp | 63 --- csrc/models/llama_legacy/llama_mlp.cpp | 97 ---- csrc/models/llama_legacy/llama_mlp.hpp | 81 ---- csrc/models/llama_legacy/llama_model.cpp | 125 ----- csrc/models/llama_legacy/llama_model.hpp | 98 ---- csrc/models/model_factory.cpp | 21 - csrc/models/model_factory.hpp | 8 - csrc/models/qwen2/qwen2_for_causal_lm.cpp | 6 +- csrc/models/qwen3_moe/qwen3_moe_experts.hpp | 2 - .../qwen3_moe/qwen3_moe_topk_router.hpp | 2 - csrc/pybind11/bindings.cc | 5 +- csrc/pybind11/engine/engine.hpp | 32 +- csrc/pybind11/models/llama_legacy.hpp | 216 --------- .../models/llama/configuration_llama.py | 13 +- xmake.lua | 9 - 41 files changed, 66 insertions(+), 2786 deletions(-) rename csrc/{models => }/debug_utils/hooks.cpp (100%) rename csrc/{models => }/debug_utils/hooks.hpp (100%) rename csrc/{models => }/debug_utils/tensor_utils.hpp (100%) delete mode 100644 csrc/models/llama_legacy/legacy_fused_linear.cpp delete mode 100644 csrc/models/llama_legacy/legacy_fused_linear.hpp delete mode 100644 csrc/models/llama_legacy/llama.hpp delete mode 100644 csrc/models/llama_legacy/llama_attention.cpp delete mode 100644 csrc/models/llama_legacy/llama_attention.hpp delete mode 100644 csrc/models/llama_legacy/llama_config.hpp delete mode 100644 csrc/models/llama_legacy/llama_decoder_layer.cpp delete mode 100644 csrc/models/llama_legacy/llama_decoder_layer.hpp delete mode 100644 csrc/models/llama_legacy/llama_for_causal_lm.cpp delete mode 100644 csrc/models/llama_legacy/llama_for_causal_lm.hpp delete mode 100644 csrc/models/llama_legacy/llama_mlp.cpp delete mode 100644 csrc/models/llama_legacy/llama_mlp.hpp delete mode 100644 csrc/models/llama_legacy/llama_model.cpp delete mode 100644 csrc/models/llama_legacy/llama_model.hpp delete mode 100644 csrc/pybind11/models/llama_legacy.hpp diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index df0ceb29d..92734c05f 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -2,8 +2,6 @@ #include "../global_state/global_state.hpp" #include "../utils.hpp" -#include "infinicore/ops.hpp" -#include namespace infinilm::cache { // ========================== @@ -32,58 +30,12 @@ StaticKVCacheConfig::max_cache_len() const { return max_cache_len_; } +namespace StaticKVCache { + // ========================== // StaticKVCache // ========================== - -StaticKVCache::StaticKVCache( - infinicore::Size k_dim, - infinicore::Size v_dim, - infinicore::Size num_k_heads, - infinicore::Size num_v_heads, - infinicore::Size num_layers, - infinicore::Size max_positional_embedding, - infinicore::DataType dtype, - const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info) - : Cache(), - k_dim_(k_dim), - v_dim_(v_dim), - rank_batch_size_(config.max_batch_size()), - cache_len_(config.max_cache_len() == std::numeric_limits::max() || config.max_cache_len() == 0 ? max_positional_embedding : config.max_cache_len()), - rank_num_layers_(num_layers), - dtype_(dtype) { - - bool is_kv_replica = (num_k_heads < rank_info.tp_size && num_v_heads < rank_info.tp_size && num_k_heads == num_v_heads && rank_info.tp_size % num_k_heads == 0); - - num_rank_k_heads_ = is_kv_replica ? 1 : (num_k_heads / rank_info.tp_size); - num_rank_v_heads_ = is_kv_replica ? 1 : (num_v_heads / rank_info.tp_size); - // Allocate K cache - k_caches_ = infinicore::Tensor::empty( - {rank_num_layers_, - rank_batch_size_, - num_rank_k_heads_, - cache_len_, - k_dim_}, - dtype_, - rank_info.device); - set_zeros(k_caches_); - - // Allocate V cache - v_caches_ = infinicore::Tensor::empty( - {rank_num_layers_, - rank_batch_size_, - num_rank_v_heads_, - cache_len_, - v_dim_}, - dtype_, - rank_info.device); - set_zeros(v_caches_); - - infinicore::context::syncStream(); -} - -infinicore::Tensor StaticKVCache::create_layer_kv_cache( +infinicore::Tensor create_layer_kv_cache( const infinicore::Size k_dim, const infinicore::Size v_dim, const infinicore::Size num_k_heads, @@ -120,45 +72,7 @@ infinicore::Tensor StaticKVCache::create_layer_kv_cache( return kv_cache; } - -std::tuple -StaticKVCache::update(size_t layer_idx, - const infinicore::Tensor &k, - const infinicore::Tensor &v, - const infinicore::Tensor &past_sequence_lengths) { - ASSERT(layer_idx < rank_num_layers_); - - auto batch_size = k->size(0); - auto update_len = k->size(2); - - ASSERT_EQ(batch_size, rank_batch_size_); - - auto k_cache_layer = k_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); - auto v_cache_layer = v_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); - - auto device = k_cache_layer->device(); - -#ifdef ENABLE_KV_CACHING - infinicore::op::kv_caching_( - k_cache_layer, - v_cache_layer, - k, - v, - past_sequence_lengths); -#else - size_t cache_pos = reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]; - auto result_len = cache_pos + update_len; - ASSERT(result_len <= cache_len_); - - auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}}); - auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}}); - - k_cache_update->copy_from(k); - v_cache_update->copy_from(v); -#endif - - return {k_cache_layer, v_cache_layer}; -} +}; // namespace StaticKVCache // ========================== // PagedKVCacheConfig @@ -185,56 +99,11 @@ PagedKVCacheConfig::block_size() const { return block_size_; } +namespace PagedKVCache { // ========================== // PagedKVCache // ========================== -PagedKVCache::PagedKVCache( - infinicore::Size k_dim, - infinicore::Size v_dim, - infinicore::Size num_k_heads, - infinicore::Size num_v_heads, - infinicore::Size num_layers, - infinicore::DataType dtype, - const PagedKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info) - : Cache(), - k_dim_(k_dim), - v_dim_(v_dim), - rank_num_layers_(num_layers), - dtype_(dtype), - num_blocks_per_layer_(config.num_blocks()), - block_size_(config.block_size()) { - - bool is_kv_replica = (num_k_heads < rank_info.tp_size && num_v_heads < rank_info.tp_size && num_k_heads == num_v_heads && rank_info.tp_size % num_k_heads == 0); - - num_rank_k_heads_ = is_kv_replica ? 1 : (num_k_heads / rank_info.tp_size); - num_rank_v_heads_ = is_kv_replica ? 1 : (num_v_heads / rank_info.tp_size); - // [num_layers, num_blocks, num_rank_k_heads, block_size, k_dim] - k_caches_ = infinicore::Tensor::empty( - {rank_num_layers_, - num_blocks_per_layer_, - num_rank_k_heads_, - block_size_, - k_dim_}, - dtype_, - rank_info.device); - set_zeros(k_caches_); - - // [num_layers, num_blocks, num_rank_v_heads, block_size, v_dim] - v_caches_ = infinicore::Tensor::empty( - {rank_num_layers_, - num_blocks_per_layer_, - num_rank_v_heads_, - block_size_, - v_dim_}, - dtype_, - rank_info.device); - set_zeros(v_caches_); - - infinicore::context::syncStream(); -} - -infinicore::Tensor PagedKVCache::create_layer_kv_cache( +infinicore::Tensor create_layer_kv_cache( infinicore::Size k_dim, infinicore::Size v_dim, infinicore::Size num_k_heads, @@ -273,86 +142,6 @@ infinicore::Tensor PagedKVCache::create_layer_kv_cache( return kv_cache; } - -std::tuple PagedKVCache::update( - size_t layer_idx, - const infinicore::Tensor &k, - const infinicore::Tensor &v, - const infinicore::Tensor &slot_mapping) { - - auto &&[k_cache_layer, v_cache_layer] = get_paged_kv(layer_idx); - - infinicore::op::paged_caching_( - k_cache_layer, - v_cache_layer, - k, - v, - slot_mapping); - return {k_cache_layer, v_cache_layer}; -} - -std::tuple -PagedKVCache::get_paged_kv(size_t layer_idx) { - auto k_cache_layer = k_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); - auto v_cache_layer = v_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); - return {k_cache_layer, v_cache_layer}; -} - -std::tuple -PagedKVCache::get_contiguous_kv( - size_t layer_idx, - const infinicore::Tensor block_tables, - const infinicore::Tensor cache_lens, - const infinicore::Tensor input_offsets, - size_t request_id) { - ASSERT_EQ(block_tables->dtype(), infinicore::DataType::I32); - ASSERT_EQ(cache_lens->dtype(), infinicore::DataType::I32); - ASSERT_EQ(input_offsets->dtype(), infinicore::DataType::I32); - - auto nreq = block_tables->size(0); - auto block_tables_cpu = block_tables->to(infinicore::Device::cpu()); - auto cache_lens_cpu = cache_lens->to(infinicore::Device::cpu()); - auto input_offsets_cpu = input_offsets->to(infinicore::Device::cpu()); - infinicore::context::syncDevice(); - - // [num_blocks, num_rank_v_heads, block_size, v_dim] - auto &&[k_cache_layer, v_cache_layer] = get_paged_kv(layer_idx); - - auto req = request_id; - auto cache_lens_ptr = reinterpret_cast(cache_lens_cpu->data()); - auto input_offsets_ptr = reinterpret_cast(input_offsets_cpu->data()); - int32_t total_len = cache_lens_ptr[req] + (input_offsets_ptr[req + 1] - input_offsets_ptr[req]); - - auto full_k = infinicore::Tensor::empty( - {num_rank_k_heads_, (size_t)total_len, k_dim_}, - k_cache_layer->dtype(), k_cache_layer->device()); - - auto full_v = infinicore::Tensor::empty( - {num_rank_v_heads_, (size_t)total_len, v_dim_}, - v_cache_layer->dtype(), v_cache_layer->device()); - - size_t nblocks = total_len / block_size_; - size_t r = total_len % block_size_; - - for (size_t b = 0; b < nblocks; b++) { - size_t bid = *((int32_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, b, 1}})->data())); - - full_k->narrow({{1, b * block_size_, block_size_}}) - ->copy_from(k_cache_layer->narrow({{0, bid, 1}})->squeeze(0)); - full_v->narrow({{1, b * block_size_, block_size_}}) - ->copy_from(v_cache_layer->narrow({{0, bid, 1}})->squeeze(0)); - } - - if (r > 0) { - size_t bid = *((int32_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, nblocks, 1}})->data())); - - full_k->narrow({{1, nblocks * block_size_, r}}) - ->copy_from(k_cache_layer->narrow({{0, bid, 1}})->squeeze(0)->narrow({{1, 0, r}})); - full_v->narrow({{1, nblocks * block_size_, r}}) - ->copy_from(v_cache_layer->narrow({{0, bid, 1}})->squeeze(0)->narrow({{1, 0, r}})); - } - - return {full_k, full_v}; -} +}; // namespace PagedKVCache } // namespace infinilm::cache diff --git a/csrc/cache/kv_cache.hpp b/csrc/cache/kv_cache.hpp index e6e640df2..4d0a9a704 100644 --- a/csrc/cache/kv_cache.hpp +++ b/csrc/cache/kv_cache.hpp @@ -1,20 +1,10 @@ #pragma once #include "base_cache.hpp" - -#include "infinicore/context/context.hpp" -#include "infinicore/device.hpp" -#include "infinicore/tensor.hpp" #include -#include #include #include -#include -#include -#include - -#include namespace infinilm::cache { class StaticKVCacheConfig final : public CacheConfig { @@ -32,64 +22,18 @@ class StaticKVCacheConfig final : public CacheConfig { infinicore::Size max_cache_len_; }; -class StaticKVCache final : public Cache { -public: - StaticKVCache( - infinicore::Size k_dim, - infinicore::Size v_dim, - infinicore::Size num_k_heads, - infinicore::Size num_v_heads, - infinicore::Size num_layers, - infinicore::Size max_positional_embedding, - infinicore::DataType dtype, - const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info); - - static infinicore::Tensor create_layer_kv_cache( - const infinicore::Size k_dim, - const infinicore::Size v_dim, - const infinicore::Size num_k_heads, - const infinicore::Size num_v_heads, - const infinicore::Size max_positional_embedding, - const infinicore::DataType dtype, - const StaticKVCacheConfig &config); - - /** - * @brief Update KV cache at a given layer and cache position. - * - * @param layer_idx Which transformer layer - * @param k [batch, num_rank_k_heads, seq_len, k_dim] - * @param v [batch, num_rank_v_heads, seq_len, v_dim] - * @param cache_pos Sequence position to write - * - * @return (full_k, full_v) - * full_k: [batch, num_rank_k_heads, cache_pos + seq_len, k_dim] - * full_v: [batch, num_rank_v_heads, cache_pos + seq_len, v_dim] - */ - std::tuple - update(size_t layer_idx, - const infinicore::Tensor &k, - const infinicore::Tensor &v, - const infinicore::Tensor &past_sequence_lengths); +namespace StaticKVCache { - ~StaticKVCache() override = default; +infinicore::Tensor create_layer_kv_cache( + infinicore::Size k_dim, + infinicore::Size v_dim, + infinicore::Size num_k_heads, + infinicore::Size num_v_heads, + infinicore::Size max_positional_embedding, + infinicore::DataType dtype, + const StaticKVCacheConfig &config); -private: - infinicore::Size k_dim_; - infinicore::Size v_dim_; - infinicore::Size num_rank_k_heads_; - infinicore::Size num_rank_v_heads_; - infinicore::Size rank_batch_size_; - infinicore::Size cache_len_; - infinicore::Size rank_num_layers_; - infinicore::DataType dtype_; - - // [num_layers, max_batch, num_rank_k_heads, max_cache_len, k_dim] - infinicore::Tensor k_caches_; - - // [num_layers, max_batch, num_rank_v_heads, max_cache_len, v_dim] - infinicore::Tensor v_caches_; -}; +} // namespace StaticKVCache class PagedKVCacheConfig final : public CacheConfig { public: @@ -106,94 +50,15 @@ class PagedKVCacheConfig final : public CacheConfig { size_t block_size_; }; -class PagedKVCache final : public Cache { -public: - PagedKVCache( - infinicore::Size k_dim, - infinicore::Size v_dim, - infinicore::Size num_k_heads, - infinicore::Size num_v_heads, - infinicore::Size num_layers, - infinicore::DataType dtype, - const PagedKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info); - - static infinicore::Tensor create_layer_kv_cache( - infinicore::Size k_dim, - infinicore::Size v_dim, - infinicore::Size num_k_heads, - infinicore::Size num_v_heads, - infinicore::DataType dtype, - const PagedKVCacheConfig &config); - - /** - * @brief Update Paged KV cache at a given layer given slot info for each token. - * - * @param layer_idx Which paged attention layer - * @param k [num_rank_k_heads, seq_len, k_dim] - * @param v [num_rank_v_heads, seq_len, v_dim] - * @param slot_mapping [seq_len] - * - * @return (full_k, full_v) - * full_k: [num_blocks, num_rank_k_heads, block_size, k_dim] - * full_v: [num_blocks, num_rank_v_heads, block_size, v_dim] - */ - std::tuple - update(size_t layer_idx, - const infinicore::Tensor &k, - const infinicore::Tensor &v, - const infinicore::Tensor &slot_mapping); - - /** - * @brief Get Paged KV cache at a given layer. - * - * @param layer_idx Which paged attention layer - * - * @return (full_k, full_v) - * full_k: [num_blocks, num_rank_k_heads, block_size, k_dim] - * full_v: [num_blocks, num_rank_v_heads, block_size, v_dim] - */ - std::tuple - get_paged_kv(size_t layer_idx); - - /** - * @brief Get contiguous KV cache at a given layer, given the request info - * among a continuous request batch. - * - * @param layer_idx Which paged attention layer - * @param block_tables [num_requests, max_blocks_per_request] - * @param cache_lens [num_requests] - * @param input_offsets [num_requests + 1] - * @param request_id Which request among a continuous batch of requests - * - * @return (full_k, full_v) - * full_k: [num_rank_k_heads, total_len, k_dim] - * full_v: [num_rank_v_heads, total_len, v_dim] - */ - std::tuple - get_contiguous_kv(size_t layer_idx, - const infinicore::Tensor block_tables, - const infinicore::Tensor cache_lens, - const infinicore::Tensor input_offsets, - size_t request_id = 0); +namespace PagedKVCache { +infinicore::Tensor create_layer_kv_cache( + infinicore::Size k_dim, + infinicore::Size v_dim, + infinicore::Size num_k_heads, + infinicore::Size num_v_heads, + infinicore::DataType dtype, + const PagedKVCacheConfig &config); - ~PagedKVCache() override - = default; - -private: - infinicore::Size k_dim_; - infinicore::Size v_dim_; - infinicore::Size num_rank_k_heads_; - infinicore::Size num_rank_v_heads_; - infinicore::Size rank_num_layers_; - infinicore::DataType dtype_; - infinicore::Size block_size_; - infinicore::Size num_blocks_per_layer_; - // [num_layers, num_blocks, num_rank_k_heads, block_size, k_dim] - infinicore::Tensor k_caches_; - - // [num_layers, num_blocks, num_rank_v_heads, block_size, v_dim] - infinicore::Tensor v_caches_; -}; +} // namespace PagedKVCache } // namespace infinilm::cache diff --git a/csrc/config/config_factory.cpp b/csrc/config/config_factory.cpp index 09e21e933..0467f4536 100644 --- a/csrc/config/config_factory.cpp +++ b/csrc/config/config_factory.cpp @@ -7,9 +7,6 @@ namespace infinilm::config { std::shared_ptr ConfigFactory::createConfig(const std::string &config_str) { const nlohmann::json config_json = nlohmann::json::parse(config_str); auto model_config = std::make_shared(config_json); - if (nullptr == model_config) { - throw std::runtime_error("infinilm::config::ConfigFactory::createConfig: model_config is not initialized"); - } const std::string model_type = model_config->get("model_type"); const auto &config_map = models::get_model_config_map(); @@ -17,11 +14,7 @@ std::shared_ptr ConfigFactory::createConfig(const if (it != config_map.end()) { it->second(model_config); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; - const std::string &model_type = model_config->get("model_type"); - if (std::find(classic_models.begin(), classic_models.end(), model_type) == classic_models.end()) { - throw std::invalid_argument("infinilm::config::ConfigFactory::createConfig: Unsupported model config type: " + model_type); - } + throw std::invalid_argument("infinilm::config::ConfigFactory::createConfig: Unsupported model config type: " + model_type); } return model_config; diff --git a/csrc/models/debug_utils/hooks.cpp b/csrc/debug_utils/hooks.cpp similarity index 100% rename from csrc/models/debug_utils/hooks.cpp rename to csrc/debug_utils/hooks.cpp diff --git a/csrc/models/debug_utils/hooks.hpp b/csrc/debug_utils/hooks.hpp similarity index 100% rename from csrc/models/debug_utils/hooks.hpp rename to csrc/debug_utils/hooks.hpp diff --git a/csrc/models/debug_utils/tensor_utils.hpp b/csrc/debug_utils/tensor_utils.hpp similarity index 100% rename from csrc/models/debug_utils/tensor_utils.hpp rename to csrc/debug_utils/tensor_utils.hpp diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 87568fd6a..1fa34e126 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -1,10 +1,6 @@ #include "rank_worker.hpp" - -#include "../global_state/global_state.hpp" #include "../models/model_factory.hpp" -#include "../models/models_registry.hpp" #include "infinicore/ops.hpp" -#include #include #include @@ -254,30 +250,10 @@ void RankWorker::thread_loop() { infinilm::global_state::initialize_infinilm_config(infinilm_config_); // Create model using factory (may be expensive) - const std::string &model_type = model_config_->get("model_type"); - const auto &model_map = models::get_causal_lm_model_map(); - auto it = model_map.find(model_type); - if (it != model_map.end()) { - model_ = InfinilmModelFactory::createModel( - model_config_, - rank_info_.device, - pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); - } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; - if ((std::find(classic_models.begin(), classic_models.end(), model_type) != classic_models.end())) { - model_ = InfinilmModelFactory::createModel( - model_config_, - rank_info_, - pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr, - attention_backend_); - } else { - throw std::runtime_error("RankWorker::thread_loop(): Unsupported model config type: " + model_type); - } - } - - if (!model_) { - throw std::runtime_error("Failed to create model"); - } + model_ = InfinilmModelFactory::createModel( + model_config_, + rank_info_.device, + pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); if (enable_graph_compiling_) { compiler_ = std::make_unique(model_, barrier_); } diff --git a/csrc/layers/causal_lm_templates/text_model.hpp b/csrc/layers/causal_lm_templates/text_model.hpp index 62a52798b..143215997 100644 --- a/csrc/layers/causal_lm_templates/text_model.hpp +++ b/csrc/layers/causal_lm_templates/text_model.hpp @@ -6,7 +6,6 @@ #include "infinicore/nn/rmsnorm.hpp" #include "infinicore/tensor.hpp" #include -#include namespace infinilm::layers::causal_lm_templates { @@ -28,9 +27,7 @@ class TextModel : public infinicore::nn::Module { const auto &dtype{model_config->get_dtype()}; size_t vocab_size = model_config->get("vocab_size"); size_t hidden_size = model_config->get("hidden_size"); - size_t max_position_embeddings = model_config->get("max_position_embeddings"); size_t num_hidden_layers = model_config->get("num_hidden_layers"); - double rope_theta = model_config->get("rope_theta"); double rms_norm_eps = model_config->get("rms_norm_eps"); embed_tokens_ = this->register_module("embed_tokens", vocab_size, hidden_size, std::nullopt, dtype, device); diff --git a/csrc/layers/rotary_embedding/rotary_embedding_factory.cpp b/csrc/layers/rotary_embedding/rotary_embedding_factory.cpp index 4866de9db..5f44a372e 100644 --- a/csrc/layers/rotary_embedding/rotary_embedding_factory.cpp +++ b/csrc/layers/rotary_embedding/rotary_embedding_factory.cpp @@ -1,6 +1,5 @@ #include "rotary_embedding_factory.hpp" #include "../../config/model_config.hpp" -#include namespace infinilm::layers::rotary_embedding { diff --git a/csrc/models/baichuan/baichuan_for_causal_lm.hpp b/csrc/models/baichuan/baichuan_for_causal_lm.hpp index 0e17bb85a..752d7008d 100644 --- a/csrc/models/baichuan/baichuan_for_causal_lm.hpp +++ b/csrc/models/baichuan/baichuan_for_causal_lm.hpp @@ -1,6 +1,5 @@ #pragma once -#include "../../layers/common_modules.hpp" #include "../llama/llama_for_causal_lm.hpp" #include diff --git a/csrc/models/fm9g/fm9g_for_causal_lm.cpp b/csrc/models/fm9g/fm9g_for_causal_lm.cpp index 25be3762c..1c2748e90 100644 --- a/csrc/models/fm9g/fm9g_for_causal_lm.cpp +++ b/csrc/models/fm9g/fm9g_for_causal_lm.cpp @@ -16,8 +16,6 @@ std::shared_ptr create_fm9g_model_config(std::sha namespace { -#ifndef USE_CLASSIC_LLAMA - INFINILM_REGISTER_CAUSAL_LM_MODEL( fm9g, infinilm::models::fm9g::FM9GForCausalLM, @@ -33,6 +31,4 @@ INFINILM_REGISTER_CAUSAL_LM_MODEL( infinilm::models::fm9g::FM9GForCausalLM, infinilm::models::fm9g::create_fm9g_model_config); -#endif - } // namespace diff --git a/csrc/models/fm9g/fm9g_for_causal_lm.hpp b/csrc/models/fm9g/fm9g_for_causal_lm.hpp index 49e65efaf..1b348315a 100644 --- a/csrc/models/fm9g/fm9g_for_causal_lm.hpp +++ b/csrc/models/fm9g/fm9g_for_causal_lm.hpp @@ -1,7 +1,6 @@ #pragma once #include "../../layers/common_modules.hpp" -#include "infinicore/nn/linear.hpp" #include #include diff --git a/csrc/models/glm4/glm4_decoder_layer.hpp b/csrc/models/glm4/glm4_decoder_layer.hpp index ddfebfcdc..236b174e5 100644 --- a/csrc/models/glm4/glm4_decoder_layer.hpp +++ b/csrc/models/glm4/glm4_decoder_layer.hpp @@ -1,8 +1,6 @@ #pragma once -#include "../../backends/attention_backends.hpp" #include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" #include "../../layers/common_modules.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" diff --git a/csrc/models/infinilm_model.cpp b/csrc/models/infinilm_model.cpp index 3923474ed..3c7c4f351 100644 --- a/csrc/models/infinilm_model.cpp +++ b/csrc/models/infinilm_model.cpp @@ -1,5 +1,4 @@ #include "infinilm_model.hpp" -#include "../backends/attention_backends.hpp" #include "../cache/kv_cache.hpp" #include "../global_state/global_state.hpp" #include "../layers/attention/attention.hpp" diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp index 5cabcef23..06b2ca7af 100644 --- a/csrc/models/infinilm_model.hpp +++ b/csrc/models/infinilm_model.hpp @@ -3,7 +3,6 @@ #include "../backends/attention_backends.hpp" #include "../cache/cache.hpp" #include "../config/model_config.hpp" -#include "../layers/linear/linear.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/tensor.hpp" diff --git a/csrc/models/internlm3/internlm3_for_causal_lm.hpp b/csrc/models/internlm3/internlm3_for_causal_lm.hpp index 5ad014fb5..ef68e010a 100644 --- a/csrc/models/internlm3/internlm3_for_causal_lm.hpp +++ b/csrc/models/internlm3/internlm3_for_causal_lm.hpp @@ -1,6 +1,5 @@ #pragma once -#include "../../layers/common_modules.hpp" #include "../llama/llama_for_causal_lm.hpp" #include diff --git a/csrc/models/llama/llama_for_causal_lm.cpp b/csrc/models/llama/llama_for_causal_lm.cpp index fd3438bde..37ce411b7 100644 --- a/csrc/models/llama/llama_for_causal_lm.cpp +++ b/csrc/models/llama/llama_for_causal_lm.cpp @@ -14,7 +14,7 @@ std::shared_ptr create_llama_model_config(std::sh if (!config_json.contains("head_dim")) { config_json["head_dim"] = model_config->get("hidden_size") - / model_config->get("num_attention_heads"); + / model_config->get("num_attention_heads"); } if (!config_json.contains("attention_bias")) { @@ -28,13 +28,9 @@ std::shared_ptr create_llama_model_config(std::sh namespace { -#ifndef USE_CLASSIC_LLAMA - INFINILM_REGISTER_CAUSAL_LM_MODEL( llama, infinilm::models::llama::LlamaForCausalLM, infinilm::models::llama::create_llama_model_config); -#endif - } // namespace diff --git a/csrc/models/llama_legacy/legacy_fused_linear.cpp b/csrc/models/llama_legacy/legacy_fused_linear.cpp deleted file mode 100644 index 3736cdf03..000000000 --- a/csrc/models/llama_legacy/legacy_fused_linear.cpp +++ /dev/null @@ -1,396 +0,0 @@ -/** - * @deprecated Legacy fused linear implementations based on InfiniCore. - * Removal target: v0.2.0 (Q2 2026) - */ - -#include "legacy_fused_linear.hpp" - -#include - -namespace infinilm::layers::linear { -// DEPRECATED BEGIN -// --------------------------------------------------------- -// LegacyQKVParallelLinear -// --------------------------------------------------------- -LegacyQKVParallelLinear::LegacyQKVParallelLinear(size_t hidden_size, - size_t head_dim, - size_t num_q_head, - size_t num_kv_head, - bool bias, - const infinicore::DataType &dtype, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : LegacyQKVParallelLinear(hidden_size, - head_dim, head_dim, head_dim, - num_q_head, num_kv_head, num_kv_head, - bias, bias, bias, - dtype, device, rank_info) {} - -LegacyQKVParallelLinear::LegacyQKVParallelLinear(size_t hidden_size, - size_t q_dim, size_t k_dim, size_t v_dim, - size_t num_q_head, size_t num_k_head, size_t num_v_head, - bool q_bias, bool k_bias, bool v_bias, - const infinicore::DataType &dtype, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : infinicore::nn::ColumnParallelLinear( - hidden_size, - num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, - (q_bias || k_bias || v_bias), - dtype, - device, - rank_info.tp_rank, - rank_info.tp_size), - q_dim_(q_dim), - k_dim_(k_dim), - v_dim_(v_dim), - num_q_head_(num_q_head), - num_k_head_(num_k_head), - num_v_head_(num_v_head), - q_bias_(q_bias), - k_bias_(k_bias), - v_bias_(v_bias) { - if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) { - throw std::runtime_error("LegacyQKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size"); - } - - if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) { - throw std::runtime_error("q_bias, k_bias, v_bias must all match"); - } - - q_out_size_ = num_q_head_ * q_dim_ / tp_size_; - k_out_size_ = num_k_head_ * k_dim_ / tp_size_; - v_out_size_ = num_v_head_ * v_dim_ / tp_size_; -} - -LegacyQKVParallelLinear::LegacyQKVParallelLinear(size_t hidden_size, - size_t head_dim, - size_t num_q_head, - size_t num_kv_head, - std::shared_ptr quantization, - bool bias, - const infinicore::DataType &dtype, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : LegacyQKVParallelLinear(hidden_size, - head_dim, head_dim, head_dim, - num_q_head, num_kv_head, num_kv_head, - bias, bias, bias, - quantization, - dtype, device, rank_info) {} - -LegacyQKVParallelLinear::LegacyQKVParallelLinear(size_t hidden_size, - size_t q_dim, size_t k_dim, size_t v_dim, - size_t num_q_head, size_t num_k_head, size_t num_v_head, - bool q_bias, bool k_bias, bool v_bias, - std::shared_ptr quantization, - const infinicore::DataType &dtype, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : infinicore::nn::ColumnParallelLinear( - hidden_size, - calculate_out_feature_size(num_q_head, q_dim, num_k_head, k_dim, num_v_head, v_dim, rank_info), - quantization, - (q_bias || k_bias || v_bias), - dtype, - device, - rank_info.tp_rank, - rank_info.tp_size), - q_dim_(q_dim), - k_dim_(k_dim), - v_dim_(v_dim), - num_q_head_(num_q_head), - num_k_head_(num_k_head), - num_v_head_(num_v_head), - q_bias_(q_bias), - k_bias_(k_bias), - v_bias_(v_bias), - num_kv_head_replicas_(calculate_kv_replicas(num_k_head, rank_info.tp_size)) { - - if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) { - throw std::runtime_error("q_bias, k_bias, v_bias must all match"); - } - - q_out_size_ = num_q_head_ * q_dim_ / tp_size_; - k_out_size_ = num_kv_head_replicas_ * num_k_head_ * k_dim_ / tp_size_; - v_out_size_ = num_kv_head_replicas_ * num_v_head_ * v_dim_ / tp_size_; -} - -std::tuple -LegacyQKVParallelLinear::forward_split(infinicore::Tensor &input) { - auto output = this->forward(input); - - auto q_out = output->narrow({{2, 0, q_out_size_}}); - auto k_out = output->narrow({{2, q_out_size_, k_out_size_}}); - auto v_out = output->narrow({{2, q_out_size_ + k_out_size_, v_out_size_}}); - - return std::make_tuple(q_out, k_out, v_out); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight() const { - return infinicore::nn::Parameter( - weight_->narrow({{0, 0, q_out_size_}}), - 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight() const { - return infinicore::nn::Parameter( - weight_->narrow({{0, q_out_size_, k_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight() const { - return infinicore::nn::Parameter( - weight_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), - 0, tp_rank_, tp_size_, num_v_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_scale() const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_scale() const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{0, q_out_size_, k_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_scale() const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_->narrow({{1, 0, q_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_->narrow({{1, q_out_size_ / scaling_factor, k_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_->narrow({{1, (q_out_size_ + k_out_size_) / scaling_factor, v_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_scale_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{1, 0, q_out_size_ / scaling_factor}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_scale_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{1, q_out_size_ / scaling_factor, k_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_scale_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_scale_->narrow({{1, (q_out_size_ + k_out_size_) / scaling_factor, v_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_zeros_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{1, 0, q_out_size_ / scaling_factor}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_zeros_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{1, q_out_size_ / scaling_factor, k_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_zeros_awq(int scaling_factor) const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{1, (q_out_size_ + k_out_size_) / scaling_factor, v_out_size_ / scaling_factor}}), - 1, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_zeros() const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_zeros() const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{0, q_out_size_, k_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_zeros() const { - return infinicore::nn::Parameter( - weight_zeros_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), - 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_bias() const { - if (!q_bias_) { - return infinicore::nn::Parameter(); - } - return infinicore::nn::Parameter( - bias_->narrow({{0, 0, q_out_size_}}), - 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_bias() const { - if (!k_bias_) { - return infinicore::nn::Parameter(); - } - return infinicore::nn::Parameter( - bias_->narrow({{0, q_out_size_, k_out_size_}}), - 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_bias() const { - if (!v_bias_) { - return infinicore::nn::Parameter(); - } - return infinicore::nn::Parameter( - bias_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), - 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, in_features_ / tp_size_}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, in_features_ / tp_size_}}), 0, tp_rank_, tp_size_, num_k_head_); -} - -infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, in_features_ / tp_size_}}), 0, tp_rank_, tp_size_, num_k_head_); -} - -bool LegacyQKVParallelLinear::has_q_bias() const { return q_bias_; } -bool LegacyQKVParallelLinear::has_k_bias() const { return k_bias_; } -bool LegacyQKVParallelLinear::has_v_bias() const { return v_bias_; } - -// --------------------------------------------------------- -// LegacyGateUpParallelLinear -// --------------------------------------------------------- -LegacyGateUpParallelLinear::LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, - const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : LegacyGateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info) { -} - -LegacyGateUpParallelLinear::LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, - const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { - if (gate_bias_ != up_bias_) { - throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); - } -} - -LegacyGateUpParallelLinear::LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, bool bias, - const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : LegacyGateUpParallelLinear(hidden_size, intermediate_size, bias, bias, quantization, dtype, device, rank_info) { -} - -LegacyGateUpParallelLinear::LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, - std::shared_ptr quantization, - const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quantization, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { - if (gate_bias_ != up_bias_) { - throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); - } -} - -std::tuple LegacyGateUpParallelLinear::forward_split(infinicore::Tensor &input) { - auto output = this->forward(input); - auto cols = output->shape()[2]; - auto gate_output = output->narrow({{2, 0, cols / 2}}); - auto up_output = output->narrow({{2, cols / 2, cols / 2}}); - return std::make_tuple(gate_output, up_output); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight() const { - return infinicore::nn::Parameter(weight_->narrow({{0, 0, weight_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_bias() const { - if (!gate_bias_) { - return infinicore::nn::Parameter(); - } else { - return infinicore::nn::Parameter(bias_->narrow({{0, 0, bias_->size(0) / 2}}), 0, tp_rank_, tp_size_); - } -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight() const { - return infinicore::nn::Parameter(weight_->narrow({{0, weight_->size(0) / 2, weight_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_bias() const { - if (!up_bias_) { - return infinicore::nn::Parameter(); - } else { - return infinicore::nn::Parameter(bias_->narrow({{0, bias_->size(0) / 2, bias_->size(0) / 2}}), - 0, tp_rank_, tp_size_); - } -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_scale() const { - return infinicore::nn::Parameter(weight_scale_->narrow({{0, 0, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_scale() const { - return infinicore::nn::Parameter(weight_scale_->narrow({{0, weight_scale_->size(0) / 2, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_zeros() const { - return infinicore::nn::Parameter(weight_zeros_->narrow({{0, 0, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_zeros() const { - return infinicore::nn::Parameter(weight_zeros_->narrow({{0, weight_zeros_->size(0) / 2, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_); -} - -bool LegacyGateUpParallelLinear::has_gate_bias() const { return gate_bias_; } -bool LegacyGateUpParallelLinear::has_up_bias() const { return up_bias_; } - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_awq() const { - return infinicore::nn::Parameter(weight_->narrow({{1, 0, weight_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_awq() const { - return infinicore::nn::Parameter(weight_->narrow({{1, weight_->size(1) / 2, weight_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_scale_awq() const { - return infinicore::nn::Parameter(weight_scale_->narrow({{1, 0, weight_scale_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_scale_awq() const { - return infinicore::nn::Parameter(weight_scale_->narrow({{1, weight_scale_->size(1) / 2, weight_scale_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_zeros_awq() const { - return infinicore::nn::Parameter(weight_zeros_->narrow({{1, 0, weight_zeros_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_zeros_awq() const { - return infinicore::nn::Parameter(weight_zeros_->narrow({{1, weight_zeros_->size(1) / 2, weight_zeros_->size(1) / 2}}), 1, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, gidx_->size(0)}}), 0, tp_rank_, tp_size_); -} - -infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_g_idx_gptq() const { - return infinicore::nn::Parameter(gidx_->narrow({{0, 0, gidx_->size(0)}}), 0, tp_rank_, tp_size_); -} -// DEPRECATED END - -} // namespace infinilm::layers::linear diff --git a/csrc/models/llama_legacy/legacy_fused_linear.hpp b/csrc/models/llama_legacy/legacy_fused_linear.hpp deleted file mode 100644 index 42c90b73a..000000000 --- a/csrc/models/llama_legacy/legacy_fused_linear.hpp +++ /dev/null @@ -1,328 +0,0 @@ -#pragma once - -/** - * @deprecated Legacy fused linear classes based on InfiniCore. - * - * These classes inherit from infinicore::nn::ColumnParallelLinear and use the - * infinicore::quantization namespace. They exist solely for backward - * compatibility with the deprecated LlamaConfig-based constructors. - * - * Removal target: v0.2.0 (Q2 2026) - */ - -#include "../../config/model_config.hpp" -#include "../../engine/distributed/communication_group.hpp" -#include "infinicore/nn/linear.hpp" -#include "infinicore/quantization.hpp" -#include - -namespace infinilm::layers::linear { - -// DEPRECATED BEGIN - -/** - * Convert infinilm::quantization::BaseQuantization to infinicore::quantization::BaseQuantization. - * Needed because model_config now returns infinilm types but legacy classes use infinicore types. - */ -inline std::shared_ptr -to_legacy_quant(const std::shared_ptr &quant) { - if (!quant) { - return std::make_shared(nlohmann::json{}); - } - switch (quant->get_quant_scheme()) { - case infinilm::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: - return std::make_shared(quant->get_config()); - case infinilm::quantization::QuantScheme::AWQ_W4A16: - return std::make_shared(quant->get_config()); - case infinilm::quantization::QuantScheme::GPTQ_W4A16_QY: - return std::make_shared(quant->get_config()); - case infinilm::quantization::QuantScheme::GPTQ_W4A16: - return std::make_shared(quant->get_config()); - default: - return std::make_shared(quant->get_config()); - } -} - -inline infinicore::quantization::QuantScheme -to_legacy_quant_scheme(infinilm::quantization::QuantScheme scheme) { - return static_cast(static_cast(scheme)); -} - -inline infinicore::quantization::KVQuantAlgo -to_legacy_kv_quant_algo(infinilm::quantization::KVQuantAlgo algo) { - return static_cast(static_cast(algo)); -} - -class LegacyQKVParallelLinear : public infinicore::nn::ColumnParallelLinear { -public: - explicit LegacyQKVParallelLinear(size_t hidden_size, - size_t q_dim, size_t k_dim, size_t v_dim, - size_t num_q_head, size_t num_k_head, size_t num_v_head, - bool q_bias, bool k_bias, bool v_bias, - const infinicore::DataType &dtype = infinicore::DataType::F32, - const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - explicit LegacyQKVParallelLinear(size_t hidden_size, - size_t head_dim, - size_t num_q_head, size_t num_kv_head, - bool bias = false, - const infinicore::DataType &dtype = infinicore::DataType::F32, - const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - explicit LegacyQKVParallelLinear(size_t hidden_size, - size_t q_dim, size_t k_dim, size_t v_dim, - size_t num_q_head, size_t num_k_head, size_t num_v_head, - bool q_bias, bool k_bias, bool v_bias, - std::shared_ptr quantization, - const infinicore::DataType &dtype = infinicore::DataType::F32, - const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - explicit LegacyQKVParallelLinear(size_t hidden_size, - size_t head_dim, - size_t num_q_head, size_t num_kv_head, - std::shared_ptr quantization, - bool bias = false, - const infinicore::DataType &dtype = infinicore::DataType::F32, - const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - std::tuple - forward_split(infinicore::Tensor &input); - - infinicore::nn::Parameter get_q_weight() const; - infinicore::nn::Parameter get_k_weight() const; - infinicore::nn::Parameter get_v_weight() const; - infinicore::nn::Parameter get_q_weight_scale() const; - infinicore::nn::Parameter get_k_weight_scale() const; - infinicore::nn::Parameter get_v_weight_scale() const; - infinicore::nn::Parameter get_q_weight_zeros() const; - infinicore::nn::Parameter get_k_weight_zeros() const; - infinicore::nn::Parameter get_v_weight_zeros() const; - - infinicore::nn::Parameter get_q_weight_awq(int scaling_factor) const; - infinicore::nn::Parameter get_k_weight_awq(int scaling_factor) const; - infinicore::nn::Parameter get_v_weight_awq(int scaling_factor) const; - infinicore::nn::Parameter get_q_weight_scale_awq(int scaling_factor) const; - infinicore::nn::Parameter get_k_weight_scale_awq(int scaling_factor) const; - infinicore::nn::Parameter get_v_weight_scale_awq(int scaling_factor) const; - infinicore::nn::Parameter get_q_weight_zeros_awq(int scaling_factor) const; - infinicore::nn::Parameter get_k_weight_zeros_awq(int scaling_factor) const; - infinicore::nn::Parameter get_v_weight_zeros_awq(int scaling_factor) const; - - infinicore::nn::Parameter get_q_bias() const; - infinicore::nn::Parameter get_k_bias() const; - infinicore::nn::Parameter get_v_bias() const; - - infinicore::nn::Parameter get_q_g_idx_gptq() const; - infinicore::nn::Parameter get_k_g_idx_gptq() const; - infinicore::nn::Parameter get_v_g_idx_gptq() const; - - bool has_q_bias() const; - bool has_k_bias() const; - bool has_v_bias() const; - -private: - static size_t calculate_kv_replicas(size_t num_k_head, size_t tp_size) { - if (num_k_head % tp_size == 0) { - return 1; - } - if (tp_size % num_k_head == 0) { - return (tp_size + num_k_head - 1) / num_k_head; - } - throw std::runtime_error("Invalid KV head configuration"); - } - - static size_t - calculate_out_feature_size(size_t num_q_head, size_t q_dim, size_t num_k_head, size_t k_dim, size_t num_v_head, size_t v_dim, engine::distributed::RankInfo rank_info) { - return num_q_head * q_dim + num_k_head * k_dim * calculate_kv_replicas(num_k_head, rank_info.tp_size) + num_v_head * v_dim * calculate_kv_replicas(num_v_head, rank_info.tp_size); - } - -private: - size_t q_dim_; - size_t k_dim_; - size_t v_dim_; - size_t num_q_head_; - size_t num_k_head_; - size_t num_v_head_; - bool q_bias_; - bool k_bias_; - bool v_bias_; - size_t q_out_size_; - size_t k_out_size_; - size_t v_out_size_; - size_t num_kv_head_replicas_ = 1; -}; - -class LegacyGateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { -public: - LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, - const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, - const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, - bool bias = false, - const infinicore::DataType &dtype = infinicore::DataType::F32, - const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, - std::shared_ptr quantization, - const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - std::tuple forward_split(infinicore::Tensor &input); - - infinicore::nn::Parameter get_gate_weight() const; - infinicore::nn::Parameter get_gate_bias() const; - infinicore::nn::Parameter get_up_weight() const; - infinicore::nn::Parameter get_up_bias() const; - infinicore::nn::Parameter get_gate_weight_scale() const; - infinicore::nn::Parameter get_up_weight_scale() const; - infinicore::nn::Parameter get_gate_weight_zeros() const; - infinicore::nn::Parameter get_up_weight_zeros() const; - infinicore::nn::Parameter get_gate_weight_awq() const; - infinicore::nn::Parameter get_up_weight_awq() const; - infinicore::nn::Parameter get_gate_weight_scale_awq() const; - infinicore::nn::Parameter get_up_weight_scale_awq() const; - infinicore::nn::Parameter get_gate_weight_zeros_awq() const; - infinicore::nn::Parameter get_up_weight_zeros_awq() const; - infinicore::nn::Parameter get_gate_g_idx_gptq() const; - infinicore::nn::Parameter get_up_g_idx_gptq() const; - - bool has_gate_bias() const; - bool has_up_bias() const; - -private: - bool gate_bias_; - bool up_bias_; -}; -// DEPRECATED END - -// DEPRECATED BEGIN — Legacy macros -#define INFINILM_LEGACY_QKV_LINEAR_INIT(name, q_name, k_name, v_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight()); \ - this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight()); \ - this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight()); \ - if (name##_->has_q_bias()) \ - this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ - if (name##_->has_k_bias()) \ - this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ - if (name##_->has_v_bias()) \ - this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); - -#define INFINILM_LEGACY_QKV_LINEAR_W8A8_INIT(name, q_name, k_name, v_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight()); \ - this->register_parameter(std::string(q_name) + ".weight_scale", name##_->get_q_weight_scale()); \ - this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight()); \ - this->register_parameter(std::string(k_name) + ".weight_scale", name##_->get_k_weight_scale()); \ - this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight()); \ - this->register_parameter(std::string(v_name) + ".weight_scale", name##_->get_v_weight_scale()); \ - if (name##_->has_q_bias()) \ - this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ - if (name##_->has_k_bias()) \ - this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ - if (name##_->has_v_bias()) \ - this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); - -#define INFINILM_LEGACY_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - auto awq_ptr = std::static_pointer_cast(name##_->get_quantization()); \ - int packing_num = awq_ptr->get_packing_num(); \ - this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight_awq(packing_num)); \ - this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros_awq(packing_num)); \ - this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale_awq(1)); \ - this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight_awq(packing_num)); \ - this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros_awq(packing_num)); \ - this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale_awq(1)); \ - this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight_awq(packing_num)); \ - this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros_awq(packing_num)); \ - this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale_awq(1)); \ - if (name##_->has_q_bias()) \ - this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ - if (name##_->has_k_bias()) \ - this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ - if (name##_->has_v_bias()) \ - this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); - -#define INFINILM_LEGACY_QKV_LINEAR_W4A16GPTQ_INIT(name, q_name, k_name, v_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - auto gptq_ptr = std::static_pointer_cast(name##_->get_quantization()); \ - int packing_num = gptq_ptr->get_packing_num(); \ - this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight_awq(1)); \ - this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros_awq(8)); \ - this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale_awq(1)); \ - this->register_parameter(std::string(q_name) + ".g_idx", name##_->get_q_g_idx_gptq()); \ - this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight_awq(1)); \ - this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros_awq(8)); \ - this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale_awq(1)); \ - this->register_parameter(std::string(k_name) + ".g_idx", name##_->get_k_g_idx_gptq()); \ - this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight_awq(1)); \ - this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros_awq(8)); \ - this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale_awq(1)); \ - this->register_parameter(std::string(v_name) + ".g_idx", name##_->get_v_g_idx_gptq()); \ - if (name##_->has_q_bias()) \ - this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ - if (name##_->has_k_bias()) \ - this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ - if (name##_->has_v_bias()) \ - this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); - -#define INFINILM_LEGACY_GATE_UP_LINEAR_INIT(name, gate_name, up_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight()); \ - this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight()); \ - if (name##_->has_gate_bias()) \ - this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ - if (name##_->has_up_bias()) \ - this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); - -#define INFINILM_LEGACY_GATE_UP_LINEAR_W8A8_INIT(name, gate_name, up_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight()); \ - this->register_parameter(std::string(gate_name) + ".weight_scale", name##_->get_gate_weight_scale()); \ - this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight()); \ - this->register_parameter(std::string(up_name) + ".weight_scale", name##_->get_up_weight_scale()); \ - if (name##_->has_gate_bias()) \ - this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ - if (name##_->has_up_bias()) \ - this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); - -#define INFINILM_LEGACY_GATE_UP_LINEAR_W4A16AWQ_INIT(name, gate_name, up_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight_awq()); \ - this->register_parameter(std::string(gate_name) + ".qzeros", name##_->get_gate_weight_zeros_awq()); \ - this->register_parameter(std::string(gate_name) + ".scales", name##_->get_gate_weight_scale_awq()); \ - this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight_awq()); \ - this->register_parameter(std::string(up_name) + ".qzeros", name##_->get_up_weight_zeros_awq()); \ - this->register_parameter(std::string(up_name) + ".scales", name##_->get_up_weight_scale_awq()); \ - if (name##_->has_gate_bias()) \ - this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ - if (name##_->has_up_bias()) \ - this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); - -#define INFINILM_LEGACY_GATE_UP_LINEAR_W4A16GPTQ_INIT(name, gate_name, up_name, ...) \ - name##_ = std::make_shared(__VA_ARGS__); \ - this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight_awq()); \ - this->register_parameter(std::string(gate_name) + ".qzeros", name##_->get_gate_weight_zeros_awq()); \ - this->register_parameter(std::string(gate_name) + ".scales", name##_->get_gate_weight_scale_awq()); \ - this->register_parameter(std::string(gate_name) + ".g_idx", name##_->get_gate_g_idx_gptq()); \ - this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight_awq()); \ - this->register_parameter(std::string(up_name) + ".qzeros", name##_->get_up_weight_zeros_awq()); \ - this->register_parameter(std::string(up_name) + ".scales", name##_->get_up_weight_scale_awq()); \ - this->register_parameter(std::string(up_name) + ".g_idx", name##_->get_up_g_idx_gptq()); \ - if (name##_->has_gate_bias()) \ - this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ - if (name##_->has_up_bias()) \ - this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); -// DEPRECATED END - -} // namespace infinilm::layers::linear diff --git a/csrc/models/llama_legacy/llama.hpp b/csrc/models/llama_legacy/llama.hpp deleted file mode 100644 index 8402a1abc..000000000 --- a/csrc/models/llama_legacy/llama.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -/** - * @file llama.hpp - * @brief Main header file for Llama model architecture - * - * This header includes all components of the Llama model architecture - * built using InfiniCore::nn::Module pattern. - * - * Components: - * - LlamaConfig: Model configuration structure - * - LlamaAttention: Multi-head self-attention module - * - LlamaMLP: Feed-forward network module - * - LlamaDecoderLayer: Single transformer decoder layer - * - LlamaModel: Core transformer model (without LM head) - * - LlamaForCausalLM: Complete model with language modeling head - */ - -#include "../../config/model_config.hpp" -#include "llama_attention.hpp" -#include "llama_decoder_layer.hpp" -#include "llama_for_causal_lm.hpp" -#include "llama_mlp.hpp" -#include "llama_model.hpp" diff --git a/csrc/models/llama_legacy/llama_attention.cpp b/csrc/models/llama_legacy/llama_attention.cpp deleted file mode 100644 index e2b2350a4..000000000 --- a/csrc/models/llama_legacy/llama_attention.cpp +++ /dev/null @@ -1,439 +0,0 @@ -#include "llama_attention.hpp" - -#include "../../utils.hpp" -#include "infinicore/nn/linear.hpp" -#include "infinicore/nn/rope.hpp" -#include "infinicore/ops.hpp" -#include "infinicore/ops/mha_kvcache.hpp" -#include "infinicore/ops/mha_varlen.hpp" -#include "infinicore/ops/mul.hpp" -#include "infinicore/ops/per_tensor_dequant_i8.hpp" -#include "infinicore/ops/per_tensor_quant_i8.hpp" - -#include -#include -#include -#include -#include -#include -#include - -namespace infinilm::models::llama_legacy { - -using layers::linear::to_legacy_quant; -using layers::linear::to_legacy_quant_scheme; -using layers::linear::to_legacy_kv_quant_algo; - -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ -LlamaAttention::LlamaAttention(const LlamaConfig &config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) - : layer_idx_(layer_idx), - hidden_size_(config.hidden_size), - num_attention_heads_(config.num_attention_heads), - num_key_value_heads_(config.num_key_value_heads), - head_dim_(config.head_dim), - kv_dim_(config.kv_dim()), - use_bias_(config.attention_bias), - use_output_bias_(config.attention_output_bias), - use_qk_norm_(config.qk_norm), - max_position_embeddings_(config.max_position_embeddings), - rank_info_(rank_info), - attention_backend_(attention_backend) { - const auto &dtype{config.dtype}; - - int tp_rank = rank_info.tp_rank; - int tp_size = rank_info.tp_size; - - int num_attention_heads = config.num_attention_heads; - int num_key_value_heads = config.num_key_value_heads; - - if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) { - this->num_attention_heads_ = num_attention_heads / tp_size; - this->num_key_value_heads_ = num_key_value_heads / tp_size; - } else { - throw std::runtime_error("num_attention_heads / tp_size error."); - } - scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); - - // Initialize projection layers - INFINILM_LEGACY_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, - dtype, device, rank_info); - // Output projection uses attention_output_bias (can be different from qkv) - INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads * head_dim_, hidden_size_, use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - - // Initialize qk RMSNorm - if (use_qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, config.rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, config.rms_norm_eps, dtype, device); - } -} - -LlamaAttention::LlamaAttention(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) - : model_config_(model_config), - layer_idx_(layer_idx), - hidden_size_(model_config->get("hidden_size")), - num_attention_heads_(model_config->get("num_attention_heads")), - num_key_value_heads_(model_config->get("num_key_value_heads")), - head_dim_(model_config->get_head_dim()), - kv_dim_(model_config->get_kv_dim()), - use_bias_(model_config->get_or("attention_bias", true)), - use_output_bias_(model_config->get_or("attention_output_bias", false)), - max_position_embeddings_(model_config->get("max_position_embeddings")), - rank_info_(rank_info), - attention_backend_(attention_backend) { - const auto &dtype{model_config_->get_dtype()}; - - int tp_rank = rank_info.tp_rank; - int tp_size = rank_info.tp_size; - - int num_attention_heads = model_config_->get("num_attention_heads"); - int num_key_value_heads = model_config_->get("num_key_value_heads"); - - if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) { - this->num_attention_heads_ = num_attention_heads / tp_size; - this->num_key_value_heads_ = num_key_value_heads / tp_size; - } else { - throw std::runtime_error("num_attention_heads / tp_size error."); - } - scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); - - auto quant_scheme = to_legacy_quant_scheme(this->model_config_->get_quant_scheme()); - switch (quant_scheme) { - case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: - INFINILM_LEGACY_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - - case infinicore::quantization::QuantScheme::AWQ_W4A16: { - INFINILM_LEGACY_QKV_LINEAR_W4A16AWQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: { - - INFINILM_LEGACY_QKV_LINEAR_W4A16GPTQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, rank_info); - - INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - - break; - } - default: - INFINILM_LEGACY_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } - if (model_config_->get("model_type") == "qwen3") { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); - } - - switch (to_legacy_kv_quant_algo(this->model_config_->get_kv_quant_scheme())) { - case (infinicore::quantization::KVQuantAlgo::INT8): { - INFINICORE_NN_PARAMETER_INIT(kv_cache_k_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - INFINICORE_NN_PARAMETER_INIT(kv_cache_v_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - break; - } - default: { - break; - } - } -} - -infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths) const { - // Input shape: [batch, seq_len, hidden_size] - auto hidden_states_mutable = hidden_states; - auto shape = hidden_states->shape(); - size_t batch_size = shape[0]; - size_t seq_len = shape[1]; - - // 1. Project Q, K, V - auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable); - - if (use_qk_norm_ || model_config_->get_or("model_type", "None") == "qwen3") { - q = q_norm_->forward(q->view({batch_size * seq_len, num_attention_heads_, head_dim_})); - k = k_norm_->forward(k->view({batch_size * seq_len, num_key_value_heads_, head_dim_})); - } - - // 2. Reshape for multi-head attention - // Reshape Q, K, V to include batch dimension - // Python: query_states = self.q_proj(hidden_states).view(querys_shape) - // The view operation requires the tensor to be contiguous in the required dimensions - auto q_reshaped = q->view({batch_size, seq_len, num_attention_heads_, head_dim_}); - auto k_reshaped = k->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); - auto v_reshaped = v->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); - - // 3. Prepare position_ids for RoPE - align with Python pattern - // Python: bs, num = pos_ids.shape; pos_ids = pos_ids.view((bs * num,)) - auto pos_shape = position_ids->shape(); - infinicore::Tensor pos_ids_for_rope = position_ids; - if (pos_shape.size() == 2) { - auto pos_narrowed = position_ids->narrow({{0, 0, 1}}); - pos_ids_for_rope = pos_narrowed->contiguous()->view({pos_shape[1]}); - } else if (pos_shape.size() == 1) { - pos_ids_for_rope = position_ids->contiguous(); - } else { - throw std::runtime_error("Unexpected position_ids shape"); - } - - // 4. Apply RoPE to Q and K - auto q_rope = infinicore::Tensor::empty({batch_size, num_attention_heads_, seq_len, head_dim_}, q_reshaped->dtype(), q_reshaped->device())->permute({0, 2, 1, 3}); - rotary_emb_->forward(q_rope, q_reshaped, pos_ids_for_rope); // [bs, seq_len, n_q_head, head_dim] - rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true); // [bs, seq_len, n_kv_head, head_dim] - - infinilm::KVQuantUtils::quantize( - k_reshaped, v_reshaped, - this->model_config_->get_kv_quant_scheme(), - this->kv_cache_k_scale_, - this->kv_cache_v_scale_); - - // 5. Prepare KV caches - // Convert to [batch, n_head, seq_len, head_dim] for cache - // Ensure contiguous after permute for F16 compatibility with cache operations - q_reshaped = q_rope->permute({0, 2, 1, 3}); // [bs, n_q_head, seq_len, head_dim] - auto k_permuted = k_reshaped->permute({0, 2, 1, 3}); // [bs, n_kv_head, seq_len, head_dim] - auto v_permuted = v_reshaped->permute({0, 2, 1, 3}); // [bs, n_kv_head, seq_len, head_dim] - infinicore::Tensor k_total; // [bs, n_kv_head, max_seq_len, head_dim] - infinicore::Tensor v_total; // [bs, n_kv_head, max_seq_len, head_dim] - if (kv_cache == nullptr) { - k_total = k_permuted; - v_total = v_permuted; - } else if (auto static_kv_cache = std::dynamic_pointer_cast(kv_cache)) { - auto [k_total_tmp, v_total_tmp] = static_kv_cache->update(layer_idx_, k_permuted, v_permuted, past_sequence_lengths.value()); - k_total = k_total_tmp; - v_total = v_total_tmp; - } else { - throw std::runtime_error("LlamaAttention: Unsupported kvcache type"); - } - - infinicore::Tensor attn_output; - if (false) { - // experimental nineoothed flash attention - attn_output = infinicore::op::flash_attention(q_reshaped, k_total, v_total, total_sequence_lengths.value(), scaling_, true); - attn_output = attn_output->permute({0, 2, 1, 3}) - ->contiguous() - ->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); // [bs, seq_len, n_q_head * head_dim] - } else { - size_t total_seq_len = reinterpret_cast(total_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0]; - - infinilm::KVQuantUtils::dequantize( - k_total, v_total, - this->model_config_->get_kv_quant_scheme(), - this->kv_cache_k_scale_, - this->kv_cache_v_scale_, - q_reshaped); - - k_total = k_total->narrow({{2, 0, total_seq_len}}); // [bs, n_kv_head, total_seq_len, head_dim] - v_total = v_total->narrow({{2, 0, total_seq_len}}); // [bs, n_kv_head, total_seq_len, head_dim] - - // 6. Compute attention - size_t ngroup = num_attention_heads_ / num_key_value_heads_; - auto Q = q_reshaped->view({batch_size * num_key_value_heads_, ngroup * seq_len, head_dim_}); - auto K = k_total->view({batch_size * num_key_value_heads_, total_seq_len, head_dim_}); - auto V = v_total->view({batch_size * num_key_value_heads_, total_seq_len, head_dim_}); - - auto K_transposed = K->permute({0, 2, 1}); // [bs * n_kv_head, head_dim, total_seq_len] - - auto attn_weight = infinicore::op::matmul(Q, K_transposed, scaling_); // [bs * n_kv_head, ng * seq_len, total_seq_len] - - auto attn_weight_softmax = attn_weight->view({batch_size * num_attention_heads_, seq_len, total_seq_len}); - infinicore::op::causal_softmax_(attn_weight_softmax, attn_weight_softmax); - - auto out = infinicore::op::matmul(attn_weight, V); // [bs * n_kv_head, ng * seq_len, head_dim] - - attn_output = out->view({batch_size, num_attention_heads_, seq_len, head_dim_}) - ->permute({0, 2, 1, 3}) - ->contiguous() - ->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); // [bs, seq_len, n_q_head * head_dim] - } - - auto output = o_proj_->forward(attn_output); - - return output; -} - -infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr paged_kv_cache, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - ASSERT(block_tables.has_value()); - ASSERT(slot_mapping.has_value()); - - // Input shape: [batch, seq_len, hidden_size] - auto hidden_states_mutable = hidden_states; - auto shape = hidden_states->shape(); - size_t batch_size = shape[0]; - size_t seq_len = shape[1]; - - // Only support batchsize==1, all requests should be flattened along seqlen dimension - ASSERT_EQ(batch_size, 1); - // Decode only if total_len == num_requests - bool is_prefill = (seq_len != total_sequence_lengths.value()->shape()[0]); - - // 1. Project Q, K, V - auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable); - - // 2. Reshape for multi-head attention - - // Reshape Q, K, V to include batch dimension - // Python: query_states = self.q_proj(hidden_states).view(querys_shape) - // The view operation requires the tensor to be contiguous in the required dimensions - auto q_reshaped = q->view({seq_len, num_attention_heads_, head_dim_}); - auto k_reshaped = k->view({seq_len, num_key_value_heads_, head_dim_}); - auto v_reshaped = v->view({seq_len, num_key_value_heads_, head_dim_}); - - if (use_qk_norm_ || model_config_->get_or("model_type", "None") == "qwen3") { - q_reshaped = q_norm_->forward(q_reshaped); - k_reshaped = k_norm_->forward(k_reshaped); - } - - // 3. Prepare position_ids for RoPE - align with Python pattern - auto pos_shape = position_ids->shape(); - infinicore::Tensor pos_ids_for_rope = position_ids; - if (pos_shape.size() == 2) { - auto pos_narrowed = position_ids->narrow({{0, 0, 1}}); - pos_ids_for_rope = pos_narrowed->view({pos_shape[1]}); - } else if (pos_shape.size() == 1) { - pos_ids_for_rope = position_ids; - } else { - throw std::runtime_error("Unexpected position_ids shape"); - } - - // 4. Apply RoPE to Q and K - rotary_emb_->forward(q_reshaped, pos_ids_for_rope, true); // [bs, seq_len, n_q_head, head_dim] - rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true); // [bs, seq_len, n_kv_head, head_dim] - - // 5. Prepare KV caches - // Ensure contiguous after permute for F16 compatibility with cache operations - auto [k_total, v_total] = paged_kv_cache->update(layer_idx_, - k_reshaped, - v_reshaped, - slot_mapping.value()); - - // 6. Compute attention - infinicore::Tensor attn_output = infinicore::Tensor::empty({seq_len, num_attention_heads_, head_dim_}, q_reshaped->dtype(), q_reshaped->device()); - - if (is_prefill) { - if (attention_backend_ == backends::AttentionBackend::FLASH_ATTN) { - infinicore::op::mha_varlen_( - attn_output, - q_reshaped, - k_total->permute({0, 2, 1, 3}), - v_total->permute({0, 2, 1, 3}), - input_offsets.value(), - cu_seqlens.value(), - block_tables.value(), - max_position_embeddings_, - max_position_embeddings_, - std::nullopt, - scaling_); - } else { - infinicore::op::paged_attention_prefill_( - attn_output, - q_reshaped, - k_total, - v_total, - block_tables.value(), - total_sequence_lengths.value(), - input_offsets.value(), - std::nullopt, - scaling_); - } - } else { - if (attention_backend_ == backends::AttentionBackend::FLASH_ATTN) { - // FA2 decode path: flash::mha_fwd_kvcache - // In paged-attn mode, seq_len = actual batch_size (one query token per sequence). - // q_reshaped: [seq_len, num_heads, head_dim] → [seq_len, 1, num_heads, head_dim] - // k/v cache: [num_blocks, num_kv_heads, block_size, head_dim] - // → permute {0,2,1,3} → [num_blocks, block_size, num_kv_heads, head_dim] - auto q_for_fa = q_reshaped->view({seq_len, 1, num_attention_heads_, head_dim_}); - auto attn_out_4d = infinicore::op::mha_kvcache( - q_for_fa, - k_total->permute({0, 2, 1, 3}), // [num_blocks, block_size, num_kv_heads, head_dim] - v_total->permute({0, 2, 1, 3}), - total_sequence_lengths.value(), // [seq_len] int32 (one entry per sequence) - block_tables.value(), // [seq_len, max_num_blocks_per_seq] int32 - std::nullopt, - scaling_); - attn_output = attn_out_4d->view({seq_len, num_attention_heads_, head_dim_}); - } else { - infinicore::op::paged_attention_( - attn_output, - q_reshaped, - k_total, - v_total, - block_tables.value(), - total_sequence_lengths.value(), - std::nullopt, - scaling_); - } - } - - // 7. Project output - attn_output - = attn_output->view({1, seq_len, num_attention_heads_ * head_dim_}); - return o_proj_->forward(attn_output); -} - -infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - if (!rotary_emb_) { - throw std::runtime_error("LlamaAttention: rotary_emb not configured"); - } - - infinicore::Tensor output; - if (auto paged_kv_cache = std::dynamic_pointer_cast(kv_cache)) { - output = forward_paged_(hidden_states, position_ids, paged_kv_cache, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping); - } else { - - output = forward_(hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths); - } - return output; -} - -void LlamaAttention::set_rotary_emb(const std::shared_ptr &rotary_emb) { - rotary_emb_ = rotary_emb; -} - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_attention.hpp b/csrc/models/llama_legacy/llama_attention.hpp deleted file mode 100644 index 6579be438..000000000 --- a/csrc/models/llama_legacy/llama_attention.hpp +++ /dev/null @@ -1,142 +0,0 @@ -#pragma once - -#include "../../backends/attention_backends.hpp" -#include "../../cache/kv_cache.hpp" -#include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" -#include "../../layers/linear/fused_linear.hpp" -#include "../../layers/quantization/kv_quant.hpp" -#include "legacy_fused_linear.hpp" -#include "llama_config.hpp" - -#include "infinicore/nn/linear.hpp" -#include "infinicore/nn/module.hpp" -#include "infinicore/nn/rmsnorm.hpp" -#include "infinicore/nn/rope.hpp" -#include "infinicore/tensor.hpp" -#include "llama_config.hpp" -#include -#include -#include - -namespace infinilm::models::llama_legacy { - -class LlamaAttention : public infinicore::nn::Module { -public: - /** - * @brief Construct LlamaAttention module - * - * @param config Model configuration - * @param device Device to create tensors on - * @param layer_idx Layer index for cache access - * @param dtype Optional data type for model parameters (defaults to F32) - */ - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ - LlamaAttention(const LlamaConfig &config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - - LlamaAttention(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - - /** - * @brief Forward pass: compute attention - * - * @param hidden_states Input tensor of shape [batch, seq_len, hidden_size] - * @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len] - * @param kv_cache Optional model-level KV cache for incremental decoding - * @return Output tensor of shape [batch, seq_len, hidden_size] - */ - infinicore::Tensor forward(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; - - /** - * @brief Get the layer index - */ - size_t layer_idx() const { return layer_idx_; } - - /** - * @brief Provide shared RoPE module from parent model. - */ - void set_rotary_emb(const std::shared_ptr &rotary_emb); - - // Module information - size_t num_heads() const { return num_attention_heads_; } - size_t num_kv_heads() const { return num_key_value_heads_; } - size_t head_dim() const { return head_dim_; } - size_t hidden_size() const { return hidden_size_; } - -private: - infinicore::Tensor forward_(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths) const; - - infinicore::Tensor forward_paged_(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; - -protected: - // Projection layers - INFINICORE_NN_MODULE(layers::linear::LegacyQKVParallelLinear, qkv_proj); - INFINICORE_NN_MODULE(infinicore::nn::RowParallelLinear, o_proj); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm); - - engine::distributed::RankInfo rank_info_; - - // Shared Rotary Position Embeddings (RoPE) - std::shared_ptr rotary_emb_; - - // For off-line kv cache quantization - INFINICORE_NN_PARAMETER(kv_cache_k_scale); - INFINICORE_NN_PARAMETER(kv_cache_v_scale); - -private: - std::shared_ptr model_config_ = std::make_shared(); - size_t layer_idx_; // Layer index for cache access - size_t hidden_size_; - size_t num_attention_heads_; - size_t num_key_value_heads_; - size_t head_dim_; - size_t kv_dim_; - bool use_bias_; // Bias for Q/K/V projections - bool use_output_bias_; // Bias for output projection (o_proj) - bool use_qk_norm_ = false; // Whether to use QK RMSNorm - size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility) - - float scaling_; - - backends::AttentionBackend attention_backend_; -}; - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_config.hpp b/csrc/models/llama_legacy/llama_config.hpp deleted file mode 100644 index 44cee1b89..000000000 --- a/csrc/models/llama_legacy/llama_config.hpp +++ /dev/null @@ -1,95 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include "../infinilm_model.hpp" - -#include - -namespace infinilm::models::llama_legacy { - -/** - * @brief Configuration structure for Llama model architecture - * - * This struct holds all hyperparameters needed to construct a Llama model. - * It follows the same structure as HuggingFace's LlamaConfig. - */ -struct LlamaConfig : public InfinilmModel::Config { - // Data type - infinicore::DataType dtype = infinicore::DataType::F32; - - // Vocabulary and embedding - size_t vocab_size = 32000; // Vocabulary size - size_t hidden_size = 4096; // Hidden dimension size - size_t intermediate_size = 11008; // MLP intermediate dimension - - // Architecture - size_t num_hidden_layers = 32; // Number of decoder layers - size_t num_attention_heads = 32; // Number of attention heads - size_t num_key_value_heads = 32; // Number of key-value heads (for GQA) - size_t head_dim = 128; // Attention head dimension (hidden_size / num_attention_heads) - - // Position embeddings - size_t max_position_embeddings = 2048; // Maximum sequence length - double rope_theta = 10000.0; // RoPE base frequency - - std::shared_ptr rope_scaling = nullptr; // RoPE scaling type - - // Normalization - double rms_norm_eps = 1e-6; // RMSNorm epsilon - - // Activation - std::string hidden_act = "silu"; // Activation function (typically "silu") - std::string model_type = "llama"; // Model type identifier (matches HF configs) - - // Optional features - bool use_cache = true; // Whether to use KV cache - bool attention_bias = true; // Whether to use bias in Q/K/V projections (default true for 9G7B compatibility) - bool attention_output_bias = false; // Whether to use bias in output projection (o_proj) - bool mlp_bias = false; // Whether to use bias in MLP projections - bool tie_word_embeddings = false; // Whether to tie input/output embeddings - bool qk_norm = false; // Whether to use QK RMSNorm - - // Training/initialization parameters - double attention_dropout = 0.0; // Dropout ratio for attention probabilities - double initializer_range = 0.02; // Standard deviation for weight initialization - size_t pretraining_tp = 1; // Tensor parallelism rank used during pretraining - - // Model metadata - std::string name_or_path = ""; // Model name or path identifier - - // Token IDs - int64_t pad_token_id = -1; // Padding token ID (optional) - std::vector bos_token_id = {1}; // Beginning of sequence token ID(s) - std::vector eos_token_id = {2}; // End of sequence token ID(s) - - /** - * @brief Compute key-value dimension for Grouped Query Attention (GQA) - * @return The dimension for key/value projections - */ - size_t kv_dim() const { - return hidden_size * num_key_value_heads / num_attention_heads; - } - - /** - * @brief Validate configuration parameters - * @return true if configuration is valid - */ - bool validate() const { - if (hidden_size % num_attention_heads != 0) { - return false; - } - if (num_attention_heads % num_key_value_heads != 0) { - return false; - } - if (head_dim != hidden_size / num_attention_heads) { - return false; - } - return true; - } -}; - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_decoder_layer.cpp b/csrc/models/llama_legacy/llama_decoder_layer.cpp deleted file mode 100644 index 0cb7fb83a..000000000 --- a/csrc/models/llama_legacy/llama_decoder_layer.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include "llama_decoder_layer.hpp" -#include "infinicore/nn/rmsnorm.hpp" -#include "infinicore/ops.hpp" -#include - -namespace infinilm::models::llama_legacy { - -LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) { - const auto &dtype{model_config_->get_dtype()}; - input_layernorm_ = this->register_module("input_layernorm", model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), - dtype, device); - post_attention_layernorm_ = this->register_module("post_attention_layernorm", model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), - dtype, device); - - self_attn_ = this->register_module("self_attn", model_config_, device, layer_idx, rank_info_, attention_backend); - mlp_ = this->register_module("mlp", model_config_, device, rank_info_); -} - -std::tuple -LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states, - infinicore::Tensor &residual, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - // 1. Attention layer normalization - input_layernorm_->forward_inplace(hidden_states, residual); - - // 2. Self-attention - hidden_states = self_attn_->forward( - hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping); - - // 3. Post-attention layer normalization - post_attention_layernorm_->forward_inplace(hidden_states, residual); - - // 4. MLP - hidden_states = mlp_->forward(hidden_states); - - return std::make_tuple(hidden_states, residual); -} - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_decoder_layer.hpp b/csrc/models/llama_legacy/llama_decoder_layer.hpp deleted file mode 100644 index 9943639b1..000000000 --- a/csrc/models/llama_legacy/llama_decoder_layer.hpp +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once - -#include "infinicore/device.hpp" -#include "infinicore/nn/module.hpp" -#include "infinicore/nn/rmsnorm.hpp" -#include "infinicore/tensor.hpp" -#include "llama_attention.hpp" -#include "llama_config.hpp" -#include "llama_mlp.hpp" - -#include "../../engine/distributed/distributed.hpp" - -namespace infinilm::models::llama_legacy { - -/** - * @brief Single decoder layer (transformer block) for Llama - * - * Each decoder layer consists of: - * - Input layer normalization (RMSNorm) - * - Self-attention mechanism - * - Post-attention layer normalization (RMSNorm) - * - MLP feed-forward network - * - * Residual connections are applied around both attention and MLP blocks. - */ -class LlamaDecoderLayer : public infinicore::nn::Module { -public: - /** - * @brief Construct LlamaDecoderLayer module - * - * @param config Model configuration - * @param device Device to create tensors on - * @param layer_idx Layer index for cache management and debugging - * @param dtype Optional data type for model parameters (defaults to F32) - */ - LlamaDecoderLayer(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - - /** - * @brief Forward pass: process one decoder layer - * - * @param hidden_states [batch, seq_len, hidden_size], will be modified - * @param residual [batch, seq_len, hidden_size], will be modified - * @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len] - * @param kv_cache Optional KV cache for incremental decoding - * @return Output tensor of shape [batch, seq_len, hidden_size] - * Updated residual tensor of shape [batch, seq_len, hidden_size] - */ - std::tuple - forward(infinicore::Tensor &hidden_states, - infinicore::Tensor &residual, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mappin) const; - - /** - * @brief Get the layer index - */ - size_t layer_idx() const { return layer_idx_; } - - void set_rotary_emb(const std::shared_ptr &rotary_emb) { - if (self_attn_) { - self_attn_->set_rotary_emb(rotary_emb); - } - } - -protected: - // Layer normalization - std::shared_ptr input_layernorm_; - std::shared_ptr post_attention_layernorm_; - - std::shared_ptr self_attn_; - std::shared_ptr mlp_; - engine::distributed::RankInfo rank_info_; - std::shared_ptr model_config_; - -private: - size_t layer_idx_; // Layer index for cache management and debugging -}; - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_for_causal_lm.cpp b/csrc/models/llama_legacy/llama_for_causal_lm.cpp deleted file mode 100644 index 2b0f5d72e..000000000 --- a/csrc/models/llama_legacy/llama_for_causal_lm.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "llama_for_causal_lm.hpp" -#include "infinicore/context/context.hpp" -#include "infinicore/nn/linear.hpp" -#include "infinicore/ops.hpp" -namespace infinilm::models::llama_legacy { - -LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) { - spdlog::warn("infinilm::models::llama_legacy: LlamaForCausalLM is no longer supported, please use the new model instead."); - - device_ = device; - const auto &dtype{model_config->get_dtype()}; - - model_ = this->register_module("model", model_config, device, rank_info, attention_backend); - lm_head_ = this->register_module("lm_head", model_config->get("hidden_size"), model_config->get("vocab_size"), false, - dtype, device); -} - -LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const { - auto input_ids = input.input_ids.value(); - auto position_ids = input.position_ids.value(); - auto past_sequence_lengths = input.past_sequence_lengths; - auto total_sequence_length = input.total_sequence_lengths; - auto input_offsets = input.input_offsets; - auto cu_seqlens = input.cu_seqlens; - auto block_tables = input.block_tables; - auto slot_mapping = input.slot_mapping; - - auto hidden_states = model_->forward( - input_ids, position_ids, past_sequence_lengths, total_sequence_length, input_offsets, cu_seqlens, block_tables, slot_mapping); - - auto logits = lm_head_->forward(hidden_states); - return {logits}; -} - -infinicore::Tensor LlamaForCausalLM::logits_from_hidden(const infinicore::Tensor &hidden_states) const { - return lm_head_->forward(const_cast(hidden_states)); -} - -void LlamaForCausalLM::reset_cache(const cache::CacheConfig *cache_config) { - cache_config_ = cache_config->unique_copy(); - model_->reset_cache(cache_config_.get()); -} - -const cache::CacheConfig *LlamaForCausalLM::get_cache_config() const { - return cache_config_.get(); -} - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_for_causal_lm.hpp b/csrc/models/llama_legacy/llama_for_causal_lm.hpp deleted file mode 100644 index 1920dbaf6..000000000 --- a/csrc/models/llama_legacy/llama_for_causal_lm.hpp +++ /dev/null @@ -1,63 +0,0 @@ -#pragma once - -#include "../infinilm_model.hpp" -#include "llama_model.hpp" - -#include "infinicore/device.hpp" -#include "infinicore/nn/linear.hpp" -#include "infinicore/nn/module.hpp" -#include "infinicore/tensor.hpp" - -#include "../../engine/distributed/distributed.hpp" - -namespace infinilm::models::llama_legacy { - -/** - * @brief Llama model for Causal Language Modeling - * - * Extends LlamaModel by adding a language modeling head (lm_head) that - * projects hidden states to vocabulary logits. - * - * This matches the structure of HuggingFace's LlamaForCausalLM. - */ -class LlamaForCausalLM : public InfinilmModel { -public: - /** - * @brief Construct LlamaForCausalLM module - * - * @param config Model configuration - * @param device Device to create tensors on - */ - LlamaForCausalLM(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - - /** - * @brief Forward pass: compute language modeling logits - * - * @param input Encapsulated input tensors and other parameters - * @return Output structure containing the result - */ - Output forward(const Input &input) const; - - infinicore::Tensor logits_from_hidden(const infinicore::Tensor &hidden_states) const; - - void reset_cache(const cache::CacheConfig *cache_config) override; - - const cache::CacheConfig *get_cache_config() const override; - - // Module information - LlamaModel &model() { return *model_; } - const LlamaModel &model() const { return *model_; } - -protected: - INFINICORE_NN_MODULE(LlamaModel, model); - - // Language modeling head - INFINICORE_NN_MODULE(infinicore::nn::Linear, lm_head); - - std::unique_ptr cache_config_; -}; - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_mlp.cpp b/csrc/models/llama_legacy/llama_mlp.cpp deleted file mode 100644 index 65daffac5..000000000 --- a/csrc/models/llama_legacy/llama_mlp.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "llama_mlp.hpp" -#include "infinicore/nn/linear.hpp" -#include "infinicore/ops.hpp" - -namespace infinilm::models::llama_legacy { - -using layers::linear::to_legacy_quant; -using layers::linear::to_legacy_quant_scheme; -/** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ -LlamaMLP::LlamaMLP(const LlamaConfig &config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : hidden_size_(config.hidden_size), - intermediate_size_(config.intermediate_size), - use_bias_(config.mlp_bias), rank_info_(rank_info) { - const auto &dtype{config.dtype}; - - int tp_rank = rank_info.tp_rank; - int tp_size = rank_info.tp_size; - - // Initialize projection layers - INFINILM_LEGACY_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); -} - -LlamaMLP::LlamaMLP(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : model_config_(model_config), hidden_size_(model_config->get("hidden_size")), - intermediate_size_(model_config->get("intermediate_size")), - use_bias_(model_config->get_or("mlp_bias", false)), rank_info_(rank_info) { - - const auto &dtype{model_config_->get_dtype()}; - - int tp_rank = rank_info.tp_rank; - int tp_size = rank_info.tp_size; - - // Initialize projection layers - auto quant_scheme = to_legacy_quant_scheme(this->model_config_->get_quant_scheme()); - switch (quant_scheme) { - case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: - INFINILM_LEGACY_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - case infinicore::quantization::QuantScheme::AWQ_W4A16: - INFINILM_LEGACY_GATE_UP_LINEAR_W4A16AWQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: - INFINILM_LEGACY_GATE_UP_LINEAR_W4A16GPTQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - default: - INFINILM_LEGACY_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); - break; - } -} - -infinicore::Tensor LlamaMLP::forward(const infinicore::Tensor &hidden_states) const { - // 1. Project to gate and up - auto hidden_states_mutable = hidden_states; - auto [gate, up] = gate_up_proj_->forward_split(hidden_states_mutable); - - // 2. Apply SwiGLU: silu(gate) * up - // Note: swiglu kernel expects (up, gate) and computes gate * sigmoid(gate) * up - // So we pass (up, gate) to get the correct result: gate * sigmoid(gate) * up - auto intermediate = infinicore::op::swiglu(up, gate); - - // 3. Project down - auto output = down_proj_->forward(intermediate); - - return output; -} - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_mlp.hpp b/csrc/models/llama_legacy/llama_mlp.hpp deleted file mode 100644 index d89518cfe..000000000 --- a/csrc/models/llama_legacy/llama_mlp.hpp +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -#include "./legacy_fused_linear.hpp" -#include "llama_config.hpp" - -#include "../../config/model_config.hpp" -#include "infinicore/device.hpp" -#include "infinicore/nn/linear.hpp" -#include "infinicore/nn/module.hpp" -#include "infinicore/tensor.hpp" -#include "llama_config.hpp" - -#include "../../engine/distributed/distributed.hpp" - -namespace infinilm::models::llama_legacy { - -/** - * @brief MLP (Feed-Forward Network) module for Llama - * - * Implements the MLP block with: - * - Gate projection - * - Up projection - * - Down projection - * - SiLU activation function - * - * Formula: down_proj(SiLU(gate_proj(x)) * up_proj(x)) - */ -class LlamaMLP : public infinicore::nn::Module { -public: - /** - * @brief Construct LlamaMLP module - * - * @param config Model configuration - * @param device Device to create tensors on - * @param dtype Optional data type for model parameters (defaults to F32) - */ - /** - * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). - * - * ⚠️ DEVELOPMENT POLICY: - * - NO new development or feature additions permitted on this interface - * - Only critical bug fixes (security/stability) allowed until removal - * - All new code MUST migrate to the polymorphic overload below - * - * Replacement: Use the polymorphic overload of this same function name with updated signature - * Reason: Legacy signature lacks support for dynamic quantization modes. - * Removal target: v0.2.0 (Q2 2026) - */ - LlamaMLP(const LlamaConfig &config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - LlamaMLP(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - /** - * @brief Forward pass: compute MLP output - * - * @param hidden_states Input tensor of shape [batch, seq_len, hidden_size] - * @return Output tensor of shape [batch, seq_len, hidden_size] - */ - infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const; - - // Module information - size_t hidden_size() const { return hidden_size_; } - size_t intermediate_size() const { return intermediate_size_; } - -protected: - INFINICORE_NN_MODULE(layers::linear::LegacyGateUpParallelLinear, gate_up_proj); - INFINICORE_NN_MODULE(infinicore::nn::RowParallelLinear, down_proj); - - engine::distributed::RankInfo rank_info_; - size_t hidden_size_; - size_t intermediate_size_; - bool use_bias_; - - std::shared_ptr model_config_; -}; - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_model.cpp b/csrc/models/llama_legacy/llama_model.cpp deleted file mode 100644 index b7ffea2cf..000000000 --- a/csrc/models/llama_legacy/llama_model.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include "llama_model.hpp" -#include "../../layers/rotary_embedding/rotary_embedding_factory.hpp" -#include "infinicore/nn/embedding.hpp" -#include "infinicore/nn/rmsnorm.hpp" -#include "infinicore/nn/rope.hpp" -#include "infinicore/ops.hpp" -#include - -namespace infinilm::models::llama_legacy { - -LlamaModel::LlamaModel(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) - : model_config_(model_config), rank_info_(rank_info) { - const auto &dtype{model_config_->get_dtype()}; - INFINICORE_NN_MODULE_INIT(embed_tokens, model_config_->get("vocab_size"), model_config_->get("hidden_size"), - std::nullopt, dtype, device); - layers_.reserve(model_config_->get("num_hidden_layers")); - for (size_t i = 0; i < model_config_->get("num_hidden_layers"); ++i) { - layers_.push_back(this->register_module( - "layers." + std::to_string(i), model_config_, device, i, rank_info, attention_backend)); - } - INFINICORE_NN_MODULE_INIT(norm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), - dtype, device); - auto rope_scaling_config = infinilm::layers::rotary_embedding::make_scaling_config(model_config_); - INFINICORE_NN_MODULE_INIT(rotary_emb, model_config_->get_head_dim(), model_config->get_rotary_dim(), model_config_->get("max_position_embeddings"), - model_config_->get("rope_theta"), infinicore::nn::RoPE::Algo::GPT_NEOX, - dtype, device, rope_scaling_config); - - for (auto &layer : layers_) { - if (layer) { - layer->set_rotary_emb(rotary_emb_); - } - } -} - -infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids, - const infinicore::Tensor &position_ids, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - // 1. Embed tokens: input_ids -> [batch, seq_len, hidden_size] - auto hidden_states = embed_tokens_->forward(input_ids); - - // 2. Process through all decoder layers - size_t num_layers = layers_.size(); - infinicore::Tensor residual; - for (size_t i = 0; i < num_layers; ++i) { - layers_.at(i)->forward( - hidden_states, - residual, - position_ids, - kv_cache_, - past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); - } - - norm_->forward_inplace(hidden_states, residual); - - return hidden_states; -} - -infinicore::Tensor LlamaModel::forward_embeds(const infinicore::Tensor &inputs_embeds, - const infinicore::Tensor &position_ids, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - auto hidden_states = inputs_embeds; - size_t num_layers = layers_.size(); - infinicore::Tensor residual; - for (size_t i = 0; i < num_layers; ++i) { - layers_.at(i)->forward(hidden_states, residual, position_ids, kv_cache_, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping); - } - norm_->forward_inplace(hidden_states, residual); - - return hidden_states; -} - -infinicore::Tensor LlamaModel::embed_tokens(const infinicore::Tensor &input_ids) const { - return embed_tokens_->forward(input_ids); -} - -void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { - if (cache_config == nullptr) { - kv_cache_ = nullptr; - return; - } - if (auto kv_cache_config = dynamic_cast(cache_config)) { - kv_cache_ = std::make_shared( - model_config_->get_head_dim(), - model_config_->get_head_dim(), - model_config_->get("num_key_value_heads"), - model_config_->get("num_key_value_heads"), - model_config_->get("num_hidden_layers"), - model_config_->get("max_position_embeddings"), - model_config_->get_kv_cache_dtype(), - *kv_cache_config, - rank_info_); - } else if (auto paged_kv_cache_config = dynamic_cast(cache_config)) { - kv_cache_ = std::make_shared( - model_config_->get_head_dim(), - model_config_->get_head_dim(), - model_config_->get("num_key_value_heads"), - model_config_->get("num_key_value_heads"), - model_config_->get("num_hidden_layers"), - model_config_->get_kv_cache_dtype(), - *paged_kv_cache_config, - rank_info_); - } else { - throw std::runtime_error("Unsupported cache type"); - } -} - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/llama_legacy/llama_model.hpp b/csrc/models/llama_legacy/llama_model.hpp deleted file mode 100644 index d08c51e4d..000000000 --- a/csrc/models/llama_legacy/llama_model.hpp +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once - -#include "../../cache/kv_cache.hpp" -#include "llama_decoder_layer.hpp" - -#include "infinicore/nn/embedding.hpp" -#include "infinicore/nn/module.hpp" -#include "infinicore/nn/rmsnorm.hpp" -#include "infinicore/nn/rope.hpp" -#include "infinicore/tensor.hpp" -#include "llama_config.hpp" -#include "llama_decoder_layer.hpp" -#include -#include - -#include "../../engine/distributed/distributed.hpp" - -namespace infinilm::models::llama_legacy { - -/** - * @brief Main Llama model architecture (without language modeling head) - * - * This is the core transformer model consisting of: - * - Token embeddings (embed_tokens) - * - Multiple decoder layers (layers) - * - Final layer normalization (norm) - * - Rotary Position Embeddings (rotary_emb) - * - * This matches the structure of HuggingFace's LlamaModel. - */ -class LlamaModel : public infinicore::nn::Module { -public: - /** - * @brief Construct LlamaModel module - * - * @param config Model configuration - * @param device Device to create tensors on - * @param dtype Optional data type for model parameters (defaults to F32) - */ - LlamaModel(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - - /** - * @brief Forward pass: process input through the model - * - * @param input_ids Token IDs tensor of shape [batch, seq_len]. Batch is 1 when continuous batch is used, - * and tokens from all requests are concatenated along seq_len dimension. - * @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len] - * @param past_sequence_lengths Cache positions tensor of shape [n_req] - * @param total_sequence_lengths Total sequence lengths tensor of shape [n_req] - * @param input_offsets Input offsets (starting position) of each request in a continuous batch of shape [n_req + 1] - * @return Output tensor of shape [batch, seq_len, hidden_size] - */ - infinicore::Tensor forward(const infinicore::Tensor &input_ids, - const infinicore::Tensor &position_ids, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; - - infinicore::Tensor forward_embeds(const infinicore::Tensor &inputs_embeds, - const infinicore::Tensor &position_ids, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; - - infinicore::Tensor embed_tokens(const infinicore::Tensor &input_ids) const; - - void reset_cache(const cache::CacheConfig *cache_config); - - // Module information - size_t num_layers() const { return model_config_->get("num_hidden_layers"); } - -protected: - INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens); - - INFINICORE_NN_MODULE_VEC(LlamaDecoderLayer, layers); - - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm); - - INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb); - - engine::distributed::RankInfo rank_info_; - - std::shared_ptr kv_cache_; - -private: - std::shared_ptr model_config_; -}; - -} // namespace infinilm::models::llama_legacy diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index f11948672..0a54257e3 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,29 +1,8 @@ #include "model_factory.hpp" -#include "llama_legacy/llama_for_causal_lm.hpp" #include "models_registry.hpp" namespace infinilm { -std::shared_ptr InfinilmModelFactory::createModel( - std::shared_ptr model_config, - engine::distributed::RankInfo rank_info, - const cache::CacheConfig *cache, - backends::AttentionBackend attention_backend) { - std::shared_ptr model; - if (true) { - model = std::make_shared( - model_config, rank_info.device, rank_info, attention_backend); - } else { - throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); - } - - if (cache) { - model->reset_cache(cache); - } - - return model; -} - std::shared_ptr InfinilmModelFactory::createModel( std::shared_ptr model_config, const infinicore::Device &device, diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index b3c476f11..87108f93f 100644 --- a/csrc/models/model_factory.hpp +++ b/csrc/models/model_factory.hpp @@ -1,19 +1,11 @@ #pragma once -#include "../backends/attention_backends.hpp" -#include "../engine/distributed/distributed.hpp" #include "infinilm_model.hpp" namespace infinilm { class InfinilmModelFactory { public: - static std::shared_ptr createModel( - std::shared_ptr model_config, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - const cache::CacheConfig *cache = nullptr, - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - static std::shared_ptr createModel( std::shared_ptr model_config, const infinicore::Device &device, diff --git a/csrc/models/qwen2/qwen2_for_causal_lm.cpp b/csrc/models/qwen2/qwen2_for_causal_lm.cpp index 8be5294cc..2191197ef 100644 --- a/csrc/models/qwen2/qwen2_for_causal_lm.cpp +++ b/csrc/models/qwen2/qwen2_for_causal_lm.cpp @@ -14,7 +14,7 @@ std::shared_ptr create_qwen2_model_config(std::sh if (!config_json.contains("head_dim")) { size_t head_dim = model_config->get("hidden_size") - / model_config->get("num_attention_heads"); + / model_config->get("num_attention_heads"); config_json["head_dim"] = head_dim; } @@ -25,13 +25,9 @@ std::shared_ptr create_qwen2_model_config(std::sh namespace { -#ifndef USE_CLASSIC_LLAMA - INFINILM_REGISTER_CAUSAL_LM_MODEL( qwen2, infinilm::models::qwen2::Qwen2ForCausalLM, infinilm::models::qwen2::create_qwen2_model_config); -#endif - } // namespace diff --git a/csrc/models/qwen3_moe/qwen3_moe_experts.hpp b/csrc/models/qwen3_moe/qwen3_moe_experts.hpp index 50210b2ae..90f5739bf 100644 --- a/csrc/models/qwen3_moe/qwen3_moe_experts.hpp +++ b/csrc/models/qwen3_moe/qwen3_moe_experts.hpp @@ -1,5 +1,3 @@ - - #pragma once #include "../../layers/common_modules.hpp" diff --git a/csrc/models/qwen3_moe/qwen3_moe_topk_router.hpp b/csrc/models/qwen3_moe/qwen3_moe_topk_router.hpp index 60b30cfb8..dadadb3ff 100644 --- a/csrc/models/qwen3_moe/qwen3_moe_topk_router.hpp +++ b/csrc/models/qwen3_moe/qwen3_moe_topk_router.hpp @@ -1,5 +1,3 @@ - - #pragma once #include "../../layers/common_modules.hpp" diff --git a/csrc/pybind11/bindings.cc b/csrc/pybind11/bindings.cc index 0be1c04b5..63846338b 100644 --- a/csrc/pybind11/bindings.cc +++ b/csrc/pybind11/bindings.cc @@ -2,15 +2,14 @@ #include "cache/cache.hpp" #include "engine/engine.hpp" -#include "models/llama_legacy.hpp" namespace py = pybind11; PYBIND11_MODULE(_infinilm, m) { - m.doc() = "InfiniLM Llama model Python bindings"; + m.doc() = "InfiniLM Python bindings"; infinilm::cache::bind_cache(m); - infinilm::models::llama_legacy::bind_llama(m); + infinilm::engine::bind_hook_registry(m); infinilm::engine::distributed::bind_dist_config(m); infinilm::engine::bind_infer_engine(m); } diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index 8e470984e..6022f25ec 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -1,3 +1,4 @@ +#include "../../debug_utils/hooks.hpp" #include "../../engine/infer_engine.hpp" #include "infinicore/tensor.hpp" #include @@ -28,12 +29,37 @@ inline void bind_dist_config(py::module &m) { namespace infinilm::engine { +inline void bind_hook_registry(py::module &m) { + using infinilm::models::debug_utils::HookRegistry; + + // TODO: HookRegistry should be moved out from Llama-specific bindings to InfiniCore as common utils in future work + // Bind HookRegistry + py::class_>(m, "HookRegistry") + .def(py::init<>()) + .def( + "register_hook", [](HookRegistry &self, const std::string &name, py::object callback) { + // Convert Python callable to C++ function + self.register_hook(name, [callback](const std::string &hook_name, const infinicore::Tensor &tensor, int layer_idx) { + try { + // Call Python callback with hook name, tensor, and layer index + callback(hook_name, tensor, layer_idx); + } catch (const py::error_already_set &e) { + // Re-raise Python exception + throw; + } + }); + }, + py::arg("name"), py::arg("callback")) + .def("clear", &HookRegistry::clear) + .def("has_hooks", &HookRegistry::has_hooks); +} + inline void bind_infer_engine(py::module &m) { py::class_> infer_engine(m, "InferEngine"); infer_engine .def(py::init([]( - const std::string &model_path, + const std::string &config_str, const distributed::DistConfig &dist, infinicore::Device::Type dev, std::shared_ptr cache_cfg, @@ -41,7 +67,7 @@ inline void bind_infer_engine(py::module &m) { const std::string &attention_backend, std::optional kv_cache_dtype) { return std::make_shared( - model_path, + config_str, dist, dev, cache_cfg ? cache_cfg.get() : nullptr, @@ -49,7 +75,7 @@ inline void bind_infer_engine(py::module &m) { infinilm::backends::parse_attention_backend(attention_backend), kv_cache_dtype); }), - py::arg("model_path") = "", + py::arg("config_str") = "", py::arg("distributed_config") = distributed::DistConfig(), py::arg("device_type") = infinicore::context::getDevice().getType(), py::arg("cache_config") = py::none(), diff --git a/csrc/pybind11/models/llama_legacy.hpp b/csrc/pybind11/models/llama_legacy.hpp deleted file mode 100644 index 67c8e74f1..000000000 --- a/csrc/pybind11/models/llama_legacy.hpp +++ /dev/null @@ -1,216 +0,0 @@ -#pragma once - -#include "../../cache/kv_cache.hpp" -#include "../../models/debug_utils/hooks.hpp" -#include "../../models/llama_legacy/llama.hpp" -#include "../../models/llama_legacy/llama_attention.hpp" -#include "infinicore/device.hpp" -#include "infinicore/nn/module.hpp" -#include "infinicore/nn/rope.hpp" -#include "infinicore/tensor.hpp" -#include -#include -#include - -namespace py = pybind11; -using infinicore::Device; -using infinilm::models::debug_utils::HookRegistry; - -namespace infinilm::models::llama_legacy { - -inline void bind_llama(py::module &m) { - // TODO: HookRegistry should be moved out from Llama-specific bindings to InfiniCore as common utils in future work - // Bind HookRegistry - py::class_>(m, "HookRegistry") - .def(py::init<>()) - .def( - "register_hook", [](HookRegistry &self, const std::string &name, py::object callback) { - // Convert Python callable to C++ function - self.register_hook(name, [callback](const std::string &hook_name, const infinicore::Tensor &tensor, int layer_idx) { - try { - // Call Python callback with hook name, tensor, and layer index - callback(hook_name, tensor, layer_idx); - } catch (const py::error_already_set &e) { - // Re-raise Python exception - throw; - } - }); - }, - py::arg("name"), py::arg("callback")) - .def("clear", &HookRegistry::clear) - .def("has_hooks", &HookRegistry::has_hooks); - - py::class_ config(m, "Config"); - - // Bind LlamaConfig - py::class_ llama_config(m, "LlamaConfig"); - llama_config - .def(py::init<>()) - // TODO: Change this to `dtype` after updating InfiniCore pybind11 exposing mechanism. - .def_readwrite("_dtype", &LlamaConfig::dtype) - .def_readwrite("vocab_size", &LlamaConfig::vocab_size) - .def_readwrite("hidden_size", &LlamaConfig::hidden_size) - .def_readwrite("intermediate_size", &LlamaConfig::intermediate_size) - .def_readwrite("num_hidden_layers", &LlamaConfig::num_hidden_layers) - .def_readwrite("num_attention_heads", &LlamaConfig::num_attention_heads) - .def_readwrite("num_key_value_heads", &LlamaConfig::num_key_value_heads) - .def_readwrite("head_dim", &LlamaConfig::head_dim) - .def_readwrite("max_position_embeddings", &LlamaConfig::max_position_embeddings) - .def_readwrite("rms_norm_eps", &LlamaConfig::rms_norm_eps) - .def_readwrite("hidden_act", &LlamaConfig::hidden_act) - .def_readwrite("model_type", &LlamaConfig::model_type) - .def_readwrite("rope_theta", &LlamaConfig::rope_theta) - .def_readwrite("attention_bias", &LlamaConfig::attention_bias) - .def_readwrite("attention_output_bias", &LlamaConfig::attention_output_bias) - .def_readwrite("mlp_bias", &LlamaConfig::mlp_bias) - .def_readwrite("tie_word_embeddings", &LlamaConfig::tie_word_embeddings) - .def_readwrite("qk_norm", &LlamaConfig::qk_norm) - .def_readwrite("use_cache", &LlamaConfig::use_cache) - .def_readwrite("attention_dropout", &LlamaConfig::attention_dropout) - .def_readwrite("initializer_range", &LlamaConfig::initializer_range) - .def_readwrite("pretraining_tp", &LlamaConfig::pretraining_tp) - .def_readwrite("name_or_path", &LlamaConfig::name_or_path) - .def_readwrite("pad_token_id", &LlamaConfig::pad_token_id) - .def_property( - "bos_token_id", [](const LlamaConfig &self) { - // Always return as list to match Python config format - return py::cast(self.bos_token_id); }, [](LlamaConfig &self, py::object value) { - // Accept both single int and list - if (py::isinstance(value)) { - self.bos_token_id = {value.cast()}; - } else if (py::isinstance(value) || py::isinstance(value)) { - self.bos_token_id = value.cast>(); - } else { - throw py::type_error("bos_token_id must be int or list of ints"); - } }) - .def_property( - "eos_token_id", [](const LlamaConfig &self) { - // Always return as list to match Python config format - return py::cast(self.eos_token_id); }, [](LlamaConfig &self, py::object value) { - // Accept both single int and list - if (py::isinstance(value)) { - self.eos_token_id = {value.cast()}; - } else if (py::isinstance(value) || py::isinstance(value)) { - self.eos_token_id = value.cast>(); - } else { - throw py::type_error("eos_token_id must be int or list of ints"); - } }) - .def_property( - "rope_scaling", - - // ---------- getter ---------- - [](const LlamaConfig &self) -> py::object { - if (!self.rope_scaling) { - return py::none(); - } - - using ScalingConfig = infinicore::nn::RopeScalingConfig; - using LongRopeConfig = infinicore::nn::LongRopeScalingConfig; - - py::dict d; - - if (auto *lr = dynamic_cast(self.rope_scaling.get())) { - d["type"] = "longrope"; - d["rope_type"] = "longrope"; - d["factor"] = lr->factor(); - d["original_max_position_embeddings"] = lr->original_max_position_embeddings(); - d["short_factor"] = lr->short_factor(); - d["long_factor"] = lr->long_factor(); - } else { - throw std::runtime_error("Unknown RoPE scaling type"); - } - - return std::move(d); - }, - - // ---------- setter ---------- - [](LlamaConfig &self, py::object value) { - if (value.is_none()) { - self.rope_scaling.reset(); - return; - } - - if (!py::isinstance(value)) { - throw py::type_error("rope_scaling must be a dict or None"); - } - - py::dict d = value.cast(); - - auto get_str = [&](const char *k) { - if (!d.contains(k)) { - throw py::key_error(k); - } - return py::cast(d[k]); - }; - - std::string type = d.contains("rope_type") - ? py::cast(d["rope_type"]) - : get_str("type"); - - if (type == "longrope") { - using LongRopeConfig = infinicore::nn::LongRopeScalingConfig; - - if (!d.contains("short_factor") || !d.contains("long_factor") || !d.contains("original_max_position_embeddings")) { - throw py::value_error( - "longrope requires short_factor, long_factor, " - "original_max_position_embeddings"); - } - - std::vector short_factor = py::cast>(d["short_factor"]); - std::vector long_factor = py::cast>(d["long_factor"]); - - size_t original_max_position_embeddings = py::cast(d["original_max_position_embeddings"]); - - float factor = 1.0f; - if (d.contains("factor")) { - factor = py::cast(d["factor"]); - } - - self.rope_scaling = std::make_shared( - std::move(short_factor), - std::move(long_factor), - original_max_position_embeddings, - factor); - } else { - throw py::value_error("Unsupported rope_scaling type: " + type); - } - }) - .def("validate", &LlamaConfig::validate) - .def("kv_dim", &LlamaConfig::kv_dim) - // Add __dir__ to make attributes discoverable via dir() in Python - .def("__dir__", [](const LlamaConfig &self) { - py::list dir_list; - dir_list.append("vocab_size"); - dir_list.append("hidden_size"); - dir_list.append("intermediate_size"); - dir_list.append("num_hidden_layers"); - dir_list.append("num_attention_heads"); - dir_list.append("num_key_value_heads"); - dir_list.append("head_dim"); - dir_list.append("max_position_embeddings"); - dir_list.append("rms_norm_eps"); - dir_list.append("hidden_act"); - dir_list.append("model_type"); - dir_list.append("rope_theta"); - dir_list.append("rope_scaling"); - dir_list.append("attention_bias"); - dir_list.append("attention_output_bias"); - dir_list.append("mlp_bias"); - dir_list.append("tie_word_embeddings"); - dir_list.append("qk_norm"); - dir_list.append("use_cache"); - dir_list.append("attention_dropout"); - dir_list.append("initializer_range"); - dir_list.append("pretraining_tp"); - dir_list.append("name_or_path"); - dir_list.append("pad_token_id"); - dir_list.append("bos_token_id"); - dir_list.append("eos_token_id"); - dir_list.append("validate"); - dir_list.append("kv_dim"); - return dir_list; }); - - // Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here -} - -} // namespace infinilm::models::llama_legacy diff --git a/python/infinilm/models/llama/configuration_llama.py b/python/infinilm/models/llama/configuration_llama.py index 15776c848..b41440d07 100644 --- a/python/infinilm/models/llama/configuration_llama.py +++ b/python/infinilm/models/llama/configuration_llama.py @@ -17,12 +17,10 @@ import infinicore -from infinilm.lib import _infinilm - from ...configuration_utils import PretrainedConfig -class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig): +class LlamaConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -140,6 +138,7 @@ class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig): ```""" model_type = "llama" + keys_to_ignore_at_inference = ["past_key_values"] # Default tensor parallel plan for base model `LlamaModel` base_model_tp_plan = { @@ -184,15 +183,7 @@ def __init__( torch_dtype=None, **kwargs, ): - _infinilm.LlamaConfig.__init__(self) - - original_model_type = kwargs.get("model_type", None) - if original_model_type == "qwen3": - self.qk_norm = True - - # --- self.model_type = "llama" - self.name_or_path = "" self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id diff --git a/xmake.lua b/xmake.lua index 2b1b51d37..d9863a523 100644 --- a/xmake.lua +++ b/xmake.lua @@ -18,15 +18,6 @@ if has_config("use-kv-caching") then add_defines("ENABLE_KV_CACHING") end -option("use-classic-llama") - set_default(false) - set_showmenu(true) - set_description("Whether to using the classic LlamaForCausalLM") -option_end() - -if has_config("use-classic-llama") then - add_defines("USE_CLASSIC_LLAMA") -end target("infinicore_infer") set_kind("shared")