From c7c2a442d7ef6b6ce55c2be9cf44426553946883 Mon Sep 17 00:00:00 2001
From: wangpengcheng <wangpengcheng@example.com>
Date: Thu, 11 Jun 2026 02:26:27 +0000
Subject: [PATCH] issue/424 - Clean up unused code.

---
 csrc/cache/kv_cache.cpp                       | 225 +--------
 csrc/cache/kv_cache.hpp                       | 173 +------
 csrc/config/config_factory.cpp                |   9 +-
 csrc/{models => }/debug_utils/hooks.cpp       |   0
 csrc/{models => }/debug_utils/hooks.hpp       |   0
 .../{models => }/debug_utils/tensor_utils.hpp |   0
 csrc/engine/rank_worker.cpp                   |  32 +-
 .../layers/causal_lm_templates/text_model.hpp |   3 -
 .../rotary_embedding_factory.cpp              |   1 -
 .../baichuan/baichuan_for_causal_lm.hpp       |   1 -
 csrc/models/fm9g/fm9g_for_causal_lm.cpp       |   4 -
 csrc/models/fm9g/fm9g_for_causal_lm.hpp       |   1 -
 csrc/models/glm4/glm4_decoder_layer.hpp       |   2 -
 csrc/models/infinilm_model.cpp                |   1 -
 csrc/models/infinilm_model.hpp                |   1 -
 .../internlm3/internlm3_for_causal_lm.hpp     |   1 -
 csrc/models/llama/llama_for_causal_lm.cpp     |   6 +-
 .../llama_legacy/legacy_fused_linear.cpp      | 396 ----------------
 .../llama_legacy/legacy_fused_linear.hpp      | 328 -------------
 csrc/models/llama_legacy/llama.hpp            |  24 -
 csrc/models/llama_legacy/llama_attention.cpp  | 439 ------------------
 csrc/models/llama_legacy/llama_attention.hpp  | 142 ------
 csrc/models/llama_legacy/llama_config.hpp     |  95 ----
 .../llama_legacy/llama_decoder_layer.cpp      |  50 --
 .../llama_legacy/llama_decoder_layer.hpp      |  89 ----
 .../llama_legacy/llama_for_causal_lm.cpp      |  51 --
 .../llama_legacy/llama_for_causal_lm.hpp      |  63 ---
 csrc/models/llama_legacy/llama_mlp.cpp        |  97 ----
 csrc/models/llama_legacy/llama_mlp.hpp        |  81 ----
 csrc/models/llama_legacy/llama_model.cpp      | 125 -----
 csrc/models/llama_legacy/llama_model.hpp      |  98 ----
 csrc/models/model_factory.cpp                 |  21 -
 csrc/models/model_factory.hpp                 |   8 -
 csrc/models/qwen2/qwen2_for_causal_lm.cpp     |   6 +-
 csrc/models/qwen3_moe/qwen3_moe_experts.hpp   |   2 -
 .../qwen3_moe/qwen3_moe_topk_router.hpp       |   2 -
 csrc/pybind11/bindings.cc                     |   5 +-
 csrc/pybind11/engine/engine.hpp               |  32 +-
 csrc/pybind11/models/llama_legacy.hpp         | 216 ---------
 .../models/llama/configuration_llama.py       |  13 +-
 xmake.lua                                     |   9 -
 41 files changed, 66 insertions(+), 2786 deletions(-)
 rename csrc/{models => }/debug_utils/hooks.cpp (100%)
 rename csrc/{models => }/debug_utils/hooks.hpp (100%)
 rename csrc/{models => }/debug_utils/tensor_utils.hpp (100%)
 delete mode 100644 csrc/models/llama_legacy/legacy_fused_linear.cpp
 delete mode 100644 csrc/models/llama_legacy/legacy_fused_linear.hpp
 delete mode 100644 csrc/models/llama_legacy/llama.hpp
 delete mode 100644 csrc/models/llama_legacy/llama_attention.cpp
 delete mode 100644 csrc/models/llama_legacy/llama_attention.hpp
 delete mode 100644 csrc/models/llama_legacy/llama_config.hpp
 delete mode 100644 csrc/models/llama_legacy/llama_decoder_layer.cpp
 delete mode 100644 csrc/models/llama_legacy/llama_decoder_layer.hpp
 delete mode 100644 csrc/models/llama_legacy/llama_for_causal_lm.cpp
 delete mode 100644 csrc/models/llama_legacy/llama_for_causal_lm.hpp
 delete mode 100644 csrc/models/llama_legacy/llama_mlp.cpp
 delete mode 100644 csrc/models/llama_legacy/llama_mlp.hpp
 delete mode 100644 csrc/models/llama_legacy/llama_model.cpp
 delete mode 100644 csrc/models/llama_legacy/llama_model.hpp
 delete mode 100644 csrc/pybind11/models/llama_legacy.hpp
diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp
index df0ceb29d..92734c05f 100644
--- a/csrc/cache/kv_cache.cpp
+++ b/csrc/cache/kv_cache.cpp
@@ -2,8 +2,6 @@
 
 #include "../global_state/global_state.hpp"
 #include "../utils.hpp"
-#include "infinicore/ops.hpp"
-#include <stdexcept>
 
 namespace infinilm::cache {
 // ==========================
@@ -32,58 +30,12 @@ StaticKVCacheConfig::max_cache_len() const {
     return max_cache_len_;
 }
 
+namespace StaticKVCache {
+
 // ==========================
 // StaticKVCache
 // ==========================
-
-StaticKVCache::StaticKVCache(
-    infinicore::Size k_dim,
-    infinicore::Size v_dim,
-    infinicore::Size num_k_heads,
-    infinicore::Size num_v_heads,
-    infinicore::Size num_layers,
-    infinicore::Size max_positional_embedding,
-    infinicore::DataType dtype,
-    const StaticKVCacheConfig &config,
-    const engine::distributed::RankInfo &rank_info)
-    : Cache(),
-      k_dim_(k_dim),
-      v_dim_(v_dim),
-      rank_batch_size_(config.max_batch_size()),
-      cache_len_(config.max_cache_len() == std::numeric_limits<infinicore::Size>::max() || config.max_cache_len() == 0 ? max_positional_embedding : config.max_cache_len()),
-      rank_num_layers_(num_layers),
-      dtype_(dtype) {
-
-    bool is_kv_replica = (num_k_heads < rank_info.tp_size && num_v_heads < rank_info.tp_size && num_k_heads == num_v_heads && rank_info.tp_size % num_k_heads == 0);
-
-    num_rank_k_heads_ = is_kv_replica ? 1 : (num_k_heads / rank_info.tp_size);
-    num_rank_v_heads_ = is_kv_replica ? 1 : (num_v_heads / rank_info.tp_size);
-    // Allocate K cache
-    k_caches_ = infinicore::Tensor::empty(
-        {rank_num_layers_,
-         rank_batch_size_,
-         num_rank_k_heads_,
-         cache_len_,
-         k_dim_},
-        dtype_,
-        rank_info.device);
-    set_zeros(k_caches_);
-
-    // Allocate V cache
-    v_caches_ = infinicore::Tensor::empty(
-        {rank_num_layers_,
-         rank_batch_size_,
-         num_rank_v_heads_,
-         cache_len_,
-         v_dim_},
-        dtype_,
-        rank_info.device);
-    set_zeros(v_caches_);
-
-    infinicore::context::syncStream();
-}
-
-infinicore::Tensor StaticKVCache::create_layer_kv_cache(
+infinicore::Tensor create_layer_kv_cache(
     const infinicore::Size k_dim,
     const infinicore::Size v_dim,
     const infinicore::Size num_k_heads,
@@ -120,45 +72,7 @@ infinicore::Tensor StaticKVCache::create_layer_kv_cache(
 
     return kv_cache;
 }
-
-std::tuple<infinicore::Tensor, infinicore::Tensor>
-StaticKVCache::update(size_t layer_idx,
-                      const infinicore::Tensor &k,
-                      const infinicore::Tensor &v,
-                      const infinicore::Tensor &past_sequence_lengths) {
-    ASSERT(layer_idx < rank_num_layers_);
-
-    auto batch_size = k->size(0);
-    auto update_len = k->size(2);
-
-    ASSERT_EQ(batch_size, rank_batch_size_);
-
-    auto k_cache_layer = k_caches_->narrow({{0, layer_idx, 1}})->squeeze(0);
-    auto v_cache_layer = v_caches_->narrow({{0, layer_idx, 1}})->squeeze(0);
-
-    auto device = k_cache_layer->device();
-
-#ifdef ENABLE_KV_CACHING
-    infinicore::op::kv_caching_(
-        k_cache_layer,
-        v_cache_layer,
-        k,
-        v,
-        past_sequence_lengths);
-#else
-    size_t cache_pos = reinterpret_cast<int32_t *>(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0];
-    auto result_len = cache_pos + update_len;
-    ASSERT(result_len <= cache_len_);
-
-    auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}});
-    auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}});
-
-    k_cache_update->copy_from(k);
-    v_cache_update->copy_from(v);
-#endif
-
-    return {k_cache_layer, v_cache_layer};
-}
+} // namespace StaticKVCache
 
 // ==========================
 // PagedKVCacheConfig
@@ -185,56 +99,11 @@ PagedKVCacheConfig::block_size() const {
     return block_size_;
 }
 
+namespace PagedKVCache {
 // ==========================
 // PagedKVCache
 // ==========================
-PagedKVCache::PagedKVCache(
-    infinicore::Size k_dim,
-    infinicore::Size v_dim,
-    infinicore::Size num_k_heads,
-    infinicore::Size num_v_heads,
-    infinicore::Size num_layers,
-    infinicore::DataType dtype,
-    const PagedKVCacheConfig &config,
-    const engine::distributed::RankInfo &rank_info)
-    : Cache(),
-      k_dim_(k_dim),
-      v_dim_(v_dim),
-      rank_num_layers_(num_layers),
-      dtype_(dtype),
-      num_blocks_per_layer_(config.num_blocks()),
-      block_size_(config.block_size()) {
-
-    bool is_kv_replica = (num_k_heads < rank_info.tp_size && num_v_heads < rank_info.tp_size && num_k_heads == num_v_heads && rank_info.tp_size % num_k_heads == 0);
-
-    num_rank_k_heads_ = is_kv_replica ? 1 : (num_k_heads / rank_info.tp_size);
-    num_rank_v_heads_ = is_kv_replica ? 1 : (num_v_heads / rank_info.tp_size);
-    // [num_layers, num_blocks, num_rank_k_heads, block_size, k_dim]
-    k_caches_ = infinicore::Tensor::empty(
-        {rank_num_layers_,
-         num_blocks_per_layer_,
-         num_rank_k_heads_,
-         block_size_,
-         k_dim_},
-        dtype_,
-        rank_info.device);
-    set_zeros(k_caches_);
-
-    // [num_layers, num_blocks, num_rank_v_heads, block_size, v_dim]
-    v_caches_ = infinicore::Tensor::empty(
-        {rank_num_layers_,
-         num_blocks_per_layer_,
-         num_rank_v_heads_,
-         block_size_,
-         v_dim_},
-        dtype_,
-        rank_info.device);
-    set_zeros(v_caches_);
-
-    infinicore::context::syncStream();
-}
-
-infinicore::Tensor PagedKVCache::create_layer_kv_cache(
+infinicore::Tensor create_layer_kv_cache(
     infinicore::Size k_dim,
     infinicore::Size v_dim,
     infinicore::Size num_k_heads,
@@ -273,86 +142,6 @@ infinicore::Tensor PagedKVCache::create_layer_kv_cache(
 
     return kv_cache;
 }
-
-std::tuple<infinicore::Tensor, infinicore::Tensor> PagedKVCache::update(
-    size_t layer_idx,
-    const infinicore::Tensor &k,
-    const infinicore::Tensor &v,
-    const infinicore::Tensor &slot_mapping) {
-
-    auto &&[k_cache_layer, v_cache_layer] = get_paged_kv(layer_idx);
-
-    infinicore::op::paged_caching_(
-        k_cache_layer,
-        v_cache_layer,
-        k,
-        v,
-        slot_mapping);
-    return {k_cache_layer, v_cache_layer};
-}
-
-std::tuple<infinicore::Tensor, infinicore::Tensor>
-PagedKVCache::get_paged_kv(size_t layer_idx) {
-    auto k_cache_layer = k_caches_->narrow({{0, layer_idx, 1}})->squeeze(0);
-    auto v_cache_layer = v_caches_->narrow({{0, layer_idx, 1}})->squeeze(0);
-    return {k_cache_layer, v_cache_layer};
-}
-
-std::tuple<infinicore::Tensor, infinicore::Tensor>
-PagedKVCache::get_contiguous_kv(
-    size_t layer_idx,
-    const infinicore::Tensor block_tables,
-    const infinicore::Tensor cache_lens,
-    const infinicore::Tensor input_offsets,
-    size_t request_id) {
-    ASSERT_EQ(block_tables->dtype(), infinicore::DataType::I32);
-    ASSERT_EQ(cache_lens->dtype(), infinicore::DataType::I32);
-    ASSERT_EQ(input_offsets->dtype(), infinicore::DataType::I32);
-
-    auto nreq = block_tables->size(0);
-    auto block_tables_cpu = block_tables->to(infinicore::Device::cpu());
-    auto cache_lens_cpu = cache_lens->to(infinicore::Device::cpu());
-    auto input_offsets_cpu = input_offsets->to(infinicore::Device::cpu());
-    infinicore::context::syncDevice();
-
-    // [num_blocks, num_rank_v_heads, block_size, v_dim]
-    auto &&[k_cache_layer, v_cache_layer] = get_paged_kv(layer_idx);
-
-    auto req = request_id;
-    auto cache_lens_ptr = reinterpret_cast<const int32_t *>(cache_lens_cpu->data());
-    auto input_offsets_ptr = reinterpret_cast<const int32_t *>(input_offsets_cpu->data());
-    int32_t total_len = cache_lens_ptr[req] + (input_offsets_ptr[req + 1] - input_offsets_ptr[req]);
-
-    auto full_k = infinicore::Tensor::empty(
-        {num_rank_k_heads_, (size_t)total_len, k_dim_},
-        k_cache_layer->dtype(), k_cache_layer->device());
-
-    auto full_v = infinicore::Tensor::empty(
-        {num_rank_v_heads_, (size_t)total_len, v_dim_},
-        v_cache_layer->dtype(), v_cache_layer->device());
-
-    size_t nblocks = total_len / block_size_;
-    size_t r = total_len % block_size_;
-
-    for (size_t b = 0; b < nblocks; b++) {
-        size_t bid = *((int32_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, b, 1}})->data()));
-
-        full_k->narrow({{1, b * block_size_, block_size_}})
-            ->copy_from(k_cache_layer->narrow({{0, bid, 1}})->squeeze(0));
-        full_v->narrow({{1, b * block_size_, block_size_}})
-            ->copy_from(v_cache_layer->narrow({{0, bid, 1}})->squeeze(0));
-    }
-
-    if (r > 0) {
-        size_t bid = *((int32_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, nblocks, 1}})->data()));
-
-        full_k->narrow({{1, nblocks * block_size_, r}})
-            ->copy_from(k_cache_layer->narrow({{0, bid, 1}})->squeeze(0)->narrow({{1, 0, r}}));
-        full_v->narrow({{1, nblocks * block_size_, r}})
-            ->copy_from(v_cache_layer->narrow({{0, bid, 1}})->squeeze(0)->narrow({{1, 0, r}}));
-    }
-
-    return {full_k, full_v};
-}
+} // namespace PagedKVCache
 
 } // namespace infinilm::cache
diff --git a/csrc/cache/kv_cache.hpp b/csrc/cache/kv_cache.hpp
index e6e640df2..4d0a9a704 100644
--- a/csrc/cache/kv_cache.hpp
+++ b/csrc/cache/kv_cache.hpp
@@ -1,20 +1,10 @@
 #pragma once
 
 #include "base_cache.hpp"
-
-#include "infinicore/context/context.hpp"
-#include "infinicore/device.hpp"
-#include "infinicore/tensor.hpp"
 #include <infinicore/dtype.hpp>
 
-#include <algorithm>
 #include <limits>
 #include <memory>
-#include <numeric>
-#include <stdexcept>
-#include <utility>
-
-#include <spdlog/spdlog.h>
 
 namespace infinilm::cache {
 class StaticKVCacheConfig final : public CacheConfig {
@@ -32,64 +22,18 @@ class StaticKVCacheConfig final : public CacheConfig {
     infinicore::Size max_cache_len_;
 };
 
-class StaticKVCache final : public Cache {
-public:
-    StaticKVCache(
-        infinicore::Size k_dim,
-        infinicore::Size v_dim,
-        infinicore::Size num_k_heads,
-        infinicore::Size num_v_heads,
-        infinicore::Size num_layers,
-        infinicore::Size max_positional_embedding,
-        infinicore::DataType dtype,
-        const StaticKVCacheConfig &config,
-        const engine::distributed::RankInfo &rank_info);
-
-    static infinicore::Tensor create_layer_kv_cache(
-        const infinicore::Size k_dim,
-        const infinicore::Size v_dim,
-        const infinicore::Size num_k_heads,
-        const infinicore::Size num_v_heads,
-        const infinicore::Size max_positional_embedding,
-        const infinicore::DataType dtype,
-        const StaticKVCacheConfig &config);
-
-    /**
-     * @brief Update KV cache at a given layer and cache position.
-     *
-     * @param layer_idx Which transformer layer
-     * @param k         [batch, num_rank_k_heads, seq_len, k_dim]
-     * @param v         [batch, num_rank_v_heads, seq_len, v_dim]
-     * @param cache_pos Sequence position to write
-     *
-     * @return (full_k, full_v)
-     *         full_k: [batch, num_rank_k_heads, cache_pos + seq_len, k_dim]
-     *         full_v: [batch, num_rank_v_heads, cache_pos + seq_len, v_dim]
-     */
-    std::tuple<infinicore::Tensor, infinicore::Tensor>
-    update(size_t layer_idx,
-           const infinicore::Tensor &k,
-           const infinicore::Tensor &v,
-           const infinicore::Tensor &past_sequence_lengths);
+namespace StaticKVCache {
 
-    ~StaticKVCache() override = default;
+infinicore::Tensor create_layer_kv_cache(
+    infinicore::Size k_dim,
+    infinicore::Size v_dim,
+    infinicore::Size num_k_heads,
+    infinicore::Size num_v_heads,
+    infinicore::Size max_positional_embedding,
+    infinicore::DataType dtype,
+    const StaticKVCacheConfig &config);
 
-private:
-    infinicore::Size k_dim_;
-    infinicore::Size v_dim_;
-    infinicore::Size num_rank_k_heads_;
-    infinicore::Size num_rank_v_heads_;
-    infinicore::Size rank_batch_size_;
-    infinicore::Size cache_len_;
-    infinicore::Size rank_num_layers_;
-    infinicore::DataType dtype_;
-
-    // [num_layers, max_batch, num_rank_k_heads, max_cache_len, k_dim]
-    infinicore::Tensor k_caches_;
-
-    // [num_layers, max_batch, num_rank_v_heads, max_cache_len, v_dim]
-    infinicore::Tensor v_caches_;
-};
+} // namespace StaticKVCache
 
 class PagedKVCacheConfig final : public CacheConfig {
 public:
@@ -106,94 +50,15 @@ class PagedKVCacheConfig final : public CacheConfig {
     size_t block_size_;
 };
 
-class PagedKVCache final : public Cache {
-public:
-    PagedKVCache(
-        infinicore::Size k_dim,
-        infinicore::Size v_dim,
-        infinicore::Size num_k_heads,
-        infinicore::Size num_v_heads,
-        infinicore::Size num_layers,
-        infinicore::DataType dtype,
-        const PagedKVCacheConfig &config,
-        const engine::distributed::RankInfo &rank_info);
-
-    static infinicore::Tensor create_layer_kv_cache(
-        infinicore::Size k_dim,
-        infinicore::Size v_dim,
-        infinicore::Size num_k_heads,
-        infinicore::Size num_v_heads,
-        infinicore::DataType dtype,
-        const PagedKVCacheConfig &config);
-
-    /**
-     * @brief Update Paged KV cache at a given layer given slot info for each token.
-     *
-     * @param layer_idx Which paged attention layer
-     * @param k         [num_rank_k_heads, seq_len, k_dim]
-     * @param v         [num_rank_v_heads, seq_len, v_dim]
-     * @param slot_mapping [seq_len]
-     *
-     * @return (full_k, full_v)
-     *         full_k: [num_blocks, num_rank_k_heads, block_size, k_dim]
-     *         full_v: [num_blocks, num_rank_v_heads, block_size, v_dim]
-     */
-    std::tuple<infinicore::Tensor, infinicore::Tensor>
-    update(size_t layer_idx,
-           const infinicore::Tensor &k,
-           const infinicore::Tensor &v,
-           const infinicore::Tensor &slot_mapping);
-
-    /**
-     * @brief Get Paged KV cache at a given layer.
-     *
-     * @param layer_idx Which paged attention layer
-     *
-     * @return (full_k, full_v)
-     *         full_k: [num_blocks, num_rank_k_heads, block_size, k_dim]
-     *         full_v: [num_blocks, num_rank_v_heads, block_size, v_dim]
-     */
-    std::tuple<infinicore::Tensor, infinicore::Tensor>
-    get_paged_kv(size_t layer_idx);
-
-    /**
-     * @brief Get contiguous KV cache at a given layer, given the request info
-     * among a continuous request batch.
-     *
-     * @param layer_idx Which paged attention layer
-     * @param block_tables [num_requests, max_blocks_per_request]
-     * @param cache_lens [num_requests]
-     * @param input_offsets [num_requests + 1]
-     * @param request_id Which request among a continuous batch of requests
-     *
-     * @return (full_k, full_v)
-     *         full_k: [num_rank_k_heads, total_len, k_dim]
-     *         full_v: [num_rank_v_heads, total_len, v_dim]
-     */
-    std::tuple<infinicore::Tensor, infinicore::Tensor>
-    get_contiguous_kv(size_t layer_idx,
-                      const infinicore::Tensor block_tables,
-                      const infinicore::Tensor cache_lens,
-                      const infinicore::Tensor input_offsets,
-                      size_t request_id = 0);
+namespace PagedKVCache {
+infinicore::Tensor create_layer_kv_cache(
+    infinicore::Size k_dim,
+    infinicore::Size v_dim,
+    infinicore::Size num_k_heads,
+    infinicore::Size num_v_heads,
+    infinicore::DataType dtype,
+    const PagedKVCacheConfig &config);
 
-    ~PagedKVCache() override
-        = default;
-
-private:
-    infinicore::Size k_dim_;
-    infinicore::Size v_dim_;
-    infinicore::Size num_rank_k_heads_;
-    infinicore::Size num_rank_v_heads_;
-    infinicore::Size rank_num_layers_;
-    infinicore::DataType dtype_;
-    infinicore::Size block_size_;
-    infinicore::Size num_blocks_per_layer_;
-    // [num_layers, num_blocks, num_rank_k_heads, block_size, k_dim]
-    infinicore::Tensor k_caches_;
-
-    // [num_layers, num_blocks, num_rank_v_heads, block_size, v_dim]
-    infinicore::Tensor v_caches_;
-};
+} // namespace PagedKVCache
 
 } // namespace infinilm::cache
diff --git a/csrc/config/config_factory.cpp b/csrc/config/config_factory.cpp
index 09e21e933..0467f4536 100644
--- a/csrc/config/config_factory.cpp
+++ b/csrc/config/config_factory.cpp
@@ -7,9 +7,6 @@ namespace infinilm::config {
 std::shared_ptr<infinilm::config::ModelConfig> ConfigFactory::createConfig(const std::string &config_str) {
     const nlohmann::json config_json = nlohmann::json::parse(config_str);
     auto model_config = std::make_shared<infinilm::config::ModelConfig>(config_json);
-    if (nullptr == model_config) {
-        throw std::runtime_error("infinilm::config::ConfigFactory::createConfig: model_config is not initialized");
-    }
 
     const std::string model_type = model_config->get<std::string>("model_type");
     const auto &config_map = models::get_model_config_map();
@@ -17,11 +14,7 @@ std::shared_ptr<infinilm::config::ModelConfig> ConfigFactory::createConfig(const
     if (it != config_map.end()) {
         it->second(model_config);
     } else {
-        std::vector<std::string> classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"};
-        const std::string &model_type = model_config->get<std::string>("model_type");
-        if (std::find(classic_models.begin(), classic_models.end(), model_type) == classic_models.end()) {
-            throw std::invalid_argument("infinilm::config::ConfigFactory::createConfig: Unsupported model config type: " + model_type);
-        }
+        throw std::invalid_argument("infinilm::config::ConfigFactory::createConfig: Unsupported model config type: " + model_type);
     }
 
     return model_config;
diff --git a/csrc/models/debug_utils/hooks.cpp b/csrc/debug_utils/hooks.cpp
similarity index 100%
rename from csrc/models/debug_utils/hooks.cpp
rename to csrc/debug_utils/hooks.cpp
diff --git a/csrc/models/debug_utils/hooks.hpp b/csrc/debug_utils/hooks.hpp
similarity index 100%
rename from csrc/models/debug_utils/hooks.hpp
rename to csrc/debug_utils/hooks.hpp
diff --git a/csrc/models/debug_utils/tensor_utils.hpp b/csrc/debug_utils/tensor_utils.hpp
similarity index 100%
rename from csrc/models/debug_utils/tensor_utils.hpp
rename to csrc/debug_utils/tensor_utils.hpp
diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp
index 87568fd6a..1fa34e126 100644
--- a/csrc/engine/rank_worker.cpp
+++ b/csrc/engine/rank_worker.cpp
@@ -1,10 +1,6 @@
 #include "rank_worker.hpp"
-
-#include "../global_state/global_state.hpp"
 #include "../models/model_factory.hpp"
-#include "../models/models_registry.hpp"
 #include "infinicore/ops.hpp"
-#include <iostream>
 #include <spdlog/spdlog.h>
 #include <stdexcept>
 
@@ -254,30 +250,10 @@ void RankWorker::thread_loop() {
             infinilm::global_state::initialize_infinilm_config(infinilm_config_);
 
             // Create model using factory (may be expensive)
-            const std::string &model_type = model_config_->get<std::string>("model_type");
-            const auto &model_map = models::get_causal_lm_model_map();
-            auto it = model_map.find(model_type);
-            if (it != model_map.end()) {
-                model_ = InfinilmModelFactory::createModel(
-                    model_config_,
-                    rank_info_.device,
-                    pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr);
-            } else {
-                std::vector<std::string> classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"};
-                if ((std::find(classic_models.begin(), classic_models.end(), model_type) != classic_models.end())) {
-                    model_ = InfinilmModelFactory::createModel(
-                        model_config_,
-                        rank_info_,
-                        pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr,
-                        attention_backend_);
-                } else {
-                    throw std::runtime_error("RankWorker::thread_loop(): Unsupported model config type: " + model_type);
-                }
-            }
-
-            if (!model_) {
-                throw std::runtime_error("Failed to create model");
-            }
+            model_ = InfinilmModelFactory::createModel(
+                model_config_,
+                rank_info_.device,
+                pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr);
             if (enable_graph_compiling_) {
                 compiler_ = std::make_unique<GeneralCompiler>(model_, barrier_);
             }
diff --git a/csrc/layers/causal_lm_templates/text_model.hpp b/csrc/layers/causal_lm_templates/text_model.hpp
index 62a52798b..143215997 100644
--- a/csrc/layers/causal_lm_templates/text_model.hpp
+++ b/csrc/layers/causal_lm_templates/text_model.hpp
@@ -6,7 +6,6 @@
 #include "infinicore/nn/rmsnorm.hpp"
 #include "infinicore/tensor.hpp"
 #include <memory>
-#include <vector>
 
 namespace infinilm::layers::causal_lm_templates {
 
@@ -28,9 +27,7 @@ class TextModel : public infinicore::nn::Module {
         const auto &dtype{model_config->get_dtype()};
         size_t vocab_size = model_config->get<size_t>("vocab_size");
         size_t hidden_size = model_config->get<size_t>("hidden_size");
-        size_t max_position_embeddings = model_config->get<size_t>("max_position_embeddings");
         size_t num_hidden_layers = model_config->get<size_t>("num_hidden_layers");
-        double rope_theta = model_config->get<double>("rope_theta");
         double rms_norm_eps = model_config->get<double>("rms_norm_eps");
 
         embed_tokens_ = this->register_module<infinicore::nn::Embedding>("embed_tokens", vocab_size, hidden_size, std::nullopt, dtype, device);
diff --git a/csrc/layers/rotary_embedding/rotary_embedding_factory.cpp b/csrc/layers/rotary_embedding/rotary_embedding_factory.cpp
index 4866de9db..5f44a372e 100644
--- a/csrc/layers/rotary_embedding/rotary_embedding_factory.cpp
+++ b/csrc/layers/rotary_embedding/rotary_embedding_factory.cpp
@@ -1,6 +1,5 @@
 #include "rotary_embedding_factory.hpp"
 #include "../../config/model_config.hpp"
-#include <stdexcept>
 
 namespace infinilm::layers::rotary_embedding {
 
diff --git a/csrc/models/baichuan/baichuan_for_causal_lm.hpp b/csrc/models/baichuan/baichuan_for_causal_lm.hpp
index 0e17bb85a..752d7008d 100644
--- a/csrc/models/baichuan/baichuan_for_causal_lm.hpp
+++ b/csrc/models/baichuan/baichuan_for_causal_lm.hpp
@@ -1,6 +1,5 @@
 #pragma once
 
-#include "../../layers/common_modules.hpp"
 #include "../llama/llama_for_causal_lm.hpp"
 #include <memory>
 
diff --git a/csrc/models/fm9g/fm9g_for_causal_lm.cpp b/csrc/models/fm9g/fm9g_for_causal_lm.cpp
index 25be3762c..1c2748e90 100644
--- a/csrc/models/fm9g/fm9g_for_causal_lm.cpp
+++ b/csrc/models/fm9g/fm9g_for_causal_lm.cpp
@@ -16,8 +16,6 @@ std::shared_ptr<infinilm::config::ModelConfig> create_fm9g_model_config(std::sha
 
 namespace {
 
-#ifndef USE_CLASSIC_LLAMA
-
 INFINILM_REGISTER_CAUSAL_LM_MODEL(
     fm9g,
     infinilm::models::fm9g::FM9GForCausalLM,
@@ -33,6 +31,4 @@ INFINILM_REGISTER_CAUSAL_LM_MODEL(
     infinilm::models::fm9g::FM9GForCausalLM,
     infinilm::models::fm9g::create_fm9g_model_config);
 
-#endif
-
 } // namespace
diff --git a/csrc/models/fm9g/fm9g_for_causal_lm.hpp b/csrc/models/fm9g/fm9g_for_causal_lm.hpp
index 49e65efaf..1b348315a 100644
--- a/csrc/models/fm9g/fm9g_for_causal_lm.hpp
+++ b/csrc/models/fm9g/fm9g_for_causal_lm.hpp
@@ -1,7 +1,6 @@
 #pragma once
 
 #include "../../layers/common_modules.hpp"
-#include "infinicore/nn/linear.hpp"
 #include <cmath>
 #include <memory>
 
diff --git a/csrc/models/glm4/glm4_decoder_layer.hpp b/csrc/models/glm4/glm4_decoder_layer.hpp
index ddfebfcdc..236b174e5 100644
--- a/csrc/models/glm4/glm4_decoder_layer.hpp
+++ b/csrc/models/glm4/glm4_decoder_layer.hpp
@@ -1,8 +1,6 @@
 #pragma once
 
-#include "../../backends/attention_backends.hpp"
 #include "../../config/model_config.hpp"
-#include "../../engine/distributed/distributed.hpp"
 #include "../../layers/common_modules.hpp"
 #include "infinicore/nn/module.hpp"
 #include "infinicore/nn/rmsnorm.hpp"
diff --git a/csrc/models/infinilm_model.cpp b/csrc/models/infinilm_model.cpp
index 3923474ed..3c7c4f351 100644
--- a/csrc/models/infinilm_model.cpp
+++ b/csrc/models/infinilm_model.cpp
@@ -1,5 +1,4 @@
 #include "infinilm_model.hpp"
-#include "../backends/attention_backends.hpp"
 #include "../cache/kv_cache.hpp"
 #include "../global_state/global_state.hpp"
 #include "../layers/attention/attention.hpp"
diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp
index 5cabcef23..06b2ca7af 100644
--- a/csrc/models/infinilm_model.hpp
+++ b/csrc/models/infinilm_model.hpp
@@ -3,7 +3,6 @@
 #include "../backends/attention_backends.hpp"
 #include "../cache/cache.hpp"
 #include "../config/model_config.hpp"
-#include "../layers/linear/linear.hpp"
 #include "infinicore/nn/module.hpp"
 #include "infinicore/tensor.hpp"
 
diff --git a/csrc/models/internlm3/internlm3_for_causal_lm.hpp b/csrc/models/internlm3/internlm3_for_causal_lm.hpp
index 5ad014fb5..ef68e010a 100644
--- a/csrc/models/internlm3/internlm3_for_causal_lm.hpp
+++ b/csrc/models/internlm3/internlm3_for_causal_lm.hpp
@@ -1,6 +1,5 @@
 #pragma once
 
-#include "../../layers/common_modules.hpp"
 #include "../llama/llama_for_causal_lm.hpp"
 #include <memory>
 
diff --git a/csrc/models/llama/llama_for_causal_lm.cpp b/csrc/models/llama/llama_for_causal_lm.cpp
index fd3438bde..37ce411b7 100644
--- a/csrc/models/llama/llama_for_causal_lm.cpp
+++ b/csrc/models/llama/llama_for_causal_lm.cpp
@@ -14,7 +14,7 @@ std::shared_ptr<infinilm::config::ModelConfig> create_llama_model_config(std::sh
 
     if (!config_json.contains("head_dim")) {
         config_json["head_dim"] = model_config->get<size_t>("hidden_size")
-            / model_config->get<size_t>("num_attention_heads");
+                                / model_config->get<size_t>("num_attention_heads");
     }
 
     if (!config_json.contains("attention_bias")) {
@@ -28,13 +28,9 @@ std::shared_ptr<infinilm::config::ModelConfig> create_llama_model_config(std::sh
 
 namespace {
 
-#ifndef USE_CLASSIC_LLAMA
-
 INFINILM_REGISTER_CAUSAL_LM_MODEL(
     llama,
     infinilm::models::llama::LlamaForCausalLM,
     infinilm::models::llama::create_llama_model_config);
 
-#endif
-
 } // namespace
diff --git a/csrc/models/llama_legacy/legacy_fused_linear.cpp b/csrc/models/llama_legacy/legacy_fused_linear.cpp
deleted file mode 100644
index 3736cdf03..000000000
--- a/csrc/models/llama_legacy/legacy_fused_linear.cpp
+++ /dev/null
@@ -1,396 +0,0 @@
-/**
- * @deprecated Legacy fused linear implementations based on InfiniCore.
- * Removal target: v0.2.0 (Q2 2026)
- */
-
-#include "legacy_fused_linear.hpp"
-
-#include <spdlog/spdlog.h>
-
-namespace infinilm::layers::linear {
-// DEPRECATED BEGIN
-// ---------------------------------------------------------
-// LegacyQKVParallelLinear
-// ---------------------------------------------------------
-LegacyQKVParallelLinear::LegacyQKVParallelLinear(size_t hidden_size,
-                                                 size_t head_dim,
-                                                 size_t num_q_head,
-                                                 size_t num_kv_head,
-                                                 bool bias,
-                                                 const infinicore::DataType &dtype,
-                                                 const infinicore::Device &device,
-                                                 engine::distributed::RankInfo rank_info)
-    : LegacyQKVParallelLinear(hidden_size,
-                              head_dim, head_dim, head_dim,
-                              num_q_head, num_kv_head, num_kv_head,
-                              bias, bias, bias,
-                              dtype, device, rank_info) {}
-
-LegacyQKVParallelLinear::LegacyQKVParallelLinear(size_t hidden_size,
-                                                 size_t q_dim, size_t k_dim, size_t v_dim,
-                                                 size_t num_q_head, size_t num_k_head, size_t num_v_head,
-                                                 bool q_bias, bool k_bias, bool v_bias,
-                                                 const infinicore::DataType &dtype,
-                                                 const infinicore::Device &device,
-                                                 engine::distributed::RankInfo rank_info)
-    : infinicore::nn::ColumnParallelLinear(
-          hidden_size,
-          num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim,
-          (q_bias || k_bias || v_bias),
-          dtype,
-          device,
-          rank_info.tp_rank,
-          rank_info.tp_size),
-      q_dim_(q_dim),
-      k_dim_(k_dim),
-      v_dim_(v_dim),
-      num_q_head_(num_q_head),
-      num_k_head_(num_k_head),
-      num_v_head_(num_v_head),
-      q_bias_(q_bias),
-      k_bias_(k_bias),
-      v_bias_(v_bias) {
-    if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) {
-        throw std::runtime_error("LegacyQKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size");
-    }
-
-    if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) {
-        throw std::runtime_error("q_bias, k_bias, v_bias must all match");
-    }
-
-    q_out_size_ = num_q_head_ * q_dim_ / tp_size_;
-    k_out_size_ = num_k_head_ * k_dim_ / tp_size_;
-    v_out_size_ = num_v_head_ * v_dim_ / tp_size_;
-}
-
-LegacyQKVParallelLinear::LegacyQKVParallelLinear(size_t hidden_size,
-                                                 size_t head_dim,
-                                                 size_t num_q_head,
-                                                 size_t num_kv_head,
-                                                 std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
-                                                 bool bias,
-                                                 const infinicore::DataType &dtype,
-                                                 const infinicore::Device &device,
-                                                 engine::distributed::RankInfo rank_info)
-    : LegacyQKVParallelLinear(hidden_size,
-                              head_dim, head_dim, head_dim,
-                              num_q_head, num_kv_head, num_kv_head,
-                              bias, bias, bias,
-                              quantization,
-                              dtype, device, rank_info) {}
-
-LegacyQKVParallelLinear::LegacyQKVParallelLinear(size_t hidden_size,
-                                                 size_t q_dim, size_t k_dim, size_t v_dim,
-                                                 size_t num_q_head, size_t num_k_head, size_t num_v_head,
-                                                 bool q_bias, bool k_bias, bool v_bias,
-                                                 std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
-                                                 const infinicore::DataType &dtype,
-                                                 const infinicore::Device &device,
-                                                 engine::distributed::RankInfo rank_info)
-    : infinicore::nn::ColumnParallelLinear(
-          hidden_size,
-          calculate_out_feature_size(num_q_head, q_dim, num_k_head, k_dim, num_v_head, v_dim, rank_info),
-          quantization,
-          (q_bias || k_bias || v_bias),
-          dtype,
-          device,
-          rank_info.tp_rank,
-          rank_info.tp_size),
-      q_dim_(q_dim),
-      k_dim_(k_dim),
-      v_dim_(v_dim),
-      num_q_head_(num_q_head),
-      num_k_head_(num_k_head),
-      num_v_head_(num_v_head),
-      q_bias_(q_bias),
-      k_bias_(k_bias),
-      v_bias_(v_bias),
-      num_kv_head_replicas_(calculate_kv_replicas(num_k_head, rank_info.tp_size)) {
-
-    if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) {
-        throw std::runtime_error("q_bias, k_bias, v_bias must all match");
-    }
-
-    q_out_size_ = num_q_head_ * q_dim_ / tp_size_;
-    k_out_size_ = num_kv_head_replicas_ * num_k_head_ * k_dim_ / tp_size_;
-    v_out_size_ = num_kv_head_replicas_ * num_v_head_ * v_dim_ / tp_size_;
-}
-
-std::tuple<infinicore::Tensor, infinicore::Tensor, infinicore::Tensor>
-LegacyQKVParallelLinear::forward_split(infinicore::Tensor &input) {
-    auto output = this->forward(input);
-
-    auto q_out = output->narrow({{2, 0, q_out_size_}});
-    auto k_out = output->narrow({{2, q_out_size_, k_out_size_}});
-    auto v_out = output->narrow({{2, q_out_size_ + k_out_size_, v_out_size_}});
-
-    return std::make_tuple(q_out, k_out, v_out);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight() const {
-    return infinicore::nn::Parameter(
-        weight_->narrow({{0, 0, q_out_size_}}),
-        0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight() const {
-    return infinicore::nn::Parameter(
-        weight_->narrow({{0, q_out_size_, k_out_size_}}),
-        0, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight() const {
-    return infinicore::nn::Parameter(
-        weight_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}),
-        0, tp_rank_, tp_size_, num_v_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_scale() const {
-    return infinicore::nn::Parameter(
-        weight_scale_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_scale() const {
-    return infinicore::nn::Parameter(
-        weight_scale_->narrow({{0, q_out_size_, k_out_size_}}),
-        0, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_scale() const {
-    return infinicore::nn::Parameter(
-        weight_scale_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}),
-        0, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_awq(int scaling_factor) const {
-    return infinicore::nn::Parameter(
-        weight_->narrow({{1, 0, q_out_size_ / scaling_factor}}),
-        1, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_awq(int scaling_factor) const {
-    return infinicore::nn::Parameter(
-        weight_->narrow({{1, q_out_size_ / scaling_factor, k_out_size_ / scaling_factor}}),
-        1, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_awq(int scaling_factor) const {
-    return infinicore::nn::Parameter(
-        weight_->narrow({{1, (q_out_size_ + k_out_size_) / scaling_factor, v_out_size_ / scaling_factor}}),
-        1, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_scale_awq(int scaling_factor) const {
-    return infinicore::nn::Parameter(
-        weight_scale_->narrow({{1, 0, q_out_size_ / scaling_factor}}), 1, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_scale_awq(int scaling_factor) const {
-    return infinicore::nn::Parameter(
-        weight_scale_->narrow({{1, q_out_size_ / scaling_factor, k_out_size_ / scaling_factor}}),
-        1, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_scale_awq(int scaling_factor) const {
-    return infinicore::nn::Parameter(
-        weight_scale_->narrow({{1, (q_out_size_ + k_out_size_) / scaling_factor, v_out_size_ / scaling_factor}}),
-        1, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_zeros_awq(int scaling_factor) const {
-    return infinicore::nn::Parameter(
-        weight_zeros_->narrow({{1, 0, q_out_size_ / scaling_factor}}), 1, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_zeros_awq(int scaling_factor) const {
-    return infinicore::nn::Parameter(
-        weight_zeros_->narrow({{1, q_out_size_ / scaling_factor, k_out_size_ / scaling_factor}}),
-        1, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_zeros_awq(int scaling_factor) const {
-    return infinicore::nn::Parameter(
-        weight_zeros_->narrow({{1, (q_out_size_ + k_out_size_) / scaling_factor, v_out_size_ / scaling_factor}}),
-        1, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_weight_zeros() const {
-    return infinicore::nn::Parameter(
-        weight_zeros_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_weight_zeros() const {
-    return infinicore::nn::Parameter(
-        weight_zeros_->narrow({{0, q_out_size_, k_out_size_}}),
-        0, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_weight_zeros() const {
-    return infinicore::nn::Parameter(
-        weight_zeros_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}),
-        0, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_bias() const {
-    if (!q_bias_) {
-        return infinicore::nn::Parameter();
-    }
-    return infinicore::nn::Parameter(
-        bias_->narrow({{0, 0, q_out_size_}}),
-        0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_bias() const {
-    if (!k_bias_) {
-        return infinicore::nn::Parameter();
-    }
-    return infinicore::nn::Parameter(
-        bias_->narrow({{0, q_out_size_, k_out_size_}}),
-        0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_bias() const {
-    if (!v_bias_) {
-        return infinicore::nn::Parameter();
-    }
-    return infinicore::nn::Parameter(
-        bias_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}),
-        0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_q_g_idx_gptq() const {
-    return infinicore::nn::Parameter(gidx_->narrow({{0, 0, in_features_ / tp_size_}}), 0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_k_g_idx_gptq() const {
-    return infinicore::nn::Parameter(gidx_->narrow({{0, 0, in_features_ / tp_size_}}), 0, tp_rank_, tp_size_, num_k_head_);
-}
-
-infinicore::nn::Parameter LegacyQKVParallelLinear::get_v_g_idx_gptq() const {
-    return infinicore::nn::Parameter(gidx_->narrow({{0, 0, in_features_ / tp_size_}}), 0, tp_rank_, tp_size_, num_k_head_);
-}
-
-bool LegacyQKVParallelLinear::has_q_bias() const { return q_bias_; }
-bool LegacyQKVParallelLinear::has_k_bias() const { return k_bias_; }
-bool LegacyQKVParallelLinear::has_v_bias() const { return v_bias_; }
-
-// ---------------------------------------------------------
-// LegacyGateUpParallelLinear
-// ---------------------------------------------------------
-LegacyGateUpParallelLinear::LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias,
-                                                       const infinicore::DataType &dtype, const infinicore::Device &device,
-                                                       engine::distributed::RankInfo rank_info)
-    : LegacyGateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info) {
-}
-
-LegacyGateUpParallelLinear::LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
-                                                       const infinicore::DataType &dtype, const infinicore::Device &device,
-                                                       engine::distributed::RankInfo rank_info)
-    : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) {
-    if (gate_bias_ != up_bias_) {
-        throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time");
-    }
-}
-
-LegacyGateUpParallelLinear::LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
-                                                       const infinicore::DataType &dtype, const infinicore::Device &device,
-                                                       engine::distributed::RankInfo rank_info)
-    : LegacyGateUpParallelLinear(hidden_size, intermediate_size, bias, bias, quantization, dtype, device, rank_info) {
-}
-
-LegacyGateUpParallelLinear::LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
-                                                       std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
-                                                       const infinicore::DataType &dtype, const infinicore::Device &device,
-                                                       engine::distributed::RankInfo rank_info)
-    : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quantization, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) {
-    if (gate_bias_ != up_bias_) {
-        throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time");
-    }
-}
-
-std::tuple<infinicore::Tensor, infinicore::Tensor> LegacyGateUpParallelLinear::forward_split(infinicore::Tensor &input) {
-    auto output = this->forward(input);
-    auto cols = output->shape()[2];
-    auto gate_output = output->narrow({{2, 0, cols / 2}});
-    auto up_output = output->narrow({{2, cols / 2, cols / 2}});
-    return std::make_tuple(gate_output, up_output);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight() const {
-    return infinicore::nn::Parameter(weight_->narrow({{0, 0, weight_->size(0) / 2}}), 0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_bias() const {
-    if (!gate_bias_) {
-        return infinicore::nn::Parameter();
-    } else {
-        return infinicore::nn::Parameter(bias_->narrow({{0, 0, bias_->size(0) / 2}}), 0, tp_rank_, tp_size_);
-    }
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight() const {
-    return infinicore::nn::Parameter(weight_->narrow({{0, weight_->size(0) / 2, weight_->size(0) / 2}}), 0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_bias() const {
-    if (!up_bias_) {
-        return infinicore::nn::Parameter();
-    } else {
-        return infinicore::nn::Parameter(bias_->narrow({{0, bias_->size(0) / 2, bias_->size(0) / 2}}),
-                                         0, tp_rank_, tp_size_);
-    }
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_scale() const {
-    return infinicore::nn::Parameter(weight_scale_->narrow({{0, 0, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_scale() const {
-    return infinicore::nn::Parameter(weight_scale_->narrow({{0, weight_scale_->size(0) / 2, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_zeros() const {
-    return infinicore::nn::Parameter(weight_zeros_->narrow({{0, 0, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_zeros() const {
-    return infinicore::nn::Parameter(weight_zeros_->narrow({{0, weight_zeros_->size(0) / 2, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_);
-}
-
-bool LegacyGateUpParallelLinear::has_gate_bias() const { return gate_bias_; }
-bool LegacyGateUpParallelLinear::has_up_bias() const { return up_bias_; }
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_awq() const {
-    return infinicore::nn::Parameter(weight_->narrow({{1, 0, weight_->size(1) / 2}}), 1, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_awq() const {
-    return infinicore::nn::Parameter(weight_->narrow({{1, weight_->size(1) / 2, weight_->size(1) / 2}}), 1, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_scale_awq() const {
-    return infinicore::nn::Parameter(weight_scale_->narrow({{1, 0, weight_scale_->size(1) / 2}}), 1, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_scale_awq() const {
-    return infinicore::nn::Parameter(weight_scale_->narrow({{1, weight_scale_->size(1) / 2, weight_scale_->size(1) / 2}}), 1, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_weight_zeros_awq() const {
-    return infinicore::nn::Parameter(weight_zeros_->narrow({{1, 0, weight_zeros_->size(1) / 2}}), 1, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_weight_zeros_awq() const {
-    return infinicore::nn::Parameter(weight_zeros_->narrow({{1, weight_zeros_->size(1) / 2, weight_zeros_->size(1) / 2}}), 1, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_gate_g_idx_gptq() const {
-    return infinicore::nn::Parameter(gidx_->narrow({{0, 0, gidx_->size(0)}}), 0, tp_rank_, tp_size_);
-}
-
-infinicore::nn::Parameter LegacyGateUpParallelLinear::get_up_g_idx_gptq() const {
-    return infinicore::nn::Parameter(gidx_->narrow({{0, 0, gidx_->size(0)}}), 0, tp_rank_, tp_size_);
-}
-// DEPRECATED END
-
-} // namespace infinilm::layers::linear
diff --git a/csrc/models/llama_legacy/legacy_fused_linear.hpp b/csrc/models/llama_legacy/legacy_fused_linear.hpp
deleted file mode 100644
index 42c90b73a..000000000
--- a/csrc/models/llama_legacy/legacy_fused_linear.hpp
+++ /dev/null
@@ -1,328 +0,0 @@
-#pragma once
-
-/**
- * @deprecated Legacy fused linear classes based on InfiniCore.
- *
- * These classes inherit from infinicore::nn::ColumnParallelLinear and use the
- * infinicore::quantization namespace. They exist solely for backward
- * compatibility with the deprecated LlamaConfig-based constructors.
- *
- * Removal target: v0.2.0 (Q2 2026)
- */
-
-#include "../../config/model_config.hpp"
-#include "../../engine/distributed/communication_group.hpp"
-#include "infinicore/nn/linear.hpp"
-#include "infinicore/quantization.hpp"
-#include <iostream>
-
-namespace infinilm::layers::linear {
-
-// DEPRECATED BEGIN
-
-/**
- * Convert infinilm::quantization::BaseQuantization to infinicore::quantization::BaseQuantization.
- * Needed because model_config now returns infinilm types but legacy classes use infinicore types.
- */
-inline std::shared_ptr<infinicore::quantization::BaseQuantization>
-to_legacy_quant(const std::shared_ptr<infinilm::quantization::BaseQuantization> &quant) {
-    if (!quant) {
-        return std::make_shared<infinicore::quantization::NoneQuantization>(nlohmann::json{});
-    }
-    switch (quant->get_quant_scheme()) {
-    case infinilm::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8:
-        return std::make_shared<infinicore::quantization::CompressedTensors>(quant->get_config());
-    case infinilm::quantization::QuantScheme::AWQ_W4A16:
-        return std::make_shared<infinicore::quantization::AWQ>(quant->get_config());
-    case infinilm::quantization::QuantScheme::GPTQ_W4A16_QY:
-        return std::make_shared<infinicore::quantization::GPTQ_QY>(quant->get_config());
-    case infinilm::quantization::QuantScheme::GPTQ_W4A16:
-        return std::make_shared<infinicore::quantization::GPTQ>(quant->get_config());
-    default:
-        return std::make_shared<infinicore::quantization::NoneQuantization>(quant->get_config());
-    }
-}
-
-inline infinicore::quantization::QuantScheme
-to_legacy_quant_scheme(infinilm::quantization::QuantScheme scheme) {
-    return static_cast<infinicore::quantization::QuantScheme>(static_cast<int>(scheme));
-}
-
-inline infinicore::quantization::KVQuantAlgo
-to_legacy_kv_quant_algo(infinilm::quantization::KVQuantAlgo algo) {
-    return static_cast<infinicore::quantization::KVQuantAlgo>(static_cast<int>(algo));
-}
-
-class LegacyQKVParallelLinear : public infinicore::nn::ColumnParallelLinear {
-public:
-    explicit LegacyQKVParallelLinear(size_t hidden_size,
-                                     size_t q_dim, size_t k_dim, size_t v_dim,
-                                     size_t num_q_head, size_t num_k_head, size_t num_v_head,
-                                     bool q_bias, bool k_bias, bool v_bias,
-                                     const infinicore::DataType &dtype = infinicore::DataType::F32,
-                                     const infinicore::Device &device = infinicore::Device(),
-                                     engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    explicit LegacyQKVParallelLinear(size_t hidden_size,
-                                     size_t head_dim,
-                                     size_t num_q_head, size_t num_kv_head,
-                                     bool bias = false,
-                                     const infinicore::DataType &dtype = infinicore::DataType::F32,
-                                     const infinicore::Device &device = infinicore::Device(),
-                                     engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    explicit LegacyQKVParallelLinear(size_t hidden_size,
-                                     size_t q_dim, size_t k_dim, size_t v_dim,
-                                     size_t num_q_head, size_t num_k_head, size_t num_v_head,
-                                     bool q_bias, bool k_bias, bool v_bias,
-                                     std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
-                                     const infinicore::DataType &dtype = infinicore::DataType::F32,
-                                     const infinicore::Device &device = infinicore::Device(),
-                                     engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    explicit LegacyQKVParallelLinear(size_t hidden_size,
-                                     size_t head_dim,
-                                     size_t num_q_head, size_t num_kv_head,
-                                     std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
-                                     bool bias = false,
-                                     const infinicore::DataType &dtype = infinicore::DataType::F32,
-                                     const infinicore::Device &device = infinicore::Device(),
-                                     engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    std::tuple<infinicore::Tensor, infinicore::Tensor, infinicore::Tensor>
-    forward_split(infinicore::Tensor &input);
-
-    infinicore::nn::Parameter get_q_weight() const;
-    infinicore::nn::Parameter get_k_weight() const;
-    infinicore::nn::Parameter get_v_weight() const;
-    infinicore::nn::Parameter get_q_weight_scale() const;
-    infinicore::nn::Parameter get_k_weight_scale() const;
-    infinicore::nn::Parameter get_v_weight_scale() const;
-    infinicore::nn::Parameter get_q_weight_zeros() const;
-    infinicore::nn::Parameter get_k_weight_zeros() const;
-    infinicore::nn::Parameter get_v_weight_zeros() const;
-
-    infinicore::nn::Parameter get_q_weight_awq(int scaling_factor) const;
-    infinicore::nn::Parameter get_k_weight_awq(int scaling_factor) const;
-    infinicore::nn::Parameter get_v_weight_awq(int scaling_factor) const;
-    infinicore::nn::Parameter get_q_weight_scale_awq(int scaling_factor) const;
-    infinicore::nn::Parameter get_k_weight_scale_awq(int scaling_factor) const;
-    infinicore::nn::Parameter get_v_weight_scale_awq(int scaling_factor) const;
-    infinicore::nn::Parameter get_q_weight_zeros_awq(int scaling_factor) const;
-    infinicore::nn::Parameter get_k_weight_zeros_awq(int scaling_factor) const;
-    infinicore::nn::Parameter get_v_weight_zeros_awq(int scaling_factor) const;
-
-    infinicore::nn::Parameter get_q_bias() const;
-    infinicore::nn::Parameter get_k_bias() const;
-    infinicore::nn::Parameter get_v_bias() const;
-
-    infinicore::nn::Parameter get_q_g_idx_gptq() const;
-    infinicore::nn::Parameter get_k_g_idx_gptq() const;
-    infinicore::nn::Parameter get_v_g_idx_gptq() const;
-
-    bool has_q_bias() const;
-    bool has_k_bias() const;
-    bool has_v_bias() const;
-
-private:
-    static size_t calculate_kv_replicas(size_t num_k_head, size_t tp_size) {
-        if (num_k_head % tp_size == 0) {
-            return 1;
-        }
-        if (tp_size % num_k_head == 0) {
-            return (tp_size + num_k_head - 1) / num_k_head;
-        }
-        throw std::runtime_error("Invalid KV head configuration");
-    }
-
-    static size_t
-    calculate_out_feature_size(size_t num_q_head, size_t q_dim, size_t num_k_head, size_t k_dim, size_t num_v_head, size_t v_dim, engine::distributed::RankInfo rank_info) {
-        return num_q_head * q_dim + num_k_head * k_dim * calculate_kv_replicas(num_k_head, rank_info.tp_size) + num_v_head * v_dim * calculate_kv_replicas(num_v_head, rank_info.tp_size);
-    }
-
-private:
-    size_t q_dim_;
-    size_t k_dim_;
-    size_t v_dim_;
-    size_t num_q_head_;
-    size_t num_k_head_;
-    size_t num_v_head_;
-    bool q_bias_;
-    bool k_bias_;
-    bool v_bias_;
-    size_t q_out_size_;
-    size_t k_out_size_;
-    size_t v_out_size_;
-    size_t num_kv_head_replicas_ = 1;
-};
-
-class LegacyGateUpParallelLinear : public infinicore::nn::ColumnParallelLinear {
-public:
-    LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false,
-                               const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(),
-                               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
-                               const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(),
-                               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
-                               bool bias = false,
-                               const infinicore::DataType &dtype = infinicore::DataType::F32,
-                               const infinicore::Device &device = infinicore::Device(),
-                               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    LegacyGateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
-                               std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
-                               const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(),
-                               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    std::tuple<infinicore::Tensor, infinicore::Tensor> forward_split(infinicore::Tensor &input);
-
-    infinicore::nn::Parameter get_gate_weight() const;
-    infinicore::nn::Parameter get_gate_bias() const;
-    infinicore::nn::Parameter get_up_weight() const;
-    infinicore::nn::Parameter get_up_bias() const;
-    infinicore::nn::Parameter get_gate_weight_scale() const;
-    infinicore::nn::Parameter get_up_weight_scale() const;
-    infinicore::nn::Parameter get_gate_weight_zeros() const;
-    infinicore::nn::Parameter get_up_weight_zeros() const;
-    infinicore::nn::Parameter get_gate_weight_awq() const;
-    infinicore::nn::Parameter get_up_weight_awq() const;
-    infinicore::nn::Parameter get_gate_weight_scale_awq() const;
-    infinicore::nn::Parameter get_up_weight_scale_awq() const;
-    infinicore::nn::Parameter get_gate_weight_zeros_awq() const;
-    infinicore::nn::Parameter get_up_weight_zeros_awq() const;
-    infinicore::nn::Parameter get_gate_g_idx_gptq() const;
-    infinicore::nn::Parameter get_up_g_idx_gptq() const;
-
-    bool has_gate_bias() const;
-    bool has_up_bias() const;
-
-private:
-    bool gate_bias_;
-    bool up_bias_;
-};
-// DEPRECATED END
-
-// DEPRECATED BEGIN — Legacy macros
-#define INFINILM_LEGACY_QKV_LINEAR_INIT(name, q_name, k_name, v_name, ...)                     \
-    name##_ = std::make_shared<layers::linear::LegacyQKVParallelLinear>(__VA_ARGS__);          \
-    this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight());        \
-    this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight());        \
-    this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight());        \
-    if (name##_->has_q_bias())                                                                  \
-        this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias());        \
-    if (name##_->has_k_bias())                                                                  \
-        this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias());        \
-    if (name##_->has_v_bias())                                                                  \
-        this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
-
-#define INFINILM_LEGACY_QKV_LINEAR_W8A8_INIT(name, q_name, k_name, v_name, ...)                            \
-    name##_ = std::make_shared<layers::linear::LegacyQKVParallelLinear>(__VA_ARGS__);                      \
-    this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight());                    \
-    this->register_parameter(std::string(q_name) + ".weight_scale", name##_->get_q_weight_scale());        \
-    this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight());                    \
-    this->register_parameter(std::string(k_name) + ".weight_scale", name##_->get_k_weight_scale());        \
-    this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight());                    \
-    this->register_parameter(std::string(v_name) + ".weight_scale", name##_->get_v_weight_scale());        \
-    if (name##_->has_q_bias())                                                                             \
-        this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias());                    \
-    if (name##_->has_k_bias())                                                                             \
-        this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias());                    \
-    if (name##_->has_v_bias())                                                                             \
-        this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
-
-#define INFINILM_LEGACY_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...)                                \
-    name##_ = std::make_shared<layers::linear::LegacyQKVParallelLinear>(__VA_ARGS__);                               \
-    auto awq_ptr = std::static_pointer_cast<infinicore::quantization::AWQ>(name##_->get_quantization());            \
-    int packing_num = awq_ptr->get_packing_num();                                                                   \
-    this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight_awq(packing_num));             \
-    this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros_awq(packing_num));        \
-    this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale_awq(1));                  \
-    this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight_awq(packing_num));             \
-    this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros_awq(packing_num));        \
-    this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale_awq(1));                  \
-    this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight_awq(packing_num));             \
-    this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros_awq(packing_num));        \
-    this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale_awq(1));                  \
-    if (name##_->has_q_bias())                                                                                      \
-        this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias());                             \
-    if (name##_->has_k_bias())                                                                                      \
-        this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias());                             \
-    if (name##_->has_v_bias())                                                                                      \
-        this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
-
-#define INFINILM_LEGACY_QKV_LINEAR_W4A16GPTQ_INIT(name, q_name, k_name, v_name, ...)                                \
-    name##_ = std::make_shared<layers::linear::LegacyQKVParallelLinear>(__VA_ARGS__);                                \
-    auto gptq_ptr = std::static_pointer_cast<infinicore::quantization::GPTQ_QY>(name##_->get_quantization());        \
-    int packing_num = gptq_ptr->get_packing_num();                                                                   \
-    this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight_awq(1));                        \
-    this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros_awq(8));                   \
-    this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale_awq(1));                   \
-    this->register_parameter(std::string(q_name) + ".g_idx", name##_->get_q_g_idx_gptq());                           \
-    this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight_awq(1));                        \
-    this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros_awq(8));                   \
-    this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale_awq(1));                   \
-    this->register_parameter(std::string(k_name) + ".g_idx", name##_->get_k_g_idx_gptq());                           \
-    this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight_awq(1));                        \
-    this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros_awq(8));                   \
-    this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale_awq(1));                   \
-    this->register_parameter(std::string(v_name) + ".g_idx", name##_->get_v_g_idx_gptq());                           \
-    if (name##_->has_q_bias())                                                                                       \
-        this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias());                              \
-    if (name##_->has_k_bias())                                                                                       \
-        this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias());                              \
-    if (name##_->has_v_bias())                                                                                       \
-        this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
-
-#define INFINILM_LEGACY_GATE_UP_LINEAR_INIT(name, gate_name, up_name, ...)                          \
-    name##_ = std::make_shared<layers::linear::LegacyGateUpParallelLinear>(__VA_ARGS__);            \
-    this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight());      \
-    this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight());          \
-    if (name##_->has_gate_bias())                                                                   \
-        this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias());      \
-    if (name##_->has_up_bias())                                                                     \
-        this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
-
-#define INFINILM_LEGACY_GATE_UP_LINEAR_W8A8_INIT(name, gate_name, up_name, ...)                                  \
-    name##_ = std::make_shared<layers::linear::LegacyGateUpParallelLinear>(__VA_ARGS__);                          \
-    this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight());                    \
-    this->register_parameter(std::string(gate_name) + ".weight_scale", name##_->get_gate_weight_scale());        \
-    this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight());                        \
-    this->register_parameter(std::string(up_name) + ".weight_scale", name##_->get_up_weight_scale());            \
-    if (name##_->has_gate_bias())                                                                                 \
-        this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias());                    \
-    if (name##_->has_up_bias())                                                                                   \
-        this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
-
-#define INFINILM_LEGACY_GATE_UP_LINEAR_W4A16AWQ_INIT(name, gate_name, up_name, ...)                            \
-    name##_ = std::make_shared<layers::linear::LegacyGateUpParallelLinear>(__VA_ARGS__);                        \
-    this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight_awq());              \
-    this->register_parameter(std::string(gate_name) + ".qzeros", name##_->get_gate_weight_zeros_awq());        \
-    this->register_parameter(std::string(gate_name) + ".scales", name##_->get_gate_weight_scale_awq());        \
-    this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight_awq());                  \
-    this->register_parameter(std::string(up_name) + ".qzeros", name##_->get_up_weight_zeros_awq());            \
-    this->register_parameter(std::string(up_name) + ".scales", name##_->get_up_weight_scale_awq());            \
-    if (name##_->has_gate_bias())                                                                               \
-        this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias());                   \
-    if (name##_->has_up_bias())                                                                                 \
-        this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
-
-#define INFINILM_LEGACY_GATE_UP_LINEAR_W4A16GPTQ_INIT(name, gate_name, up_name, ...)                           \
-    name##_ = std::make_shared<layers::linear::LegacyGateUpParallelLinear>(__VA_ARGS__);                        \
-    this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight_awq());              \
-    this->register_parameter(std::string(gate_name) + ".qzeros", name##_->get_gate_weight_zeros_awq());        \
-    this->register_parameter(std::string(gate_name) + ".scales", name##_->get_gate_weight_scale_awq());        \
-    this->register_parameter(std::string(gate_name) + ".g_idx", name##_->get_gate_g_idx_gptq());               \
-    this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight_awq());                  \
-    this->register_parameter(std::string(up_name) + ".qzeros", name##_->get_up_weight_zeros_awq());            \
-    this->register_parameter(std::string(up_name) + ".scales", name##_->get_up_weight_scale_awq());            \
-    this->register_parameter(std::string(up_name) + ".g_idx", name##_->get_up_g_idx_gptq());                   \
-    if (name##_->has_gate_bias())                                                                               \
-        this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias());                   \
-    if (name##_->has_up_bias())                                                                                 \
-        this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
-// DEPRECATED END
-
-} // namespace infinilm::layers::linear
diff --git a/csrc/models/llama_legacy/llama.hpp b/csrc/models/llama_legacy/llama.hpp
deleted file mode 100644
index 8402a1abc..000000000
--- a/csrc/models/llama_legacy/llama.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#pragma once
-
-/**
- * @file llama.hpp
- * @brief Main header file for Llama model architecture
- *
- * This header includes all components of the Llama model architecture
- * built using InfiniCore::nn::Module pattern.
- *
- * Components:
- * - LlamaConfig: Model configuration structure
- * - LlamaAttention: Multi-head self-attention module
- * - LlamaMLP: Feed-forward network module
- * - LlamaDecoderLayer: Single transformer decoder layer
- * - LlamaModel: Core transformer model (without LM head)
- * - LlamaForCausalLM: Complete model with language modeling head
- */
-
-#include "../../config/model_config.hpp"
-#include "llama_attention.hpp"
-#include "llama_decoder_layer.hpp"
-#include "llama_for_causal_lm.hpp"
-#include "llama_mlp.hpp"
-#include "llama_model.hpp"
diff --git a/csrc/models/llama_legacy/llama_attention.cpp b/csrc/models/llama_legacy/llama_attention.cpp
deleted file mode 100644
index e2b2350a4..000000000
--- a/csrc/models/llama_legacy/llama_attention.cpp
+++ /dev/null
@@ -1,439 +0,0 @@
-#include "llama_attention.hpp"
-
-#include "../../utils.hpp"
-#include "infinicore/nn/linear.hpp"
-#include "infinicore/nn/rope.hpp"
-#include "infinicore/ops.hpp"
-#include "infinicore/ops/mha_kvcache.hpp"
-#include "infinicore/ops/mha_varlen.hpp"
-#include "infinicore/ops/mul.hpp"
-#include "infinicore/ops/per_tensor_dequant_i8.hpp"
-#include "infinicore/ops/per_tensor_quant_i8.hpp"
-
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <optional>
-#include <spdlog/spdlog.h>
-#include <stdexcept>
-#include <vector>
-
-namespace infinilm::models::llama_legacy {
-
-using layers::linear::to_legacy_quant;
-using layers::linear::to_legacy_quant_scheme;
-using layers::linear::to_legacy_kv_quant_algo;
-
-/**
- * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
- *
- * ⚠️ DEVELOPMENT POLICY:
- *   - NO new development or feature additions permitted on this interface
- *   - Only critical bug fixes (security/stability) allowed until removal
- *   - All new code MUST migrate to the polymorphic overload below
- *
- * Replacement: Use the polymorphic overload of this same function name with updated signature
- * Reason: Legacy signature lacks support for dynamic quantization modes.
- * Removal target: v0.2.0 (Q2 2026)
- */
-LlamaAttention::LlamaAttention(const LlamaConfig &config,
-                               const infinicore::Device &device,
-                               size_t layer_idx,
-                               engine::distributed::RankInfo rank_info,
-                               backends::AttentionBackend attention_backend)
-    : layer_idx_(layer_idx),
-      hidden_size_(config.hidden_size),
-      num_attention_heads_(config.num_attention_heads),
-      num_key_value_heads_(config.num_key_value_heads),
-      head_dim_(config.head_dim),
-      kv_dim_(config.kv_dim()),
-      use_bias_(config.attention_bias),
-      use_output_bias_(config.attention_output_bias),
-      use_qk_norm_(config.qk_norm),
-      max_position_embeddings_(config.max_position_embeddings),
-      rank_info_(rank_info),
-      attention_backend_(attention_backend) {
-    const auto &dtype{config.dtype};
-
-    int tp_rank = rank_info.tp_rank;
-    int tp_size = rank_info.tp_size;
-
-    int num_attention_heads = config.num_attention_heads;
-    int num_key_value_heads = config.num_key_value_heads;
-
-    if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) {
-        this->num_attention_heads_ = num_attention_heads / tp_size;
-        this->num_key_value_heads_ = num_key_value_heads / tp_size;
-    } else {
-        throw std::runtime_error("num_attention_heads / tp_size error.");
-    }
-    scaling_ = 1.0f / std::sqrt(static_cast<float>(head_dim_));
-
-    // Initialize projection layers
-    INFINILM_LEGACY_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_,
-                             dtype, device, rank_info);
-    // Output projection uses attention_output_bias (can be different from qkv)
-    INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads * head_dim_, hidden_size_, use_output_bias_,
-                              dtype, device, tp_rank, tp_size, rank_info.comm);
-
-    // Initialize qk RMSNorm
-    if (use_qk_norm_) {
-        INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, config.rms_norm_eps, dtype, device);
-        INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, config.rms_norm_eps, dtype, device);
-    }
-}
-
-LlamaAttention::LlamaAttention(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-                               const infinicore::Device &device,
-                               size_t layer_idx,
-                               engine::distributed::RankInfo rank_info,
-                               backends::AttentionBackend attention_backend)
-    : model_config_(model_config),
-      layer_idx_(layer_idx),
-      hidden_size_(model_config->get<size_t>("hidden_size")),
-      num_attention_heads_(model_config->get<size_t>("num_attention_heads")),
-      num_key_value_heads_(model_config->get<size_t>("num_key_value_heads")),
-      head_dim_(model_config->get_head_dim()),
-      kv_dim_(model_config->get_kv_dim()),
-      use_bias_(model_config->get_or<bool>("attention_bias", true)),
-      use_output_bias_(model_config->get_or<bool>("attention_output_bias", false)),
-      max_position_embeddings_(model_config->get<size_t>("max_position_embeddings")),
-      rank_info_(rank_info),
-      attention_backend_(attention_backend) {
-    const auto &dtype{model_config_->get_dtype()};
-
-    int tp_rank = rank_info.tp_rank;
-    int tp_size = rank_info.tp_size;
-
-    int num_attention_heads = model_config_->get<size_t>("num_attention_heads");
-    int num_key_value_heads = model_config_->get<size_t>("num_key_value_heads");
-
-    if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) {
-        this->num_attention_heads_ = num_attention_heads / tp_size;
-        this->num_key_value_heads_ = num_key_value_heads / tp_size;
-    } else {
-        throw std::runtime_error("num_attention_heads / tp_size error.");
-    }
-    scaling_ = 1.0f / std::sqrt(static_cast<float>(head_dim_));
-
-    auto quant_scheme = to_legacy_quant_scheme(this->model_config_->get_quant_scheme());
-    switch (quant_scheme) {
-    case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8:
-        INFINILM_LEGACY_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get<size_t>("num_attention_heads"), model_config_->get<size_t>("num_key_value_heads"), to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                      dtype, device, rank_info);
-        INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get<size_t>("num_attention_heads") * head_dim_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_output_bias_,
-                                  dtype, device, tp_rank, tp_size, rank_info.comm);
-        break;
-
-    case infinicore::quantization::QuantScheme::AWQ_W4A16: {
-        INFINILM_LEGACY_QKV_LINEAR_W4A16AWQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get<size_t>("num_attention_heads"), model_config_->get<size_t>("num_key_value_heads"), to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                          dtype, device, rank_info);
-        INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get<size_t>("num_attention_heads") * head_dim_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_output_bias_,
-                                  dtype, device, tp_rank, tp_size, rank_info.comm);
-        break;
-    }
-    case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY: {
-
-        INFINILM_LEGACY_QKV_LINEAR_W4A16GPTQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get<size_t>("num_attention_heads"), model_config_->get<size_t>("num_key_value_heads"), to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                           dtype, device, rank_info);
-
-        INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get<size_t>("num_attention_heads") * head_dim_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_output_bias_,
-                                  dtype, device, tp_rank, tp_size, rank_info.comm);
-
-        break;
-    }
-    default:
-        INFINILM_LEGACY_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get<size_t>("num_attention_heads"), model_config_->get<size_t>("num_key_value_heads"), to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                 dtype, device, rank_info);
-        INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get<size_t>("num_attention_heads") * head_dim_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_output_bias_,
-                                  dtype, device, tp_rank, tp_size, rank_info.comm);
-        break;
-    }
-    if (model_config_->get<std::string>("model_type") == "qwen3") {
-        INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get<double>("rms_norm_eps"), dtype, device);
-        INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get<double>("rms_norm_eps"), dtype, device);
-    }
-
-    switch (to_legacy_kv_quant_algo(this->model_config_->get_kv_quant_scheme())) {
-    case (infinicore::quantization::KVQuantAlgo::INT8): {
-        INFINICORE_NN_PARAMETER_INIT(kv_cache_k_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1));
-        INFINICORE_NN_PARAMETER_INIT(kv_cache_v_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1));
-        break;
-    }
-    default: {
-        break;
-    }
-    }
-}
-
-infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_states,
-                                            const infinicore::Tensor &position_ids,
-                                            std::shared_ptr<infinilm::cache::Cache> kv_cache,
-                                            std::optional<infinicore::Tensor> past_sequence_lengths,
-                                            std::optional<infinicore::Tensor> total_sequence_lengths) const {
-    // Input shape: [batch, seq_len, hidden_size]
-    auto hidden_states_mutable = hidden_states;
-    auto shape = hidden_states->shape();
-    size_t batch_size = shape[0];
-    size_t seq_len = shape[1];
-
-    // 1. Project Q, K, V
-    auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable);
-
-    if (use_qk_norm_ || model_config_->get_or<std::string>("model_type", "None") == "qwen3") {
-        q = q_norm_->forward(q->view({batch_size * seq_len, num_attention_heads_, head_dim_}));
-        k = k_norm_->forward(k->view({batch_size * seq_len, num_key_value_heads_, head_dim_}));
-    }
-
-    // 2. Reshape for multi-head attention
-    // Reshape Q, K, V to include batch dimension
-    // Python: query_states = self.q_proj(hidden_states).view(querys_shape)
-    // The view operation requires the tensor to be contiguous in the required dimensions
-    auto q_reshaped = q->view({batch_size, seq_len, num_attention_heads_, head_dim_});
-    auto k_reshaped = k->view({batch_size, seq_len, num_key_value_heads_, head_dim_});
-    auto v_reshaped = v->view({batch_size, seq_len, num_key_value_heads_, head_dim_});
-
-    // 3. Prepare position_ids for RoPE - align with Python pattern
-    // Python: bs, num = pos_ids.shape; pos_ids = pos_ids.view((bs * num,))
-    auto pos_shape = position_ids->shape();
-    infinicore::Tensor pos_ids_for_rope = position_ids;
-    if (pos_shape.size() == 2) {
-        auto pos_narrowed = position_ids->narrow({{0, 0, 1}});
-        pos_ids_for_rope = pos_narrowed->contiguous()->view({pos_shape[1]});
-    } else if (pos_shape.size() == 1) {
-        pos_ids_for_rope = position_ids->contiguous();
-    } else {
-        throw std::runtime_error("Unexpected position_ids shape");
-    }
-
-    // 4. Apply RoPE to Q and K
-    auto q_rope = infinicore::Tensor::empty({batch_size, num_attention_heads_, seq_len, head_dim_}, q_reshaped->dtype(), q_reshaped->device())->permute({0, 2, 1, 3});
-    rotary_emb_->forward(q_rope, q_reshaped, pos_ids_for_rope); // [bs, seq_len, n_q_head, head_dim]
-    rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true);   // [bs, seq_len, n_kv_head, head_dim]
-
-    infinilm::KVQuantUtils::quantize(
-        k_reshaped, v_reshaped,
-        this->model_config_->get_kv_quant_scheme(),
-        this->kv_cache_k_scale_,
-        this->kv_cache_v_scale_);
-
-    // 5. Prepare KV caches
-    // Convert to [batch, n_head, seq_len, head_dim] for cache
-    // Ensure contiguous after permute for F16 compatibility with cache operations
-    q_reshaped = q_rope->permute({0, 2, 1, 3});          // [bs, n_q_head, seq_len, head_dim]
-    auto k_permuted = k_reshaped->permute({0, 2, 1, 3}); // [bs, n_kv_head, seq_len, head_dim]
-    auto v_permuted = v_reshaped->permute({0, 2, 1, 3}); // [bs, n_kv_head, seq_len, head_dim]
-    infinicore::Tensor k_total;                          // [bs, n_kv_head, max_seq_len, head_dim]
-    infinicore::Tensor v_total;                          // [bs, n_kv_head, max_seq_len, head_dim]
-    if (kv_cache == nullptr) {
-        k_total = k_permuted;
-        v_total = v_permuted;
-    } else if (auto static_kv_cache = std::dynamic_pointer_cast<cache::StaticKVCache>(kv_cache)) {
-        auto [k_total_tmp, v_total_tmp] = static_kv_cache->update(layer_idx_, k_permuted, v_permuted, past_sequence_lengths.value());
-        k_total = k_total_tmp;
-        v_total = v_total_tmp;
-    } else {
-        throw std::runtime_error("LlamaAttention: Unsupported kvcache type");
-    }
-
-    infinicore::Tensor attn_output;
-    if (false) {
-        // experimental nineoothed flash attention
-        attn_output = infinicore::op::flash_attention(q_reshaped, k_total, v_total, total_sequence_lengths.value(), scaling_, true);
-        attn_output = attn_output->permute({0, 2, 1, 3})
-                          ->contiguous()
-                          ->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); // [bs, seq_len, n_q_head * head_dim]
-    } else {
-        size_t total_seq_len = reinterpret_cast<int32_t *>(total_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0];
-
-        infinilm::KVQuantUtils::dequantize(
-            k_total, v_total,
-            this->model_config_->get_kv_quant_scheme(),
-            this->kv_cache_k_scale_,
-            this->kv_cache_v_scale_,
-            q_reshaped);
-
-        k_total = k_total->narrow({{2, 0, total_seq_len}}); // [bs, n_kv_head, total_seq_len, head_dim]
-        v_total = v_total->narrow({{2, 0, total_seq_len}}); // [bs, n_kv_head, total_seq_len, head_dim]
-
-        // 6. Compute attention
-        size_t ngroup = num_attention_heads_ / num_key_value_heads_;
-        auto Q = q_reshaped->view({batch_size * num_key_value_heads_, ngroup * seq_len, head_dim_});
-        auto K = k_total->view({batch_size * num_key_value_heads_, total_seq_len, head_dim_});
-        auto V = v_total->view({batch_size * num_key_value_heads_, total_seq_len, head_dim_});
-
-        auto K_transposed = K->permute({0, 2, 1}); // [bs * n_kv_head, head_dim, total_seq_len]
-
-        auto attn_weight = infinicore::op::matmul(Q, K_transposed, scaling_); // [bs * n_kv_head, ng * seq_len, total_seq_len]
-
-        auto attn_weight_softmax = attn_weight->view({batch_size * num_attention_heads_, seq_len, total_seq_len});
-        infinicore::op::causal_softmax_(attn_weight_softmax, attn_weight_softmax);
-
-        auto out = infinicore::op::matmul(attn_weight, V); // [bs * n_kv_head, ng * seq_len, head_dim]
-
-        attn_output = out->view({batch_size, num_attention_heads_, seq_len, head_dim_})
-                          ->permute({0, 2, 1, 3})
-                          ->contiguous()
-                          ->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); // [bs, seq_len, n_q_head * head_dim]
-    }
-
-    auto output = o_proj_->forward(attn_output);
-
-    return output;
-}
-
-infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidden_states,
-                                                  const infinicore::Tensor &position_ids,
-                                                  std::shared_ptr<infinilm::cache::PagedKVCache> paged_kv_cache,
-                                                  std::optional<infinicore::Tensor> total_sequence_lengths,
-                                                  std::optional<infinicore::Tensor> input_offsets,
-                                                  std::optional<infinicore::Tensor> cu_seqlens,
-                                                  std::optional<infinicore::Tensor> block_tables,
-                                                  std::optional<infinicore::Tensor> slot_mapping) const {
-    ASSERT(block_tables.has_value());
-    ASSERT(slot_mapping.has_value());
-
-    // Input shape: [batch, seq_len, hidden_size]
-    auto hidden_states_mutable = hidden_states;
-    auto shape = hidden_states->shape();
-    size_t batch_size = shape[0];
-    size_t seq_len = shape[1];
-
-    // Only support batchsize==1, all requests should be flattened along seqlen dimension
-    ASSERT_EQ(batch_size, 1);
-    // Decode only if total_len == num_requests
-    bool is_prefill = (seq_len != total_sequence_lengths.value()->shape()[0]);
-
-    // 1. Project Q, K, V
-    auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable);
-
-    // 2. Reshape for multi-head attention
-
-    // Reshape Q, K, V to include batch dimension
-    // Python: query_states = self.q_proj(hidden_states).view(querys_shape)
-    // The view operation requires the tensor to be contiguous in the required dimensions
-    auto q_reshaped = q->view({seq_len, num_attention_heads_, head_dim_});
-    auto k_reshaped = k->view({seq_len, num_key_value_heads_, head_dim_});
-    auto v_reshaped = v->view({seq_len, num_key_value_heads_, head_dim_});
-
-    if (use_qk_norm_ || model_config_->get_or<std::string>("model_type", "None") == "qwen3") {
-        q_reshaped = q_norm_->forward(q_reshaped);
-        k_reshaped = k_norm_->forward(k_reshaped);
-    }
-
-    // 3. Prepare position_ids for RoPE - align with Python pattern
-    auto pos_shape = position_ids->shape();
-    infinicore::Tensor pos_ids_for_rope = position_ids;
-    if (pos_shape.size() == 2) {
-        auto pos_narrowed = position_ids->narrow({{0, 0, 1}});
-        pos_ids_for_rope = pos_narrowed->view({pos_shape[1]});
-    } else if (pos_shape.size() == 1) {
-        pos_ids_for_rope = position_ids;
-    } else {
-        throw std::runtime_error("Unexpected position_ids shape");
-    }
-
-    // 4. Apply RoPE to Q and K
-    rotary_emb_->forward(q_reshaped, pos_ids_for_rope, true); // [bs, seq_len, n_q_head, head_dim]
-    rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true); // [bs, seq_len, n_kv_head, head_dim]
-
-    //  5. Prepare KV caches
-    //  Ensure contiguous after permute for F16 compatibility with cache operations
-    auto [k_total, v_total] = paged_kv_cache->update(layer_idx_,
-                                                     k_reshaped,
-                                                     v_reshaped,
-                                                     slot_mapping.value());
-
-    // 6. Compute attention
-    infinicore::Tensor attn_output = infinicore::Tensor::empty({seq_len, num_attention_heads_, head_dim_}, q_reshaped->dtype(), q_reshaped->device());
-
-    if (is_prefill) {
-        if (attention_backend_ == backends::AttentionBackend::FLASH_ATTN) {
-            infinicore::op::mha_varlen_(
-                attn_output,
-                q_reshaped,
-                k_total->permute({0, 2, 1, 3}),
-                v_total->permute({0, 2, 1, 3}),
-                input_offsets.value(),
-                cu_seqlens.value(),
-                block_tables.value(),
-                max_position_embeddings_,
-                max_position_embeddings_,
-                std::nullopt,
-                scaling_);
-        } else {
-            infinicore::op::paged_attention_prefill_(
-                attn_output,
-                q_reshaped,
-                k_total,
-                v_total,
-                block_tables.value(),
-                total_sequence_lengths.value(),
-                input_offsets.value(),
-                std::nullopt,
-                scaling_);
-        }
-    } else {
-        if (attention_backend_ == backends::AttentionBackend::FLASH_ATTN) {
-            // FA2 decode path: flash::mha_fwd_kvcache
-            // In paged-attn mode, seq_len = actual batch_size (one query token per sequence).
-            // q_reshaped: [seq_len, num_heads, head_dim] → [seq_len, 1, num_heads, head_dim]
-            // k/v cache:  [num_blocks, num_kv_heads, block_size, head_dim]
-            //           → permute {0,2,1,3} → [num_blocks, block_size, num_kv_heads, head_dim]
-            auto q_for_fa = q_reshaped->view({seq_len, 1, num_attention_heads_, head_dim_});
-            auto attn_out_4d = infinicore::op::mha_kvcache(
-                q_for_fa,
-                k_total->permute({0, 2, 1, 3}), // [num_blocks, block_size, num_kv_heads, head_dim]
-                v_total->permute({0, 2, 1, 3}),
-                total_sequence_lengths.value(), // [seq_len] int32 (one entry per sequence)
-                block_tables.value(),           // [seq_len, max_num_blocks_per_seq] int32
-                std::nullopt,
-                scaling_);
-            attn_output = attn_out_4d->view({seq_len, num_attention_heads_, head_dim_});
-        } else {
-            infinicore::op::paged_attention_(
-                attn_output,
-                q_reshaped,
-                k_total,
-                v_total,
-                block_tables.value(),
-                total_sequence_lengths.value(),
-                std::nullopt,
-                scaling_);
-        }
-    }
-
-    // 7. Project output
-    attn_output
-        = attn_output->view({1, seq_len, num_attention_heads_ * head_dim_});
-    return o_proj_->forward(attn_output);
-}
-
-infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
-                                           const infinicore::Tensor &position_ids,
-                                           std::shared_ptr<cache::Cache> kv_cache,
-                                           std::optional<infinicore::Tensor> past_sequence_lengths,
-                                           std::optional<infinicore::Tensor> total_sequence_lengths,
-                                           std::optional<infinicore::Tensor> input_offsets,
-                                           std::optional<infinicore::Tensor> cu_seqlens,
-                                           std::optional<infinicore::Tensor> block_tables,
-                                           std::optional<infinicore::Tensor> slot_mapping) const {
-    if (!rotary_emb_) {
-        throw std::runtime_error("LlamaAttention: rotary_emb not configured");
-    }
-
-    infinicore::Tensor output;
-    if (auto paged_kv_cache = std::dynamic_pointer_cast<cache::PagedKVCache>(kv_cache)) {
-        output = forward_paged_(hidden_states, position_ids, paged_kv_cache, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping);
-    } else {
-
-        output = forward_(hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths);
-    }
-    return output;
-}
-
-void LlamaAttention::set_rotary_emb(const std::shared_ptr<infinicore::nn::RoPE> &rotary_emb) {
-    rotary_emb_ = rotary_emb;
-}
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_attention.hpp b/csrc/models/llama_legacy/llama_attention.hpp
deleted file mode 100644
index 6579be438..000000000
--- a/csrc/models/llama_legacy/llama_attention.hpp
+++ /dev/null
@@ -1,142 +0,0 @@
-#pragma once
-
-#include "../../backends/attention_backends.hpp"
-#include "../../cache/kv_cache.hpp"
-#include "../../config/model_config.hpp"
-#include "../../engine/distributed/distributed.hpp"
-#include "../../layers/linear/fused_linear.hpp"
-#include "../../layers/quantization/kv_quant.hpp"
-#include "legacy_fused_linear.hpp"
-#include "llama_config.hpp"
-
-#include "infinicore/nn/linear.hpp"
-#include "infinicore/nn/module.hpp"
-#include "infinicore/nn/rmsnorm.hpp"
-#include "infinicore/nn/rope.hpp"
-#include "infinicore/tensor.hpp"
-#include "llama_config.hpp"
-#include <algorithm>
-#include <memory>
-#include <utility>
-
-namespace infinilm::models::llama_legacy {
-
-class LlamaAttention : public infinicore::nn::Module {
-public:
-    /**
-     * @brief Construct LlamaAttention module
-     *
-     * @param config Model configuration
-     * @param device Device to create tensors on
-     * @param layer_idx Layer index for cache access
-     * @param dtype Optional data type for model parameters (defaults to F32)
-     */
-    /**
-     * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
-     *
-     * ⚠️ DEVELOPMENT POLICY:
-     *   - NO new development or feature additions permitted on this interface
-     *   - Only critical bug fixes (security/stability) allowed until removal
-     *   - All new code MUST migrate to the polymorphic overload below
-     *
-     * Replacement: Use the polymorphic overload of this same function name with updated signature
-     * Reason: Legacy signature lacks support for dynamic quantization modes.
-     * Removal target: v0.2.0 (Q2 2026)
-     */
-    LlamaAttention(const LlamaConfig &config,
-                   const infinicore::Device &device,
-                   size_t layer_idx,
-                   engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
-                   backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
-
-    LlamaAttention(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-                   const infinicore::Device &device,
-                   size_t layer_idx,
-                   engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
-                   backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
-
-    /**
-     * @brief Forward pass: compute attention
-     *
-     * @param hidden_states Input tensor of shape [batch, seq_len, hidden_size]
-     * @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
-     * @param kv_cache Optional model-level KV cache for incremental decoding
-     * @return Output tensor of shape [batch, seq_len, hidden_size]
-     */
-    infinicore::Tensor forward(const infinicore::Tensor &hidden_states,
-                               const infinicore::Tensor &position_ids,
-                               std::shared_ptr<infinilm::cache::Cache> kv_cache,
-                               std::optional<infinicore::Tensor> past_sequence_lengths,
-                               std::optional<infinicore::Tensor> total_sequence_lengths,
-                               std::optional<infinicore::Tensor> input_offsets,
-                               std::optional<infinicore::Tensor> cu_seqlens,
-                               std::optional<infinicore::Tensor> block_tables,
-                               std::optional<infinicore::Tensor> slot_mapping) const;
-
-    /**
-     * @brief Get the layer index
-     */
-    size_t layer_idx() const { return layer_idx_; }
-
-    /**
-     * @brief Provide shared RoPE module from parent model.
-     */
-    void set_rotary_emb(const std::shared_ptr<infinicore::nn::RoPE> &rotary_emb);
-
-    // Module information
-    size_t num_heads() const { return num_attention_heads_; }
-    size_t num_kv_heads() const { return num_key_value_heads_; }
-    size_t head_dim() const { return head_dim_; }
-    size_t hidden_size() const { return hidden_size_; }
-
-private:
-    infinicore::Tensor forward_(const infinicore::Tensor &hidden_states,
-                                const infinicore::Tensor &position_ids,
-                                std::shared_ptr<infinilm::cache::Cache> kv_cache,
-                                std::optional<infinicore::Tensor> past_sequence_lengths,
-                                std::optional<infinicore::Tensor> total_sequence_lengths) const;
-
-    infinicore::Tensor forward_paged_(const infinicore::Tensor &hidden_states,
-                                      const infinicore::Tensor &position_ids,
-                                      std::shared_ptr<infinilm::cache::PagedKVCache> kv_cache,
-                                      std::optional<infinicore::Tensor> total_sequence_lengths,
-                                      std::optional<infinicore::Tensor> input_offsets,
-                                      std::optional<infinicore::Tensor> cu_seqlens,
-                                      std::optional<infinicore::Tensor> block_tables,
-                                      std::optional<infinicore::Tensor> slot_mapping) const;
-
-protected:
-    // Projection layers
-    INFINICORE_NN_MODULE(layers::linear::LegacyQKVParallelLinear, qkv_proj);
-    INFINICORE_NN_MODULE(infinicore::nn::RowParallelLinear, o_proj);
-    INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm);
-    INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm);
-
-    engine::distributed::RankInfo rank_info_;
-
-    // Shared Rotary Position Embeddings (RoPE)
-    std::shared_ptr<infinicore::nn::RoPE> rotary_emb_;
-
-    // For off-line kv cache quantization
-    INFINICORE_NN_PARAMETER(kv_cache_k_scale);
-    INFINICORE_NN_PARAMETER(kv_cache_v_scale);
-
-private:
-    std::shared_ptr<infinilm::config::ModelConfig> model_config_ = std::make_shared<infinilm::config::ModelConfig>();
-    size_t layer_idx_; // Layer index for cache access
-    size_t hidden_size_;
-    size_t num_attention_heads_;
-    size_t num_key_value_heads_;
-    size_t head_dim_;
-    size_t kv_dim_;
-    bool use_bias_;                  // Bias for Q/K/V projections
-    bool use_output_bias_;           // Bias for output projection (o_proj)
-    bool use_qk_norm_ = false;       // Whether to use QK RMSNorm
-    size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility)
-
-    float scaling_;
-
-    backends::AttentionBackend attention_backend_;
-};
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_config.hpp b/csrc/models/llama_legacy/llama_config.hpp
deleted file mode 100644
index 44cee1b89..000000000
--- a/csrc/models/llama_legacy/llama_config.hpp
+++ /dev/null
@@ -1,95 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <string>
-#include <vector>
-
-#include "../infinilm_model.hpp"
-
-#include <infinicore/nn/rope.hpp>
-
-namespace infinilm::models::llama_legacy {
-
-/**
- * @brief Configuration structure for Llama model architecture
- *
- * This struct holds all hyperparameters needed to construct a Llama model.
- * It follows the same structure as HuggingFace's LlamaConfig.
- */
-struct LlamaConfig : public InfinilmModel::Config {
-    // Data type
-    infinicore::DataType dtype = infinicore::DataType::F32;
-
-    // Vocabulary and embedding
-    size_t vocab_size = 32000;        // Vocabulary size
-    size_t hidden_size = 4096;        // Hidden dimension size
-    size_t intermediate_size = 11008; // MLP intermediate dimension
-
-    // Architecture
-    size_t num_hidden_layers = 32;   // Number of decoder layers
-    size_t num_attention_heads = 32; // Number of attention heads
-    size_t num_key_value_heads = 32; // Number of key-value heads (for GQA)
-    size_t head_dim = 128;           // Attention head dimension (hidden_size / num_attention_heads)
-
-    // Position embeddings
-    size_t max_position_embeddings = 2048; // Maximum sequence length
-    double rope_theta = 10000.0;           // RoPE base frequency
-
-    std::shared_ptr<infinicore::nn::RopeScalingConfig> rope_scaling = nullptr; // RoPE scaling type
-
-    // Normalization
-    double rms_norm_eps = 1e-6; // RMSNorm epsilon
-
-    // Activation
-    std::string hidden_act = "silu";  // Activation function (typically "silu")
-    std::string model_type = "llama"; // Model type identifier (matches HF configs)
-
-    // Optional features
-    bool use_cache = true;              // Whether to use KV cache
-    bool attention_bias = true;         // Whether to use bias in Q/K/V projections (default true for 9G7B compatibility)
-    bool attention_output_bias = false; // Whether to use bias in output projection (o_proj)
-    bool mlp_bias = false;              // Whether to use bias in MLP projections
-    bool tie_word_embeddings = false;   // Whether to tie input/output embeddings
-    bool qk_norm = false;               // Whether to use QK RMSNorm
-
-    // Training/initialization parameters
-    double attention_dropout = 0.0;  // Dropout ratio for attention probabilities
-    double initializer_range = 0.02; // Standard deviation for weight initialization
-    size_t pretraining_tp = 1;       // Tensor parallelism rank used during pretraining
-
-    // Model metadata
-    std::string name_or_path = ""; // Model name or path identifier
-
-    // Token IDs
-    int64_t pad_token_id = -1;               // Padding token ID (optional)
-    std::vector<int64_t> bos_token_id = {1}; // Beginning of sequence token ID(s)
-    std::vector<int64_t> eos_token_id = {2}; // End of sequence token ID(s)
-
-    /**
-     * @brief Compute key-value dimension for Grouped Query Attention (GQA)
-     * @return The dimension for key/value projections
-     */
-    size_t kv_dim() const {
-        return hidden_size * num_key_value_heads / num_attention_heads;
-    }
-
-    /**
-     * @brief Validate configuration parameters
-     * @return true if configuration is valid
-     */
-    bool validate() const {
-        if (hidden_size % num_attention_heads != 0) {
-            return false;
-        }
-        if (num_attention_heads % num_key_value_heads != 0) {
-            return false;
-        }
-        if (head_dim != hidden_size / num_attention_heads) {
-            return false;
-        }
-        return true;
-    }
-};
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_decoder_layer.cpp b/csrc/models/llama_legacy/llama_decoder_layer.cpp
deleted file mode 100644
index 0cb7fb83a..000000000
--- a/csrc/models/llama_legacy/llama_decoder_layer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#include "llama_decoder_layer.hpp"
-#include "infinicore/nn/rmsnorm.hpp"
-#include "infinicore/ops.hpp"
-#include <optional>
-
-namespace infinilm::models::llama_legacy {
-
-LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-                                     const infinicore::Device &device,
-                                     size_t layer_idx,
-                                     engine::distributed::RankInfo rank_info,
-                                     backends::AttentionBackend attention_backend) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) {
-    const auto &dtype{model_config_->get_dtype()};
-    input_layernorm_ = this->register_module<infinicore::nn::RMSNorm>("input_layernorm", model_config_->get<size_t>("hidden_size"), model_config_->get<double>("rms_norm_eps"),
-                              dtype, device);
-    post_attention_layernorm_ = this->register_module<infinicore::nn::RMSNorm>("post_attention_layernorm", model_config_->get<size_t>("hidden_size"), model_config_->get<double>("rms_norm_eps"),
-                              dtype, device);
-
-    self_attn_ = this->register_module<LlamaAttention>("self_attn", model_config_, device, layer_idx, rank_info_, attention_backend);
-    mlp_ = this->register_module<LlamaMLP>("mlp", model_config_, device, rank_info_);
-}
-
-std::tuple<infinicore::Tensor, infinicore::Tensor>
-LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states,
-                           infinicore::Tensor &residual,
-                           const infinicore::Tensor &position_ids,
-                           std::shared_ptr<infinilm::cache::Cache> kv_cache,
-                           std::optional<infinicore::Tensor> past_sequence_lengths,
-                           std::optional<infinicore::Tensor> total_sequence_lengths,
-                           std::optional<infinicore::Tensor> input_offsets,
-                           std::optional<infinicore::Tensor> cu_seqlens,
-                           std::optional<infinicore::Tensor> block_tables,
-                           std::optional<infinicore::Tensor> slot_mapping) const {
-    // 1. Attention layer normalization
-    input_layernorm_->forward_inplace(hidden_states, residual);
-
-    // 2. Self-attention
-    hidden_states = self_attn_->forward(
-        hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping);
-
-    // 3. Post-attention layer normalization
-    post_attention_layernorm_->forward_inplace(hidden_states, residual);
-
-    // 4. MLP
-    hidden_states = mlp_->forward(hidden_states);
-
-    return std::make_tuple(hidden_states, residual);
-}
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_decoder_layer.hpp b/csrc/models/llama_legacy/llama_decoder_layer.hpp
deleted file mode 100644
index 9943639b1..000000000
--- a/csrc/models/llama_legacy/llama_decoder_layer.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-#pragma once
-
-#include "infinicore/device.hpp"
-#include "infinicore/nn/module.hpp"
-#include "infinicore/nn/rmsnorm.hpp"
-#include "infinicore/tensor.hpp"
-#include "llama_attention.hpp"
-#include "llama_config.hpp"
-#include "llama_mlp.hpp"
-
-#include "../../engine/distributed/distributed.hpp"
-
-namespace infinilm::models::llama_legacy {
-
-/**
- * @brief Single decoder layer (transformer block) for Llama
- *
- * Each decoder layer consists of:
- * - Input layer normalization (RMSNorm)
- * - Self-attention mechanism
- * - Post-attention layer normalization (RMSNorm)
- * - MLP feed-forward network
- *
- * Residual connections are applied around both attention and MLP blocks.
- */
-class LlamaDecoderLayer : public infinicore::nn::Module {
-public:
-    /**
-     * @brief Construct LlamaDecoderLayer module
-     *
-     * @param config Model configuration
-     * @param device Device to create tensors on
-     * @param layer_idx Layer index for cache management and debugging
-     * @param dtype Optional data type for model parameters (defaults to F32)
-     */
-    LlamaDecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-                      const infinicore::Device &device,
-                      size_t layer_idx,
-                      engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
-                      backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
-
-    /**
-     * @brief Forward pass: process one decoder layer
-     *
-     * @param hidden_states [batch, seq_len, hidden_size], will be modified
-     * @param residual [batch, seq_len, hidden_size], will be modified
-     * @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
-     * @param kv_cache Optional KV cache for incremental decoding
-     * @return Output tensor of shape [batch, seq_len, hidden_size]
-     *         Updated residual tensor of shape [batch, seq_len, hidden_size]
-     */
-    std::tuple<infinicore::Tensor, infinicore::Tensor>
-    forward(infinicore::Tensor &hidden_states,
-            infinicore::Tensor &residual,
-            const infinicore::Tensor &position_ids,
-            std::shared_ptr<infinilm::cache::Cache> kv_cache,
-            std::optional<infinicore::Tensor> past_sequence_lengths,
-            std::optional<infinicore::Tensor> total_sequence_lengths,
-            std::optional<infinicore::Tensor> input_offsets,
-            std::optional<infinicore::Tensor> cu_seqlens,
-            std::optional<infinicore::Tensor> block_tables,
-            std::optional<infinicore::Tensor> slot_mappin) const;
-
-    /**
-     * @brief Get the layer index
-     */
-    size_t layer_idx() const { return layer_idx_; }
-
-    void set_rotary_emb(const std::shared_ptr<infinicore::nn::RoPE> &rotary_emb) {
-        if (self_attn_) {
-            self_attn_->set_rotary_emb(rotary_emb);
-        }
-    }
-
-protected:
-    // Layer normalization
-    std::shared_ptr<infinicore::nn::RMSNorm> input_layernorm_;
-    std::shared_ptr<infinicore::nn::RMSNorm> post_attention_layernorm_;
-
-    std::shared_ptr<LlamaAttention> self_attn_;
-    std::shared_ptr<LlamaMLP> mlp_;
-    engine::distributed::RankInfo rank_info_;
-    std::shared_ptr<infinilm::config::ModelConfig> model_config_;
-
-private:
-    size_t layer_idx_; // Layer index for cache management and debugging
-};
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_for_causal_lm.cpp b/csrc/models/llama_legacy/llama_for_causal_lm.cpp
deleted file mode 100644
index 2b0f5d72e..000000000
--- a/csrc/models/llama_legacy/llama_for_causal_lm.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "llama_for_causal_lm.hpp"
-#include "infinicore/context/context.hpp"
-#include "infinicore/nn/linear.hpp"
-#include "infinicore/ops.hpp"
-namespace infinilm::models::llama_legacy {
-
-LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-                                   const infinicore::Device &device,
-                                   engine::distributed::RankInfo rank_info,
-                                   backends::AttentionBackend attention_backend) {
-    spdlog::warn("infinilm::models::llama_legacy: LlamaForCausalLM is no longer supported, please use the new model instead.");
-
-    device_ = device;
-    const auto &dtype{model_config->get_dtype()};
-
-    model_ = this->register_module<LlamaModel>("model", model_config, device, rank_info, attention_backend);
-    lm_head_ = this->register_module<infinicore::nn::Linear>("lm_head", model_config->get<size_t>("hidden_size"), model_config->get<size_t>("vocab_size"), false,
-                              dtype, device);
-}
-
-LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const {
-    auto input_ids = input.input_ids.value();
-    auto position_ids = input.position_ids.value();
-    auto past_sequence_lengths = input.past_sequence_lengths;
-    auto total_sequence_length = input.total_sequence_lengths;
-    auto input_offsets = input.input_offsets;
-    auto cu_seqlens = input.cu_seqlens;
-    auto block_tables = input.block_tables;
-    auto slot_mapping = input.slot_mapping;
-
-    auto hidden_states = model_->forward(
-        input_ids, position_ids, past_sequence_lengths, total_sequence_length, input_offsets, cu_seqlens, block_tables, slot_mapping);
-
-    auto logits = lm_head_->forward(hidden_states);
-    return {logits};
-}
-
-infinicore::Tensor LlamaForCausalLM::logits_from_hidden(const infinicore::Tensor &hidden_states) const {
-    return lm_head_->forward(const_cast<infinicore::Tensor &>(hidden_states));
-}
-
-void LlamaForCausalLM::reset_cache(const cache::CacheConfig *cache_config) {
-    cache_config_ = cache_config->unique_copy();
-    model_->reset_cache(cache_config_.get());
-}
-
-const cache::CacheConfig *LlamaForCausalLM::get_cache_config() const {
-    return cache_config_.get();
-}
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_for_causal_lm.hpp b/csrc/models/llama_legacy/llama_for_causal_lm.hpp
deleted file mode 100644
index 1920dbaf6..000000000
--- a/csrc/models/llama_legacy/llama_for_causal_lm.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-#pragma once
-
-#include "../infinilm_model.hpp"
-#include "llama_model.hpp"
-
-#include "infinicore/device.hpp"
-#include "infinicore/nn/linear.hpp"
-#include "infinicore/nn/module.hpp"
-#include "infinicore/tensor.hpp"
-
-#include "../../engine/distributed/distributed.hpp"
-
-namespace infinilm::models::llama_legacy {
-
-/**
- * @brief Llama model for Causal Language Modeling
- *
- * Extends LlamaModel by adding a language modeling head (lm_head) that
- * projects hidden states to vocabulary logits.
- *
- * This matches the structure of HuggingFace's LlamaForCausalLM.
- */
-class LlamaForCausalLM : public InfinilmModel {
-public:
-    /**
-     * @brief Construct LlamaForCausalLM module
-     *
-     * @param config Model configuration
-     * @param device Device to create tensors on
-     */
-    LlamaForCausalLM(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-                     const infinicore::Device &device,
-                     engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
-                     backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
-
-    /**
-     * @brief Forward pass: compute language modeling logits
-     *
-     * @param input Encapsulated input tensors and other parameters
-     * @return Output structure containing the result
-     */
-    Output forward(const Input &input) const;
-
-    infinicore::Tensor logits_from_hidden(const infinicore::Tensor &hidden_states) const;
-
-    void reset_cache(const cache::CacheConfig *cache_config) override;
-
-    const cache::CacheConfig *get_cache_config() const override;
-
-    // Module information
-    LlamaModel &model() { return *model_; }
-    const LlamaModel &model() const { return *model_; }
-
-protected:
-    INFINICORE_NN_MODULE(LlamaModel, model);
-
-    // Language modeling head
-    INFINICORE_NN_MODULE(infinicore::nn::Linear, lm_head);
-
-    std::unique_ptr<cache::CacheConfig> cache_config_;
-};
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_mlp.cpp b/csrc/models/llama_legacy/llama_mlp.cpp
deleted file mode 100644
index 65daffac5..000000000
--- a/csrc/models/llama_legacy/llama_mlp.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-#include "llama_mlp.hpp"
-#include "infinicore/nn/linear.hpp"
-#include "infinicore/ops.hpp"
-
-namespace infinilm::models::llama_legacy {
-
-using layers::linear::to_legacy_quant;
-using layers::linear::to_legacy_quant_scheme;
-/**
- * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
- *
- * ⚠️ DEVELOPMENT POLICY:
- *   - NO new development or feature additions permitted on this interface
- *   - Only critical bug fixes (security/stability) allowed until removal
- *   - All new code MUST migrate to the polymorphic overload below
- *
- * Replacement: Use the polymorphic overload of this same function name with updated signature
- * Reason: Legacy signature lacks support for dynamic quantization modes.
- * Removal target: v0.2.0 (Q2 2026)
- */
-LlamaMLP::LlamaMLP(const LlamaConfig &config,
-                   const infinicore::Device &device,
-                   engine::distributed::RankInfo rank_info)
-    : hidden_size_(config.hidden_size),
-      intermediate_size_(config.intermediate_size),
-      use_bias_(config.mlp_bias), rank_info_(rank_info) {
-    const auto &dtype{config.dtype};
-
-    int tp_rank = rank_info.tp_rank;
-    int tp_size = rank_info.tp_size;
-
-    // Initialize projection layers
-    INFINILM_LEGACY_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_,
-                                 dtype, device, rank_info_);
-    INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_,
-                              dtype, device, tp_rank, tp_size, rank_info.comm);
-}
-
-LlamaMLP::LlamaMLP(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-                   const infinicore::Device &device,
-                   engine::distributed::RankInfo rank_info)
-    : model_config_(model_config), hidden_size_(model_config->get<size_t>("hidden_size")),
-      intermediate_size_(model_config->get<size_t>("intermediate_size")),
-      use_bias_(model_config->get_or<bool>("mlp_bias", false)), rank_info_(rank_info) {
-
-    const auto &dtype{model_config_->get_dtype()};
-
-    int tp_rank = rank_info.tp_rank;
-    int tp_size = rank_info.tp_size;
-
-    // Initialize projection layers
-    auto quant_scheme = to_legacy_quant_scheme(this->model_config_->get_quant_scheme());
-    switch (quant_scheme) {
-    case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8:
-        INFINILM_LEGACY_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                          dtype, device, rank_info_);
-        INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                  dtype, device, tp_rank, tp_size, rank_info.comm);
-        break;
-    case infinicore::quantization::QuantScheme::AWQ_W4A16:
-        INFINILM_LEGACY_GATE_UP_LINEAR_W4A16AWQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                              dtype, device, rank_info_);
-        INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                  dtype, device, tp_rank, tp_size, rank_info.comm);
-        break;
-    case infinicore::quantization::QuantScheme::GPTQ_W4A16_QY:
-        INFINILM_LEGACY_GATE_UP_LINEAR_W4A16GPTQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                               dtype, device, rank_info_);
-        INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                  dtype, device, tp_rank, tp_size, rank_info.comm);
-        break;
-    default:
-        INFINILM_LEGACY_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                     dtype, device, rank_info_);
-        INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, to_legacy_quant(this->model_config_->get_quantization_method()), use_bias_,
-                                  dtype, device, tp_rank, tp_size, rank_info.comm);
-        break;
-    }
-}
-
-infinicore::Tensor LlamaMLP::forward(const infinicore::Tensor &hidden_states) const {
-    // 1. Project to gate and up
-    auto hidden_states_mutable = hidden_states;
-    auto [gate, up] = gate_up_proj_->forward_split(hidden_states_mutable);
-
-    // 2. Apply SwiGLU: silu(gate) * up
-    // Note: swiglu kernel expects (up, gate) and computes gate * sigmoid(gate) * up
-    // So we pass (up, gate) to get the correct result: gate * sigmoid(gate) * up
-    auto intermediate = infinicore::op::swiglu(up, gate);
-
-    // 3. Project down
-    auto output = down_proj_->forward(intermediate);
-
-    return output;
-}
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_mlp.hpp b/csrc/models/llama_legacy/llama_mlp.hpp
deleted file mode 100644
index d89518cfe..000000000
--- a/csrc/models/llama_legacy/llama_mlp.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-#pragma once
-
-#include "./legacy_fused_linear.hpp"
-#include "llama_config.hpp"
-
-#include "../../config/model_config.hpp"
-#include "infinicore/device.hpp"
-#include "infinicore/nn/linear.hpp"
-#include "infinicore/nn/module.hpp"
-#include "infinicore/tensor.hpp"
-#include "llama_config.hpp"
-
-#include "../../engine/distributed/distributed.hpp"
-
-namespace infinilm::models::llama_legacy {
-
-/**
- * @brief MLP (Feed-Forward Network) module for Llama
- *
- * Implements the MLP block with:
- * - Gate projection
- * - Up projection
- * - Down projection
- * - SiLU activation function
- *
- * Formula: down_proj(SiLU(gate_proj(x)) * up_proj(x))
- */
-class LlamaMLP : public infinicore::nn::Module {
-public:
-    /**
-     * @brief Construct LlamaMLP module
-     *
-     * @param config Model configuration
-     * @param device Device to create tensors on
-     * @param dtype Optional data type for model parameters (defaults to F32)
-     */
-    /**
-     * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
-     *
-     * ⚠️ DEVELOPMENT POLICY:
-     *   - NO new development or feature additions permitted on this interface
-     *   - Only critical bug fixes (security/stability) allowed until removal
-     *   - All new code MUST migrate to the polymorphic overload below
-     *
-     * Replacement: Use the polymorphic overload of this same function name with updated signature
-     * Reason: Legacy signature lacks support for dynamic quantization modes.
-     * Removal target: v0.2.0 (Q2 2026)
-     */
-    LlamaMLP(const LlamaConfig &config,
-             const infinicore::Device &device,
-             engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    LlamaMLP(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-             const infinicore::Device &device,
-             engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
-
-    /**
-     * @brief Forward pass: compute MLP output
-     *
-     * @param hidden_states Input tensor of shape [batch, seq_len, hidden_size]
-     * @return Output tensor of shape [batch, seq_len, hidden_size]
-     */
-    infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const;
-
-    // Module information
-    size_t hidden_size() const { return hidden_size_; }
-    size_t intermediate_size() const { return intermediate_size_; }
-
-protected:
-    INFINICORE_NN_MODULE(layers::linear::LegacyGateUpParallelLinear, gate_up_proj);
-    INFINICORE_NN_MODULE(infinicore::nn::RowParallelLinear, down_proj);
-
-    engine::distributed::RankInfo rank_info_;
-    size_t hidden_size_;
-    size_t intermediate_size_;
-    bool use_bias_;
-
-    std::shared_ptr<infinilm::config::ModelConfig> model_config_;
-};
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_model.cpp b/csrc/models/llama_legacy/llama_model.cpp
deleted file mode 100644
index b7ffea2cf..000000000
--- a/csrc/models/llama_legacy/llama_model.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-#include "llama_model.hpp"
-#include "../../layers/rotary_embedding/rotary_embedding_factory.hpp"
-#include "infinicore/nn/embedding.hpp"
-#include "infinicore/nn/rmsnorm.hpp"
-#include "infinicore/nn/rope.hpp"
-#include "infinicore/ops.hpp"
-#include <iostream>
-
-namespace infinilm::models::llama_legacy {
-
-LlamaModel::LlamaModel(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-                       const infinicore::Device &device,
-                       engine::distributed::RankInfo rank_info,
-                       backends::AttentionBackend attention_backend)
-    : model_config_(model_config), rank_info_(rank_info) {
-    const auto &dtype{model_config_->get_dtype()};
-    INFINICORE_NN_MODULE_INIT(embed_tokens, model_config_->get<size_t>("vocab_size"), model_config_->get<size_t>("hidden_size"),
-                              std::nullopt, dtype, device);
-    layers_.reserve(model_config_->get<size_t>("num_hidden_layers"));
-    for (size_t i = 0; i < model_config_->get<size_t>("num_hidden_layers"); ++i) {
-        layers_.push_back(this->register_module<LlamaDecoderLayer>(
-            "layers." + std::to_string(i), model_config_, device, i, rank_info, attention_backend));
-    }
-    INFINICORE_NN_MODULE_INIT(norm, model_config_->get<size_t>("hidden_size"), model_config_->get<double>("rms_norm_eps"),
-                              dtype, device);
-    auto rope_scaling_config = infinilm::layers::rotary_embedding::make_scaling_config(model_config_);
-    INFINICORE_NN_MODULE_INIT(rotary_emb, model_config_->get_head_dim(), model_config->get_rotary_dim(), model_config_->get<size_t>("max_position_embeddings"),
-                              model_config_->get<double>("rope_theta"), infinicore::nn::RoPE::Algo::GPT_NEOX,
-                              dtype, device, rope_scaling_config);
-
-    for (auto &layer : layers_) {
-        if (layer) {
-            layer->set_rotary_emb(rotary_emb_);
-        }
-    }
-}
-
-infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
-                                       const infinicore::Tensor &position_ids,
-                                       std::optional<infinicore::Tensor> past_sequence_lengths,
-                                       std::optional<infinicore::Tensor> total_sequence_lengths,
-                                       std::optional<infinicore::Tensor> input_offsets,
-                                       std::optional<infinicore::Tensor> cu_seqlens,
-                                       std::optional<infinicore::Tensor> block_tables,
-                                       std::optional<infinicore::Tensor> slot_mapping) const {
-    // 1. Embed tokens: input_ids -> [batch, seq_len, hidden_size]
-    auto hidden_states = embed_tokens_->forward(input_ids);
-
-    // 2. Process through all decoder layers
-    size_t num_layers = layers_.size();
-    infinicore::Tensor residual;
-    for (size_t i = 0; i < num_layers; ++i) {
-        layers_.at(i)->forward(
-            hidden_states,
-            residual,
-            position_ids,
-            kv_cache_,
-            past_sequence_lengths,
-            total_sequence_lengths,
-            input_offsets,
-            cu_seqlens,
-            block_tables,
-            slot_mapping);
-    }
-
-    norm_->forward_inplace(hidden_states, residual);
-
-    return hidden_states;
-}
-
-infinicore::Tensor LlamaModel::forward_embeds(const infinicore::Tensor &inputs_embeds,
-                                              const infinicore::Tensor &position_ids,
-                                              std::optional<infinicore::Tensor> past_sequence_lengths,
-                                              std::optional<infinicore::Tensor> total_sequence_lengths,
-                                              std::optional<infinicore::Tensor> input_offsets,
-                                              std::optional<infinicore::Tensor> cu_seqlens,
-                                              std::optional<infinicore::Tensor> block_tables,
-                                              std::optional<infinicore::Tensor> slot_mapping) const {
-    auto hidden_states = inputs_embeds;
-    size_t num_layers = layers_.size();
-    infinicore::Tensor residual;
-    for (size_t i = 0; i < num_layers; ++i) {
-        layers_.at(i)->forward(hidden_states, residual, position_ids, kv_cache_, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping);
-    }
-    norm_->forward_inplace(hidden_states, residual);
-
-    return hidden_states;
-}
-
-infinicore::Tensor LlamaModel::embed_tokens(const infinicore::Tensor &input_ids) const {
-    return embed_tokens_->forward(input_ids);
-}
-
-void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) {
-    if (cache_config == nullptr) {
-        kv_cache_ = nullptr;
-        return;
-    }
-    if (auto kv_cache_config = dynamic_cast<const cache::StaticKVCacheConfig *>(cache_config)) {
-        kv_cache_ = std::make_shared<cache::StaticKVCache>(
-            model_config_->get_head_dim(),
-            model_config_->get_head_dim(),
-            model_config_->get<size_t>("num_key_value_heads"),
-            model_config_->get<size_t>("num_key_value_heads"),
-            model_config_->get<size_t>("num_hidden_layers"),
-            model_config_->get<size_t>("max_position_embeddings"),
-            model_config_->get_kv_cache_dtype(),
-            *kv_cache_config,
-            rank_info_);
-    } else if (auto paged_kv_cache_config = dynamic_cast<const cache::PagedKVCacheConfig *>(cache_config)) {
-        kv_cache_ = std::make_shared<cache::PagedKVCache>(
-            model_config_->get_head_dim(),
-            model_config_->get_head_dim(),
-            model_config_->get<size_t>("num_key_value_heads"),
-            model_config_->get<size_t>("num_key_value_heads"),
-            model_config_->get<size_t>("num_hidden_layers"),
-            model_config_->get_kv_cache_dtype(),
-            *paged_kv_cache_config,
-            rank_info_);
-    } else {
-        throw std::runtime_error("Unsupported cache type");
-    }
-}
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/llama_legacy/llama_model.hpp b/csrc/models/llama_legacy/llama_model.hpp
deleted file mode 100644
index d08c51e4d..000000000
--- a/csrc/models/llama_legacy/llama_model.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-#pragma once
-
-#include "../../cache/kv_cache.hpp"
-#include "llama_decoder_layer.hpp"
-
-#include "infinicore/nn/embedding.hpp"
-#include "infinicore/nn/module.hpp"
-#include "infinicore/nn/rmsnorm.hpp"
-#include "infinicore/nn/rope.hpp"
-#include "infinicore/tensor.hpp"
-#include "llama_config.hpp"
-#include "llama_decoder_layer.hpp"
-#include <memory>
-#include <vector>
-
-#include "../../engine/distributed/distributed.hpp"
-
-namespace infinilm::models::llama_legacy {
-
-/**
- * @brief Main Llama model architecture (without language modeling head)
- *
- * This is the core transformer model consisting of:
- * - Token embeddings (embed_tokens)
- * - Multiple decoder layers (layers)
- * - Final layer normalization (norm)
- * - Rotary Position Embeddings (rotary_emb)
- *
- * This matches the structure of HuggingFace's LlamaModel.
- */
-class LlamaModel : public infinicore::nn::Module {
-public:
-    /**
-     * @brief Construct LlamaModel module
-     *
-     * @param config Model configuration
-     * @param device Device to create tensors on
-     * @param dtype Optional data type for model parameters (defaults to F32)
-     */
-    LlamaModel(std::shared_ptr<infinilm::config::ModelConfig> model_config,
-               const infinicore::Device &device,
-               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
-               backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
-
-    /**
-     * @brief Forward pass: process input through the model
-     *
-     * @param input_ids Token IDs tensor of shape [batch, seq_len]. Batch is 1 when continuous batch is used,
-     *                 and tokens from all requests are concatenated along seq_len dimension.
-     * @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
-     * @param past_sequence_lengths Cache positions tensor of shape [n_req]
-     * @param total_sequence_lengths Total sequence lengths tensor of shape [n_req]
-     * @param input_offsets Input offsets (starting position) of each request in a continuous batch of shape [n_req + 1]
-     * @return Output tensor of shape [batch, seq_len, hidden_size]
-     */
-    infinicore::Tensor forward(const infinicore::Tensor &input_ids,
-                               const infinicore::Tensor &position_ids,
-                               std::optional<infinicore::Tensor> past_sequence_lengths,
-                               std::optional<infinicore::Tensor> total_sequence_lengths,
-                               std::optional<infinicore::Tensor> input_offsets,
-                               std::optional<infinicore::Tensor> cu_seqlens,
-                               std::optional<infinicore::Tensor> block_tables,
-                               std::optional<infinicore::Tensor> slot_mapping) const;
-
-    infinicore::Tensor forward_embeds(const infinicore::Tensor &inputs_embeds,
-                                      const infinicore::Tensor &position_ids,
-                                      std::optional<infinicore::Tensor> past_sequence_lengths,
-                                      std::optional<infinicore::Tensor> total_sequence_lengths,
-                                      std::optional<infinicore::Tensor> input_offsets,
-                                      std::optional<infinicore::Tensor> cu_seqlens,
-                                      std::optional<infinicore::Tensor> block_tables,
-                                      std::optional<infinicore::Tensor> slot_mapping) const;
-
-    infinicore::Tensor embed_tokens(const infinicore::Tensor &input_ids) const;
-
-    void reset_cache(const cache::CacheConfig *cache_config);
-
-    // Module information
-    size_t num_layers() const { return model_config_->get<size_t>("num_hidden_layers"); }
-
-protected:
-    INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens);
-
-    INFINICORE_NN_MODULE_VEC(LlamaDecoderLayer, layers);
-
-    INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm);
-
-    INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb);
-
-    engine::distributed::RankInfo rank_info_;
-
-    std::shared_ptr<cache::Cache> kv_cache_;
-
-private:
-    std::shared_ptr<infinilm::config::ModelConfig> model_config_;
-};
-
-} // namespace infinilm::models::llama_legacy
diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp
index f11948672..0a54257e3 100644
--- a/csrc/models/model_factory.cpp
+++ b/csrc/models/model_factory.cpp
@@ -1,29 +1,8 @@
 #include "model_factory.hpp"
-#include "llama_legacy/llama_for_causal_lm.hpp"
 #include "models_registry.hpp"
 
 namespace infinilm {
 
-std::shared_ptr<InfinilmModel> InfinilmModelFactory::createModel(
-    std::shared_ptr<infinilm::config::ModelConfig> model_config,
-    engine::distributed::RankInfo rank_info,
-    const cache::CacheConfig *cache,
-    backends::AttentionBackend attention_backend) {
-    std::shared_ptr<InfinilmModel> model;
-    if (true) {
-        model = std::make_shared<models::llama_legacy::LlamaForCausalLM>(
-            model_config, rank_info.device, rank_info, attention_backend);
-    } else {
-        throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type");
-    }
-
-    if (cache) {
-        model->reset_cache(cache);
-    }
-
-    return model;
-}
-
 std::shared_ptr<InfinilmModel> InfinilmModelFactory::createModel(
     std::shared_ptr<infinilm::config::ModelConfig> model_config,
     const infinicore::Device &device,
diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp
index b3c476f11..87108f93f 100644
--- a/csrc/models/model_factory.hpp
+++ b/csrc/models/model_factory.hpp
@@ -1,19 +1,11 @@
 #pragma once
 
-#include "../backends/attention_backends.hpp"
-#include "../engine/distributed/distributed.hpp"
 #include "infinilm_model.hpp"
 
 namespace infinilm {
 
 class InfinilmModelFactory {
 public:
-    static std::shared_ptr<InfinilmModel> createModel(
-        std::shared_ptr<infinilm::config::ModelConfig> model_config,
-        engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
-        const cache::CacheConfig *cache = nullptr,
-        backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
-
     static std::shared_ptr<InfinilmModel> createModel(
         std::shared_ptr<infinilm::config::ModelConfig> model_config,
         const infinicore::Device &device,
diff --git a/csrc/models/qwen2/qwen2_for_causal_lm.cpp b/csrc/models/qwen2/qwen2_for_causal_lm.cpp
index 8be5294cc..2191197ef 100644
--- a/csrc/models/qwen2/qwen2_for_causal_lm.cpp
+++ b/csrc/models/qwen2/qwen2_for_causal_lm.cpp
@@ -14,7 +14,7 @@ std::shared_ptr<infinilm::config::ModelConfig> create_qwen2_model_config(std::sh
 
     if (!config_json.contains("head_dim")) {
         size_t head_dim = model_config->get<size_t>("hidden_size")
-            / model_config->get<size_t>("num_attention_heads");
+                        / model_config->get<size_t>("num_attention_heads");
         config_json["head_dim"] = head_dim;
     }
 
@@ -25,13 +25,9 @@ std::shared_ptr<infinilm::config::ModelConfig> create_qwen2_model_config(std::sh
 
 namespace {
 
-#ifndef USE_CLASSIC_LLAMA
-
 INFINILM_REGISTER_CAUSAL_LM_MODEL(
     qwen2,
     infinilm::models::qwen2::Qwen2ForCausalLM,
     infinilm::models::qwen2::create_qwen2_model_config);
 
-#endif
-
 } // namespace
diff --git a/csrc/models/qwen3_moe/qwen3_moe_experts.hpp b/csrc/models/qwen3_moe/qwen3_moe_experts.hpp
index 50210b2ae..90f5739bf 100644
--- a/csrc/models/qwen3_moe/qwen3_moe_experts.hpp
+++ b/csrc/models/qwen3_moe/qwen3_moe_experts.hpp
@@ -1,5 +1,3 @@
-
-
 #pragma once
 #include "../../layers/common_modules.hpp"
 
diff --git a/csrc/models/qwen3_moe/qwen3_moe_topk_router.hpp b/csrc/models/qwen3_moe/qwen3_moe_topk_router.hpp
index 60b30cfb8..dadadb3ff 100644
--- a/csrc/models/qwen3_moe/qwen3_moe_topk_router.hpp
+++ b/csrc/models/qwen3_moe/qwen3_moe_topk_router.hpp
@@ -1,5 +1,3 @@
-
-
 #pragma once
 #include "../../layers/common_modules.hpp"
 
diff --git a/csrc/pybind11/bindings.cc b/csrc/pybind11/bindings.cc
index 0be1c04b5..63846338b 100644
--- a/csrc/pybind11/bindings.cc
+++ b/csrc/pybind11/bindings.cc
@@ -2,15 +2,14 @@
 
 #include "cache/cache.hpp"
 #include "engine/engine.hpp"
-#include "models/llama_legacy.hpp"
 
 namespace py = pybind11;
 
 PYBIND11_MODULE(_infinilm, m) {
-    m.doc() = "InfiniLM Llama model Python bindings";
+    m.doc() = "InfiniLM Python bindings";
 
     infinilm::cache::bind_cache(m);
-    infinilm::models::llama_legacy::bind_llama(m);
+    infinilm::engine::bind_hook_registry(m);
     infinilm::engine::distributed::bind_dist_config(m);
     infinilm::engine::bind_infer_engine(m);
 }
diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp
index 8e470984e..6022f25ec 100644
--- a/csrc/pybind11/engine/engine.hpp
+++ b/csrc/pybind11/engine/engine.hpp
@@ -1,3 +1,4 @@
+#include "../../debug_utils/hooks.hpp"
 #include "../../engine/infer_engine.hpp"
 #include "infinicore/tensor.hpp"
 #include <pybind11/pybind11.h>
@@ -28,12 +29,37 @@ inline void bind_dist_config(py::module &m) {
 
 namespace infinilm::engine {
 
+// Binds infinilm::models::debug_utils::HookRegistry to Python as "HookRegistry".
+// TODO: move HookRegistry into InfiniCore as a shared utility in future work.
+inline void bind_hook_registry(py::module &m) {
+    using Registry = infinilm::models::debug_utils::HookRegistry;
+
+    // Wrap the Python callable in a C++ lambda that forwards (hook_name, tensor,
+    // layer_idx) to it. No try/catch is needed: a py::error_already_set raised by
+    // the callable propagates to whoever invoked the hook.
+    // NOTE(review): if hooks can fire on threads that do not hold the GIL, the call
+    // below needs py::gil_scoped_acquire; confirm against the engine's threading.
+    auto register_py_hook = [](Registry &self, const std::string &name, py::object callback) {
+        self.register_hook(
+            name,
+            [callback](const std::string &hook_name, const infinicore::Tensor &tensor, int layer_idx) {
+                callback(hook_name, tensor, layer_idx);
+            });
+    };
+
+    py::class_<Registry, std::shared_ptr<Registry>> registry(m, "HookRegistry");
+    registry.def(py::init<>());
+    registry.def("register_hook", register_py_hook, py::arg("name"), py::arg("callback"));
+    registry.def("clear", &Registry::clear);
+    registry.def("has_hooks", &Registry::has_hooks);
+}
+
 inline void bind_infer_engine(py::module &m) {
     py::class_<InferEngine, std::shared_ptr<InferEngine>> infer_engine(m, "InferEngine");
 
     infer_engine
         .def(py::init([](
-                          const std::string &model_path,
+                          const std::string &config_str,
                           const distributed::DistConfig &dist,
                           infinicore::Device::Type dev,
                           std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
@@ -41,7 +67,7 @@ inline void bind_infer_engine(py::module &m) {
                           const std::string &attention_backend,
                           std::optional<infinicore::DataType> kv_cache_dtype) {
                  return std::make_shared<InferEngine>(
-                     model_path,
+                     config_str,
                      dist,
                      dev,
                      cache_cfg ? cache_cfg.get() : nullptr,
@@ -49,7 +75,7 @@ inline void bind_infer_engine(py::module &m) {
                      infinilm::backends::parse_attention_backend(attention_backend),
                      kv_cache_dtype);
              }),
-             py::arg("model_path") = "",
+             py::arg("config_str") = "",
              py::arg("distributed_config") = distributed::DistConfig(),
              py::arg("device_type") = infinicore::context::getDevice().getType(),
              py::arg("cache_config") = py::none(),
diff --git a/csrc/pybind11/models/llama_legacy.hpp b/csrc/pybind11/models/llama_legacy.hpp
deleted file mode 100644
index 67c8e74f1..000000000
--- a/csrc/pybind11/models/llama_legacy.hpp
+++ /dev/null
@@ -1,216 +0,0 @@
-#pragma once
-
-#include "../../cache/kv_cache.hpp"
-#include "../../models/debug_utils/hooks.hpp"
-#include "../../models/llama_legacy/llama.hpp"
-#include "../../models/llama_legacy/llama_attention.hpp"
-#include "infinicore/device.hpp"
-#include "infinicore/nn/module.hpp"
-#include "infinicore/nn/rope.hpp"
-#include "infinicore/tensor.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-namespace py = pybind11;
-using infinicore::Device;
-using infinilm::models::debug_utils::HookRegistry;
-
-namespace infinilm::models::llama_legacy {
-
-inline void bind_llama(py::module &m) {
-    // TODO: HookRegistry should be moved out from Llama-specific bindings to InfiniCore as common utils in future work
-    // Bind HookRegistry
-    py::class_<HookRegistry, std::shared_ptr<HookRegistry>>(m, "HookRegistry")
-        .def(py::init<>())
-        .def(
-            "register_hook", [](HookRegistry &self, const std::string &name, py::object callback) {
-                // Convert Python callable to C++ function
-                self.register_hook(name, [callback](const std::string &hook_name, const infinicore::Tensor &tensor, int layer_idx) {
-                    try {
-                        // Call Python callback with hook name, tensor, and layer index
-                        callback(hook_name, tensor, layer_idx);
-                    } catch (const py::error_already_set &e) {
-                        // Re-raise Python exception
-                        throw;
-                    }
-                });
-            },
-            py::arg("name"), py::arg("callback"))
-        .def("clear", &HookRegistry::clear)
-        .def("has_hooks", &HookRegistry::has_hooks);
-
-    py::class_<InfinilmModel::Config> config(m, "Config");
-
-    // Bind LlamaConfig
-    py::class_<LlamaConfig, InfinilmModel::Config> llama_config(m, "LlamaConfig");
-    llama_config
-        .def(py::init<>())
-        // TODO: Change this to `dtype` after updating InfiniCore pybind11 exposing mechanism.
-        .def_readwrite("_dtype", &LlamaConfig::dtype)
-        .def_readwrite("vocab_size", &LlamaConfig::vocab_size)
-        .def_readwrite("hidden_size", &LlamaConfig::hidden_size)
-        .def_readwrite("intermediate_size", &LlamaConfig::intermediate_size)
-        .def_readwrite("num_hidden_layers", &LlamaConfig::num_hidden_layers)
-        .def_readwrite("num_attention_heads", &LlamaConfig::num_attention_heads)
-        .def_readwrite("num_key_value_heads", &LlamaConfig::num_key_value_heads)
-        .def_readwrite("head_dim", &LlamaConfig::head_dim)
-        .def_readwrite("max_position_embeddings", &LlamaConfig::max_position_embeddings)
-        .def_readwrite("rms_norm_eps", &LlamaConfig::rms_norm_eps)
-        .def_readwrite("hidden_act", &LlamaConfig::hidden_act)
-        .def_readwrite("model_type", &LlamaConfig::model_type)
-        .def_readwrite("rope_theta", &LlamaConfig::rope_theta)
-        .def_readwrite("attention_bias", &LlamaConfig::attention_bias)
-        .def_readwrite("attention_output_bias", &LlamaConfig::attention_output_bias)
-        .def_readwrite("mlp_bias", &LlamaConfig::mlp_bias)
-        .def_readwrite("tie_word_embeddings", &LlamaConfig::tie_word_embeddings)
-        .def_readwrite("qk_norm", &LlamaConfig::qk_norm)
-        .def_readwrite("use_cache", &LlamaConfig::use_cache)
-        .def_readwrite("attention_dropout", &LlamaConfig::attention_dropout)
-        .def_readwrite("initializer_range", &LlamaConfig::initializer_range)
-        .def_readwrite("pretraining_tp", &LlamaConfig::pretraining_tp)
-        .def_readwrite("name_or_path", &LlamaConfig::name_or_path)
-        .def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
-        .def_property(
-            "bos_token_id", [](const LlamaConfig &self) {
-                // Always return as list to match Python config format
-                return py::cast(self.bos_token_id); }, [](LlamaConfig &self, py::object value) {
-                // Accept both single int and list
-                if (py::isinstance<py::int_>(value)) {
-                    self.bos_token_id = {value.cast<int64_t>()};
-                } else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
-                    self.bos_token_id = value.cast<std::vector<int64_t>>();
-                } else {
-                    throw py::type_error("bos_token_id must be int or list of ints");
-                } })
-        .def_property(
-            "eos_token_id", [](const LlamaConfig &self) {
-                // Always return as list to match Python config format
-                return py::cast(self.eos_token_id); }, [](LlamaConfig &self, py::object value) {
-                // Accept both single int and list
-                if (py::isinstance<py::int_>(value)) {
-                    self.eos_token_id = {value.cast<int64_t>()};
-                } else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
-                    self.eos_token_id = value.cast<std::vector<int64_t>>();
-                } else {
-                    throw py::type_error("eos_token_id must be int or list of ints");
-                } })
-        .def_property(
-            "rope_scaling",
-
-            // ---------- getter ----------
-            [](const LlamaConfig &self) -> py::object {
-                if (!self.rope_scaling) {
-                    return py::none();
-                }
-
-                using ScalingConfig = infinicore::nn::RopeScalingConfig;
-                using LongRopeConfig = infinicore::nn::LongRopeScalingConfig;
-
-                py::dict d;
-
-                if (auto *lr = dynamic_cast<const LongRopeConfig *>(self.rope_scaling.get())) {
-                    d["type"] = "longrope";
-                    d["rope_type"] = "longrope";
-                    d["factor"] = lr->factor();
-                    d["original_max_position_embeddings"] = lr->original_max_position_embeddings();
-                    d["short_factor"] = lr->short_factor();
-                    d["long_factor"] = lr->long_factor();
-                } else {
-                    throw std::runtime_error("Unknown RoPE scaling type");
-                }
-
-                return std::move(d);
-            },
-
-            // ---------- setter ----------
-            [](LlamaConfig &self, py::object value) {
-                if (value.is_none()) {
-                    self.rope_scaling.reset();
-                    return;
-                }
-
-                if (!py::isinstance<py::dict>(value)) {
-                    throw py::type_error("rope_scaling must be a dict or None");
-                }
-
-                py::dict d = value.cast<py::dict>();
-
-                auto get_str = [&](const char *k) {
-                    if (!d.contains(k)) {
-                        throw py::key_error(k);
-                    }
-                    return py::cast<std::string>(d[k]);
-                };
-
-                std::string type = d.contains("rope_type")
-                                     ? py::cast<std::string>(d["rope_type"])
-                                     : get_str("type");
-
-                if (type == "longrope") {
-                    using LongRopeConfig = infinicore::nn::LongRopeScalingConfig;
-
-                    if (!d.contains("short_factor") || !d.contains("long_factor") || !d.contains("original_max_position_embeddings")) {
-                        throw py::value_error(
-                            "longrope requires short_factor, long_factor, "
-                            "original_max_position_embeddings");
-                    }
-
-                    std::vector<float> short_factor = py::cast<std::vector<float>>(d["short_factor"]);
-                    std::vector<float> long_factor = py::cast<std::vector<float>>(d["long_factor"]);
-
-                    size_t original_max_position_embeddings = py::cast<size_t>(d["original_max_position_embeddings"]);
-
-                    float factor = 1.0f;
-                    if (d.contains("factor")) {
-                        factor = py::cast<float>(d["factor"]);
-                    }
-
-                    self.rope_scaling = std::make_shared<LongRopeConfig>(
-                        std::move(short_factor),
-                        std::move(long_factor),
-                        original_max_position_embeddings,
-                        factor);
-                } else {
-                    throw py::value_error("Unsupported rope_scaling type: " + type);
-                }
-            })
-        .def("validate", &LlamaConfig::validate)
-        .def("kv_dim", &LlamaConfig::kv_dim)
-        // Add __dir__ to make attributes discoverable via dir() in Python
-        .def("__dir__", [](const LlamaConfig &self) {
-            py::list dir_list;
-            dir_list.append("vocab_size");
-            dir_list.append("hidden_size");
-            dir_list.append("intermediate_size");
-            dir_list.append("num_hidden_layers");
-            dir_list.append("num_attention_heads");
-            dir_list.append("num_key_value_heads");
-            dir_list.append("head_dim");
-            dir_list.append("max_position_embeddings");
-            dir_list.append("rms_norm_eps");
-            dir_list.append("hidden_act");
-            dir_list.append("model_type");
-            dir_list.append("rope_theta");
-            dir_list.append("rope_scaling");
-            dir_list.append("attention_bias");
-            dir_list.append("attention_output_bias");
-            dir_list.append("mlp_bias");
-            dir_list.append("tie_word_embeddings");
-            dir_list.append("qk_norm");
-            dir_list.append("use_cache");
-            dir_list.append("attention_dropout");
-            dir_list.append("initializer_range");
-            dir_list.append("pretraining_tp");
-            dir_list.append("name_or_path");
-            dir_list.append("pad_token_id");
-            dir_list.append("bos_token_id");
-            dir_list.append("eos_token_id");
-            dir_list.append("validate");
-            dir_list.append("kv_dim");
-            return dir_list; });
-
-    // Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here
-}
-
-} // namespace infinilm::models::llama_legacy
diff --git a/python/infinilm/models/llama/configuration_llama.py b/python/infinilm/models/llama/configuration_llama.py
index 15776c848..b41440d07 100644
--- a/python/infinilm/models/llama/configuration_llama.py
+++ b/python/infinilm/models/llama/configuration_llama.py
@@ -17,12 +17,10 @@
 
 import infinicore
 
-from infinilm.lib import _infinilm
-
 from ...configuration_utils import PretrainedConfig
 
 
-class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig):
+class LlamaConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -140,6 +138,7 @@ class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig):
     ```"""
 
     model_type = "llama"
+
     keys_to_ignore_at_inference = ["past_key_values"]
     # Default tensor parallel plan for base model `LlamaModel`
     base_model_tp_plan = {
@@ -184,15 +183,7 @@ def __init__(
         torch_dtype=None,
         **kwargs,
     ):
-        _infinilm.LlamaConfig.__init__(self)
-
-        original_model_type = kwargs.get("model_type", None)
-        if original_model_type == "qwen3":
-            self.qk_norm = True
-
-        # ---
         self.model_type = "llama"
-        self.name_or_path = ""
 
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
diff --git a/xmake.lua b/xmake.lua
index 2b1b51d37..d9863a523 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -18,15 +18,6 @@ if has_config("use-kv-caching") then
     add_defines("ENABLE_KV_CACHING")
 end
 
-option("use-classic-llama")
-    set_default(false)
-    set_showmenu(true)
-    set_description("Whether to using the classic LlamaForCausalLM")
-option_end()
-
-if has_config("use-classic-llama") then
-    add_defines("USE_CLASSIC_LLAMA")
-end
 
 target("infinicore_infer")
     set_kind("shared")