InfiniTensor · chen2021673 · Jun 8, 2026 · kilinchange · May 23, 2026 · chen2021673
diff --git a/.clang-format b/.clang-format
@@ -8,6 +8,35 @@ ColumnLimit: 120
 AllowShortBlocksOnASingleLine: Always 
 AllowShortLoopsOnASingleLine: true
 InsertBraces: true
+SortIncludes: CaseSensitive
+IncludeBlocks: Regroup
+IncludeCategories:
+  # The main (paired) header of a source file gets priority 0 from clang-format.
+  # Pairing is based on the file basename only, so a same-named header from
+  # another module may be treated as paired, for example kernels/cpu/linear.cc
+  # and include/autograd/linear.h. clang-format only reorders existing includes;
+  # it never adds new ones.
+  # C and POSIX system headers (anything not listed falls through to priority 3).
+  - Regex: ^<(assert|complex|ctype|dirent|dlfcn|errno|fenv|fcntl|float|inttypes|iso646|limits|locale|math|pthread|semaphore|setjmp|signal|stdalign|stdarg|stdatomic|stdbool|stddef|stdint|stdio|stdlib|stdnoreturn|string|sys/.*|tgmath|threads|time|uchar|unistd|wchar|wctype)\.h>$
+    Priority: 1
+  # C++ standard library headers, up to and including C++23.
+  - Regex: ^<(algorithm|any|array|atomic|barrier|bit|bitset|cassert|ccomplex|cctype|cerrno|cfenv|cfloat|charconv|chrono|cinttypes|ciso646|climits|clocale|cmath|codecvt|compare|complex|concepts|condition_variable|coroutine|csetjmp|csignal|cstdalign|cstdarg|cstdbool|cstddef|cstdint|cstdio|cstdlib|cstring|ctgmath|ctime|cuchar|cwchar|cwctype|deque|exception|execution|expected|filesystem|flat_map|flat_set|format|forward_list|fstream|functional|future|generator|initializer_list|iomanip|ios|iosfwd|iostream|istream|iterator|latch|limits|list|locale|map|mdspan|memory|memory_resource|mutex|new|numbers|numeric|optional|ostream|print|queue|random|ranges|ratio|regex|scoped_allocator|semaphore|set|shared_mutex|source_location|span|spanstream|sstream|stack|stacktrace|stdexcept|stdfloat|stop_token|streambuf|string|string_view|strstream|syncstream|system_error|thread|tuple|type_traits|typeindex|typeinfo|unordered_map|unordered_set|utility|valarray|variant|vector|version)>$
+    Priority: 2
+  # Other external library headers, for example CUDA/NCCL/MPI.
+  - Regex: ^<.*>$
+    Priority: 3
+  # third_party headers included with quotes.
+  - Regex: ^"(third_party/|Eigen/|gflags/|glog/|gtest/)
+    Priority: 4
+  # Public project interfaces.
+  - Regex: ^"infini_train/include/
+    Priority: 5
+  # Internal project implementation headers.
+  - Regex: ^"(infini_train/src/|tests/)
+    Priority: 6
+  # Other local quoted headers.
+  - Regex: ^".*"$
+    Priority: 7
 BreakBeforeBraces: Custom
 BraceWrapping:
   AfterCaseLabel: false

diff --git a/.github/workflows/format-check.yaml b/.github/workflows/format-check.yaml
@@ -16,12 +16,43 @@ jobs:
     - name: Checkout code
       uses: actions/checkout@v4
 
+    - name: Install system dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y clang-format-16 include-what-you-use
+
     - name: Install Python dependencies
       run: |
         python3 -m pip install --upgrade pip
-        pip install black
+        pip install black colorama
 
     - name: Run format check
       run: |
         python3 scripts/format.py --path infini_train example --check
 
+    - name: Run custom style check
+      run: |
+        python3 scripts/style_check.py --path infini_train example
+
+    - name: Configure compile database for IWYU
+      # Keep IWYU advisory (non-blocking) until the existing codebase is fully cleaned up.
+      continue-on-error: true
+      run: |
+        cmake -S . -B build-iwyu -DUSE_CUDA=OFF -DUSE_MACA=OFF -DUSE_MPI=OFF -DUSE_OMP=OFF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+
+    - name: Run IWYU check
+      continue-on-error: true
+      run: |
+        # The driver may be installed as iwyu_tool.py or iwyu_tool; fail with a clear message if neither exists.
+        IWYU_TOOL="$(command -v iwyu_tool.py || command -v iwyu_tool || true)"
+        if [ -z "${IWYU_TOOL}" ]; then echo "::error::iwyu_tool not found on PATH"; exit 1; fi
+        # build-iwyu is configured with USE_CUDA=OFF and USE_MACA=OFF, so the CUDA/MACA sources are excluded.
+        mapfile -t IWYU_SOURCES < <(
+          find infini_train example -type f \( -name '*.c' -o -name '*.cc' -o -name '*.cpp' -o -name '*.cxx' \) \
+            ! -path 'infini_train/src/core/ccl/cuda/*' \
+            ! -path 'infini_train/src/core/runtime/cuda/*' \
+            ! -path 'infini_train/src/core/ccl/maca/*' \
+            ! -path 'infini_train/src/core/runtime/maca/*'
+        )
+        "${IWYU_TOOL}" -p build-iwyu -j "$(nproc)" "${IWYU_SOURCES[@]}"
diff --git a/example/common/tokenizer.cc b/example/common/tokenizer.cc
@@ -9,11 +9,12 @@
 
 #include "glog/logging.h"
 
-#include "example/common/utils.h"
 #include "infini_train/include/nn/functional.h"
 #include "infini_train/include/nn/modules/module.h"
 #include "infini_train/include/tensor.h"
 
+#include "example/common/utils.h"
+
 namespace infini_train {
 
 constexpr uint32_t kGpt2Eot = 50256;

diff --git a/example/gpt2/checkpoint_loader.cc b/example/gpt2/checkpoint_loader.cc
@@ -12,8 +12,6 @@
 
 #include "glog/logging.h"
 
-#include "example/common/utils.h"
-#include "example/gpt2/config.h"
 #include "infini_train/include/nn/modules/normalization.h"
 #include "infini_train/include/nn/modules/sparse.h"
 #include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
@@ -24,6 +22,9 @@
 #include "infini_train/include/nn/parallel/tensor_parallel.h"
 #include "infini_train/include/tensor.h"
 
+#include "example/common/utils.h"
+#include "example/gpt2/config.h"
+
 using namespace infini_train;
 namespace nn = infini_train::nn;
 
@@ -101,7 +102,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
     // ========== pp_size：num_stages; vpp_size: num_chunks_per_stage ==========
     int pp_size = nn::parallel::global::GetPipelineParallelSize();
     int vpp_size = nn::parallel::global::GetVirtualPipelineParallelSize();
-    auto pp_rank = nn::parallel::pp_rank;
+    auto pp_rank = nn::parallel::tls_pp_rank;
     auto [is_first_stage, is_last_stage, layer_ranges_per_chunk]
         = nn::parallel::PipelineParallel::GetStageInfo(n_layer, pp_size, pp_rank, vpp_size);
     // ========== layer to chunk ==========
@@ -110,7 +111,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
         for (int i = start; i < end; ++i) { owned_layers[i] = true; }
     }
 
-    auto tp_rank = nn::parallel::tp_rank;
+    auto tp_rank = nn::parallel::tls_tp_rank;
     // calculate xx_size_per_partition
     const int64_t vpp = model_vocab_size / tp_size;
     const int64_t v_start = static_cast<int64_t>(tp_rank) * vpp;

diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
@@ -137,7 +137,7 @@ void Train(const nn::parallel::Rank &rank) {
 
     // Set thread-local global rank
     // TODO(dcj): Use DeviceGuardImpl to get GlobalRank later.
-    nn::parallel::global::thread_global_rank = rank.GlobalRank();
+    nn::parallel::global::tls_thread_global_rank = rank.GlobalRank();
 
     const ProcessGroup *ddp_pg = nullptr;
     const ProcessGroup *tp_pg = nullptr;
@@ -158,15 +158,14 @@ void Train(const nn::parallel::Rank &rank) {
                                             GetTensorParallelGroupRanks(rank.GlobalRank()));
             tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
             // NOTE(zbl): Reserved for VocabParallelEmbedding
-            nn::parallel::tp_rank = tp_rank;
+            nn::parallel::tls_tp_rank = tp_rank;
         }
 
         if (pp_world_size > 1) {
             pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
                                             GetPipelineParallelGroupRanks(rank.GlobalRank()));
             pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
-
-            nn::parallel::pp_rank = pp_rank;
+            nn::parallel::tls_pp_rank = pp_rank;
         }
     } else {
         device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);

diff --git a/example/llama3/checkpoint_loader.cc b/example/llama3/checkpoint_loader.cc
@@ -12,8 +12,6 @@
 
 #include "glog/logging.h"
 
-#include "example/common/utils.h"
-#include "example/llama3/config.h"
 #include "infini_train/include/nn/modules/normalization.h"
 #include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
 #include "infini_train/include/nn/modules/transformer/mlp.h"
@@ -22,6 +20,9 @@
 #include "infini_train/include/nn/parallel/tensor_parallel.h"
 #include "infini_train/include/tensor.h"
 
+#include "example/common/utils.h"
+#include "example/llama3/config.h"
+
 using namespace infini_train;
 namespace nn = infini_train::nn;
 
@@ -86,7 +87,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
     // ========== pp_size：num_stages; vpp_size: num_chunks_per_stage ==========
     int pp_size = nn::parallel::global::GetPipelineParallelSize();
     int vpp_size = nn::parallel::global::GetVirtualPipelineParallelSize();
-    auto pp_rank = nn::parallel::pp_rank;
+    auto pp_rank = nn::parallel::tls_pp_rank;
     auto [is_first_stage, is_last_stage, layer_ranges_per_chunk]
         = nn::parallel::PipelineParallel::GetStageInfo(n_layer, pp_size, pp_rank, vpp_size);
     // ========== layer to chunk ==========
@@ -96,7 +97,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
     }
 
     const int tp_size = nn::parallel::global::GetTensorParallelSize();
-    const int tp_rank = nn::parallel::tp_rank;
+    const int tp_rank = nn::parallel::tls_tp_rank;
 
     CHECK_EQ(n_embd % tp_size, 0) << "n_embd must be divisible by TP world size.";
     CHECK_EQ(n_head % tp_size, 0) << "n_head must be divisible by TP world size.";

diff --git a/example/llama3/main.cc b/example/llama3/main.cc
@@ -122,7 +122,7 @@ void Train(const nn::parallel::Rank &rank) {
     int pp_rank = 0;
 
     // Set thread-local global rank
-    nn::parallel::global::thread_global_rank = rank.GlobalRank();
+    nn::parallel::global::tls_thread_global_rank = rank.GlobalRank();
 
     const ProcessGroup *ddp_pg = nullptr;
     const ProcessGroup *tp_pg = nullptr;
@@ -143,15 +143,14 @@ void Train(const nn::parallel::Rank &rank) {
                                             GetTensorParallelGroupRanks(rank.GlobalRank()));
             tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
             // NOTE(zbl): Reserved for VocabParallelEmbedding
-            nn::parallel::tp_rank = tp_rank;
+            nn::parallel::tls_tp_rank = tp_rank;
         }
 
         if (pp_world_size > 1) {
             pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
                                             GetPipelineParallelGroupRanks(rank.GlobalRank()));
             pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
-
-            nn::parallel::pp_rank = pp_rank;
+            nn::parallel::tls_pp_rank = pp_rank;
         }
     } else {
         device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);

diff --git a/infini_train/include/autograd/grad_mode.h b/infini_train/include/autograd/grad_mode.h
@@ -5,13 +5,12 @@ namespace infini_train::autograd {
 
 class GradMode {
 public:
-    // Whether to enable Autograd (enabled by default)
-    static bool IsEnabled() { return grad_enabled_; }
-    static void SetEnabled(bool enabled) { grad_enabled_ = enabled; }
+    // Whether autograd is enabled on the calling thread (enabled by default).
+    static bool IsEnabled() { return tls_grad_enabled_; }
+    static void SetEnabled(bool enabled) { tls_grad_enabled_ = enabled; }
 
 private:
-    // grad mode should be thread_local
-    static thread_local bool grad_enabled_;
+    static thread_local bool tls_grad_enabled_;
 };
 
 // RAII: Disable grad (align with torch.no_grad)

diff --git a/infini_train/include/nn/parallel/global.h b/infini_train/include/nn/parallel/global.h
@@ -6,7 +6,7 @@
 
 namespace infini_train::nn::parallel::global {
 
-extern thread_local int thread_global_rank;
+extern thread_local int tls_thread_global_rank;
 
 enum Axis : uint8_t { DP = 0, TP = 1, PP = 2, AXIS_COUNT = 3 };
 

diff --git a/infini_train/include/nn/parallel/pp/pipeline_parallel.h b/infini_train/include/nn/parallel/pp/pipeline_parallel.h
@@ -16,7 +16,7 @@ namespace infini_train::nn::parallel {
 class PipelineStage;
 class PipelineSchedule;
 
-extern thread_local int pp_rank;
+extern thread_local int tls_pp_rank;
 
 struct StageInfo {
     bool is_first_stage;

diff --git a/infini_train/include/nn/parallel/tensor_parallel.h b/infini_train/include/nn/parallel/tensor_parallel.h
@@ -16,7 +16,7 @@ namespace infini_train::nn::parallel {
 
 // NOTE(zbl): Reserved for VocabParallelEmbedding, since rank is needed in its constructor before any Device exists
 //            On other occasions, should use Device::Rank()
-extern thread_local int tp_rank;
+extern thread_local int tls_tp_rank;
 
 class ColumnParallelLinear : public nn::CloneableModule<ColumnParallelLinear> {
 public:

diff --git a/infini_train/include/profiler.h b/infini_train/include/profiler.h
@@ -17,23 +17,23 @@ namespace core {
 class Event;
 }
 
-inline thread_local int g_profiling_depth = 0;
+inline thread_local int tls_profiling_depth = 0;
 
 struct ProfileContext {
     std::string name;
     Device::DeviceType device;
 };
 
-inline thread_local ProfileContext g_profile_context;
+inline thread_local ProfileContext tls_profile_context;
 
 inline void SetProfileContext(const std::string &name, Device::DeviceType device) {
-    if (g_profiling_depth == 0) {
-        g_profile_context.name = name;
-        g_profile_context.device = device;
+    if (tls_profiling_depth == 0) {
+        tls_profile_context.name = name;
+        tls_profile_context.device = device;
     }
 }
 
-inline const ProfileContext &GetProfileContext() { return g_profile_context; }
+inline const ProfileContext &GetProfileContext() { return tls_profile_context; }
 
 struct KernelProfileInfo {
     int64_t host_total_us = 0;
@@ -89,13 +89,14 @@ class Profiler {
     std::string current_tag_ = "Untagged";
 
     // thread-local tracking
-    thread_local static inline std::map<std::string, std::chrono::high_resolution_clock::time_point> cpu_timing_map_;
+    thread_local static inline std::map<std::string, std::chrono::high_resolution_clock::time_point>
+        tls_cpu_timing_map_;
 
     struct EventPair {
         core::Event *start = nullptr;
         core::Event *stop = nullptr;
     };
 
-    thread_local static inline std::map<std::string, EventPair> device_timing_map_;
+    thread_local static inline std::map<std::string, EventPair> tls_device_timing_map_;
 };
 } // namespace infini_train
diff --git a/infini_train/include/utils/global_module_hook_registry.h b/infini_train/include/utils/global_module_hook_registry.h
@@ -1,12 +1,13 @@
 #pragma once
 
-#include "infini_train/include/common/hook.h"
-#include "infini_train/include/tensor.h"
 #include <functional>
 #include <memory>
 #include <mutex>
 #include <vector>
 
+#include "infini_train/include/common/hook.h"
+#include "infini_train/include/tensor.h"
+
 namespace infini_train {
 namespace nn {
 class Module;

diff --git a/infini_train/src/autograd/grad_mode.cc b/infini_train/src/autograd/grad_mode.cc
@@ -1,5 +1,5 @@
 #include "infini_train/include/autograd/grad_mode.h"
 
 namespace infini_train::autograd {
-thread_local bool GradMode::grad_enabled_ = true;
+thread_local bool GradMode::tls_grad_enabled_ = true;
 } // namespace infini_train::autograd
diff --git a/infini_train/src/core/ccl/cuda/nccl_impl.cc b/infini_train/src/core/ccl/cuda/nccl_impl.cc
@@ -1,8 +1,9 @@
 #include "infini_train/src/core/ccl/cuda/nccl_impl.h"
 
-#include <nccl.h>
 #include <vector>
 
+#include <nccl.h>
+
 #include "glog/logging.h"
 
 #include "infini_train/include/common/cuda/common_cuda.h"

diff --git a/infini_train/src/kernels/cpu/outer.cc b/infini_train/src/kernels/cpu/outer.cc
@@ -1,5 +1,6 @@
-#include <cstdint>
 #include <fcntl.h>
+
+#include <cstdint>
 #include <memory>
 #include <tuple>
 

diff --git a/infini_train/src/nn/modules/transformer/transformer.cc b/infini_train/src/nn/modules/transformer/transformer.cc
@@ -198,7 +198,7 @@ std::vector<std::shared_ptr<Tensor>> TransformerLastStage::Forward(const std::ve
 TransformerModel::TransformerModel(const TransformerConfig config)
     : CloneableModule(kType), config_(config),
       stage_info_(nn::parallel::PipelineParallel::GetStageInfo(
-          config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::pp_rank,
+          config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::tls_pp_rank,
           nn::parallel::global::GetVirtualPipelineParallelSize())) {
     auto tp_world_size = nn::parallel::global::GetTensorParallelSize();
 

diff --git a/infini_train/src/nn/modules/transformer/transformer_config.cc b/infini_train/src/nn/modules/transformer/transformer_config.cc
@@ -8,7 +8,7 @@ bool TransformerConfig::UseGQA() const { return n_kv_head < n_head; }
 
 int TransformerConfig::GetChunkSize() const {
     auto stage_info = parallel::PipelineParallel::GetStageInfo(n_layer, parallel::global::GetPipelineParallelSize(),
-                                                               parallel::pp_rank,
+                                                               parallel::tls_pp_rank,
                                                                parallel::global::GetVirtualPipelineParallelSize());
     return stage_info.layer_ranges_per_chunk.size();
 }

diff --git a/infini_train/src/nn/parallel/global.cc b/infini_train/src/nn/parallel/global.cc
@@ -22,7 +22,7 @@ std::string GetEnvAsStr(const std::string &name, const std::string &default_valu
 
 namespace infini_train::nn::parallel::global {
 
-thread_local int thread_global_rank = 0;
+thread_local int tls_thread_global_rank = 0;
 
 void Layout::InitStrides() {
     // Calculate strides

diff --git a/infini_train/src/nn/parallel/pp/pipeline_parallel.cc b/infini_train/src/nn/parallel/pp/pipeline_parallel.cc
@@ -15,7 +15,7 @@ namespace {
 constexpr char kModuleName[] = "module";
 } // namespace
 
-thread_local int pp_rank = 0;
+thread_local int tls_pp_rank = 0;
 
 void PipelineParallel::BuildPipelineStage(const std::vector<std::vector<int64_t>> &recv_shape, Device device,
                                           std::vector<std::shared_ptr<Module>> &&chunks) {

diff --git a/infini_train/src/nn/parallel/rank.cc b/infini_train/src/nn/parallel/rank.cc
@@ -1,4 +1,5 @@
 #include "infini_train/include/nn/parallel/rank.h"
+
 #include "infini_train/include/nn/parallel/global.h"
 
 namespace infini_train::nn::parallel {