Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .clang-format

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个文件不要动。

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

我感觉其他部分可以不动，但头文件顺序检查可以保留。不过我看头文件检测也只是简单地按规则匹配，有可能出现错误（文件名相同但并不是对应的头文件），后续也可能需要更新（比如添加 third_party）。

Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,35 @@ ColumnLimit: 120
AllowShortBlocksOnASingleLine: Always
AllowShortLoopsOnASingleLine: true
InsertBraces: true
SortIncludes: CaseSensitive
IncludeBlocks: Regroup
IncludeCategories:
# Paired headers are assigned priority 0 by clang-format.
# This is based on the file basename, so same-named headers from another
# module may be treated as paired headers, for example kernels/cpu/linear.cc
# and include/autograd/linear.h. clang-format only sorts existing includes;
# it does not add new includes.
# C and POSIX system headers.
- Regex: ^<(assert|complex|ctype|errno|fenv|fcntl|float|inttypes|iso646|limits|locale|math|pthread|semaphore|setjmp|signal|stdalign|stdarg|stdatomic|stdbool|stddef|stdint|stdio|stdlib|string|sys/.*|tgmath|threads|time|uchar|unistd|wchar|wctype)\.h>$
Priority: 1
# C++ standard library headers.
- Regex: ^<(algorithm|any|array|atomic|barrier|bit|bitset|cassert|ccomplex|cctype|cerrno|cfenv|cfloat|charconv|chrono|cinttypes|ciso646|climits|clocale|cmath|codecvt|compare|complex|concepts|condition_variable|coroutine|csetjmp|csignal|cstdalign|cstdarg|cstdbool|cstddef|cstdint|cstdio|cstdlib|cstring|ctgmath|ctime|cuchar|cwchar|cwctype|deque|exception|execution|expected|filesystem|format|forward_list|fstream|functional|future|initializer_list|iomanip|ios|iosfwd|iostream|istream|iterator|latch|limits|list|locale|map|memory|memory_resource|mutex|new|numbers|numeric|optional|ostream|queue|random|ranges|ratio|regex|scoped_allocator|semaphore|set|shared_mutex|source_location|span|sstream|stack|stdexcept|stop_token|streambuf|string|string_view|strstream|syncstream|system_error|thread|tuple|type_traits|typeindex|typeinfo|unordered_map|unordered_set|utility|valarray|variant|vector|version)>$
Priority: 2
# Other external library headers, for example CUDA/NCCL/MPI.
- Regex: ^<.*>$
Priority: 3
# third_party headers included with quotes.
- Regex: ^"(third_party/|Eigen/|gflags/|glog/|gtest/)
Priority: 4
# Public project interfaces.
- Regex: ^"infini_train/include/
Priority: 5
# Internal project implementation headers.
- Regex: ^"(infini_train/src/|tests/)
Priority: 6
# Other local quoted headers.
- Regex: ^".*"$
Priority: 7
BreakBeforeBraces: Custom
BraceWrapping:
AfterCaseLabel: false
Expand Down
33 changes: 32 additions & 1 deletion .github/workflows/format-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,43 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y clang-format-16 include-what-you-use

- name: Install Python dependencies
run: |
python3 -m pip install --upgrade pip
pip install black
pip install black colorama

- name: Run format check
run: |
python3 scripts/format.py --path infini_train example --check

- name: Run custom style check
run: |
python3 scripts/style_check.py --path infini_train example

- name: Configure compile database for IWYU
# Keep IWYU advisory until the existing codebase is fully cleaned up.
continue-on-error: true
run: |
cmake -S . -B build-iwyu -DUSE_CUDA=OFF -DUSE_MACA=OFF -DUSE_MPI=OFF -DUSE_OMP=OFF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON

- name: Run IWYU check
continue-on-error: true
run: |
if command -v iwyu_tool.py >/dev/null; then
IWYU_TOOL="$(command -v iwyu_tool.py)"
else
IWYU_TOOL="$(command -v iwyu_tool)"
fi
mapfile -t IWYU_SOURCES < <(
find infini_train example -type f \( -name '*.c' -o -name '*.cc' -o -name '*.cpp' -o -name '*.cxx' \) \
! -path 'infini_train/src/core/ccl/cuda/*' \
! -path 'infini_train/src/core/runtime/cuda/*' \
! -path 'infini_train/src/core/ccl/maca/*' \
! -path 'infini_train/src/core/runtime/maca/*'
)
"${IWYU_TOOL}" -p build-iwyu -j "$(nproc)" "${IWYU_SOURCES[@]}"
3 changes: 2 additions & 1 deletion example/common/tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@

#include "glog/logging.h"

#include "example/common/utils.h"
#include "infini_train/include/nn/functional.h"
#include "infini_train/include/nn/modules/module.h"
#include "infini_train/include/tensor.h"

#include "example/common/utils.h"

namespace infini_train {

constexpr uint32_t kGpt2Eot = 50256;
Expand Down
9 changes: 5 additions & 4 deletions example/gpt2/checkpoint_loader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@

#include "glog/logging.h"

#include "example/common/utils.h"
#include "example/gpt2/config.h"
#include "infini_train/include/nn/modules/normalization.h"
#include "infini_train/include/nn/modules/sparse.h"
#include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
Expand All @@ -24,6 +22,9 @@
#include "infini_train/include/nn/parallel/tensor_parallel.h"
#include "infini_train/include/tensor.h"

#include "example/common/utils.h"
#include "example/gpt2/config.h"

using namespace infini_train;
namespace nn = infini_train::nn;

Expand Down Expand Up @@ -101,7 +102,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
// ========== pp_size:num_stages; vpp_size: num_chunks_per_stage ==========
int pp_size = nn::parallel::global::GetPipelineParallelSize();
int vpp_size = nn::parallel::global::GetVirtualPipelineParallelSize();
auto pp_rank = nn::parallel::pp_rank;
auto pp_rank = nn::parallel::tls_pp_rank;
auto [is_first_stage, is_last_stage, layer_ranges_per_chunk]
= nn::parallel::PipelineParallel::GetStageInfo(n_layer, pp_size, pp_rank, vpp_size);
// ========== layer to chunk ==========
Expand All @@ -110,7 +111,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
for (int i = start; i < end; ++i) { owned_layers[i] = true; }
}

auto tp_rank = nn::parallel::tp_rank;
auto tp_rank = nn::parallel::tls_tp_rank;
// calculate xx_size_per_partition
const int64_t vpp = model_vocab_size / tp_size;
const int64_t v_start = static_cast<int64_t>(tp_rank) * vpp;
Expand Down
7 changes: 3 additions & 4 deletions example/gpt2/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ void Train(const nn::parallel::Rank &rank) {

// Set thread-local global rank
// TODO(dcj): Use DeviceGuardImpl to get GlobalRank later.
nn::parallel::global::thread_global_rank = rank.GlobalRank();
nn::parallel::global::tls_thread_global_rank = rank.GlobalRank();

const ProcessGroup *ddp_pg = nullptr;
const ProcessGroup *tp_pg = nullptr;
Expand All @@ -158,15 +158,14 @@ void Train(const nn::parallel::Rank &rank) {
GetTensorParallelGroupRanks(rank.GlobalRank()));
tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
// NOTE(zbl): Reserved for VocabParallelEmbedding
nn::parallel::tp_rank = tp_rank;
nn::parallel::tls_tp_rank = tp_rank;
}

if (pp_world_size > 1) {
pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
GetPipelineParallelGroupRanks(rank.GlobalRank()));
pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());

nn::parallel::pp_rank = pp_rank;
nn::parallel::tls_pp_rank = pp_rank;
}
} else {
device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);
Expand Down
9 changes: 5 additions & 4 deletions example/llama3/checkpoint_loader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@

#include "glog/logging.h"

#include "example/common/utils.h"
#include "example/llama3/config.h"
#include "infini_train/include/nn/modules/normalization.h"
#include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
#include "infini_train/include/nn/modules/transformer/mlp.h"
Expand All @@ -22,6 +20,9 @@
#include "infini_train/include/nn/parallel/tensor_parallel.h"
#include "infini_train/include/tensor.h"

#include "example/common/utils.h"
#include "example/llama3/config.h"

using namespace infini_train;
namespace nn = infini_train::nn;

Expand Down Expand Up @@ -86,7 +87,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
// ========== pp_size:num_stages; vpp_size: num_chunks_per_stage ==========
int pp_size = nn::parallel::global::GetPipelineParallelSize();
int vpp_size = nn::parallel::global::GetVirtualPipelineParallelSize();
auto pp_rank = nn::parallel::pp_rank;
auto pp_rank = nn::parallel::tls_pp_rank;
auto [is_first_stage, is_last_stage, layer_ranges_per_chunk]
= nn::parallel::PipelineParallel::GetStageInfo(n_layer, pp_size, pp_rank, vpp_size);
// ========== layer to chunk ==========
Expand All @@ -96,7 +97,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
}

const int tp_size = nn::parallel::global::GetTensorParallelSize();
const int tp_rank = nn::parallel::tp_rank;
const int tp_rank = nn::parallel::tls_tp_rank;

CHECK_EQ(n_embd % tp_size, 0) << "n_embd must be divisible by TP world size.";
CHECK_EQ(n_head % tp_size, 0) << "n_head must be divisible by TP world size.";
Expand Down
7 changes: 3 additions & 4 deletions example/llama3/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ void Train(const nn::parallel::Rank &rank) {
int pp_rank = 0;

// Set thread-local global rank
nn::parallel::global::thread_global_rank = rank.GlobalRank();
nn::parallel::global::tls_thread_global_rank = rank.GlobalRank();

const ProcessGroup *ddp_pg = nullptr;
const ProcessGroup *tp_pg = nullptr;
Expand All @@ -143,15 +143,14 @@ void Train(const nn::parallel::Rank &rank) {
GetTensorParallelGroupRanks(rank.GlobalRank()));
tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
// NOTE(zbl): Reserved for VocabParallelEmbedding
nn::parallel::tp_rank = tp_rank;
nn::parallel::tls_tp_rank = tp_rank;
}

if (pp_world_size > 1) {
pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
GetPipelineParallelGroupRanks(rank.GlobalRank()));
pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());

nn::parallel::pp_rank = pp_rank;
nn::parallel::tls_pp_rank = pp_rank;
}
} else {
device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);
Expand Down
9 changes: 4 additions & 5 deletions infini_train/include/autograd/grad_mode.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,12 @@ namespace infini_train::autograd {

class GradMode {
public:
// Whether to enable Autograd (enabled by default)
static bool IsEnabled() { return grad_enabled_; }
static void SetEnabled(bool enabled) { grad_enabled_ = enabled; }
// Whether to enable Autograd (enabled by default).
static bool IsEnabled() { return tls_grad_enabled_; }
static void SetEnabled(bool enabled) { tls_grad_enabled_ = enabled; }

private:
// grad mode should be thread_local
static thread_local bool grad_enabled_;
static thread_local bool tls_grad_enabled_;
};

// RAII: Disable grad (align with torch.no_grad)
Expand Down
2 changes: 1 addition & 1 deletion infini_train/include/nn/parallel/global.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

namespace infini_train::nn::parallel::global {

extern thread_local int thread_global_rank;
extern thread_local int tls_thread_global_rank;

enum Axis : uint8_t { DP = 0, TP = 1, PP = 2, AXIS_COUNT = 3 };

Expand Down
2 changes: 1 addition & 1 deletion infini_train/include/nn/parallel/pp/pipeline_parallel.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace infini_train::nn::parallel {
class PipelineStage;
class PipelineSchedule;

extern thread_local int pp_rank;
extern thread_local int tls_pp_rank;

struct StageInfo {
bool is_first_stage;
Expand Down
2 changes: 1 addition & 1 deletion infini_train/include/nn/parallel/tensor_parallel.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace infini_train::nn::parallel {

// NOTE(zbl): Reserved for VocabParallelEmbedding, since rank is needed in its constructor before any Device exists
// On other occasions, should use Device::Rank()
extern thread_local int tp_rank;
extern thread_local int tls_tp_rank;

class ColumnParallelLinear : public nn::CloneableModule<ColumnParallelLinear> {
public:
Expand Down
17 changes: 9 additions & 8 deletions infini_train/include/profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@ namespace core {
class Event;
}

inline thread_local int g_profiling_depth = 0;
inline thread_local int tls_profiling_depth = 0;

// Profiling context record: a name plus the device type it is associated with.
// SetProfileContext() in this header writes both fields, and GetProfileContext()
// returns the record by const reference.
struct ProfileContext {
// Name stored by SetProfileContext().
std::string name;
// Device type stored by SetProfileContext().
Device::DeviceType device;
};

inline thread_local ProfileContext g_profile_context;
inline thread_local ProfileContext tls_profile_context;

inline void SetProfileContext(const std::string &name, Device::DeviceType device) {
if (g_profiling_depth == 0) {
g_profile_context.name = name;
g_profile_context.device = device;
if (tls_profiling_depth == 0) {
tls_profile_context.name = name;
tls_profile_context.device = device;
}
}

inline const ProfileContext &GetProfileContext() { return g_profile_context; }
inline const ProfileContext &GetProfileContext() { return tls_profile_context; }

struct KernelProfileInfo {
int64_t host_total_us = 0;
Expand Down Expand Up @@ -89,13 +89,14 @@ class Profiler {
std::string current_tag_ = "Untagged";

// thread-local tracking
thread_local static inline std::map<std::string, std::chrono::high_resolution_clock::time_point> cpu_timing_map_;
thread_local static inline std::map<std::string, std::chrono::high_resolution_clock::time_point>
tls_cpu_timing_map_;

// Two core::Event pointers, `start` and `stop`, both defaulting to nullptr.
// Used as the mapped type of the string-keyed device timing map declared below.
// NOTE(review): who creates and releases the events is not visible here — confirm.
struct EventPair {
core::Event *start = nullptr;
core::Event *stop = nullptr;
};

thread_local static inline std::map<std::string, EventPair> device_timing_map_;
thread_local static inline std::map<std::string, EventPair> tls_device_timing_map_;
};
} // namespace infini_train
5 changes: 3 additions & 2 deletions infini_train/include/utils/global_module_hook_registry.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#pragma once

#include "infini_train/include/common/hook.h"
#include "infini_train/include/tensor.h"
#include <functional>
#include <memory>
#include <mutex>
#include <vector>

#include "infini_train/include/common/hook.h"
#include "infini_train/include/tensor.h"

namespace infini_train {
namespace nn {
class Module;
Expand Down
2 changes: 1 addition & 1 deletion infini_train/src/autograd/grad_mode.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "infini_train/include/autograd/grad_mode.h"

namespace infini_train::autograd {
thread_local bool GradMode::grad_enabled_ = true;
thread_local bool GradMode::tls_grad_enabled_ = true;
} // namespace infini_train::autograd
3 changes: 2 additions & 1 deletion infini_train/src/core/ccl/cuda/nccl_impl.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#include "infini_train/src/core/ccl/cuda/nccl_impl.h"

#include <nccl.h>
#include <vector>

#include <nccl.h>

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

C 语言系统库头文件应当放在 C++ 标准库头文件的上一个分组。

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NCCL 是第三方通信库，不是 C 语言系统库吧？


#include "glog/logging.h"

#include "infini_train/include/common/cuda/common_cuda.h"
Expand Down
3 changes: 2 additions & 1 deletion infini_train/src/kernels/cpu/outer.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <cstdint>
#include <fcntl.h>

#include <cstdint>
#include <memory>
#include <tuple>

Expand Down
2 changes: 1 addition & 1 deletion infini_train/src/nn/modules/transformer/transformer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ std::vector<std::shared_ptr<Tensor>> TransformerLastStage::Forward(const std::ve
TransformerModel::TransformerModel(const TransformerConfig config)
: CloneableModule(kType), config_(config),
stage_info_(nn::parallel::PipelineParallel::GetStageInfo(
config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::pp_rank,
config_.n_layer, nn::parallel::global::GetPipelineParallelSize(), nn::parallel::tls_pp_rank,
nn::parallel::global::GetVirtualPipelineParallelSize())) {
auto tp_world_size = nn::parallel::global::GetTensorParallelSize();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ bool TransformerConfig::UseGQA() const { return n_kv_head < n_head; }

int TransformerConfig::GetChunkSize() const {
auto stage_info = parallel::PipelineParallel::GetStageInfo(n_layer, parallel::global::GetPipelineParallelSize(),
parallel::pp_rank,
parallel::tls_pp_rank,
parallel::global::GetVirtualPipelineParallelSize());
return stage_info.layer_ranges_per_chunk.size();
}
Expand Down
2 changes: 1 addition & 1 deletion infini_train/src/nn/parallel/global.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ std::string GetEnvAsStr(const std::string &name, const std::string &default_valu

namespace infini_train::nn::parallel::global {

thread_local int thread_global_rank = 0;
thread_local int tls_thread_global_rank = 0;

void Layout::InitStrides() {
// Calculate strides
Expand Down
2 changes: 1 addition & 1 deletion infini_train/src/nn/parallel/pp/pipeline_parallel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace {
constexpr char kModuleName[] = "module";
} // namespace

thread_local int pp_rank = 0;
thread_local int tls_pp_rank = 0;

void PipelineParallel::BuildPipelineStage(const std::vector<std::vector<int64_t>> &recv_shape, Device device,
std::vector<std::shared_ptr<Module>> &&chunks) {
Expand Down
1 change: 1 addition & 0 deletions infini_train/src/nn/parallel/rank.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "infini_train/include/nn/parallel/rank.h"

#include "infini_train/include/nn/parallel/global.h"

namespace infini_train::nn::parallel {
Expand Down
Loading
Loading