Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion csrc/engine/compiler/paged_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,11 @@ PagedCompiler::Compiled PagedCompiler::get_compiled(const InfinilmModel::Input &
graph_input.slot_mapping.value()->copy_from(input.slot_mapping.value());

auto graph = std::get<0>(result->second.compiled);
auto shared_output = std::shared_ptr<InfinilmModel::Output>(new InfinilmModel::Output{std::get<1>(result->second.compiled)->logits->resume_from_blob_()});
// Reuse the GraphTensor output captured at compile time.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

在 NVIDIA 平台上会出现 double free 的问题。定位到原因是 graph 也给 Tensor 注册了一个 deleter,导致同一块内存被二次释放。修改 shared_output 后,不再出现 double free 的问题。

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

为什么会二次释放?如果删掉 resume_from_blob_() 的调用,会不会导致有些地址无法释放?

// Do not call resume_from_blob_() on workspace-backed logits:
// that registers a second deleter on the same GPU block and
// triggers double free in PinnableBlockAllocator.
auto shared_output = std::get<1>(result->second.compiled);

return std::make_tuple(graph, shared_output);
}
Expand Down
6 changes: 5 additions & 1 deletion csrc/engine/compiler/static_batching_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ StaticBatchingCompiler::Compiled StaticBatchingCompiler::get_compiled(
graph_input.total_sequence_lengths.value()->copy_from(input.total_sequence_lengths.value());

auto graph = std::get<0>(result->second.compiled);
auto shared_output = std::shared_ptr<InfinilmModel::Output>(new InfinilmModel::Output{std::get<1>(result->second.compiled)->logits->resume_from_blob_()});
// Reuse the GraphTensor output captured at compile time.
// Do not call resume_from_blob_() on workspace-backed logits:
// that registers a second deleter on the same GPU block and
// triggers double free in PinnableBlockAllocator.
auto shared_output = std::get<1>(result->second.compiled);
return std::make_tuple(graph, shared_output);
}
} else {
Expand Down
5 changes: 3 additions & 2 deletions csrc/engine/infer_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,16 @@ InferEngine::InferEngine(
const cache::CacheConfig *cache_config,
bool enable_graph_compiling,
backends::AttentionBackend attention_backend,
std::optional<infinicore::DataType> kv_cache_dtype) // Changed parameter
std::optional<infinicore::DataType> kv_cache_dtype, // Changed parameter
size_t max_num_batched_tokens)
: communication_group_(distributed_config, device_type), attention_backend_(attention_backend) {
if (cache_config != nullptr) {
cache_config_ = cache_config->unique_copy();
}

// Load model config if model_path is provided, model_path must be valid, and config.json exists
this->model_config_ = infinilm::config::ConfigFactory::createConfig(config_str);
auto infinilm_config = std::make_shared<infinilm::global_state::InfinilmConfig>(attention_backend, this->model_config_);
auto infinilm_config = std::make_shared<infinilm::global_state::InfinilmConfig>(attention_backend, this->model_config_, max_num_batched_tokens);

// Only support offline int8 kv cache quantization in this version
if (kv_cache_dtype.has_value()) {
Expand Down
3 changes: 2 additions & 1 deletion csrc/engine/infer_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ class InferEngine {
const cache::CacheConfig *cache_config = nullptr,
bool enable_graph_compiling = false,
backends::AttentionBackend attention_backend = backends::AttentionBackend::Default,
std::optional<infinicore::DataType> kv_cache_dtype = std::nullopt);
std::optional<infinicore::DataType> kv_cache_dtype = std::nullopt,
size_t max_num_batched_tokens = 2048);

// Load a parameter to all workers (each can extract its shard inside RankWorker)
void load_param(const std::string &name, const infinicore::Tensor &param);
Expand Down
10 changes: 10 additions & 0 deletions csrc/engine/rank_worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,15 @@ void RankWorker::thread_loop() {
if (!model_) {
throw std::runtime_error("Failed to create model");
}

infinicore::context::syncStream();

if (infinilm_config_->enable_workspace_manager) {
forward_context_.workspace_manager.finalize_and_bind();
// forward_context_.workspace_manager.log_registrations();
}
infinicore::context::syncStream();

if (enable_graph_compiling_) {
compiler_ = std::make_unique<GeneralCompiler>(model_, barrier_);
}
Expand Down Expand Up @@ -394,6 +403,7 @@ void RankWorker::thread_loop() {
try {
{
std::lock_guard<std::mutex> lk(mutex_);
infinilm::global_state::get_forward_context().workspace_manager.reset_runtime_buffers();

infinicore::Tensor logits;
// Try to get compiled graph
Expand Down
4 changes: 4 additions & 0 deletions csrc/global_state/forward_context.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#pragma once

#include "../models/infinilm_model.hpp"
#include "../utils.hpp"
#include "workspace_manager.hpp"
#include <vector>

namespace infinilm::global_state {

Expand Down Expand Up @@ -48,6 +51,7 @@ struct ForwardContext {
AttentionMetadata attn_metadata;
MultiModalMetadata mm_metadata;
std::vector<infinicore::Tensor> kv_cache_vec;
WorkspaceManager workspace_manager;
};

void initialize_forward_context(ForwardContext &forward_context);
Expand Down
15 changes: 13 additions & 2 deletions csrc/global_state/infinilm_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,24 @@ struct InfinilmConfig {
public:
InfinilmConfig() = default;
InfinilmConfig(const infinilm::backends::AttentionBackend &backend,
const std::shared_ptr<infinilm::config::ModelConfig> &model_config)
const std::shared_ptr<infinilm::config::ModelConfig> &model_config,
size_t max_num_batched_tokens)
: attention_backend(backend),
model_config(model_config) {}
model_config(model_config),
max_num_batched_tokens(max_num_batched_tokens) {

if (max_num_batched_tokens > 0) {
const size_t max_position_embeddings = model_config->get<size_t>("max_position_embeddings");
ASSERT(max_num_batched_tokens >= 512 && max_num_batched_tokens <= max_position_embeddings);
enable_workspace_manager = true;
}
}

public:
infinilm::backends::AttentionBackend attention_backend;
std::shared_ptr<infinilm::config::ModelConfig> model_config;
size_t max_num_batched_tokens = 0;
bool enable_workspace_manager{false};
};

/**
Expand Down
Loading