From f2c8bab8531da961de09e4bf808695a4a269bb75 Mon Sep 17 00:00:00 2001
From: huidesheng <1832140001@qq.com>
Date: Tue, 12 May 2026 17:45:29 +0800
Subject: [PATCH 1/9] Add chunked prefill and prefill CUDA graph

---
 csrc/engine/compiler/general_compiler.cpp     | 12 ++-
 csrc/engine/compiler/general_compiler.hpp     |  5 +-
 csrc/engine/infer_engine.cpp                  |  4 +
 csrc/engine/infer_engine.hpp                  |  2 +
 csrc/engine/rank_worker.cpp                   |  6 +-
 csrc/engine/rank_worker.hpp                   |  3 +
 csrc/pybind11/engine/engine.hpp               |  6 ++
 python/infinilm/base_config.py                |  4 +
 python/infinilm/infer_engine.py               |  2 +
 python/infinilm/llm/llm.py                    | 44 ++++++++++-
 python/infinilm/llm/request.py                | 17 +++++
 python/infinilm/llm/scheduler.py              | 41 +++++++++-
 .../processors/basic_llm_processor.py         | 40 +++++++---
 python/infinilm/server/inference_server.py    | 14 ++++
 scripts/infer_task.py                         | 21 ++++++
 scripts/launch_server.py                      | 75 +++++++++++++++----
 16 files changed, 266 insertions(+), 30 deletions(-)
diff --git a/csrc/engine/compiler/general_compiler.cpp b/csrc/engine/compiler/general_compiler.cpp
index 84ee670d4..36c6420f0 100644
--- a/csrc/engine/compiler/general_compiler.cpp
+++ b/csrc/engine/compiler/general_compiler.cpp
@@ -1,13 +1,18 @@
 #include "general_compiler.hpp"
 
 namespace infinilm::engine {
-GeneralCompiler::GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier) : GraphCompiler(model, barrier) {
+GeneralCompiler::GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier, bool enable_chunk_prefill_graph)
+    : GraphCompiler(model, barrier), enable_chunk_prefill_graph_(enable_chunk_prefill_graph) {
     static_batching_compiler_ = std::make_unique<StaticBatchingCompiler>(model_, barrier);
+    chunk_prefill_compiler_ = std::make_unique<ChunkPrefillCompiler>(model_, barrier);
     paged_compiler_ = std::make_unique<PagedCompiler>(model_, barrier);
 }
 
 void GeneralCompiler::compile() {
     static_batching_compiler_->compile();
+    if (enable_chunk_prefill_graph_) {
+        chunk_prefill_compiler_->compile();
+    }
     paged_compiler_->compile();
 }
 
@@ -19,6 +24,11 @@ GeneralCompiler::Compiled GeneralCompiler::get_compiled(const InfinilmModel::Inp
     if (std::get<0>(result) != nullptr && std::get<1>(result) != nullptr) {
         return result;
     }
+    // chunk-prefill must be checked before decode (decode would also match if chunk_size==1)
+    result = chunk_prefill_compiler_.get()->get_compiled(input);
+    if (std::get<0>(result) != nullptr && std::get<1>(result) != nullptr) {
+        return result;
+    }
     result = paged_compiler_.get()->get_compiled(input);
     return result;
 }
diff --git a/csrc/engine/compiler/general_compiler.hpp b/csrc/engine/compiler/general_compiler.hpp
index e8b84b5d9..3edbcea0c 100644
--- a/csrc/engine/compiler/general_compiler.hpp
+++ b/csrc/engine/compiler/general_compiler.hpp
@@ -1,12 +1,13 @@
 #pragma once
 
+#include "chunk_prefill_compiler.hpp"
 #include "paged_compiler.hpp"
 #include "static_batching_compiler.hpp"
 
 namespace infinilm::engine {
 class GeneralCompiler : public GraphCompiler {
 public:
-    GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier);
+    GeneralCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier, bool enable_chunk_prefill_graph = false);
 
     void compile() override;
 
@@ -15,5 +16,7 @@ class GeneralCompiler : public GraphCompiler {
 private:
     std::unique_ptr<StaticBatchingCompiler> static_batching_compiler_;
     std::unique_ptr<PagedCompiler> paged_compiler_;
+    std::unique_ptr<ChunkPrefillCompiler> chunk_prefill_compiler_;
+    bool enable_chunk_prefill_graph_;
 };
 } // namespace infinilm::engine
diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp
index db0dfdd47..5b6ea143e 100644
--- a/csrc/engine/infer_engine.cpp
+++ b/csrc/engine/infer_engine.cpp
@@ -25,6 +25,7 @@ InferEngine::InferEngine(
     infinicore::Device::Type device_type,
     const cache::CacheConfig *cache_config,
     bool enable_graph_compiling,
+    bool enable_chunk_prefill_graph,
     backends::AttentionBackend attention_backend) // Changed parameter
     : communication_group_(distributed_config, device_type),
       legacy_model_config_(config),
@@ -43,6 +44,7 @@ InferEngine::InferEngine(
             cache_config_ != nullptr ? cache_config_.get() : nullptr,
             barrier_.get(),
             enable_graph_compiling,
+            enable_chunk_prefill_graph,
             attention_backend_));
     }
 
@@ -56,6 +58,7 @@ InferEngine::InferEngine(
     infinicore::Device::Type device_type,
     const cache::CacheConfig *cache_config,
     bool enable_graph_compiling,
+    bool enable_chunk_prefill_graph,
     backends::AttentionBackend attention_backend,
     std::optional<infinicore::DataType> kv_cache_dtype) // Changed parameter
     : communication_group_(distributed_config, device_type), attention_backend_(attention_backend) {
@@ -82,6 +85,7 @@ InferEngine::InferEngine(
             cache_config_ != nullptr ? cache_config_.get() : nullptr,
             barrier_.get(),
             enable_graph_compiling,
+            enable_chunk_prefill_graph,
             attention_backend_));
     }
     // Compile the model on all workers
diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp
index e36ec3699..153600c48 100644
--- a/csrc/engine/infer_engine.hpp
+++ b/csrc/engine/infer_engine.hpp
@@ -39,6 +39,7 @@ class InferEngine {
         infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
         const cache::CacheConfig *cache_config = nullptr,
         bool enable_graph_compiling = false,
+        bool enable_chunk_prefill_graph = false,
         backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
 
     InferEngine(
@@ -47,6 +48,7 @@ class InferEngine {
         infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
         const cache::CacheConfig *cache_config = nullptr,
         bool enable_graph_compiling = false,
+        bool enable_chunk_prefill_graph = false,
         backends::AttentionBackend attention_backend = backends::AttentionBackend::Default,
         std::optional<infinicore::DataType> kv_cache_dtype = std::nullopt);
 
diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp
index 8a94c441e..e607c569f 100644
--- a/csrc/engine/rank_worker.cpp
+++ b/csrc/engine/rank_worker.cpp
@@ -27,11 +27,13 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config,
                        const cache::CacheConfig *cache_config,
                        RankBarrier *barrier,
                        bool enable_graph_compiling,
+                       bool enable_chunk_prefill_graph,
                        backends::AttentionBackend attention_backend)
     : legacy_model_config_(model_config),
       rank_info_(rank_info),
       attention_backend_(attention_backend),
       enable_graph_compiling_(enable_graph_compiling),
+      enable_chunk_prefill_graph_(enable_chunk_prefill_graph),
       job_cmd_(Command::INIT),
       has_job_(false),
       job_done_(false),
@@ -56,12 +58,14 @@ RankWorker::RankWorker(
     const cache::CacheConfig *cache_config,
     RankBarrier *barrier,
     bool enable_graph_compiling,
+    bool enable_chunk_prefill_graph,
     backends::AttentionBackend attention_backend)
     : infinilm_config_(infinilm_config),
       model_config_(infinilm_config->model_config),
       rank_info_(rank_info),
       attention_backend_(attention_backend),
       enable_graph_compiling_(enable_graph_compiling),
+      enable_chunk_prefill_graph_(enable_chunk_prefill_graph),
       job_cmd_(Command::INIT),
       has_job_(false),
       job_done_(false),
@@ -303,7 +307,7 @@ void RankWorker::thread_loop() {
                 throw std::runtime_error("Failed to create model");
             }
             if (enable_graph_compiling_) {
-                compiler_ = std::make_unique<GeneralCompiler>(model_, barrier_);
+                compiler_ = std::make_unique<GeneralCompiler>(model_, barrier_, enable_chunk_prefill_graph_);
             }
 
             init_done_ = true;
diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp
index f6adcf476..b045adf65 100644
--- a/csrc/engine/rank_worker.hpp
+++ b/csrc/engine/rank_worker.hpp
@@ -75,6 +75,7 @@ class RankWorker {
                const cache::CacheConfig *cache_config,
                RankBarrier *barrier,
                bool enable_graph_compiling,
+               bool enable_chunk_prefill_graph,
                backends::AttentionBackend attention_backend);
 
     RankWorker(std::shared_ptr<infinilm::global_state::InfinilmConfig> infinilm_config,
@@ -82,6 +83,7 @@ class RankWorker {
                const cache::CacheConfig *cache_config,
                RankBarrier *barrier,
                bool enable_graph_compiling,
+               bool enable_chunk_prefill_graph,
                backends::AttentionBackend attention_backend);
 
     // Submit a parameter load job and wait until the load completes on the worker thread.
@@ -131,6 +133,7 @@ class RankWorker {
 
     // Graph Compiling
     bool enable_graph_compiling_;
+    bool enable_chunk_prefill_graph_;
     std::unique_ptr<GraphCompiler> compiler_;
 
     // Command for the pending job (protected by mutex_)
diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp
index 2741c9cd7..a479f66be 100644
--- a/csrc/pybind11/engine/engine.hpp
+++ b/csrc/pybind11/engine/engine.hpp
@@ -37,6 +37,7 @@ inline void bind_infer_engine(py::module &m) {
                           infinicore::Device::Type dev,
                           std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
                           bool enable_graph_compiling,
+                          bool enable_chunk_prefill_graph,
                           const std::string &attention_backend) {
                  return std::make_shared<InferEngine>(
                      cfg,
@@ -44,6 +45,7 @@ inline void bind_infer_engine(py::module &m) {
                      dev,
                      cache_cfg ? cache_cfg.get() : nullptr,
                      enable_graph_compiling,
+                     enable_chunk_prefill_graph,
                      infinilm::backends::parse_attention_backend(attention_backend));
              }),
              py::arg("config"),
@@ -51,6 +53,7 @@ inline void bind_infer_engine(py::module &m) {
              py::arg("device_type") = infinicore::context::getDevice().getType(),
              py::arg("cache_config") = py::none(),
              py::arg("enable_graph_compiling") = false,
+             py::arg("enable_chunk_prefill_graph") = false,
              py::arg("attention_backend") = "default")
         .def("load_param", &InferEngine::load_param,
              py::arg("name"), py::arg("param"),
@@ -81,6 +84,7 @@ inline void bind_infer_engine(py::module &m) {
                           infinicore::Device::Type dev,
                           std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
                           bool enable_graph_compiling,
+                          bool enable_chunk_prefill_graph,
                           const std::string &attention_backend,
                           std::optional<infinicore::DataType> kv_cache_dtype) {
                  return std::make_shared<InferEngine>(
@@ -89,6 +93,7 @@ inline void bind_infer_engine(py::module &m) {
                      dev,
                      cache_cfg ? cache_cfg.get() : nullptr,
                      enable_graph_compiling,
+                     enable_chunk_prefill_graph,
                      infinilm::backends::parse_attention_backend(attention_backend),
                      kv_cache_dtype);
              }),
@@ -97,6 +102,7 @@ inline void bind_infer_engine(py::module &m) {
              py::arg("device_type") = infinicore::context::getDevice().getType(),
              py::arg("cache_config") = py::none(),
              py::arg("enable_graph_compiling") = false,
+             py::arg("enable_chunk_prefill_graph") = false,
              py::arg("attention_backend") = "default",
              py::arg("kv_cache_dtype") = py::none())
         .def("load_param", &InferEngine::load_param,
diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py
index aab5dd459..d7c32568e 100644
--- a/python/infinilm/base_config.py
+++ b/python/infinilm/base_config.py
@@ -61,6 +61,8 @@ def __init__(self):
 
         self.attn = self.args.attn
         self.enable_graph = self.args.enable_graph
+        self.enable_chunk_prefill_graph = self.args.enable_chunk_prefill_graph
+        self.chunk_size = self.args.chunk_size
         self.enable_paged_attn = self.args.enable_paged_attn
         self.num_blocks = self.args.num_blocks
         self.block_size = self.args.block_size
@@ -122,6 +124,8 @@ def _add_common_args(self):
             choices=["default", "paged-attn", "flash-attn"],
         )
         self.parser.add_argument("--enable-graph", action="store_true")
+        self.parser.add_argument("--enable-chunk-prefill-graph", action="store_true", help="enable chunk-prefill graph compiling")
+        self.parser.add_argument("--chunk-size", type=int, default=512, help="tokens per chunked-prefill slice (0 to disable)")
         self.parser.add_argument(
             "--enable-paged-attn",
             action="store_true",
diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py
index 13bb18a19..2477bbc61 100644
--- a/python/infinilm/infer_engine.py
+++ b/python/infinilm/infer_engine.py
@@ -45,6 +45,7 @@ def __init__(
         distributed_config=DistConfig(1),
         cache_config=None,
         enable_graph_compiling=False,
+        enable_chunk_prefill_graph=False,
         attention_backend="default",
         kv_cache_dtype=None,
     ):
@@ -60,6 +61,7 @@ def __init__(
             device._underlying.type,
             cache_config,
             enable_graph_compiling,
+            enable_chunk_prefill_graph,
             attention_backend,
             (
                 parse_dtype(kv_cache_dtype)._underlying
diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py
index cba3af83a..90de3edcc 100644
--- a/python/infinilm/llm/llm.py
+++ b/python/infinilm/llm/llm.py
@@ -72,6 +72,8 @@ class EngineConfig:
     top_p: float = 0.8
     top_k: int = 1
     enable_graph: bool = False
+    enable_chunk_prefill_graph: bool = False
+    chunk_size: int = 0
     attn_backend: str = "default"
     skip_load: bool = False
 
@@ -91,6 +93,7 @@ def __init__(self, config: EngineConfig):
             device=self.device,
             distributed_config=DistConfig(config.tensor_parallel_size),
             enable_graph_compiling=config.enable_graph,
+            enable_chunk_prefill_graph=config.enable_chunk_prefill_graph,
             attention_backend=config.attn_backend,
         )
 
@@ -167,6 +170,8 @@ def _init_device(self):
 
     def add_request(self, request: InferenceRequest):
         """Add a request to the scheduler."""
+        if self.cache_type == "paged" and self.config.chunk_size > 0:
+            request.chunk_size = self.config.chunk_size
         self.scheduler.add_request(request)
 
     def step(self) -> tuple[list[InferenceRequest], list[tuple]]:
@@ -210,7 +215,18 @@ def _update_requests(
         sampled_tokens: List[int],
     ) -> List[tuple]:
         """Update request status after inference step."""
-        if is_prefill:
+        # Detect a chunked-prefill mid-step: single request, prefill phase,
+        # and this chunk does not yet cover the whole prompt. In that case
+        # we must NOT consume a sampled token, NOT commit prefill blocks,
+        # and re-enqueue the request to keep chunking.
+        chunk_mid_step = (
+            is_prefill
+            and len(requests) == 1
+            and requests[0].is_chunking()
+            and not requests[0].chunk_is_last()
+        )
+
+        if is_prefill and not chunk_mid_step:
             match self.cache_type:
                 case "paged":
                     self.scheduler.cache_manager.reset_req_blocks()
@@ -218,6 +234,20 @@ def _update_requests(
                     self.scheduler.update_cache()
                 case _:
                     raise ValueError(f"Unsupported cache_type: {self.cache_type}")
+
+        if chunk_mid_step:
+            req = requests[0]
+            req.chunk_prefill_offset += req.chunk_size
+            # If this request was aborted while chunking, drop it.
+            if req.is_aborted():
+                logger.info(
+                    f"Request {req.request_id} aborted by client during chunked-prefill"
+                )
+                return []
+            # Re-enqueue to keep producing chunks; no token sampled yet.
+            self.scheduler.requeue_chunking(req)
+            return []
+
         pending = []
         for req, token_id in zip(requests, sampled_tokens):
             if req.is_aborted():
@@ -227,6 +257,10 @@ def _update_requests(
                 continue
 
             if req.is_prefill:
+                # Clean up chunked-prefill state on the final chunk so the
+                # next forward pass on this request takes the decode path.
+                req.chunk_prefill_offset = 0
+                req.chunk_size = 0
                 req.is_prefill = False
 
             req.generated_token_ids.append(token_id)
@@ -361,6 +395,8 @@ def __init__(
         top_p: float = 0.8,
         top_k: int = 1,
         enable_graph: bool = False,
+        enable_chunk_prefill_graph: bool = False,
+        chunk_size: int = 0,
         attn_backend: str = "default",
         skip_load: bool = False,
     ):
@@ -398,6 +434,8 @@ def __init__(
             top_p=top_p,
             top_k=top_k,
             enable_graph=enable_graph,
+            enable_chunk_prefill_graph=enable_chunk_prefill_graph,
+            chunk_size=chunk_size,
             attn_backend=attn_backend,
             skip_load=skip_load,
         )
@@ -539,6 +577,8 @@ def __init__(
         top_p: float = 0.8,
         top_k: int = 1,
         enable_graph: bool = False,
+        enable_chunk_prefill_graph: bool = False,
+        chunk_size: int = 0,
         attn_backend: str = "default",
     ):
         """Initialize AsyncLLMEngine.
@@ -575,6 +615,8 @@ def __init__(
             top_p=top_p,
             top_k=top_k,
             enable_graph=enable_graph,
+            enable_chunk_prefill_graph=enable_chunk_prefill_graph,
+            chunk_size=chunk_size,
             attn_backend=attn_backend,
         )
         self.engine = LLMEngine(config)
diff --git a/python/infinilm/llm/request.py b/python/infinilm/llm/request.py
index 15bcf69f4..679b6e4db 100644
--- a/python/infinilm/llm/request.py
+++ b/python/infinilm/llm/request.py
@@ -144,6 +144,11 @@ def __init__(
         self.num_cached_tokens: int = 0
         self.num_blocks: int = 0
 
+        # Chunked-prefill state (0 = disabled, otherwise tokens per chunk)
+        self.chunk_size: int = 0
+        # Number of prompt tokens already fed through forward as chunked-prefill
+        self.chunk_prefill_offset: int = 0
+
         # For server use
         self.request_data: Optional[dict] = request_data
         self.http_request: Optional[Any] = http_request
@@ -186,6 +191,18 @@ def get_num_blocks_required(self, block_size: int) -> int:
     def get_max_tokens(self) -> Optional[int]:
         return self.sampling_params.max_tokens
 
+    def is_chunking(self) -> bool:
+        """Return True if chunking is enabled, the request is still in prefill, and the prompt exceeds one chunk."""
+        return (
+            self.chunk_size > 0
+            and self.is_prefill
+            and self.prompt_length > self.chunk_size
+        )
+
+    def chunk_is_last(self) -> bool:
+        """Return True if the chunk starting at chunk_prefill_offset reaches the end of the prompt."""
+        return self.chunk_prefill_offset + self.chunk_size >= self.prompt_length
+
     def is_finished(self) -> bool:
         return self.status in [
             RequestStatus.FINISHED,
diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py
index f9c11635a..95a844804 100644
--- a/python/infinilm/llm/scheduler.py
+++ b/python/infinilm/llm/scheduler.py
@@ -42,6 +42,9 @@ def __init__(
     ):
         self.waiting_queue = janus.Queue()
         self.running_queue = janus.Queue()
+        # Requests in the middle of chunked-prefill — scheduled at high priority,
+        # single-request batches only (to match the C++ ChunkPrefillCompiler graph signature).
+        self.chunking_queue = janus.Queue()
         self.max_batch_size = max_batch_size
 
         self.cache_manager = BlockManager(num_blocks=num_blocks, block_size=block_size)
@@ -53,7 +56,27 @@ def add_request(self, request: InferenceRequest):
             self.waiting_queue.sync_q.put(request)
 
     def schedule(self) -> Optional[SchedulerOutput]:
-        """Schedule and return batch of requests to execute."""
+        """Schedule and return batch of requests to execute.
+
+        The chunking queue is polled first, before the other two queues:
+          - Chunking queue (in-flight chunked-prefill) — single-request slice
+          - Waiting queue (new prefill) — may start chunking if prompt is long
+          - Running queue (decode) — short / latency-sensitive
+        """
+        # Continue an in-flight chunked-prefill request first (single-request batch).
+        try:
+            req = self.chunking_queue.sync_q.get_nowait()
+        except queue.Empty:
+            req = None
+        if req is not None:
+            if req.is_finished():
+                self.complete_requests([req])
+            else:
+                return SchedulerOutput(
+                    scheduled_requests=[req],
+                    is_prefill=True,
+                )
+
         scheduled_requests = []
         is_prefill = False
 
@@ -91,6 +114,18 @@ def schedule(self) -> Optional[SchedulerOutput]:
 
             req.num_blocks = len(req.block_table)
             req.status = RequestStatus.RUNNING
+
+            # Start chunked-prefill: emit a single-request batch immediately (the
+            # request is put on chunking_queue later, via requeue_chunking()).
+            # We don't mix chunked-prefill with other requests in the same batch —
+            # the C++ ChunkPrefillCompiler graph is keyed on (batch_size, chunk_size).
+            if req.chunk_size > 0 and req.prompt_length > req.chunk_size:
+                req.chunk_prefill_offset = 0
+                return SchedulerOutput(
+                    scheduled_requests=[req],
+                    is_prefill=True,
+                )
+
             scheduled_requests.append(req)
 
         # Return prefill batch if any waiting requests were scheduled
@@ -135,6 +170,10 @@ def schedule(self) -> Optional[SchedulerOutput]:
 
         return None
 
+    def requeue_chunking(self, req: InferenceRequest):
+        """Put a request back into the chunking queue after a chunk has run."""
+        self.chunking_queue.sync_q.put(req)
+
     def complete_requests(self, requests: List[InferenceRequest]):
         """Handle completed requests and free their blocks."""
         for req in requests:
diff --git a/python/infinilm/processors/basic_llm_processor.py b/python/infinilm/processors/basic_llm_processor.py
index 070a40622..f5e603ba4 100644
--- a/python/infinilm/processors/basic_llm_processor.py
+++ b/python/infinilm/processors/basic_llm_processor.py
@@ -185,19 +185,39 @@ def _build_model_input_from_batch_scheduler_output(
             if scheduler_output.is_prefill:
                 # Prefill phase
                 req_tokens = req.get_input_tokens()
-                tokens_to_compute = req_tokens[num_cached:]
-                tokens.extend(tokens_to_compute)
 
-                compute_len = len(tokens_to_compute)
-                seq_len = len(req_tokens)
-                seq_lens.append(seq_len)
+                # Chunked-prefill: only feed [chunk_prefill_offset : +chunk_size).
+                # cached_lens gets chunk_prefill_offset (the prefix already fed);
+                # seq_lens gets chunk_prefill_offset + len(tokens_to_compute).
+                # Full chunks keep total_tokens == chunk_size (the final chunk may
+                # be shorter) so the C++ ChunkPrefillCompiler graph hits.
+                if req.is_chunking():
+                    start = req.chunk_prefill_offset
+                    end = min(start + req.chunk_size, len(req_tokens))
+                    tokens_to_compute = req_tokens[start:end]
+                    compute_len = len(tokens_to_compute)
+                    tokens.extend(tokens_to_compute)
+                    seq_len = end  # attention prefix length after this chunk
+                    seq_lens.append(seq_len)
+                    current_offset += compute_len
+                    seq_offsets.append(current_offset)
+                    slot_mapping.extend(req.slot_mapping[start:end])
+                    cached_lens.append(start)
+                    position_ids.extend(range(start, end))
+                else:
+                    tokens_to_compute = req_tokens[num_cached:]
+                    tokens.extend(tokens_to_compute)
 
-                current_offset += compute_len
-                seq_offsets.append(current_offset)
+                    compute_len = len(tokens_to_compute)
+                    seq_len = len(req_tokens)
+                    seq_lens.append(seq_len)
 
-                slot_mapping.extend(req.slot_mapping)
-                cached_lens.append(num_cached)
-                position_ids.extend(range(num_cached, num_cached + compute_len))
+                    current_offset += compute_len
+                    seq_offsets.append(current_offset)
+
+                    slot_mapping.extend(req.slot_mapping)
+                    cached_lens.append(num_cached)
+                    position_ids.extend(range(num_cached, num_cached + compute_len))
 
             else:
                 # Decode phase
diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py
index 71e9c992f..ac7e94e71 100644
--- a/python/infinilm/server/inference_server.py
+++ b/python/infinilm/server/inference_server.py
@@ -108,6 +108,8 @@ def __init__(
         host: str = "0.0.0.0",
         port: int = 8000,
         enable_graph: bool = False,
+        enable_chunk_prefill_graph: bool = False,
+        chunk_size: int = 0,
         attn_backend: str = "default",
         ignore_eos: bool = False,
     ):
@@ -130,6 +132,10 @@ def __init__(
             host: Server host address.
             port: Server port number.
             enable_graph: Whether to enable graph compiling.
+            enable_chunk_prefill_graph: Whether to enable chunk-prefill graph compiling.
+            chunk_size: Tokens per chunked-prefill slice (0 = disabled). When > 0 and paged
+                cache is used, long prompts are sliced and each slice goes through forward
+                separately so the C++ ChunkPrefillCompiler precompiled graph can be reused.
             attn_backend: Attention backend to use ('default', 'flash-attn').
         """
         self.model_path = model_path
@@ -150,6 +156,8 @@ def __init__(
         self.host = host
         self.port = port
         self.enable_graph = enable_graph
+        self.enable_chunk_prefill_graph = enable_chunk_prefill_graph
+        self.chunk_size = chunk_size
         self.attn_backend = attn_backend
         self.ignore_eos = ignore_eos
 
@@ -182,11 +190,15 @@ async def lifespan(app: FastAPI):
                 top_p=self.top_p,
                 top_k=self.top_k,
                 enable_graph=self.enable_graph,
+                enable_chunk_prefill_graph=self.enable_chunk_prefill_graph,
+                chunk_size=self.chunk_size,
                 attn_backend=self.attn_backend,
             )
             self.engine.start()
             logger.info(f"Engine initialized with model at {self.model_path}")
             logger.info(f"  enable_graph: {self.enable_graph}")
+            logger.info(f"  enable_chunk_prefill_graph: {self.enable_chunk_prefill_graph}")
+            logger.info(f"  chunk_size: {self.chunk_size}")
             yield
             self.engine.stop()
 
@@ -572,6 +584,8 @@ def main():
         host=cfg.host,
         port=cfg.port,
         enable_graph=cfg.enable_graph,
+        enable_chunk_prefill_graph=cfg.enable_chunk_prefill_graph,
+        chunk_size=cfg.chunk_size,
         attn_backend=cfg.attn,
         ignore_eos=cfg.ignore_eos,
     )
diff --git a/scripts/infer_task.py b/scripts/infer_task.py
index 0d1231b77..1851f0a0a 100644
--- a/scripts/infer_task.py
+++ b/scripts/infer_task.py
@@ -10,6 +10,8 @@ def __init__(self, id, tokens, max_tokens, temperature, topk, topp, end_tokens):
         self.end_tokens = end_tokens
         self._kv_cache = None
         self.pos = 0
+        self._discard_output = False
+        self._remaining_tokens = None
 
     def bind_kvcache(self, kv_cache, pos=0):
         self._kv_cache = kv_cache
@@ -24,6 +26,25 @@ def release_kvcache(self):
     def kvcache(self):
         return self._kv_cache
 
+    def setup_chunked_prefill(self, chunk_size):
+        if chunk_size <= 0 or len(self.tokens) <= chunk_size:
+            return
+        self._remaining_tokens = self.tokens[chunk_size:]
+        self.tokens = self.tokens[:chunk_size]
+        self._discard_output = True
+
+    def advance_prefill_chunk(self, chunk_size):
+        self._kv_cache.update_tokens(self.tokens, self.pos)
+        self.pos += len(self.tokens)
+
+        if len(self._remaining_tokens) <= chunk_size:
+            self.tokens = self._remaining_tokens
+            self._remaining_tokens = None
+            self._discard_output = False
+        else:
+            self.tokens = self._remaining_tokens[:chunk_size]
+            self._remaining_tokens = self._remaining_tokens[chunk_size:]
+
     def next(self, out_token):
         self._kv_cache.update_tokens(self.tokens, self.pos)
 
diff --git a/scripts/launch_server.py b/scripts/launch_server.py
index d04d4f69d..0639a28b4 100644
--- a/scripts/launch_server.py
+++ b/scripts/launch_server.py
@@ -64,6 +64,13 @@ def parse_args():
         default=None,
         help="Max token sequence length that model will handle (follows model config if not provided)",
     )
+    parser.add_argument(
+        "--chunk-size",
+        type=int,
+        default=512,
+        help="Maximum number of tokens per prefill chunk (default: 512). "
+             "Set to 0 to disable chunked prefill.",
+    )
     parser.add_argument(
         "--awq",
         action="store_true",
@@ -86,8 +93,10 @@ def parse_args():
 USE_AWQ = args.awq
 USE_GPTQ = args.gptq
 MAX_BATCH = args.max_batch
+CHUNK_SIZE = args.chunk_size
 print(
-    f"Using MAX_BATCH={MAX_BATCH}. Try reduce this value if out of memory error occurs."
+    f"Using MAX_BATCH={MAX_BATCH}, CHUNK_SIZE={CHUNK_SIZE}. "
+    f"Try reduce these values if out of memory error occurs."
 )
 
 
@@ -163,32 +172,66 @@ async def lifespan(app: FastAPI):
 
 
 # App loop: take requests from the queue, do inference, and put unfinished requests back into the queue.
+# Uses priority scheduling: decode/short tasks first, then prefill chunks.
 def worker_loop(app):
+    pending_prefill = []  # Low priority: chunked prefill tasks; persists across rounds
+
     while True:
+        # Pull at most MAX_BATCH tasks per round; the rest stay queued so none are dropped
+        incoming = []
         try:
             task = app.state.request_queue.sync_q.get(timeout=0.01)
+            if task is None:
+                return
+            incoming.append(task)
         except queue.Empty:
-            continue
-
-        if task is None:
-            return
+            pass
 
-        batch = [task]
-        while len(batch) < MAX_BATCH:
+        while len(incoming) < MAX_BATCH:
             try:
-                req = app.state.request_queue.sync_q.get_nowait()
-                if req is not None:
-                    batch.append(req)
+                task = app.state.request_queue.sync_q.get_nowait()
+                if task is None:
+                    return
+                incoming.append(task)
             except queue.Empty:
                 break
+
+        # Separate into high priority (decode/new short) and low priority (prefill chunks)
+        high_priority = []  # rebuilt every round, so it must never hold more than one batch
+        for t in incoming:
+            if t._discard_output:
+                pending_prefill.append(t)
+            else:
+                high_priority.append(t)
+
+        # Build batch: high priority first, then fill with prefill chunks
+        batch = []
+        while high_priority and len(batch) < MAX_BATCH:
+            batch.append(high_priority.pop(0))
+        while pending_prefill and len(batch) < MAX_BATCH:
+            batch.append(pending_prefill.pop(0))
+
+        if not batch:
+            continue
+
         output_tokens = app.state.model.batch_infer_one_round(batch)
         for task, token in zip(batch, output_tokens):
-            task.output(token)
-            if task.finish_reason is None:
-                app.state.request_queue.sync_q.put(task)
+            if task._discard_output:  # mid-prefill chunk: the sampled token is ignored
+                task.advance_prefill_chunk(CHUNK_SIZE)
+                if task.finish_reason is None:
+                    if task._discard_output:  # more non-final chunks remain
+                        pending_prefill.append(task)
+                    else:  # final chunk is loaded; its output will be kept
+                        app.state.request_queue.sync_q.put(task)
+                else:
+                    app.state.kv_cache_pool.release_sync(task)
             else:
-                print(f"[INFO] Task {task.id} finished infer.")
-                app.state.kv_cache_pool.release_sync(task)
+                task.output(token)
+                if task.finish_reason is None:
+                    app.state.request_queue.sync_q.put(task)
+                else:
+                    print(f"[INFO] Task {task.id} finished infer.")
+                    app.state.kv_cache_pool.release_sync(task)
 
 
 def build_task(id_, request_data, request: Request):
@@ -214,6 +257,7 @@ async def chat_stream(id_, request_data, request: Request):
     try:
         infer_task = build_task(id_, request_data, request)
         await request.app.state.kv_cache_pool.acquire(infer_task)
+        infer_task.setup_chunked_prefill(CHUNK_SIZE)
 
         # Initial empty content
         chunk = json.dumps(
@@ -255,6 +299,7 @@ async def chat(id_, request_data, request: Request):
     try:
         infer_task = build_task(id_, request_data, request)
         await request.app.state.kv_cache_pool.acquire(infer_task)
+        infer_task.setup_chunked_prefill(CHUNK_SIZE)
         request.app.state.request_queue.sync_q.put(infer_task)
         output = []
         while True:

From bb68ca563604f079cfd35d3f5509b8e56ac654bf Mon Sep 17 00:00:00 2001
From: huidesheng <1832140001@qq.com>
Date: Wed, 13 May 2026 11:15:15 +0800
Subject: [PATCH 2/9] add chunk_prefill_compiler.cpp/.hpp

---
 .../compiler/chunk_prefill_compiler.cpp       | 186 ++++++++++++++++++
 .../compiler/chunk_prefill_compiler.hpp       |  42 ++++
 2 files changed, 228 insertions(+)
 create mode 100644 csrc/engine/compiler/chunk_prefill_compiler.cpp
 create mode 100644 csrc/engine/compiler/chunk_prefill_compiler.hpp

diff --git a/csrc/engine/compiler/chunk_prefill_compiler.cpp b/csrc/engine/compiler/chunk_prefill_compiler.cpp
new file mode 100644
index 000000000..266bd0e7e
--- /dev/null
+++ b/csrc/engine/compiler/chunk_prefill_compiler.cpp
@@ -0,0 +1,186 @@
+#include "chunk_prefill_compiler.hpp"
+#include "infinicore/context/context.hpp"
+
+
+namespace {
+inline void set_zeros(infinicore::Tensor &tensor) { // fill `tensor` with zero bytes by copying from a host buffer
+    std::vector<uint8_t> zeros(tensor->nbytes(), 0); // host buffer holding one zero byte per tensor byte
+    infinicore::context::memcpyH2D(tensor->data(), zeros.data(), tensor->nbytes(), false); // NOTE(review): if `false` selects an async copy, `zeros` may be destroyed before it completes — confirm
+}
+} // namespace
+
+namespace infinilm::engine {
+
+ChunkPrefillCompiler::ChunkPrefillCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
+    : GraphCompiler(model, barrier) {
+    // Enumerate chunk sizes for chunk-prefill
+    for (size_t cs : {64, 128, 256, 512, 1024, 2048}) {
+        chunk_sizes_.push_back(cs);
+    }
+    // Enumerate batch sizes for prefill (typically smaller than decode)
+    for (size_t b = 1; b < 32; b++) { // every size from 1 to 31
+        prefill_batch_sizes_.push_back(b);
+    }
+    for (size_t b = 32; b < 64; b += 8) { // 32, 40, 48, 56
+        prefill_batch_sizes_.push_back(b);
+    }
+    for (size_t b = 64; b < 128; b += 16) { // 64, 80, 96, 112
+        prefill_batch_sizes_.push_back(b);
+    }
+    for (size_t b = 128; b < 256; b += 32) { // 128, 160, 192, 224
+        prefill_batch_sizes_.push_back(b);
+    }
+    for (size_t b = 256; b <= 512; b += 64) { // 256, 320, 384, 448, 512 (upper bound inclusive)
+        prefill_batch_sizes_.push_back(b);
+    }
+}
+
+void ChunkPrefillCompiler::compile() {
+    if (model_->get_cache_config() != nullptr &&
+        dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())) {
+        // Record one graph per (batch size, chunk size) pair, using placeholder inputs.
+        const auto *paged_config =
+            dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config());
+        size_t nblocks = paged_config->num_blocks();
+
+        compiled_map_prefill_.clear();
+
+        // Max total tokens to avoid OOM during graph recording
+        constexpr size_t MAX_TOTAL_TOKENS = 4096;
+
+        // One shared buffer of nblocks entries backs block_tables for every graph:
+        // each (b, cs) graph below views it as [b, nblocks / b], which never
+        // exceeds nblocks elements.
+        block_tables_holder_ = infinicore::Tensor::empty(
+            {nblocks}, infinicore::DataType::I32, infinicore::context::getDevice());
+        set_zeros(block_tables_holder_);
+
+        for (size_t b : prefill_batch_sizes_) {
+            for (size_t cs : chunk_sizes_) {
+                size_t total_tokens = b * cs;
+                size_t bpr = nblocks / b; // block_per_req for this batch size
+                if (total_tokens > MAX_TOTAL_TOKENS || bpr == 0) {
+                    // Too many tokens to record, or more requests than cache blocks.
+                    continue;
+                }
+
+                InfinilmModel::Input input;
+
+                // input_ids: [1, total_tokens] — all tokens for this batch packed together
+                input.input_ids = infinicore::Tensor::empty(
+                    {1, total_tokens}, infinicore::DataType::I64, infinicore::context::getDevice());
+                set_zeros(input.input_ids.value());
+
+                // position_ids: [total_tokens]
+                input.position_ids = infinicore::Tensor::empty(
+                    {total_tokens}, infinicore::DataType::I64, infinicore::context::getDevice());
+                set_zeros(input.position_ids.value());
+
+                // total_sequence_lengths: [b], set to cs (first-chunk scenario)
+                input.total_sequence_lengths = infinicore::Tensor::empty(
+                    {b}, infinicore::DataType::I32, infinicore::context::getDevice());
+                {
+                    std::vector<int32_t> tsl(b, static_cast<int32_t>(cs));
+                    infinicore::context::memcpyH2D(
+                        input.total_sequence_lengths.value()->data(),
+                        tsl.data(), b * sizeof(int32_t), false);
+                }
+
+                // input_offsets: [b+1], stride = cs
+                input.input_offsets = infinicore::Tensor::empty(
+                    {b + 1}, infinicore::DataType::I32, infinicore::context::getDevice());
+                {
+                    std::vector<int32_t> offsets(b + 1);
+                    for (size_t i = 0; i <= b; i++) {
+                        offsets[i] = static_cast<int32_t>(i * cs);
+                    }
+                    infinicore::context::memcpyH2D(
+                        input.input_offsets.value()->data(),
+                        offsets.data(), (b + 1) * sizeof(int32_t), false);
+                }
+
+                // cu_seqlens: [b+1], same layout as input_offsets for prefill
+                input.cu_seqlens = infinicore::Tensor::empty(
+                    {b + 1}, infinicore::DataType::I32, infinicore::context::getDevice());
+                {
+                    std::vector<int32_t> cu(b + 1);
+                    for (size_t i = 0; i <= b; i++) {
+                        cu[i] = static_cast<int32_t>(i * cs);
+                    }
+                    infinicore::context::memcpyH2D(
+                        input.cu_seqlens.value()->data(),
+                        cu.data(), (b + 1) * sizeof(int32_t), false);
+                }
+
+                // block_tables: view into the shared holder [b, bpr]
+                input.block_tables = block_tables_holder_->as_strided(
+                    {b, bpr}, {(ptrdiff_t)bpr, 1});
+
+                // slot_mapping: [total_tokens]
+                input.slot_mapping = infinicore::Tensor::empty(
+                    {total_tokens}, infinicore::DataType::I64, infinicore::context::getDevice());
+                set_zeros(input.slot_mapping.value());
+
+                barrier_->wait();
+                infinicore::context::startGraphRecording();
+                auto output = model_->forward(input);
+                auto graph = infinicore::context::stopGraphRecording();
+                barrier_->wait();
+
+                auto shared_output = std::shared_ptr<InfinilmModel::Output>(
+                    new InfinilmModel::Output{infinicore::graph::GraphTensor(output.logits)});
+
+                compiled_map_prefill_[std::make_tuple(b, cs)] =
+                    CompiledResult{std::move(input), std::make_tuple(graph, shared_output)};
+            }
+        }
+    }
+}
+
+ChunkPrefillCompiler::Compiled ChunkPrefillCompiler::get_compiled(const InfinilmModel::Input &input) {
+    if (model_->get_cache_config() == nullptr ||
+        !dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())) {
+        return {nullptr, nullptr};
+    }
+    // All tensors copied below must be present; otherwise report "no graph" ({nullptr, nullptr}).
+    if (!input.block_tables.has_value() || !input.input_ids.has_value() ||
+        !input.position_ids.has_value() || !input.total_sequence_lengths.has_value() ||
+        !input.input_offsets.has_value() || !input.cu_seqlens.has_value() || !input.slot_mapping.has_value()) {
+        return {nullptr, nullptr};
+    }
+    size_t batch_size = input.block_tables.value()->size(0);
+    size_t block_per_req = input.block_tables.value()->size(1);
+    size_t total_tokens = input.input_ids.value()->size(1);
+
+    // Prefill: tokens divide evenly across a non-empty batch (batch_size == 0 would divide by zero)
+    if (batch_size == 0 || total_tokens == 0 || total_tokens % batch_size != 0) {
+        return {nullptr, nullptr};
+    }
+    size_t chunk_size = total_tokens / batch_size;
+    if (chunk_size <= 1) { // Single-token case belongs to decode
+        return {nullptr, nullptr};
+    }
+
+    auto result = compiled_map_prefill_.find(std::make_tuple(batch_size, chunk_size));
+    if (result == compiled_map_prefill_.end()) {
+        return {nullptr, nullptr};
+    }
+    auto &graph_input = result->second.input;
+    if (block_per_req > graph_input.block_tables.value()->size(1)) { // wider than the recorded block table
+        return {nullptr, nullptr};
+    }
+    graph_input.input_ids.value()->copy_from(input.input_ids.value());
+    graph_input.position_ids.value()->copy_from(input.position_ids.value());
+    graph_input.total_sequence_lengths.value()->copy_from(input.total_sequence_lengths.value());
+    graph_input.input_offsets.value()->copy_from(input.input_offsets.value());
+    graph_input.cu_seqlens.value()->copy_from(input.cu_seqlens.value());
+    graph_input.block_tables.value()->narrow({{1, 0, block_per_req}})->copy_from(input.block_tables.value());
+    graph_input.slot_mapping.value()->copy_from(input.slot_mapping.value());
+
+    auto graph = std::get<0>(result->second.compiled);
+    auto shared_output = std::shared_ptr<InfinilmModel::Output>(
+        new InfinilmModel::Output{std::get<1>(result->second.compiled)->logits->resume_from_blob_()});
+    return std::make_tuple(graph, shared_output);
+}
+
+} // namespace infinilm::engine
diff --git a/csrc/engine/compiler/chunk_prefill_compiler.hpp b/csrc/engine/compiler/chunk_prefill_compiler.hpp
new file mode 100644
index 000000000..bd701158a
--- /dev/null
+++ b/csrc/engine/compiler/chunk_prefill_compiler.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "graph_compiler.hpp"
+
+#include <unordered_map>
+
+namespace infinilm::engine {
+class ChunkPrefillCompiler : public GraphCompiler {
+public:
+    ChunkPrefillCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier);
+    // Records the graphs and fills the (batch_size, chunk_size) lookup map.
+    void compile() override;
+    // Looks up a recorded graph for `input`; returns {nullptr, nullptr} when none matches.
+    Compiled get_compiled(const InfinilmModel::Input &input) override;
+
+private:
+    struct TupleHash { // hash functor for the (batch_size, chunk_size) map key
+        size_t operator()(const std::tuple<size_t, size_t> &t) const noexcept {
+            auto h1 = std::hash<size_t>{}(std::get<0>(t));
+            auto h2 = std::hash<size_t>{}(std::get<1>(t));
+            return h1 ^ (h2 + 0x9e3779b97f4a7c15ULL + (h1 << 6) + (h1 >> 2)); // mixes the two element hashes
+        }
+    };
+
+    std::vector<size_t> chunk_sizes_;         // chunk sizes compile() iterates over
+    std::vector<size_t> prefill_batch_sizes_; // batch sizes compile() iterates over
+
+    infinicore::Tensor block_tables_holder_; // shared buffer that each recorded block_tables views into
+
+    struct CompiledResult { // one recorded graph plus the inputs it was recorded against
+        InfinilmModel::Input input;
+        Compiled compiled;
+    };
+
+    // Key: (batch_size, chunk_size)
+    std::unordered_map<
+        std::tuple<size_t, size_t>,
+        CompiledResult,
+        TupleHash>
+        compiled_map_prefill_;
+};
+} // namespace infinilm::engine

From c596fff4b8bd452fb501517789ee1b19ad1fc2a9 Mon Sep 17 00:00:00 2001
From: huidesheng <1832140001@qq.com>
Date: Fri, 15 May 2026 12:05:29 +0000
Subject: [PATCH 3/9] fix attn_metadata bug

---
 csrc/engine/compiler/chunk_prefill_compiler.cpp | 11 +++++++++++
 scripts/test_perf.py                            |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/csrc/engine/compiler/chunk_prefill_compiler.cpp b/csrc/engine/compiler/chunk_prefill_compiler.cpp
index 266bd0e7e..2b867800b 100644
--- a/csrc/engine/compiler/chunk_prefill_compiler.cpp
+++ b/csrc/engine/compiler/chunk_prefill_compiler.cpp
@@ -1,4 +1,5 @@
 #include "chunk_prefill_compiler.hpp"
+#include "../../global_state/global_state.hpp"
 #include "infinicore/context/context.hpp"
 
 
@@ -121,6 +122,16 @@ void ChunkPrefillCompiler::compile() {
                     {total_tokens}, infinicore::DataType::I64, infinicore::context::getDevice());
                 set_zeros(input.slot_mapping.value());
 
+                // Attention reads attn_metadata from thread-local forward context.
+                infinilm::global_state::get_forward_context().attn_metadata = {
+                    input.past_sequence_lengths,
+                    input.total_sequence_lengths,
+                    input.input_offsets,
+                    input.cu_seqlens,
+                    input.block_tables,
+                    input.slot_mapping,
+                };
+
                 barrier_->wait();
                 infinicore::context::startGraphRecording();
                 auto output = model_->forward(input);
diff --git a/scripts/test_perf.py b/scripts/test_perf.py
index 6a33d8f0d..74066ddc2 100644
--- a/scripts/test_perf.py
+++ b/scripts/test_perf.py
@@ -29,7 +29,7 @@
 
 NUM_REQUESTS = 64
 CONCURRENCY = 20
-API_URL = "http://127.0.0.1:8000"
+API_URL = "http://127.0.0.1:3456"
 MODEL = "FM9G-7B"
 
 

From 50d8eb268d0b515646fe16f883a4171d350c84fd Mon Sep 17 00:00:00 2001
From: huidesheng <1832140001@qq.com>
Date: Wed, 20 May 2026 11:46:49 +0000
Subject: [PATCH 4/9] fix schedule priority; add anti-starve mechanism; fix
 issue incompatible with Prefix Sharing

---
 .../compiler/chunk_prefill_compiler.cpp       |   2 +-
 python/infinilm/base_config.py                |   2 +-
 python/infinilm/llm/cache_manager.py          |  41 +++-
 python/infinilm/llm/request.py                |   2 +-
 python/infinilm/llm/scheduler.py              | 203 +++++++++++++-----
 .../processors/basic_llm_processor.py         |  40 ++--
 6 files changed, 207 insertions(+), 83 deletions(-)

diff --git a/csrc/engine/compiler/chunk_prefill_compiler.cpp b/csrc/engine/compiler/chunk_prefill_compiler.cpp
index 2b867800b..55ad56f31 100644
--- a/csrc/engine/compiler/chunk_prefill_compiler.cpp
+++ b/csrc/engine/compiler/chunk_prefill_compiler.cpp
@@ -15,7 +15,7 @@ namespace infinilm::engine {
 ChunkPrefillCompiler::ChunkPrefillCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
     : GraphCompiler(model, barrier) {
     // Enumerate chunk sizes for chunk-prefill
-    for (size_t cs : {64, 128, 256, 512, 1024, 2048}) {
+    for (size_t cs : {256}) {
         chunk_sizes_.push_back(cs);
     }
     // Enumerate batch sizes for prefill (typically smaller than decode)
diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py
index d7c32568e..5ef2a8ffb 100644
--- a/python/infinilm/base_config.py
+++ b/python/infinilm/base_config.py
@@ -125,7 +125,7 @@ def _add_common_args(self):
         )
         self.parser.add_argument("--enable-graph", action="store_true")
         self.parser.add_argument("--enable-chunk-prefill-graph", action="store_true", help="enable chunk-prefill graph compiling")
-        self.parser.add_argument("--chunk-size", type=int, default=512, help="tokens per chunked-prefill slice (0 to disable)")
+        self.parser.add_argument("--chunk-size", type=int, default=0, help="tokens per chunked-prefill slice (0 to disable)")
         self.parser.add_argument(
             "--enable-paged-attn",
             action="store_true",
diff --git a/python/infinilm/llm/cache_manager.py b/python/infinilm/llm/cache_manager.py
index 44ca13762..df9f19577 100644
--- a/python/infinilm/llm/cache_manager.py
+++ b/python/infinilm/llm/cache_manager.py
@@ -119,24 +119,51 @@ def allocate_blocks(
     ) -> tuple[List[int], List[int], int]:
         """Allocate cache blocks for new request with prefix caching support.
 
-        Args:
-            token_ids: Input token sequence
-            block_table: Existing block_table (for decode phase)
+        Idempotent: if block_table already fully covers token_ids with valid
+        (still-active) blocks, returns a consistent (block_table, slot_mapping,
+        num_cached_tokens=0) without re-allocating.
 
-        Returns:
-            Tuple of (block_table, slot_mapping, num_cached_tokens)
+        Convention: len(slot_mapping) == num_tokens - num_cached_tokens
+                    (one slot per token that needs to be (re)computed).
         """
         if block_table is None:
             block_table = []
 
         num_tokens = len(token_ids)
-        num_blocks = (num_tokens + self.block_size - 1) // self.block_size
+        if num_tokens == 0:
+            return [], [], 0
+
+        num_blocks_needed = (num_tokens + self.block_size - 1) // self.block_size
+
+        # -------------------------------------------------------------- #
+        # Idempotent re-entry path                                       #
+        # -------------------------------------------------------------- #
+        # If block_table already covers the prompt AND all those blocks
+        # are still alive (ref_count > 0), reconstruct slot_mapping from
+        # block_table and return num_cached_tokens=0 (i.e., the forward
+        # will recompute everything into the same slots — wasteful but
+        # always correct, and keeps the slot_mapping length convention).
+        if block_table and len(block_table) >= num_blocks_needed:
+            bt = list(block_table[:num_blocks_needed])
+            if all(self.blocks[bid].ref_count > 0 for bid in bt):
+                slot_mapping = [
+                    bt[i // self.block_size] * self.block_size + (i % self.block_size)
+                    for i in range(num_tokens)
+                ]
+                # length = num_tokens = num_tokens - 0 ✓ matches convention
+                return bt, slot_mapping, 0
+            # Otherwise the block_table is stale — drop it and re-allocate.
+            block_table = []
+
+        # -------------------------------------------------------------- #
+        # Below: original code unchanged                                 #
+        # -------------------------------------------------------------- #
         slot_mapping = []
         num_cached_tokens = 0
         prefix_hash = -1
         cache_miss = False
 
-        for block_idx in range(num_blocks):
+        for block_idx in range(num_blocks_needed):
             start_idx = block_idx * self.block_size
             end_idx = min(start_idx + self.block_size, num_tokens)
             block_tokens = token_ids[start_idx:end_idx]
diff --git a/python/infinilm/llm/request.py b/python/infinilm/llm/request.py
index 679b6e4db..ef5c8cd2e 100644
--- a/python/infinilm/llm/request.py
+++ b/python/infinilm/llm/request.py
@@ -196,7 +196,7 @@ def is_chunking(self) -> bool:
         return (
             self.chunk_size > 0
             and self.is_prefill
-            and self.prompt_length > self.chunk_size
+            and (self.prompt_length - self.num_cached_tokens) > self.chunk_size
         )
 
     def chunk_is_last(self) -> bool:
diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py
index 95a844804..5794de88b 100644
--- a/python/infinilm/llm/scheduler.py
+++ b/python/infinilm/llm/scheduler.py
@@ -28,10 +28,17 @@ def __init__(
 class Scheduler:
     """Request scheduler with integrated BlockManager for KV cache management.
 
-    Scheduling logic:
-    1. Running queue: Check for new blocks needed, update slot_mapping
-    2. Waiting queue: Try block reuse (prefix caching), allocate new blocks
-    3. Reference counting: Free blocks when requests complete
+    Scheduling priority (option A + B):
+      1. Decode (running_queue) — latency-sensitive, never starves anyone.
+      2. New prefill (waiting_queue) — preempts in-flight chunking so newly
+         arrived short requests don't wait for an entire long prefill.
+      3. Continue chunked-prefill (chunking_queue) — single-request batch.
+
+    Anti-starvation (option B):
+      After `max_waiting_yields` consecutive steps where waiting_queue won
+      over a non-empty chunking_queue, the next step is forced onto the
+      chunking_queue. This bounds the worst-case delay a long-prompt request
+      can suffer when there is a steady inflow of new short requests.
     """
 
     def __init__(
@@ -39,67 +46,137 @@ def __init__(
         max_batch_size: int = 16,
         num_blocks: int = 512,
         block_size: int = 256,
+        max_waiting_yields: int = 4,
     ):
         self.waiting_queue = janus.Queue()
         self.running_queue = janus.Queue()
-        # Requests in the middle of chunked-prefill — scheduled at high priority,
-        # single-request batches only (to match the C++ ChunkPrefillCompiler graph signature).
+        # Requests in the middle of chunked-prefill — single-request batch only
+        # (matches the C++ ChunkPrefillCompiler graph signature).
         self.chunking_queue = janus.Queue()
         self.max_batch_size = max_batch_size
 
         self.cache_manager = BlockManager(num_blocks=num_blocks, block_size=block_size)
         self.block_size = block_size
 
+        # ---- Anti-starvation state ----
+        # How many times waiting_queue has won over a non-empty chunking_queue
+        # since the last time chunking actually ran. Reset to 0 every time we
+        # run a chunking step.
+        self._waiting_yields_in_a_row: int = 0
+        # Upper bound on _waiting_yields_in_a_row before chunking is forced.
+        self.max_waiting_yields: int = max_waiting_yields
+
     def add_request(self, request: InferenceRequest):
         if request is not None:
             request.status = RequestStatus.WAITING
             self.waiting_queue.sync_q.put(request)
 
+    # ------------------------------------------------------------------ #
+    #  Main scheduling entrypoint                                        #
+    # ------------------------------------------------------------------ #
     def schedule(self) -> Optional[SchedulerOutput]:
         """Schedule and return batch of requests to execute.
 
-        Priority (mirrors launch_server.py chunked-prefill scheduling):
-          1. Running queue (decode) — short / latency-sensitive
-          2. Chunking queue (in-flight chunked-prefill) — single-request slice
-          3. Waiting queue (new prefill) — may start chunking if prompt is long
+        Priority (revised so chunk-prefill actually interleaves):
+          1. Decode (running_queue)
+          2. New prefill (waiting_queue)
+          3. Continue chunked-prefill (chunking_queue)
+
+        Order each call:
+          1. Try decode (running_queue).
+          2. If we've yielded to waiting `max_waiting_yields` times in a row
+             AND chunking_queue is non-empty → force chunking this step.
+          3. Otherwise try waiting (may start a new chunking session and
+             emit a single-request batch immediately).
+          4. Otherwise try chunking (continue an in-flight chunked-prefill).
+          5. Otherwise return None — nothing can be scheduled this call.
         """
-        # 2) Continue an in-flight chunked-prefill request (single-request batch).
-        try:
-            req = self.chunking_queue.sync_q.get_nowait()
-        except queue.Empty:
-            req = None
-        if req is not None:
+        # 1) Decode first — cheap and latency-sensitive.
+        decode_out = self._try_schedule_decode()
+        if decode_out is not None:
+            return decode_out
+
+        # 2) Forced chunking after too many consecutive yields.
+        if self._waiting_yields_in_a_row >= self.max_waiting_yields:
+            chunking_out = self._try_schedule_chunking()
+            if chunking_out is not None:
+                self._waiting_yields_in_a_row = 0
+                return chunking_out
+            # chunking_queue was actually empty — fall through to normal path.
+
+        # 3) Waiting queue — newly arrived prefill preempts in-flight chunking.
+        # Snapshot whether chunking had anything BEFORE we drain waiting,
+        # so we can decide whether this counts as a "yield over chunking".
+        # (qsize() is racy but only affects the counter by ±1 in rare edge cases — not correctness)
+        chunking_was_nonempty = self.chunking_queue.sync_q.qsize() > 0
+
+        waiting_out = self._try_schedule_waiting()
+        if waiting_out is not None:
+            if chunking_was_nonempty:
+                # We took a step that COULD have been chunking — count it.
+                self._waiting_yields_in_a_row += 1
+            else:
+                # No chunking to yield from; nothing was actually deferred.
+                self._waiting_yields_in_a_row = 0
+            return waiting_out
+
+        # 4) Continue an in-flight chunked-prefill request.
+        chunking_out = self._try_schedule_chunking()
+        if chunking_out is not None:
+            self._waiting_yields_in_a_row = 0
+            return chunking_out
+
+        return None
+
+    # ------------------------------------------------------------------ #
+    #  Per-queue schedulers                                              #
+    # ------------------------------------------------------------------ #
+    def _try_schedule_chunking(self) -> Optional[SchedulerOutput]:
+        """Pull one in-flight chunked-prefill request and emit a single-request batch.
+
+        Returns None once chunking_queue is empty. The C++ ChunkPrefillCompiler graph is keyed
+        on (batch_size, chunk_size); Python currently sends batch=1 — see chunk_prefill_compiler.cpp.
+        """
+        while True:
+            try:
+                req = self.chunking_queue.sync_q.get_nowait()
+            except queue.Empty:
+                return None  # nothing left to continue
             if req.is_finished():
+                # Drain finished entries silently and keep looking.
                 self.complete_requests([req])
-            else:
-                return SchedulerOutput(
-                    scheduled_requests=[req],
-                    is_prefill=True,
-                )
+                continue
+            return SchedulerOutput(
+                scheduled_requests=[req],
+                is_prefill=True,
+            )
 
-        scheduled_requests = []
-        is_prefill = False
+    def _try_schedule_waiting(self) -> Optional[SchedulerOutput]:
+        """Pull new prefill requests from waiting_queue and form a prefill batch.
+
+        If any request triggers chunked-prefill (prompt_length > chunk_size > 0),
+        it's emitted alone as a single-request batch (the chunking graph requires
+        a uniform chunk_size across the batch, and we don't mix chunking with
+        regular prefill in the same batch).
+        """
+        scheduled_requests: List[InferenceRequest] = []
 
-        # Process Waiting queue (prefill phase)
         while len(scheduled_requests) < self.max_batch_size:
             try:
                 req = self.waiting_queue.sync_q.get_nowait()
             except queue.Empty:
                 break
-            # Skip requests that were already finished (e.g., timed out/canceled while waiting)
+
+            # Skip requests that were already finished (timed out / canceled while waiting).
             if req.is_finished():
                 self.complete_requests([req])
                 continue
 
             if not self.can_accept_request(req):
+                # Put it back; we'll retry next tick when cache pressure eases.
                 self.waiting_queue.sync_q.put(req)
                 break
 
-            # Skip requests that were already finished (e.g., timed out/canceled while waiting)
-            if req.is_finished():
-                self.complete_requests([req])
-                continue
-
             req_tokens = req.get_input_tokens()
             num_required_blocks = req.get_num_blocks_required(self.block_size)
 
@@ -107,47 +184,53 @@ def schedule(self) -> Optional[SchedulerOutput]:
                 if not self.cache_manager.try_free_blocks(num_required_blocks):
                     raise RuntimeError("No available cache blocks for new request")
 
-            # Allocate blocks with automatic prefix caching support
-            req.block_table, req.slot_mapping, req.num_cached_tokens = (
-                self.cache_manager.allocate_blocks(req_tokens, req.block_table)
-            )
-
+            # Allocate blocks (prefix caching applied automatically).
+            if not req.block_table:
+                req.block_table, req.slot_mapping, req.num_cached_tokens = (
+                    self.cache_manager.allocate_blocks(req_tokens, req.block_table)
+                )
+                
             req.num_blocks = len(req.block_table)
             req.status = RequestStatus.RUNNING
 
-            # Start chunked-prefill: enqueue into chunking_queue and emit a
-            # single-request batch immediately. We don't mix chunked-prefill
-            # with other requests in the same batch — the C++ ChunkPrefillCompiler
-            # graph is keyed on (batch_size, chunk_size).
-            if req.chunk_size > 0 and req.prompt_length > req.chunk_size:
-                req.chunk_prefill_offset = 0
-                return SchedulerOutput(
-                    scheduled_requests=[req],
-                    is_prefill=True,
-                )
+            # Start chunked-prefill: emit a single-request batch immediately
+            # to keep the C++ graph signature stable. The request will be
+            # requeued into chunking_queue by llm._update_requests after each
+            # chunk runs.
+            remaining = req.prompt_length - req.num_cached_tokens
+            if req.chunk_size > 0 and remaining > req.chunk_size:
+                req.chunk_prefill_offset = req.num_cached_tokens
+                if scheduled_requests:
+                    for already in scheduled_requests:
+                        already.status = RequestStatus.WAITING
+                        self.waiting_queue.sync_q.put(already)
+                return SchedulerOutput([req], is_prefill=True)
 
             scheduled_requests.append(req)
 
-        # Return prefill batch if any waiting requests were scheduled
         if scheduled_requests:
-            is_prefill = True
             return SchedulerOutput(
                 scheduled_requests=scheduled_requests,
-                is_prefill=is_prefill,
+                is_prefill=True,
             )
+        return None
+
+    def _try_schedule_decode(self) -> Optional[SchedulerOutput]:
+        """Pull running_queue requests into a decode batch."""
+        scheduled_requests: List[InferenceRequest] = []
 
-        # Process Running queue (decode phase)
         while len(scheduled_requests) < self.max_batch_size:
             try:
                 req = self.running_queue.sync_q.get_nowait()
             except queue.Empty:
                 break
-            # Skip requests that were already finished (e.g., timed out/canceled while running)
+
+            # Skip requests that were already finished (timed out / canceled while running).
             if req.is_finished():
                 self.complete_requests([req])
                 continue
 
-            # Decode phase: allocate slot for newly generated token
+            # Decode phase: allocate slot for newly generated token.
             try:
                 req.block_table, new_slot = self.cache_manager.append_slot(
                     req.block_table, req.get_total_length(), req.get_all_token_ids()
@@ -156,26 +239,30 @@ def schedule(self) -> Optional[SchedulerOutput]:
                 req.num_blocks = len(req.block_table)
                 req.num_cached_tokens = req.get_total_length() - 1
                 scheduled_requests.append(req)
-
             except RuntimeError as e:
                 raise RuntimeError("No available cache blocks for new token") from e
 
-        # Return decode batch if any running requests were scheduled
         if scheduled_requests:
-            is_prefill = False
             return SchedulerOutput(
                 scheduled_requests=scheduled_requests,
-                is_prefill=is_prefill,
+                is_prefill=False,
             )
-
         return None
 
+    # ------------------------------------------------------------------ #
+    #  External hooks (unchanged behavior)                               #
+    # ------------------------------------------------------------------ #
     def requeue_chunking(self, req: InferenceRequest):
         """Put a request back into the chunking queue after a chunk has run."""
         self.chunking_queue.sync_q.put(req)
 
     def complete_requests(self, requests: List[InferenceRequest]):
-        """Handle completed requests and free their blocks."""
+        """Handle completed requests and free their blocks.
+
+        Active (non-finished) requests passed here are re-enqueued into the
+        running_queue — this is how prefill-finished requests migrate into
+        the decode pipeline.
+        """
         for req in requests:
             if req.status in [
                 RequestStatus.FINISHED,
@@ -235,4 +322,4 @@ def get_cache_stats(self) -> dict:
             "num_free_blocks": self.cache_manager.get_num_free_blocks(),
             "num_req_blocks": len(self.cache_manager.req_block_ids),
             "num_used_blocks": len(self.cache_manager.used_block_ids),
-        }
+        }
\ No newline at end of file
diff --git a/python/infinilm/processors/basic_llm_processor.py b/python/infinilm/processors/basic_llm_processor.py
index f5e603ba4..397e9068f 100644
--- a/python/infinilm/processors/basic_llm_processor.py
+++ b/python/infinilm/processors/basic_llm_processor.py
@@ -183,62 +183,72 @@ def _build_model_input_from_batch_scheduler_output(
         for req in scheduler_output.scheduled_requests:
             num_cached = req.num_cached_tokens
             if scheduler_output.is_prefill:
-                # Prefill phase
                 req_tokens = req.get_input_tokens()
 
                 # Chunked-prefill: only feed [chunk_prefill_offset : +chunk_size).
-                # past_kv_lengths = chunk_prefill_offset (attention sees the prefix
-                # already committed); total_kv_lengths = chunk_prefill_offset +
-                # len(tokens_to_compute). This keeps batch_size=1 and total_tokens
-                # == chunk_size so the C++ ChunkPrefillCompiler graph hits.
                 if req.is_chunking():
                     start = req.chunk_prefill_offset
                     end = min(start + req.chunk_size, len(req_tokens))
                     tokens_to_compute = req_tokens[start:end]
                     compute_len = len(tokens_to_compute)
                     tokens.extend(tokens_to_compute)
-                    seq_len = end  # attention prefix length after this chunk
+                    seq_len = end
                     seq_lens.append(seq_len)
                     current_offset += compute_len
                     seq_offsets.append(current_offset)
-                    slot_mapping.extend(req.slot_mapping[start:end])
+                    # req.slot_mapping has length (prompt_length - num_cached) and is
+                    # indexed [0..prompt_length-num_cached). Translate absolute token
+                    # indices to slot_mapping indices.
+                    slot_start = start - num_cached
+                    slot_end = end - num_cached
+                    assert slot_start >= 0 and slot_end <= len(req.slot_mapping), (
+                        f"chunking slot slice out of range: start={start} "
+                        f"end={end} num_cached={num_cached} "
+                        f"len(slot_mapping)={len(req.slot_mapping)}"
+                    )
+                    slot_mapping.extend(req.slot_mapping[slot_start:slot_end])
                     cached_lens.append(start)
                     position_ids.extend(range(start, end))
                 else:
                     tokens_to_compute = req_tokens[num_cached:]
                     tokens.extend(tokens_to_compute)
-
                     compute_len = len(tokens_to_compute)
                     seq_len = len(req_tokens)
                     seq_lens.append(seq_len)
-
                     current_offset += compute_len
                     seq_offsets.append(current_offset)
-
                     slot_mapping.extend(req.slot_mapping)
                     cached_lens.append(num_cached)
                     position_ids.extend(range(num_cached, num_cached + compute_len))
-
             else:
-                # Decode phase
                 seq_len = req.get_total_length()
                 last_token = req.generated_token_ids[-1]
                 tokens.append(last_token)
                 seq_lens.append(seq_len)
-
                 current_offset += 1
                 seq_offsets.append(current_offset)
-
                 slot_mapping.extend(req.slot_mapping)
                 cached_lens.append(num_cached)
                 position_ids.append(seq_len - 1)
 
-            # Pad block_table to same length
             padded_block_table = req.block_table + [-1] * (
                 max_block_table_len - len(req.block_table)
             )
             block_tables.append(padded_block_table)
             cu_seqlens.append(cu_seqlens[-1] + seq_len)
+            
+        # guarantee non-empty tokens and slot_mapping to avoid downstream errors. If empty, raise with detailed debug info.
+        if not tokens or not slot_mapping:
+            states = [
+                (r.request_id[:8], r.is_prefill, r.is_chunking(),
+                r.chunk_prefill_offset, r.prompt_length, r.num_cached_tokens,
+                len(r.slot_mapping), r.status.name)
+                for r in scheduler_output.scheduled_requests
+            ]
+            raise RuntimeError(
+                f"build_model_inputs got empty tokens/slot_mapping. "
+                f"is_prefill={scheduler_output.is_prefill}, states={states}"
+            )
 
         return {
             "input_ids": infinicore.from_list([tokens], dtype=infinicore.int64),

From 04fdd6a429be5d6bb955046ed9aec461ebc63df4 Mon Sep 17 00:00:00 2001
From: huidesheng <1832140001@qq.com>
Date: Thu, 21 May 2026 11:19:42 +0000
Subject: [PATCH 5/9] change priority to prefill first and fix GIL problems

---
 python/infinilm/llm/llm.py       |  9 +++++--
 python/infinilm/llm/scheduler.py | 46 +++++++++++++-------------------
 scripts/test_perf.py             |  2 +-
 3 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py
index 90de3edcc..652150a87 100644
--- a/python/infinilm/llm/llm.py
+++ b/python/infinilm/llm/llm.py
@@ -662,8 +662,13 @@ def _step_loop(self):
                 requests, pending = self.engine.step()
                 if not requests:
                     time.sleep(0.01)
-                elif pending:
-                    self._loop.call_soon_threadsafe(self._batch_put, pending)
+                else:
+                    if pending:
+                        self._loop.call_soon_threadsafe(self._batch_put, pending)
+                    # Yield GIL so the asyncio main thread can deliver tokens
+                    # to clients between inference steps. Without this, the step
+                    # thread monopolizes the GIL and token streaming stalls.
+                    time.sleep(0.0005)
             except Exception as e:
                 logger.error(f"Error in step loop: {e}", exc_info=True)
                 self._healthy = False
diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py
index 5794de88b..db0a09fb2 100644
--- a/python/infinilm/llm/scheduler.py
+++ b/python/infinilm/llm/scheduler.py
@@ -77,26 +77,17 @@ def add_request(self, request: InferenceRequest):
     def schedule(self) -> Optional[SchedulerOutput]:
         """Schedule and return batch of requests to execute.
 
-        Priority (revised so chunk-prefill actually interleaves):
-          1. Decode 
-          2. New prefill (waiting_queue)
-          3. Continue chunked-prefill (chunking_queue)
-
-        Order each call:
-          1. Try decode (running_queue).
-          2. If we've yielded to waiting `max_waiting_yields` times in a row
-             AND chunking_queue is non-empty → force chunking this step.
-          3. Otherwise try waiting (may start a new chunking session and
-             emit a single-request batch immediately).
-          4. Otherwise try chunking (continue an in-flight chunked-prefill).
-          5. Otherwise (rare) — try waiting again as a final fallback.
+        Priority (prefill first):
+        1. New prefill (waiting_queue) — minimize TTFT for new requests.
+        2. Decode (running_queue).
+        3. Continue chunked-prefill (chunking_queue).
+
+        Anti-starvation (only guards chunking against waiting):
+        After `max_waiting_yields` consecutive steps where waiting_queue won
+        over a non-empty chunking_queue, the next step is forced onto the
+        chunking_queue.
         """
-        # 1) Decode first — cheap and latency-sensitive.
-        decode_out = self._try_schedule_decode()
-        if decode_out is not None:
-            return decode_out
-
-        # 2) Forced chunking after too many consecutive yields.
+        # 0) Forced chunking after too many consecutive waiting yields.
         if self._waiting_yields_in_a_row >= self.max_waiting_yields:
             chunking_out = self._try_schedule_chunking()
             if chunking_out is not None:
@@ -104,23 +95,24 @@ def schedule(self) -> Optional[SchedulerOutput]:
                 return chunking_out
             # chunking_queue was actually empty — fall through to normal path.
 
-        # 3) Waiting queue — newly arrived prefill preempts in-flight chunking.
-        # Snapshot whether chunking had anything BEFORE we drain waiting,
-        # so we can decide whether this counts as a "yield over chunking".
-        # (qsize() is racy but only affects the counter by ±1 in rare edge cases — not correctness)
+        # 1) New prefill — highest priority.
+        # Snapshot chunking emptiness BEFORE running waiting, so we can decide
+        # whether this counts as a "yield over chunking".
         chunking_was_nonempty = self.chunking_queue.sync_q.qsize() > 0
-
         waiting_out = self._try_schedule_waiting()
         if waiting_out is not None:
             if chunking_was_nonempty:
-                # We took a step that COULD have been chunking — count it.
                 self._waiting_yields_in_a_row += 1
             else:
-                # No chunking to yield from; nothing was actually deferred.
                 self._waiting_yields_in_a_row = 0
             return waiting_out
 
-        # 4) Continue an in-flight chunked-prefill request.
+        # 2) Decode.
+        decode_out = self._try_schedule_decode()
+        if decode_out is not None:
+            return decode_out
+
+        # 3) Continue an in-flight chunked-prefill request.
         chunking_out = self._try_schedule_chunking()
         if chunking_out is not None:
             self._waiting_yields_in_a_row = 0
diff --git a/scripts/test_perf.py b/scripts/test_perf.py
index 74066ddc2..3e4116f54 100644
--- a/scripts/test_perf.py
+++ b/scripts/test_perf.py
@@ -29,7 +29,7 @@
 
 NUM_REQUESTS = 64
 CONCURRENCY = 20
-API_URL = "http://127.0.0.1:3456"
+API_URL = "http://127.0.0.1:2333"
 MODEL = "FM9G-7B"
 
 

From 8f0ecbb88956a57ad0bf06077fb7df8de3f221ba Mon Sep 17 00:00:00 2001
From: huidesheng <1832140001@qq.com>
Date: Tue, 26 May 2026 06:29:32 +0000
Subject: [PATCH 6/9] fix decode and chunk yield issue

---
 python/infinilm/llm/scheduler.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py
index db0a09fb2..65b748630 100644
--- a/python/infinilm/llm/scheduler.py
+++ b/python/infinilm/llm/scheduler.py
@@ -95,9 +95,7 @@ def schedule(self) -> Optional[SchedulerOutput]:
                 return chunking_out
             # chunking_queue was actually empty — fall through to normal path.
 
-        # 1) New prefill — highest priority.
-        # Snapshot chunking emptiness BEFORE running waiting, so we can decide
-        # whether this counts as a "yield over chunking".
+        # 1) New prefill 
         chunking_was_nonempty = self.chunking_queue.sync_q.qsize() > 0
         waiting_out = self._try_schedule_waiting()
         if waiting_out is not None:
@@ -108,8 +106,13 @@ def schedule(self) -> Optional[SchedulerOutput]:
             return waiting_out
 
         # 2) Decode.
+        chunking_was_nonempty = self.chunking_queue.sync_q.qsize() > 0
         decode_out = self._try_schedule_decode()
         if decode_out is not None:
+            if chunking_was_nonempty:
+                self._waiting_yields_in_a_row += 1
+            else:
+                self._waiting_yields_in_a_row = 0
             return decode_out
 
         # 3) Continue an in-flight chunked-prefill request.

From b27b5470406134e6f9ea91efba985671c695ac7c Mon Sep 17 00:00:00 2001
From: huidesheng <1832140001@qq.com>
Date: Thu, 28 May 2026 04:57:42 +0000
Subject: [PATCH 7/9] add batch chunk

---
 python/infinilm/llm/llm.py       |  23 ++--
 python/infinilm/llm/scheduler.py |  27 +++--
 scripts/test_chunk_prefill.py    | 194 +++++++++++++++++++++++++++++++
 scripts/test_perf_cp.py          | 156 +++++++++++++++++++++++++
 scripts/test_perf_mix.py         | 173 +++++++++++++++++++++++++++
 5 files changed, 548 insertions(+), 25 deletions(-)
 create mode 100644 scripts/test_chunk_prefill.py
 create mode 100644 scripts/test_perf_cp.py
 create mode 100644 scripts/test_perf_mix.py

diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py
index 652150a87..dd8687e52 100644
--- a/python/infinilm/llm/llm.py
+++ b/python/infinilm/llm/llm.py
@@ -221,9 +221,8 @@ def _update_requests(
         # and re-enqueue the request to keep chunking.
         chunk_mid_step = (
             is_prefill
-            and len(requests) == 1
-            and requests[0].is_chunking()
-            and not requests[0].chunk_is_last()
+            and len(requests) > 0
+            and all(r.is_chunking() and not r.chunk_is_last() for r in requests)
         )
 
         if is_prefill and not chunk_mid_step:
@@ -236,16 +235,14 @@ def _update_requests(
                     raise ValueError(f"Unsupported cache_type: {self.cache_type}")
 
         if chunk_mid_step:
-            req = requests[0]
-            req.chunk_prefill_offset += req.chunk_size
-            # If this request was aborted while chunking, drop it.
-            if req.is_aborted():
-                logger.info(
-                    f"Request {req.request_id} aborted by client during chunked-prefill"
-                )
-                return []
-            # Re-enqueue to keep producing chunks; no token sampled yet.
-            self.scheduler.requeue_chunking(req)
+            for req in requests:
+                req.chunk_prefill_offset += req.chunk_size
+                if req.is_aborted():
+                    logger.info(
+                        f"Request {req.request_id} aborted by client during chunked-prefill"
+                    )
+                    continue
+                self.scheduler.requeue_chunking(req)
             return []
 
         pending = []
diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py
index 65b748630..e235842a3 100644
--- a/python/infinilm/llm/scheduler.py
+++ b/python/infinilm/llm/scheduler.py
@@ -127,24 +127,27 @@ def schedule(self) -> Optional[SchedulerOutput]:
     #  Per-queue schedulers                                              #
     # ------------------------------------------------------------------ #
     def _try_schedule_chunking(self) -> Optional[SchedulerOutput]:
-        """Pull one in-flight chunked-prefill request and emit a single-request batch.
-
-        The C++ ChunkPrefillCompiler graph is keyed on (batch_size, chunk_size).
-        Python currently sends batch=1 — see chunk_prefill_compiler.cpp.
-        """
-        while True:
+        scheduled: List[InferenceRequest] = []
+        while len(scheduled) < self.max_batch_size:
             try:
                 req = self.chunking_queue.sync_q.get_nowait()
             except queue.Empty:
-                return None
+                break
             if req.is_finished():
-                # Drain finished entries silently and keep looking.
                 self.complete_requests([req])
                 continue
-            return SchedulerOutput(
-                scheduled_requests=[req],
-                is_prefill=True,
-            )
+            # A last chunk (partial, or a final full chunk of exactly chunk_size) runs on its own.
+            # Author's note: it cannot share a batch with mid chunks, since only the last chunk samples and commits blocks.
+            if req.chunk_is_last():
+                if not scheduled:
+                    return SchedulerOutput([req], is_prefill=True)
+                # Mid chunks already collected: re-enqueue this request so a later step runs it alone. NOTE(review): original comment said "head of the queue", but this is a plain put() - confirm ordering.
+                self.chunking_queue.sync_q.put(req)
+                break
+            scheduled.append(req)
+        if scheduled:
+            return SchedulerOutput(scheduled, is_prefill=True)
+        return None
 
     def _try_schedule_waiting(self) -> Optional[SchedulerOutput]:
         """Pull new prefill requests from waiting_queue and form a prefill batch.
diff --git a/scripts/test_chunk_prefill.py b/scripts/test_chunk_prefill.py
new file mode 100644
index 000000000..90d01b9e5
--- /dev/null
+++ b/scripts/test_chunk_prefill.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+"""
+Compare chunked-prefill ON vs. OFF performance in one run.
+
+Starts inference_server.py twice (chunk-size=0, then 256), runs test_perf_mix.py against each, and prints a comparison.
+"""
+
+import argparse
+import os
+import re
+import signal
+import subprocess
+import sys
+import time
+
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+LM_DIR = os.path.dirname(SCRIPT_DIR)
+INFERENCE_SERVER = os.path.join(LM_DIR, "python", "infinilm", "server", "inference_server.py")
+TEST_SCRIPT = os.path.join(SCRIPT_DIR, "test_perf_mix.py")
+
+
+from openai import OpenAI, APIConnectionError, APIStatusError
+
+def wait_for_server(popen, host, port, model, timeout=300):
+    """Block until the server answers a one-token chat completion.
+
+    Raises RuntimeError if the process exits first; TimeoutError after `timeout` s.
+    """
+    probe = OpenAI(base_url=f"http://{host}:{port}", api_key="default")
+    give_up_at = time.time() + timeout
+    while time.time() < give_up_at:
+        if popen.poll() is not None:
+            raise RuntimeError(f"server exited early with code {popen.returncode}")
+        try:
+            probe.chat.completions.create(model=model, messages=[{"role": "user", "content": "hi"}], max_tokens=1)
+            return
+        except (APIConnectionError, APIStatusError):
+            time.sleep(1)  # not ready yet; poll again in a second
+    raise TimeoutError(f"server not ready within {timeout}s")
+
+
+def inference_server(chunk_size, device, port, batch_size, max_new_tokens, enable_paged_attn, enable_graph, model_path, visible_devices="12"):
+    """Launch inference_server.py in its own session and return the Popen handle.
+
+    argv is passed as a list (no shell), so a model path containing spaces or
+    shell metacharacters reaches the server intact. `visible_devices` is set as
+    CUDA_VISIBLE_DEVICES for the child process only.
+    """
+    cmd = [sys.executable, INFERENCE_SERVER,
+           "--chunk-size", str(chunk_size), "--device", str(device), "--port", str(port),
+           "--batch-size", str(batch_size), "--max-new-tokens", str(max_new_tokens),
+           "--model", str(model_path)]
+    cmd += ["--enable-paged-attn"] if enable_paged_attn else []
+    cmd += ["--enable-graph"] if enable_graph else []
+    print(INFERENCE_SERVER)
+    env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(visible_devices))
+    return subprocess.Popen(cmd, env=env, start_new_session=True, stderr=subprocess.STDOUT)
+
+
+import socket
+
+def stop_server(popen, host="127.0.0.1", port=2333, timeout=30):
+    """SIGTERM (then SIGKILL) the server's process group and wait for the port to close."""
+    if popen and popen.poll() is None:
+        os.killpg(os.getpgid(popen.pid), signal.SIGTERM)
+        try:
+            popen.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            os.killpg(os.getpgid(popen.pid), signal.SIGKILL)
+            popen.wait(timeout=5)
+
+    # Wait until the port stops accepting (author's note: uvicorn keeps it open during graceful shutdown).
+    give_up_at = time.time() + timeout
+    while time.time() < give_up_at:
+        try:
+            socket.create_connection((host, port), timeout=0.5).close()  # still listening, keep waiting
+        except OSError:
+            return  # port released
+        time.sleep(0.3)
+    raise RuntimeError(f"port {port} still in use after stop_server")
+
+
+def run_test_perf():
+    """Run TEST_SCRIPT, echoing its output live; return (exit code, captured output)."""
+    # List argv (no shell) so a script path containing spaces is passed through intact.
+    proc = subprocess.Popen(
+        [sys.executable, "-u", TEST_SCRIPT], text=True, bufsize=1,
+        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+    )
+    captured = []
+    for line in proc.stdout:
+        sys.stdout.write(line)
+        sys.stdout.flush()  # forward child output to the parent terminal in real time
+        captured.append(line)
+    return proc.wait(), "".join(captured)  # exit code + combined stdout/stderr text
+
+def parse_stats(output):
+    """Pull summary metrics out of benchmark output text; a metric that is absent maps to None."""
+    patterns = {
+        "avg_ttft_s":       (float, r"Average TTFT\s*:\s*([0-9.]+)\s*s"),
+        "avg_e2e_s":        (float, r"Average latency\s*:\s*([0-9.]+)\s*s"),
+        "avg_ms_per_token": (float, r"Avg time per token\s*:\s*([0-9.]+)\s*ms/token"),
+        "avg_tps":          (float, r"Avg Token generation speed\s*:\s*([0-9.]+)"),
+        "rps":              (float, r"请求速率 \(RPS\)\s*:\s*([0-9.]+)"),
+        "success":          (int,   r"成功请求数\s*:\s*(\d+)"),
+    }
+    def extract(cast, pat):
+        hit = re.search(pat, output)
+        return cast(hit.group(1)) if hit else None
+    return {key: extract(cast, pat) for key, (cast, pat) in patterns.items()}
+
+if __name__ == "__main__":
+    # argparse's type=bool maps every non-empty string (even "False") to True, so parse explicitly.
+    def as_bool(text):
+        return text.strip().lower() not in ("", "0", "false", "no", "off")
+    parser = argparse.ArgumentParser(description="比较 chunked prefill 开/关的 TTFT/E2E")
+    parser.add_argument("--device", type=str, default="iluvatar", help="设备类型")
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--max-new-tokens", type=int, default=16)
+    parser.add_argument("--enable-paged-attn", type=as_bool, default=True)
+    parser.add_argument("--enable-graph", type=as_bool, default=True)
+    parser.add_argument("--port", type=int, default=2333)
+    parser.add_argument("--model-path", type=str, default="/data-aisoft/mechdancer/models/9g_8b_thinking_llama/")
+    args = parser.parse_args()
+
+    results = []
+
+    for chunk_size in (0, 256):
+        mode = "ON" if chunk_size > 0 else "OFF"
+        print("\n" + "="*78)
+        print(f"开始部署大模型推理服务，chunked prefill {mode} (chunk-size={chunk_size})，请等待服务启动完成...")
+
+        server = inference_server(chunk_size=chunk_size, device=args.device, port=args.port,
+                                batch_size=args.batch_size, max_new_tokens=args.max_new_tokens,
+                                enable_paged_attn=args.enable_paged_attn, enable_graph=args.enable_graph,
+                                model_path=args.model_path)
+        try:
+            wait_for_server(server, "127.0.0.1", args.port, model="FM9G-7B", timeout=300)
+            print("服务启动完成，开始跑 test_perf_mix.py (上一条200OK请求为服务测试成功标志)")
+            retcode, out = run_test_perf()
+            if retcode != 0:
+                print(f"{os.path.basename(TEST_SCRIPT)} 执行失败，退出码", retcode)
+                print(out)
+                raise SystemExit(1)
+            stats = parse_stats(out)
+            stats.update({"chunk_size": chunk_size, "mode": mode})
+            results.append(stats)
+            print(f"完成chunked prefill {mode}测试 -> {stats}")
+
+        finally:
+            stop_server(server, host="127.0.0.1", port=args.port)
+            print("服务已停止")
+
+    print("\n" + "#"*78)
+    print("最终对比（chunked prefill ON vs OFF）")
+    print("-"*78)
+
+    def fmt(v, unit="", spec=".3f"):
+        return "N/A" if v is None else f"{v:{spec}}{unit}"
+
+    def diff(a, b):
+        return None if (a is None or b is None) else a - b
+
+    def speedup_pct(on, off, lower_is_better=True):
+        # Percent change relative to the OFF baseline; positive means ON is better.
+        if on is None or off is None or off == 0:
+            return None
+        return ((off - on) if lower_is_better else (on - off)) / off * 100
+
+    on_r  = next((r for r in results if r["mode"] == "ON"),  None)
+    off_r = next((r for r in results if r["mode"] == "OFF"), None)
+
+    print(f"{'指标':<22}{'ON':>14}{'OFF':>14}{'Δ (ON-OFF)':>16}{'ON 提升':>12}")
+    print("-"*78)
+
+    def row(label, key, unit, spec=".3f", lower_is_better=True):
+        a = (on_r  or {}).get(key)
+        b = (off_r or {}).get(key)
+        pct = speedup_pct(a, b, lower_is_better)
+        print(
+            f"{label:<22}"
+            f"{fmt(a, unit, spec):>14}"
+            f"{fmt(b, unit, spec):>14}"
+            f"{fmt(diff(a, b), unit, '+'+spec):>16}"
+            f"{fmt(pct, '%', '+.2f'):>12}"
+        )
+
+    row("Avg TTFT",        "avg_ttft_s",       " s")
+    row("Avg E2E latency", "avg_e2e_s",        " s")
+    row("Avg ms/token",    "avg_ms_per_token", " ms", ".2f")
+    row("Avg tokens/s",    "avg_tps",          "",    ".2f", lower_is_better=False)
+    row("RPS",             "rps",              "",    ".2f", lower_is_better=False)
+    print("-"*78)
diff --git a/scripts/test_perf_cp.py b/scripts/test_perf_cp.py
new file mode 100644
index 000000000..4e55bae0f
--- /dev/null
+++ b/scripts/test_perf_cp.py
@@ -0,0 +1,156 @@
+"""
+Chunked Prefill TTFT Benchmark
+
+Test: send a long request, wait a short delay, then send a short request.
+Measure the short request's TTFT and E2E.
+
+With chunked prefill: short request inserts at next chunk boundary → lower TTFT
+Without chunked prefill: short request waits for full long prefill → higher TTFT
+
+Usage:
+  python3 scripts/test_perf_cp.py [--rounds 5] [--delay 0.1]
+"""
+import asyncio
+import time
+from openai import AsyncOpenAI
+import argparse
+
+API_URL = "http://127.0.0.1:2333"
+MODEL = "jiuge"
+MAX_TOKENS = 30 # maximum number of tokens to decode per request
+
+_BASE_PARAGRAPHS = [
+    '''人工智能（Artificial Intelligence，简称AI）是计算机科学的一个分支，它企图了解智能的实质，并生产出一种新的能以人类智能相似的方式做出反应的智能机器。人工智能的研究包括机器人、语言识别、图像识别、自然语言处理和专家系统等。人工智能从诞生以来，理论和技术日益成熟，应用领域也不断扩大。可以设想，未来人工智能带来的科技产品，将会是人类智慧的容器。''',
+    '''1956年夏季，以麦卡赛、明斯基、罗切斯特和申农等为首的一批有远见卓识的年轻科学家在一起聚会，共同研究和探讨用机器模拟智能的一系列有关问题，并首次提出了人工智能这一术语，它标志着人工智能这门新兴学科的正式诞生。此后，IBM公司研制的专用计算机深蓝击败了国际象棋世界冠军卡斯帕罗夫。谷歌公司开发的AlphaGo程序战胜了围棋世界冠军李世石，这被认为是人工智能发展史上的一个重要里程碑。''',
+    '''量子计算是一种利用量子力学原理进行信息处理的计算方式。与经典计算机使用比特作为信息的基本单位不同，量子计算机使用量子比特。量子比特具有叠加态的特性，即一个量子比特可以同时处于0和1的状态，这使得量子计算机在处理某些特定问题时具有经典计算机无法比拟的优势。量子纠缠是量子计算中另一个关键概念，当两个量子比特发生纠缠时，测量其中一个的状态会立即影响另一个的状态。''',
+    '''根据联合国政府间气候变化专门委员会第六次评估报告，全球平均温度已经比工业化前水平上升了约1.1摄氏度。报告指出，人类活动是导致全球变暖的主要原因，其中化石燃料的燃烧、工业生产和土地利用变化是温室气体排放的主要来源。极端天气事件的频率和强度都在增加，包括热浪、干旱、暴雨和洪水。北极海冰面积持续缩小，格陵兰和南极冰盖加速融化。''',
+    '''深度学习是机器学习的一个分支，其核心是利用多层神经网络从大量数据中自动学习特征表示。卷积神经网络在图像识别领域取得了巨大成功，循环神经网络和Transformer架构则在自然语言处理领域展现了强大能力。近年来，大语言模型如GPT、BERT、LLaMA等引领了自然语言处理的技术革新，这些模型通过在海量文本数据上进行预训练，获得了强大的语言理解和生成能力。''',
+    '''在计算机体系结构领域，冯诺依曼架构仍然是现代计算机的基础。然而随着摩尔定律逐渐放缓，研究人员开始探索新型计算范式，包括神经形态计算、存内计算、光子计算等。GPU和TPU等专用加速器的发展极大推动了深度学习的进步。RISC-V开源指令集架构的兴起为芯片设计带来了新的可能性，而chiplet技术和先进封装则为突破制程限制提供了新的路径。''',
+    '''可再生能源的成本大幅下降，太阳能和风能已经成为最便宜的新增发电来源。电动汽车市场快速增长，电池技术不断进步。碳捕获和储存技术、绿色氢能等前沿技术也在加速发展。然而，要实现全球碳中和目标，仍需要在能源系统、交通运输、工业生产、建筑等领域进行深刻的变革。智能电网、储能技术、虚拟电厂等概念正在从理论走向实践。''',
+    '''生物信息学是一门利用计算机技术和数学方法研究生物学问题的交叉学科。基因组学、蛋白质组学、代谢组学等组学技术的发展产生了海量的生物数据。AlphaFold2在蛋白质结构预测方面取得了革命性突破，为药物研发和生命科学研究开辟了新的方向。CRISPR基因编辑技术的发展使得精准修改基因成为可能，为遗传疾病的治疗带来了希望。''',
+]
+
+
+def build_long_prompt(idx, target_chars=9000):
+    """Return a prompt of at least ``target_chars`` characters built by cycling
+    through ``_BASE_PARAGRAPHS`` starting at paragraph ``idx``."""
+    text = f"(文档编号{idx}) 请仔细阅读以下学术材料并总结：\n\n"
+    cursor = idx
+    while len(text) < target_chars:  # length is tested before the closing sentence is added
+        text += _BASE_PARAGRAPHS[cursor % len(_BASE_PARAGRAPHS)] + "\n\n"
+        cursor += 1
+    return text + f"以上是第{idx}份材料，请给出详细分析。"
+
+
+async def measure_one_round(client, round_idx, delay_sec):
+    """Send a long request and, ``delay_sec`` seconds later, a short one; both run concurrently.
+
+    Returns ``(long_result, short_result)``. Each dict holds ``ttft`` (seconds to the first content
+    chunk, or ``None`` if none arrived), ``e2e`` (seconds until the stream ended) and ``tokens`` (content chunks).
+    """
+    long_prompt = build_long_prompt(round_idx, target_chars=9000)
+    short_prompt = f"(编号{round_idx}) 1+1等于几？"
+    long_result, short_result = {}, {}
+
+    async def do_request(prompt, result_dict, delay=0):
+        if delay > 0:
+            await asyncio.sleep(delay)
+        t0 = time.time()
+        stream = await client.chat.completions.create(
+            model=MODEL,
+            messages=[{"role": "user", "content": prompt}],
+            stream=True,
+            max_tokens=MAX_TOKENS,  # the OpenAI client rejects the non-standard `max_new_tokens` keyword
+            temperature=1.0, top_p=1.0,
+            extra_body={"top_k": 1},
+        )
+        first_token_time = None
+        total_tokens = 0
+        async for chunk in stream:
+            if not chunk.choices:  # a chunk may carry no choices (e.g. usage-only)
+                continue
+            choice = chunk.choices[0]
+            if choice.delta.content:
+                if first_token_time is None:
+                    first_token_time = time.time()
+                total_tokens += 1
+            if choice.finish_reason is not None:
+                break
+        end_time = time.time()
+        result_dict["ttft"] = (first_token_time - t0) if first_token_time else None
+        result_dict["e2e"] = end_time - t0
+        result_dict["tokens"] = total_tokens
+
+    await asyncio.gather(
+        do_request(long_prompt, long_result, delay=0),
+        do_request(short_prompt, short_result, delay=delay_sec),
+    )
+    return long_result, short_result
+
+
+async def run_benchmark(rounds, delay):
+    client = AsyncOpenAI(base_url=API_URL, api_key="default")
+
+    # Warmup
+    print("Warmup...")
+    await measure_one_round(client, 100, delay)
+    print("Warmup done.\n")
+
+    long_ttfts = []
+    long_e2es = []
+    short_ttfts = []
+    short_e2es = []
+
+    for i in range(rounds):
+        lr, sr = await measure_one_round(client, i, delay) # lr = long request result, sr = short request result
+
+        lt = lr["ttft"] * 1000 if lr["ttft"] else 0
+        le = lr["e2e"] * 1000
+        st = sr["ttft"] * 1000 if sr["ttft"] else 0
+        se = sr["e2e"] * 1000
+
+        print(f"  Round {i}: LONG  TTFT={lt:>7.1f}ms  E2E={le:>8.1f}ms  tokens={lr['tokens']}")
+        print(f"           SHORT TTFT={st:>7.1f}ms  E2E={se:>8.1f}ms  tokens={sr['tokens']}")
+
+        if lr["ttft"]:
+            long_ttfts.append(lr["ttft"])
+        long_e2es.append(lr["e2e"])
+        if sr["ttft"]:
+            short_ttfts.append(sr["ttft"])
+        short_e2es.append(sr["e2e"])
+
+    sep = "=" * 60
+    print(f"\n{sep}")
+    print(f"  Chunked Prefill TTFT Benchmark")
+    print(f"{sep}")
+    print(f"  Rounds: {rounds}")
+    print(f"  Delay before short request: {delay}s")
+    print(f"  Long prompt: ~9000 chars")
+    print(f"  Max tokens: {MAX_TOKENS}")
+
+    def print_stats(label, ttfts, e2es):
+        if not ttfts:
+            return
+        print(f"\n  [{label}]")
+        print(f"    Avg TTFT: {sum(ttfts)/len(ttfts)*1000:>8.1f} ms")
+        print(f"    Min TTFT: {min(ttfts)*1000:>8.1f} ms")
+        print(f"    Max TTFT: {max(ttfts)*1000:>8.1f} ms")
+        print(f"    Avg E2E:  {sum(e2es)/len(e2es)*1000:>8.1f} ms")
+
+    print_stats("LONG ", long_ttfts, long_e2es)
+    print_stats("SHORT", short_ttfts, short_e2es)
+
+    if short_ttfts:
+        print(f"\n  >>> SHORT Avg TTFT = {sum(short_ttfts)/len(short_ttfts)*1000:.1f} ms <<<")
+        print(f"  >>> SHORT Avg E2E  = {sum(short_e2es)/len(short_e2es)*1000:.1f} ms <<<")
+    print(f"{sep}\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--rounds", type=int, default=5)
+    parser.add_argument("--delay", type=float, default=0.1,
+                        help="Seconds to wait before sending short request (default: 0.1)")
+    args = parser.parse_args()
+
+    asyncio.run(run_benchmark(args.rounds, args.delay))
diff --git a/scripts/test_perf_mix.py b/scripts/test_perf_mix.py
new file mode 100644
index 000000000..a9dd433e9
--- /dev/null
+++ b/scripts/test_perf_mix.py
@@ -0,0 +1,173 @@
+import asyncio
+import time
+from openai import AsyncOpenAI
+import argparse
+import random
+
+PROMPTS = [
+
+    # ~10000 tokens：极限长上下文，多文件代码重构
+    "下面给出 4 个相关文件，请重构以消除重复逻辑并提取公共抽象：\n\n"
+    + "# file: scheduler_v1.py\n" + "def schedule(reqs):\n    return sorted(reqs, key=lambda r: r.arrival)\n" * 100
+    + "\n# file: scheduler_v2.py\n" + "def schedule(reqs):\n    return sorted(reqs, key=lambda r: -r.priority)\n" * 100
+    + "\n# file: scheduler_v3.py\n" + "def schedule(reqs):\n    return sorted(reqs, key=lambda r: r.prompt_len)\n" * 100
+    + "\n# file: scheduler_v4.py\n" + "def schedule(reqs):\n    return sorted(reqs, key=lambda r: r.slo_deadline)\n" * 100,
+
+    
+
+    "1+1=?",
+    
+]
+
+NUM_REQUESTS = len(PROMPTS)
+CONCURRENCY = 5
+API_URL = "http://127.0.0.1:2333"
+MODEL = "FM9G-7B"
+
+
+async def benchmark_user(client, semaphore, queue, results, user_id, verbose):
+    """Worker loop: take prompt indices from ``queue`` until a ``None`` sentinel arrives.
+
+    Every successful request appends ``(total_tokens, elapsed_time, tokens_per_second,
+    ttft, ms_per_token)`` to ``results``; a failed request is reported and skipped.
+    """
+    while True:
+        async with semaphore:
+            task_id = await queue.get()
+            if task_id is None:
+                queue.task_done()
+                break
+
+            question = PROMPTS[task_id]
+            try:
+                print(f"🚀 User#{user_id} Sending request #{task_id}")
+
+                start_time = time.time()
+                stream = await client.chat.completions.create(
+                    model=MODEL,
+                    messages=[{"role": "user", "content": question}],
+                    stream=True,
+                )
+
+                first_token_time = None
+                total_tokens = 0
+                answer_chunks = []
+
+                async for chunk in stream:
+                    if not chunk.choices:  # a chunk may carry no choices (e.g. usage-only)
+                        continue
+                    choice = chunk.choices[0]
+                    delta = choice.delta.content
+                    if delta:
+                        # Stamp TTFT on the first chunk that actually carries text,
+                        # not on an empty leading chunk.
+                        if first_token_time is None:
+                            first_token_time = time.time()
+                        answer_chunks.append(delta)
+                        total_tokens += 1
+                    if choice.finish_reason is not None:
+                        break
+
+                end_time = time.time()
+
+                ttft = first_token_time - start_time if first_token_time else None
+                elapsed_time = end_time - start_time
+                ms_per_token = (elapsed_time / total_tokens * 1000) if total_tokens > 0 and elapsed_time else None
+                tokens_per_second = total_tokens / elapsed_time if elapsed_time > 0 else 0
+
+                answer = "".join(answer_chunks)
+
+                results.append((total_tokens, elapsed_time, tokens_per_second, ttft, ms_per_token))
+
+                if verbose:
+                    print(f"\n📝 Request #{task_id} (User #{user_id})")
+                    if ttft is not None:
+                        print(f"  ⏱ 首字延迟 TTFT: {ttft:.3f}s")
+                    print(f"  ⏱ 总耗时: {elapsed_time:.3f}s")
+
+                    print(f"  🔤 解码 token 总数: {total_tokens}")
+                    if ms_per_token is not None:
+                        print(f"  📏 平均 token 解码时间: {ms_per_token:.2f} ms/token")
+                    else:
+                        print(f"  📏 平均 token 解码时间: N/A (no token generated)")
+                    print(f"  ❓ 提问: {question}")
+                    print(f"  💬 回答: {answer}\n")
+            except Exception as e:
+                # Always report failures so they are not silently dropped when not verbose.
+                print(f"\n⚠️ Request #{task_id} (User #{user_id}) FAILED:")
+                print(f"  ❌ Error: {e}\n")
+            finally:
+                queue.task_done()
+
+
+async def run_benchmark(verbose=False):
+    """Run every prompt in ``PROMPTS`` through ``CONCURRENCY`` workers and print a summary.
+
+    ``verbose`` is forwarded to ``benchmark_user`` for per-request output.
+    """
+    client = AsyncOpenAI(base_url=API_URL, api_key="default")
+    semaphore = asyncio.Semaphore(CONCURRENCY)
+    queue = asyncio.Queue()
+    results = []
+    for i in range(NUM_REQUESTS):
+        await queue.put(i)
+    for _ in range(CONCURRENCY):  # one stop sentinel per worker
+        await queue.put(None)
+
+    users = [
+        asyncio.create_task(benchmark_user(client, semaphore, queue, results, user_id, verbose))
+        for user_id in range(CONCURRENCY)
+    ]
+
+    start_time = time.time()
+    await queue.join()
+    await asyncio.gather(*users)
+    end_time = time.time()
+
+    total_elapsed_time = end_time - start_time
+    tokens_list = [r[0] for r in results if r and r[0] is not None]
+    latencies = [r[1] for r in results if r and r[1] is not None]
+    tokens_per_second_list = [r[2] for r in results if r and r[2] is not None]
+    ttft_list = [r[3] for r in results if r and r[3] is not None]
+    ms_per_token_list = [r[4] for r in results if r and r[4] is not None]
+
+    def mean(values, default=0):
+        return sum(values) / len(values) if values else default
+
+    successful_requests = len(results)
+    requests_per_second = successful_requests / total_elapsed_time if total_elapsed_time > 0 else 0
+    avg_latency = mean(latencies)
+    avg_tokens_per_second = mean(tokens_per_second_list)
+    avg_ttft = mean(ttft_list)
+    # Stays None when no request produced a token; it is rendered as "N/A"
+    # instead of crashing on ``None`` with the ``:.2f`` format spec.
+    avg_ms_per_token = mean(ms_per_token_list, default=None)
+    avg_ms_per_token_text = "N/A" if avg_ms_per_token is None else f"{avg_ms_per_token:.2f} ms/token"
+
+    width_label = 24
+    sep = "-" * 60
+
+    print(f"\n=== 📊 性能指标汇总 ({MODEL}) ===")
+    print(sep)
+    print(f"{'并发数':<{width_label}}: {CONCURRENCY}")
+    print(f"{'请求总数':<{width_label}}: {NUM_REQUESTS}")
+    print(f"{'成功请求数':<{width_label}}: {successful_requests}")
+    print(f"{'总耗时':<{width_label}}: {total_elapsed_time:.2f} s")
+    print(f"{'总输出token数':<{width_label}}: {sum(tokens_list)}")
+    print(f"{'请求速率 (RPS)':<{width_label}}: {requests_per_second:.2f} requests/s")
+    print(sep)
+    print(f"{'Average latency':<{width_label}}: {avg_latency:.2f} s")
+    print(f"{'Average TTFT':<{width_label}}: {avg_ttft:.2f} s")
+    print(f"{'Avg time per token':<{width_label}}: {avg_ms_per_token_text}")
+    print(
+        f"{'Avg Token generation speed':<{width_label}}: {avg_tokens_per_second:.2f} tokens/s"
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--verbose", action="store_true")
+    args = parser.parse_args()
+
+    asyncio.run(run_benchmark(args.verbose))
+

From 9e025c7b45b47ca39d2a4bdce03f9c8f72c7e04d Mon Sep 17 00:00:00 2001
From: huidesheng <1832140001@qq.com>
Date: Tue, 9 Jun 2026 13:30:44 +0800
Subject: [PATCH 8/9] add test_burstgpt_bench and optimize scheduler

---
 python/infinilm/llm/scheduler.py           |  93 +--
 python/infinilm/server/inference_server.py |  15 +-
 scripts/pure_bench_serve.py                | 369 +++++++++++
 scripts/test_burstgpt_bench.py             | 708 +++++++++++++++++++++
 scripts/test_chunk_prefill.py              |  15 +-
 5 files changed, 1148 insertions(+), 52 deletions(-)
 create mode 100644 scripts/pure_bench_serve.py
 create mode 100755 scripts/test_burstgpt_bench.py

diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py
index e235842a3..d3e73b53b 100644
--- a/python/infinilm/llm/scheduler.py
+++ b/python/infinilm/llm/scheduler.py
@@ -75,27 +75,21 @@ def add_request(self, request: InferenceRequest):
     #  Main scheduling entrypoint                                        #
     # ------------------------------------------------------------------ #
     def schedule(self) -> Optional[SchedulerOutput]:
-        """Schedule and return batch of requests to execute.
-
-        Priority (prefill first):
-        1. New prefill (waiting_queue) — minimize TTFT for new requests.
-        2. Decode (running_queue).
-        3. Continue chunked-prefill (chunking_queue).
-
-        Anti-starvation (only guards chunking against waiting):
-        After `max_waiting_yields` consecutive steps where waiting_queue won
-        over a non-empty chunking_queue, the next step is forced onto the
-        chunking_queue.
+        """Priority order:
+        0. Forced chunking after too many consecutive waiting yields.
+        1. New prefill (waiting_queue) — protect new-arrival TTFT.
+        2. Continue chunked-prefill (chunking_queue) — advance long-prompt
+            TTFT whenever waiting is idle; pay decode TPOT.
+        3. Decode (running_queue) — lowest priority.
         """
-        # 0) Forced chunking after too many consecutive waiting yields.
+        # 0) Forced chunking
         if self._waiting_yields_in_a_row >= self.max_waiting_yields:
             chunking_out = self._try_schedule_chunking()
             if chunking_out is not None:
                 self._waiting_yields_in_a_row = 0
                 return chunking_out
-            # chunking_queue was actually empty — fall through to normal path.
 
-        # 1) New prefill 
+        # 1) New prefill
         chunking_was_nonempty = self.chunking_queue.sync_q.qsize() > 0
         waiting_out = self._try_schedule_waiting()
         if waiting_out is not None:
@@ -105,46 +99,66 @@ def schedule(self) -> Optional[SchedulerOutput]:
                 self._waiting_yields_in_a_row = 0
             return waiting_out
 
-        # 2) Decode.
-        chunking_was_nonempty = self.chunking_queue.sync_q.qsize() > 0
-        decode_out = self._try_schedule_decode()
-        if decode_out is not None:
-            if chunking_was_nonempty:
-                self._waiting_yields_in_a_row += 1
-            else:
-                self._waiting_yields_in_a_row = 0
-            return decode_out
-
-        # 3) Continue an in-flight chunked-prefill request.
+        # 2) Chunking (raised above decode).
         chunking_out = self._try_schedule_chunking()
         if chunking_out is not None:
             self._waiting_yields_in_a_row = 0
             return chunking_out
 
+        # 3) Decode (lowest). If we reached here, chunking_queue was empty,
+        # so the yield counter naturally resets.
+        decode_out = self._try_schedule_decode()
+        if decode_out is not None:
+            self._waiting_yields_in_a_row = 0
+            return decode_out
+
         return None
 
     # ------------------------------------------------------------------ #
     #  Per-queue schedulers                                              #
     # ------------------------------------------------------------------ #
     def _try_schedule_chunking(self) -> Optional[SchedulerOutput]:
+        """Drain chunking_queue and form a batch of uniform chunk-kind.
+
+        Invariant (enforced by llm._update_requests' chunk_mid_step check):
+        a batch must be either ALL middle-chunks (no sample, no commit) OR
+        ALL last-chunks (sample + commit). Mixing them is unsafe.
+
+        Strategy: greedy drain. The first non-finished request seen fixes
+        the batch's kind. Subsequent same-kind requests are added up to
+        max_batch_size. Mismatched requests are buffered and re-enqueued at
+        the end so they get handled in the next schedule cycle. Order within
+        each kind is preserved.
+        """
         scheduled: List[InferenceRequest] = []
+        deferred: List[InferenceRequest] = []
+        kind_is_last: Optional[bool] = None
+
         while len(scheduled) < self.max_batch_size:
             try:
                 req = self.chunking_queue.sync_q.get_nowait()
             except queue.Empty:
                 break
+
             if req.is_finished():
                 self.complete_requests([req])
                 continue
-            # 最后一块（partial 或恰好等于 chunk_size 的最后整块）单独跑。
-            # 不能和中间整块混批：最后一块要采样+提交 block，中间块两个都不做。
-            if req.chunk_is_last():
-                if not scheduled:
-                    return SchedulerOutput([req], is_prefill=True)
-                # 已经攒了中间块，先把这个 last-chunk 放回队头，等下个 step 单独跑。
-                self.chunking_queue.sync_q.put(req)
-                break
-            scheduled.append(req)
+
+            cur_is_last = req.chunk_is_last()
+
+            if kind_is_last is None:
+                kind_is_last = cur_is_last
+                scheduled.append(req)
+            elif cur_is_last == kind_is_last:
+                scheduled.append(req)
+            else:
+                deferred.append(req)
+
+        # Re-enqueue deferred reqs preserving their relative order so they
+        # aren't permanently overtaken by newcomers.
+        for r in deferred:
+            self.chunking_queue.sync_q.put(r)
+
         if scheduled:
             return SchedulerOutput(scheduled, is_prefill=True)
         return None
@@ -165,13 +179,11 @@ def _try_schedule_waiting(self) -> Optional[SchedulerOutput]:
             except queue.Empty:
                 break
 
-            # Skip requests that were already finished (timed out / canceled while waiting).
             if req.is_finished():
                 self.complete_requests([req])
                 continue
 
             if not self.can_accept_request(req):
-                # Put it back; we'll retry next tick when cache pressure eases.
                 self.waiting_queue.sync_q.put(req)
                 break
 
@@ -182,19 +194,16 @@ def _try_schedule_waiting(self) -> Optional[SchedulerOutput]:
                 if not self.cache_manager.try_free_blocks(num_required_blocks):
                     raise RuntimeError("No available cache blocks for new request")
 
-            # Allocate blocks (prefix caching applied automatically).
             if not req.block_table:
                 req.block_table, req.slot_mapping, req.num_cached_tokens = (
                     self.cache_manager.allocate_blocks(req_tokens, req.block_table)
                 )
-                
+
             req.num_blocks = len(req.block_table)
             req.status = RequestStatus.RUNNING
 
             # Start chunked-prefill: emit a single-request batch immediately
-            # to keep the C++ graph signature stable. The request will be
-            # requeued into chunking_queue by llm._update_requests after each
-            # chunk runs.
+            # to keep the C++ graph signature stable.
             remaining = req.prompt_length - req.num_cached_tokens
             if req.chunk_size > 0 and remaining > req.chunk_size:
                 req.chunk_prefill_offset = req.num_cached_tokens
@@ -308,7 +317,7 @@ def can_accept_request(self, request: InferenceRequest) -> bool:
         total_length += request.sampling_params.max_tokens
         num_blocks_needed = (total_length + self.block_size - 1) // self.block_size
         total_required_blocks += num_blocks_needed
-
+        logger.info(f"accepted={total_required_blocks <= self.cache_manager.get_total_usable_blocks()}")
         # Compare with total usable blocks in cache manager
         return total_required_blocks <= self.cache_manager.get_total_usable_blocks()
 
diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py
index ac7e94e71..5918daf8b 100644
--- a/python/infinilm/server/inference_server.py
+++ b/python/infinilm/server/inference_server.py
@@ -329,11 +329,18 @@ def pick(key: str, default):
                 return sp.get(key)
             return default
 
-        # Accept common alias
-        max_tokens = pick("max_tokens", self.max_tokens)
+        # Accept common aliases. vLLM bench serve's OpenAI-chat backend sends
+        # max_completion_tokens, while older clients often send max_new_tokens.
+        max_tokens = None
+        for key in ("max_tokens", "max_completion_tokens", "max_new_tokens"):
+            if key in data and data.get(key) is not None:
+                max_tokens = data.get(key)
+                break
+            if key in sp and sp.get(key) is not None:
+                max_tokens = sp.get(key)
+                break
         if max_tokens is None:
-            # Some clients use max_new_tokens
-            max_tokens = pick("max_new_tokens", self.max_tokens)
+            max_tokens = self.max_tokens
 
         stop = pick("stop", None)
         if isinstance(stop, str):
diff --git a/scripts/pure_bench_serve.py b/scripts/pure_bench_serve.py
new file mode 100644
index 000000000..489ef0ba5
--- /dev/null
+++ b/scripts/pure_bench_serve.py
@@ -0,0 +1,369 @@
+#!/usr/bin/env python3
+"""Pure-Python BurstGPT online load-test client; a drop-in replacement for `vllm bench serve` (shim).
+
+Design goals:
+  * accept exactly the same command-line arguments that the InfiniLM test script passes to vllm_bench_serve_shim.py;
+  * reproduce the behavior of vLLM benchmark serve (BurstGPT sampling, gamma/Poisson arrivals, concurrency cap,
+    streaming TTFT/ITL/TPOT statistics);
+  * write a result JSON with exactly the same fields as vLLM (read by the test script's load_result).
+
+Depends only on the standard library + numpy + pandas + transformers; aiohttp / vllm are not required.
+To use it, point the test script's `--shim-path` at this file; all other arguments are passed through unchanged.
+"""
+from __future__ import annotations
+
+import argparse
+import asyncio
+import http.client
+import json
+import os
+import sys
+import time
+import traceback
+import urllib.parse
+from dataclasses import dataclass, field
+
+import numpy as np
+import pandas as pd
+
+MILLISECONDS = 1000.0
+
+
+# --------------------------------------------------------------------------- #
+# Command-line arguments: same names as a subset of vLLM benchmark serve; unused ones are accepted too for compatibility.
+# --------------------------------------------------------------------------- #
+def parse_args(argv: list[str]) -> argparse.Namespace:
+    # Tolerate being invoked with a `vllm bench serve ...` style prefix
+    if argv[:2] == ["bench", "serve"]:
+        argv = argv[2:]
+
+    p = argparse.ArgumentParser(prog="pure bench serve")
+    p.add_argument("--backend", default="openai-chat")
+    p.add_argument("--base-url", required=True)
+    p.add_argument("--endpoint", default="/v1/chat/completions")
+    p.add_argument("--model", required=True)
+    p.add_argument("--tokenizer", default=None)
+    p.add_argument("--dataset-name", default="burstgpt")
+    p.add_argument("--dataset-path", required=True)
+    p.add_argument("--num-prompts", type=int, default=100)
+    p.add_argument("--request-rate", default="inf")
+    p.add_argument("--seed", type=int, default=0)
+    p.add_argument("--burstiness", type=float, default=1.0)
+    p.add_argument("--max-concurrency", type=int, default=None)
+    p.add_argument("--temperature", type=float, default=None)
+    p.add_argument("--ignore-eos", action="store_true")
+    p.add_argument("--request-timeout", type=float, default=600.0)
+    # Original note: compatibility only, no effect on logic. NOTE(review): confirm — result/percentile options may be read later in the file.
+    p.add_argument("--disable-tqdm", action="store_true")
+    p.add_argument("--save-result", action="store_true")
+    p.add_argument("--save-detailed", action="store_true")
+    p.add_argument("--result-dir", default=None)
+    p.add_argument("--result-filename", default=None)
+    p.add_argument("--metric-percentiles", default="99")
+    # Fallback: swallow any other unknown arguments so a flag newly added by vLLM does not crash the script
+    args, unknown = p.parse_known_args(argv)
+    if unknown:
+        print(f"[pure-bench] 忽略未识别参数: {unknown}")
+    return args
+
+
+def parse_rate(value: str) -> float:
+    """Convert a request-rate value (a number, or a string such as "inf"/"infinity" in any case) to float."""
+    is_unbounded = not isinstance(value, (int, float)) and value.lower() in {"inf", "infinity"}
+    return float("inf") if is_unbounded else float(value)
+
+
+# --------------------------------------------------------------------------- #
+# BurstGPT 数据集采样：严格对齐 vLLM BurstGPTDataset
+# --------------------------------------------------------------------------- #
+@dataclass
+class SampleRequest:
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+
+
+def sample_burstgpt(dataset_path: str, num_requests: int, seed: int, tokenizer) -> list[SampleRequest]:
+    df = pd.read_csv(dataset_path)
+    gpt4 = df[df["Model"] == "GPT-4"]
+    gpt4 = gpt4[gpt4["Response tokens"] > 0]
+    if len(gpt4) == 0:
+        raise RuntimeError(f"BurstGPT 过滤后为空: {dataset_path}")
+    replace = num_requests > len(gpt4)
+    data = gpt4.sample(n=num_requests, random_state=seed, replace=replace).values.tolist()
+
+    vocab_size = tokenizer.vocab_size
+    samples: list[SampleRequest] = []
+    for i in range(num_requests):
+        input_len = int(data[i][2])   # Request tokens
+        output_len = int(data[i][3])  # Response tokens
+        token_ids = [(i + j) % vocab_size for j in range(input_len)]
+        prompt = tokenizer.decode(token_ids)
+        samples.append(SampleRequest(prompt, input_len, output_len))
+    return samples
+
+
+# --------------------------------------------------------------------------- #
+# 到达时刻：复刻 vLLM get_request 的 gamma 采样 + 归一化
+# --------------------------------------------------------------------------- #
+def build_delays(n: int, request_rate: float, burstiness: float) -> list[float]:
+    delays: list[float] = []
+    for _ in range(n):
+        if request_rate == float("inf"):
+            delays.append(0.0)
+        elif burstiness == float("inf"):
+            delays.append(1.0 / request_rate)
+        else:
+            theta = 1.0 / (request_rate * burstiness)
+            delays.append(float(np.random.gamma(shape=burstiness, scale=theta)))
+    for i in range(1, n):
+        delays[i] += delays[i - 1]
+    if request_rate != float("inf") and delays and delays[-1] != 0:
+        target_total = n / request_rate
+        factor = target_total / delays[-1]
+        delays = [d * factor for d in delays]
+    return delays
+
+
+# --------------------------------------------------------------------------- #
+# 单请求：流式 chat completions，测 TTFT / ITL / latency / output_tokens
+# --------------------------------------------------------------------------- #
+@dataclass
+class RequestOutput:
+    prompt_len: int = 0
+    success: bool = False
+    generated_text: str = ""
+    output_tokens: int | None = None
+    ttft: float = 0.0
+    latency: float = 0.0
+    itl: list[float] = field(default_factory=list)
+    start_time: float = 0.0
+    error: str = ""
+
+
+def _do_request_blocking(
+    base_url: str,
+    endpoint: str,
+    model: str,
+    req: SampleRequest,
+    temperature: float,
+    ignore_eos: bool,
+    timeout: float,
+) -> RequestOutput:
+    out = RequestOutput(prompt_len=req.prompt_len)
+    parsed = urllib.parse.urlparse(base_url)
+    host = parsed.hostname
+    port = parsed.port or (443 if parsed.scheme == "https" else 80)
+    path = (parsed.path.rstrip("/") + endpoint) if parsed.path not in ("", "/") else endpoint
+
+    payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": req.prompt}],
+        "temperature": temperature,
+        "max_completion_tokens": req.expected_output_len,
+        "stream": True,
+        "stream_options": {"include_usage": True},
+    }
+    if ignore_eos:
+        payload["ignore_eos"] = True
+    body = json.dumps(payload).encode("utf-8")
+    headers = {"Content-Type": "application/json"}
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    conn_cls = http.client.HTTPSConnection if parsed.scheme == "https" else http.client.HTTPConnection
+    conn = conn_cls(host, port, timeout=timeout)
+    st = time.perf_counter()
+    out.start_time = st
+    most_recent = st
+    generated = ""
+    try:
+        conn.request("POST", path, body=body, headers=headers)
+        resp = conn.getresponse()
+        if resp.status != 200:
+            out.error = f"HTTP {resp.status} {resp.reason}"
+            return out
+        for raw in resp:  # 按 chunk/行迭代，数据到达即返回，可测 TTFT
+            line = raw.strip()
+            if not line or line.startswith(b":"):
+                continue
+            if not line.startswith(b"data:"):
+                continue
+            chunk = line[len(b"data:"):].strip()
+            if chunk == b"[DONE]":
+                continue
+            now = time.perf_counter()
+            data = json.loads(chunk)
+            choices = data.get("choices")
+            if choices:
+                delta = choices[0].get("delta", {})
+                content = delta.get("content")
+                if out.ttft == 0.0:
+                    out.ttft = now - st
+                else:
+                    out.itl.append(now - most_recent)
+                generated += content or ""
+                most_recent = now
+            usage = data.get("usage")
+            if usage and usage.get("completion_tokens") is not None:
+                out.output_tokens = usage.get("completion_tokens")
+        out.generated_text = generated
+        out.latency = most_recent - st
+        out.success = True
+    except Exception:
+        out.success = False
+        out.error = "".join(traceback.format_exception(*sys.exc_info()))
+    finally:
+        conn.close()
+    return out
+
+
+# --------------------------------------------------------------------------- #
+# 主压测循环：按到达计划派发，asyncio.Semaphore 控制并发上限
+# --------------------------------------------------------------------------- #
+async def run_benchmark(args, requests: list[SampleRequest], request_rate: float) -> tuple[list[RequestOutput], float]:
+    """Dispatch each request at its scheduled arrival time; return (outputs in request order, elapsed seconds)."""
+    from concurrent.futures import ThreadPoolExecutor  # function-scope import; only needed here
+
+    n = len(requests)
+    np.random.seed(args.seed)  # same as vLLM: seed the global numpy RNG before sampling arrivals
+    delays = build_delays(n, request_rate, args.burstiness)
+
+    limit = max(1, args.max_concurrency or n)  # falsy --max-concurrency means no cap: all n may be in flight
+    sem = asyncio.Semaphore(limit)
+    # asyncio.to_thread runs on the loop's default executor, which defaults to min(32, cpu_count + 4)
+    # threads; size it to `limit` so blocking requests never queue behind that hidden cap.
+    asyncio.get_running_loop().set_default_executor(ThreadPoolExecutor(max_workers=limit))
+    temperature = 0.0 if args.temperature is None else args.temperature
+    outputs: list[RequestOutput | None] = [None] * n
+    done = 0
+
+    async def one(i: int) -> None:
+        nonlocal done
+        async with sem:
+            outputs[i] = await asyncio.to_thread(
+                _do_request_blocking, args.base_url, args.endpoint, args.model,
+                requests[i], temperature, args.ignore_eos, args.request_timeout)
+        done += 1
+        # Machine-parseable progress line: the parent process (test script) heartbeat uses it to show how many finished
+        print(f"[pure-bench] PROGRESS {done}/{n}", flush=True)
+
+    start = time.perf_counter()
+    tasks: list[asyncio.Task] = []
+    for i in range(n):
+        wait = start + delays[i] - time.perf_counter()
+        if wait > 0:
+            await asyncio.sleep(wait)
+        tasks.append(asyncio.create_task(one(i)))
+    await asyncio.gather(*tasks)
+    duration = time.perf_counter() - start
+    return [o for o in outputs], duration  # type: ignore[return-value]
+
+
+# --------------------------------------------------------------------------- #
+# 指标：完全对齐 vLLM calculate_metrics 的口径
+# --------------------------------------------------------------------------- #
+def compute_result(args, requests, outputs: list[RequestOutput], duration: float, tokenizer, percentiles: list[float]) -> dict:
+    """Fold per-request outputs into throughput figures plus latency statistics in milliseconds."""
+    out_lens: list[int] = []
+    ttfts: list[float] = []
+    tpots: list[float] = []
+    itls: list[float] = []
+    e2els: list[float] = []
+    prompt_tokens = 0
+    n_ok = 0
+    n_failed = 0
+
+    for idx, res in enumerate(outputs):
+        if not res.success:
+            out_lens.append(0)
+            n_failed += 1
+            continue
+        # Use the reported completion-token count when present, else re-tokenize the generated text.
+        n_out = res.output_tokens or len(tokenizer(res.generated_text, add_special_tokens=False).input_ids)
+        out_lens.append(n_out)
+        prompt_tokens += requests[idx].prompt_len
+        if n_out > 1:
+            tpots.append((res.latency - res.ttft) / (n_out - 1))
+        itls.extend(res.itl)
+        ttfts.append(res.ttft)
+        e2els.append(res.latency)
+        n_ok += 1
+
+    generated_tokens = sum(out_lens)
+
+    def per_second(count: float) -> float:
+        return count / duration if duration > 0 else 0.0
+
+    def stats(key: str, vals: list[float], result: dict) -> None:
+        sample = vals or [0.0]  # an empty series is reported as all zeros
+        for name, reducer in (("mean", np.mean), ("median", np.median), ("std", np.std)):
+            result[f"{name}_{key}_ms"] = float(reducer(sample) * MILLISECONDS)
+        for p in percentiles:
+            tag = str(int(p)) if int(p) == p else str(p)
+            result[f"p{tag}_{key}_ms"] = float(np.percentile(sample, p) * MILLISECONDS)
+
+    # Scalar aggregates first, then the raw per-request series (index-aligned with `outputs`).
+    result: dict = {
+        "duration": duration,
+        "completed": n_ok,
+        "failed": n_failed,
+        "total_input_tokens": prompt_tokens,
+        "total_output_tokens": generated_tokens,
+        "request_throughput": per_second(n_ok),
+        "output_throughput": per_second(generated_tokens),
+        "total_token_throughput": per_second(prompt_tokens + generated_tokens),
+        "input_lens": [o.prompt_len for o in outputs],
+        "output_lens": out_lens,
+        "ttfts": [o.ttft for o in outputs],
+        "itls": [o.itl for o in outputs],
+        "generated_texts": [o.generated_text for o in outputs],
+        "errors": [o.error for o in outputs],
+    }
+    for key, series in (("ttft", ttfts), ("tpot", tpots), ("itl", itls), ("e2el", e2els)):
+        stats(key, series, result)
+    return result
+
+
+# --------------------------------------------------------------------------- #
+def main() -> None:
+    """CLI entry point: sample requests, run the benchmark, print a summary, write the JSON result.
+
+    The result file is written whenever both --result-dir and --result-filename are set,
+    regardless of --save-result.
+    """
+    args = parse_args(sys.argv[1:])
+    request_rate = parse_rate(args.request_rate)
+    percentiles = [float(x) for x in str(args.metric_percentiles).split(",") if x.strip()]
+
+    from transformers import AutoTokenizer
+    tok_src = args.tokenizer or args.model
+    tokenizer = AutoTokenizer.from_pretrained(tok_src, trust_remote_code=True)
+
+    print(f"[pure-bench] 数据集={args.dataset_path} 请求数={args.num_prompts} "
+          f"速率={args.request_rate} burstiness={args.burstiness} "
+          f"并发上限={args.max_concurrency} seed={args.seed}")
+    requests = sample_burstgpt(args.dataset_path, args.num_prompts, args.seed, tokenizer)
+
+    outputs, duration = asyncio.run(run_benchmark(args, requests, request_rate))
+    result = compute_result(args, requests, outputs, duration, tokenizer, percentiles)
+
+    print(f"[pure-bench] 完成 {result['completed']}/{args.num_prompts}，失败 {result['failed']}，"
+          f"用时 {duration:.2f}s，请求吞吐 {result['request_throughput']:.3f} req/s，"
+          f"输出吞吐 {result['output_throughput']:.3f} tok/s，"
+          f"平均 TTFT {result['mean_ttft_ms']:.2f}ms，平均 TPOT {result['mean_tpot_ms']:.2f}ms")
+
+    # --save-result does not gate the write: the driving test script always loads
+    # this file afterwards, so a single code path covers both cases.
+    if not (args.result_dir and args.result_filename):
+        return
+    os.makedirs(args.result_dir, exist_ok=True)
+    out_path = os.path.join(args.result_dir, args.result_filename)
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False)
+    print(f"[pure-bench] 结果已写入 {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/test_burstgpt_bench.py b/scripts/test_burstgpt_bench.py
new file mode 100755
index 000000000..2b2f95851
--- /dev/null
+++ b/scripts/test_burstgpt_bench.py
@@ -0,0 +1,708 @@
+#!/usr/bin/env python3
+"""一键对比 optimized 开/关时的 BurstGPT 服务压测结果。
+
+该脚本会依次启动 InfiniLM 服务：
+optimized OFF 和 ON ，
+分别调用pure_bench_serve.py跑BurstGPT测试集，最后输出对比表。
+
+BurstGPT dataset 只取第 2、3 列(Request tokens / Response tokens)做输入输出长度,没有读取时间戳列,到达时间用 request_rate=2.0 + burstiness=1.0 的 Poisson 过程发出
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import math
+import os
+import selectors
+import signal
+import socket
+import subprocess
+import sys
+import tempfile
+import time
+import unicodedata
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+INFERENCE_SERVER = PROJECT_ROOT / "python" / "infinilm" / "server" / "inference_server.py"
+BENCH_SCRIPT = Path(__file__).resolve().parent / "pure_bench_serve.py"
+DEFAULT_MODEL = "/workspace/models/9g_8b_v2_thinking/9g_8b_thinking"
+DEFAULT_DATASET = "/workspace/datasets/burstgpt/BurstGPT_1.csv"
+DEFAULT_RESULT_DIR = "/workspace/bench_results"
+USE_COLOR = False
+
+
+def paint(text: str, code: str) -> str:  # wrap text in the ANSI SGR sequence `code`; pass-through while USE_COLOR is false
+    return text if not USE_COLOR else f"\033[{code}m{text}\033[0m"
+
+
+def visual_width(text: str) -> int:
+    # East Asian Full-width ("F") and Wide ("W") characters are counted as two columns.
+    wide_classes = {"F", "W"}
+    widths = (2 if unicodedata.east_asian_width(ch) in wide_classes else 1 for ch in text)
+    return sum(widths)
+
+
+def ljust_display(text: str, width: int) -> str:  # left-justify by display width rather than len()
+    return text + " " * max(width - visual_width(text), 0)
+
+
+def print_bar(title: str, fill: str = "=") -> None:
+    # Title framed by two rules of 78 repetitions of `fill`, each row painted with "36;1".
+    rule = paint(fill * 78, "36;1")
+    for row in ("\n" + rule, paint(title, "36;1"), rule):
+        print(row)
+
+
+def print_kv(key: str, value: object) -> None:  # one indented "key: value" row, key padded to 18 display columns
+    print("  " + paint(ljust_display(key, 18), "2") + f": {value}")
+
+
+def format_duration(seconds: float) -> str:
+    # Whole seconds only (fraction truncated); leading units that are zero are omitted.
+    hours, rest = divmod(int(seconds), 3600)
+    minutes, sec = divmod(rest, 60)
+    if hours:
+        return f"{hours}小时{minutes:02d}分{sec:02d}秒"
+    if minutes:
+        return f"{minutes}分{sec:02d}秒"
+    return f"{sec}秒"
+
+
+def terminate_process_group(popen: subprocess.Popen, timeout: int = 30) -> None:
+    # SIGTERM the child's whole process group; SIGKILL it if it outlives `timeout` seconds.
+    if popen.poll() is None:
+        os.killpg(os.getpgid(popen.pid), signal.SIGTERM)
+        try:
+            popen.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            os.killpg(os.getpgid(popen.pid), signal.SIGKILL)
+            popen.wait(timeout=5)
+
+
+def wait_for_port_free(host: str, port: int, timeout: int = 30) -> None:
+    # Poll until a TCP connect to (host, port) fails, which is taken to mean the port is free.
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        try:
+            socket.create_connection((host, port), timeout=0.5).close()
+        except OSError:
+            return
+        time.sleep(0.3)
+    raise RuntimeError(f"port {port} still in use after stop_server")
+
+
+def parse_rate(value: str) -> float:
+    # "inf" / "infinity" (any letter case) stand for an unbounded request rate.
+    unbounded = value.lower() in {"inf", "infinity"}
+    return float("inf") if unbounded else float(value)
+
+
+def rate_to_str(value: float) -> str:  # any infinite rate is rendered as "inf", everything else via str()
+    return str(value) if not math.isinf(value) else "inf"
+
+
+def sanitize(value: str) -> str:  # map "/" -> "_", "." -> "p", "-" -> "m"
+    return value.translate(str.maketrans({"/": "_", ".": "p", "-": "m"}))
+
+
+def count_and_filter_dataset(args: argparse.Namespace) -> Path:
+    """Write the filtered BurstGPT CSV for this run and return its path.
+
+    Kept rows: Model == "GPT-4", 0 < response tokens <= max_response_tokens and request
+    tokens <= max_request_tokens. With --use-full-dataset the source path is returned as is.
+    """
+    print_bar("准备 BurstGPT 数据")
+    src = Path(args.dataset_path)
+    if not src.exists():
+        raise FileNotFoundError(f"BurstGPT dataset not found: {src}")
+
+    print_kv("原始数据集", src)
+    if args.use_full_dataset:
+        print("  使用完整 CSV，不做长度过滤。")
+        return src
+
+    default_name = f"{src.stem}_gpt4_pos_req{args.max_request_tokens}_resp{args.max_response_tokens}.csv"
+    out = Path(args.filtered_dataset_path) if args.filtered_dataset_path else src.with_name(default_name)
+    if out.resolve() == src.resolve():
+        # Opening the source for writing would truncate it before any row is read.
+        raise ValueError(f"--filtered-dataset-path must differ from the source dataset: {src}")
+    out.parent.mkdir(parents=True, exist_ok=True)
+    print_kv("过滤后数据集", out)
+    print_kv("过滤条件", f"Model=GPT-4, 0 < 输出 token <= {args.max_response_tokens}, 输入 token <= {args.max_request_tokens}")
+
+    total = gpt4 = positive = kept = 0
+    with src.open(newline="", encoding="utf-8") as f, out.open("w", newline="", encoding="utf-8") as g:
+        reader = csv.DictReader(f)
+        writer = csv.DictWriter(g, fieldnames=reader.fieldnames)
+        writer.writeheader()
+        for row in reader:
+            total += 1
+            if row.get("Model") != "GPT-4":
+                continue
+            gpt4 += 1
+            try:
+                req_tokens = int(float(row.get("Request tokens") or 0))
+                resp_tokens = int(float(row.get("Response tokens") or 0))
+            except (ValueError, OverflowError):  # non-numeric, NaN or infinite token counts
+                continue
+            if resp_tokens <= 0:
+                continue
+            positive += 1
+            if req_tokens > args.max_request_tokens or resp_tokens > args.max_response_tokens:
+                continue
+            writer.writerow(row)
+            kept += 1
+
+    print_kv("原始总行数", total)
+    print_kv("GPT-4 行数", gpt4)
+    print_kv("GPT-4 且输出非空", positive)
+    print_kv("本次可用行数", kept)
+    if kept == 0:
+        raise RuntimeError("Filtered BurstGPT dataset is empty; relax token limits.")
+    if kept < args.num_prompts:
+        print(f"  注意：过滤后只有 {kept} 条，但本次请求 {args.num_prompts} 条；vLLM 会重复采样。")
+    return out
+
+
+def chat_warmup(base_url: str, model: str, timeout: int = 30) -> None:
+    # Send one single-token chat completion and fail on a non-2xx reply.
+    body = json.dumps(
+        {
+            "model": model,
+            "messages": [{"role": "user", "content": "hi"}],
+            "max_tokens": 1,
+            "temperature": 0,
+        }
+    ).encode("utf-8")
+    req = urllib.request.Request(
+        f"{base_url}/v1/chat/completions", data=body, headers={"Content-Type": "application/json"}, method="POST"
+    )
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        if resp.status < 200 or resp.status >= 300:
+            raise RuntimeError(f"warmup request failed with HTTP {resp.status}")
+
+
+def wait_for_server(base_url: str, popen: subprocess.Popen | None, timeout: int, model: str) -> None:
+    """Block until ``{base_url}/health`` answers 2xx and one warm-up chat request succeeds.
+
+    Sleeps 2 s between attempts. Raises RuntimeError if the server process exits first
+    and TimeoutError if the server is not ready within ``timeout`` seconds.
+    """
+    deadline = time.time() + timeout
+    t0 = time.monotonic()
+    notified_at = 0.0
+    health_url = f"{base_url}/health"
+    print_kv("健康检查", health_url)
+    while time.time() < deadline:
+        if popen is not None and popen.poll() is not None:
+            raise RuntimeError(f"server exited early with code {popen.returncode}")
+        try:
+            with urllib.request.urlopen(health_url, timeout=2) as resp:
+                if 200 <= resp.status < 300:
+                    chat_warmup(base_url, model)
+                    print(f"  服务已就绪，用时 {format_duration(time.monotonic() - t0)}。")
+                    return
+        except (urllib.error.URLError, TimeoutError, OSError):
+            pass  # health check or warm-up failed: retry after the sleep below
+        tick = time.monotonic()
+        if tick - notified_at >= 15:
+            print(f"  正在等待服务启动... 已等待 {format_duration(tick - t0)}")
+            notified_at = tick
+        time.sleep(2)
+    raise TimeoutError(f"server not ready within {timeout}s: {health_url}")
+
+
+def start_server(
+    args: argparse.Namespace,
+    chunk_size: int,
+    mode_label: str,
+) -> subprocess.Popen | None:
+    """Launch the InfiniLM inference server in its own process group and wait until it is ready.
+
+    When readiness is never reached the server's process group is terminated before the
+    error propagates, because the caller has no handle to clean up in that case.
+    """
+    base_url = f"http://{args.client_host}:{args.port}"
+
+    env = os.environ.copy()
+    env["CUDA_VISIBLE_DEVICES"] = args.cuda_visible_devices
+
+    cmd = [
+        sys.executable,
+        str(INFERENCE_SERVER),
+        "--device", args.device,
+        "--model", args.model_path,
+        "--backend", args.backend,
+        "--max-batch-size", str(args.max_batch_size),
+        "--max-new-tokens", str(args.server_max_new_tokens),
+        "--host", args.server_host,
+        "--port", str(args.port),
+    ]
+    if args.enable_paged_attn:
+        cmd.append("--enable-paged-attn")
+    if args.enable_graph:
+        cmd.append("--enable-graph")
+    if args.enable_chunk_prefill_graph:
+        cmd.append("--enable-chunk-prefill-graph")
+    cmd.extend(["--chunk-size", str(chunk_size)])
+
+    print_bar(
+        f"开始部署大模型推理服务，optimized {mode_label} "
+        f"(chunk-size={chunk_size})，请等待服务启动完成..."
+    )
+    print_kv("GPU", f"CUDA_VISIBLE_DEVICES={args.cuda_visible_devices}")
+    print_kv("模型", args.model_path)
+    print_kv("后端", args.backend)
+    print_kv("optimized", mode_label)
+    if args.show_server_output:
+        print_kv("服务输出", "直接显示在终端")
+
+    server_stdout = None if args.show_server_output else subprocess.DEVNULL
+    popen = subprocess.Popen(
+        cmd,
+        cwd=str(PROJECT_ROOT),
+        env=env,
+        stdout=server_stdout,
+        stderr=subprocess.STDOUT,
+        text=True,
+        preexec_fn=os.setsid,  # new session/process group, so the whole server tree can be signalled
+    )
+    try:
+        wait_for_server(base_url, popen, args.server_timeout, args.served_model_name)
+    except BaseException as exc:
+        terminate_process_group(popen)  # also on Ctrl-C: nobody else holds this handle yet
+        if isinstance(exc, Exception):
+            print("\n服务启动失败。需要看服务端输出时，可以加 --show-server-output 重新运行。")
+        raise
+    return popen
+
+
+def stop_server(
+    popen: subprocess.Popen | None,
+    host: str = "127.0.0.1",
+    port: int = 2333,
+    timeout: int = 30,
+) -> None:
+    # Terminate the server's process group when it is still alive, then (in every
+    # case) wait until connections to host:port are refused again.
+    if popen is not None and popen.poll() is None:
+        terminate_process_group(popen, timeout)
+    wait_for_port_free(host, port, timeout)
+
+
+def should_show_vllm_line(line: str) -> bool:
+    # Regular metrics are printed by the summary helpers in Chinese; this filter only
+    # lets through lines that look like errors.
+    text = line.strip()
+    if not text:
+        return False
+    error_prefixes = ("ERROR", "CRITICAL", "Traceback")
+    if text.startswith(error_prefixes):
+        return True
+    error_snippets = (
+        "RuntimeError", "ValueError", "ConnectionError",
+        "CUDA out of memory", "No such file",
+    )
+    # Snippets may appear anywhere in the line, unlike the prefixes above.
+    for snippet in error_snippets:
+        if snippet in text:
+            return True
+    return False
+
+
+def parse_progress(line: str) -> int | None:
+    # Extract x from a "PROGRESS x/n" line printed by pure_bench_serve.py; None when absent or malformed.
+    _, found, tail = line.partition("PROGRESS ")
+    if not found:
+        return None
+    fields = tail.split()
+    if not fields or "/" not in fields[0]:
+        return None
+    head = fields[0].split("/")[0]
+    try:
+        return int(head)
+    except ValueError:
+        return None
+
+
+def run_command_with_heartbeat(
+    cmd: list[str],
+    cwd: Path,
+    args: argparse.Namespace,
+    result_path: Path,  # NOTE(review): not referenced anywhere in this function body
+) -> None:  # Runs cmd as a child, relays notable output, prints heartbeats; raises CalledProcessError on non-zero exit.
+    started = time.monotonic()
+    last_heartbeat = started
+    captured: list[str] = []  # every line read from the child, kept for the failure dump below
+    completed_count = 0  # latest x parsed from a "PROGRESS x/n" line
+
+    proc = subprocess.Popen(
+        cmd,
+        cwd=str(cwd),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,  # stderr is merged into the single stdout pipe
+        text=True,
+        bufsize=1,
+        preexec_fn=os.setsid,  # child starts its own session, so terminate_process_group can signal its group
+    )
+    selector = selectors.DefaultSelector()
+    if proc.stdout is not None:
+        selector.register(proc.stdout, selectors.EVENT_READ)
+
+    try:
+        while proc.poll() is None:
+            for key, _ in selector.select(timeout=1):  # returns after at most 1 s so the heartbeat check below still runs
+                line = key.fileobj.readline()  # NOTE(review): text-mode buffering may hold lines select() cannot see - confirm
+                if not line:
+                    continue
+                captured.append(line)
+                parsed = parse_progress(line)
+                if parsed is not None:
+                    completed_count = parsed  # progress lines only update the counter; they are never echoed here
+                elif args.show_vllm_output or should_show_vllm_line(line):
+                    print("  " + line.rstrip())
+
+            now = time.monotonic()
+            if args.progress_interval > 0 and now - last_heartbeat >= args.progress_interval:
+                print(
+                    "  压测正在运行："
+                    f"已用 {format_duration(now - started)}，"
+                    f"目标 {args.num_prompts} 条，已完成 {completed_count} 条，"
+                    f"并发上限 {args.max_concurrency}。"
+                )
+                last_heartbeat = now
+
+        if proc.stdout is not None:  # the child has exited: drain whatever output is still unread
+            for line in proc.stdout:
+                captured.append(line)
+                if args.show_vllm_output or should_show_vllm_line(line):  # unlike the loop above, PROGRESS lines are not filtered here
+                    print("  " + line.rstrip())
+    except KeyboardInterrupt:  # only Ctrl-C stops the child explicitly; other exceptions just propagate
+        print("\n收到中止信号，正在停止 vLLM benchmark 子进程...")
+        terminate_process_group(proc, timeout=10)
+        raise
+    finally:
+        selector.close()
+
+    if proc.returncode != 0:
+        print("\nvLLM benchmark 异常退出，最近输出如下：")
+        meaningful = [ln for ln in captured if parse_progress(ln) is None]  # drop PROGRESS lines from the dump
+        for line in meaningful[-80:]:  # at most the last 80 remaining lines
+            print("  " + line.rstrip())
+        raise subprocess.CalledProcessError(proc.returncode, cmd)
+
+
+def run_benchmark(
+    args: argparse.Namespace,
+    dataset_path: Path,
+    result_path: Path,
+    mode_label: str,
+) -> None:
+    """Run pure_bench_serve.py against the running server for one mode.
+
+    The client is told to write its JSON result to ``result_path``; the child process
+    is supervised by run_command_with_heartbeat.
+    """
+    # Flag/value pairs, in the order they are placed on the command line.
+    options = {
+        "--backend": "openai-chat",
+        "--base-url": f"http://{args.client_host}:{args.port}",
+        "--endpoint": "/v1/chat/completions",
+        "--model": args.served_model_name,
+        "--tokenizer": args.tokenizer or args.model_path,
+        "--dataset-name": "burstgpt",
+        "--dataset-path": str(dataset_path),
+        "--num-prompts": str(args.num_prompts),
+        "--request-rate": rate_to_str(args.request_rate),
+        "--seed": str(args.seed),
+        "--burstiness": str(args.burstiness),
+        "--max-concurrency": str(args.max_concurrency),
+    }
+    cmd = [sys.executable, str(BENCH_SCRIPT)]
+    for flag, value in options.items():
+        cmd += [flag, value]
+    cmd += [
+        "--disable-tqdm",
+        "--save-result",
+        "--result-dir",
+        str(result_path.parent),
+        "--result-filename",
+        result_path.name,
+    ]
+    # Optional switches are forwarded only when set.
+    if args.save_detailed:
+        cmd.append("--save-detailed")
+    if args.temperature is not None:
+        cmd.extend(["--temperature", str(args.temperature)])
+    if args.ignore_eos:
+        cmd.append("--ignore-eos")
+
+    banner = f"服务启动完成，开始跑 BurstGPT / vLLM benchmark（optimized {mode_label}）"
+    print(paint(banner, "32;1"))
+    print_kv("请求数", args.num_prompts)
+    print_kv("请求速率", rate_to_str(args.request_rate))
+    print_kv("并发上限", args.max_concurrency)
+    print_kv("随机种子", args.seed)
+    print_kv("数据集", dataset_path)
+    if args.save_result_files:
+        result_note = result_path
+    else:
+        result_note = "不保留（使用临时文件读取结果）"
+    print_kv("结果文件", result_note)
+    print_kv("原始输出", "默认隐藏；需要调试时加 --show-vllm-output")
+    print("  开始压测。这一步可能持续较久，脚本会定时打印运行状态。")
+    run_command_with_heartbeat(cmd, PROJECT_ROOT, args, result_path)
+
+
+def load_result(result_path: Path) -> dict:
+    # Parse the UTF-8 JSON document stored at result_path.
+    return json.loads(result_path.read_text(encoding="utf-8"))
+
+
+def fmt(value: object, unit: str = "", spec: str = ".3f") -> str:
+    # None -> "N/A"; int/float -> format(value, spec) + unit; anything else -> str(value).
+    if value is None:
+        return "N/A"
+    is_number = isinstance(value, (int, float))
+    return format(value, spec) + unit if is_number else str(value)
+
+
+def print_run_summary(stats: dict) -> None:
+    # Headline metrics of a single run; the result-file row appears only when a path is recorded.
+    print("本轮结果：")
+    if stats.get("result_path"):
+        print_kv("结果文件", stats["result_path"])
+    rows = (("完成请求数", "completed", "", ".0f"), ("失败请求数", "failed", "", ".0f"),
+            ("请求吞吐", "request_throughput", " req/s", ".3f"), ("输出吞吐", "output_throughput", " tok/s", ".3f"),
+            ("平均 TTFT", "mean_ttft_ms", " ms", ".2f"), ("平均 TPOT", "mean_tpot_ms", " ms", ".2f"))
+    for label, key, unit, spec in rows:
+        print_kv(label, fmt(stats.get(key), unit, spec))
+
+
+def diff(on_value: object, off_value: object) -> float | None:
+    # Plain ON - OFF delta; None unless both operands are int/float.
+    numeric = isinstance(on_value, (int, float)) and isinstance(off_value, (int, float))
+    return on_value - off_value if numeric else None
+
+
+def speedup_pct(on_value: object, off_value: object, lower_is_better: bool) -> float | None:
+    # Percentage change of ON relative to OFF, signed so that positive means ON is better;
+    # None when either value is not int/float or the OFF baseline is zero.
+    for operand in (on_value, off_value):
+        if not isinstance(operand, (int, float)):
+            return None
+    gain = off_value - on_value if lower_is_better else on_value - off_value
+    return gain / off_value * 100 if off_value != 0 else None
+
+
+def print_comparison(results: list[dict]) -> None:
+    """Print the final ON-vs-OFF metric table, or one run's summary when a mode is missing."""
+    print("\n" + paint("#" * 78, "35;1"))
+    print(paint("最终对比（optimized ON vs OFF）", "35;1"))
+    print(paint("-" * 78, "35;1"))
+
+    def first_with_mode(mode: str) -> dict | None:
+        return next((entry for entry in results if entry["mode"] == mode), None)
+
+    on_r, off_r = first_with_mode("ON"), first_with_mode("OFF")
+    if not (on_r and off_r):
+        only = results[0]
+        print(f"只跑了 optimized {only['mode']}，没有生成 ON/OFF 对比。")
+        print_run_summary(only)
+        return
+
+    header = (
+        ljust_display("指标", 22)
+        + "ON".rjust(14)
+        + "OFF".rjust(14)
+        + "Δ (ON-OFF)".rjust(16)
+        + "ON 提升".rjust(12)
+    )
+    print(paint(header, "1"))
+    print("-" * 78)
+
+    def row(label: str, key: str, unit: str, spec: str = ".3f", lower_is_better: bool = True) -> None:
+        on_value, off_value = on_r.get(key), off_r.get(key)
+        pct = speedup_pct(on_value, off_value, lower_is_better)
+        pct_text = fmt(pct, "%", "+.2f").rjust(12)
+        if isinstance(pct, (int, float)):
+            # "32;1" for a non-negative change, "31;1" for a negative one.
+            pct_text = paint(pct_text, "32;1" if pct >= 0 else "31;1")
+        cells = (
+            ljust_display(label, 22),
+            fmt(on_value, unit, spec).rjust(14),
+            fmt(off_value, unit, spec).rjust(14),
+            fmt(diff(on_value, off_value), unit, "+" + spec).rjust(16),
+            pct_text,
+        )
+        print("".join(cells))
+
+    table = (
+        ("完成请求数", "completed", "", ".0f", False), ("失败请求数", "failed", "", ".0f", True),
+        ("总耗时", "duration", " s", ".2f", True),
+        ("请求吞吐", "request_throughput", " req/s", ".3f", False), ("输出吞吐", "output_throughput", " tok/s", ".3f", False),
+        ("Avg TTFT", "mean_ttft_ms", " ms", ".2f", True), ("Median TTFT", "median_ttft_ms", " ms", ".2f", True), ("P99 TTFT", "p99_ttft_ms", " ms", ".2f", True),
+        ("Avg TPOT", "mean_tpot_ms", " ms", ".2f", True), ("Median TPOT", "median_tpot_ms", " ms", ".2f", True), ("P99 TPOT", "p99_tpot_ms", " ms", ".2f", True),
+    )
+    for label, key, unit, spec, lower in table:
+        row(label, key, unit, spec, lower_is_better=lower)
+    print("-" * 78)
+    for tag, run in (("ON", on_r), ("OFF", off_r)):
+        if run.get("result_path"):
+            print_kv(f"{tag} 结果文件", run["result_path"])
+
+
+def benchmark_modes(args: argparse.Namespace) -> list[tuple[str, int]]:
+    # (label, chunk_size) pairs in run order; OFF always runs with chunk size 0.
+    off, on = ("OFF", 0), ("ON", args.chunk_size)
+    if args.modes in ("off", "on"):
+        return [off] if args.modes == "off" else [on]
+    return [off, on]
+
+
+def with_suffix(path: Path, suffix: str) -> Path:  # "dir/name.ext", "x" -> "dir/name_x.ext"
+    return path.with_name("_".join((path.stem, suffix)) + path.suffix)
+
+
+def default_result_path(args: argparse.Namespace, result_dir: Path) -> Path:
+    # The file name encodes the prompt count, the sanitized request rate and the concurrency cap.
+    rps = sanitize(rate_to_str(args.request_rate))
+    filename = f"vllm_burstgpt_{args.num_prompts}req_rps{rps}_mc{args.max_concurrency}.json"
+    return result_dir / filename
+
+
+def result_path_for_mode(args: argparse.Namespace, result_dir: Path, mode_label: str) -> Path:
+    stem_source = default_result_path(args, result_dir) if not args.result_filename else result_dir / args.result_filename
+    return with_suffix(stem_source, "chunk_" + mode_label.lower())
+
+
+def validate_args(args: argparse.Namespace, modes: list[tuple[str, int]]) -> None:  # exits via SystemExit on the first bad value
+    if args.num_prompts <= 0:
+        raise SystemExit("--num-prompts 必须大于 0。")
+    if args.max_concurrency <= 0:
+        raise SystemExit("--max-concurrency 必须大于 0。")
+    if min(args.max_request_tokens, args.max_response_tokens) <= 0:
+        raise SystemExit("--max-request-tokens / --max-response-tokens 必须大于 0。")
+    if args.chunk_size <= 0 and "ON" in [mode for mode, _ in modes]:
+        raise SystemExit("optimized ON 需要 --chunk-size > 0。")
+    if args.result_filename and not args.save_result_files:
+        print("  注意：未加 --save-result-files，--result-filename 只用于临时文件名，不会保留。")
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="对比 optimized 开/关时的 BurstGPT / vLLM benchmark serve 结果。")
+    add = parser.add_argument
+    # Workload, dataset and result-file options.
+    add("--model-path", default=DEFAULT_MODEL)
+    add("--tokenizer", default=None)
+    add("--dataset-path", default=DEFAULT_DATASET)
+    add("--filtered-dataset-path", default=None)
+    add("--use-full-dataset", action="store_true")
+    add("--max-request-tokens", type=int, default=1024)
+    add("--max-response-tokens", type=int, default=256)
+    add("--num-prompts", type=int, default=100)
+    add("--request-rate", type=parse_rate, default=2.0)
+    add("--seed", type=int, default=0)
+    add("--burstiness", type=float, default=1.0)
+    add("--max-concurrency", type=int, default=10)
+    add("--result-dir", default=DEFAULT_RESULT_DIR)
+    add("--result-filename", default=None)
+    add("--save-result-files", action="store_true", help="保留 vLLM JSON 结果文件；默认只在临时文件中读取结果，跑完不保留。")
+    add("--save-detailed", action=argparse.BooleanOptionalAction, default=False)
+    # Server launch, sampling and console-output options.
+    add("--cuda-visible-devices", default=os.environ.get("CUDA_VISIBLE_DEVICES", "0"))
+    add("--device", default="nvidia")
+    add("--backend", default="cpp", choices=["cpp", "python", "torch", "vllm"])
+    add("--server-host", default="127.0.0.1")
+    add("--client-host", default="127.0.0.1")
+    add("--port", type=int, default=2333)
+    add("--max-batch-size", type=int, default=32)
+    add("--server-max-new-tokens", type=int, default=256)
+    add("--enable-paged-attn", action=argparse.BooleanOptionalAction, default=True)
+    add("--enable-graph", action=argparse.BooleanOptionalAction, default=True)
+    add("--enable-chunk-prefill-graph", action=argparse.BooleanOptionalAction, default=True)
+    add("--chunk-size", type=int, default=256, help="optimized ON 时使用的 chunk-size；OFF 固定为 0。")
+    add("--modes", choices=["both", "off", "on"], default="both", help="默认 both：先 OFF 后 ON 并输出最终对比。")
+    add("--server-timeout", type=int, default=600)
+    add("--served-model-name", default="9g_8b_thinking")
+    add("--temperature", type=float, default=None)
+    add("--ignore-eos", action="store_true")
+    add("--progress-interval", type=int, default=30, help="压测运行中的中文心跳提示间隔，单位秒；设为 0 可关闭。")
+    add("--show-vllm-output", action="store_true", help="显示 vLLM benchmark 的原始英文输出。")
+    add("--show-server-output", action="store_true", help="直接显示 InfiniLM 服务端输出；默认隐藏且不保存日志。")
+    add("--color", choices=["auto", "always", "never"], default="auto", help="终端颜色输出。")
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Entry point: for each requested mode start the server, run the benchmark, stop the server.
+
+    Per-mode results are collected and handed to print_comparison at the end.
+    """
+    global USE_COLOR
+    args = parse_args()
+    if args.color == "always":
+        USE_COLOR = True
+    else:
+        USE_COLOR = args.color == "auto" and sys.stdout.isatty() and not os.environ.get("NO_COLOR")
+
+    modes = benchmark_modes(args)
+    validate_args(args, modes)
+
+    keep_files = args.save_result_files
+    result_dir = Path(args.result_dir)
+    if keep_files:
+        result_dir.mkdir(parents=True, exist_ok=True)
+    for banner in ("课题1：高性能统一智能计算架构及编译优化技术", "课题1.3：负载资源互感知编译优化", "优化技术效果对比测试"):
+        print_bar(banner)
+    print_kv("模型目录", args.model_path)
+    print_kv("请求数", args.num_prompts)
+    print_kv("请求速率", rate_to_str(args.request_rate))
+    print_kv("并发上限", args.max_concurrency)
+    print_kv("GPU", args.cuda_visible_devices)
+    print_kv("对比模式", " -> ".join(mode for mode, _ in modes))
+    print_kv("文件保存", "保存 JSON 结果" if keep_files else "不保存日志和结果文件")
+    print_kv("提示", "如果只是试通，可以加 --num-prompts 20 --request-rate inf --modes both")
+
+    dataset_path = count_and_filter_dataset(args)
+    results: list[dict] = []
+
+    # The temporary directory is created either way; it holds the JSON files only when they are not kept.
+    with tempfile.TemporaryDirectory(prefix="infinilm-burstgpt-") as tmpdir:
+        run_result_dir = result_dir if keep_files else Path(tmpdir)
+
+        for mode_label, chunk_size in modes:
+            result_path = result_path_for_mode(args, run_result_dir, mode_label)
+            popen: subprocess.Popen | None = None
+            try:
+                popen = start_server(args, chunk_size, mode_label)
+                run_benchmark(args, dataset_path, result_path, mode_label)
+                stats = load_result(result_path)
+                stats["mode"] = mode_label
+                stats["chunk_size"] = chunk_size
+                stats["result_path"] = str(result_path) if keep_files else ""
+                results.append(stats)
+                print_run_summary(stats)
+                headline = (
+                    f"完成 optimized {mode_label} 测试 -> "
+                    f"成功 {fmt(stats.get('completed'), '', '.0f')}，"
+                    f"失败 {fmt(stats.get('failed'), '', '.0f')}，"
+                    f"请求吞吐 {fmt(stats.get('request_throughput'), ' req/s', '.3f')}，"
+                    f"平均 TTFT {fmt(stats.get('mean_ttft_ms'), ' ms', '.2f')}"
+                )
+                print(paint(headline, "32;1"))
+            finally:
+                # Stop the server on success and on failure alike before the next mode starts.
+                stop_server(popen, host=args.client_host, port=args.port)
+                print("服务已停止")
+
+    print_comparison(results)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/test_chunk_prefill.py b/scripts/test_chunk_prefill.py
index 90d01b9e5..6df2d23eb 100644
--- a/scripts/test_chunk_prefill.py
+++ b/scripts/test_chunk_prefill.py
@@ -40,9 +40,9 @@ def wait_for_server(popen, host, port, model, timeout=300):
     raise TimeoutError(f"server not ready within {timeout}s")
 
 
-def inference_server(chunk_size, device, port, batch_size, max_new_tokens, enable_paged_attn, enable_graph, model_path):
+def inference_server(chunk_size, device, port, batch_size, max_new_tokens, enable_paged_attn, enable_graph, enable_chunk_prefill_graph, model_path):
     print(INFERENCE_SERVER)
-    args = ["CUDA_VISIBLE_DEVICES=12", sys.executable, INFERENCE_SERVER,
+    args = [f"CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', '0')}", sys.executable, INFERENCE_SERVER,
             f"--chunk-size {chunk_size}",
             f"--device {device}",
             f"--port {port}",
@@ -53,7 +53,8 @@ def inference_server(chunk_size, device, port, batch_size, max_new_tokens, enabl
         args.append("--enable-paged-attn")
     if enable_graph:
         args.append("--enable-graph")
-
+    if enable_chunk_prefill_graph:
+        args.append("--enable-chunk-prefill-graph")
     popen = subprocess.Popen(" ".join(args), shell=True, preexec_fn=os.setsid, stderr=subprocess.STDOUT)
     return popen
 
@@ -112,13 +113,14 @@ def grab(pat):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="比较 chunked prefill 开/关的 TTFT/E2E")
-    parser.add_argument("--device", type=str, default="iluvatar", help="设备类型")
+    parser.add_argument("--device", type=str, default="nvidia", help="设备类型")
     parser.add_argument("--batch-size", type=int, default=16)
     parser.add_argument("--max-new-tokens", type=int, default=16)
     parser.add_argument("--enable-paged-attn", type=bool, default=True)
     parser.add_argument("--enable-graph", type=bool, default=True)
+    parser.add_argument("--enable-chunk-prefill-graph", type=bool, default=True)
     parser.add_argument("--port", type=int, default=2333)
-    parser.add_argument("--model-path", type=str, default="/data-aisoft/mechdancer/models/9g_8b_thinking_llama/")
+    parser.add_argument("--model-path", type=str, default="/workspace/models/9g_8b_v2_thinking/9g_8b_thinking")
 
     
     args = parser.parse_args()
@@ -132,7 +134,8 @@ def grab(pat):
 
         server = inference_server(chunk_size=chunk_size, device=args.device, port=args.port,
                                 batch_size=args.batch_size, max_new_tokens=args.max_new_tokens,
-                                enable_paged_attn=args.enable_paged_attn, enable_graph=args.enable_graph,
+                                enable_paged_attn=args.enable_paged_attn, enable_graph=args.enable_graph, 
+                                enable_chunk_prefill_graph=args.enable_chunk_prefill_graph,
                                 model_path=args.model_path)
         try:
             wait_for_server(server, "127.0.0.1", args.port, model="FM9G-7B", timeout=300)

From 8f7d2e59917f0fd3f1e0b089c406f9b4715b929e Mon Sep 17 00:00:00 2001
From: huidesheng <1832140001@qq.com>
Date: Wed, 10 Jun 2026 12:29:39 +0000
Subject: [PATCH 9/9] add datasets

---
 scripts/datasets/BurstGPT/BurstGPT_1000.csv   | 1000 +++++++++++++++++
 ...BurstGPT_1000_gpt4_pos_req1024_resp256.csv |  111 ++
 scripts/test_burstgpt_bench.py                |    8 +-
 3 files changed, 1115 insertions(+), 4 deletions(-)
 create mode 100644 scripts/datasets/BurstGPT/BurstGPT_1000.csv
 create mode 100644 scripts/datasets/BurstGPT/BurstGPT_1000_gpt4_pos_req1024_resp256.csv

diff --git a/scripts/datasets/BurstGPT/BurstGPT_1000.csv b/scripts/datasets/BurstGPT/BurstGPT_1000.csv
new file mode 100644
index 000000000..8dc0e7a06
--- /dev/null
+++ b/scripts/datasets/BurstGPT/BurstGPT_1000.csv
@@ -0,0 +1,1000 @@
+Timestamp,Model,Request tokens,Response tokens,Total tokens,Log Type
+5,ChatGPT,472,18,490,Conversation log
+45,ChatGPT,1087,230,1317,Conversation log
+118,GPT-4,417,276,693,Conversation log
+185,ChatGPT,1360,647,2007,Conversation log
+214,ChatGPT,185,215,400,Conversation log
+233,GPT-4,586,293,879,Conversation log
+261,ChatGPT,37,1656,1693,Conversation log
+267,ChatGPT,54,503,557,Conversation log
+410,ChatGPT,1528,414,1942,Conversation log
+535,ChatGPT,89,370,459,Conversation log
+560,GPT-4,549,362,911,Conversation log
+638,ChatGPT,172,69,241,Conversation log
+686,GPT-4,969,206,1175,Conversation log
+741,ChatGPT,97,137,234,Conversation log
+771,GPT-4,1574,501,2075,Conversation log
+821,ChatGPT,253,239,492,Conversation log
+966,ChatGPT,23,344,367,Conversation log
+974,ChatGPT,509,207,716,Conversation log
+1028,ChatGPT,387,349,736,Conversation log
+1072,ChatGPT,733,180,913,Conversation log
+1106,ChatGPT,0,0,0,Conversation log
+1129,ChatGPT,1305,477,1782,Conversation log
+1278,ChatGPT,17,382,399,Conversation log
+1449,ChatGPT,601,557,1158,Conversation log
+1611,GPT-4,1327,239,1566,Conversation log
+1780,ChatGPT,1437,181,1618,Conversation log
+2069,ChatGPT,1336,245,1581,Conversation log
+2173,ChatGPT,19,440,459,Conversation log
+2273,GPT-4,92,95,187,Conversation log
+2293,ChatGPT,507,448,955,Conversation log
+2579,ChatGPT,938,263,1201,Conversation log
+2584,ChatGPT,1544,608,2152,Conversation log
+2608,GPT-4,1840,378,2218,Conversation log
+2684,ChatGPT,991,462,1453,Conversation log
+2826,ChatGPT,1109,329,1438,Conversation log
+3009,ChatGPT,1476,443,1919,Conversation log
+3091,ChatGPT,14,11,25,Conversation log
+3232,GPT-4,1741,428,2169,Conversation log
+4525,GPT-4,0,0,0,Conversation log
+4574,ChatGPT,1306,267,1573,Conversation log
+4608,GPT-4,0,0,0,Conversation log
+4725,GPT-4,0,0,0,Conversation log
+4798,GPT-4,614,323,937,Conversation log
+5812,ChatGPT,1960,136,2096,Conversation log
+6049,ChatGPT,1698,627,2325,Conversation log
+6263,ChatGPT,1907,616,2523,Conversation log
+6499,ChatGPT,137,225,362,Conversation log
+6882,GPT-4,20,86,106,Conversation log
+6998,GPT-4,127,268,395,Conversation log
+7442,GPT-4,665,167,832,Conversation log
+7982,GPT-4,855,297,1152,Conversation log
+8052,GPT-4,1191,30,1221,Conversation log
+8126,GPT-4,18,67,85,Conversation log
+8208,GPT-4,101,148,249,Conversation log
+8211,ChatGPT,112,336,448,Conversation log
+8258,GPT-4,264,76,340,Conversation log
+8681,ChatGPT,82,343,425,Conversation log
+8782,ChatGPT,506,343,849,Conversation log
+8989,ChatGPT,530,339,869,Conversation log
+9015,ChatGPT,942,396,1338,Conversation log
+9418,ChatGPT,1364,416,1780,Conversation log
+10205,ChatGPT,122,496,618,Conversation log
+10414,ChatGPT,24,63,87,Conversation log
+10443,ChatGPT,698,258,956,Conversation log
+10577,ChatGPT,103,76,179,Conversation log
+10600,ChatGPT,1073,129,1202,Conversation log
+10603,ChatGPT,200,95,295,Conversation log
+10636,ChatGPT,1282,188,1470,Conversation log
+10669,ChatGPT,1762,418,2180,Conversation log
+10740,ChatGPT,1625,268,1893,Conversation log
+10760,ChatGPT,25,326,351,Conversation log
+10760,ChatGPT,1639,188,1827,Conversation log
+10812,ChatGPT,42,197,239,Conversation log
+10825,ChatGPT,1637,231,1868,Conversation log
+10949,ChatGPT,1701,381,2082,Conversation log
+10995,ChatGPT,383,319,702,Conversation log
+11096,ChatGPT,266,246,512,Conversation log
+11411,ChatGPT,1450,497,1947,Conversation log
+11517,ChatGPT,1631,422,2053,Conversation log
+11539,ChatGPT,540,285,825,Conversation log
+11642,ChatGPT,1797,577,2374,Conversation log
+11738,ChatGPT,0,0,0,Conversation log
+11817,ChatGPT,20,310,330,Conversation log
+11853,ChatGPT,352,298,650,Conversation log
+12560,ChatGPT,726,334,1060,Conversation log
+14618,ChatGPT,870,269,1139,Conversation log
+19020,ChatGPT,66,342,408,Conversation log
+19108,ChatGPT,478,296,774,Conversation log
+19499,ChatGPT,837,292,1129,Conversation log
+19601,ChatGPT,1152,237,1389,Conversation log
+20288,ChatGPT,42,17,59,Conversation log
+26939,ChatGPT,29,69,98,Conversation log
+27037,ChatGPT,136,74,210,Conversation log
+27162,ChatGPT,229,69,298,Conversation log
+27307,ChatGPT,326,66,392,Conversation log
+27894,ChatGPT,408,322,730,Conversation log
+27948,ChatGPT,667,79,746,Conversation log
+28389,ChatGPT,657,192,849,Conversation log
+28436,ChatGPT,788,292,1080,Conversation log
+28571,ChatGPT,1035,227,1262,Conversation log
+28765,GPT-4,930,83,1013,API log
+28829,GPT-4,929,80,1009,API log
+29450,ChatGPT,36,14,50,Conversation log
+29549,ChatGPT,103,23,126,Conversation log
+29568,ChatGPT,209,242,451,Conversation log
+29648,ChatGPT,505,166,671,Conversation log
+29822,ChatGPT,166,34,200,Conversation log
+29832,ChatGPT,19,326,345,Conversation log
+29848,ChatGPT,239,30,269,Conversation log
+29870,ChatGPT,357,559,916,Conversation log
+29907,ChatGPT,320,28,348,Conversation log
+29987,ChatGPT,333,35,368,Conversation log
+29992,ChatGPT,342,11,353,Conversation log
+30024,ChatGPT,353,62,415,Conversation log
+30096,ChatGPT,26,205,231,Conversation log
+30324,ChatGPT,950,67,1017,Conversation log
+30342,ChatGPT,249,229,478,Conversation log
+30348,ChatGPT,926,73,999,Conversation log
+30368,ChatGPT,495,270,765,Conversation log
+30371,ChatGPT,807,73,880,Conversation log
+30393,ChatGPT,587,96,683,Conversation log
+30402,ChatGPT,0,0,0,Conversation log
+30414,ChatGPT,0,0,0,Conversation log
+30418,ChatGPT,0,0,0,Conversation log
+30422,ChatGPT,1539,82,1621,Conversation log
+30424,ChatGPT,428,194,622,Conversation log
+30476,ChatGPT,434,1018,1452,Conversation log
+30528,ChatGPT,561,79,640,Conversation log
+30601,ChatGPT,565,90,655,Conversation log
+30612,ChatGPT,31,144,175,Conversation log
+30649,ChatGPT,21,16,37,Conversation log
+30654,ChatGPT,578,300,878,Conversation log
+30677,ChatGPT,68,232,300,Conversation log
+30795,ChatGPT,14,423,437,Conversation log
+30852,ChatGPT,839,465,1304,Conversation log
+30878,ChatGPT,447,381,828,Conversation log
+30899,ChatGPT,838,436,1274,Conversation log
+31163,ChatGPT,1107,288,1395,Conversation log
+31244,ChatGPT,1306,245,1551,Conversation log
+31293,ChatGPT,1458,366,1824,Conversation log
+31390,ChatGPT,1524,427,1951,Conversation log
+31458,ChatGPT,1435,462,1897,Conversation log
+31790,ChatGPT,1618,161,1779,Conversation log
+31810,ChatGPT,1536,208,1744,Conversation log
+31826,ChatGPT,1385,150,1535,Conversation log
+31841,ChatGPT,1113,197,1310,Conversation log
+31906,ChatGPT,109,1413,1522,Conversation log
+31985,ChatGPT,45,972,1017,Conversation log
+31994,ChatGPT,36,307,343,Conversation log
+32004,GPT-4,380,210,590,Conversation log
+32034,ChatGPT,28,62,90,Conversation log
+32119,ChatGPT,42,954,996,Conversation log
+32185,ChatGPT,246,130,376,Conversation log
+32247,ChatGPT,395,787,1182,Conversation log
+32311,ChatGPT,1254,312,1566,Conversation log
+32387,ChatGPT,840,246,1086,Conversation log
+32488,ChatGPT,1169,845,2014,Conversation log
+32561,ChatGPT,1304,343,1647,Conversation log
+32582,ChatGPT,19,56,75,Conversation log
+32599,ChatGPT,92,252,344,Conversation log
+32601,ChatGPT,69,381,450,Conversation log
+32625,ChatGPT,10,318,328,Conversation log
+32656,ChatGPT,531,40,571,Conversation log
+32663,ChatGPT,23,426,449,Conversation log
+32674,ChatGPT,49,11,60,Conversation log
+32678,ChatGPT,40,541,581,Conversation log
+32680,GPT-4,99,470,569,Conversation log
+32698,ChatGPT,134,55,189,Conversation log
+32820,ChatGPT,42,1362,1404,Conversation log
+32915,ChatGPT,1877,455,2332,Conversation log
+32946,ChatGPT,12,391,403,Conversation log
+33017,ChatGPT,302,119,421,Conversation log
+33035,ChatGPT,447,639,1086,Conversation log
+33042,ChatGPT,10,3,13,Conversation log
+33068,ChatGPT,26,67,93,Conversation log
+33072,ChatGPT,103,16,119,Conversation log
+33099,ChatGPT,0,0,0,Conversation log
+33111,ChatGPT,1737,300,2037,Conversation log
+33206,ChatGPT,22,416,438,Conversation log
+33312,ChatGPT,111,1158,1269,Conversation log
+33313,ChatGPT,100,104,204,Conversation log
+33324,ChatGPT,1416,586,2002,Conversation log
+33371,ChatGPT,39,1238,1277,Conversation log
+33407,ChatGPT,22,287,309,Conversation log
+33411,ChatGPT,13,4,17,Conversation log
+33413,ChatGPT,405,256,661,Conversation log
+33437,ChatGPT,38,19,57,Conversation log
+33454,ChatGPT,708,44,752,Conversation log
+33481,ChatGPT,963,109,1072,Conversation log
+33509,ChatGPT,1276,266,1542,Conversation log
+33510,ChatGPT,1065,524,1589,Conversation log
+33527,ChatGPT,271,54,325,Conversation log
+33581,ChatGPT,1314,1903,3217,Conversation log
+33602,ChatGPT,337,356,693,Conversation log
+33632,ChatGPT,123,261,384,Conversation log
+33632,ChatGPT,387,58,445,Conversation log
+33637,ChatGPT,102,36,138,Conversation log
+33696,ChatGPT,854,350,1204,Conversation log
+33747,ChatGPT,522,65,587,Conversation log
+33784,ChatGPT,916,484,1400,Conversation log
+33810,ChatGPT,603,6,609,Conversation log
+33850,ChatGPT,628,495,1123,Conversation log
+33866,ChatGPT,31,26,57,Conversation log
+34015,ChatGPT,16,421,437,Conversation log
+34016,ChatGPT,1640,664,2304,Conversation log
+34035,ChatGPT,1232,438,1670,Conversation log
+34050,ChatGPT,489,92,581,Conversation log
+34068,ChatGPT,40,1092,1132,Conversation log
+34069,ChatGPT,461,496,957,Conversation log
+34076,ChatGPT,362,155,517,Conversation log
+34087,ChatGPT,149,3,152,Conversation log
+34100,ChatGPT,1540,35,1575,Conversation log
+34102,GPT-4,974,513,1487,Conversation log
+34105,ChatGPT,541,201,742,Conversation log
+34117,ChatGPT,954,51,1005,Conversation log
+34126,ChatGPT,930,92,1022,Conversation log
+34129,ChatGPT,34,15,49,Conversation log
+34139,ChatGPT,1169,2019,3188,Conversation log
+34146,GPT-4,1512,368,1880,Conversation log
+34173,ChatGPT,1994,570,2564,Conversation log
+34173,ChatGPT,489,338,827,Conversation log
+34193,ChatGPT,72,19,91,Conversation log
+34246,ChatGPT,1692,401,2093,Conversation log
+34257,ChatGPT,849,5,854,Conversation log
+34271,ChatGPT,490,27,517,Conversation log
+34273,ChatGPT,205,144,349,Conversation log
+34278,ChatGPT,1390,260,1650,Conversation log
+34291,ChatGPT,547,302,849,Conversation log
+34294,GPT-4,32,262,294,Conversation log
+34320,ChatGPT,1021,335,1356,Conversation log
+34323,ChatGPT,122,376,498,Conversation log
+34347,ChatGPT,1844,233,2077,Conversation log
+34352,ChatGPT,98,33,131,Conversation log
+34384,ChatGPT,436,31,467,Conversation log
+34405,ChatGPT,30,294,324,Conversation log
+34420,ChatGPT,55,34,89,Conversation log
+34444,ChatGPT,16,199,215,Conversation log
+34525,ChatGPT,704,331,1035,Conversation log
+34534,ChatGPT,71,731,802,Conversation log
+34542,ChatGPT,47,32,79,Conversation log
+34560,ChatGPT,1298,630,1928,Conversation log
+34601,GPT-4,15,115,130,Conversation log
+34615,GPT-4,145,314,459,Conversation log
+34640,ChatGPT,0,0,0,Conversation log
+34680,GPT-4,477,297,774,Conversation log
+34746,GPT-4,189,253,442,Conversation log
+34758,ChatGPT,418,95,513,Conversation log
+34770,GPT-4,585,26,611,Conversation log
+34802,GPT-4,757,211,968,Conversation log
+34818,ChatGPT,705,299,1004,Conversation log
+34824,GPT-4,203,200,403,Conversation log
+34832,ChatGPT,17,114,131,Conversation log
+34868,ChatGPT,22,29,51,Conversation log
+34876,ChatGPT,210,416,626,Conversation log
+34890,GPT-4,978,197,1175,Conversation log
+34896,GPT-4,188,181,369,Conversation log
+34911,GPT-4,226,362,588,Conversation log
+34919,ChatGPT,16,18,34,Conversation log
+34928,ChatGPT,45,14,59,Conversation log
+34943,GPT-4,163,19,182,Conversation log
+34976,ChatGPT,0,0,0,Conversation log
+34986,ChatGPT,1258,384,1642,Conversation log
+34988,ChatGPT,1956,353,2309,Conversation log
+34995,GPT-4,389,221,610,Conversation log
+35016,ChatGPT,1046,395,1441,Conversation log
+35056,ChatGPT,867,30,897,Conversation log
+35087,ChatGPT,145,618,763,Conversation log
+35096,GPT-4,603,339,942,Conversation log
+35132,GPT-4,19,380,399,Conversation log
+35134,ChatGPT,870,80,950,Conversation log
+35138,ChatGPT,72,13,85,Conversation log
+35142,ChatGPT,963,90,1053,Conversation log
+35149,GPT-4,33,82,115,Conversation log
+35156,GPT-4,342,231,573,Conversation log
+35161,GPT-4,161,22,183,Conversation log
+35168,GPT-4,419,385,804,Conversation log
+35178,ChatGPT,244,149,393,Conversation log
+35184,ChatGPT,66,36,102,Conversation log
+35220,ChatGPT,162,14,176,Conversation log
+35227,GPT-4,210,17,227,Conversation log
+35235,ChatGPT,862,214,1076,Conversation log
+35244,ChatGPT,263,17,280,Conversation log
+35246,ChatGPT,168,35,203,Conversation log
+35258,ChatGPT,1122,59,1181,Conversation log
+35260,ChatGPT,1036,13,1049,Conversation log
+35266,ChatGPT,353,15,368,Conversation log
+35293,ChatGPT,354,342,696,Conversation log
+35319,ChatGPT,302,887,1189,Conversation log
+35369,ChatGPT,723,541,1264,Conversation log
+35370,ChatGPT,1202,157,1359,Conversation log
+35374,GPT-4,466,60,526,Conversation log
+35403,ChatGPT,217,241,458,Conversation log
+35410,ChatGPT,25,57,82,Conversation log
+35415,ChatGPT,1360,702,2062,Conversation log
+35426,ChatGPT,70,82,152,Conversation log
+35429,ChatGPT,10,111,121,Conversation log
+35453,ChatGPT,140,98,238,Conversation log
+35459,ChatGPT,20,556,576,Conversation log
+35520,ChatGPT,590,93,683,Conversation log
+35541,ChatGPT,308,151,459,Conversation log
+35576,GPT-4,58,26,84,Conversation log
+35611,ChatGPT,35,12,47,Conversation log
+35647,ChatGPT,74,19,93,Conversation log
+35746,ChatGPT,15,38,53,Conversation log
+35761,ChatGPT,0,0,0,Conversation log
+35776,ChatGPT,396,333,729,Conversation log
+35788,ChatGPT,247,506,753,Conversation log
+35796,ChatGPT,757,356,1113,Conversation log
+35816,GPT-4,45,32,77,Conversation log
+35837,GPT-4,273,25,298,Conversation log
+35861,ChatGPT,274,442,716,Conversation log
+35876,ChatGPT,1130,812,1942,Conversation log
+35894,GPT-4,820,26,846,Conversation log
+35898,GPT-4,788,188,976,Conversation log
+35921,ChatGPT,18,412,430,Conversation log
+35954,ChatGPT,179,329,508,Conversation log
+35975,ChatGPT,19,38,57,Conversation log
+36021,ChatGPT,23,27,50,Conversation log
+36025,ChatGPT,1286,16,1302,Conversation log
+36043,ChatGPT,535,297,832,Conversation log
+36055,ChatGPT,17,7,24,Conversation log
+36069,ChatGPT,36,24,60,Conversation log
+36074,ChatGPT,992,15,1007,Conversation log
+36097,GPT-4,695,16,711,Conversation log
+36114,ChatGPT,36,238,274,Conversation log
+36116,GPT-4,146,299,445,Conversation log
+36121,GPT-4,149,256,405,Conversation log
+36165,ChatGPT,73,273,346,Conversation log
+36173,GPT-4,985,74,1059,Conversation log
+36195,ChatGPT,18,457,475,Conversation log
+36246,GPT-4,1958,737,2695,Conversation log
+36282,GPT-4,36,505,541,Conversation log
+36286,ChatGPT,66,17,83,Conversation log
+36308,ChatGPT,88,300,388,Conversation log
+36309,ChatGPT,203,285,488,Conversation log
+36343,ChatGPT,232,514,746,Conversation log
+36352,GPT-4,74,404,478,Conversation log
+36362,ChatGPT,537,249,786,Conversation log
+36371,ChatGPT,0,0,0,Conversation log
+36385,GPT-4,567,416,983,Conversation log
+36435,ChatGPT,14,203,217,Conversation log
+36447,GPT-4,995,454,1449,Conversation log
+36471,ChatGPT,836,701,1537,Conversation log
+36479,ChatGPT,81,999,1080,Conversation log
+36516,ChatGPT,477,163,640,Conversation log
+36591,ChatGPT,1098,721,1819,Conversation log
+36607,GPT-4,29,2,31,Conversation log
+36617,ChatGPT,24,14,38,Conversation log
+36626,GPT-4,1475,485,1960,Conversation log
+36638,GPT-4,1025,405,1430,Conversation log
+36646,GPT-4,391,25,416,Conversation log
+36647,ChatGPT,48,11,59,Conversation log
+36660,ChatGPT,69,14,83,Conversation log
+36665,ChatGPT,36,370,406,Conversation log
+36673,GPT-4,347,147,494,Conversation log
+36700,ChatGPT,1847,1362,3209,Conversation log
+36701,GPT-4,510,599,1109,Conversation log
+36716,ChatGPT,20,606,626,Conversation log
+36738,ChatGPT,20,171,191,Conversation log
+36754,ChatGPT,0,0,0,Conversation log
+36758,ChatGPT,0,0,0,Conversation log
+36789,ChatGPT,26,23,49,Conversation log
+36791,ChatGPT,299,20,319,Conversation log
+36804,GPT-4,1995,663,2658,Conversation log
+36805,ChatGPT,59,31,90,Conversation log
+36836,ChatGPT,1365,214,1579,Conversation log
+36850,ChatGPT,1595,354,1949,Conversation log
+36868,GPT-4,500,238,738,Conversation log
+36883,ChatGPT,51,1442,1493,Conversation log
+36915,GPT-4,2126,379,2505,Conversation log
+36922,ChatGPT,33,81,114,Conversation log
+36968,ChatGPT,79,59,138,Conversation log
+36989,GPT-4,145,45,190,Conversation log
+36990,ChatGPT,118,24,142,Conversation log
+36993,ChatGPT,222,43,265,Conversation log
+36997,ChatGPT,21,387,408,Conversation log
+37000,GPT-4,752,280,1032,Conversation log
+37006,ChatGPT,158,104,262,Conversation log
+37008,ChatGPT,170,14,184,Conversation log
+37053,ChatGPT,290,90,380,Conversation log
+37081,ChatGPT,601,327,928,Conversation log
+37106,GPT-4,29,213,242,Conversation log
+37117,GPT-4,112,110,222,Conversation log
+37140,ChatGPT,50,962,1012,Conversation log
+37148,ChatGPT,733,311,1044,Conversation log
+37178,ChatGPT,9,9,18,Conversation log
+37182,ChatGPT,40,11,51,Conversation log
+37183,GPT-4,41,552,593,Conversation log
+37189,ChatGPT,1544,1421,2965,Conversation log
+37198,GPT-4,28,231,259,Conversation log
+37201,ChatGPT,24,443,467,Conversation log
+37218,ChatGPT,657,1011,1668,Conversation log
+37228,ChatGPT,60,46,106,Conversation log
+37235,GPT-4,275,225,500,Conversation log
+37236,ChatGPT,0,0,0,Conversation log
+37243,ChatGPT,0,0,0,Conversation log
+37255,ChatGPT,50,1143,1193,Conversation log
+37272,GPT-4,251,23,274,Conversation log
+37274,GPT-4,18,70,88,Conversation log
+37274,ChatGPT,67,58,125,Conversation log
+37277,GPT-4,281,239,520,Conversation log
+37291,ChatGPT,1061,1161,2222,Conversation log
+37314,ChatGPT,484,308,792,Conversation log
+37316,ChatGPT,126,43,169,Conversation log
+37337,ChatGPT,723,170,893,Conversation log
+37340,ChatGPT,50,1265,1315,Conversation log
+37342,ChatGPT,967,357,1324,Conversation log
+37376,GPT-4,106,329,435,Conversation log
+37378,ChatGPT,815,717,1532,Conversation log
+37380,ChatGPT,301,28,329,Conversation log
+37389,ChatGPT,1364,1331,2695,Conversation log
+37408,GPT-4,449,192,641,Conversation log
+37442,GPT-4,96,260,356,Conversation log
+37454,ChatGPT,757,351,1108,Conversation log
+37481,ChatGPT,1571,434,2005,Conversation log
+37538,GPT-4,690,170,860,Conversation log
+37548,ChatGPT,502,234,736,Conversation log
+37573,ChatGPT,71,40,111,Conversation log
+37676,ChatGPT,312,204,516,Conversation log
+37785,ChatGPT,19,320,339,Conversation log
+37788,ChatGPT,401,102,503,Conversation log
+37806,GPT-4,248,53,301,Conversation log
+37808,GPT-4,386,17,403,Conversation log
+37825,ChatGPT,27,18,45,Conversation log
+37831,GPT-4,749,344,1093,Conversation log
+37844,ChatGPT,19,232,251,Conversation log
+37847,ChatGPT,28,53,81,Conversation log
+37862,ChatGPT,0,0,0,Conversation log
+37868,GPT-4,27,209,236,Conversation log
+37873,ChatGPT,79,62,141,Conversation log
+37879,ChatGPT,36,513,549,Conversation log
+37879,ChatGPT,268,32,300,Conversation log
+37886,GPT-4,244,65,309,Conversation log
+37900,GPT-4,0,0,0,Conversation log
+37914,GPT-4,91,63,154,Conversation log
+37929,GPT-4,170,42,212,Conversation log
+37982,GPT-4,258,171,429,Conversation log
+37997,ChatGPT,40,2048,2088,Conversation log
+38017,ChatGPT,989,206,1195,Conversation log
+38019,GPT-4,536,236,772,Conversation log
+38028,ChatGPT,580,56,636,Conversation log
+38030,ChatGPT,815,433,1248,Conversation log
+38041,GPT-4,359,22,381,Conversation log
+38042,ChatGPT,35,452,487,Conversation log
+38054,ChatGPT,524,716,1240,Conversation log
+38062,ChatGPT,1275,694,1969,Conversation log
+38069,GPT-4,472,213,685,Conversation log
+38070,GPT-4,437,31,468,Conversation log
+38073,ChatGPT,1130,519,1649,Conversation log
+38074,GPT-4,19,307,326,Conversation log
+38091,ChatGPT,1263,707,1970,Conversation log
+38104,ChatGPT,13,68,81,Conversation log
+38105,ChatGPT,374,827,1201,Conversation log
+38114,ChatGPT,100,8,108,Conversation log
+38151,GPT-4,338,71,409,Conversation log
+38172,GPT-4,426,369,795,Conversation log
+38179,ChatGPT,1965,516,2481,Conversation log
+38215,ChatGPT,114,366,480,Conversation log
+38235,GPT-4,517,27,544,Conversation log
+38256,GPT-4,46,462,508,Conversation log
+38269,ChatGPT,63,388,451,Conversation log
+38309,ChatGPT,1299,267,1566,Conversation log
+38318,ChatGPT,355,456,811,Conversation log
+38321,GPT-4,1422,348,1770,Conversation log
+38345,ChatGPT,492,536,1028,Conversation log
+38354,GPT-4,599,112,711,Conversation log
+38371,ChatGPT,1676,559,2235,Conversation log
+38427,ChatGPT,72,15,87,Conversation log
+38449,GPT-4,13,85,98,Conversation log
+38457,ChatGPT,157,11,168,Conversation log
+38467,ChatGPT,32,176,208,Conversation log
+38479,ChatGPT,255,17,272,Conversation log
+38492,ChatGPT,353,19,372,Conversation log
+38501,ChatGPT,454,14,468,Conversation log
+38513,ChatGPT,288,252,540,Conversation log
+38515,GPT-4,668,473,1141,Conversation log
+38525,ChatGPT,321,851,1172,Conversation log
+38591,ChatGPT,516,647,1163,Conversation log
+38600,GPT-4,22,242,264,Conversation log
+38617,GPT-4,52,180,232,Conversation log
+38620,ChatGPT,86,577,663,Conversation log
+38649,ChatGPT,36,19,55,Conversation log
+38680,GPT-4,86,411,497,Conversation log
+38706,GPT-4,491,179,670,Conversation log
+38720,ChatGPT,2016,182,2198,Conversation log
+38722,GPT-4,10,85,95,Conversation log
+38751,ChatGPT,1276,213,1489,Conversation log
+38769,GPT-4,122,10,132,Conversation log
+38793,ChatGPT,1042,20,1062,Conversation log
+38798,GPT-4,68,427,495,Conversation log
+38837,ChatGPT,1180,524,1704,Conversation log
+38837,GPT-4,99,112,211,Conversation log
+38916,ChatGPT,1711,220,1931,Conversation log
+38918,GPT-4,37,558,595,Conversation log
+38920,GPT-4,242,94,336,Conversation log
+38928,GPT-4,169,18,187,Conversation log
+38932,ChatGPT,1269,87,1356,Conversation log
+38937,GPT-4,764,225,989,Conversation log
+38958,GPT-4,384,231,615,Conversation log
+38970,ChatGPT,1912,99,2011,Conversation log
+38975,ChatGPT,47,600,647,Conversation log
+38977,GPT-4,717,133,850,Conversation log
+38989,GPT-4,81,420,501,Conversation log
+39036,GPT-4,648,377,1025,Conversation log
+39058,ChatGPT,1462,175,1637,Conversation log
+39073,ChatGPT,150,331,481,Conversation log
+39090,ChatGPT,0,0,0,Conversation log
+39094,ChatGPT,1462,420,1882,Conversation log
+39112,GPT-4,1069,366,1435,Conversation log
+39143,ChatGPT,1030,461,1491,Conversation log
+39148,ChatGPT,1450,483,1933,Conversation log
+39169,ChatGPT,1317,191,1508,Conversation log
+39179,ChatGPT,529,48,577,Conversation log
+39183,ChatGPT,183,72,255,Conversation log
+39207,ChatGPT,2039,571,2610,Conversation log
+39223,ChatGPT,1286,444,1730,Conversation log
+39248,ChatGPT,1382,288,1670,Conversation log
+39262,ChatGPT,1387,131,1518,Conversation log
+39268,GPT-4,550,394,944,Conversation log
+39281,ChatGPT,21,16,37,Conversation log
+39324,ChatGPT,0,0,0,Conversation log
+39333,ChatGPT,361,721,1082,Conversation log
+39377,ChatGPT,1278,477,1755,Conversation log
+39439,GPT-4,328,8,336,Conversation log
+39454,ChatGPT,16,105,121,Conversation log
+39457,ChatGPT,1476,146,1622,Conversation log
+39467,GPT-4,995,517,1512,Conversation log
+39478,ChatGPT,139,224,363,Conversation log
+39501,ChatGPT,13,357,370,Conversation log
+39512,ChatGPT,59,81,140,Conversation log
+39527,ChatGPT,387,475,862,Conversation log
+39530,GPT-4,0,0,0,Conversation log
+39558,ChatGPT,21,109,130,Conversation log
+39571,ChatGPT,132,68,200,Conversation log
+39575,ChatGPT,12,260,272,Conversation log
+39582,ChatGPT,500,556,1056,Conversation log
+39606,ChatGPT,893,377,1270,Conversation log
+39621,ChatGPT,79,31,110,Conversation log
+39634,ChatGPT,119,48,167,Conversation log
+39639,GPT-4,58,5,63,Conversation log
+39643,GPT-4,0,0,0,Conversation log
+39650,GPT-4,117,5,122,Conversation log
+39658,GPT-4,288,178,466,Conversation log
+39658,ChatGPT,1590,556,2146,Conversation log
+39677,ChatGPT,14,15,29,Conversation log
+39686,ChatGPT,1308,624,1932,Conversation log
+39687,ChatGPT,67,44,111,Conversation log
+39694,GPT-4,137,162,299,Conversation log
+39698,GPT-4,16,272,288,Conversation log
+39714,GPT-4,0,0,0,Conversation log
+39725,GPT-4,15,104,119,Conversation log
+39751,ChatGPT,1521,973,2494,Conversation log
+39762,ChatGPT,1058,22,1080,Conversation log
+39764,ChatGPT,194,177,371,Conversation log
+39765,ChatGPT,18,239,257,Conversation log
+39780,GPT-4,560,264,824,Conversation log
+39795,GPT-4,10,64,74,Conversation log
+39818,ChatGPT,1642,1046,2688,Conversation log
+39845,ChatGPT,0,0,0,Conversation log
+39847,GPT-4,224,294,518,Conversation log
+39852,ChatGPT,212,97,309,Conversation log
+39861,ChatGPT,1866,717,2583,Conversation log
+39864,ChatGPT,155,200,355,Conversation log
+39877,ChatGPT,247,216,463,Conversation log
+39884,GPT-4,29,174,203,Conversation log
+39903,GPT-4,654,49,703,Conversation log
+39934,GPT-4,954,250,1204,Conversation log
+39939,GPT-4,727,254,981,Conversation log
+39962,GPT-4,0,0,0,Conversation log
+39966,GPT-4,0,0,0,Conversation log
+39967,ChatGPT,19,499,518,Conversation log
+39985,GPT-4,0,0,0,Conversation log
+40015,ChatGPT,0,0,0,Conversation log
+40021,ChatGPT,45,31,76,Conversation log
+40027,ChatGPT,0,0,0,Conversation log
+40031,ChatGPT,1770,194,1964,Conversation log
+40049,GPT-4,301,133,434,Conversation log
+40055,ChatGPT,76,442,518,Conversation log
+40071,GPT-4,0,0,0,Conversation log
+40108,GPT-4,0,0,0,Conversation log
+40110,ChatGPT,1779,549,2328,Conversation log
+40112,ChatGPT,478,726,1204,Conversation log
+40151,ChatGPT,0,0,0,Conversation log
+40166,ChatGPT,1925,323,2248,Conversation log
+40170,ChatGPT,23,336,359,Conversation log
+40183,ChatGPT,1532,335,1867,Conversation log
+40185,ChatGPT,19,249,268,Conversation log
+40207,ChatGPT,372,354,726,Conversation log
+40241,GPT-4,0,0,0,Conversation log
+40255,ChatGPT,14,54,68,Conversation log
+40274,ChatGPT,1472,234,1706,Conversation log
+40279,ChatGPT,26,530,556,Conversation log
+40303,ChatGPT,1421,520,1941,Conversation log
+40314,GPT-4,945,79,1024,Conversation log
+40336,ChatGPT,1486,257,1743,Conversation log
+40337,GPT-4,773,367,1140,Conversation log
+40361,ChatGPT,1260,268,1528,Conversation log
+40417,ChatGPT,1215,185,1400,Conversation log
+40422,ChatGPT,28,302,330,Conversation log
+40454,ChatGPT,949,208,1157,Conversation log
+40471,GPT-4,216,198,414,Conversation log
+40477,ChatGPT,20,418,438,Conversation log
+40503,ChatGPT,96,535,631,Conversation log
+40525,ChatGPT,0,0,0,Conversation log
+40545,ChatGPT,1509,344,1853,Conversation log
+40558,GPT-4,359,380,739,Conversation log
+40568,ChatGPT,22,696,718,Conversation log
+40571,ChatGPT,1015,226,1241,Conversation log
+40581,ChatGPT,898,429,1327,Conversation log
+40586,ChatGPT,268,43,311,Conversation log
+40615,ChatGPT,217,210,427,Conversation log
+40634,ChatGPT,1144,309,1453,Conversation log
+40635,ChatGPT,59,32,91,Conversation log
+40636,ChatGPT,739,313,1052,Conversation log
+40643,GPT-4,0,0,0,Conversation log
+40648,ChatGPT,1809,618,2427,Conversation log
+40665,GPT-4,228,180,408,Conversation log
+40674,ChatGPT,49,535,584,Conversation log
+40681,ChatGPT,1559,303,1862,Conversation log
+40701,GPT-4,0,0,0,Conversation log
+40716,ChatGPT,1866,563,2429,Conversation log
+40742,GPT-4,0,0,0,Conversation log
+40765,GPT-4,0,0,0,Conversation log
+40781,GPT-4,0,0,0,Conversation log
+40816,GPT-4,0,0,0,Conversation log
+40817,GPT-4,0,0,0,Conversation log
+40833,GPT-4,151,34,185,Conversation log
+40851,GPT-4,0,0,0,Conversation log
+40856,ChatGPT,272,184,456,Conversation log
+40862,GPT-4,229,40,269,Conversation log
+40870,ChatGPT,1413,312,1725,Conversation log
+40882,ChatGPT,1052,212,1264,Conversation log
+40884,ChatGPT,962,384,1346,Conversation log
+40889,ChatGPT,1335,340,1675,Conversation log
+40904,ChatGPT,19,411,430,Conversation log
+40908,ChatGPT,461,250,711,Conversation log
+40918,ChatGPT,0,0,0,Conversation log
+40926,ChatGPT,138,41,179,Conversation log
+40930,ChatGPT,149,17,166,Conversation log
+40930,ChatGPT,821,245,1066,Conversation log
+40948,ChatGPT,0,0,0,Conversation log
+40953,ChatGPT,1708,695,2403,Conversation log
+40958,ChatGPT,651,235,886,Conversation log
+40966,ChatGPT,16,454,470,Conversation log
+40996,ChatGPT,21,260,281,Conversation log
+41013,GPT-4,0,0,0,Conversation log
+41016,ChatGPT,306,233,539,Conversation log
+41031,GPT-4,0,0,0,Conversation log
+41035,ChatGPT,228,41,269,Conversation log
+41045,ChatGPT,292,355,647,Conversation log
+41046,ChatGPT,16,380,396,Conversation log
+41068,ChatGPT,0,0,0,Conversation log
+41073,ChatGPT,446,570,1016,Conversation log
+41078,ChatGPT,0,0,0,Conversation log
+41083,ChatGPT,1189,373,1562,Conversation log
+41093,GPT-4,0,0,0,Conversation log
+41098,GPT-4,0,0,0,Conversation log
+41099,ChatGPT,1718,549,2267,Conversation log
+41102,ChatGPT,1115,506,1621,Conversation log
+41105,ChatGPT,1549,468,2017,Conversation log
+41114,ChatGPT,932,147,1079,Conversation log
+41119,GPT-4,2554,340,2894,Conversation log
+41138,GPT-4,0,0,0,Conversation log
+41147,ChatGPT,29,509,538,Conversation log
+41158,GPT-4,0,0,0,Conversation log
+41176,ChatGPT,164,111,275,Conversation log
+41189,ChatGPT,44,78,122,Conversation log
+41193,ChatGPT,1102,174,1276,Conversation log
+41194,GPT-4,72,167,239,Conversation log
+41198,ChatGPT,134,326,460,Conversation log
+41208,ChatGPT,16,458,474,Conversation log
+41212,GPT-4,767,279,1046,Conversation log
+41215,GPT-4,0,0,0,Conversation log
+41217,ChatGPT,66,178,244,Conversation log
+41219,ChatGPT,44,15,59,Conversation log
+41231,ChatGPT,262,121,383,Conversation log
+41239,GPT-4,263,57,320,Conversation log
+41243,ChatGPT,474,431,905,Conversation log
+41266,ChatGPT,490,326,816,Conversation log
+41293,GPT-4,0,0,0,Conversation log
+41309,ChatGPT,416,297,713,Conversation log
+41319,ChatGPT,1293,311,1604,Conversation log
+41335,ChatGPT,399,120,519,Conversation log
+41335,ChatGPT,0,0,0,Conversation log
+41345,ChatGPT,1550,334,1884,Conversation log
+41351,ChatGPT,106,23,129,Conversation log
+41357,ChatGPT,568,166,734,Conversation log
+41368,ChatGPT,463,104,567,Conversation log
+41386,ChatGPT,602,63,665,Conversation log
+41397,ChatGPT,1619,273,1892,Conversation log
+41402,ChatGPT,413,99,512,Conversation log
+41406,ChatGPT,61,722,783,Conversation log
+41406,ChatGPT,580,165,745,Conversation log
+41446,ChatGPT,19,339,358,Conversation log
+41464,ChatGPT,163,397,560,Conversation log
+41474,ChatGPT,45,453,498,Conversation log
+41486,ChatGPT,1483,516,1999,Conversation log
+41489,ChatGPT,28,388,416,Conversation log
+41491,ChatGPT,380,389,769,Conversation log
+41549,ChatGPT,1253,497,1750,Conversation log
+41549,ChatGPT,1333,330,1663,Conversation log
+41549,ChatGPT,745,137,882,Conversation log
+41551,GPT-4,0,0,0,Conversation log
+41553,ChatGPT,719,297,1016,Conversation log
+41558,ChatGPT,199,59,258,Conversation log
+41606,GPT-4,26,349,375,Conversation log
+41613,ChatGPT,905,116,1021,Conversation log
+41616,GPT-4,25,4,29,Conversation log
+41617,ChatGPT,1002,204,1206,Conversation log
+41624,ChatGPT,63,548,611,Conversation log
+41626,GPT-4,46,3,49,Conversation log
+41629,ChatGPT,1623,684,2307,Conversation log
+41632,ChatGPT,997,393,1390,Conversation log
+41640,ChatGPT,789,344,1133,Conversation log
+41641,GPT-4,64,4,68,Conversation log
+41641,GPT-4,0,0,0,Conversation log
+41649,GPT-4,607,12,619,Conversation log
+41666,ChatGPT,951,188,1139,Conversation log
+41666,ChatGPT,573,385,958,Conversation log
+41686,ChatGPT,991,43,1034,Conversation log
+41687,ChatGPT,1122,130,1252,Conversation log
+41688,ChatGPT,26,94,120,Conversation log
+41701,GPT-4,84,30,114,Conversation log
+41711,ChatGPT,140,350,490,Conversation log
+41716,ChatGPT,1168,527,1695,Conversation log
+41751,ChatGPT,908,115,1023,Conversation log
+41754,GPT-4,416,353,769,Conversation log
+41775,ChatGPT,28,20,48,Conversation log
+41784,ChatGPT,1700,565,2265,Conversation log
+41792,ChatGPT,984,540,1524,Conversation log
+41797,ChatGPT,767,291,1058,Conversation log
+41799,ChatGPT,66,8,74,Conversation log
+41809,ChatGPT,1165,398,1563,Conversation log
+41814,ChatGPT,103,19,122,Conversation log
+41844,GPT-4,0,0,0,Conversation log
+41849,ChatGPT,1026,447,1473,Conversation log
+41887,GPT-4,166,280,446,Conversation log
+41889,ChatGPT,523,203,726,Conversation log
+41904,GPT-4,805,139,944,Conversation log
+41917,ChatGPT,1454,311,1765,Conversation log
+41924,ChatGPT,1609,524,2133,Conversation log
+41936,ChatGPT,1100,520,1620,Conversation log
+41937,GPT-4,48,336,384,Conversation log
+41941,ChatGPT,46,491,537,Conversation log
+41956,ChatGPT,561,434,995,Conversation log
+41962,GPT-4,0,0,0,Conversation log
+41965,ChatGPT,0,0,0,Conversation log
+41970,ChatGPT,1023,443,1466,Conversation log
+41973,ChatGPT,1083,223,1306,Conversation log
+41975,ChatGPT,1929,377,2306,Conversation log
+42036,ChatGPT,1745,499,2244,Conversation log
+42047,ChatGPT,978,85,1063,Conversation log
+42052,ChatGPT,28,288,316,Conversation log
+42055,ChatGPT,156,24,180,Conversation log
+42070,ChatGPT,334,426,760,Conversation log
+42080,ChatGPT,784,536,1320,Conversation log
+42084,GPT-4,0,0,0,Conversation log
+42086,ChatGPT,841,322,1163,Conversation log
+42094,ChatGPT,1350,629,1979,Conversation log
+42120,ChatGPT,667,536,1203,Conversation log
+42123,ChatGPT,24,6,30,Conversation log
+42124,ChatGPT,202,11,213,Conversation log
+42133,ChatGPT,0,0,0,Conversation log
+42147,ChatGPT,1993,462,2455,Conversation log
+42149,GPT-4,0,0,0,Conversation log
+42162,GPT-4,48,183,231,Conversation log
+42167,ChatGPT,1195,430,1625,Conversation log
+42183,ChatGPT,0,0,0,Conversation log
+42194,ChatGPT,0,0,0,Conversation log
+42199,ChatGPT,0,0,0,Conversation log
+42213,ChatGPT,1777,638,2415,Conversation log
+42214,ChatGPT,892,121,1013,Conversation log
+42225,GPT-4,0,0,0,Conversation log
+42239,ChatGPT,1869,609,2478,Conversation log
+42257,GPT-4,267,167,434,Conversation log
+42270,ChatGPT,1244,854,2098,Conversation log
+42280,ChatGPT,1847,556,2403,Conversation log
+42296,ChatGPT,185,543,728,Conversation log
+42297,ChatGPT,23,404,427,Conversation log
+42300,ChatGPT,38,560,598,Conversation log
+42313,ChatGPT,878,96,974,Conversation log
+42315,GPT-4,0,0,0,Conversation log
+42317,ChatGPT,1945,657,2602,Conversation log
+42317,ChatGPT,39,22,61,Conversation log
+42323,ChatGPT,177,5,182,Conversation log
+42326,ChatGPT,26,696,722,Conversation log
+42328,ChatGPT,1649,341,1990,Conversation log
+42338,ChatGPT,626,433,1059,Conversation log
+42339,ChatGPT,1182,651,1833,Conversation log
+42365,ChatGPT,208,22,230,Conversation log
+42369,ChatGPT,0,0,0,Conversation log
+42371,ChatGPT,294,102,396,Conversation log
+42372,ChatGPT,103,30,133,Conversation log
+42372,ChatGPT,1938,578,2516,Conversation log
+42379,ChatGPT,1531,76,1607,Conversation log
+42389,ChatGPT,1081,490,1571,Conversation log
+42399,ChatGPT,20,106,126,Conversation log
+42404,ChatGPT,1925,580,2505,Conversation log
+42407,ChatGPT,1592,529,2121,Conversation log
+42428,ChatGPT,172,10,182,Conversation log
+42432,ChatGPT,1947,556,2503,Conversation log
+42440,GPT-4,0,0,0,Conversation log
+42442,GPT-4,793,339,1132,Conversation log
+42452,GPT-4,0,0,0,Conversation log
+42457,ChatGPT,614,197,811,Conversation log
+42466,ChatGPT,1855,569,2424,Conversation log
+42492,ChatGPT,1004,7,1011,Conversation log
+42507,ChatGPT,0,0,0,Conversation log
+42509,ChatGPT,1874,547,2421,Conversation log
+42526,ChatGPT,267,58,325,Conversation log
+42531,ChatGPT,1844,592,2436,Conversation log
+42532,ChatGPT,177,11,188,Conversation log
+42550,ChatGPT,191,922,1113,Conversation log
+42550,ChatGPT,1589,396,1985,Conversation log
+42551,ChatGPT,50,4,54,Conversation log
+42557,ChatGPT,1877,674,2551,Conversation log
+42565,GPT-4,0,0,0,Conversation log
+42574,ChatGPT,139,140,279,Conversation log
+42587,GPT-4,744,137,881,Conversation log
+42603,GPT-4,0,0,0,Conversation log
+42620,ChatGPT,720,56,776,Conversation log
+42683,ChatGPT,404,50,454,Conversation log
+42738,ChatGPT,472,59,531,Conversation log
+42842,GPT-4,0,0,0,Conversation log
+42856,ChatGPT,12,26,38,Conversation log
+42859,ChatGPT,193,431,624,Conversation log
+42865,GPT-4,21,274,295,Conversation log
+42877,ChatGPT,1655,402,2057,Conversation log
+42904,ChatGPT,530,36,566,Conversation log
+42913,ChatGPT,25,250,275,Conversation log
+42931,ChatGPT,1252,209,1461,Conversation log
+42961,ChatGPT,320,470,790,Conversation log
+42965,GPT-4,0,0,0,Conversation log
+42973,ChatGPT,73,175,248,Conversation log
+42974,ChatGPT,1941,691,2632,Conversation log
+42989,GPT-4,1011,65,1076,Conversation log
+42992,ChatGPT,277,25,302,Conversation log
+43033,ChatGPT,556,15,571,Conversation log
+43046,ChatGPT,265,299,564,Conversation log
+43047,GPT-4,1092,24,1116,Conversation log
+43063,ChatGPT,1287,354,1641,Conversation log
+43072,GPT-4,0,0,0,Conversation log
+43076,ChatGPT,219,80,299,Conversation log
+43083,GPT-4,0,0,0,Conversation log
+43088,ChatGPT,110,77,187,Conversation log
+43114,ChatGPT,305,96,401,Conversation log
+43115,ChatGPT,589,507,1096,Conversation log
+43118,ChatGPT,842,667,1509,Conversation log
+43143,ChatGPT,1542,249,1791,Conversation log
+43172,ChatGPT,1141,211,1352,Conversation log
+43183,ChatGPT,384,88,472,Conversation log
+43201,ChatGPT,38,52,90,Conversation log
+43244,ChatGPT,525,58,583,Conversation log
+43334,ChatGPT,1523,370,1893,Conversation log
+43377,GPT-4,14,98,112,Conversation log
+43463,ChatGPT,1494,208,1702,Conversation log
+43484,ChatGPT,1645,190,1835,Conversation log
+43502,ChatGPT,1395,157,1552,Conversation log
+43513,GPT-4,0,0,0,Conversation log
+43526,GPT-4,0,0,0,Conversation log
+43543,ChatGPT,1245,386,1631,Conversation log
+43544,ChatGPT,1000,381,1381,Conversation log
+43565,ChatGPT,1497,398,1895,Conversation log
+43579,GPT-4,787,312,1099,Conversation log
+43619,ChatGPT,1804,685,2489,Conversation log
+43659,ChatGPT,209,528,737,Conversation log
+43661,ChatGPT,2026,652,2678,Conversation log
+43683,GPT-4,0,0,0,Conversation log
+43720,ChatGPT,0,0,0,Conversation log
+43734,ChatGPT,0,0,0,Conversation log
+43748,ChatGPT,66,170,236,Conversation log
+43768,ChatGPT,374,266,640,Conversation log
+43783,ChatGPT,654,314,968,Conversation log
+43784,GPT-4,861,128,989,Conversation log
+43851,GPT-4,595,228,823,Conversation log
+43921,GPT-4,0,0,0,Conversation log
+43932,ChatGPT,69,25,94,Conversation log
+43942,GPT-4,0,0,0,Conversation log
+44000,ChatGPT,75,17,92,Conversation log
+44032,ChatGPT,72,13,85,Conversation log
+44042,GPT-4,0,0,0,Conversation log
+44057,ChatGPT,93,20,113,Conversation log
+44081,ChatGPT,83,21,104,Conversation log
+44104,ChatGPT,83,15,98,Conversation log
+44108,ChatGPT,27,504,531,Conversation log
+44132,GPT-4,0,0,0,Conversation log
+44161,ChatGPT,1338,601,1939,Conversation log
+44164,GPT-4,0,0,0,Conversation log
+44168,ChatGPT,8,9,17,Conversation log
+44176,ChatGPT,552,477,1029,Conversation log
+44187,ChatGPT,122,137,259,Conversation log
+44221,ChatGPT,17,441,458,Conversation log
+44227,ChatGPT,17,352,369,Conversation log
+44245,ChatGPT,1480,553,2033,Conversation log
+44302,ChatGPT,1089,318,1407,Conversation log
+44314,ChatGPT,416,405,821,Conversation log
+44316,GPT-4,0,0,0,Conversation log
+44316,GPT-4,0,0,0,Conversation log
+44326,ChatGPT,282,172,454,Conversation log
+44329,GPT-4,35,145,180,Conversation log
+44352,ChatGPT,848,317,1165,Conversation log
+44364,ChatGPT,743,417,1160,Conversation log
+44384,ChatGPT,1047,258,1305,Conversation log
+44389,ChatGPT,1177,475,1652,Conversation log
+44400,ChatGPT,1317,319,1636,Conversation log
+44412,ChatGPT,1221,569,1790,Conversation log
+44449,ChatGPT,18,111,129,Conversation log
+44526,ChatGPT,1663,512,2175,Conversation log
+44562,ChatGPT,17,376,393,Conversation log
+44565,ChatGPT,1750,320,2070,Conversation log
+44592,ChatGPT,1960,461,2421,Conversation log
+44617,GPT-4,216,191,407,Conversation log
+44853,GPT-4,0,0,0,Conversation log
+44865,ChatGPT,0,0,0,Conversation log
+44878,GPT-4,0,0,0,Conversation log
+44879,ChatGPT,101,467,568,Conversation log
+44897,GPT-4,1150,212,1362,Conversation log
+44930,ChatGPT,295,84,379,Conversation log
+44934,ChatGPT,405,321,726,Conversation log
+44943,ChatGPT,392,520,912,Conversation log
+44995,ChatGPT,334,232,566,Conversation log
+45023,ChatGPT,586,523,1109,Conversation log
+45049,GPT-4,0,0,0,Conversation log
+45343,ChatGPT,1430,625,2055,Conversation log
+45390,GPT-4,3112,689,3801,Conversation log
+45531,ChatGPT,49,691,740,Conversation log
+45538,ChatGPT,68,151,219,Conversation log
+45840,ChatGPT,56,326,382,Conversation log
+45912,ChatGPT,108,197,305,Conversation log
+45914,ChatGPT,317,241,558,Conversation log
+45929,ChatGPT,316,239,555,Conversation log
+46074,ChatGPT,229,313,542,Conversation log
+46078,ChatGPT,1752,442,2194,Conversation log
+46133,ChatGPT,1833,362,2195,Conversation log
+46186,ChatGPT,1697,548,2245,Conversation log
+46393,ChatGPT,568,455,1023,Conversation log
+46440,ChatGPT,1115,432,1547,Conversation log
+46753,ChatGPT,20,395,415,Conversation log
+46819,ChatGPT,0,0,0,Conversation log
+46936,ChatGPT,0,0,0,Conversation log
+46972,ChatGPT,0,0,0,Conversation log
+46997,ChatGPT,0,0,0,Conversation log
+47028,GPT-4,0,0,0,Conversation log
+47100,ChatGPT,809,1093,1902,Conversation log
+47136,ChatGPT,21,304,325,Conversation log
+47208,ChatGPT,13,453,466,Conversation log
+47420,GPT-4,0,0,0,Conversation log
+47540,GPT-4,1083,57,1140,Conversation log
+47638,GPT-4,0,0,0,Conversation log
+47672,ChatGPT,68,344,412,Conversation log
+47716,ChatGPT,60,275,335,Conversation log
+47728,GPT-4,68,329,397,Conversation log
+47730,GPT-4,0,0,0,Conversation log
+47735,ChatGPT,20,375,395,Conversation log
+47745,GPT-4,0,0,0,Conversation log
+47819,GPT-4,0,0,0,Conversation log
+47891,GPT-4,75,260,335,Conversation log
+47904,GPT-4,1124,194,1318,Conversation log
+47989,ChatGPT,97,462,559,Conversation log
+48009,GPT-4,0,0,0,Conversation log
+48078,GPT-4,0,0,0,Conversation log
+48126,ChatGPT,104,377,481,Conversation log
+48273,ChatGPT,506,483,989,Conversation log
+48348,ChatGPT,144,238,382,Conversation log
+48354,ChatGPT,285,262,547,Conversation log
+48361,ChatGPT,52,123,175,Conversation log
+48537,ChatGPT,168,352,520,Conversation log
+48591,GPT-4,0,0,0,Conversation log
+48595,ChatGPT,25,261,286,Conversation log
+48598,GPT-4,52,105,157,Conversation log
+48633,GPT-4,0,0,0,Conversation log
+48635,GPT-4,174,96,270,Conversation log
+48644,ChatGPT,338,123,461,Conversation log
+48676,GPT-4,0,0,0,Conversation log
+48681,GPT-4,303,109,412,Conversation log
+48703,GPT-4,188,321,509,Conversation log
+48764,ChatGPT,1482,363,1845,Conversation log
+48791,ChatGPT,509,368,877,Conversation log
+48864,GPT-4,101,371,472,Conversation log
+48887,ChatGPT,15,296,311,Conversation log
+48911,ChatGPT,414,276,690,Conversation log
+48991,ChatGPT,1458,238,1696,Conversation log
+49122,ChatGPT,16,6,22,Conversation log
+49154,GPT-4,549,390,939,Conversation log
+49159,ChatGPT,0,0,0,Conversation log
+49180,ChatGPT,17,179,196,Conversation log
+49184,ChatGPT,914,377,1291,Conversation log
+49187,ChatGPT,39,72,111,Conversation log
+49192,ChatGPT,50,11,61,Conversation log
+49220,GPT-4,23,167,190,Conversation log
+49228,ChatGPT,81,18,99,Conversation log
+49251,ChatGPT,631,276,907,Conversation log
+49264,ChatGPT,115,16,131,Conversation log
+49277,ChatGPT,142,11,153,Conversation log
+49284,ChatGPT,31,701,732,Conversation log
+49289,ChatGPT,1308,281,1589,Conversation log
+49292,ChatGPT,129,12,141,Conversation log
+49314,ChatGPT,585,288,873,Conversation log
+49317,ChatGPT,128,18,146,Conversation log
diff --git a/scripts/datasets/BurstGPT/BurstGPT_1000_gpt4_pos_req1024_resp256.csv b/scripts/datasets/BurstGPT/BurstGPT_1000_gpt4_pos_req1024_resp256.csv
new file mode 100644
index 000000000..2bd9f39a1
--- /dev/null
+++ b/scripts/datasets/BurstGPT/BurstGPT_1000_gpt4_pos_req1024_resp256.csv
@@ -0,0 +1,111 @@
+Timestamp,Model,Request tokens,Response tokens,Total tokens,Log Type
+686,GPT-4,969,206,1175,Conversation log
+2273,GPT-4,92,95,187,Conversation log
+6882,GPT-4,20,86,106,Conversation log
+7442,GPT-4,665,167,832,Conversation log
+8126,GPT-4,18,67,85,Conversation log
+8208,GPT-4,101,148,249,Conversation log
+8258,GPT-4,264,76,340,Conversation log
+28765,GPT-4,930,83,1013,API log
+28829,GPT-4,929,80,1009,API log
+32004,GPT-4,380,210,590,Conversation log
+34601,GPT-4,15,115,130,Conversation log
+34746,GPT-4,189,253,442,Conversation log
+34770,GPT-4,585,26,611,Conversation log
+34802,GPT-4,757,211,968,Conversation log
+34824,GPT-4,203,200,403,Conversation log
+34890,GPT-4,978,197,1175,Conversation log
+34896,GPT-4,188,181,369,Conversation log
+34943,GPT-4,163,19,182,Conversation log
+34995,GPT-4,389,221,610,Conversation log
+35149,GPT-4,33,82,115,Conversation log
+35156,GPT-4,342,231,573,Conversation log
+35161,GPT-4,161,22,183,Conversation log
+35227,GPT-4,210,17,227,Conversation log
+35374,GPT-4,466,60,526,Conversation log
+35576,GPT-4,58,26,84,Conversation log
+35816,GPT-4,45,32,77,Conversation log
+35837,GPT-4,273,25,298,Conversation log
+35894,GPT-4,820,26,846,Conversation log
+35898,GPT-4,788,188,976,Conversation log
+36097,GPT-4,695,16,711,Conversation log
+36121,GPT-4,149,256,405,Conversation log
+36173,GPT-4,985,74,1059,Conversation log
+36607,GPT-4,29,2,31,Conversation log
+36646,GPT-4,391,25,416,Conversation log
+36673,GPT-4,347,147,494,Conversation log
+36868,GPT-4,500,238,738,Conversation log
+36989,GPT-4,145,45,190,Conversation log
+37106,GPT-4,29,213,242,Conversation log
+37117,GPT-4,112,110,222,Conversation log
+37198,GPT-4,28,231,259,Conversation log
+37235,GPT-4,275,225,500,Conversation log
+37272,GPT-4,251,23,274,Conversation log
+37274,GPT-4,18,70,88,Conversation log
+37277,GPT-4,281,239,520,Conversation log
+37408,GPT-4,449,192,641,Conversation log
+37538,GPT-4,690,170,860,Conversation log
+37806,GPT-4,248,53,301,Conversation log
+37808,GPT-4,386,17,403,Conversation log
+37868,GPT-4,27,209,236,Conversation log
+37886,GPT-4,244,65,309,Conversation log
+37914,GPT-4,91,63,154,Conversation log
+37929,GPT-4,170,42,212,Conversation log
+37982,GPT-4,258,171,429,Conversation log
+38019,GPT-4,536,236,772,Conversation log
+38041,GPT-4,359,22,381,Conversation log
+38069,GPT-4,472,213,685,Conversation log
+38070,GPT-4,437,31,468,Conversation log
+38151,GPT-4,338,71,409,Conversation log
+38235,GPT-4,517,27,544,Conversation log
+38354,GPT-4,599,112,711,Conversation log
+38449,GPT-4,13,85,98,Conversation log
+38600,GPT-4,22,242,264,Conversation log
+38617,GPT-4,52,180,232,Conversation log
+38706,GPT-4,491,179,670,Conversation log
+38722,GPT-4,10,85,95,Conversation log
+38769,GPT-4,122,10,132,Conversation log
+38837,GPT-4,99,112,211,Conversation log
+38920,GPT-4,242,94,336,Conversation log
+38928,GPT-4,169,18,187,Conversation log
+38937,GPT-4,764,225,989,Conversation log
+38958,GPT-4,384,231,615,Conversation log
+38977,GPT-4,717,133,850,Conversation log
+39439,GPT-4,328,8,336,Conversation log
+39639,GPT-4,58,5,63,Conversation log
+39650,GPT-4,117,5,122,Conversation log
+39658,GPT-4,288,178,466,Conversation log
+39694,GPT-4,137,162,299,Conversation log
+39725,GPT-4,15,104,119,Conversation log
+39795,GPT-4,10,64,74,Conversation log
+39884,GPT-4,29,174,203,Conversation log
+39903,GPT-4,654,49,703,Conversation log
+39934,GPT-4,954,250,1204,Conversation log
+39939,GPT-4,727,254,981,Conversation log
+40049,GPT-4,301,133,434,Conversation log
+40314,GPT-4,945,79,1024,Conversation log
+40471,GPT-4,216,198,414,Conversation log
+40665,GPT-4,228,180,408,Conversation log
+40833,GPT-4,151,34,185,Conversation log
+40862,GPT-4,229,40,269,Conversation log
+41194,GPT-4,72,167,239,Conversation log
+41239,GPT-4,263,57,320,Conversation log
+41616,GPT-4,25,4,29,Conversation log
+41626,GPT-4,46,3,49,Conversation log
+41641,GPT-4,64,4,68,Conversation log
+41649,GPT-4,607,12,619,Conversation log
+41701,GPT-4,84,30,114,Conversation log
+41904,GPT-4,805,139,944,Conversation log
+42162,GPT-4,48,183,231,Conversation log
+42257,GPT-4,267,167,434,Conversation log
+42587,GPT-4,744,137,881,Conversation log
+42989,GPT-4,1011,65,1076,Conversation log
+43377,GPT-4,14,98,112,Conversation log
+43784,GPT-4,861,128,989,Conversation log
+43851,GPT-4,595,228,823,Conversation log
+44329,GPT-4,35,145,180,Conversation log
+44617,GPT-4,216,191,407,Conversation log
+48598,GPT-4,52,105,157,Conversation log
+48635,GPT-4,174,96,270,Conversation log
+48681,GPT-4,303,109,412,Conversation log
+49220,GPT-4,23,167,190,Conversation log
diff --git a/scripts/test_burstgpt_bench.py b/scripts/test_burstgpt_bench.py
index 2b2f95851..b85ea33c6 100755
--- a/scripts/test_burstgpt_bench.py
+++ b/scripts/test_burstgpt_bench.py
@@ -30,9 +30,9 @@
 PROJECT_ROOT = Path(__file__).resolve().parents[1]
 INFERENCE_SERVER = PROJECT_ROOT / "python" / "infinilm" / "server" / "inference_server.py"
 BENCH_SCRIPT = Path(__file__).resolve().parent / "pure_bench_serve.py"
-DEFAULT_MODEL = "/workspace/models/9g_8b_v2_thinking/9g_8b_thinking"
-DEFAULT_DATASET = "/workspace/datasets/burstgpt/BurstGPT_1.csv"
-DEFAULT_RESULT_DIR = "/workspace/bench_results"
+DEFAULT_MODEL = "/workspace/models/9g_8b_v2_thinking/9g_8b_thinking"    # 改成机器上对应的模型路径
+DEFAULT_DATASET = Path(__file__).resolve().parent / "datasets/BurstGPT/BurstGPT_1000.csv"
+DEFAULT_RESULT_DIR = Path(__file__).resolve().parent / "bench_results"
 USE_COLOR = False
 
 
@@ -632,7 +632,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--served-model-name", default="9g_8b_thinking")
     parser.add_argument("--temperature", type=float, default=None)
     parser.add_argument("--ignore-eos", action="store_true")
-    parser.add_argument("--progress-interval", type=int, default=30, help="压测运行中的中文心跳提示间隔，单位秒；设为 0 可关闭。")
+    parser.add_argument("--progress-interval", type=int, default=10, help="压测运行中的中文心跳提示间隔，单位秒；设为 0 可关闭。")
     parser.add_argument("--show-vllm-output", action="store_true", help="显示 vLLM benchmark 的原始英文输出。")
     parser.add_argument("--show-server-output", action="store_true", help="直接显示 InfiniLM 服务端输出；默认隐藏且不保存日志。")
     parser.add_argument("--color", choices=["auto", "always", "never"], default="auto", help="终端颜色输出。")