From 94c6b39c88943741ead96f7946700abb8729a002 Mon Sep 17 00:00:00 2001 From: richardhuo-nv Date: Wed, 24 Jun 2026 12:28:06 -0700 Subject: [PATCH 1/2] NV Refresh Minimax M3 FP8 submission on GB300 --- .github/configs/nvidia-master.yaml | 173 +++++++++--------- ...-dep4-3n.yaml => 1p1d-dep2-dep4-1k1k.yaml} | 33 ++-- ...-tep8-3n.yaml => 1p2d-dep2-tep8-1k1k.yaml} | 29 ++- ...-tep8-5n.yaml => 2p2d-dep2-tep8-1k1k.yaml} | 23 +-- ...arlin-2n.yaml => 2p3d-dep2-dep4-1k1k.yaml} | 42 ++--- .../1k1k/2p4d-dep2-dep4-1k1k.yaml | 100 ++++++++++ ...-dep8-3n.yaml => 4p2d-dep2-dep8-1k1k.yaml} | 37 ++-- .../1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml | 103 ----------- .../1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml | 103 ----------- ...-dep8-5n.yaml => 1p1d-dep2-dep8-8k1k.yaml} | 33 ++-- ...-tep8-6n.yaml => 1p1d-dep2-tep8-8k1k.yaml} | 31 ++-- ...-tep8-7n.yaml => 1p2d-dep2-tep8-8k1k.yaml} | 27 ++- ...-dep8-5n.yaml => 2p1d-dep2-dep8-8k1k.yaml} | 33 ++-- ...-tep8-5n.yaml => 2p2d-dep2-tep8-8k1k.yaml} | 19 +- .../8k1k/2p4d-dep2-tep4-8k1k.yaml | 98 ++++++++++ ...-dep8-6n.yaml => 3p1d-dep2-dep8-8k1k.yaml} | 35 ++-- ...-dep8-6n.yaml => 3p2d-dep2-dep8-8k1k.yaml} | 25 +-- .../8k1k/6p1d-dep2-dep8-8k1k.yaml | 100 ++++++++++ .../disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml | 104 ----------- perf-changelog.yaml | 11 +- runners/launch_gb300-nv.sh | 2 +- 21 files changed, 549 insertions(+), 612 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/{disagg-gb300-1p2d-dep2-dep4-3n.yaml => 1p1d-dep2-dep4-1k1k.yaml} (76%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/{disagg-gb300-1p1d-dep2-tep8-3n.yaml => 1p2d-dep2-tep8-1k1k.yaml} (76%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/{disagg-gb300-2p2d-dep2-tep8-5n.yaml => 2p2d-dep2-tep8-1k1k.yaml} (80%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/{disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml => 2p3d-dep2-dep4-1k1k.yaml} (70%) create 
mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/{disagg-gb300-2p1d-dep2-dep8-3n.yaml => 4p2d-dep2-dep8-1k1k.yaml} (74%) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-1p2d-dep2-dep8-5n.yaml => 1p1d-dep2-dep8-8k1k.yaml} (76%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-3p2d-dep2-tep8-6n.yaml => 1p1d-dep2-tep8-8k1k.yaml} (77%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-5p2d-dep2-tep8-7n.yaml => 1p2d-dep2-tep8-8k1k.yaml} (78%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-2p2d-dep2-dep8-5n.yaml => 2p1d-dep2-dep8-8k1k.yaml} (76%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-2p2d-dep2-tep8-5n.yaml => 2p2d-dep2-tep8-8k1k.yaml} (83%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-4p2d-dep2-dep8-6n.yaml => 3p1d-dep2-dep8-8k1k.yaml} (75%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/{disagg-gb300-3p2d-dep2-dep8-6n.yaml => 3p2d-dep2-dep8-8k1k.yaml} (80%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml diff --git a/.github/configs/nvidia-master.yaml 
b/.github/configs/nvidia-master.yaml index f4d70f977..5d1ef1912 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11955,10 +11955,9 @@ minimaxm3-fp8-b300-dynamo-vllm: ep: 8 dp-attn: false -# MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863. -# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8, -# DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped. -# kv-cache-dtype=fp8 added. srun_options mem=0 required. +# MiniMax-M3 GB300 disagg sweep — refreshed recipe set (no Marlin variants). +# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: DEP4, TEP8, DEP8, TEP4. +# 4 GPU/node (GB300 NVL72). kv-cache-dtype=fp8. srun_options mem=0 required. minimaxm3-fp8-gb300-dynamo-vllm: image: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223 model: MiniMaxAI/MiniMax-M3-MXFP8 @@ -11973,155 +11972,155 @@ minimaxm3-fp8-gb300-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # 1p1d DEP2+TEP8, 3n: conc 4,16,64,128,4096 - - conc-list: [4, 16, 64, 128, 4096] + # 1p1d DEP2+DEP4, 2n: conc 8192 + - conc-list: [8192] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false + tp: 4 + ep: 4 + dp-attn: true - # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16 - - conc-list: [1, 4, 8, 16] + # 1p2d DEP2+TEP8, 5n: conc 4,16,64,128,256 + - conc-list: [4, 16, 64, 128, 256] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml" decode: - num-worker: 1 - tp: 4 - ep: 1 + num-worker: 2 + tp: 8 + ep: 8 dp-attn: false - # 1p2d DEP2+DEP4, 3n: conc 2048 - - conc-list: [2048] + # 2p2d 
DEP2+TEP8, 5n: conc 32 + - conc-list: [32] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml" decode: num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true + tp: 8 + ep: 8 + dp-attn: false - # 2p1d DEP2+DEP8, 3n: conc 512,4096 - - conc-list: [512, 4096] + # 2p3d DEP2+DEP4, 4n: conc 8192 + - conc-list: [8192] prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml" decode: - num-worker: 1 - tp: 8 - ep: 8 + num-worker: 3 + tp: 4 + ep: 4 dp-attn: true - # 2p1d DEP2+TEP8, 3n: conc 32 - - conc-list: [32] + # 2p4d DEP2+DEP4, 5n: conc 8192 + - conc-list: [8192] prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml" decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true - # 2p2d DEP2+TEP8, 5n: conc 16 - - conc-list: [16] + # 4p2d DEP2+DEP8, 6n: conc 1024,4096 + - conc-list: [1024, 4096] prefill: - num-worker: 2 + num-worker: 4 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml" decode: num-worker: 2 tp: 8 ep: 8 - dp-attn: false + dp-attn: true - # 3p2d DEP2+TEP8, 6n: conc 4 - - conc-list: [4] + - isl: 8192 + osl: 1024 + search-space: + # 1p1d DEP2+DEP8, 3n: conc 256 + - conc-list: [256] prefill: - num-worker: 3 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - 
"CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16 - - conc-list: [1, 4, 8, 16] + # 1p1d DEP2+TEP8, 3n: conc 128 + - conc-list: [128] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml" decode: num-worker: 1 - tp: 4 - ep: 1 + tp: 8 + ep: 8 dp-attn: false - # 1p2d DEP2+DEP8, 5n: conc 128 - - conc-list: [128] + # 1p2d DEP2+TEP8, 5n: conc 32,64,128 + - conc-list: [32, 64, 128] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml" decode: num-worker: 2 tp: 8 ep: 8 - dp-attn: true + dp-attn: false - # 2p2d DEP2+DEP8, 5n: conc 256,512 - - conc-list: [256, 512] + # 2p1d DEP2+DEP8, 3n: conc 512 + - conc-list: [512] prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 dp-attn: true @@ -12134,72 +12133,72 @@ minimaxm3-fp8-gb300-dynamo-vllm: ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: false - # 3p2d DEP2+DEP8, 6n: conc 512 - - conc-list: [512] + # 2p4d 
DEP2+TEP4, 5n: conc 4 + - conc-list: [4] prefill: - num-worker: 3 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml" decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false - # 3p2d DEP2+TEP8, 6n: conc 32 - - conc-list: [32] + # 3p1d DEP2+DEP8, 4n: conc 1024 + - conc-list: [1024] prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true - # 4p2d DEP2+DEP8, 6n: conc 4096 - - conc-list: [4096] + # 3p2d DEP2+DEP8, 6n: conc 512 + - conc-list: [512] prefill: - num-worker: 4 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - # 5p2d DEP2+TEP8, 7n: conc 4,64 - - conc-list: [4, 64] + # 6p1d DEP2+DEP8, 5n: conc 2048 + - conc-list: [2048] prefill: - num-worker: 5 + num-worker: 6 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml" decode: - num-worker: 2 + num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true qwen3.5-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc18 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml similarity index 76% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml index af5315c76..4f1e096a7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p2d-dep2-dep4-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml @@ -1,25 +1,18 @@ -name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-dep4-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-dep4-fp8-1k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP4 (TP1 DP4 EP) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP4 decode (TP1 DP4 EP, 4 GPU/worker = 1 node each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 1 decode (+ head/infra). 
model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -29,9 +22,9 @@ resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 2 + decode_nodes: 1 prefill_workers: 1 - decode_workers: 2 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 4 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +69,13 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 2048 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 4 data-parallel-rpc-port: 13345 @@ -92,8 +87,8 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +96,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "2048" + concurrencies: "8192" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml similarity index 76% rename from 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml index 4b00b5660..7cdcfe43d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml @@ -1,25 +1,18 @@ -name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tep8-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-tep8-fp8-1k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -29,9 +22,9 @@ resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 2 + decode_nodes: 4 prefill_workers: 1 - decode_workers: 1 + decode_workers: 2 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +75,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, 
"indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true @@ -93,11 +88,11 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 + max-cudagraph-capture-size: 8192 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x16x64x128x4096" + concurrencies: "4x16x64x128x256" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml similarity index 80% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml index 5babf0835..0fe27f8a2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -1,25 +1,18 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-tep8-fp8-1k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). 
model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +75,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true @@ -93,11 +88,11 @@ backend: stream-interval: 32 max-num-seqs: 4096 max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 + max-cudagraph-capture-size: 8192 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "16" + concurrencies: "32" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml similarity index 70% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml index 26fa89b94..046fe737a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml @@ -1,25 +1,18 @@ -name: 
"minimax-m3-vllm-disagg-gb300-1p1d-dep2-tp4-marlin-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb300-2p3d-dep2-dep4-fp8-1k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin -# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 3D DEP4 decode (TP1 DP4 EP, 4 GPU/worker = 1 node each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 3 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -29,9 +22,9 @@ resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 + decode_nodes: 3 + prefill_workers: 2 + decode_workers: 3 gpus_per_prefill: 2 gpus_per_decode: 4 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,15 +69,17 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 2048 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - enable-expert-parallel: false - moe-backend: marlin + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true trust-remote-code: true no-enable-prefix-caching: true block-size: 
128 @@ -91,8 +87,8 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 4096 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -100,5 +96,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "1x4x8x16" + concurrencies: "8192" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml new file mode 100644 index 000000000..5251b01da --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml @@ -0,0 +1,100 @@ +name: "minimax-m3-vllm-disagg-gb300-2p4d-dep2-dep4-fp8-1k1k" + +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 4D DEP4 decode (TP1 DP4 EP, 4 GPU/worker = 1 node each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + + +dynamo: + install: true + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 100 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + 
block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 100 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml similarity index 74% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml index 7cc5f50c4..3e35cda66 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-dep8-3n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml @@ -1,25 +1,18 @@ -name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-dep8-fp8-1k1k" +name: "minimax-m3-vllm-disagg-gb300-4p2d-dep2-dep8-fp8-1k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 (TP1 DP8 EP) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. +# 4P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 2 prefill + 4 decode (+ head/infra). 
model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -28,10 +21,10 @@ health_check: resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 4 + decode_workers: 2 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +69,13 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 2048 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 @@ -92,8 +87,8 @@ backend: max-model-len: 2304 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 4096 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +96,5 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "512x4096" + concurrencies: "1024x4096" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml deleted file mode 100644 index 0c4f3498c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-2p1d-dep2-tep8-3n.yaml +++ /dev/null @@ -1,103 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-tep8-fp8-1k1k" - -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8) -# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863. - -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 2304 - 
language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 8 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 2304 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 4096 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "32" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml deleted file mode 100644 index d4176055a..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml +++ /dev/null @@ -1,103 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-tep8-fp8-1k1k" - -# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. 
- -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 2 - gpus_per_prefill: 2 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 2304 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 2048 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 8 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 2304 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - 
max-num-seqs: 4096 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 8192 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4" - req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml similarity index 76% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml index b56b65b26..e47c42b05 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p2d-dep2-dep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml @@ -1,25 +1,18 @@ -name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-dep8-fp8-8k1k" -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 2 decode (+ head/infra). 
model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -29,9 +22,9 @@ resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 4 + decode_nodes: 2 prefill_workers: 1 - decode_workers: 2 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +69,13 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 @@ -92,8 +87,8 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +96,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "128" + concurrencies: "256" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml similarity index 77% rename from 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml index 35950dc32..293249f0a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-tep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml @@ -1,25 +1,18 @@ -name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-tep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tep8-fp8-8k1k" -# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 2 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -28,10 +21,10 @@ health_check: resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 2 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +75,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: 
'{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true @@ -99,5 +94,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "32" + concurrencies: "128" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml similarity index 78% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml index dbc9c5c9a..043c24d2d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-5p2d-dep2-tep8-7n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml @@ -1,25 +1,18 @@ -name: "minimax-m3-vllm-disagg-gb300-5p2d-dep2-tep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-1p2d-dep2-tep8-fp8-8k1k" -# 5P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 7 nodes (3P + 4D). Adapted from NV B300 PR #1863. +# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). 
model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -28,9 +21,9 @@ health_check: resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 3 + prefill_nodes: 1 decode_nodes: 4 - prefill_workers: 5 + prefill_workers: 1 decode_workers: 2 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +75,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true @@ -99,5 +94,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4x64" + concurrencies: "32x64x128" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml similarity index 76% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml index 7beba3420..abdd35143 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-dep8-5n.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml @@ -1,25 +1,18 @@ -name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-2p1d-dep2-dep8-fp8-8k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 2 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -29,9 +22,9 @@ resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 1 - decode_nodes: 4 + decode_nodes: 2 prefill_workers: 2 - decode_workers: 2 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +69,13 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 @@ -92,8 +87,8 @@ backend: max-model-len: 9472 language-model-only: true 
kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +96,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "256x512" + concurrencies: "512" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml similarity index 83% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml index 1ea678ace..c3fe1887b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-2p2d-dep2-tep8-5n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -1,25 +1,18 @@ name: "minimax-m3-vllm-disagg-gb300-2p2d-dep2-tep8-fp8-8k1k" -# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8) -# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863. +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 decode (TP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). 
model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -81,6 +75,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 8 enable-expert-parallel: true trust-remote-code: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..4f6399171 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml @@ -0,0 +1,98 @@ +name: "minimax-m3-vllm-disagg-gb300-2p4d-dep2-tep4-fp8-8k1k" + +# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 4D TEP4 decode (TP4 EP, 4 GPU/worker = 1 node each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 1 prefill + 4 decode (+ head/infra). 
+ +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + + +dynamo: + install: true + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + 
max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml similarity index 75% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml index 1526cd7ad..cbd5cbf60 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-4p2d-dep2-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml @@ -1,25 +1,18 @@ -name: "minimax-m3-vllm-disagg-gb300-4p2d-dep2-dep8-fp8-8k1k" +name: "minimax-m3-vllm-disagg-gb300-3p1d-dep2-dep8-fp8-8k1k" -# 4P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 2 prefill + 2 decode (+ head/infra). 
model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -29,9 +22,9 @@ resources: gpu_type: "gb300" gpus_per_node: 4 prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 4 - decode_workers: 2 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 gpus_per_prefill: 2 gpus_per_decode: 8 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +69,13 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 13345 @@ -92,8 +87,8 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 @@ -101,5 +96,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4096" + concurrencies: "1024" req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml similarity index 80% rename from 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml index f4e000a5f..2eda5502d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-3p2d-dep2-dep8-6n.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -1,25 +1,18 @@ name: "minimax-m3-vllm-disagg-gb300-3p2d-dep2-dep8-fp8-8k1k" -# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP) -# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863. +# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 2 prefill + 4 decode (+ head/infra). model: path: "minimax-m3-mxfp8" container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" precision: "fp8" + dynamo: install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" + version: 1.3.0.dev20260614 health_check: max_attempts: 720 @@ -64,6 +57,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 2 data-parallel-rpc-port: 13345 @@ -75,12 +69,13 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 + stream-interval: 100 max-cudagraph-capture-size: 2048 max-num-batched-tokens: 16384 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' tensor-parallel-size: 1 data-parallel-size: 8 data-parallel-rpc-port: 
13345 @@ -92,8 +87,8 @@ backend: max-model-len: 9472 language-model-only: true kv-cache-dtype: fp8 - stream-interval: 32 - max-num-seqs: 512 + stream-interval: 100 + max-num-seqs: 1024 max-num-batched-tokens: 16384 max-cudagraph-capture-size: 2048 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..a8b1eb501 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml @@ -0,0 +1,100 @@ +name: "minimax-m3-vllm-disagg-gb300-6p1d-dep2-dep8-fp8-8k1k" + +# 6P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 decode (TP1 DP8 EP, 8 GPU/worker = 2 nodes each). +# GB300 has 4 GPUs/node. Adapted from NV B300 PR #1863. +# Nodes: 3 prefill + 2 decode (+ head/infra). + +model: + path: "minimax-m3-mxfp8" + container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" + precision: "fp8" + + +dynamo: + install: true + version: 1.3.0.dev20260614 + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 2 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: 
'{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 100 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + kv-cache-dtype: fp8 + stream-interval: 100 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml deleted file mode 100644 index 4ee41241e..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/disagg-gb300-1p1d-dep2-tp4-marlin-2n.yaml +++ /dev/null @@ -1,104 +0,0 @@ -name: "minimax-m3-vllm-disagg-gb300-1p1d-dep2-tp4-marlin-fp8-8k1k" - -# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin -# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863. 
- -model: - path: "minimax-m3-mxfp8" - container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223" - precision: "fp8" - -dynamo: - install: true - wheel: "1.2.0.dev20260526" - -sbatch_directives: - mem: "0" - cpus-per-task: "72" -srun_options: - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 720 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 2 - gpus_per_decode: 4 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - VLLM_FLOAT32_MATMUL_PRECISION: "high" - VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 1 - data-parallel-size: 2 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - kv-cache-dtype: fp8 - stream-interval: 32 - max-cudagraph-capture-size: 2048 - max-num-batched-tokens: 16384 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - tensor-parallel-size: 4 - enable-expert-parallel: false - moe-backend: marlin - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 128 - gpu-memory-utilization: 0.90 - max-model-len: 9472 - language-model-only: true - kv-cache-dtype: fp8 - 
stream-interval: 32 - max-num-seqs: 1024 - max-num-batched-tokens: 16384 - max-cudagraph-capture-size: 2048 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x4x8x16" - req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9c45a352a..30f38f3dc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4183,10 +4183,19 @@ - "server_atom.sh: fix _MAX_CONC assignment before cudagraph size check; gate ATOM_MOE_GU_ITLV/AITER_BF16_FP8_MOE_BOUND on DeepSeek-V4-Pro only" - "Search space: ISL=8192 and ISL=1024, 1P1D TP4, conc 1-512" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1927 - + - config-keys: - dsv4-fp4-b200-dynamo-vllm description: - "Update the DeepSeek-V4-Pro B200 disaggregated Dynamo-vLLM benchmark to the vllm/vllm-openai:v0.23.0 image" - "Lower max-num-batched-tokens to 16384 and gpu-memory-utilization to 0.9 on the high-throughput and max-throughput recipes to avoid OOM" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1899 + +- config-keys: + - minimaxm3-fp8-gb300-dynamo-vllm + description: + - "Refresh GB300 MiniMax-M3 disagg recipe set: replace disagg-gb300-* files with new naming convention; drop TP4+Marlin variants." + - "1k/1k topologies (6 shapes): 1p1d-dep2-dep4 (conc 8192), 1p2d-dep2-tep8 (conc 4,16,64,128,256), 2p2d-dep2-tep8 (conc 32), 2p3d-dep2-dep4 (conc 8192), 2p4d-dep2-dep4 (conc 8192), 4p2d-dep2-dep8 (conc 1024,4096)." + - "8k/1k topologies (9 shapes): 1p1d-dep2-dep8 (conc 256), 1p1d-dep2-tep8 (conc 128), 1p2d-dep2-tep8 (conc 32,64,128), 2p1d-dep2-dep8 (conc 512), 2p2d-dep2-tep8 (conc 16), 2p4d-dep2-tep4 (conc 4), 3p1d-dep2-dep8 (conc 1024), 3p2d-dep2-dep8 (conc 512), 6p1d-dep2-dep8 (conc 2048)." + - "Image unchanged: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223." 
+ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1925 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 93d9eb252..7df971138 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -169,7 +169,7 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout main + git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/minimax-m3-gb300-fp8 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8" recipes/vllm/minimax-m3-gb300-fp8 SRTCTL_SETUP_SCRIPT="minimax-m3-gb300-vllm-fixes.sh" From 03567efd71d838143b3becf979afad17f875f488 Mon Sep 17 00:00:00 2001 From: richardhuo-nv Date: Wed, 24 Jun 2026 15:47:28 -0700 Subject: [PATCH 2/2] add srun and sbatch configs --- .../minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml | 7 +++++++ .../minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml | 7 +++++++ 15 files changed, 105 
insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml index 4f1e096a7..f57d7af09 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p1d-dep2-dep4-1k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml index 7cdcfe43d..b4f457654 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/1p2d-dep2-tep8-1k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml index 0fe27f8a2..6bba9ea86 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml index 046fe737a..de852e427 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p3d-dep2-dep4-1k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml index 5251b01da..8f7b7b140 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/2p4d-dep2-dep4-1k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml index 3e35cda66..f6cf6a59f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/1k1k/4p2d-dep2-dep8-1k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml index e47c42b05..d990d661b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-dep8-8k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml index 293249f0a..d46133924 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p1d-dep2-tep8-8k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml index 043c24d2d..e8c606e27 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/1p2d-dep2-tep8-8k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml index abdd35143..02c3be14a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p1d-dep2-dep8-8k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml index c3fe1887b..304650d6c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml index 4f6399171..efea8bfac 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/2p4d-dep2-tep4-8k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml index cbd5cbf60..97e1ec88c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p1d-dep2-dep8-8k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml index 2eda5502d..745b2fad4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml index a8b1eb501..9be5cc177 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb300-fp8/8k1k/6p1d-dep2-dep8-8k1k.yaml @@ -18,6 +18,13 @@ health_check: max_attempts: 720 interval_seconds: 10 +sbatch_directives: + mem: "0" + cpus-per-task: "72" + +srun_options: + mem: "0" + resources: gpu_type: "gb300" gpus_per_node: 4