diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fe272b18e..77833a1b3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11850,19 +11850,6 @@ minimaxm3-fp8-b300-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [128] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - conc-list: [256, 512] prefill: num-worker: 2 @@ -11889,31 +11876,31 @@ minimaxm3-fp8-b300-dynamo-vllm: tp: 8 ep: 8 dp-attn: false - - conc-list: [512] + - conc-list: [4096] prefill: - num-worker: 3 + num-worker: 4 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - - conc-list: [32] + - conc-list: [1, 4, 8, 16] prefill: - num-worker: 3 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml" decode: - num-worker: 2 - tp: 8 - ep: 8 + num-worker: 1 + tp: 4 + ep: 1 dp-attn: false - conc-list: [4096] prefill: @@ -11922,50 +11909,50 @@ minimaxm3-fp8-b300-dynamo-vllm: ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml" decode: num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [4096] + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [16, 32, 64, 128] prefill: - num-worker: 4 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml" decode: - num-worker: 3 + num-worker: 4 tp: 4 ep: 4 - dp-attn: true - - conc-list: [4, 64] + dp-attn: false + - conc-list: [16] prefill: - num-worker: 5 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml" decode: num-worker: 2 - tp: 8 - ep: 8 + tp: 4 + ep: 4 dp-attn: false - - conc-list: [1, 4, 8, 16] + - conc-list: [4] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml" + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml" decode: - num-worker: 1 - tp: 4 - ep: 1 + num-worker: 4 + tp: 8 + ep: 8 dp-attn: false # MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863. diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml index c98ad0b44..04aca6586 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml @@ -45,6 +45,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -60,6 +62,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..e48310898 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-tep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..30ac635a9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p4d-fp8-dep2-tep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml new file mode 100644 index 000000000..46af72e46 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p4d-fp8-dep2-tep8-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml index 7a8ddd1a1..b1558ae34 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -60,6 +62,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml index d00fb046e..46aaa045d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -58,6 +60,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml index a0e050f1d..3756103ee 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -44,6 +44,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 @@ -60,6 +62,8 @@ backend: trust-remote-code: true no-enable-prefix-caching: true kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 block-size: 128 gpu-memory-utilization: 0.90 max-model-len: 9472 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml new file mode 100644 index 000000000..c9f29f785 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-4p2d-fp8-dep2-tep4-8k1k" + +model: + path: "MiniMaxAI/MiniMax-M3-MXFP8" + container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130" + precision: "fp8" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b9fe3eb50..d6a5f35e4 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4146,3 +4146,10 @@ - "Image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85" - "8 topologies: low-latency 1p1d-tp8-tp8 + 1p6d-dep8-tp8; mid-curve 1p1d through 6p1d-dep8-dep16." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1676 + +- config-keys: + - minimaxm3-fp8-b300-dynamo-vllm + description: + - "Run the PR #1891 MiniMax-M3 MXFP8 B300 Dynamo-vLLM recipe set on top of current main." + - "Uses the vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 image and the TEP4/TEP8 8k1k topologies not covered by PR #1890." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1891