Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b506cd4
[NV] Add MiniMax M3 B300 Dynamo vLLM recipes
Oseltamivir Jun 19, 2026
84a023a
chore: update MiniMax M3 B300 container
Oseltamivir Jun 19, 2026
b09bc78
chore: update changelog PR link
Oseltamivir Jun 19, 2026
86da150
Update perf-changelog.yaml
Oseltamivir Jun 19, 2026
f5727c2
Update perf-changelog.yaml
Oseltamivir Jun 19, 2026
3b6dad4
fix(vllm): patch MiniMax M3 MSA contiguity
Oseltamivir Jun 19, 2026
71ba2ea
fix(recipes): align MiniMax M3 parallel settings
Oseltamivir Jun 19, 2026
b859a0b
fix(vllm): backport MiniMax M3 eval fixes
Oseltamivir Jun 19, 2026
2d408e4
ci(sweep): enable full MiniMax M3 validation
Oseltamivir Jun 19, 2026
3956aee
perf(vllm): right-size MiniMax M3 low concurrency
Oseltamivir Jun 20, 2026
33fe6a9
Merge remote-tracking branch 'origin/main' into pr-1787-latest
Oseltamivir Jun 20, 2026
77c6391
Merge branch 'main' into pr-1787-latest
Oseltamivir Jun 20, 2026
b99d3c9
perf(vllm): colocate MiniMax M3 TP4 workers
Oseltamivir Jun 20, 2026
d2347aa
fix(runner): exclude faulty B300 RDMA node
Oseltamivir Jun 20, 2026
8ace2e9
fix(runner): verify B300 node exclusion
Oseltamivir Jun 20, 2026
884ff12
fix(runner): check generated B300 sbatch script
Oseltamivir Jun 20, 2026
3ae240b
ci(sweep): validate B300 node exclusion
Oseltamivir Jun 20, 2026
9751d93
Merge remote-tracking branch 'origin/main' into pr-1787-latest
Oseltamivir Jun 20, 2026
03d27e7
refactor(vllm): trim MiniMax M3 runtime patches
Oseltamivir Jun 21, 2026
826a64e
Merge branch 'main' into pr-1787-latest
Oseltamivir Jun 22, 2026
aec850f
Merge branch 'main' into pr-1787-latest
Oseltamivir Jun 22, 2026
37d5e2c
Update MiniMax M3 B300 Dynamo vLLM recipes
biswapanda Jun 22, 2026
adbe614
fix
biswapanda Jun 22, 2026
fe0eda5
update to flashinfer
biswapanda Jun 23, 2026
0a751a7
prune non-pareto
biswapanda Jun 23, 2026
d08cc43
Merge branch 'main' into pr-1787-latest--update
Ankur-singh Jun 23, 2026
f100024
fix(vllm): remove pruned MiniMax M3 B300 recipes
jasonlizhengjian Jun 23, 2026
376f261
Merge remote-tracking branch 'refs/remotes/inferencex/main' into code…
jasonlizhengjian Jun 23, 2026
6b9fe1e
Merge remote-tracking branch 'refs/remotes/github/main' into codex/pr…
jasonlizhengjian Jun 23, 2026
de40cd7
Merge branch 'main' into codex/pr-1887-same-repo-sweep
RohitNagraj Jun 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 28 additions & 41 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11850,19 +11850,6 @@ minimaxm3-fp8-b300-dynamo-vllm:
- isl: 8192
osl: 1024
search-space:
- conc-list: [128]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [256, 512]
prefill:
num-worker: 2
Expand All @@ -11889,31 +11876,31 @@ minimaxm3-fp8-b300-dynamo-vllm:
tp: 8
ep: 8
dp-attn: false
- conc-list: [512]
- conc-list: [4096]
prefill:
num-worker: 3
num-worker: 4
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml"
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [32]
- conc-list: [1, 4, 8, 16]
prefill:
num-worker: 3
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml"
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
num-worker: 1
tp: 4
ep: 1
dp-attn: false
- conc-list: [4096]
prefill:
Expand All @@ -11922,50 +11909,50 @@ minimaxm3-fp8-b300-dynamo-vllm:
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml"
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-tep4-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: true
- conc-list: [4096]
tp: 4
ep: 4
dp-attn: false
- conc-list: [16, 32, 64, 128]
prefill:
num-worker: 4
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml"
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep4-8k1k.yaml"
decode:
num-worker: 3
num-worker: 4
tp: 4
ep: 4
dp-attn: true
- conc-list: [4, 64]
dp-attn: false
- conc-list: [16]
prefill:
num-worker: 5
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml"
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-tep4-8k1k.yaml"
decode:
num-worker: 2
tp: 8
ep: 8
tp: 4
ep: 4
dp-attn: false
- conc-list: [1, 4, 8, 16]
- conc-list: [4]
prefill:
num-worker: 1
tp: 2
ep: 2
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml"
- "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p4d-dep2-tep8-8k1k.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
num-worker: 4
tp: 8
ep: 8
dp-attn: false

# MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ backend:
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}'
kv-cache-dtype: fp8
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 9472
Expand All @@ -60,6 +62,8 @@ backend:
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}'
kv-cache-dtype: fp8
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 9472
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-tep4-8k1k"

model:
path: "MiniMaxAI/MiniMax-M3-MXFP8"
container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
precision: "fp8"

resources:
gpu_type: "b300"
gpus_per_node: 8
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 2
gpus_per_prefill: 2
gpus_per_decode: 4

dynamo:
install: true
version: 1.3.0.dev20260614

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null

prefill_environment:
VLLM_FLOAT32_MATMUL_PRECISION: high
UCX_TLS: "cuda_copy,rc"

decode_environment:
VLLM_FLOAT32_MATMUL_PRECISION: high
UCX_TLS: "cuda_copy,rc"

vllm_config:
prefill:
tensor-parallel-size: 1
data-parallel-size: 2
data-parallel-rpc-port: 13345
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}'
kv-cache-dtype: fp8
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 9472
language-model-only: true
stream-interval: 32
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 16384

decode:
tensor-parallel-size: 4
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}'
kv-cache-dtype: fp8
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 9472
language-model-only: true
stream-interval: 32
max-num-seqs: 512
max-num-batched-tokens: 16384
max-cudagraph-capture-size: 4096

health_check:
max_attempts: 360
interval_seconds: 10

benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "16"
req_rate: "inf"
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
name: "minimax-m3-vllm-disagg-b300-1p4d-fp8-dep2-tep4-8k1k"

model:
path: "MiniMaxAI/MiniMax-M3-MXFP8"
container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
precision: "fp8"

resources:
gpu_type: "b300"
gpus_per_node: 8
prefill_nodes: 1
decode_nodes: 2
prefill_workers: 1
decode_workers: 4
gpus_per_prefill: 2
gpus_per_decode: 4

dynamo:
install: true
version: 1.3.0.dev20260614

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null

prefill_environment:
VLLM_FLOAT32_MATMUL_PRECISION: high
UCX_TLS: "cuda_copy,rc"

decode_environment:
VLLM_FLOAT32_MATMUL_PRECISION: high
UCX_TLS: "cuda_copy,rc"

vllm_config:
prefill:
tensor-parallel-size: 1
data-parallel-size: 2
data-parallel-rpc-port: 13345
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}'
kv-cache-dtype: fp8
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 9472
language-model-only: true
stream-interval: 32
max-cudagraph-capture-size: 2048
max-num-batched-tokens: 16384

decode:
tensor-parallel-size: 4
enable-expert-parallel: true
trust-remote-code: true
no-enable-prefix-caching: true
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}'
kv-cache-dtype: fp8
block-size: 128
gpu-memory-utilization: 0.90
max-model-len: 9472
language-model-only: true
stream-interval: 32
max-num-seqs: 512
max-num-batched-tokens: 16384
max-cudagraph-capture-size: 4096

health_check:
max_attempts: 360
interval_seconds: 10

benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "16x32x64x128"
req_rate: "inf"
Loading