Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11114,6 +11114,86 @@ qwen3.5-fp8-h100-sglang-mtp:
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }

# Benchmark entry: Qwen3.5-397B-A17B NVFP4 on the gb300 runner using the
# dynamo-sglang framework, run multinode with disaggregated prefill/decode
# (multinode: true, disagg: true).
qwen3.5-fp4-gb300-dynamo-sglang:
image: lmsysorg/sglang:nightly-dev-cu13-20260624-b2c8f7a2
model: nvidia/Qwen3.5-397B-A17B-NVFP4
model-prefix: qwen3.5
runner: gb300
precision: fp4
framework: dynamo-sglang
multinode: true
disagg: true
scenarios:
fixed-seq-len:
# Single fixed-length scenario: isl 8192 / osl 1024, i.e. the "8k1k" shape
# named in the recipe paths of the search-space entries.
- isl: 8192
osl: 1024
# Each search-space entry pairs one prefill topology with one decode topology.
search-space:
# 1P1D TP4: 1 prefill worker at TP4 and 1 decode worker at TP4.
# Pure tensor parallel, no expert parallel (ep: 1, dp-attn: false); uses the
# "stp" recipe with spec-decoding set to "none".
# 8k1k baseline-low-latency sweep.
# Total: 8 GB300 GPUs (1*4 + 1*4).
- spec-decoding: "none"
# NOTE(review): keep in sync with benchmark.concurrencies in the referenced
# recipe (the lowlat_0 recipe lists "1x4x8x16x32x64x256").
conc-list: [1, 4, 8, 16, 32, 64, 256]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
# 5P1D wide-EP: 5 prefill workers @ DEP4 + 1 decode worker @ DEP16
# (DEP here = tp == ep with dp-attn: true).
# NIXL transfer. Total: 36 GB300 GPUs (5 workers * 4 + 1 worker * 16).
- spec-decoding: "none"
# Single concurrency point for this max-throughput topology.
conc-list: [2048]
prefill:
num-worker: 5
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# 6P1D wide-EP: 6 prefill workers @ DEP4 + 1 decode worker @ DEP16
# (DEP here = tp == ep with dp-attn: true).
# Mooncake transfer. Total: 40 GB300 GPUs (6 workers * 4 + 1 worker * 16).
- spec-decoding: "none"
# Single concurrency point for this max-throughput topology.
conc-list: [5120]
prefill:
num-worker: 6
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# 7P1D wide-EP: 7 prefill workers @ DEP4 + 1 decode worker @ DEP16
# (DEP here = tp == ep with dp-attn: true).
# Mooncake transfer. Total: 44 GB300 GPUs (7 workers * 4 + 1 worker * 16).
- spec-decoding: "none"
# Same concurrency point as the 6P1D entry; only the prefill worker count
# and the recipe file differ.
conc-list: [5120]
prefill:
num-worker: 7
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true

glm5-fp4-gb300-dynamo-sglang:
image: lmsysorg/sglang:v0.5.11-cu130
model: nvidia/GLM-5-NVFP4
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# Qwen3.5-397B-A17B-NVFP4 Disaggregated 1P1D: one TP4 prefill worker + one TP4 decode worker
# Pure tensor parallel, no expert parallel; STP recipe
# 8k1k (ISL 8192 / OSL 1024) sa-bench concurrency sweep on GB300
#
# Values taken from ni_experiment_config of the
# sa-qwen-3.5-8k1k-fp4-baseline-low-latency study, row
# qwen3.5-1p_tp4x1d_tp4-aligned-ccsweep (CSV pareto export 2026-06-05).

# Recipe identifier.
name: "gb300-fp4-qwen3.5_8k1k_lowlat_0"

# Model, container and weight precision used by this recipe.
# NOTE(review): "qwen3.5-fp4" and "dynamo-sglang" look like aliases resolved
# elsewhere rather than a literal path / image reference — confirm.
model:
path: "qwen3.5-fp4"
container: "dynamo-sglang"
precision: "fp4"

# Dynamo version pin.
dynamo:
version: "1.1.0"

# Frontend: dynamo type with multiple frontends enabled (2 additional ones)
# and an nginx container.
# NOTE(review): presumably nginx fronts the extra frontends — confirm.
frontend:
type: dynamo
enable_multiple_frontends: true
num_additional_frontends: 2
nginx_container: nginx

# Hardware footprint: 1 prefill node + 1 decode node at 4 GPUs per node
# (8 GB300 GPUs total), with one worker per role.
resources:
gpu_type: "gb300"
gpus_per_node: 4
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1

# Backend selection: sglang.
backend:
type: sglang

# Environment variables for the prefill side. Every value is a quoted string.
# Same keys and values as decode_environment, minus the six decode-only
# variables (MC_TE_METRIC, SGLANG_DECODE_BOOTSTRAP_TIMEOUT,
# SGLANG_HACK_SEQ_BOOTSTRAP_ROOM, SGLANG_HEALTH_CHECK_TIMEOUT,
# SGLANG_HEALTH_STARTING_OK, SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION).
# NOTE(review): boolean-like values are spelled three ways here ("1", "true",
# "True"); confirm each consumer accepts the spelling it is given.
prefill_environment:
NO_COLOR: "1"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
SGLANG_ENABLE_SPEC_V2: "1"
PYTHONUNBUFFERED: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
MC_FORCE_MNNVL: "1"
SGLANG_ENABLE_JIT_DEEPGEMM: "true"
SGLANG_ENABLE_FLASHINFER_GEMM: "true"
FLASHINFER_DISABLE_VERSION_CHECK: "1"
# Cache/workspace directories under /configs.
SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
# Disaggregation limits all set to the same large value (100000).
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
# NOTE(review): the "INBALANCE" spelling presumably matches the variable name
# SGLang reads — check upstream before "correcting" it.
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"

# Environment variables for the decode side. Every value is a quoted string.
# Superset of prefill_environment: the shared keys carry identical values, and
# six decode-only variables are added (marked below).
decode_environment:
NO_COLOR: "1"
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
SGLANG_ENABLE_SPEC_V2: "1"
PYTHONUNBUFFERED: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
MC_FORCE_MNNVL: "1"
# Decode-only.
MC_TE_METRIC: "true"
SGLANG_ENABLE_JIT_DEEPGEMM: "true"
SGLANG_ENABLE_FLASHINFER_GEMM: "true"
FLASHINFER_DISABLE_VERSION_CHECK: "1"
# Cache/workspace directories under /configs.
SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
# Disaggregation limits all set to the same large value (100000).
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
# Decode-only.
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
# Decode-only.
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
# NOTE(review): the "INBALANCE" spelling presumably matches the variable name
# SGLang reads — check upstream before "correcting" it.
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
# Decode-only health-check settings.
SGLANG_HEALTH_CHECK_TIMEOUT: "1800"
SGLANG_HEALTH_STARTING_OK: "1"
SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0"

# SGLang arguments, one set per role.
sglang_config:
# Prefill-role arguments. Relative to the decode set, this one differs in
# mamba-track-interval, disaggregation-mode, max-total-tokens and
# cuda-graph-max-bs, and additionally sets load-balance-method and log-level.
prefill:
served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4"
model-path: "/model/"
trust-remote-code: true

# Parallelism: TP4 only (data-parallel and expert-parallel sizes are 1).
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1

reasoning-parser: "qwen3"
tool-call-parser: "qwen3_coder"

# Quantization and KV-cache dtype.
quantization: "modelopt_fp4"
fp4-gemm-backend: "flashinfer_cutlass"
kv-cache-dtype: "fp8_e4m3"

mamba-scheduler-strategy: "no_buffer"
mamba-ssm-dtype: "bfloat16"
mamba-track-interval: 2048

# Kernel backend selection.
attention-backend: "trtllm_mha"
mm-attention-backend: "triton_attn"
moe-runner-backend: "flashinfer_trtllm"
linear-attn-decode-backend: "flashinfer"

disaggregation-mode: "prefill"
disable-radix-cache: true

mem-fraction-static: 0.8
# 9236 = isl 8192 + osl 1024 + 20; the extra 20 is presumably headroom — confirm.
context-length: 9236
max-total-tokens: 128000
max-running-requests: 128
cuda-graph-max-bs: 4
chunked-prefill-size: 32768
max-prefill-tokens: 32768
scheduler-recv-interval: 10
stream-interval: 30
load-balance-method: "round_robin"
page-size: 64
watchdog-timeout: 1000000
log-level: "info"

# Decode-role arguments. Relative to the prefill set, this one differs in
# mamba-track-interval, disaggregation-mode, max-total-tokens and
# cuda-graph-max-bs, additionally sets max-mamba-cache-size and
# decode-log-interval, and omits load-balance-method and log-level.
decode:
served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4"
model-path: "/model/"
trust-remote-code: true

# Parallelism: TP4 only (data-parallel and expert-parallel sizes are 1).
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1

reasoning-parser: "qwen3"
tool-call-parser: "qwen3_coder"

# Quantization and KV-cache dtype.
quantization: "modelopt_fp4"
fp4-gemm-backend: "flashinfer_cutlass"
kv-cache-dtype: "fp8_e4m3"

mamba-scheduler-strategy: "no_buffer"
mamba-ssm-dtype: "bfloat16"
mamba-track-interval: 128

# Kernel backend selection.
attention-backend: "trtllm_mha"
mm-attention-backend: "triton_attn"
moe-runner-backend: "flashinfer_trtllm"
linear-attn-decode-backend: "flashinfer"

disaggregation-mode: "decode"
disable-radix-cache: true

mem-fraction-static: 0.8
# 9236 = isl 8192 + osl 1024 + 20; the extra 20 is presumably headroom — confirm.
context-length: 9236
max-total-tokens: 1500000
max-mamba-cache-size: 256
# NOTE(review): max-running-requests (128) is below both cuda-graph-max-bs
# (256) and the largest swept concurrency (256) — confirm this is intended.
max-running-requests: 128
cuda-graph-max-bs: 256
chunked-prefill-size: 32768
max-prefill-tokens: 32768
scheduler-recv-interval: 10
stream-interval: 30
page-size: 64
watchdog-timeout: 1000000
decode-log-interval: 50

# sa-bench run at isl 8192 / osl 1024.
benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
# Appears to be an "x"-separated list of concurrency levels:
# 1, 4, 8, 16, 32, 64, 256.
concurrencies: "1x4x8x16x32x64x256"
# Quoted so the value stays the string "inf" rather than being parsed as a number.
req_rate: "inf"
random_range_ratio: 0.8
Loading