diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f4d70f977..054652e57 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11114,6 +11114,86 @@ qwen3.5-fp8-h100-sglang-mtp: search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } +qwen3.5-fp4-gb300-dynamo-sglang: + image: lmsysorg/sglang:nightly-dev-cu13-20260624-b2c8f7a2 + model: nvidia/Qwen3.5-397B-A17B-NVFP4 + model-prefix: qwen3.5 + runner: gb300 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + # 1P1D TP4: 1 prefill worker at TP4 and 1 decode worker at TP4 + # Pure tensor parallel (STP), 8k1k baseline-low-latency sweep. + # Total: 8 GB300 GPUs. + - spec-decoding: "none" + conc-list: [1, 4, 8, 16, 32, 64, 256] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # 5P1D wide-EP: 5 prefill workers @ DEP4 + 1 decode worker @ DEP16. + # NIXL transfer. Total: 36 GB300 GPUs (5*4 + 4*4). + - spec-decoding: "none" + conc-list: [2048] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 6P1D wide-EP: 6 prefill workers @ DEP4 + 1 decode worker @ DEP16. + # Mooncake transfer. Total: 40 GB300 GPUs (6*4 + 4*4). + - spec-decoding: "none" + conc-list: [5120] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 7P1D wide-EP: 7 prefill workers @ DEP4 + 1 decode worker @ DEP16. 
+ # Mooncake transfer. Total: 44 GB300 GPUs (7*4 + 4*4). + - spec-decoding: "none" + conc-list: [5120] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + glm5-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:v0.5.11-cu130 model: nvidia/GLM-5-NVFP4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml new file mode 100644 index 000000000..71a0ffb9f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml @@ -0,0 +1,175 @@ +# Qwen3.5-397B-A17B-NVFP4 Disaggregated 1P1D: TP4 Prefill + TP4 Decode +# Pure tensor parallel, no expert parallel (STP) +# 8k1k sa-bench concurrency sweep on GB300 +# +# Values taken from ni_experiment_config of the +# sa-qwen-3.5-8k1k-fp4-baseline-low-latency study, row +# qwen3.5-1p_tp4x1d_tp4-aligned-ccsweep (CSV pareto export 2026-06-05). 
+ +name: "gb300-fp4-qwen3.5_8k1k_lowlat_0" + +model: + path: "qwen3.5-fp4" + container: "dynamo-sglang" + precision: "fp4" + +dynamo: + version: "1.1.0" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "true" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + decode_environment: + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + MC_FORCE_MNNVL: "1" + MC_TE_METRIC: "true" + SGLANG_ENABLE_JIT_DEEPGEMM: "true" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_HEALTH_CHECK_TIMEOUT: "1800" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + sglang_config: + prefill: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + reasoning-parser: "qwen3" + tool-call-parser: "qwen3_coder" + + quantization: "modelopt_fp4" + fp4-gemm-backend: "flashinfer_cutlass" + kv-cache-dtype: "fp8_e4m3" + + mamba-scheduler-strategy: "no_buffer" + mamba-ssm-dtype: "bfloat16" + mamba-track-interval: 2048 + + attention-backend: "trtllm_mha" + mm-attention-backend: "triton_attn" + moe-runner-backend: "flashinfer_trtllm" + linear-attn-decode-backend: "flashinfer" + + disaggregation-mode: "prefill" + disable-radix-cache: true + + mem-fraction-static: 0.8 + context-length: 9236 + max-total-tokens: 128000 + max-running-requests: 128 + cuda-graph-max-bs: 4 + chunked-prefill-size: 32768 + max-prefill-tokens: 32768 + scheduler-recv-interval: 10 + stream-interval: 30 + load-balance-method: "round_robin" + page-size: 64 + watchdog-timeout: 1000000 + log-level: "info" + + decode: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + trust-remote-code: true + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + reasoning-parser: "qwen3" + tool-call-parser: "qwen3_coder" + + quantization: "modelopt_fp4" + fp4-gemm-backend: "flashinfer_cutlass" + kv-cache-dtype: "fp8_e4m3" + + mamba-scheduler-strategy: "no_buffer" + mamba-ssm-dtype: "bfloat16" + mamba-track-interval: 128 + + attention-backend: "trtllm_mha" + mm-attention-backend: "triton_attn" + moe-runner-backend: "flashinfer_trtllm" + linear-attn-decode-backend: "flashinfer" + + disaggregation-mode: "decode" + disable-radix-cache: true + + mem-fraction-static: 0.8 + context-length: 9236 + max-total-tokens: 1500000 + max-mamba-cache-size: 256 + 
+ max-running-requests: 128 + cuda-graph-max-bs: 256 + chunked-prefill-size: 32768 + max-prefill-tokens: 32768 + scheduler-recv-interval: 10 + stream-interval: 30 + page-size: 64 + watchdog-timeout: 1000000 + decode-log-interval: 50 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16x32x64x256" + req_rate: "inf" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml new file mode 100644 index 000000000..00e576439 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml @@ -0,0 +1,177 @@ +# Qwen3.5-397B-A17B-NVFP4 Disaggregated 5P1D wide-EP +# Prefill: 5 workers @ TP4/DP4/EP4 with DP-attn (per-node, DEP4) +# Decode: 1 worker @ TP16/DP16/EP16 with DP-attn + TBO (DEP16, 4 nodes) +# Total: 36 GB300 GPUs (5*4 + 4*4); 8k1k concurrency 2048 (source pareto row swept 1024/2048/3072). +# +# Values taken from ni_experiment_config of pareto row +# qwen3.5-dep16-fia2a-tbo-cc1024x2048x3072-dynamo-tot-nixl +# (sa-qwen-3.5-8k1k-fp4-baseline-mid-pareto study). 
+ +name: "gb300-fp4-qwen3.5_8k1k_maxtpt_0" + +model: + path: "qwen3.5-fp4" + container: "dynamo-sglang" + precision: "fp4" + +dynamo: + version: "1.1.0" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 5 + prefill_workers: 5 + decode_nodes: 4 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "true" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_ENABLE_NIXL: "1" + + decode_environment: + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + MC_FORCE_MNNVL: "1" + MC_TE_METRIC: "true" + SGLANG_ENABLE_JIT_DEEPGEMM: "true" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" + SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE: "1" + SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: 
"100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_HEALTH_CHECK_TIMEOUT: "1800" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + SGLANG_ENABLE_NIXL: "1" + + sglang_config: + prefill: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + trust-remote-code: true + + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + disaggregation-transfer-backend: "nixl" + + mem-fraction-static: 0.8 + max-total-tokens: 128000 + chunked-prefill-size: 65536 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + log-level: "info" + page-size: 64 + + attention-backend: "trtllm_mha" + moe-runner-backend: "flashinfer_trtllm" + linear-attn-decode-backend: "flashinfer" + + decode: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + trust-remote-code: true + + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + enable-two-batch-overlap: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 128 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + disaggregation-transfer-backend: "nixl" + 
+ chunked-prefill-size: 4096 + max-mamba-cache-size: 4096 + max-total-tokens: 2200000 + max-running-requests: 4096 + mem-fraction-static: 0.8 + watchdog-timeout: 1000000 + page-size: 64 + + attention-backend: "trtllm_mha" + moe-runner-backend: "flashinfer_cutedsl" + moe-a2a-backend: "flashinfer" + disable-shared-experts-fusion: true + linear-attn-decode-backend: "flashinfer" + + decode-log-interval: 50 + stream-interval: 50 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml new file mode 100644 index 000000000..ecab28509 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml @@ -0,0 +1,175 @@ +# Qwen3.5-397B-A17B-NVFP4 Disaggregated 6P1D wide-EP +# Prefill: 6 workers @ TP4/DP4/EP4 with DP-attn (per-node, DEP4) +# Decode: 1 worker @ TP16/DP16/EP16 with DP-attn + TBO (DEP16, 4 nodes) +# Total: 40 GB300 GPUs (6*4 + 4*4); 8k1k concurrency 5120. +# +# Values taken from ni_experiment_config of pareto row +# qwen3.5-6p_dep4x1d_dep16-fia2a-tbo-cc5120-dynamo-tot-mooncake +# (sa-qwen-3.5-8k1k-fp4-baseline-mid-pareto study). 
+ +name: "gb300-fp4-qwen3.5_8k1k_maxtpt_1" + +model: + path: "qwen3.5-fp4" + container: "dynamo-sglang" + precision: "fp4" + +dynamo: + version: "1.1.0" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + prefill_workers: 6 + decode_nodes: 4 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "true" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + decode_environment: + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + MC_FORCE_MNNVL: "1" + MC_TE_METRIC: "true" + SGLANG_ENABLE_JIT_DEEPGEMM: "true" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" + SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE: "1" + SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_HEALTH_CHECK_TIMEOUT: "1800" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + sglang_config: + prefill: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + trust-remote-code: true + + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + disaggregation-transfer-backend: "mooncake" + + mem-fraction-static: 0.8 + max-total-tokens: 128000 + chunked-prefill-size: 65536 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + log-level: "info" + page-size: 64 + + attention-backend: "trtllm_mha" + moe-runner-backend: "flashinfer_trtllm" + linear-attn-decode-backend: "flashinfer" + + decode: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + trust-remote-code: true + + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + enable-two-batch-overlap: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 128 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + disaggregation-transfer-backend: "mooncake" + + chunked-prefill-size: 5120 
+ max-mamba-cache-size: 5120 + max-total-tokens: 3200000 + max-running-requests: 5120 + mem-fraction-static: 0.8 + watchdog-timeout: 1000000 + page-size: 64 + + attention-backend: "trtllm_mha" + moe-runner-backend: "flashinfer_cutedsl" + moe-a2a-backend: "flashinfer" + disable-shared-experts-fusion: true + linear-attn-decode-backend: "flashinfer" + + decode-log-interval: 50 + stream-interval: 50 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5120" + req_rate: "inf" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml new file mode 100644 index 000000000..d35f44469 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml @@ -0,0 +1,175 @@ +# Qwen3.5-397B-A17B-NVFP4 Disaggregated 7P1D wide-EP +# Prefill: 7 workers @ TP4/DP4/EP4 with DP-attn (per-node, DEP4) +# Decode: 1 worker @ TP16/DP16/EP16 with DP-attn + TBO (DEP16, 4 nodes) +# Total: 44 GB300 GPUs (7*4 + 4*4); 8k1k concurrency 5120. +# +# Values taken from ni_experiment_config of pareto row +# qwen3.5-7p_dep4x1d_dep16-fia2a-tbo-cc5120-dynamo-tot-mooncake +# (sa-qwen-3.5-8k1k-fp4-baseline-mid-pareto study). 
+ +name: "gb300-fp4-qwen3.5_8k1k_maxtpt_2" + +model: + path: "qwen3.5-fp4" + container: "dynamo-sglang" + precision: "fp4" + +dynamo: + version: "1.1.0" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 7 + prefill_workers: 7 + decode_nodes: 4 + decode_workers: 1 + +backend: + type: sglang + + prefill_environment: + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + MC_FORCE_MNNVL: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "true" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + decode_environment: + NO_COLOR: "1" + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_ENABLE_SPEC_V2: "1" + PYTHONUNBUFFERED: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + MC_FORCE_MNNVL: "1" + MC_TE_METRIC: "true" + SGLANG_ENABLE_JIT_DEEPGEMM: "true" + SGLANG_ENABLE_FLASHINFER_GEMM: "true" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" + SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE: "1" + SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1" + FLASHINFER_DISABLE_VERSION_CHECK: "1" + SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_HEALTH_CHECK_TIMEOUT: "1800" + SGLANG_HEALTH_STARTING_OK: "1" + SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0" + + sglang_config: + prefill: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + trust-remote-code: true + + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 2048 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "prefill" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + disaggregation-transfer-backend: "mooncake" + + mem-fraction-static: 0.8 + max-total-tokens: 128000 + chunked-prefill-size: 65536 + load-balance-method: "round_robin" + watchdog-timeout: 1000000 + log-level: "info" + page-size: 64 + + attention-backend: "trtllm_mha" + moe-runner-backend: "flashinfer_trtllm" + linear-attn-decode-backend: "flashinfer" + + decode: + served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4" + model-path: "/model/" + trust-remote-code: true + + quantization: "modelopt_fp4" + kv-cache-dtype: "fp8_e4m3" + + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + enable-two-batch-overlap: true + + mamba-scheduler-strategy: "no_buffer" + mamba-track-interval: 128 + mamba-ssm-dtype: "bfloat16" + + disaggregation-mode: "decode" + disable-radix-cache: true + disaggregation-bootstrap-port: 31000 + disaggregation-transfer-backend: "mooncake" + + chunked-prefill-size: 5120 
+ max-mamba-cache-size: 5120 + max-total-tokens: 3200000 + max-running-requests: 5120 + mem-fraction-static: 0.8 + watchdog-timeout: 1000000 + page-size: 64 + + attention-backend: "trtllm_mha" + moe-runner-backend: "flashinfer_cutedsl" + moe-a2a-backend: "flashinfer" + disable-shared-experts-fusion: true + linear-attn-decode-backend: "flashinfer" + + decode-log-interval: 50 + stream-interval: 50 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5120" + req_rate: "inf" + random_range_ratio: 0.8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7c16dc460..51d799478 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4205,3 +4205,11 @@ - "Serve through the text-only language-model path with block size 128, TRITON_ATTN, MiniMax-M3 tool/reasoning parsers, automatic tool choice, and VLLM_USE_BREAKABLE_CUDAGRAPH=0; let vLLM select the MoE backend and retain the default KV-cache dtype." - "Mirror the MiniMax-M3 MXFP8 MI355X TP/EP/DP-attention search space at 1k1k and 8k1k for a direct precision comparison." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1935 + +- config-keys: + - qwen3.5-fp4-gb300-dynamo-sglang + description: + - "Add Qwen3.5-397B-A17B-NVFP4 GB300 disaggregated multinode SGLang benchmarks via Dynamo." + - "Image: lmsysorg/sglang:nightly-dev-cu13-20260624-b2c8f7a2." + - "8k/1k STP recipes: 1P1D TP4 (conc 1-256), 5P1D DEP4+1D DEP16 (conc 2048, NIXL), 6P1D and 7P1D DEP4+1D DEP16 (conc 5120, Mooncake)." 
+ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1921 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 93d9eb252..689451443 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -61,8 +61,13 @@ elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then elif [[ $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH=/scratch/models/Kimi-K2.5-NVFP4 export SRT_SLURM_MODEL_PREFIX="nvidia/Kimi-K2.5-NVFP4" +elif [[ $MODEL_PREFIX == "qwen3.5" && $PRECISION == "fp4" ]]; then + # SRT_SLURM_MODEL_PREFIX must match the model.path alias used in our + # Qwen3.5 sglang recipes (qwen3.5-fp4). + export MODEL_PATH=/scratch/models/Qwen3.5-397B-A17B-NVFP4 + export SRT_SLURM_MODEL_PREFIX="qwen3.5-fp4" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4, glm5-fp4, glm5-fp8, minimaxm2.5-fp4, minimaxm2.5-fp8, kimik2.5-fp4" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4, glm5-fp4, glm5-fp8, minimaxm2.5-fp4, minimaxm2.5-fp8, kimik2.5-fp4, qwen3.5-fp4" exit 1 fi @@ -154,6 +159,14 @@ elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" ]]; then git checkout sa-submission-q2-2026 mkdir -p recipes/sglang/glm5 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5" recipes/sglang/glm5 +elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "qwen3.5" ]]; then + # Same srt-slurm tooling as glm5: NVIDIA/srt-slurm @ sa-submission-q2-2026. + # Overlay our version-controlled Qwen3.5 recipes on top (upstream has none). 
+ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 + mkdir -p recipes/sglang/qwen3.5 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5" recipes/sglang/qwen3.5 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR"