SemiAnalysisAI · adibarra · Jun 25, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
@@ -11114,6 +11114,86 @@ qwen3.5-fp8-h100-sglang-mtp:
       search-space:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
 
+qwen3.5-fp4-gb300-dynamo-sglang:  # Qwen3.5 NVFP4 on GB300: multinode, disaggregated dynamo-sglang, 8k/1k
+  image: lmsysorg/sglang:nightly-dev-cu13-20260624-b2c8f7a2
+  model: nvidia/Qwen3.5-397B-A17B-NVFP4
+  model-prefix: qwen3.5
+  runner: gb300
+  precision: fp4
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true  # each search-space entry below sizes prefill and decode workers separately
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P1D TP4: 1 prefill worker at TP4 and 1 decode worker at TP4 (pure tensor parallel, ep: 1).
+      # "STP" presumably = no MTP/speculative decoding (the wide-EP entries below also use stp/ recipes).
+      # 8k1k baseline-low-latency sweep. Total: 8 GB300 GPUs (1*4 prefill + 1*4 decode).
+      - spec-decoding: "none"
+        conc-list: [1, 4, 8, 16, 32, 64, 256]  # same values as benchmark.concurrencies in 8k1k_stp_lowlat_0.yaml
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:  # NOTE(review): path starts at recipes/, the recipe file lives under srt-slurm-recipes/ — confirm mapping
+          - "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # 5P1D wide-EP: 5 prefill workers @ DEP4 + 1 decode worker @ DEP16.
+      # NIXL transfer. Total: 36 GB300 GPUs (5 workers * TP4 prefill + 1 worker * TP16 decode).
+      - spec-decoding: "none"
+        conc-list: [2048]
+        prefill:
+          num-worker: 5
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+      # 6P1D wide-EP: 6 prefill workers @ DEP4 + 1 decode worker @ DEP16.
+      # Mooncake transfer. Total: 40 GB300 GPUs (6 workers * TP4 prefill + 1 worker * TP16 decode).
+      - spec-decoding: "none"
+        conc-list: [5120]
+        prefill:
+          num-worker: 6
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+      # 7P1D wide-EP: 7 prefill workers @ DEP4 + 1 decode worker @ DEP16.
+      # Mooncake transfer. Total: 44 GB300 GPUs (7 workers * TP4 prefill + 1 worker * TP16 decode).
+      - spec-decoding: "none"
+        conc-list: [5120]  # same concurrency as the 6P1D entry above; only the prefill worker count differs
+        prefill:
+          num-worker: 7
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
 glm5-fp4-gb300-dynamo-sglang:
   image: lmsysorg/sglang:v0.5.11-cu130
   model: nvidia/GLM-5-NVFP4

diff --git a/...ti_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/...ti_node/srt-slurm-recipes/sglang/qwen3.5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml
@@ -0,0 +1,175 @@
+# Qwen3.5-397B-A17B-NVFP4 Disaggregated 1P1D: TP4 Prefill + TP4 Decode
+# Pure tensor parallel, no expert parallel; STP recipe (no speculative-decoding options are set)
+# 8k1k sa-bench concurrency sweep on GB300
+#
+# Values taken from ni_experiment_config of the
+# sa-qwen-3.5-8k1k-fp4-baseline-low-latency study, row
+# qwen3.5-1p_tp4x1d_tp4-aligned-ccsweep (CSV pareto export 2026-06-05).
+
+name: "gb300-fp4-qwen3.5_8k1k_lowlat_0"  # recipe identifier for this 1P1D TP4 low-latency run
+
+model:
+  path: "qwen3.5-fp4"  # looks like an alias rather than a directory — presumably mapped to "/model/" by the launcher; confirm
+  container: "dynamo-sglang"  # NOTE(review): a container alias, not an image reference — confirm how it is resolved
+  precision: "fp4"
+
+dynamo:
+  version: "1.1.0"  # presumably the Dynamo release to use for this run — TODO confirm
+
+frontend:  # request entry point for the benchmark traffic
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 2  # frontends beyond the first — presumably 3 in total; confirm
+  nginx_container: nginx  # presumably nginx sits in front of the multiple frontends — TODO confirm
+
+resources:  # layout: 1 prefill worker + 1 decode worker, one node each (2 nodes, 8 GPUs)
+  gpu_type: "gb300"
+  gpus_per_node: 4  # equals tensor-parallel-size (4) in sglang_config, so a worker spans one node
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+
+backend:
+  type: sglang
+
+  prefill_environment:  # environment variables for the prefill side; every value is quoted so it stays a string
+    NO_COLOR: "1"
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_ENABLE_SPEC_V2: "1"  # NOTE(review): set although sglang_config has no speculative-* options — confirm it is needed
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"  # NCCL_* toggles (MNNVL, cuMem, NVLS) — see the NCCL environment-variable docs
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "true"
+    SGLANG_ENABLE_FLASHINFER_GEMM: "true"
+    FLASHINFER_DISABLE_VERSION_CHECK: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"  # same cache directories as decode_environment
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"  # three identical, very large disaggregation limits
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"  # units are not shown here — TODO confirm
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"  # capitalised "True", unlike the "true"/"1" values above; same in decode
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"  # NOTE(review): "INBALANCE" spelling also used in decode — verify before "fixing"
+
+  decode_environment:  # environment variables for the decode side; prefill set plus the decode-only entries marked below
+    NO_COLOR: "1"
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_ENABLE_SPEC_V2: "1"  # NOTE(review): set although sglang_config has no speculative-* options — confirm it is needed
+    PYTHONUNBUFFERED: "1"
+    NCCL_MNNVL_ENABLE: "1"  # NCCL_* toggles (MNNVL, cuMem, NVLS) — see the NCCL environment-variable docs
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    MC_FORCE_MNNVL: "1"
+    MC_TE_METRIC: "true"  # decode-only: not present in prefill_environment
+    SGLANG_ENABLE_JIT_DEEPGEMM: "true"
+    SGLANG_ENABLE_FLASHINFER_GEMM: "true"
+    FLASHINFER_DISABLE_VERSION_CHECK: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"  # same cache directories as prefill_environment
+    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"  # same three limits as prefill_environment
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"  # decode-only; much smaller than the 100000 limits above
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"  # decode-only; NOTE(review): purpose not evident from this file — document or confirm
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"  # NOTE(review): "INBALANCE" spelling also used in prefill — verify before "fixing"
+    SGLANG_HEALTH_CHECK_TIMEOUT: "1800"  # decode-only health-check settings (this line and the next two)
+    SGLANG_HEALTH_STARTING_OK: "1"
+    SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION: "0"
+
+  sglang_config:  # per-role SGLang options; prefill and decode differ only in the lines annotated below
+    prefill:
+      served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 4  # TP4 with DP=1 and EP=1: the "pure tensor parallel" layout from the file header
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+      reasoning-parser: "qwen3"
+      tool-call-parser: "qwen3_coder"
+
+      quantization: "modelopt_fp4"
+      fp4-gemm-backend: "flashinfer_cutlass"
+      kv-cache-dtype: "fp8_e4m3"
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-ssm-dtype: "bfloat16"
+      mamba-track-interval: 2048  # decode uses 128
+
+      attention-backend: "trtllm_mha"
+      mm-attention-backend: "triton_attn"
+      moe-runner-backend: "flashinfer_trtllm"
+      linear-attn-decode-backend: "flashinfer"
+
+      disaggregation-mode: "prefill"
+      disable-radix-cache: true
+
+      mem-fraction-static: 0.8
+      context-length: 9236  # isl 8192 + osl 1024 = 9216, plus 20 — presumably headroom; confirm
+      max-total-tokens: 128000  # decode uses 1500000
+      max-running-requests: 128
+      cuda-graph-max-bs: 4  # decode uses 256
+      chunked-prefill-size: 32768  # 32768 = 4 x isl (8192); same value as max-prefill-tokens
+      max-prefill-tokens: 32768
+      scheduler-recv-interval: 10
+      stream-interval: 30
+      load-balance-method: "round_robin"  # prefill-only: not set in the decode section
+      page-size: 64
+      watchdog-timeout: 1000000
+      log-level: "info"  # prefill-only: not set in the decode section
+
+    decode:  # decode-role SGLang options; same model/backends as prefill, different capacity limits
+      served-model-name: "nvidia/Qwen3.5-397B-A17B-NVFP4"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 4  # TP4 with DP=1 and EP=1, same as prefill
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+      reasoning-parser: "qwen3"
+      tool-call-parser: "qwen3_coder"
+
+      quantization: "modelopt_fp4"
+      fp4-gemm-backend: "flashinfer_cutlass"
+      kv-cache-dtype: "fp8_e4m3"
+
+      mamba-scheduler-strategy: "no_buffer"
+      mamba-ssm-dtype: "bfloat16"
+      mamba-track-interval: 128  # prefill uses 2048
+
+      attention-backend: "trtllm_mha"
+      mm-attention-backend: "triton_attn"
+      moe-runner-backend: "flashinfer_trtllm"
+      linear-attn-decode-backend: "flashinfer"
+
+      disaggregation-mode: "decode"
+      disable-radix-cache: true
+
+      mem-fraction-static: 0.8
+      context-length: 9236  # isl 8192 + osl 1024 = 9216, plus 20 — presumably headroom; confirm
+      max-total-tokens: 1500000  # prefill uses 128000
+      max-mamba-cache-size: 256  # decode-only; equals the largest benchmark concurrency (256)
+      max-running-requests: 128  # NOTE(review): below cuda-graph-max-bs (256) and the top concurrency (256) — confirm intended
+      cuda-graph-max-bs: 256  # prefill uses 4
+      chunked-prefill-size: 32768
+      max-prefill-tokens: 32768
+      scheduler-recv-interval: 10
+      stream-interval: 30
+      page-size: 64
+      watchdog-timeout: 1000000
+      decode-log-interval: 50  # decode-only: not set in the prefill section
+
+benchmark:  # sa-bench load settings for this recipe
+  type: "sa-bench"
+  isl: 8192  # isl + osl = 9216, which fits within context-length 9236 in sglang_config
+  osl: 1024
+  concurrencies: "1x4x8x16x32x64x256"  # 'x'-separated sweep: 1, 4, 8, 16, 32, 64, 256 (no 128 step)
+  req_rate: "inf"  # the string "inf" — presumably means no request-rate cap; confirm against sa-bench
+  random_range_ratio: 0.8