antirez · machiabeli · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 30, 2026
diff --git a/Makefile b/Makefile
@@ -17,8 +17,8 @@ ROCM_SRCS := $(wildcard rocm/*.cuh)
 
 ifeq ($(UNAME_S),Darwin)
 METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
-CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_metal.o
-CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o
+CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_metal.o
+CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o
 else
 CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
 CUDA_HOME ?= /usr/local/cuda
@@ -28,8 +28,8 @@ ifneq ($(strip $(CUDA_ARCH)),)
 NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
 endif
 NVCCFLAGS ?= -O3 -g -lineinfo --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
-CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_cuda.o
-CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o
+CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_cuda.o
+CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o
 CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
 HIPCC ?= $(shell command -v hipcc 2>/dev/null || echo /opt/rocm/bin/hipcc)
 ROCM_ARCH ?= gfx1151
@@ -106,7 +106,7 @@ cuda:
 
 strix-halo:
 	$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent \
-		CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o" \
+		CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_rocm.o" \
 		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
 		DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
 		DS4_LINK_LIBS="$(ROCM_LDLIBS)"
@@ -139,11 +139,14 @@ cuda-regression: tests/cuda_long_context_smoke
 	./tests/cuda_long_context_smoke
 endif
 
-ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
+ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h
 	$(CC) $(CFLAGS) -c -o $@ ds4.c
 
 ds4_ssd.o: ds4_ssd.c ds4_ssd.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_ssd.c
+
+ds4_dspark_runtime.o: ds4_dspark_runtime.c ds4_dspark_runtime.h ds4.h
+	$(CC) $(CFLAGS) -c -o $@ ds4_dspark_runtime.c
 
 ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_cli.c
@@ -187,7 +190,7 @@ rax.o: rax.c rax.h rax_malloc.h
 linenoise.o: linenoise.c linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ linenoise.c
 
-ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
+ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4.c
 
 ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h

diff --git a/README.md b/README.md
@@ -133,11 +133,37 @@ weights. Flash GGUF generation is supported by the local tools. PRO GGUF
 production currently still depends on the external `llama.cpp`-based workflow;
 native tooling can be added later.
 
-`./download_model.sh mtp` fetches the optional speculative decoding support
-GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and q4-imatrix,
-but must be enabled explicitly with `--mtp`. The current MTP/speculative
-decoding path is still experimental: it is correctness-gated and currently
-provides at most a slight speedup, not a meaningful generation-speed win.
+`./download_model.sh mtp` fetches the optional legacy speculative decoding
+support GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and
+q4-imatrix, but must be enabled explicitly with `--mtp`. Legacy one-step MTP is
+correctness-gated and experimental: it currently provides at most a slight
+speedup, not a meaningful generation-speed win. Official DeepSeek-V4-Flash
+DSpark/DeepSpec Markov draft shards can be converted with
+`gguf-tools/deepseek4-quantize --dspark-only`. Passing the converted DSpark GGUF
+with `--mtp DSpark.gguf` enables an experimental Metal block speculative decode
+path: draft blocks are target-verified before commit, but acceptance and speed
+depend on the base/draft quantization and prompt. DSpark GGUFs are additional
+draft-model weights, so higher draft precision trades directly against
+long-context headroom. CPU builds do not run MTP, and CUDA/ROCm currently load
+DSpark GGUFs without enabling the DSpark runtime.
+
+For DeepSpec training experiments, `ds4 --dspark-target-cache-dataset FILE
+--dspark-target-cache-out DIR --dspark-target-cache-target-model HF_OR_PATH`
+consumes the same rendered prompt dataset format used by imatrix collection and
+writes a DeepSpec-compatible target cache (`manifest.json`, `samples.idx`, and
+shard data) containing prompt token ids, attention/loss masks, target-layer
+hidden states, and last hidden states. Use
+`--dspark-target-cache-chat-template NAME` to stamp the cache manifest with the
+DeepSpec training template identity.
+Validate the cache contract with
+`python3 gguf-tools/deepspec/ds4_deepspec.py DIR --target-model HF_OR_PATH`
+before handing it to a DeepSpec checkout. The same helper can emit the DS4-side
+non-Markov DeepSpec config scaffold with
+`python3 gguf-tools/deepspec/ds4_deepspec.py --emit-nonseq-config dspark_v4_nonseq.py --target-cache DIR`.
+This target-cache export path remains useful for DSpark/DeepSpec training
+experiments. The built-in Metal runtime instead consumes already-converted
+official DSpark Markov draft GGUFs; benchmark it with `DS4_MTP_TIMING=1` on the
+exact base/draft quant pair before treating it as a throughput win.
 
 Then build:
 
@@ -689,10 +715,12 @@ conversation. Useful commands are `/help`, `/think`, `/think-max`, `/nothink`,
 and returns to `ds4>`.
 
 The CLI defaults to thinking mode. Use `/nothink` or `--nothink` for direct
-answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional MTP speculative
-path; it is useful only for greedy decoding, currently uses a confidence gate
-(`--mtp-margin`) to avoid slow partial accepts, and should be treated as an
-experimental slight-speedup path.
+answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional legacy one-step
+MTP speculative path. Passing a converted official DSpark/DeepSpec Markov GGUF
+with `--mtp DSpark.gguf` opts into the experimental Metal block-draft runtime,
+which verifies proposed blocks against the target model before committing them.
+It is correctness-gated, not a guaranteed speedup; measure acceptance and wall
+time for the exact quantized base/draft pair.
 
 ## Server
 

diff --git a/download_model.sh b/download_model.sh
@@ -65,9 +65,9 @@ Targets:
        Downloads both PRO Q4 split files into the download directory. About
        838 GB total. This target does not update ./ds4flash.gguf.
 
-  mtp  Optional speculative decoding component, about 3.5 GB on disk.
-       It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but must be
-       enabled explicitly with --mtp when running ds4 or ds4-server.
+  mtp  Optional legacy one-step speculative decoding component, about 3.5 GB on
+       disk. It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but
+       must be enabled explicitly with --mtp when running ds4 or ds4-server.
 
 Options:
   --token TOKEN  Hugging Face token. Otherwise HF_TOKEN or the local HF token
@@ -259,9 +259,10 @@ fi
 
 if [ "$MODEL" = "mtp" ]; then
     echo
-    echo "MTP is an optional component for q2-imatrix, q2-q4-imatrix, and q4-imatrix."
+    echo "MTP is an optional legacy one-step component for q2-imatrix, q2-q4-imatrix, and q4-imatrix."
     echo "Enable it explicitly, for example:"
     echo "  ./ds4 --mtp $OUT_DIR/$MTP_FILE --mtp-draft 2"
+    echo "DeepSpec/DSpark GGUFs are recognized separately by the loader: passing one with --mtp enables the experimental Metal block-draft runtime, while CUDA/ROCm builds load it without enabling DSpark drafting."
 elif [ "$MODEL" = "pro-q4-layers00-30" ] || [ "$MODEL" = "pro-q4-layers31-output" ] || [ "$MODEL" = "pro-q4-split" ]; then
     echo
     echo "Downloaded PRO Q4 distributed split file(s). Use them with --layers,"