Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ ROCM_SRCS := $(wildcard rocm/*.cuh)

ifeq ($(UNAME_S),Darwin)
METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_metal.o
CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o
CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_metal.o
CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o
else
CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
CUDA_HOME ?= /usr/local/cuda
Expand All @@ -28,8 +28,8 @@ ifneq ($(strip $(CUDA_ARCH)),)
NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
endif
NVCCFLAGS ?= -O3 -g -lineinfo --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_cuda.o
CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o
CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_cuda.o
CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o
CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
HIPCC ?= $(shell command -v hipcc 2>/dev/null || echo /opt/rocm/bin/hipcc)
ROCM_ARCH ?= gfx1151
Expand Down Expand Up @@ -106,7 +106,7 @@ cuda:

strix-halo:
$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent \
CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o" \
CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_rocm.o" \
CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
DS4_LINK_LIBS="$(ROCM_LDLIBS)"
Expand Down Expand Up @@ -139,11 +139,13 @@ cuda-regression: tests/cuda_long_context_smoke
./tests/cuda_long_context_smoke
endif

ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h
$(CC) $(CFLAGS) -c -o $@ ds4.c

ds4_ssd.o: ds4_ssd.c ds4_ssd.h
$(CC) $(CFLAGS) -c -o $@ ds4_ssd.c

ds4_dspark_runtime.o: ds4_dspark_runtime.c ds4_dspark_runtime.h ds4.h
$(CC) $(CFLAGS) -c -o $@ ds4_dspark_runtime.c

ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
$(CC) $(CFLAGS) -c -o $@ ds4_cli.c
Expand Down Expand Up @@ -187,7 +189,7 @@ rax.o: rax.c rax.h rax_malloc.h
linenoise.o: linenoise.c linenoise.h
$(CC) $(CFLAGS) -c -o $@ linenoise.c

ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h
$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4.c

ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
Expand Down
46 changes: 37 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,37 @@ weights. Flash GGUF generation is supported by the local tools. PRO GGUF
production currently still depends on the external `llama.cpp`-based workflow;
native tooling can be added later.

`./download_model.sh mtp` fetches the optional speculative decoding support
GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and q4-imatrix,
but must be enabled explicitly with `--mtp`. The current MTP/speculative
decoding path is still experimental: it is correctness-gated and currently
provides at most a slight speedup, not a meaningful generation-speed win.
`./download_model.sh mtp` fetches the optional legacy speculative decoding
support GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and
q4-imatrix, but must be enabled explicitly with `--mtp`. Legacy one-step MTP is
correctness-gated and experimental: it currently provides at most a slight
speedup, not a meaningful generation-speed win. Official DeepSeek-V4-Flash
DSpark/DeepSpec Markov draft shards can be converted with
`gguf-tools/deepseek4-quantize --dspark-only`. Passing the converted DSpark GGUF
with `--mtp DSpark.gguf` enables an experimental Metal block speculative decode
path: draft blocks are verified against the target model before they are
committed, but the acceptance rate and speed depend on the base/draft
quantization and the prompt. DSpark GGUFs are additional
draft-model weights, so higher draft precision trades directly against
long-context headroom. CPU builds do not run MTP, and CUDA/ROCm currently load
DSpark GGUFs without enabling the DSpark runtime.

For DeepSpec training experiments, `ds4 --dspark-target-cache-dataset FILE
--dspark-target-cache-out DIR --dspark-target-cache-target-model HF_OR_PATH`
consumes the same rendered prompt dataset format used by imatrix collection and
writes a DeepSpec-compatible target cache (`manifest.json`, `samples.idx`, and
shard data) containing prompt token ids, attention/loss masks, target-layer
hidden states, and last hidden states. Use
`--dspark-target-cache-chat-template NAME` to stamp the cache manifest with the
DeepSpec training template identity.
Validate the cache contract with
`python3 gguf-tools/deepspec/ds4_deepspec.py DIR --target-model HF_OR_PATH`
before handing it to a DeepSpec checkout. The same helper can emit the DS4-side
non-Markov DeepSpec config scaffold with
`python3 gguf-tools/deepspec/ds4_deepspec.py --emit-nonseq-config dspark_v4_nonseq.py --target-cache DIR`.
This target-cache export path remains useful for DSpark/DeepSpec training
experiments. The built-in Metal runtime itself uses already-converted official
DSpark Markov draft GGUFs; benchmark it with `DS4_MTP_TIMING=1` on the exact
base/draft quant pair before treating it as a throughput win.

Then build:

Expand Down Expand Up @@ -689,10 +715,12 @@ conversation. Useful commands are `/help`, `/think`, `/think-max`, `/nothink`,
and returns to `ds4>`.

The CLI defaults to thinking mode. Use `/nothink` or `--nothink` for direct
answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional MTP speculative
path; it is useful only for greedy decoding, currently uses a confidence gate
(`--mtp-margin`) to avoid slow partial accepts, and should be treated as an
experimental slight-speedup path.
answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional legacy one-step
MTP speculative path. Passing a converted official DSpark/DeepSpec Markov GGUF
with `--mtp DSpark.gguf` opts into the experimental Metal block-draft runtime,
which verifies proposed blocks against the target model before committing them.
The DSpark path is correctness-gated, not a guaranteed speedup; measure
acceptance and wall time for the exact quantized base/draft pair.

## Server

Expand Down
9 changes: 5 additions & 4 deletions download_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ Targets:
Downloads both PRO Q4 split files into the download directory. About
838 GB total. This target does not update ./ds4flash.gguf.

mtp Optional speculative decoding component, about 3.5 GB on disk.
It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but must be
enabled explicitly with --mtp when running ds4 or ds4-server.
mtp Optional legacy one-step speculative decoding component, about 3.5 GB on
disk. It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but
must be enabled explicitly with --mtp when running ds4 or ds4-server.

Options:
--token TOKEN Hugging Face token. Otherwise HF_TOKEN or the local HF token
Expand Down Expand Up @@ -259,9 +259,10 @@ fi

if [ "$MODEL" = "mtp" ]; then
echo
echo "MTP is an optional component for q2-imatrix, q2-q4-imatrix, and q4-imatrix."
echo "MTP is an optional legacy one-step component for q2-imatrix, q2-q4-imatrix, and q4-imatrix."
echo "Enable it explicitly, for example:"
echo " ./ds4 --mtp $OUT_DIR/$MTP_FILE --mtp-draft 2"
echo "DeepSpec/DSpark GGUFs are recognized separately by the loader but speculative block drafting remains disabled until validated."
elif [ "$MODEL" = "pro-q4-layers00-30" ] || [ "$MODEL" = "pro-q4-layers31-output" ] || [ "$MODEL" = "pro-q4-split" ]; then
echo
echo "Downloaded PRO Q4 distributed split file(s). Use them with --layers,"
Expand Down
Loading