Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions inference_engine/bridge/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,29 @@ def _harness_preset(
params={"max_new_tokens": ("int:max_new_tokens", "64")},
validate_reports=True, # §4 liveness gate on-device
),
# Preset that drives scripts/run_kakeya_mac.sh with a scripted chat prompt.
# The argv below carries no --fast flag, and the default token budget (1300)
# is above the ~1024 ring wrap that the description refers to.
Preset(
name="mlx-kakeya-launcher-full",
description="Validate scripts/run_kakeya_mac.sh in FULL mode (f_θ "
"verifier+proposer+f_θ, default path) on a LONG scripted "
"answer that crosses the ~1024 native-cache ring wrap. "
"Guards the launcher's full pipeline + the PR #146 "
"wrapped-ring fix end-to-end: the report must pass the §4 "
"liveness gate AND the quality gate (coherent, no runaway "
"repeat) past the wrap.",
# A single command template; "{max_new_tokens}" is a placeholder that is
# presumably filled from `params` below — confirm against build_commands.
command_templates=(
(
"bash", "scripts/run_kakeya_mac.sh",
"--max-new-tokens", "{max_new_tokens}",
"--ignore-turn-stop",
"--chat-scripted", "请详细解释POW的工作原理",
# Report file written by the run.
"--output",
"results/research/k3_mac_bridge_launcher_full.json",
),
),
timeout_minutes=90,
# NOTE(review): the tuple looks like (type-tagged parameter spec, default
# value as a string) — verify against the Preset definition.
params={"max_new_tokens": ("int:max_new_tokens", "1300")},
validate_reports=True, # §4 liveness + §2.4 quality gate on-device
),
Preset(
name="mlx-kakeya-degen-probe",
description="Long-decode regression probe: full f_θ fused engine on a "
Expand Down
20 changes: 15 additions & 5 deletions scripts/run_kakeya_mac.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@
# the all-MLX proposer path (f_θ bypassed via S5 native prefill — much faster on
# Mac, but the f_θ projection does not execute).
#
# LONG ANSWERS ARE SAFE (PR #146). The full path runs on gemma-4's native hybrid
# cache (sliding RotatingKVCache, max_size≈1024). Past that ring wrap the engine
# automatically commits single tokens (no speculative rollback to mis-trim on the
# wrapped ring), so generations stay coherent well beyond ~1024 tokens — they
# just lose the spec-decode speedup past the wrap. So the default budget below is
# generous; you no longer need to keep answers under the window.
#
# Model facts come from env vars (set on the kakeya-mac-m4 runner), with sane
# fallbacks; override on the CLI if needed:
# KAKEYA_MAC_VERIFIER_PATH local MLX gemma-4 dir
Expand All @@ -17,7 +24,7 @@
# Usage:
# bash scripts/run_kakeya_mac.sh # full engine (f_θ on), interactive
# bash scripts/run_kakeya_mac.sh --fast # proposer-only (f_θ bypassed), faster
# bash scripts/run_kakeya_mac.sh --max-new-tokens 2048 --window 128
# bash scripts/run_kakeya_mac.sh --max-new-tokens 4096 --window 128
# bash scripts/run_kakeya_mac.sh --dry-run # print the command, run nothing
# echo 'Explain proof-of-work.' | bash scripts/run_kakeya_mac.sh # one-shot via stdin
set -euo pipefail
Expand All @@ -31,7 +38,9 @@ FTHETA="${KAKEYA_MAC_FTHETA_DIR:-results/research/f_theta_v5_s5_sliding}"
SINK="${KAKEYA_SINK:-4}"
WINDOW="${KAKEYA_WINDOW:-64}"
BLOCK="${KAKEYA_BLOCK_SIZE:-4}"
MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-1024}"
# Default budget reaches past the ~1024 native-cache wrap; coherent there since
# PR #146 (single-token commits past the wrap). Raise/lower freely.
MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-2048}"

FAST=0
DRY_RUN=0
Expand All @@ -47,7 +56,7 @@ while [[ $# -gt 0 ]]; do
--window) shift; WINDOW="${1:?}" ;;
--sink) shift; SINK="${1:?}" ;;
--block-size) shift; BLOCK="${1:?}" ;;
-h|--help) sed -n '2,28p' "$0"; exit 0 ;;
-h|--help) sed -n '2,29p' "$0"; exit 0 ;;
*) EXTRA+=("$1") ;; # pass-through (e.g. --chat-scripted ...)
esac
shift
Expand All @@ -70,8 +79,9 @@ if [[ "$FAST" == "1" ]]; then
MODE="FAST (verifier + proposer + S5 bounded KV; f_θ BYPASSED)"
else
# torch drafter + f_θ: the harness auto-enables --force-f-theta in --chat, so
# f_θ projection ACTUALLY RUNS each turn (the full pipeline).
MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs)"
# f_θ projection ACTUALLY RUNS each turn (the full pipeline). Coherent past the
# ~1024 native-cache wrap (PR #146: single-token commits once the ring wraps).
MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs; long-answer safe)"
fi

log "mode : $MODE"
Expand Down
17 changes: 16 additions & 1 deletion tests/inference_engine/bridge/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def test_allowlist_contains_exactly_the_documented_presets():
"mlx-kakeya-degen-probe",
"mlx-kakeya-fused-chat-ftheta",
"mlx-kakeya-fused-chat-smoke",
"mlx-kakeya-launcher-full",
"mlx-kakeya-launcher-smoke",
"mlx-multitenant-pressure",
"mlx-upgrade",
Expand All @@ -106,7 +107,7 @@ def test_harness_presets_validate_reports_others_do_not():
"k3-step2-fused-allmlx",
# §4 liveness gate runs on-device for the fused-chat presets too:
"mlx-kakeya-fused-chat-smoke", "mlx-kakeya-fused-chat-ftheta",
"mlx-kakeya-launcher-smoke",
"mlx-kakeya-launcher-smoke", "mlx-kakeya-launcher-full",
}


Expand Down Expand Up @@ -166,6 +167,20 @@ def test_mlx_kakeya_launcher_smoke_preset_invokes_launcher():
assert argv[argv.index("--max-new-tokens") + 1] == "64"


def test_mlx_kakeya_launcher_full_preset_runs_full_mode_past_wrap():
    """The launcher-full preset builds one FULL-mode launcher command.

    The command must invoke the launcher script through bash, omit the
    ``--fast`` switch, include the scripted-chat flags, and carry a token
    budget above the ~1024 native-cache ring wrap.
    """
    manifest = _manifest(
        preset="mlx-kakeya-launcher-full",
        params={"max_new_tokens": "1300"},
    )
    commands = build_commands(parse_manifest(manifest), {})

    # Exactly one command is expected; unpacking fails otherwise.
    (command,) = commands

    assert command[0] == "bash"
    assert command[1].endswith("run_kakeya_mac.sh")

    # FULL mode means the --fast switch is absent.
    assert "--fast" not in command

    assert "--chat-scripted" in command
    assert "--ignore-turn-stop" in command

    # The value following --max-new-tokens must exceed the ring-wrap size.
    budget_position = command.index("--max-new-tokens") + 1
    assert int(command[budget_position]) > 1024


def test_mlx_kakeya_fused_chat_ftheta_preset_runs_f_theta_path():
request = parse_manifest(_manifest(
preset="mlx-kakeya-fused-chat-ftheta",
Expand Down
Loading