diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 8b67da7..69b5d8f 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -771,6 +771,29 @@ def _harness_preset( params={"max_new_tokens": ("int:max_new_tokens", "64")}, validate_reports=True, # §4 liveness gate on-device ), + Preset( + name="mlx-kakeya-launcher-full", + description="Validate scripts/run_kakeya_mac.sh in FULL mode (f_θ " + "verifier+proposer+f_θ, default path) on a LONG scripted " + "answer that crosses the ~1024 native-cache ring wrap. " + "Guards the launcher's full pipeline + the PR #146 " + "wrapped-ring fix end-to-end: the report must pass the §4 " + "liveness gate AND the quality gate (coherent, no runaway " + "repeat) past the wrap.", + command_templates=( + ( + "bash", "scripts/run_kakeya_mac.sh", + "--max-new-tokens", "{max_new_tokens}", + "--ignore-turn-stop", + "--chat-scripted", "请详细解释POW的工作原理", + "--output", + "results/research/k3_mac_bridge_launcher_full.json", + ), + ), + timeout_minutes=90, + params={"max_new_tokens": ("int:max_new_tokens", "1300")}, + validate_reports=True, # §4 liveness + §2.4 quality gate on-device + ), Preset( name="mlx-kakeya-degen-probe", description="Long-decode regression probe: full f_θ fused engine on a " diff --git a/scripts/run_kakeya_mac.sh b/scripts/run_kakeya_mac.sh index 4bf3308..ea99f34 100755 --- a/scripts/run_kakeya_mac.sh +++ b/scripts/run_kakeya_mac.sh @@ -8,6 +8,13 @@ # the all-MLX proposer path (f_θ bypassed via S5 native prefill — much faster on # Mac, but the f_θ projection does not execute). # +# LONG ANSWERS ARE SAFE (PR #146). The full path runs on gemma-4's native hybrid +# cache (sliding RotatingKVCache, max_size≈1024). Past that ring wrap the engine +# automatically commits single tokens (no speculative rollback to mis-trim on the +# wrapped ring), so generations stay coherent well beyond ~1024 tokens — they +# just lose the spec-decode speedup past the wrap. So the default budget below is +# generous; you no longer need to keep answers under the window. +# # Model facts come from env vars (set on the kakeya-mac-m4 runner), with sane # fallbacks; override on the CLI if needed: # KAKEYA_MAC_VERIFIER_PATH local MLX gemma-4 dir @@ -17,7 +24,7 @@ # Usage: # bash scripts/run_kakeya_mac.sh # full engine (f_θ on), interactive # bash scripts/run_kakeya_mac.sh --fast # proposer-only (f_θ bypassed), faster -# bash scripts/run_kakeya_mac.sh --max-new-tokens 2048 --window 128 +# bash scripts/run_kakeya_mac.sh --max-new-tokens 4096 --window 128 # bash scripts/run_kakeya_mac.sh --dry-run # print the command, run nothing # echo 'Explain proof-of-work.' | bash scripts/run_kakeya_mac.sh # one-shot via stdin set -euo pipefail @@ -31,7 +38,9 @@ FTHETA="${KAKEYA_MAC_FTHETA_DIR:-results/research/f_theta_v5_s5_sliding}" SINK="${KAKEYA_SINK:-4}" WINDOW="${KAKEYA_WINDOW:-64}" BLOCK="${KAKEYA_BLOCK_SIZE:-4}" -MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-1024}" +# Default budget reaches past the ~1024 native-cache wrap; coherent there since +# PR #146 (single-token commits past the wrap). Raise/lower freely. +MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-2048}" FAST=0 DRY_RUN=0 @@ -47,7 +56,7 @@ while [[ $# -gt 0 ]]; do --window) shift; WINDOW="${1:?}" ;; --sink) shift; SINK="${1:?}" ;; --block-size) shift; BLOCK="${1:?}" ;; - -h|--help) sed -n '2,28p' "$0"; exit 0 ;; + -h|--help) sed -n '2,29p' "$0"; exit 0 ;; *) EXTRA+=("$1") ;; # pass-through (e.g. --chat-scripted ...) esac shift @@ -70,8 +79,9 @@ if [[ "$FAST" == "1" ]]; then MODE="FAST (verifier + proposer + S5 bounded KV; f_θ BYPASSED)" else # torch drafter + f_θ: the harness auto-enables --force-f-theta in --chat, so - # f_θ projection ACTUALLY RUNS each turn (the full pipeline). - MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs)" + # f_θ projection ACTUALLY RUNS each turn (the full pipeline). Coherent past the + # ~1024 native-cache wrap (PR #146: single-token commits once the ring wraps). + MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs; long-answer safe)" fi log "mode : $MODE" diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py index 31ce0ec..4348b88 100644 --- a/tests/inference_engine/bridge/test_manifest.py +++ b/tests/inference_engine/bridge/test_manifest.py @@ -84,6 +84,7 @@ def test_allowlist_contains_exactly_the_documented_presets(): "mlx-kakeya-degen-probe", "mlx-kakeya-fused-chat-ftheta", "mlx-kakeya-fused-chat-smoke", + "mlx-kakeya-launcher-full", "mlx-kakeya-launcher-smoke", "mlx-multitenant-pressure", "mlx-upgrade", @@ -106,7 +107,7 @@ def test_harness_presets_validate_reports_others_do_not(): "k3-step2-fused-allmlx", # §4 liveness gate runs on-device for the fused-chat presets too: "mlx-kakeya-fused-chat-smoke", "mlx-kakeya-fused-chat-ftheta", - "mlx-kakeya-launcher-smoke", + "mlx-kakeya-launcher-smoke", "mlx-kakeya-launcher-full", } @@ -166,6 +167,20 @@ def test_mlx_kakeya_launcher_smoke_preset_invokes_launcher(): assert argv[argv.index("--max-new-tokens") + 1] == "64" +def test_mlx_kakeya_launcher_full_preset_runs_full_mode_past_wrap(): + request = parse_manifest(_manifest( + preset="mlx-kakeya-launcher-full", params={"max_new_tokens": "1300"})) + (argv,) = build_commands(request, {}) + assert argv[0] == "bash" + assert argv[1].endswith("run_kakeya_mac.sh") + # FULL mode: NO --fast (f_θ verifier+proposer+f_θ path). + assert "--fast" not in argv + assert "--chat-scripted" in argv + assert "--ignore-turn-stop" in argv + # budget crosses the ~1024 native-cache ring wrap. + assert int(argv[argv.index("--max-new-tokens") + 1]) > 1024 + + def test_mlx_kakeya_fused_chat_ftheta_preset_runs_f_theta_path(): request = parse_manifest(_manifest( preset="mlx-kakeya-fused-chat-ftheta",