FluffyAIcode · cursor · Jun 18, 2026 · Jun 17, 2026
diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
@@ -771,6 +771,29 @@ def _harness_preset(
             params={"max_new_tokens": ("int:max_new_tokens", "64")},
             validate_reports=True,  # §4 liveness gate on-device
         ),
+        Preset(
+            name="mlx-kakeya-launcher-full",
+            description="Validate scripts/run_kakeya_mac.sh in FULL mode (f_θ "
+                        "verifier+proposer+f_θ, default path) on a LONG scripted "
+                        "answer that crosses the ~1024 native-cache ring wrap. "
+                        "Guards the launcher's full pipeline + the PR #146 "
+                        "wrapped-ring fix end-to-end: the report must pass the §4 "
+                        "liveness gate AND the quality gate (coherent, no runaway "
+                        "repeat) past the wrap.",
+            command_templates=(
+                (
+                    "bash", "scripts/run_kakeya_mac.sh",
+                    "--max-new-tokens", "{max_new_tokens}",
+                    "--ignore-turn-stop",
+                    "--chat-scripted", "请详细解释POW的工作原理",
+                    "--output",
+                    "results/research/k3_mac_bridge_launcher_full.json",
+                ),
+            ),
+            timeout_minutes=90,
+            params={"max_new_tokens": ("int:max_new_tokens", "1300")},
+            validate_reports=True,  # §4 liveness + §2.4 quality gate on-device
+        ),
         Preset(
             name="mlx-kakeya-degen-probe",
             description="Long-decode regression probe: full f_θ fused engine on a "

diff --git a/scripts/run_kakeya_mac.sh b/scripts/run_kakeya_mac.sh
@@ -8,6 +8,13 @@
 # the all-MLX proposer path (f_θ bypassed via S5 native prefill — much faster on
 # Mac, but the f_θ projection does not execute).
 #
+# LONG ANSWERS ARE SAFE (PR #146). The full path runs on gemma-4's native hybrid
+# cache (sliding RotatingKVCache, max_size≈1024). Past that ring wrap the engine
+# automatically commits single tokens (no speculative rollback to mis-trim on the
+# wrapped ring), so generations stay coherent well beyond ~1024 tokens — they
+# just lose the spec-decode speedup past the wrap. So the default budget below is
+# generous; you no longer need to keep answers under the ~1024-token ring size.
+#
 # Model facts come from env vars (set on the kakeya-mac-m4 runner), with sane
 # fallbacks; override on the CLI if needed:
 #   KAKEYA_MAC_VERIFIER_PATH   local MLX gemma-4 dir
@@ -17,7 +24,7 @@
 # Usage:
 #   bash scripts/run_kakeya_mac.sh                 # full engine (f_θ on), interactive
 #   bash scripts/run_kakeya_mac.sh --fast          # proposer-only (f_θ bypassed), faster
-#   bash scripts/run_kakeya_mac.sh --max-new-tokens 2048 --window 128
+#   bash scripts/run_kakeya_mac.sh --max-new-tokens 4096 --window 128
 #   bash scripts/run_kakeya_mac.sh --dry-run       # print the command, run nothing
 #   echo 'Explain proof-of-work.' | bash scripts/run_kakeya_mac.sh   # one-shot via stdin
 set -euo pipefail
@@ -31,7 +38,9 @@ FTHETA="${KAKEYA_MAC_FTHETA_DIR:-results/research/f_theta_v5_s5_sliding}"
 SINK="${KAKEYA_SINK:-4}"
 WINDOW="${KAKEYA_WINDOW:-64}"
 BLOCK="${KAKEYA_BLOCK_SIZE:-4}"
-MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-1024}"
+# Default budget reaches past the ~1024 native-cache wrap; coherent there since
+# PR #146 (single-token commits past the wrap). Raise/lower freely.
+MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-2048}"
 
 FAST=0
 DRY_RUN=0
@@ -47,7 +56,7 @@ while [[ $# -gt 0 ]]; do
     --window)          shift; WINDOW="${1:?}" ;;
     --sink)            shift; SINK="${1:?}" ;;
     --block-size)      shift; BLOCK="${1:?}" ;;
-    -h|--help)         sed -n '2,28p' "$0"; exit 0 ;;
+    -h|--help)         sed -n '2,29p' "$0"; exit 0 ;;
     *)                 EXTRA+=("$1") ;;   # pass-through (e.g. --chat-scripted ...)
   esac
   shift
@@ -70,8 +79,9 @@ if [[ "$FAST" == "1" ]]; then
   MODE="FAST (verifier + proposer + S5 bounded KV; f_θ BYPASSED)"
 else
   # torch drafter + f_θ: the harness auto-enables --force-f-theta in --chat, so
-  # f_θ projection ACTUALLY RUNS each turn (the full pipeline).
-  MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs)"
+  # f_θ projection ACTUALLY RUNS each turn (the full pipeline). Coherent past the
+  # ~1024 native-cache wrap (PR #146: single-token commits once the ring wraps).
+  MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs; long-answer safe)"
 fi
 
 log "mode    : $MODE"

diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py
@@ -84,6 +84,7 @@ def test_allowlist_contains_exactly_the_documented_presets():
         "mlx-kakeya-degen-probe",
         "mlx-kakeya-fused-chat-ftheta",
         "mlx-kakeya-fused-chat-smoke",
+        "mlx-kakeya-launcher-full",
         "mlx-kakeya-launcher-smoke",
         "mlx-multitenant-pressure",
         "mlx-upgrade",
@@ -106,7 +107,7 @@ def test_harness_presets_validate_reports_others_do_not():
         "k3-step2-fused-allmlx",
         # §4 liveness gate runs on-device for the fused-chat presets too:
         "mlx-kakeya-fused-chat-smoke", "mlx-kakeya-fused-chat-ftheta",
-        "mlx-kakeya-launcher-smoke",
+        "mlx-kakeya-launcher-smoke", "mlx-kakeya-launcher-full",
     }
 
 
@@ -166,6 +167,20 @@ def test_mlx_kakeya_launcher_smoke_preset_invokes_launcher():
     assert argv[argv.index("--max-new-tokens") + 1] == "64"
 
 
+def test_mlx_kakeya_launcher_full_preset_runs_full_mode_past_wrap():
+    request = parse_manifest(_manifest(
+        preset="mlx-kakeya-launcher-full", params={"max_new_tokens": "1300"}))
+    (argv,) = build_commands(request, {})
+    assert argv[0] == "bash"
+    assert argv[1].endswith("run_kakeya_mac.sh")
+    # FULL mode: no --fast flag, so the verifier+proposer+f_θ path is selected.
+    assert "--fast" not in argv
+    assert "--chat-scripted" in argv
+    assert "--ignore-turn-stop" in argv
+    # 1300 requested above: the budget crosses the ~1024 native-cache ring wrap.
+    assert int(argv[argv.index("--max-new-tokens") + 1]) > 1024
+
+
 def test_mlx_kakeya_fused_chat_ftheta_preset_runs_f_theta_path():
     request = parse_manifest(_manifest(
         preset="mlx-kakeya-fused-chat-ftheta",