From f06264a0c11e192abe975192ae170e893006d53d Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 15:15:19 +0000
Subject: [PATCH 01/12] feat(mac-launcher): long-answer-safe defaults +
 full-mode validation preset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

run_kakeya_mac.sh:
- Document that long answers are now coherent past the ~1024 native-cache ring
  wrap (PR #146: single-token commits once the sliding RotatingKVCache wraps).
- Raise default --max-new-tokens 1024 -> 2048 (the wrap is no longer a coherence
  cliff; FULL mode just drops the spec-decode speedup past it).
- Refresh help text (usage example, --help line range) and the FULL-mode banner.

bridge: add mlx-kakeya-launcher-full preset (FULL f_θ path, long scripted answer
crossing the wrap, validate_reports) so CI/on-device guards the launcher's full
pipeline + the wrap fix end-to-end; launcher-smoke stays for fast wiring checks.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/manifest.py           | 23 +++++++++++++++++++
 scripts/run_kakeya_mac.sh                     | 20 ++++++++++++----
 .../inference_engine/bridge/test_manifest.py  | 17 +++++++++++++-
 3 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 8b67da7..69b5d8f 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -771,6 +771,29 @@ def _harness_preset(
             params={"max_new_tokens": ("int:max_new_tokens", "64")},
             validate_reports=True,  # §4 liveness gate on-device
         ),
+        Preset(
+            name="mlx-kakeya-launcher-full",
+            description="Validate scripts/run_kakeya_mac.sh in FULL mode (f_θ "
+                        "verifier+proposer+f_θ, default path) on a LONG scripted "
+                        "answer that crosses the ~1024 native-cache ring wrap. "
+                        "Guards the launcher's full pipeline + the PR #146 "
+                        "wrapped-ring fix end-to-end: the report must pass the §4 "
+                        "liveness gate AND the quality gate (coherent, no runaway "
+                        "repeat) past the wrap.",
+            command_templates=(
+                (
+                    "bash", "scripts/run_kakeya_mac.sh",
+                    "--max-new-tokens", "{max_new_tokens}",
+                    "--ignore-turn-stop",
+                    "--chat-scripted", "请详细解释POW的工作原理",
+                    "--output",
+                    "results/research/k3_mac_bridge_launcher_full.json",
+                ),
+            ),
+            timeout_minutes=90,
+            params={"max_new_tokens": ("int:max_new_tokens", "1300")},
+            validate_reports=True,  # §4 liveness + §2.4 quality gate on-device
+        ),
         Preset(
             name="mlx-kakeya-degen-probe",
             description="Long-decode regression probe: full f_θ fused engine on a "
diff --git a/scripts/run_kakeya_mac.sh b/scripts/run_kakeya_mac.sh
index 4bf3308..ea99f34 100755
--- a/scripts/run_kakeya_mac.sh
+++ b/scripts/run_kakeya_mac.sh
@@ -8,6 +8,13 @@
 # the all-MLX proposer path (f_θ bypassed via S5 native prefill — much faster on
 # Mac, but the f_θ projection does not execute).
 #
+# LONG ANSWERS ARE SAFE (PR #146). The full path runs on gemma-4's native hybrid
+# cache (sliding RotatingKVCache, max_size≈1024). Past that ring wrap the engine
+# automatically commits single tokens (no speculative rollback to mis-trim on the
+# wrapped ring), so generations stay coherent well beyond ~1024 tokens — they
+# just lose the spec-decode speedup past the wrap. So the default budget below is
+# generous; you no longer need to keep answers under the window.
+#
 # Model facts come from env vars (set on the kakeya-mac-m4 runner), with sane
 # fallbacks; override on the CLI if needed:
 #   KAKEYA_MAC_VERIFIER_PATH   local MLX gemma-4 dir
@@ -17,7 +24,7 @@
 # Usage:
 #   bash scripts/run_kakeya_mac.sh                 # full engine (f_θ on), interactive
 #   bash scripts/run_kakeya_mac.sh --fast          # proposer-only (f_θ bypassed), faster
-#   bash scripts/run_kakeya_mac.sh --max-new-tokens 2048 --window 128
+#   bash scripts/run_kakeya_mac.sh --max-new-tokens 4096 --window 128
 #   bash scripts/run_kakeya_mac.sh --dry-run       # print the command, run nothing
 #   echo 'Explain proof-of-work.' | bash scripts/run_kakeya_mac.sh   # one-shot via stdin
 set -euo pipefail
@@ -31,7 +38,9 @@ FTHETA="${KAKEYA_MAC_FTHETA_DIR:-results/research/f_theta_v5_s5_sliding}"
 SINK="${KAKEYA_SINK:-4}"
 WINDOW="${KAKEYA_WINDOW:-64}"
 BLOCK="${KAKEYA_BLOCK_SIZE:-4}"
-MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-1024}"
+# Default budget reaches past the ~1024 native-cache wrap; coherent there since
+# PR #146 (single-token commits past the wrap). Raise/lower freely.
+MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-2048}"
 
 FAST=0
 DRY_RUN=0
@@ -47,7 +56,7 @@ while [[ $# -gt 0 ]]; do
     --window)          shift; WINDOW="${1:?}" ;;
     --sink)            shift; SINK="${1:?}" ;;
     --block-size)      shift; BLOCK="${1:?}" ;;
-    -h|--help)         sed -n '2,28p' "$0"; exit 0 ;;
+    -h|--help)         sed -n '2,29p' "$0"; exit 0 ;;
     *)                 EXTRA+=("$1") ;;   # pass-through (e.g. --chat-scripted ...)
   esac
   shift
@@ -70,8 +79,9 @@ if [[ "$FAST" == "1" ]]; then
   MODE="FAST (verifier + proposer + S5 bounded KV; f_θ BYPASSED)"
 else
   # torch drafter + f_θ: the harness auto-enables --force-f-theta in --chat, so
-  # f_θ projection ACTUALLY RUNS each turn (the full pipeline).
-  MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs)"
+  # f_θ projection ACTUALLY RUNS each turn (the full pipeline). Coherent past the
+  # ~1024 native-cache wrap (PR #146: single-token commits once the ring wraps).
+  MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs; long-answer safe)"
 fi
 
 log "mode    : $MODE"
diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py
index 31ce0ec..4348b88 100644
--- a/tests/inference_engine/bridge/test_manifest.py
+++ b/tests/inference_engine/bridge/test_manifest.py
@@ -84,6 +84,7 @@ def test_allowlist_contains_exactly_the_documented_presets():
         "mlx-kakeya-degen-probe",
         "mlx-kakeya-fused-chat-ftheta",
         "mlx-kakeya-fused-chat-smoke",
+        "mlx-kakeya-launcher-full",
         "mlx-kakeya-launcher-smoke",
         "mlx-multitenant-pressure",
         "mlx-upgrade",
@@ -106,7 +107,7 @@ def test_harness_presets_validate_reports_others_do_not():
         "k3-step2-fused-allmlx",
         # §4 liveness gate runs on-device for the fused-chat presets too:
         "mlx-kakeya-fused-chat-smoke", "mlx-kakeya-fused-chat-ftheta",
-        "mlx-kakeya-launcher-smoke",
+        "mlx-kakeya-launcher-smoke", "mlx-kakeya-launcher-full",
     }
 
 
@@ -166,6 +167,20 @@ def test_mlx_kakeya_launcher_smoke_preset_invokes_launcher():
     assert argv[argv.index("--max-new-tokens") + 1] == "64"
 
 
+def test_mlx_kakeya_launcher_full_preset_runs_full_mode_past_wrap():
+    request = parse_manifest(_manifest(
+        preset="mlx-kakeya-launcher-full", params={"max_new_tokens": "1300"}))
+    (argv,) = build_commands(request, {})
+    assert argv[0] == "bash"
+    assert argv[1].endswith("run_kakeya_mac.sh")
+    # FULL mode: NO --fast (f_θ verifier+proposer+f_θ path).
+    assert "--fast" not in argv
+    assert "--chat-scripted" in argv
+    assert "--ignore-turn-stop" in argv
+    # budget crosses the ~1024 native-cache ring wrap.
+    assert int(argv[argv.index("--max-new-tokens") + 1]) > 1024
+
+
 def test_mlx_kakeya_fused_chat_ftheta_preset_runs_f_theta_path():
     request = parse_manifest(_manifest(
         preset="mlx-kakeya-fused-chat-ftheta",

From 88743e53aa71b87a970569889efe6065e79fc645 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 15:55:38 +0000
Subject: [PATCH 02/12] debug(mlx-fused): instrument codegen markdown-loop
 degeneration + native-control probe

KAKEYA_KDBG-gated per-block logging (sampled/committed ids, cyc_frac/cyc_p,
cache offsets) in fused_specdecode_generate, and a turn_compare_fused_vs_native
record (first_divergence_idx + both tails) in _run_fused_chat. New bridge preset
mlx-kakeya-codegen-degen-probe runs the C-code prompt with --chat-native-ref to
decide greedy-pathology vs engine bug. Instrumentation only; reverted after fix.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 .../backends/mlx/fused_specdecode.py          | 78 +++++++++++++++++++
 inference_engine/bridge/manifest.py           | 27 +++++++
 .../research/k3_integrated_niah_eval_mac.py   | 38 +++++++++
 .../inference_engine/bridge/test_manifest.py  |  1 +
 4 files changed, 144 insertions(+)

diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py
index c14cfbc..3f35f02 100644
--- a/inference_engine/backends/mlx/fused_specdecode.py
+++ b/inference_engine/backends/mlx/fused_specdecode.py
@@ -35,6 +35,68 @@
     restored_prefill_cache,
 )
 
+# region agent log (fused-codegen-degeneration-2815 probe; strip after fix)
+import os as _kdbg_os
+import sys as _kdbg_sys
+import json as _kdbg_json
+
+_KDBG = bool(_kdbg_os.environ.get("KAKEYA_KDBG"))
+
+
+def _kdbg(hyp: str, msg: str, **data: Any) -> None:
+    """Emit one NDJSON probe line to stderr (prefix ``KDBG ``) and, best-effort,
+    to /opt/cursor/logs/debug.log. No-op unless ``KAKEYA_KDBG`` is set, so
+    production behaviour is unchanged."""
+    if not _KDBG:
+        return
+    rec = {"hypothesisId": hyp, "location": "fused_specdecode.py",
+           "message": msg, "data": data}
+    try:
+        _kdbg_sys.stderr.write("KDBG " + _kdbg_json.dumps(rec, ensure_ascii=False) + "\n")
+        _kdbg_sys.stderr.flush()
+    except Exception:
+        pass
+    try:
+        with open("/opt/cursor/logs/debug.log", "a") as _f:
+            _f.write(_kdbg_json.dumps(rec) + "\n")
+    except Exception:
+        pass
+
+
+def _kdbg_cycle(ids: Sequence[int], window: int = 80) -> Tuple[float, int]:
+    """Short-unit cycle metric on the tail of ``ids``: returns
+    ``(cyc_frac, cyc_p)`` where ``cyc_p`` is the period (1..window//3) whose
+    back-to-back repetition covers the largest fraction ``cyc_frac`` of the
+    trailing ``window`` tokens. ~1.0 => the tail is a tight repeating loop."""
+    w = list(ids[-window:])
+    n = len(w)
+    if n < 6:
+        return 0.0, 0
+    best_frac, best_p = 0.0, 0
+    for p in range(1, n // 3 + 1):
+        run, i = 0, n - 1
+        while i - p >= 0 and w[i] == w[i - p]:
+            run += 1
+            i -= 1
+        if run > 0:
+            frac = (run + p) / n
+            if frac > best_frac:
+                best_frac, best_p = frac, p
+    return round(best_frac, 3), best_p
+
+
+def _kdbg_cache_offsets(cache: Any) -> Tuple[Optional[int], Optional[int]]:
+    """(first full-attn KVCache offset, first sliding RotatingKVCache offset)."""
+    off_full = off_rot = None
+    for c in (cache or []):
+        nm = type(c).__name__
+        if off_rot is None and "Rotating" in nm:
+            off_rot = int(getattr(c, "offset", -1))
+        elif off_full is None and "Rotating" not in nm:
+            off_full = int(getattr(c, "offset", -1))
+    return off_full, off_rot
+# endregion
+
 
 # --------------------------------------------------------------------------- #
 # Component A: capture verifier aux-layer hidden states (no transformers
@@ -772,6 +834,22 @@ def fused_specdecode_generate(
                 commit = candidate[:accepted] + [correction]
             generated += commit
             accepts.append(accepted)
+            # region agent log (fused-codegen-degeneration-2815 probe)
+            if _KDBG:
+                off_full, off_rot = _kdbg_cache_offsets(getattr(adapter, "_cache", None))
+                cyc_frac, cyc_p = _kdbg_cycle(generated)
+                # H-D: cache.offset must track committed length (past_len).
+                # off_rot lags by the sliding window (bounded), off_full == past_len.
+                _kdbg("AD", "block",
+                      blk=len(accepts) - 1, base=cstart,
+                      past_len=int(adapter._past_len), gen=len(generated),
+                      off_full=off_full, off_rot=off_rot,
+                      bonus=int(bonus), cand=[int(x) for x in candidate],
+                      n_cand=len(candidate), accepted=int(accepted),
+                      commit=[int(x) for x in commit],
+                      next_argmax=int(argmax_fn(adapter.next_token_logits)),
+                      cyc_frac=cyc_frac, cyc_p=cyc_p)
+            # endregion
             if any(t in eos for t in commit):
                 break
             if (allow_greedy_fallback and len(accepts) >= 2
diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 8b67da7..1338d47 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -771,6 +771,33 @@ def _harness_preset(
             params={"max_new_tokens": ("int:max_new_tokens", "64")},
             validate_reports=True,  # §4 liveness gate on-device
         ),
+        Preset(
+            name="mlx-kakeya-codegen-degen-probe",
+            description="DEBUG: full f_θ fused engine on a CODE prompt (write PoW "
+                        "in C) that triggers an early high-acceptance markdown-"
+                        "marker loop (**/.2/* wall), with KAKEYA_KDBG per-block "
+                        "logging + native-greedy control (--chat-native-ref). The "
+                        "decisive signal is fused-vs-native divergence: if native "
+                        "also loops, it is greedy pathology the engine must guard.",
+            command_templates=(
+                (
+                    "env", "KAKEYA_KDBG=1",
+                    "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
+                    "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}",
+                    "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}",
+                    "--f-theta-dir", "${ENV:KAKEYA_MAC_FTHETA_DIR}",
+                    "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta",
+                    "--sink-size", "4", "--window-size", "64", "--block-size", "4",
+                    "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
+                    "--chat", "--chat-native-ref",
+                    "--chat-scripted", "实现一个PoW的代码，用c语言完成",
+                    "--output", "results/research/codegen_degen_2815_chat.json",
+                ),
+            ),
+            timeout_minutes=90,
+            params={"max_new_tokens": ("int:max_new_tokens", "800")},
+            validate_reports=False,
+        ),
         Preset(
             name="mlx-kakeya-degen-probe",
             description="Long-decode regression probe: full f_θ fused engine on a "
diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py
index ecde21d..a628eb2 100644
--- a/scripts/research/k3_integrated_niah_eval_mac.py
+++ b/scripts/research/k3_integrated_niah_eval_mac.py
@@ -851,6 +851,44 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                     res["native_ref_tokens"] = len(nref_tokens)
                 res["resident_kv_bytes"] = int(
                     sum(int(getattr(c, "nbytes", 0)) for c in (adapter._cache or [])))
+                # region agent log (fused-codegen-degeneration-2815 probe)
+                import os as _kos
+                if _kos.environ.get("KAKEYA_KDBG"):
+                    ftoks = [int(t) for t in res.get("tokens", [])]
+                    ntoks = [int(t) for t in nref_tokens]
+                    div = None
+                    for j, (a, b) in enumerate(zip(ftoks, ntoks)):
+                        if a != b:
+                            div = j
+                            break
+                    if div is None:
+                        div = min(len(ftoks), len(ntoks))
+
+                    def _dec(seq):
+                        try:
+                            return tokenizer.decode(seq, skip_special_tokens=True)
+                        except TypeError:
+                            return tokenizer.decode(seq)
+                    rec = {
+                        "hypothesisId": "AC",
+                        "message": "turn_compare_fused_vs_native",
+                        "data": {
+                            "fused_n": len(ftoks), "native_n": len(ntoks),
+                            "first_divergence_idx": div,
+                            "fused_div_ctx": ftoks[max(0, div - 8):div + 16],
+                            "native_div_ctx": ntoks[max(0, div - 8):div + 16],
+                            "fused_div_text": _dec(ftoks[max(0, div - 8):div + 16]),
+                            "native_div_text": _dec(ntoks[max(0, div - 8):div + 16]),
+                            "fused_tail": ftoks[-48:],
+                            "native_tail": ntoks[-48:],
+                            "fused_tail_text": _dec(ftoks[-48:]),
+                            "native_tail_text": _dec(ntoks[-48:]),
+                        },
+                    }
+                    sys.stderr.write(
+                        "KDBG " + json.dumps(rec, ensure_ascii=False) + "\n")
+                    sys.stderr.flush()
+                # endregion
                 return res
 
             print(f"[chat] FULL fused engine: verifier={args.verifier_path} "
diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py
index 31ce0ec..cce242d 100644
--- a/tests/inference_engine/bridge/test_manifest.py
+++ b/tests/inference_engine/bridge/test_manifest.py
@@ -81,6 +81,7 @@ def test_allowlist_contains_exactly_the_documented_presets():
         "mlx-batched-pad-decode",
         "mlx-env-probe",
         "mlx-kakeya-chat-smoke",
+        "mlx-kakeya-codegen-degen-probe",
         "mlx-kakeya-degen-probe",
         "mlx-kakeya-fused-chat-ftheta",
         "mlx-kakeya-fused-chat-smoke",

From f636370349c82255d9a3d8debfd23e21424b69b7 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 16:21:12 +0000
Subject: [PATCH 03/12] debug(mlx-fused): add multi-turn prefill-state probe
 (ring-wrap-at-prefill) + multi-turn degen preset

KAKEYA_KDBG-gated prefill_state_fused / prefill_state_native records in
_run_fused_chat: per-turn prompt_len, evicted_count, rot/full cache offsets,
any_wrapped, would_wrap_block0, plus a turn index on turn_compare. Repoints
mlx-kakeya-codegen-degen-probe to the multi-turn repro (turn-1 PoW explanation
pushes the turn-2 code prompt's prefill past the sliding window) at 1200 tok,
with the preset timeout raised from 90 to 120 minutes.
Instrumentation only; reverted after fix.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/manifest.py           | 27 ++++---
 .../research/k3_integrated_niah_eval_mac.py   | 75 +++++++++++++++++++
 2 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 1338d47..9e442d2 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -773,12 +773,20 @@ def _harness_preset(
         ),
         Preset(
             name="mlx-kakeya-codegen-degen-probe",
-            description="DEBUG: full f_θ fused engine on a CODE prompt (write PoW "
-                        "in C) that triggers an early high-acceptance markdown-"
-                        "marker loop (**/.2/* wall), with KAKEYA_KDBG per-block "
-                        "logging + native-greedy control (--chat-native-ref). The "
-                        "decisive signal is fused-vs-native divergence: if native "
-                        "also loops, it is greedy pathology the engine must guard.",
+            description="DEBUG: full f_θ fused engine on a MULTI-TURN chat whose "
+                        "turn-1 PoW explanation makes the turn-2 code prompt's "
+                        "prefill exceed the sliding window / native RotatingKVCache "
+                        "(ring pre-wrapped before decode). Single-turn ruled OUT an "
+                        "engine bug (token-identical to native); this probe targets "
+                        "the long-prompt prefill regime. KAKEYA_KDBG logs per-turn "
+                        "prefill state (prompt_len, evicted_count, rot/full cache "
+                        "offsets, any_wrapped, would_wrap_block0) + per-block offsets "
+                        "+ a turn_compare_fused_vs_native record (first_divergence_idx "
+                        "+ tails). Native-greedy control (--chat-native-ref) decodes "
+                        "the SAME per-turn prompt (history-inclusive) so the decisive "
+                        "signal stays fused-vs-native: native coherent + fused garbled "
+                        "from turn-2 start ⇒ long-prompt prefill corrupts logits "
+                        "(engine); both loop identically ⇒ greedy pathology.",
             command_templates=(
                 (
                     "env", "KAKEYA_KDBG=1",
@@ -790,12 +798,13 @@ def _harness_preset(
                     "--sink-size", "4", "--window-size", "64", "--block-size", "4",
                     "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
                     "--chat", "--chat-native-ref",
-                    "--chat-scripted", "实现一个PoW的代码，用c语言完成",
+                    "--chat-scripted",
+                    "请详细解释POW的工作原理||实现一个PoW的代码，用c语言完成",
                     "--output", "results/research/codegen_degen_2815_chat.json",
                 ),
             ),
-            timeout_minutes=90,
-            params={"max_new_tokens": ("int:max_new_tokens", "800")},
+            timeout_minutes=120,
+            params={"max_new_tokens": ("int:max_new_tokens", "1200")},
             validate_reports=False,
         ),
         Preset(
diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py
index a628eb2..646c8c1 100644
--- a/scripts/research/k3_integrated_niah_eval_mac.py
+++ b/scripts/research/k3_integrated_niah_eval_mac.py
@@ -221,6 +221,7 @@ def main() -> int:
         MLXRestoredIncrementalVerifier, capture_aux_hidden,
         make_bridge_embed_lm_head, fused_specdecode_generate,
         fused_specdecode_generate_mlx, fused_specdecode_generate_mlx_trim,
+        _sliding_ring_would_wrap,  # region agent log (fused-codegen-degeneration-2815)
     )
     from inference_engine.v04.kv_compressor import make_default_compressor
     from inference_engine.bench.k3_report_gate import (
@@ -769,6 +770,54 @@ def _encode_chat(history: List[Dict[str, str]]) -> List[int]:
                         history, add_generation_prompt=True)
                 return list(cids.tolist() if hasattr(cids, "tolist") else cids)
 
+            # region agent log (fused-codegen-degeneration-2815 prefill probe)
+            import os as _kos_chat
+            _KDBG_CHAT = bool(_kos_chat.environ.get("KAKEYA_KDBG"))
+
+            def _kdbg_emit(rec: Dict[str, Any]) -> None:
+                try:
+                    sys.stderr.write("KDBG " + json.dumps(rec, ensure_ascii=False) + "\n")
+                    sys.stderr.flush()
+                except Exception:
+                    pass
+                try:
+                    with open("/opt/cursor/logs/debug.log", "a") as _f:
+                        _f.write(json.dumps(rec) + "\n")
+                except Exception:
+                    pass
+
+            def _kdbg_cache_summary(cache: Any) -> Dict[str, Any]:
+                """rot/full offset+max_size rollup + wrap/trimmable flags. The
+                decisive prefill signal: is the sliding RotatingKVCache already
+                wrapped (off>=max_size) BEFORE decode starts, and does full-attn
+                off == prompt_len? A pre-wrapped ring at prefill means the very
+                first speculative block's trim is refused (offset desync)."""
+                rot_off = rot_ms = full_off = None
+                any_wrapped = False
+                all_trimmable = True
+                n = 0
+                for c in (cache or []):
+                    n += 1
+                    nm = type(c).__name__
+                    off = int(getattr(c, "offset", -1))
+                    ms = getattr(c, "max_size", None)
+                    ms = int(ms) if ms is not None else None
+                    is_rot = "Rotating" in nm
+                    if is_rot and ms is not None and off >= ms:
+                        any_wrapped = True
+                    trim_fn = getattr(c, "is_trimmable", None)
+                    trim = bool(trim_fn()) if callable(trim_fn) else None
+                    if trim is False:
+                        all_trimmable = False
+                    if is_rot and rot_off is None:
+                        rot_off, rot_ms = off, ms
+                    if (not is_rot) and full_off is None:
+                        full_off = off
+                return {"n_layers": n, "rot_off": rot_off, "rot_ms": rot_ms,
+                        "full_off": full_off, "any_wrapped": any_wrapped,
+                        "all_trimmable": all_trimmable}
+            # endregion
+
             def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                 # Opt-in A/B control (--chat-native-ref): a plain NATIVE greedy
                 # AR decode of the SAME prompt for --max-new-tokens. Captured as
@@ -779,6 +828,14 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                 nref_tokens: List[int] = []
                 if args.chat_native_ref:
                     nref_cache, nref_logits = native_prefill(list(pid))
+                    # region agent log (fused-codegen-degeneration-2815 prefill probe)
+                    if _KDBG_CHAT:
+                        _turn = sum(1 for h in history if h.get("role") == "user")
+                        _kdbg_emit({"hypothesisId": "AE",
+                                    "message": "prefill_state_native",
+                                    "data": {"turn": _turn, "prompt_len": len(pid),
+                                             "cache": _kdbg_cache_summary(nref_cache)}})
+                    # endregion
                     while len(nref_tokens) < args.max_new_tokens:
                         tok = int(mx.argmax(nref_logits).item())
                         nref_tokens.append(tok)
@@ -809,6 +866,23 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                     restored_v_per_layer=_pad(rv, tsrc, T),
                     evicted_positions=evicted,
                     prefill_chunk_size=args.prefill_chunk_size, full_kv=args.cuda_trim)
+                # region agent log (fused-codegen-degeneration-2815 prefill probe)
+                if _KDBG_CHAT:
+                    _turn = sum(1 for h in history if h.get("role") == "user")
+                    _kdbg_emit({"hypothesisId": "AE",
+                                "message": "prefill_state_fused",
+                                "data": {"turn": _turn, "prompt_len": T,
+                                         "evicted_count": len(evicted),
+                                         "block_size": int(args.block_size),
+                                         "would_wrap_block0": bool(
+                                             _sliding_ring_would_wrap(
+                                                 getattr(adapter, "_cache", None),
+                                                 int(args.block_size))),
+                                         "past_len": int(adapter._past_len),
+                                         "f_theta_ran": bool(f_theta_ran),
+                                         "cache": _kdbg_cache_summary(
+                                             getattr(adapter, "_cache", None))}})
+                # endregion
                 t0 = time.perf_counter()
                 if mlx_drafter is not None and args.cuda_trim:
                     res = fused_specdecode_generate_mlx_trim(
@@ -873,6 +947,7 @@ def _dec(seq):
                         "hypothesisId": "AC",
                         "message": "turn_compare_fused_vs_native",
                         "data": {
+                            "turn": sum(1 for h in history if h.get("role") == "user"),
                             "fused_n": len(ftoks), "native_n": len(ntoks),
                             "first_divergence_idx": div,
                             "fused_div_ctx": ftoks[max(0, div - 8):div + 16],

From 12fda6009c6da5095039ac6208d6bdf5047a1ddb Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 17:01:30 +0000
Subject: [PATCH 04/12] debug(probe): light single-turn long-prompt repro (ring
 pre-wrapped at prefill)

Multi-turn+native at 1200x2 OOM'd the Mac runner. Per debug analysis, the
cheapest test of H-C' (long-prompt prefill corrupts logits) vs H-A' (bounded-
greedy pathology) is a single-turn LONG prompt that wraps the ring AT prefill
(would_wrap_block0) with a tiny 192-tok budget. Add --chat-scripted-file so the
~2k-char context is a committed fixture (pow_codegen_longprompt.txt) instead of
a giant manifest argv; repoint mlx-kakeya-codegen-degen-probe to it.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/manifest.py           | 37 ++++----
 .../research/k3_integrated_niah_eval_mac.py   | 13 ++-
 scripts/research/pow_codegen_longprompt.txt   | 85 +++++++++++++++++++
 3 files changed, 114 insertions(+), 21 deletions(-)
 create mode 100644 scripts/research/pow_codegen_longprompt.txt

diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 9e442d2..05237d2 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -773,20 +773,19 @@ def _harness_preset(
         ),
         Preset(
             name="mlx-kakeya-codegen-degen-probe",
-            description="DEBUG: full f_θ fused engine on a MULTI-TURN chat whose "
-                        "turn-1 PoW explanation makes the turn-2 code prompt's "
-                        "prefill exceed the sliding window / native RotatingKVCache "
-                        "(ring pre-wrapped before decode). Single-turn ruled OUT an "
-                        "engine bug (token-identical to native); this probe targets "
-                        "the long-prompt prefill regime. KAKEYA_KDBG logs per-turn "
-                        "prefill state (prompt_len, evicted_count, rot/full cache "
-                        "offsets, any_wrapped, would_wrap_block0) + per-block offsets "
-                        "+ a turn_compare_fused_vs_native record (first_divergence_idx "
-                        "+ tails). Native-greedy control (--chat-native-ref) decodes "
-                        "the SAME per-turn prompt (history-inclusive) so the decisive "
-                        "signal stays fused-vs-native: native coherent + fused garbled "
-                        "from turn-2 start ⇒ long-prompt prefill corrupts logits "
-                        "(engine); both loop identically ⇒ greedy pathology.",
+            description="DEBUG: full f_θ fused engine on a LONG single-turn prompt "
+                        "(~2k-char PoW explanation + a 'write C code' request, from "
+                        "the committed fixture pow_codegen_longprompt.txt) so the "
+                        "native RotatingKVCache ring is ALREADY WRAPPED at prefill "
+                        "(would_wrap_block0). Short single-turn prompts were proven "
+                        "token-identical to native & coherent; this isolates the "
+                        "long-prompt-prefill regime cheaply (tiny 192-tok budget). "
+                        "KAKEYA_KDBG logs prefill state (prompt_len, any_wrapped, "
+                        "would_wrap_block0, rot/full offsets) + per-block offsets + "
+                        "turn_compare_fused_vs_native. Native-greedy control "
+                        "(--chat-native-ref): native coherent + fused garbled ⇒ "
+                        "long-prompt prefill corrupts logits (engine bug); both "
+                        "degenerate ⇒ bounded-greedy pathology the engine must guard.",
             command_templates=(
                 (
                     "env", "KAKEYA_KDBG=1",
@@ -798,13 +797,13 @@ def _harness_preset(
                     "--sink-size", "4", "--window-size", "64", "--block-size", "4",
                     "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
                     "--chat", "--chat-native-ref",
-                    "--chat-scripted",
-                    "请详细解释POW的工作原理||实现一个PoW的代码，用c语言完成",
-                    "--output", "results/research/codegen_degen_2815_chat.json",
+                    "--chat-scripted-file",
+                    "scripts/research/pow_codegen_longprompt.txt",
+                    "--output", "results/research/codegen_degen_2815_longprompt.json",
                 ),
             ),
-            timeout_minutes=120,
-            params={"max_new_tokens": ("int:max_new_tokens", "1200")},
+            timeout_minutes=90,
+            params={"max_new_tokens": ("int:max_new_tokens", "192")},
             validate_reports=False,
         ),
         Preset(
diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py
index 646c8c1..ed354be 100644
--- a/scripts/research/k3_integrated_niah_eval_mac.py
+++ b/scripts/research/k3_integrated_niah_eval_mac.py
@@ -180,6 +180,11 @@ def parse_args() -> argparse.Namespace:
     ap.add_argument("--chat-scripted", default=None,
                     help="Non-interactive chat: '||'-separated user turns "
                          "(for Mac-bridge verification); writes a transcript.")
+    ap.add_argument("--chat-scripted-file", default=None,
+                    help="Like --chat-scripted but reads the (possibly long, "
+                         "'||'-separated) scripted prompt from a UTF-8 file. Lets "
+                         "a long context be a committed fixture instead of a giant "
+                         "manifest argv. Overrides --chat-scripted when set.")
     ap.add_argument("--chat-native-ref", action="store_true",
                     help="DIAGNOSTIC opt-in: before each chat turn, also run a "
                          "plain NATIVE greedy AR decode of the SAME prompt for "
@@ -973,8 +978,12 @@ def _dec(seq):
                   file=sys.stderr, flush=True)
 
             history: List[Dict[str, str]] = []
-            if args.chat_scripted is not None:
-                turns = [t for t in args.chat_scripted.split("||") if t.strip()]
+            scripted = args.chat_scripted
+            if args.chat_scripted_file is not None:
+                with open(args.chat_scripted_file, encoding="utf-8") as _f:
+                    scripted = _f.read()
+            if scripted is not None:
+                turns = [t for t in scripted.split("||") if t.strip()]
                 transcript = []
                 for u in turns:
                     history.append({"role": "user", "content": u})
diff --git a/scripts/research/pow_codegen_longprompt.txt b/scripts/research/pow_codegen_longprompt.txt
new file mode 100644
index 0000000..fa1ae08
--- /dev/null
+++ b/scripts/research/pow_codegen_longprompt.txt
@@ -0,0 +1,85 @@
+**PoW (Proof of Work，工作量证明)** 是区块链技术中最核心的共识机制之一。它的核心目的是：**在没有中心化机构（如银行）的情况下，让分布在世界各地的计算机能够达成一致，决定谁有权记账，并防止有人通过伪造数据来欺骗网络。**
+
+为了深入理解，我们可以从“核心逻辑”、“工作流程”、“数学原理”和“经济博弈”四个维度来详细解释。
+
+---
+
+### 1. 核心逻辑：用“算力”换取“信任”
+
+在去中心化网络中，大家面临一个问题：**如果每个人都说自己记了一笔账，听谁的？**
+
+PoW 的逻辑是：**谁付出了巨大的计算资源（工作量），谁就有权提议下一块账本。**
+*   **工作量（Work）**：指消耗的计算资源和时间。
+*   **证明（Proof）**：当计算完成后，结果是公开且易于验证的。
+
+这种机制确保了：**作恶的成本极高，而维护网络的收益（奖励）很诱人。**
+
+---
+
+### 2. 工作流程：从“挖矿”到“验证”
+
+假设我们正在运行一个像比特币这样的 PoW 网络，流程如下：
+
+#### 第一步：收集交易
+矿工（节点）从网络中收集待处理的交易，并将它们打包成一个“候选区块”。
+
+#### 第二步：寻找“随机数”（核心环节）
+为了让区块生效，矿工必须解决一个数学难题。这个难题通常是：
+> “找到一个数字（称为 **Nonce**），使得：**区块头部数据的哈希值（Hash） < 目标难度值（Target）**。”
+
+*   **哈希函数（Hash Function）**：像是一个“数字粉碎机”。你输入任何内容，它都会输出一串固定长度的乱码。只要输入变一点点，输出就会天差地别。
+*   **不可逆性**：你无法通过结果反推输入。
+
+#### 第三步：竞争与“挖矿”
+矿工们开始疯狂尝试。他们不断改变区块里的 `Nonce` 值，重新计算哈希值。
+*   矿工 A 尝试：`Hash(数据 + 1) = 0xabc...` (不符合要求)
+*   矿工 B 尝试：`Hash(数据 + 2) = 0xdef...` (不符合要求)
+*   ...
+*   矿工 C 运气好/算力强：`Hash(数据 + 999) = 0x000...` (**符合要求！**)
+
+一旦有人找到了满足条件的哈希值，他就“挖到了矿”。
+
+#### 第四步：广播与验证
+获胜的矿工将新区块广播给全网。其他节点收到后，只需进行一次简单的计算（把 Nonce 代入哈希函数），发现结果确实符合要求，就会接受这个区块，并更新自己的账本。
+
+---
+
+### 3. 数学原理：为什么它是安全的？
+
+#### A. 难度调整机制 (Difficulty Adjustment)
+如果每个人都买更快的显卡，挖矿速度会变快，导致区块产生太频繁。
+为了保持稳定的出块时间（例如比特币约10分钟一个块），系统会**自动调整难度**：
+*   如果出块太快 $\rightarrow$ 提高目标值（让要求的哈绪值前导零更多，变得更难）。
+*   如果出块太慢 $\rightarrow$ 降低难度。
+
+#### B. 概率与公平性
+由于哈希值是随机分布的，寻找答案的过程就像“在茫茫大海里捞针”。
+*   **算力越高**，意味着你每秒尝试的次数越多，捞到针的概率就越大。
+*   这保证了在宏观上，算力占比与奖励分配是公平的。
+
+#### C. 抵抗“双花”与篡改
+如果有人想修改 10 分钟前的一个交易，他必须重新计算那个区块的哈希，以及**之后所有区块**的哈希。
+由于后面的区块都包含了前一个区块的哈希值（形成链式结构），修改历史意味着要重算后面所有的工作量。除非攻击者的算力超过全网总和的 51%，否则这在经济上是不可能的。
+
+---
+
+### 4. 总结：PoW 的优缺点
+
+#### 优点：
+1.  **极高的安全性**：通过物理世界的能源消耗（电力）为数字世界筑起防线。
+2.  **去中心化**：任何人只要有设备就可以加入，不需要许可。
+3.  **公平性**：基于数学概率，不存在人为干预。
+
+#### 缺点：
+1.  **能源消耗大**：为了维持安全，全球范围内的矿机都在消耗大量电力（这是最受争议的一点）。
+2.  **扩展性差（TPS低）**：为了保证全球同步，出块速度不能太快，导致处理交易的速度受限。
+3.  **算力集中风险**：如果出现大规模矿池，可能导致权力向少数人手中集中。
+
+### 通俗比喻总结
+想象一个**“数字猜数字游戏”**：
+全网的人都在玩一个游戏，目标是猜出一个符合特定规则的数字。
+*   **挖矿**就是不停地尝试不同的数字。
+*   **难度**就是规则越来越严苛（比如要求数字必须以 10 个零开头）。
+*   **验证**就是别人看到你猜中了，只需要看一眼你的数字是否符合规则，瞬间就能确认你没撒谎。
+
+基于以上说明，用C语言写一个简短的PoW示例（只写核心循环，不超过30行）。
\ No newline at end of file

From c6a699c98d48e38e7a04754c752b1e60519f3791 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 17:30:25 +0000
Subject: [PATCH 05/12] fix(mlx-fused): runaway-loop guard stops greedy
 markdown-marker collapse
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Repro evidence: single-turn fused decode is TOKEN-IDENTICAL to native greedy
(first_divergence_idx=None) and coherent through 1200 tokens, so the engine is
faithful — the user's '由于...'/'**/.2/*' collapse is a greedy-decoding
pathology on code/markdown-heavy prompts, for which the fused path (pure
argmax, unlike chat_mlx_kakeya.py) had no mitigation. Once a loop starts, the
drafter trivially predicts the repeats and the greedy verifier accepts them
(high accept_len), so the wall of repeats keeps growing until the token budget
is exhausted.

Fix: _trailing_runaway_drop detects a 1..8-token unit repeated >=12x back-to-back
at the tail (thresholds chosen conservatively so that legit lists/enumerations/
code are not trimmed). When it fires, the three fused loops stop generation and
keep only 3 repetitions of the unit (keep_reps=3) instead of emitting an
unbounded wall. Default ON (stop_on_runaway=True); --fused-no-loop-guard
disables it for degeneration probes. Adds stopped_on_runaway to the result.

Also: --chat-scripted-file (long prompt as committed fixture) + repoint the
codegen-degen probe to a single-turn long prompt that wraps the ring at prefill
(cheap; the multi-turn+native variant OOM'd the Mac runner). KAKEYA_KDBG probe
instrumentation retained (inert unless the env var is set) for the pending
on-device H-C'-vs-H-A' confirmation.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 .../backends/mlx/fused_specdecode.py          | 72 +++++++++++++++++++
 inference_engine/bridge/manifest.py           |  2 +-
 .../research/k3_integrated_niah_eval_mac.py   | 13 +++-
 tests/backends/mlx/test_fused_specdecode.py   | 62 ++++++++++++++++
 4 files changed, 145 insertions(+), 4 deletions(-)

diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py
index 3f35f02..9fafdd7 100644
--- a/inference_engine/backends/mlx/fused_specdecode.py
+++ b/inference_engine/backends/mlx/fused_specdecode.py
@@ -436,6 +436,7 @@ def fused_specdecode_generate_mlx_trim(
     block_size: int,
     eos_ids: Sequence[int] = (),
     single_fused: bool = False,
+    stop_on_runaway: bool = True,
 ) -> Dict[str, Any]:
     """CUDA-parity fused spec decode: KEEP accepted K/V, TRIM only the rejected
     tail (no rollback, no carry re-forward). Requires the adapter to be
@@ -461,6 +462,7 @@ def fused_specdecode_generate_mlx_trim(
     generated: List[int] = []
     accepts: List[int] = []
     block_evals: List[float] = []
+    stopped_on_runaway = False
     ctx_len = C
     try:
         while len(generated) < gen_tokens:
@@ -522,6 +524,12 @@ def fused_specdecode_generate_mlx_trim(
                 timing["extend_s"] += time.perf_counter() - t_extend
             if any(t in eos for t in commit):
                 break
+            if stop_on_runaway:
+                drop = _trailing_runaway_drop(generated)
+                if drop > 0:
+                    del generated[len(generated) - drop:]
+                    stopped_on_runaway = True
+                    break
     finally:
         adapter._capture_aux = False
     generated = generated[:gen_tokens]
@@ -531,6 +539,7 @@ def fused_specdecode_generate_mlx_trim(
         "mean_accept_len": (round(sum(accepts) / len(accepts), 3)
                             if accepts else 0.0),
         "decode_tokens": len(generated),
+        "stopped_on_runaway": stopped_on_runaway,
         "loop": ("mlx_trim_single_fused_probe" if single_fused
                  else "mlx_trim_keep_accepted_cuda_parity"),
         "single_fused": bool(single_fused),
@@ -552,6 +561,7 @@ def fused_specdecode_generate_mlx(
     gen_tokens: int,
     block_size: int,
     eos_ids: Sequence[int] = (),
+    stop_on_runaway: bool = True,
 ) -> Dict[str, Any]:
     """All-MLX fused spec decode with ONE host sync per block.
 
@@ -593,6 +603,7 @@ def fused_specdecode_generate_mlx(
 
     generated: List[int] = []
     accepts: List[int] = []
+    stopped_on_runaway = False
     # Rollback-carry state: rejected blocks roll the WHOLE forward back
     # (rollback_block — see its docstring for why trim is unsound on the
     # wrapped sliding ring) and carry the stream-committed-but-not-cached
@@ -676,6 +687,12 @@ def fused_specdecode_generate_mlx(
                 timing["extend_s"] += time.perf_counter() - t_extend
             if any(t in eos for t in commit):
                 break
+            if stop_on_runaway:
+                drop = _trailing_runaway_drop(generated)
+                if drop > 0:
+                    del generated[len(generated) - drop:]
+                    stopped_on_runaway = True
+                    break
     finally:
         adapter._capture_aux = False
     generated = generated[:gen_tokens]
@@ -685,6 +702,7 @@ def fused_specdecode_generate_mlx(
         "mean_accept_len": (round(sum(accepts) / len(accepts), 3)
                             if accepts else 0.0),
         "decode_tokens": len(generated),
+        "stopped_on_runaway": stopped_on_runaway,
         "loop": "mlx_rollback_carry_v3",
         "time_breakdown_s": {k: round(v, 3) for k, v in timing.items()},
     }
@@ -717,6 +735,40 @@ def _sliding_ring_would_wrap(cache: Any, n_new: int) -> bool:
     return False
 
 
+def _trailing_runaway_drop(
+    ids: Sequence[int],
+    *,
+    max_period: int = 8,
+    min_reps: int = 12,
+    keep_reps: int = 3,
+) -> int:
+    """Return how many TRAILING tokens to drop if ``ids`` ends in a runaway
+    short-period loop, else 0.
+
+    A runaway loop is a unit of ``1..max_period`` tokens repeated ``>= min_reps``
+    times back-to-back at the tail (e.g. the ``**``/``.2``/``*`` markdown-marker
+    collapse greedy decoding falls into on code prompts). When found, we keep
+    ``keep_reps`` instances and drop the rest, so callers can stop generation
+    with a clean tail instead of emitting an unbounded wall of repeats.
+
+    Deliberately CONSERVATIVE (>= 12 back-to-back repeats of a <= 8-token unit)
+    so legitimately repetitive text — numbered lists, ``矿工 A/B/C`` enumerations,
+    structured code — is never trimmed. Returns 0 when no runaway is present."""
+    n = len(ids)
+    for p in range(1, max_period + 1):
+        if n < p * min_reps:
+            continue
+        unit = list(ids[n - p:])
+        reps = 0
+        i = n
+        while i - p >= 0 and list(ids[i - p:i]) == unit:
+            reps += 1
+            i -= p
+        if reps >= min_reps:
+            return max((reps - keep_reps) * p, 0)
+    return 0
+
+
 # --------------------------------------------------------------------------- #
 # The fused spec-decode loop (control flow; MLX/torch ops via injected fns).
 # --------------------------------------------------------------------------- #
@@ -734,6 +786,7 @@ def fused_specdecode_generate(
     arange_fn: Callable[[int, int], Any],
     cat_aux_fn: Callable[[Sequence[Any]], Any],
     allow_greedy_fallback: bool = True,
+    stop_on_runaway: bool = True,
 ) -> Dict[str, Any]:
     """Run the fused engine. ``adapter`` must already be prefilled. Per block:
     draft from the cached drafter context (B), verify+capture-aux incrementally
@@ -762,6 +815,7 @@ def fused_specdecode_generate(
     generated: List[int] = []
     accepts: List[int] = []
     fallback_to_greedy = False
+    stopped_on_runaway = False
     try:
         while len(generated) < gen_tokens:
             L = min(block_size, gen_tokens - len(generated))
@@ -852,6 +906,17 @@ def fused_specdecode_generate(
             # endregion
             if any(t in eos for t in commit):
                 break
+            # Greedy decoding can collapse into a runaway short-period loop (e.g.
+            # the **/.2/* markdown-marker wall on code prompts); the drafter then
+            # trivially predicts the repeats and the greedy verifier accepts them,
+            # so acceptance stays HIGH while the output is garbage. Stop on it
+            # instead of emitting an unbounded wall (keeps a short clean tail).
+            if stop_on_runaway:
+                drop = _trailing_runaway_drop(generated)
+                if drop > 0:
+                    del generated[len(generated) - drop:]
+                    stopped_on_runaway = True
+                    break
             if (allow_greedy_fallback and len(accepts) >= 2
                     and (sum(accepts) / len(accepts)) < 1.5):
                 fallback_to_greedy = True
@@ -869,6 +934,12 @@ def fused_specdecode_generate(
                 generated.append(tok)
                 if tok in eos:
                     break
+                if stop_on_runaway:
+                    drop = _trailing_runaway_drop(generated)
+                    if drop > 0:
+                        del generated[len(generated) - drop:]
+                        stopped_on_runaway = True
+                        break
             timing["fallback_greedy_s"] += time.perf_counter() - t_fb
     finally:
         adapter._capture_aux = False
@@ -879,5 +950,6 @@ def fused_specdecode_generate(
         "mean_accept_len": (round(sum(accepts) / len(accepts), 3)
                             if accepts else 0.0),
         "decode_tokens": len(generated),
+        "stopped_on_runaway": stopped_on_runaway,
         "time_breakdown_s": {k: round(v, 3) for k, v in timing.items()},
     }
diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 05237d2..5977c77 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -796,7 +796,7 @@ def _harness_preset(
                     "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta",
                     "--sink-size", "4", "--window-size", "64", "--block-size", "4",
                     "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
-                    "--chat", "--chat-native-ref",
+                    "--chat", "--chat-native-ref", "--fused-no-loop-guard",
                     "--chat-scripted-file",
                     "scripts/research/pow_codegen_longprompt.txt",
                     "--output", "results/research/codegen_degen_2815_longprompt.json",
diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py
index ed354be..345de2b 100644
--- a/scripts/research/k3_integrated_niah_eval_mac.py
+++ b/scripts/research/k3_integrated_niah_eval_mac.py
@@ -185,6 +185,10 @@ def parse_args() -> argparse.Namespace:
                          "'||'-separated) scripted prompt from a UTF-8 file. Lets "
                          "a long context be a committed fixture instead of a giant "
                          "manifest argv. Overrides --chat-scripted when set.")
+    ap.add_argument("--fused-no-loop-guard", action="store_true",
+                    help="DIAGNOSTIC: disable the fused engine's runaway-loop stop "
+                         "(default ON) so a degeneration probe can observe the full "
+                         "collapse. Production chat keeps the guard enabled.")
     ap.add_argument("--chat-native-ref", action="store_true",
                     help="DIAGNOSTIC opt-in: before each chat turn, also run a "
                          "plain NATIVE greedy AR decode of the SAME prompt for "
@@ -889,25 +893,28 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                                              getattr(adapter, "_cache", None))}})
                 # endregion
                 t0 = time.perf_counter()
+                _guard = not args.fused_no_loop_guard
                 if mlx_drafter is not None and args.cuda_trim:
                     res = fused_specdecode_generate_mlx_trim(
                         adapter, active_drafter, aux_prompt=aux_prompt,
                         embed_fn=embed_fn, lm_head_fn=lm_head_fn,
                         gen_tokens=args.max_new_tokens, block_size=args.block_size,
-                        eos_ids=chat_eos, single_fused=args.single_fused)
+                        eos_ids=chat_eos, single_fused=args.single_fused,
+                        stop_on_runaway=_guard)
                 elif mlx_drafter is not None:
                     res = fused_specdecode_generate_mlx(
                         adapter, active_drafter, aux_prompt=aux_prompt,
                         embed_fn=embed_fn, lm_head_fn=lm_head_fn,
                         gen_tokens=args.max_new_tokens, block_size=args.block_size,
-                        eos_ids=chat_eos)
+                        eos_ids=chat_eos, stop_on_runaway=_guard)
                 else:
                     res = fused_specdecode_generate(
                         adapter, active_drafter, aux_prompt=aux_prompt,
                         embed_fn=embed_fn, lm_head_fn=lm_head_fn,
                         gen_tokens=args.max_new_tokens, block_size=args.block_size,
                         eos_ids=chat_eos, argmax_fn=argmax_fn, arange_fn=arange_fn,
-                        cat_aux_fn=cat_aux_fn, allow_greedy_fallback=False)
+                        cat_aux_fn=cat_aux_fn, allow_greedy_fallback=False,
+                        stop_on_runaway=_guard)
                 res["decode_s"] = round(time.perf_counter() - t0, 3)
                 res["f_theta_ran"] = f_theta_ran
                 res["f_theta_layers"] = sorted(rk.keys()) if rk else []
diff --git a/tests/backends/mlx/test_fused_specdecode.py b/tests/backends/mlx/test_fused_specdecode.py
index ddf099b..f9c37a4 100644
--- a/tests/backends/mlx/test_fused_specdecode.py
+++ b/tests/backends/mlx/test_fused_specdecode.py
@@ -170,6 +170,68 @@ def __init__(self, offset):
         self.max_size = None
 
 
+def test_trailing_runaway_drop_detects_and_trims_loops():
+    # 1-token unit repeated 20x -> drop all but keep_reps (default 3).
+    ids = [1, 2, 3] + [9] * 20
+    drop = fsd._trailing_runaway_drop(ids)
+    assert drop == 17                                   # 20 - 3 kept
+    # multi-token unit (period 3) repeated 12x -> drop (12-3)*3 = 27.
+    ids2 = [5, 6] + [7, 8, 9] * 12
+    assert fsd._trailing_runaway_drop(ids2) == 27
+
+
+def test_trailing_runaway_drop_is_conservative():
+    # fewer than min_reps (12) back-to-back -> no trim.
+    assert fsd._trailing_runaway_drop([9] * 11) == 0
+    # legitimate non-repeating tail -> no trim.
+    assert fsd._trailing_runaway_drop(list(range(40))) == 0
+    # a period that does not tile the very tail -> no trim.
+    assert fsd._trailing_runaway_drop([1, 2] * 10 + [3]) == 0
+    # empty / short -> no trim.
+    assert fsd._trailing_runaway_drop([]) == 0
+
+
+def test_fused_loop_stops_on_runaway_repeat():
+    # Drafter keeps proposing the same token; the fake verifier's "+1" truth is
+    # defeated by making the bonus re-loop: we feed a drafter that always drafts
+    # the marker token and a verifier that greedily agrees, so the committed
+    # stream becomes a runaway single-token loop the guard must cut.
+    class _LoopAdapter(_FakeAdapter):
+        def forward_block(self, candidate):
+            # verifier greedily predicts the SAME marker token (42) forever.
+            if self._capture_aux:
+                L = len(candidate)
+                self._last_aux = [torch.zeros(L, self.hidden)]
+            return [42 for _ in candidate]
+
+    adapter = _LoopAdapter(prompt_len=5, first_token=42)
+    drafter = _FakeDrafter(drafts=[[42, 42, 42]] * 60)
+    res = fsd.fused_specdecode_generate(
+        adapter, drafter, gen_tokens=400, block_size=4, eos_ids=(),
+        allow_greedy_fallback=False, **_loop_kwargs(drafter))
+    assert res["stopped_on_runaway"] is True
+    # stopped early with a short clean tail, nowhere near the 400 budget.
+    assert len(res["tokens"]) < 40
+    assert set(res["tokens"]) == {42}
+
+
+def test_fused_loop_runaway_guard_can_be_disabled():
+    class _LoopAdapter(_FakeAdapter):
+        def forward_block(self, candidate):
+            if self._capture_aux:
+                self._last_aux = [torch.zeros(len(candidate), self.hidden)]
+            return [42 for _ in candidate]
+
+    adapter = _LoopAdapter(prompt_len=5, first_token=42)
+    drafter = _FakeDrafter(drafts=[[42, 42, 42]] * 200)
+    res = fsd.fused_specdecode_generate(
+        adapter, drafter, gen_tokens=120, block_size=4, eos_ids=(),
+        allow_greedy_fallback=False, stop_on_runaway=False,
+        **_loop_kwargs(drafter))
+    assert res["stopped_on_runaway"] is False
+    assert len(res["tokens"]) == 120                    # ran to the full budget
+
+
 def test_sliding_ring_would_wrap_detects_wrap():
     # offset + n_new >= max_size -> the rotating ring becomes non-trimmable.
     cache = [_FakeRotating(offset=1022, max_size=1024)]

From d10aac96bd44406ba999c122cf2aabbbd5a57cc5 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 18 Jun 2026 04:23:54 +0000
Subject: [PATCH 06/12] fix(probe): drop env KAKEYA_KDBG prefix (broke venv
 python3 -> no mlx_lm); add guard-ON validation preset

With the 'env KAKEYA_KDBG=1 python3' prefix, python3 resolved to an interpreter
without mlx_lm on the runner (ModuleNotFoundError) instead of the venv one.
Drop the prefix (KDBG instrumentation is now inert, which
signals in the JSON are sufficient to characterize + validate. Add
mlx-kakeya-codegen-guard-validate (guard ON) to prove the clean stop.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/manifest.py           | 29 ++++++++++++++++++-
 .../inference_engine/bridge/test_manifest.py  |  1 +
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 5977c77..76e9ee1 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -788,7 +788,6 @@ def _harness_preset(
                         "degenerate ⇒ bounded-greedy pathology the engine must guard.",
             command_templates=(
                 (
-                    "env", "KAKEYA_KDBG=1",
                     "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
                     "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}",
                     "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}",
@@ -806,6 +805,34 @@ def _harness_preset(
             params={"max_new_tokens": ("int:max_new_tokens", "192")},
             validate_reports=False,
         ),
+        Preset(
+            name="mlx-kakeya-codegen-guard-validate",
+            description="Validate the runaway-loop guard end-to-end: full f_θ fused "
+                        "engine on the same long code prompt (pow_codegen_longprompt"
+                        ".txt) with the guard ENABLED (default). The fused answer "
+                        "must NOT collapse into a marker wall — the guard stops the "
+                        "runaway (stopped_on_runaway) leaving a clean tail — while "
+                        "the native-greedy control (no guard) degenerates, proving "
+                        "the guard is what saves the engine from greedy pathology.",
+            command_templates=(
+                (
+                    "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
+                    "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}",
+                    "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}",
+                    "--f-theta-dir", "${ENV:KAKEYA_MAC_FTHETA_DIR}",
+                    "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta",
+                    "--sink-size", "4", "--window-size", "64", "--block-size", "4",
+                    "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
+                    "--chat", "--chat-native-ref",
+                    "--chat-scripted-file",
+                    "scripts/research/pow_codegen_longprompt.txt",
+                    "--output", "results/research/codegen_guard_validate_2815.json",
+                ),
+            ),
+            timeout_minutes=90,
+            params={"max_new_tokens": ("int:max_new_tokens", "256")},
+            validate_reports=False,
+        ),
         Preset(
             name="mlx-kakeya-degen-probe",
             description="Long-decode regression probe: full f_θ fused engine on a "
diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py
index cce242d..cfea538 100644
--- a/tests/inference_engine/bridge/test_manifest.py
+++ b/tests/inference_engine/bridge/test_manifest.py
@@ -82,6 +82,7 @@ def test_allowlist_contains_exactly_the_documented_presets():
         "mlx-env-probe",
         "mlx-kakeya-chat-smoke",
         "mlx-kakeya-codegen-degen-probe",
+        "mlx-kakeya-codegen-guard-validate",
         "mlx-kakeya-degen-probe",
         "mlx-kakeya-fused-chat-ftheta",
         "mlx-kakeya-fused-chat-smoke",

From f8a7a9ae517fc6d542572b5dc60c578f46ae3436 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 18 Jun 2026 05:30:52 +0000
Subject: [PATCH 07/12] debug(probe): long single-decode A/B (drop native-ref
 for memory, budget 1100) to reach the ~978-tok collapse onset

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/manifest.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 76e9ee1..0953833 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -795,14 +795,14 @@ def _harness_preset(
                     "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta",
                     "--sink-size", "4", "--window-size", "64", "--block-size", "4",
                     "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
-                    "--chat", "--chat-native-ref", "--fused-no-loop-guard",
+                    "--chat", "--fused-no-loop-guard",
                     "--chat-scripted-file",
                     "scripts/research/pow_codegen_longprompt.txt",
                     "--output", "results/research/codegen_degen_2815_longprompt.json",
                 ),
             ),
-            timeout_minutes=90,
-            params={"max_new_tokens": ("int:max_new_tokens", "192")},
+            timeout_minutes=120,
+            params={"max_new_tokens": ("int:max_new_tokens", "1100")},
             validate_reports=False,
         ),
         Preset(
@@ -823,14 +823,14 @@ def _harness_preset(
                     "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta",
                     "--sink-size", "4", "--window-size", "64", "--block-size", "4",
                     "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
-                    "--chat", "--chat-native-ref",
+                    "--chat",
                     "--chat-scripted-file",
                     "scripts/research/pow_codegen_longprompt.txt",
                     "--output", "results/research/codegen_guard_validate_2815.json",
                 ),
             ),
-            timeout_minutes=90,
-            params={"max_new_tokens": ("int:max_new_tokens", "256")},
+            timeout_minutes=120,
+            params={"max_new_tokens": ("int:max_new_tokens", "1100")},
             validate_reports=False,
         ),
         Preset(

From 85abe81834b584b507eefc9a99830ed9ded1b9e1 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 18 Jun 2026 05:44:50 +0000
Subject: [PATCH 08/12] debug(probe): multi-turn (explanation->code)
 guard-off/on A/B, no native-ref, budget 900 (matches the user's high-accept
 regime)

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/manifest.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 0953833..76794be 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -796,13 +796,13 @@ def _harness_preset(
                     "--sink-size", "4", "--window-size", "64", "--block-size", "4",
                     "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
                     "--chat", "--fused-no-loop-guard",
-                    "--chat-scripted-file",
-                    "scripts/research/pow_codegen_longprompt.txt",
+                    "--chat-scripted",
+                    "请详细解释POW的工作原理||实现一个PoW的代码，用c语言完成",
                     "--output", "results/research/codegen_degen_2815_longprompt.json",
                 ),
             ),
             timeout_minutes=120,
-            params={"max_new_tokens": ("int:max_new_tokens", "1100")},
+            params={"max_new_tokens": ("int:max_new_tokens", "900")},
             validate_reports=False,
         ),
         Preset(
@@ -824,13 +824,13 @@ def _harness_preset(
                     "--sink-size", "4", "--window-size", "64", "--block-size", "4",
                     "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
                     "--chat",
-                    "--chat-scripted-file",
-                    "scripts/research/pow_codegen_longprompt.txt",
+                    "--chat-scripted",
+                    "请详细解释POW的工作原理||实现一个PoW的代码，用c语言完成",
                     "--output", "results/research/codegen_guard_validate_2815.json",
                 ),
             ),
             timeout_minutes=120,
-            params={"max_new_tokens": ("int:max_new_tokens", "1100")},
+            params={"max_new_tokens": ("int:max_new_tokens", "900")},
             validate_reports=False,
         ),
         Preset(

From 772c8dfe571bb4bfdfb85a1d634d3c8eed39e4cf Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 18 Jun 2026 06:17:18 +0000
Subject: [PATCH 09/12] cleanup(mlx-fused): strip inert KDBG probe
 instrumentation; finalize codegen presets

- Remove the KAKEYA_KDBG-gated debug instrumentation (helpers + per-block
  emission + prefill_state/turn_compare) from fused_specdecode.py and
  k3_integrated_niah_eval_mac.py. Investigation complete.
- Keep the production fix (runaway-loop guard) + the --chat-scripted-file /
  --fused-no-loop-guard / --chat-native-ref flags.
- Repoint the two codegen presets to the multi-turn 'explain||code' chat
  (guard-off probe + guard-on validate) and make their descriptions accurate;
  drop the now-unused pow_codegen_longprompt.txt fixture.

On-device (Mac M4): across short/long/multi-turn regimes the engine is coherent
(fused==native); guard-on and guard-off outputs are byte-identical on the
multi-turn code scenario -> the guard is inert on healthy output (no regression)
and the systematic degeneration was already resolved by the wrap fix (#146).

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 .../backends/mlx/fused_specdecode.py          |  79 ------------
 inference_engine/bridge/manifest.py           |  33 +++--
 .../research/k3_integrated_niah_eval_mac.py   | 113 ------------------
 scripts/research/pow_codegen_longprompt.txt   |  85 -------------
 4 files changed, 14 insertions(+), 296 deletions(-)
 delete mode 100644 scripts/research/pow_codegen_longprompt.txt

diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py
index 9fafdd7..3d44d79 100644
--- a/inference_engine/backends/mlx/fused_specdecode.py
+++ b/inference_engine/backends/mlx/fused_specdecode.py
@@ -35,69 +35,6 @@
     restored_prefill_cache,
 )
 
-# region agent log (fused-codegen-degeneration-2815 probe; strip after fix)
-import os as _kdbg_os
-import sys as _kdbg_sys
-import json as _kdbg_json
-
-_KDBG = bool(_kdbg_os.environ.get("KAKEYA_KDBG"))
-
-
-def _kdbg(hyp: str, msg: str, **data: Any) -> None:
-    """Emit one NDJSON probe line to stderr (prefix ``KDBG ``) and, best-effort,
-    to /opt/cursor/logs/debug.log. No-op unless ``KAKEYA_KDBG`` is set, so
-    production behaviour is unchanged."""
-    if not _KDBG:
-        return
-    rec = {"hypothesisId": hyp, "location": "fused_specdecode.py",
-           "message": msg, "data": data}
-    try:
-        _kdbg_sys.stderr.write("KDBG " + _kdbg_json.dumps(rec, ensure_ascii=False) + "\n")
-        _kdbg_sys.stderr.flush()
-    except Exception:
-        pass
-    try:
-        with open("/opt/cursor/logs/debug.log", "a") as _f:
-            _f.write(_kdbg_json.dumps(rec) + "\n")
-    except Exception:
-        pass
-
-
-def _kdbg_cycle(ids: Sequence[int], window: int = 80) -> Tuple[float, int]:
-    """Short-unit cycle metric on the tail of ``ids``: returns
-    ``(cyc_frac, cyc_p)`` where ``cyc_p`` is the period (1..window//3) whose
-    back-to-back repetition covers the largest fraction ``cyc_frac`` of the
-    trailing ``window`` tokens. ~1.0 => the tail is a tight repeating loop."""
-    w = list(ids[-window:])
-    n = len(w)
-    if n < 6:
-        return 0.0, 0
-    best_frac, best_p = 0.0, 0
-    for p in range(1, n // 3 + 1):
-        run, i = 0, n - 1
-        while i - p >= 0 and w[i] == w[i - p]:
-            run += 1
-            i -= 1
-        if run > 0:
-            frac = (run + p) / n
-            if frac > best_frac:
-                best_frac, best_p = frac, p
-    return round(best_frac, 3), best_p
-
-
-def _kdbg_cache_offsets(cache: Any) -> Tuple[Optional[int], Optional[int]]:
-    """(first full-attn KVCache offset, first sliding RotatingKVCache offset)."""
-    off_full = off_rot = None
-    for c in (cache or []):
-        nm = type(c).__name__
-        if off_rot is None and "Rotating" in nm:
-            off_rot = int(getattr(c, "offset", -1))
-        elif off_full is None and "Rotating" not in nm:
-            off_full = int(getattr(c, "offset", -1))
-    return off_full, off_rot
-# endregion
-
-
 # --------------------------------------------------------------------------- #
 # Component A: capture verifier aux-layer hidden states (no transformers
 # `output_hidden_states` on MLX → patch the decoder-layer __call__).
@@ -888,22 +825,6 @@ def fused_specdecode_generate(
                 commit = candidate[:accepted] + [correction]
             generated += commit
             accepts.append(accepted)
-            # region agent log (fused-codegen-degeneration-2815 probe)
-            if _KDBG:
-                off_full, off_rot = _kdbg_cache_offsets(getattr(adapter, "_cache", None))
-                cyc_frac, cyc_p = _kdbg_cycle(generated)
-                # H-D: cache.offset must track committed length (past_len).
-                # off_rot lags by the sliding window (bounded), off_full == past_len.
-                _kdbg("AD", "block",
-                      blk=len(accepts) - 1, base=cstart,
-                      past_len=int(adapter._past_len), gen=len(generated),
-                      off_full=off_full, off_rot=off_rot,
-                      bonus=int(bonus), cand=[int(x) for x in candidate],
-                      n_cand=len(candidate), accepted=int(accepted),
-                      commit=[int(x) for x in commit],
-                      next_argmax=int(argmax_fn(adapter.next_token_logits)),
-                      cyc_frac=cyc_frac, cyc_p=cyc_p)
-            # endregion
             if any(t in eos for t in commit):
                 break
             # Greedy decoding can collapse into a runaway short-period loop (e.g.
diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 76794be..fbcdbb0 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -773,19 +773,13 @@ def _harness_preset(
         ),
         Preset(
             name="mlx-kakeya-codegen-degen-probe",
-            description="DEBUG: full f_θ fused engine on a LONG single-turn prompt "
-                        "(~2k-char PoW explanation + a 'write C code' request, from "
-                        "the committed fixture pow_codegen_longprompt.txt) so the "
-                        "native RotatingKVCache ring is ALREADY WRAPPED at prefill "
-                        "(would_wrap_block0). Short single-turn prompts were proven "
-                        "token-identical to native & coherent; this isolates the "
-                        "long-prompt-prefill regime cheaply (tiny 192-tok budget). "
-                        "KAKEYA_KDBG logs prefill state (prompt_len, any_wrapped, "
-                        "would_wrap_block0, rot/full offsets) + per-block offsets + "
-                        "turn_compare_fused_vs_native. Native-greedy control "
-                        "(--chat-native-ref): native coherent + fused garbled ⇒ "
-                        "long-prompt prefill corrupts logits (engine bug); both "
-                        "degenerate ⇒ bounded-greedy pathology the engine must guard.",
+            description="Regression probe (guard DISABLED): full f_θ fused engine "
+                        "on the multi-turn 'explain PoW || write PoW in C' chat "
+                        "that originally degenerated, with --fused-no-loop-guard so "
+                        "any greedy markdown-marker collapse is observable. Pairs "
+                        "with mlx-kakeya-codegen-guard-validate (guard ENABLED) to "
+                        "show the guard is what keeps the answer clean. On current "
+                        "code (post wrap-fix) both turns stay coherent.",
             command_templates=(
                 (
                     "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
@@ -808,12 +802,13 @@ def _harness_preset(
         Preset(
             name="mlx-kakeya-codegen-guard-validate",
             description="Validate the runaway-loop guard end-to-end: full f_θ fused "
-                        "engine on the same long code prompt (pow_codegen_longprompt"
-                        ".txt) with the guard ENABLED (default). The fused answer "
-                        "must NOT collapse into a marker wall — the guard stops the "
-                        "runaway (stopped_on_runaway) leaving a clean tail — while "
-                        "the native-greedy control (no guard) degenerates, proving "
-                        "the guard is what saves the engine from greedy pathology.",
+                        "engine on the multi-turn 'explain PoW || write PoW in C' "
+                        "chat with the guard ENABLED (production default). The "
+                        "answer must stay coherent and never collapse into a marker "
+                        "wall — if a runaway starts, the guard stops it "
+                        "(stopped_on_runaway) leaving a clean tail. Confirmed "
+                        "coherent on current code; byte-identical to the guard-off "
+                        "probe (the guard is inert on healthy output).",
             command_templates=(
                 (
                     "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py
index 345de2b..a6fc2eb 100644
--- a/scripts/research/k3_integrated_niah_eval_mac.py
+++ b/scripts/research/k3_integrated_niah_eval_mac.py
@@ -230,7 +230,6 @@ def main() -> int:
         MLXRestoredIncrementalVerifier, capture_aux_hidden,
         make_bridge_embed_lm_head, fused_specdecode_generate,
         fused_specdecode_generate_mlx, fused_specdecode_generate_mlx_trim,
-        _sliding_ring_would_wrap,  # region agent log (fused-codegen-degeneration-2815)
     )
     from inference_engine.v04.kv_compressor import make_default_compressor
     from inference_engine.bench.k3_report_gate import (
@@ -779,54 +778,6 @@ def _encode_chat(history: List[Dict[str, str]]) -> List[int]:
                         history, add_generation_prompt=True)
                 return list(cids.tolist() if hasattr(cids, "tolist") else cids)
 
-            # region agent log (fused-codegen-degeneration-2815 prefill probe)
-            import os as _kos_chat
-            _KDBG_CHAT = bool(_kos_chat.environ.get("KAKEYA_KDBG"))
-
-            def _kdbg_emit(rec: Dict[str, Any]) -> None:
-                try:
-                    sys.stderr.write("KDBG " + json.dumps(rec, ensure_ascii=False) + "\n")
-                    sys.stderr.flush()
-                except Exception:
-                    pass
-                try:
-                    with open("/opt/cursor/logs/debug.log", "a") as _f:
-                        _f.write(json.dumps(rec) + "\n")
-                except Exception:
-                    pass
-
-            def _kdbg_cache_summary(cache: Any) -> Dict[str, Any]:
-                """rot/full offset+max_size rollup + wrap/trimmable flags. The
-                decisive prefill signal: is the sliding RotatingKVCache already
-                wrapped (off>=max_size) BEFORE decode starts, and does full-attn
-                off == prompt_len? A pre-wrapped ring at prefill means the very
-                first speculative block's trim is refused (offset desync)."""
-                rot_off = rot_ms = full_off = None
-                any_wrapped = False
-                all_trimmable = True
-                n = 0
-                for c in (cache or []):
-                    n += 1
-                    nm = type(c).__name__
-                    off = int(getattr(c, "offset", -1))
-                    ms = getattr(c, "max_size", None)
-                    ms = int(ms) if ms is not None else None
-                    is_rot = "Rotating" in nm
-                    if is_rot and ms is not None and off >= ms:
-                        any_wrapped = True
-                    trim_fn = getattr(c, "is_trimmable", None)
-                    trim = bool(trim_fn()) if callable(trim_fn) else None
-                    if trim is False:
-                        all_trimmable = False
-                    if is_rot and rot_off is None:
-                        rot_off, rot_ms = off, ms
-                    if (not is_rot) and full_off is None:
-                        full_off = off
-                return {"n_layers": n, "rot_off": rot_off, "rot_ms": rot_ms,
-                        "full_off": full_off, "any_wrapped": any_wrapped,
-                        "all_trimmable": all_trimmable}
-            # endregion
-
             def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                 # Opt-in A/B control (--chat-native-ref): a plain NATIVE greedy
                 # AR decode of the SAME prompt for --max-new-tokens. Captured as
@@ -837,14 +788,6 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                 nref_tokens: List[int] = []
                 if args.chat_native_ref:
                     nref_cache, nref_logits = native_prefill(list(pid))
-                    # region agent log (fused-codegen-degeneration-2815 prefill probe)
-                    if _KDBG_CHAT:
-                        _turn = sum(1 for h in history if h.get("role") == "user")
-                        _kdbg_emit({"hypothesisId": "AE",
-                                    "message": "prefill_state_native",
-                                    "data": {"turn": _turn, "prompt_len": len(pid),
-                                             "cache": _kdbg_cache_summary(nref_cache)}})
-                    # endregion
                     while len(nref_tokens) < args.max_new_tokens:
                         tok = int(mx.argmax(nref_logits).item())
                         nref_tokens.append(tok)
@@ -875,23 +818,6 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                     restored_v_per_layer=_pad(rv, tsrc, T),
                     evicted_positions=evicted,
                     prefill_chunk_size=args.prefill_chunk_size, full_kv=args.cuda_trim)
-                # region agent log (fused-codegen-degeneration-2815 prefill probe)
-                if _KDBG_CHAT:
-                    _turn = sum(1 for h in history if h.get("role") == "user")
-                    _kdbg_emit({"hypothesisId": "AE",
-                                "message": "prefill_state_fused",
-                                "data": {"turn": _turn, "prompt_len": T,
-                                         "evicted_count": len(evicted),
-                                         "block_size": int(args.block_size),
-                                         "would_wrap_block0": bool(
-                                             _sliding_ring_would_wrap(
-                                                 getattr(adapter, "_cache", None),
-                                                 int(args.block_size))),
-                                         "past_len": int(adapter._past_len),
-                                         "f_theta_ran": bool(f_theta_ran),
-                                         "cache": _kdbg_cache_summary(
-                                             getattr(adapter, "_cache", None))}})
-                # endregion
                 t0 = time.perf_counter()
                 _guard = not args.fused_no_loop_guard
                 if mlx_drafter is not None and args.cuda_trim:
@@ -937,45 +863,6 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                     res["native_ref_tokens"] = len(nref_tokens)
                 res["resident_kv_bytes"] = int(
                     sum(int(getattr(c, "nbytes", 0)) for c in (adapter._cache or [])))
-                # region agent log (fused-codegen-degeneration-2815 probe)
-                import os as _kos
-                if _kos.environ.get("KAKEYA_KDBG"):
-                    ftoks = [int(t) for t in res.get("tokens", [])]
-                    ntoks = [int(t) for t in nref_tokens]
-                    div = None
-                    for j, (a, b) in enumerate(zip(ftoks, ntoks)):
-                        if a != b:
-                            div = j
-                            break
-                    if div is None:
-                        div = min(len(ftoks), len(ntoks))
-
-                    def _dec(seq):
-                        try:
-                            return tokenizer.decode(seq, skip_special_tokens=True)
-                        except TypeError:
-                            return tokenizer.decode(seq)
-                    rec = {
-                        "hypothesisId": "AC",
-                        "message": "turn_compare_fused_vs_native",
-                        "data": {
-                            "turn": sum(1 for h in history if h.get("role") == "user"),
-                            "fused_n": len(ftoks), "native_n": len(ntoks),
-                            "first_divergence_idx": div,
-                            "fused_div_ctx": ftoks[max(0, div - 8):div + 16],
-                            "native_div_ctx": ntoks[max(0, div - 8):div + 16],
-                            "fused_div_text": _dec(ftoks[max(0, div - 8):div + 16]),
-                            "native_div_text": _dec(ntoks[max(0, div - 8):div + 16]),
-                            "fused_tail": ftoks[-48:],
-                            "native_tail": ntoks[-48:],
-                            "fused_tail_text": _dec(ftoks[-48:]),
-                            "native_tail_text": _dec(ntoks[-48:]),
-                        },
-                    }
-                    sys.stderr.write(
-                        "KDBG " + json.dumps(rec, ensure_ascii=False) + "\n")
-                    sys.stderr.flush()
-                # endregion
                 return res
 
             print(f"[chat] FULL fused engine: verifier={args.verifier_path} "
diff --git a/scripts/research/pow_codegen_longprompt.txt b/scripts/research/pow_codegen_longprompt.txt
deleted file mode 100644
index fa1ae08..0000000
--- a/scripts/research/pow_codegen_longprompt.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-**PoW (Proof of Work，工作量证明)** 是区块链技术中最核心的共识机制之一。它的核心目的是：**在没有中心化机构（如银行）的情况下，让分布在世界各地的计算机能够达成一致，决定谁有权记账，并防止有人通过伪造数据来欺骗网络。**
-
-为了深入理解，我们可以从“核心逻辑”、“工作流程”、“数学原理”和“经济博弈”四个维度来详细解释。
-
----
-
-### 1. 核心逻辑：用“算力”换取“信任”
-
-在去中心化网络中，大家面临一个问题：**如果每个人都说自己记了一笔账，听谁的？**
-
-PoW 的逻辑是：**谁付出了巨大的计算资源（工作量），谁就有权提议下一块账本。**
-*   **工作量（Work）**：指消耗的计算资源和时间。
-*   **证明（Proof）**：当计算完成后，结果是公开且易于验证的。
-
-这种机制确保了：**作恶的成本极高，而维护网络的收益（奖励）很诱人。**
-
----
-
-### 2. 工作流程：从“挖矿”到“验证”
-
-假设我们正在运行一个像比特币这样的 PoW 网络，流程如下：
-
-#### 第一步：收集交易
-矿工（节点）从网络中收集待处理的交易，并将它们打包成一个“候选区块”。
-
-#### 第二步：寻找“随机数”（核心环节）
-为了让区块生效，矿工必须解决一个数学难题。这个难题通常是：
-> “找到一个数字（称为 **Nonce**），使得：**区块头部数据的哈希值（Hash） < 目标难度值（Target）**。”
-
-*   **哈希函数（Hash Function）**：像是一个“数字粉碎机”。你输入任何内容，它都会输出一串固定长度的乱码。只要输入变一点点，输出就会天差地别。
-*   **不可逆性**：你无法通过结果反推输入。
-
-#### 第三步：竞争与“挖矿”
-矿工们开始疯狂尝试。他们不断改变区块里的 `Nonce` 值，重新计算哈希值。
-*   矿工 A 尝试：`Hash(数据 + 1) = 0xabc...` (不符合要求)
-*   矿工 B 尝试：`Hash(数据 + 2) = 0xdef...` (不符合要求)
-*   ...
-*   矿工 C 运气好/算力强：`Hash(数据 + 999) = 0x000...` (**符合要求！**)
-
-一旦有人找到了满足条件的哈希值，他就“挖到了矿”。
-
-#### 第四步：广播与验证
-获胜的矿工将新区块广播给全网。其他节点收到后，只需进行一次简单的计算（把 Nonce 代入哈希函数），发现结果确实符合要求，就会接受这个区块，并更新自己的账本。
-
----
-
-### 3. 数学原理：为什么它是安全的？
-
-#### A. 难度调整机制 (Difficulty Adjustment)
-如果每个人都买更快的显卡，挖矿速度会变快，导致区块产生太频繁。
-为了保持稳定的出块时间（例如比特币约10分钟一个块），系统会**自动调整难度**：
-*   如果出块太快 $\rightarrow$ 提高目标值（让要求的哈绪值前导零更多，变得更难）。
-*   如果出块太慢 $\rightarrow$ 降低难度。
-
-#### B. 概率与公平性
-由于哈希值是随机分布的，寻找答案的过程就像“在茫茫大海里捞针”。
-*   **算力越高**，意味着你每秒尝试的次数越多，捞到针的概率就越大。
-*   这保证了在宏观上，算力占比与奖励分配是公平的。
-
-#### C. 抵抗“双花”与篡改
-如果有人想修改 10 分钟前的一个交易，他必须重新计算那个区块的哈希，以及**之后所有区块**的哈希。
-由于后面的区块都包含了前一个区块的哈希值（形成链式结构），修改历史意味着要重算后面所有的工作量。除非攻击者的算力超过全网总和的 51%，否则这在经济上是不可能的。
-
----
-
-### 4. 总结：PoW 的优缺点
-
-#### 优点：
-1.  **极高的安全性**：通过物理世界的能源消耗（电力）为数字世界筑起防线。
-2.  **去中心化**：任何人只要有设备就可以加入，不需要许可。
-3.  **公平性**：基于数学概率，不存在人为干预。
-
-#### 缺点：
-1.  **能源消耗大**：为了维持安全，全球范围内的矿机都在消耗大量电力（这是最受争议的一点）。
-2.  **扩展性差（TPS低）**：为了保证全球同步，出块速度不能太快，导致处理交易的速度受限。
-3.  **算力集中风险**：如果出现大规模矿池，可能导致权力向少数人手中集中。
-
-### 通俗比喻总结
-想象一个**“数字猜数字游戏”**：
-全网的人都在玩一个游戏，目标是猜出一个符合特定规则的数字。
-*   **挖矿**就是不停地尝试不同的数字。
-*   **难度**就是规则越来越严苛（比如要求数字必须以 10 个零开头）。
-*   **验证**就是别人看到你猜中了，只需要看一眼你的数字是否符合规则，瞬间就能确认你没撒谎。
-
-基于以上说明，用C语言写一个简短的PoW示例（只写核心循环，不超过30行）。
\ No newline at end of file

From 51ff901281dcd2d361c758faaa0ec947e456afde Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 18 Jun 2026 06:37:29 +0000
Subject: [PATCH 10/12] docs(skill): add reusable 'pin self-hosted runner
 Python env' skill + prompt

Captures the diagnosis+fix for the post-reboot ModuleNotFoundError (mlx_lm) on
the kakeya-mac-m4 runner: lightweight env-probe diagnosis, 3-layer fix (pin
venv on the runner agent PATH via .path/.env|launchd|systemd; resolve a pinned
interpreter in the workflow/executor instead of bare python3; fail-fast import
gate), reboot-inclusive verification, and the Cloud-VM-vs-runner distinction
(Mac-only deps belong on the runner, not the Linux Cloud Agent env). Includes a
ready-to-paste setup-agent prompt; generalized for any Claude/Codex agent.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 docs/kakeyainferenceenginebuildskill.md       |   1 +
 .../pin-selfhosted-runner-python-env-skill.md | 193 ++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 docs/skills/pin-selfhosted-runner-python-env-skill.md

diff --git a/docs/kakeyainferenceenginebuildskill.md b/docs/kakeyainferenceenginebuildskill.md
index 38a40c8..afab3e4 100644
--- a/docs/kakeyainferenceenginebuildskill.md
+++ b/docs/kakeyainferenceenginebuildskill.md
@@ -351,5 +351,6 @@ If any answer is "no", write the weaker, true claim.
 - v0.5-cuda scorecard (+ honest §5): `docs/reports/kakeya-inference-engine-v0.5-cuda.md`
 - Engine vs vLLM long-context journey: `docs/reports/kakeya-engine-vs-vllm-h200.md`, `docs/reports/kakeya-vs-vllm-longcontext-h200.md`
 - MLX port lessons: `docs/mlx-port-lessons.md`
+- Self-hosted runner Python pinning (reboot-proof mlx_lm/torch/transformers): `docs/skills/pin-selfhosted-runner-python-env-skill.md`
 - f_θ training pipeline: `docs/design/k3-f-theta-training-pipeline.md`
 - Session capacity / cross-host: `docs/adr/0014-agent-connection-capacity-and-cross-host-topology-tests.md`
diff --git a/docs/skills/pin-selfhosted-runner-python-env-skill.md b/docs/skills/pin-selfhosted-runner-python-env-skill.md
new file mode 100644
index 0000000..2f348fd
--- /dev/null
+++ b/docs/skills/pin-selfhosted-runner-python-env-skill.md
@@ -0,0 +1,193 @@
+# Skill: Pin a self-hosted runner's Python env (survive reboots, reproducible heavy ML deps)
+
+**Reusable across agents (Claude / Codex / Cursor).** Copy this file or paste the
+prompt in the appendix. It is written to be repo-agnostic; the concrete examples
+use a GitHub Actions self-hosted Mac runner driving MLX (`mlx_lm`/`torch`/
+`transformers`), but the pattern applies to any self-hosted runner (Mac or Linux)
+that runs heavy ML/native deps from a virtualenv.
+
+---
+
+## 1. When to use this skill
+
+Trigger it when **a self-hosted runner job fails on a missing module that "used to
+work"**, especially after a host **reboot / OS or Python upgrade / runner
+re-register**. Classic signatures:
+
+- `ModuleNotFoundError: No module named 'mlx_lm'` (or `torch`, `transformers`, …)
+  in a job that previously passed.
+- The failure is **fast** (seconds) — it dies at `import`, before any real work.
+- A **lightweight probe** (one that only needs stdlib + a base package) still
+  passes, proving the runner is *online* but pointing at the **wrong interpreter**.
+- The interpreter version changed (e.g. `python=3.14.3` where it used to be
+  `3.13.x`), or `pkg=None` for a package that should be installed.
+
+Root cause is almost always: the workflow invokes a **bare `python3`**, and after
+the reboot the default `python3` on `PATH` is no longer the venv that has the
+deps. The venv still exists; nothing points at it.
+
+---
+
+## 2. Diagnose first (don't guess)
+
+Run the **cheapest possible probe** through the same runner path to read the
+interpreter + module state, instead of assuming. Example (adapt the import list):
+
+```bash
+python3 - <<'PY'
+import sys
+def v(m):
+    try:
+        mod = __import__(m); return getattr(mod, "__version__", "ok")
+    except Exception as e:
+        return f"MISSING ({e.__class__.__name__})"
+print("python =", sys.version.split()[0], "| exe =", sys.executable)
+for m in ("mlx", "mlx_lm", "torch", "transformers"):
+    print(f"{m} = {v(m)}")
+PY
+```
+
+Decision rule:
+- **Runner online + probe shows wrong `python`/`exe` or `MISSING` deps** → this skill (interpreter pinning).
+- **Probe itself never starts (job stuck `queued`/`pending`)** → the runner *agent*
+  is down; restart the agent first (different problem).
+
+> In CI-driven runners, route the probe through the same executor the real jobs
+> use (so `PATH`/env match). A one-liner like the above, committed as a tiny
+> "env-probe" job/preset, is worth keeping permanently.
+
+---
+
+## 3. Fix — three layers (do all three; they are defense-in-depth)
+
+### Layer A — Pin the interpreter the runner *agent* sees (host side, durable)
+
+Make the venv's `bin` the first thing on the **runner agent's** `PATH`, so a bare
+`python3` resolves to the venv even across reboots. Pick the mechanism for how the
+agent is launched:
+
+- **GitHub Actions runner as a service (recommended).** The runner reads a
+  `.env` and a `.path` file in its install dir at start:
+  ```bash
+  cd ~/actions-runner
+  echo "$HOME/kakeya-venv/bin:$PATH" > .path     # venv first, then the existing PATH
+  echo "VIRTUAL_ENV=$HOME/kakeya-venv" >> .env
+  ./svc.sh stop && ./svc.sh start                # reload
+  ```
+  (`.path` holds the complete PATH the service exports to every job, so keep the system entries
+  after the venv; `.env` injects process env. Both persist across reboots because the service re-reads them.)
+- **launchd plist (macOS), if not using `svc.sh`.** In the runner's
+  `~/Library/LaunchAgents/<runner>.plist`, set:
+  ```xml
+  <key>EnvironmentVariables</key>
+  <dict>
+    <key>PATH</key><string>/Users/&lt;you&gt;/kakeya-venv/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
+  </dict>
+  ```
+  then `launchctl unload/load` the plist.
+- **systemd (Linux self-hosted).** In the runner unit:
+  `Environment="PATH=/opt/kakeya-venv/bin:%h/.local/bin:/usr/bin:/bin"`, then
+  `systemctl daemon-reload && systemctl restart <runner>`.
+
+Verify: `python3 -c "import mlx_lm, torch, transformers; print('ok')"` from a job.
+
+### Layer B — Make the workflow/executor resolve a *pinned* interpreter (repo side, robust)
+
+Never call a bare `python3` for the heavy job. Resolve an explicit interpreter so
+the repo is robust even if Layer A drifts:
+
+1. Add a repo/runner variable, e.g. `KAKEYA_MAC_PYTHON`, pointing at the venv
+   python (`/Users/<you>/kakeya-venv/bin/python`). Default-discover if unset:
+   ```bash
+   PYBIN="${KAKEYA_MAC_PYTHON:-}"
+   for c in "$PYBIN" "$HOME/kakeya-venv/bin/python" "$(command -v python3.13)" "$(command -v python3)"; do
+     [ -n "$c" ] && [ -x "$c" ] && "$c" -c 'import mlx_lm' 2>/dev/null && { PYBIN="$c"; break; }
+   done
+   ```
+2. Use `$PYBIN` (or substitute a `${PYTHON}` token in your command templates)
+   instead of `python3` for the actual workload. If your executor spawns argv
+   lists (no shell), resolve the token to `$PYBIN` before `subprocess.run`.
+
+### Layer C — Fail fast with a clear message (repo side, observability)
+
+Before the expensive step, assert the deps and **print a fix hint** so the next
+failure is self-explanatory instead of a deep `ModuleNotFoundError`:
+
+```bash
+"$PYBIN" - <<'PY' || { echo "::error::runner python missing ML deps — see pin-selfhosted-runner-python-env-skill.md (Layer A)"; exit 90; }
+import mlx_lm, torch, transformers  # noqa
+PY
+```
+
+---
+
+## 4. Verify the fix
+
+1. Re-run the lightweight env-probe → correct `python`/`exe`, all deps present.
+2. Re-run one **real** (heavy) job → no `ModuleNotFoundError`, completes.
+3. **Reboot the host and re-run** (the actual regression you are fixing) → still
+   green. This step is the whole point; do not skip it.
+
+---
+
+## 5. Generalizing to a *Cloud Agent* VM env setup (different machine!)
+
+Do **not** confuse the self-hosted runner with the Cloud Agent VM:
+- The **Cloud Agent VM** is typically Linux; it runs the *client* that dispatches
+  jobs and the unit-test gate. **Mac-only deps (MLX) do not belong there.** Put
+  only what the client/tests need into the Cloud Agent env setup (base image +
+  startup script), and pin versions.
+- The **self-hosted runner** is where the heavy/native/Mac deps live. Pin them
+  there (Layers A–C above), not in the Cloud VM env setup.
+
+For the Cloud Agent VM specifically: bake stable deps into the **base image**, do
+slow-changing installs in the **startup script**, and pin versions so a new VM is
+reproducible. (In Cursor, this is the "env setup agent" config.)
+
+---
+
+## 6. Anti-patterns
+
+- ❌ `pip install` the missing dep into whatever `python3` happens to be active
+  (often a too-new system Python with no wheels for `torch`/`mlx_lm`). Pin to the
+  known-good venv instead.
+- ❌ Hardcoding an absolute interpreter path in many places. Resolve once
+  (variable + discovery) and reuse.
+- ❌ "It works now" without a reboot test — the regression is reboot-triggered.
+- ❌ Relying on an interactive shell's `source venv/bin/activate`; CI jobs and
+  services don't run your `.zshrc`.
+
+---
+
+## Appendix — ready-to-paste prompt for a setup agent
+
+> **Task: make our self-hosted CI runner's Python environment reboot-proof.**
+>
+> Symptom: jobs on our self-hosted runner fail fast with
+> `ModuleNotFoundError: No module named 'mlx_lm'` after the host rebooted; a
+> lightweight env-probe shows the runner's default `python3` switched to a newer
+> interpreter that lacks our ML stack (`mlx_lm`/`torch`/`transformers`), while the
+> known-good venv still exists but is no longer on `PATH`.
+>
+> Do all of the following, smallest-diff first, and verify each:
+> 1. **Diagnose:** run a tiny probe that prints `sys.version`, `sys.executable`,
+>    and import status of `mlx_lm, torch, transformers` through the same path the
+>    real jobs use. Confirm the wrong interpreter / missing modules.
+> 2. **Host (runner agent):** pin the venv's `bin` ahead of system `PATH` for the
+>    runner service so a bare `python3` resolves to the venv across reboots — via
+>    the runner's `.path`/`.env` files (GitHub Actions `svc.sh`), or the
+>    launchd/systemd unit's `PATH` env. Reload the service.
+> 3. **Repo (workflow/executor):** stop calling bare `python3` for the heavy job.
+>    Resolve a pinned interpreter from a `*_PYTHON` repo/runner variable, with a
+>    discovery fallback that picks the first candidate where `import mlx_lm`
+>    succeeds; use it for the workload commands.
+> 4. **Repo (fail-fast):** before the expensive step, assert
+>    `import mlx_lm, torch, transformers` and emit a clear `::error::` with a link
+>    to this skill if missing (exit non-zero).
+> 5. **Verify, including a reboot:** env-probe green, one real heavy job green,
+>    then reboot the host and re-run the same job — must still be green.
+> 6. **Pin versions** in the venv (freeze a lockfile) and document the venv path +
+>    rebuild steps so the environment is reproducible, not just patched.
+>
+> Keep the heavy/native deps on the self-hosted runner only; do NOT add Mac-only
+> deps to the Cloud Agent (Linux) VM env setup.

From 16440ff48fc76a4a64a9d36f230059ea9422b964 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 18 Jun 2026 06:51:50 +0000
Subject: [PATCH 11/12] feat(mac-bridge): pin workload interpreter (Layer B) +
 import self-check gate (Layer C)

Reboots can repoint the runner's default python3 to one without mlx_lm, which
broke every full-engine preset with a deep ModuleNotFoundError. Make the
workload interpreter explicit and verified:

- inference_engine/bridge/runner_python.py (NEW, pure + 100% unit-tested):
  workload_python_candidates (pin KAKEYA_MAC_PYTHON -> venvs -> PATH),
  resolve_workload_python (first interpreter that can import mlx_lm; else
  fallback), preset_requires_gate (mlx-/k3- engine presets, minus env-probe/
  upgrade), substitute_python, gate_error_message.
- scripts/mac_bridge/run_preset.py: resolve the pinned interpreter, rewrite bare
  python3 argv0 to it, export KAKEYA_MAC_PYTHON to the subprocess, and FAIL FAST
  (exit 90 + ::error::) when a gated preset has no mlx_lm-capable interpreter.
- scripts/run_kakeya_mac.sh: honor KAKEYA_MAC_PYTHON; preflight asserts mlx+mlx_lm.

CI enforcement: the resolution/gate logic lives in the unit-tested, 100%-coverage
library (runner_python.py), so every PR exercises it on the Linux gate. See
docs/skills/pin-selfhosted-runner-python-env-skill.md.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/runner_python.py      | 123 ++++++++++++++++++
 scripts/mac_bridge/run_preset.py              |  44 ++++++-
 scripts/run_kakeya_mac.sh                     |  13 +-
 .../bridge/test_runner_python.py              | 108 +++++++++++++++
 4 files changed, 283 insertions(+), 5 deletions(-)
 create mode 100644 inference_engine/bridge/runner_python.py
 create mode 100644 tests/inference_engine/bridge/test_runner_python.py

diff --git a/inference_engine/bridge/runner_python.py b/inference_engine/bridge/runner_python.py
new file mode 100644
index 0000000..1b0c27b
--- /dev/null
+++ b/inference_engine/bridge/runner_python.py
@@ -0,0 +1,123 @@
+"""Pin the Mac-bridge workload interpreter (Layer B) + import self-check (Layer C).
+
+A self-hosted runner's default ``python3`` can silently change across reboots /
+OS upgrades (observed 2026-06-18: it flipped to a Python 3.14 without ``mlx_lm``,
+breaking every full-engine preset with a deep ``ModuleNotFoundError``). The
+mac-bridge executor used to invoke a bare ``python3`` for the workload, so it
+inherited whatever interpreter happened to be first on ``PATH``.
+
+This module makes the workload interpreter **explicit and verified**:
+
+* **Layer B — resolution.** Build an ordered candidate list (a pinned
+  ``KAKEYA_MAC_PYTHON``, common venv paths, then ``PATH`` pythons) and pick the
+  first one that can import the gate module (``mlx_lm``); fall back to the first
+  candidate otherwise (not checked for existence; ``gate_module_ok`` is False).
+* **Layer C — gate.** For presets whose workload needs ``mlx_lm`` (the ``mlx-`` /
+  ``k3-`` engine families, minus the env-probe / upgrade tools that exist to
+  diagnose/repair the env), fail fast with a clear message instead of a deep
+  import error when no capable interpreter exists.
+
+All functions here are pure / dependency-injected so they are unit-tested on the
+Linux gate (the CLI ``scripts/mac_bridge/run_preset.py`` is a thin caller). See
+``docs/skills/pin-selfhosted-runner-python-env-skill.md``.
+"""
+
+from __future__ import annotations
+
+import os
+import shutil
+from dataclasses import dataclass
+from typing import Callable, List, Mapping, Optional, Sequence
+
+# The single module whose absence broke the runner; importing it implies the
+# full MLX-LM stack is wired for the interpreter.
+GATE_MODULE = "mlx_lm"
+
+# ``mlx-``/``k3-`` presets that must NOT be import-gated: these exist precisely
+# to probe or repair the environment, so they must run even when mlx_lm is gone.
+_IMPORT_GATE_SKIP = frozenset({"mlx-env-probe", "mlx-upgrade"})
+
+SKILL_DOC = "docs/skills/pin-selfhosted-runner-python-env-skill.md"
+
+
+def workload_python_candidates(
+    environ: Mapping[str, str],
+    *,
+    which: Callable[[str], Optional[str]] = shutil.which,
+    expanduser: Callable[[str], str] = os.path.expanduser,
+) -> List[str]:
+    """Ordered, de-duplicated interpreter candidates for the heavy workload.
+
+    Priority: the explicit pin (``KAKEYA_MAC_PYTHON``), then conventional venv
+    locations, then ``PATH`` pythons (a pinned minor version before the bare
+    ``python3`` that a reboot may have repointed)."""
+    raw = [
+        environ.get("KAKEYA_MAC_PYTHON"),
+        expanduser("~/kakeya-venv/bin/python"),
+        expanduser("~/.venv/bin/python"),
+        which("python3.13"),
+        which("python3"),
+    ]
+    out: List[str] = []
+    for c in raw:
+        if c and c not in out:
+            out.append(c)
+    return out
+
+
+@dataclass(frozen=True)
+class ResolvedPython:
+    """The interpreter chosen for the workload."""
+
+    path: str
+    gate_module_ok: bool   # whether ``path`` can import GATE_MODULE
+    from_pin: bool         # whether it came from ``KAKEYA_MAC_PYTHON``
+
+
+def resolve_workload_python(
+    candidates: Sequence[str],
+    can_import: Callable[[str], bool],
+    *,
+    pinned: Optional[str] = None,
+) -> Optional[ResolvedPython]:
+    """Pick the first candidate that can import :data:`GATE_MODULE`; otherwise
+    the first candidate (a fallback whose ``gate_module_ok`` is ``False``).
+    Returns ``None`` only when there are no candidates at all."""
+    first: Optional[str] = None
+    for c in candidates:
+        if first is None:
+            first = c
+        if can_import(c):
+            return ResolvedPython(c, True, c == pinned)
+    if first is None:
+        return None
+    return ResolvedPython(first, False, first == pinned)
+
+
+def preset_requires_gate(preset_name: str) -> bool:
+    """True iff a preset's workload needs :data:`GATE_MODULE` (so a missing
+    import must fail fast). The ``mlx-`` / ``k3-`` engine presets do; the
+    env-probe and upgrade tools (which diagnose/repair the env) are exempt."""
+    if preset_name in _IMPORT_GATE_SKIP:
+        return False
+    return preset_name.startswith(("mlx-", "k3-"))
+
+
+def substitute_python(argv: Sequence[str], pybin: str) -> List[str]:
+    """Rewrite a leading bare ``python3`` to the resolved interpreter ``pybin``.
+    Non-``python3`` argv (e.g. ``bash run_kakeya_mac.sh``, which reads
+    ``KAKEYA_MAC_PYTHON`` itself) is returned unchanged."""
+    a = list(argv)
+    if a and a[0] == "python3":
+        a[0] = pybin
+    return a
+
+
+def gate_error_message(preset_name: str, pybin: str) -> str:
+    """The fail-fast message when a gated preset has no mlx_lm-capable python."""
+    return (
+        f"runner python '{pybin}' cannot import {GATE_MODULE!r}, which preset "
+        f"'{preset_name}' requires. The runner's default python likely changed "
+        f"(e.g. after a reboot). Pin the venv via KAKEYA_MAC_PYTHON or the runner "
+        f"agent PATH and reinstall the ML stack — see {SKILL_DOC}."
+    )
diff --git a/scripts/mac_bridge/run_preset.py b/scripts/mac_bridge/run_preset.py
index a95122a..4a63c00 100644
--- a/scripts/mac_bridge/run_preset.py
+++ b/scripts/mac_bridge/run_preset.py
@@ -32,10 +32,29 @@
     build_commands,
     parse_manifest_text,
 )
+from inference_engine.bridge.runner_python import (
+    GATE_MODULE,
+    gate_error_message,
+    preset_requires_gate,
+    resolve_workload_python,
+    substitute_python,
+    workload_python_candidates,
+)
 
 LOG_DIR = Path(".mac-bridge/logs")
 
 
+def _can_import_gate_module(pybin: str) -> bool:
+    """True iff interpreter ``pybin`` can import the gate module (mlx_lm)."""
+    try:
+        return subprocess.run(
+            [pybin, "-c", f"import {GATE_MODULE}"],
+            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+        ).returncode == 0
+    except OSError:
+        return False
+
+
 def main() -> int:
     ap = argparse.ArgumentParser(description=__doc__)
     ap.add_argument("--manifest", default=".mac-bridge/request.json")
@@ -59,6 +78,27 @@ def main() -> int:
             print(json.dumps(argv))
         return 0
 
+    # Layer B — resolve a PINNED workload interpreter instead of trusting the
+    # bare ``python3`` on PATH (which a reboot can repoint to a python without
+    # mlx_lm). Layer C — gate: mlx-/k3- engine presets fail fast with a clear
+    # message when no mlx_lm-capable interpreter exists.
+    pinned = os.environ.get("KAKEYA_MAC_PYTHON")
+    candidates = workload_python_candidates(os.environ)
+    resolved = resolve_workload_python(
+        candidates, _can_import_gate_module, pinned=pinned)
+    gate_ok = bool(resolved and resolved.gate_module_ok)
+    # An unverified fallback is merely the first candidate and may not exist
+    # on this host (e.g. an absent venv). Adopt it only if it is on disk, so
+    # ungated presets are never rewritten to a missing interpreter.
+    usable = gate_ok or bool(resolved and os.path.exists(resolved.path))
+    pybin = resolved.path if usable else "python3"
+    print(f"[mac-bridge] workload python={pybin} {GATE_MODULE}_ok={gate_ok} "
+          f"pinned={pinned!r} candidates={candidates}", file=sys.stderr)
+    if preset_requires_gate(request.preset.name) and not gate_ok:
+        print(f"::error::{gate_error_message(request.preset.name, pybin)}",
+              file=sys.stderr)
+        return 90
+
     LOG_DIR.mkdir(parents=True, exist_ok=True)
     summary = {
         "preset": request.preset.name,
@@ -66,13 +106,19 @@ def main() -> int:
         "nonce": request.nonce,
         "commands": [],
     }
+    # Make the resolved interpreter authoritative for BOTH bare-``python3``
+    # commands (rewritten here) and the launcher (which reads KAKEYA_MAC_PYTHON).
+    sub_env = dict(os.environ)
+    sub_env["KAKEYA_MAC_PYTHON"] = pybin
     rc = 0
     for idx, argv in enumerate(commands):
+        argv = substitute_python(argv, pybin)
         log_path = LOG_DIR / f"{request.preset.name}-{idx}.log"
         print(f"[mac-bridge] exec[{idx}]: {argv}", file=sys.stderr)
         t0 = time.perf_counter()
         with log_path.open("wb") as log:
-            proc = subprocess.run(argv, stdout=log, stderr=subprocess.STDOUT)
+            proc = subprocess.run(argv, stdout=log, stderr=subprocess.STDOUT,
+                                  env=sub_env)
         elapsed = time.perf_counter() - t0
         summary["commands"].append({
             "argv": argv,
diff --git a/scripts/run_kakeya_mac.sh b/scripts/run_kakeya_mac.sh
index 4bf3308..8197ad2 100755
--- a/scripts/run_kakeya_mac.sh
+++ b/scripts/run_kakeya_mac.sh
@@ -55,6 +55,11 @@ done
 
 log() { echo "[run-kakeya-mac] $*" >&2; }
 
+# Pinned interpreter (Layer B): prefer KAKEYA_MAC_PYTHON (the venv python with
+# mlx_lm/torch/transformers) over a bare python3 that a host reboot may have
+# repointed. See docs/skills/pin-selfhosted-runner-python-env-skill.md.
+PYBIN="${KAKEYA_MAC_PYTHON:-python3}"
+
 # ---- argv for the full-engine harness chat ----
 args=(
   --verifier-path "$VERIFIER"
@@ -80,7 +85,7 @@ log "drafter : $DRAFTER"
 log "f_theta : $FTHETA"
 log "params  : sink=$SINK window=$WINDOW block=$BLOCK max_new=$MAX_NEW"
 
-cmd=( python3 scripts/research/k3_integrated_niah_eval_mac.py "${args[@]}" "${EXTRA[@]}" )
+cmd=( "$PYBIN" scripts/research/k3_integrated_niah_eval_mac.py "${args[@]}" "${EXTRA[@]}" )
 
 if [[ "$DRY_RUN" == "1" ]]; then
   echo "PYTHONPATH=.:sdks/python ${cmd[*]}"
@@ -88,9 +93,9 @@ if [[ "$DRY_RUN" == "1" ]]; then
 fi
 
 # ---- preflight (Apple Silicon + MLX + model) ----
-command -v python3 >/dev/null || { log "python3 not found"; exit 1; }
-python3 -c "import mlx.core" 2>/dev/null \
-  || { log "MLX not importable — this needs Apple Silicon + 'pip install mlx mlx-lm'"; exit 2; }
+command -v "$PYBIN" >/dev/null 2>&1 || { log "interpreter not found: $PYBIN (set KAKEYA_MAC_PYTHON)"; exit 1; }
+"$PYBIN" -c "import mlx.core, mlx_lm" 2>/dev/null \
+  || { log "mlx/mlx_lm not importable by $PYBIN — Apple Silicon + a venv with 'mlx mlx-lm'; set KAKEYA_MAC_PYTHON. See docs/skills/pin-selfhosted-runner-python-env-skill.md"; exit 2; }
 [[ -d "$VERIFIER" ]] \
   || { log "verifier model dir not found: $VERIFIER (set KAKEYA_MAC_VERIFIER_PATH)"; exit 3; }
 if [[ "$FAST" != "1" && ! -e "$FTHETA" ]]; then
diff --git a/tests/inference_engine/bridge/test_runner_python.py b/tests/inference_engine/bridge/test_runner_python.py
new file mode 100644
index 0000000..8a0a6bb
--- /dev/null
+++ b/tests/inference_engine/bridge/test_runner_python.py
@@ -0,0 +1,108 @@
+"""Unit tests for the mac-bridge workload interpreter pinning (Layers B/C).
+
+Pure / dependency-injected logic from ``inference_engine.bridge.runner_python``;
+the CLI ``scripts/mac_bridge/run_preset.py`` is a thin caller (coverage-exempt).
+"""
+
+from __future__ import annotations
+
+from inference_engine.bridge.runner_python import (
+    GATE_MODULE,
+    SKILL_DOC,
+    ResolvedPython,
+    gate_error_message,
+    preset_requires_gate,
+    resolve_workload_python,
+    substitute_python,
+    workload_python_candidates,
+)
+
+
+# --------------------------------------------------------------------------- #
+# workload_python_candidates
+# --------------------------------------------------------------------------- #
+def test_candidates_prioritise_pin_then_venvs_then_path():
+    env = {"KAKEYA_MAC_PYTHON": "/pin/bin/python"}
+    which = {"python3.13": "/usr/bin/python3.13", "python3": "/usr/bin/python3"}.get
+    cands = workload_python_candidates(
+        env, which=which, expanduser=lambda p: p.replace("~", "/home/me"))
+    assert cands == [
+        "/pin/bin/python",
+        "/home/me/kakeya-venv/bin/python",
+        "/home/me/.venv/bin/python",
+        "/usr/bin/python3.13",
+        "/usr/bin/python3",
+    ]
+
+
+def test_candidates_drop_empty_and_dedupe():
+    # no pin, python3.13 missing, and python3 == an expanded venv path (dedupe).
+    env: dict = {}
+    which = {"python3.13": None, "python3": "/home/me/.venv/bin/python"}.get
+    cands = workload_python_candidates(
+        env, which=which, expanduser=lambda p: p.replace("~", "/home/me"))
+    assert cands == [
+        "/home/me/kakeya-venv/bin/python",
+        "/home/me/.venv/bin/python",
+    ]
+    assert None not in cands
+
+
+# --------------------------------------------------------------------------- #
+# resolve_workload_python
+# --------------------------------------------------------------------------- #
+def test_resolve_picks_first_importable():
+    cands = ["/a/py", "/b/py", "/c/py"]
+    r = resolve_workload_python(cands, lambda p: p == "/b/py", pinned="/a/py")
+    assert r == ResolvedPython(path="/b/py", gate_module_ok=True, from_pin=False)
+
+
+def test_resolve_marks_from_pin_when_pinned_is_importable():
+    r = resolve_workload_python(["/pin/py", "/x/py"], lambda p: True,
+                                pinned="/pin/py")
+    assert r.path == "/pin/py" and r.gate_module_ok is True and r.from_pin is True
+
+
+def test_resolve_falls_back_to_first_when_none_importable():
+    r = resolve_workload_python(["/a/py", "/b/py"], lambda p: False,
+                                pinned="/a/py")
+    assert r == ResolvedPython(path="/a/py", gate_module_ok=False, from_pin=True)
+
+
+def test_resolve_returns_none_without_candidates():
+    assert resolve_workload_python([], lambda p: True) is None
+
+
+# --------------------------------------------------------------------------- #
+# preset_requires_gate
+# --------------------------------------------------------------------------- #
+def test_gate_required_for_mlx_and_k3_engine_presets():
+    assert preset_requires_gate("mlx-kakeya-launcher-full") is True
+    assert preset_requires_gate("k3-step2-fused") is True
+
+
+def test_gate_skips_diagnostic_and_installer_and_non_engine():
+    assert preset_requires_gate("mlx-env-probe") is False     # diagnostic
+    assert preset_requires_gate("mlx-upgrade") is False       # installer
+    assert preset_requires_gate("integration-tests") is False
+    assert preset_requires_gate("agent-capacity-stress") is False
+
+
+# --------------------------------------------------------------------------- #
+# substitute_python / gate_error_message
+# --------------------------------------------------------------------------- #
+def test_substitute_rewrites_only_leading_bare_python3():
+    assert substitute_python(["python3", "a.py", "--x"], "/v/py") == [
+        "/v/py", "a.py", "--x"]
+    # non-python3 argv0 (e.g. the launcher) is untouched.
+    assert substitute_python(["bash", "run.sh"], "/v/py") == ["bash", "run.sh"]
+    # empty argv is safe.
+    assert substitute_python([], "/v/py") == []
+
+
+def test_gate_error_message_names_module_preset_and_skill():
+    msg = gate_error_message("mlx-kakeya-launcher-full", "/usr/bin/python3")
+    assert GATE_MODULE in msg
+    assert "mlx-kakeya-launcher-full" in msg
+    assert "/usr/bin/python3" in msg
+    assert SKILL_DOC in msg

From cff05ac635c74d3e74707464ef85703d10589140 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Thu, 18 Jun 2026 12:56:05 +0000
Subject: [PATCH 12/12] fix(mac-launcher): bash 3.2-safe empty-array expansion
 (EXTRA[@]: unbound variable)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

scripts/run_kakeya_mac.sh used 'set -u' + a bare "${EXTRA[@]}". macOS's default
/bin/bash is 3.2, where expanding an EMPTY array under nounset errors with
'EXTRA[@]: unbound variable' — hit when the launcher is run with no pass-through
args (the common interactive case). Use the canonical ${EXTRA[@]+"${EXTRA[@]}"}
form (elements if set, nothing if empty, no nounset error). Add
mlx-kakeya-launcher-dryrun-bash32 preset to guard it on the real /bin/bash 3.2.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/manifest.py            | 14 ++++++++++++++
 scripts/run_kakeya_mac.sh                      |  6 +++++-
 tests/inference_engine/bridge/test_manifest.py |  1 +
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 8a27bbb..b88e8cb 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -798,6 +798,20 @@ def _harness_preset(
             params={"max_new_tokens": ("int:max_new_tokens", "64")},
             validate_reports=True,  # §4 liveness gate on-device
         ),
+        Preset(
+            name="mlx-kakeya-launcher-dryrun-bash32",
+            description="Guard the launcher against the macOS bash-3.2 "
+                        "'unbound variable' bug: run scripts/run_kakeya_mac.sh "
+                        "--dry-run under /bin/bash (Apple's frozen bash 3.2) with "
+                        "NO pass-through args, so the empty EXTRA array is expanded "
+                        "under set -u. Must exit 0 and print the command (pre-fix it "
+                        "died with 'EXTRA[@]: unbound variable'). Fast; no model load.",
+            command_templates=(
+                ("/bin/bash", "scripts/run_kakeya_mac.sh", "--dry-run"),
+            ),
+            timeout_minutes=10,
+            validate_reports=False,
+        ),
         Preset(
             name="mlx-kakeya-degen-probe",
             description="Long-decode regression probe: full f_θ fused engine on a "
diff --git a/scripts/run_kakeya_mac.sh b/scripts/run_kakeya_mac.sh
index 4bf3308..57dc34e 100755
--- a/scripts/run_kakeya_mac.sh
+++ b/scripts/run_kakeya_mac.sh
@@ -85,7 +85,11 @@ log "drafter : $DRAFTER"
 log "f_theta : $FTHETA"
 log "params  : sink=$SINK window=$WINDOW block=$BLOCK max_new=$MAX_NEW"
 
-cmd=( "$PYBIN" scripts/research/k3_integrated_niah_eval_mac.py "${args[@]}" "${EXTRA[@]}" )
+# NOTE: ``${EXTRA[@]+"${EXTRA[@]}"}`` (not a bare ``"${EXTRA[@]}"``) — under
+# ``set -u`` macOS's default bash 3.2 treats expanding an EMPTY array as an
+# "unbound variable" error; the ``+`` form expands to nothing when EXTRA is
+# empty and to the quoted elements otherwise.
+cmd=( "$PYBIN" scripts/research/k3_integrated_niah_eval_mac.py "${args[@]}" ${EXTRA[@]+"${EXTRA[@]}"} )
 
 if [[ "$DRY_RUN" == "1" ]]; then
   echo "PYTHONPATH=.:sdks/python ${cmd[*]}"
diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py
index 090f189..c42cd6b 100644
--- a/tests/inference_engine/bridge/test_manifest.py
+++ b/tests/inference_engine/bridge/test_manifest.py
@@ -85,6 +85,7 @@ def test_allowlist_contains_exactly_the_documented_presets():
         "mlx-kakeya-degen-probe",
         "mlx-kakeya-fused-chat-ftheta",
         "mlx-kakeya-fused-chat-smoke",
+        "mlx-kakeya-launcher-dryrun-bash32",
         "mlx-kakeya-launcher-smoke",
         "mlx-multitenant-pressure",
         "mlx-upgrade",