From f06264a0c11e192abe975192ae170e893006d53d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 17 Jun 2026 15:15:19 +0000 Subject: [PATCH 01/12] feat(mac-launcher): long-answer-safe defaults + full-mode validation preset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_kakeya_mac.sh: - Document that long answers are now coherent past the ~1024 native-cache ring wrap (PR #146: single-token commits once the sliding RotatingKVCache wraps). - Raise default --max-new-tokens 1024 -> 2048 (the wrap is no longer a coherence cliff; FULL mode just drops the spec-decode speedup past it). - Refresh help text and FULL-mode mode banner. bridge: add mlx-kakeya-launcher-full preset (FULL f_θ path, long scripted answer crossing the wrap, validate_reports) so CI/on-device guards the launcher's full pipeline + the wrap fix end-to-end; launcher-smoke stays for fast wiring checks. Co-authored-by: FluffyAIcode --- inference_engine/bridge/manifest.py | 23 +++++++++++++++++++ scripts/run_kakeya_mac.sh | 20 ++++++++++++---- .../inference_engine/bridge/test_manifest.py | 17 +++++++++++++- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 8b67da7..69b5d8f 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -771,6 +771,29 @@ def _harness_preset( params={"max_new_tokens": ("int:max_new_tokens", "64")}, validate_reports=True, # §4 liveness gate on-device ), + Preset( + name="mlx-kakeya-launcher-full", + description="Validate scripts/run_kakeya_mac.sh in FULL mode (f_θ " + "verifier+proposer+f_θ, default path) on a LONG scripted " + "answer that crosses the ~1024 native-cache ring wrap. 
" + "Guards the launcher's full pipeline + the PR #146 " + "wrapped-ring fix end-to-end: the report must pass the §4 " + "liveness gate AND the quality gate (coherent, no runaway " + "repeat) past the wrap.", + command_templates=( + ( + "bash", "scripts/run_kakeya_mac.sh", + "--max-new-tokens", "{max_new_tokens}", + "--ignore-turn-stop", + "--chat-scripted", "请详细解释POW的工作原理", + "--output", + "results/research/k3_mac_bridge_launcher_full.json", + ), + ), + timeout_minutes=90, + params={"max_new_tokens": ("int:max_new_tokens", "1300")}, + validate_reports=True, # §4 liveness + §2.4 quality gate on-device + ), Preset( name="mlx-kakeya-degen-probe", description="Long-decode regression probe: full f_θ fused engine on a " diff --git a/scripts/run_kakeya_mac.sh b/scripts/run_kakeya_mac.sh index 4bf3308..ea99f34 100755 --- a/scripts/run_kakeya_mac.sh +++ b/scripts/run_kakeya_mac.sh @@ -8,6 +8,13 @@ # the all-MLX proposer path (f_θ bypassed via S5 native prefill — much faster on # Mac, but the f_θ projection does not execute). # +# LONG ANSWERS ARE SAFE (PR #146). The full path runs on gemma-4's native hybrid +# cache (sliding RotatingKVCache, max_size≈1024). Past that ring wrap the engine +# automatically commits single tokens (no speculative rollback to mis-trim on the +# wrapped ring), so generations stay coherent well beyond ~1024 tokens — they +# just lose the spec-decode speedup past the wrap. So the default budget below is +# generous; you no longer need to keep answers under the window. 
+# # Model facts come from env vars (set on the kakeya-mac-m4 runner), with sane # fallbacks; override on the CLI if needed: # KAKEYA_MAC_VERIFIER_PATH local MLX gemma-4 dir @@ -17,7 +24,7 @@ # Usage: # bash scripts/run_kakeya_mac.sh # full engine (f_θ on), interactive # bash scripts/run_kakeya_mac.sh --fast # proposer-only (f_θ bypassed), faster -# bash scripts/run_kakeya_mac.sh --max-new-tokens 2048 --window 128 +# bash scripts/run_kakeya_mac.sh --max-new-tokens 4096 --window 128 # bash scripts/run_kakeya_mac.sh --dry-run # print the command, run nothing # echo 'Explain proof-of-work.' | bash scripts/run_kakeya_mac.sh # one-shot via stdin set -euo pipefail @@ -31,7 +38,9 @@ FTHETA="${KAKEYA_MAC_FTHETA_DIR:-results/research/f_theta_v5_s5_sliding}" SINK="${KAKEYA_SINK:-4}" WINDOW="${KAKEYA_WINDOW:-64}" BLOCK="${KAKEYA_BLOCK_SIZE:-4}" -MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-1024}" +# Default budget reaches past the ~1024 native-cache wrap; coherent there since +# PR #146 (single-token commits past the wrap). Raise/lower freely. +MAX_NEW="${KAKEYA_MAX_NEW_TOKENS:-2048}" FAST=0 DRY_RUN=0 @@ -47,7 +56,7 @@ while [[ $# -gt 0 ]]; do --window) shift; WINDOW="${1:?}" ;; --sink) shift; SINK="${1:?}" ;; --block-size) shift; BLOCK="${1:?}" ;; - -h|--help) sed -n '2,28p' "$0"; exit 0 ;; + -h|--help) sed -n '2,29p' "$0"; exit 0 ;; *) EXTRA+=("$1") ;; # pass-through (e.g. --chat-scripted ...) esac shift @@ -70,8 +79,9 @@ if [[ "$FAST" == "1" ]]; then MODE="FAST (verifier + proposer + S5 bounded KV; f_θ BYPASSED)" else # torch drafter + f_θ: the harness auto-enables --force-f-theta in --chat, so - # f_θ projection ACTUALLY RUNS each turn (the full pipeline). - MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs)" + # f_θ projection ACTUALLY RUNS each turn (the full pipeline). Coherent past the + # ~1024 native-cache wrap (PR #146: single-token commits once the ring wraps). 
+ MODE="FULL (verifier + proposer + f_θ + S5 bounded KV; f_θ runs; long-answer safe)" fi log "mode : $MODE" diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py index 31ce0ec..4348b88 100644 --- a/tests/inference_engine/bridge/test_manifest.py +++ b/tests/inference_engine/bridge/test_manifest.py @@ -84,6 +84,7 @@ def test_allowlist_contains_exactly_the_documented_presets(): "mlx-kakeya-degen-probe", "mlx-kakeya-fused-chat-ftheta", "mlx-kakeya-fused-chat-smoke", + "mlx-kakeya-launcher-full", "mlx-kakeya-launcher-smoke", "mlx-multitenant-pressure", "mlx-upgrade", @@ -106,7 +107,7 @@ def test_harness_presets_validate_reports_others_do_not(): "k3-step2-fused-allmlx", # §4 liveness gate runs on-device for the fused-chat presets too: "mlx-kakeya-fused-chat-smoke", "mlx-kakeya-fused-chat-ftheta", - "mlx-kakeya-launcher-smoke", + "mlx-kakeya-launcher-smoke", "mlx-kakeya-launcher-full", } @@ -166,6 +167,20 @@ def test_mlx_kakeya_launcher_smoke_preset_invokes_launcher(): assert argv[argv.index("--max-new-tokens") + 1] == "64" +def test_mlx_kakeya_launcher_full_preset_runs_full_mode_past_wrap(): + request = parse_manifest(_manifest( + preset="mlx-kakeya-launcher-full", params={"max_new_tokens": "1300"})) + (argv,) = build_commands(request, {}) + assert argv[0] == "bash" + assert argv[1].endswith("run_kakeya_mac.sh") + # FULL mode: NO --fast (f_θ verifier+proposer+f_θ path). + assert "--fast" not in argv + assert "--chat-scripted" in argv + assert "--ignore-turn-stop" in argv + # budget crosses the ~1024 native-cache ring wrap. 
+ assert int(argv[argv.index("--max-new-tokens") + 1]) > 1024 + + def test_mlx_kakeya_fused_chat_ftheta_preset_runs_f_theta_path(): request = parse_manifest(_manifest( preset="mlx-kakeya-fused-chat-ftheta", From 88743e53aa71b87a970569889efe6065e79fc645 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 17 Jun 2026 15:55:38 +0000 Subject: [PATCH 02/12] debug(mlx-fused): instrument codegen markdown-loop degeneration + native-control probe KAKEYA_KDBG-gated per-block logging (sampled/committed ids, cyc_frac/cyc_p, cache offsets) in fused_specdecode_generate, and a turn_compare_fused_vs_native record (first_divergence_idx + both tails) in _run_fused_chat. New bridge preset mlx-kakeya-codegen-degen-probe runs the C-code prompt with --chat-native-ref to decide greedy-pathology vs engine bug. Instrumentation only; reverted after fix. Co-authored-by: FluffyAIcode --- .../backends/mlx/fused_specdecode.py | 78 +++++++++++++++++++ inference_engine/bridge/manifest.py | 27 +++++++ .../research/k3_integrated_niah_eval_mac.py | 38 +++++++++ .../inference_engine/bridge/test_manifest.py | 1 + 4 files changed, 144 insertions(+) diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py index c14cfbc..3f35f02 100644 --- a/inference_engine/backends/mlx/fused_specdecode.py +++ b/inference_engine/backends/mlx/fused_specdecode.py @@ -35,6 +35,68 @@ restored_prefill_cache, ) +# region agent log (fused-codegen-degeneration-2815 probe; strip after fix) +import os as _kdbg_os +import sys as _kdbg_sys +import json as _kdbg_json + +_KDBG = bool(_kdbg_os.environ.get("KAKEYA_KDBG")) + + +def _kdbg(hyp: str, msg: str, **data: Any) -> None: + """Emit one NDJSON probe line to stderr (prefix ``KDBG ``) and, best-effort, + to /opt/cursor/logs/debug.log. 
No-op unless ``KAKEYA_KDBG`` is set, so + production behaviour is unchanged.""" + if not _KDBG: + return + rec = {"hypothesisId": hyp, "location": "fused_specdecode.py", + "message": msg, "data": data} + try: + _kdbg_sys.stderr.write("KDBG " + _kdbg_json.dumps(rec, ensure_ascii=False) + "\n") + _kdbg_sys.stderr.flush() + except Exception: + pass + try: + with open("/opt/cursor/logs/debug.log", "a") as _f: + _f.write(_kdbg_json.dumps(rec) + "\n") + except Exception: + pass + + +def _kdbg_cycle(ids: Sequence[int], window: int = 80) -> Tuple[float, int]: + """Short-unit cycle metric on the tail of ``ids``: returns + ``(cyc_frac, cyc_p)`` where ``cyc_p`` is the period (1..window//3) whose + back-to-back repetition covers the largest fraction ``cyc_frac`` of the + trailing ``window`` tokens. ~1.0 => the tail is a tight repeating loop.""" + w = list(ids[-window:]) + n = len(w) + if n < 6: + return 0.0, 0 + best_frac, best_p = 0.0, 0 + for p in range(1, n // 3 + 1): + run, i = 0, n - 1 + while i - p >= 0 and w[i] == w[i - p]: + run += 1 + i -= 1 + if run > 0: + frac = (run + p) / n + if frac > best_frac: + best_frac, best_p = frac, p + return round(best_frac, 3), best_p + + +def _kdbg_cache_offsets(cache: Any) -> Tuple[Optional[int], Optional[int]]: + """(first full-attn KVCache offset, first sliding RotatingKVCache offset).""" + off_full = off_rot = None + for c in (cache or []): + nm = type(c).__name__ + if off_rot is None and "Rotating" in nm: + off_rot = int(getattr(c, "offset", -1)) + elif off_full is None and "Rotating" not in nm: + off_full = int(getattr(c, "offset", -1)) + return off_full, off_rot +# endregion + # --------------------------------------------------------------------------- # # Component A: capture verifier aux-layer hidden states (no transformers @@ -772,6 +834,22 @@ def fused_specdecode_generate( commit = candidate[:accepted] + [correction] generated += commit accepts.append(accepted) + # region agent log (fused-codegen-degeneration-2815 probe) + 
if _KDBG: + off_full, off_rot = _kdbg_cache_offsets(getattr(adapter, "_cache", None)) + cyc_frac, cyc_p = _kdbg_cycle(generated) + # H-D: cache.offset must track committed length (past_len). + # off_rot lags by the sliding window (bounded), off_full == past_len. + _kdbg("AD", "block", + blk=len(accepts) - 1, base=cstart, + past_len=int(adapter._past_len), gen=len(generated), + off_full=off_full, off_rot=off_rot, + bonus=int(bonus), cand=[int(x) for x in candidate], + n_cand=len(candidate), accepted=int(accepted), + commit=[int(x) for x in commit], + next_argmax=int(argmax_fn(adapter.next_token_logits)), + cyc_frac=cyc_frac, cyc_p=cyc_p) + # endregion if any(t in eos for t in commit): break if (allow_greedy_fallback and len(accepts) >= 2 diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 8b67da7..1338d47 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -771,6 +771,33 @@ def _harness_preset( params={"max_new_tokens": ("int:max_new_tokens", "64")}, validate_reports=True, # §4 liveness gate on-device ), + Preset( + name="mlx-kakeya-codegen-degen-probe", + description="DEBUG: full f_θ fused engine on a CODE prompt (write PoW " + "in C) that triggers an early high-acceptance markdown-" + "marker loop (**/.2/* wall), with KAKEYA_KDBG per-block " + "logging + native-greedy control (--chat-native-ref). 
The " + "decisive signal is fused-vs-native divergence: if native " + "also loops, it is greedy pathology the engine must guard.", + command_templates=( + ( + "env", "KAKEYA_KDBG=1", + "python3", "scripts/research/k3_integrated_niah_eval_mac.py", + "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}", + "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}", + "--f-theta-dir", "${ENV:KAKEYA_MAC_FTHETA_DIR}", + "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta", + "--sink-size", "4", "--window-size", "64", "--block-size", "4", + "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop", + "--chat", "--chat-native-ref", + "--chat-scripted", "实现一个PoW的代码,用c语言完成", + "--output", "results/research/codegen_degen_2815_chat.json", + ), + ), + timeout_minutes=90, + params={"max_new_tokens": ("int:max_new_tokens", "800")}, + validate_reports=False, + ), Preset( name="mlx-kakeya-degen-probe", description="Long-decode regression probe: full f_θ fused engine on a " diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py index ecde21d..a628eb2 100644 --- a/scripts/research/k3_integrated_niah_eval_mac.py +++ b/scripts/research/k3_integrated_niah_eval_mac.py @@ -851,6 +851,44 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]: res["native_ref_tokens"] = len(nref_tokens) res["resident_kv_bytes"] = int( sum(int(getattr(c, "nbytes", 0)) for c in (adapter._cache or []))) + # region agent log (fused-codegen-degeneration-2815 probe) + import os as _kos + if _kos.environ.get("KAKEYA_KDBG"): + ftoks = [int(t) for t in res.get("tokens", [])] + ntoks = [int(t) for t in nref_tokens] + div = None + for j, (a, b) in enumerate(zip(ftoks, ntoks)): + if a != b: + div = j + break + if div is None: + div = min(len(ftoks), len(ntoks)) + + def _dec(seq): + try: + return tokenizer.decode(seq, skip_special_tokens=True) + except TypeError: + return tokenizer.decode(seq) + rec = { + "hypothesisId": "AC", + "message": 
"turn_compare_fused_vs_native", + "data": { + "fused_n": len(ftoks), "native_n": len(ntoks), + "first_divergence_idx": div, + "fused_div_ctx": ftoks[max(0, div - 8):div + 16], + "native_div_ctx": ntoks[max(0, div - 8):div + 16], + "fused_div_text": _dec(ftoks[max(0, div - 8):div + 16]), + "native_div_text": _dec(ntoks[max(0, div - 8):div + 16]), + "fused_tail": ftoks[-48:], + "native_tail": ntoks[-48:], + "fused_tail_text": _dec(ftoks[-48:]), + "native_tail_text": _dec(ntoks[-48:]), + }, + } + sys.stderr.write( + "KDBG " + json.dumps(rec, ensure_ascii=False) + "\n") + sys.stderr.flush() + # endregion return res print(f"[chat] FULL fused engine: verifier={args.verifier_path} " diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py index 31ce0ec..cce242d 100644 --- a/tests/inference_engine/bridge/test_manifest.py +++ b/tests/inference_engine/bridge/test_manifest.py @@ -81,6 +81,7 @@ def test_allowlist_contains_exactly_the_documented_presets(): "mlx-batched-pad-decode", "mlx-env-probe", "mlx-kakeya-chat-smoke", + "mlx-kakeya-codegen-degen-probe", "mlx-kakeya-degen-probe", "mlx-kakeya-fused-chat-ftheta", "mlx-kakeya-fused-chat-smoke", From f636370349c82255d9a3d8debfd23e21424b69b7 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 17 Jun 2026 16:21:12 +0000 Subject: [PATCH 03/12] debug(mlx-fused): add multi-turn prefill-state probe (ring-wrap-at-prefill) + multi-turn degen preset KAKEYA_KDBG-gated prefill_state_fused / prefill_state_native records in _run_fused_chat: per-turn prompt_len, evicted_count, rot/full cache offsets, any_wrapped, would_wrap_block0, plus a turn index on turn_compare. Repoints mlx-kakeya-codegen-degen-probe to the multi-turn repro (turn-1 PoW explanation pushes the turn-2 code prompt's prefill past the sliding window) at 1200 tok. Instrumentation only; reverted after fix. 
Co-authored-by: FluffyAIcode --- inference_engine/bridge/manifest.py | 27 ++++--- .../research/k3_integrated_niah_eval_mac.py | 75 +++++++++++++++++++ 2 files changed, 93 insertions(+), 9 deletions(-) diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 1338d47..9e442d2 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -773,12 +773,20 @@ def _harness_preset( ), Preset( name="mlx-kakeya-codegen-degen-probe", - description="DEBUG: full f_θ fused engine on a CODE prompt (write PoW " - "in C) that triggers an early high-acceptance markdown-" - "marker loop (**/.2/* wall), with KAKEYA_KDBG per-block " - "logging + native-greedy control (--chat-native-ref). The " - "decisive signal is fused-vs-native divergence: if native " - "also loops, it is greedy pathology the engine must guard.", + description="DEBUG: full f_θ fused engine on a MULTI-TURN chat whose " + "turn-1 PoW explanation makes the turn-2 code prompt's " + "prefill exceed the sliding window / native RotatingKVCache " + "(ring pre-wrapped before decode). Single-turn ruled OUT an " + "engine bug (token-identical to native); this probe targets " + "the long-prompt prefill regime. KAKEYA_KDBG logs per-turn " + "prefill state (prompt_len, evicted_count, rot/full cache " + "offsets, any_wrapped, would_wrap_block0) + per-block offsets " + "+ a turn_compare_fused_vs_native record (first_divergence_idx " + "+ tails). 
Native-greedy control (--chat-native-ref) decodes " + "the SAME per-turn prompt (history-inclusive) so the decisive " + "signal stays fused-vs-native: native coherent + fused garbled " + "from turn-2 start ⇒ long-prompt prefill corrupts logits " + "(engine); both loop identically ⇒ greedy pathology.", command_templates=( ( "env", "KAKEYA_KDBG=1", @@ -790,12 +798,13 @@ def _harness_preset( "--sink-size", "4", "--window-size", "64", "--block-size", "4", "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop", "--chat", "--chat-native-ref", - "--chat-scripted", "实现一个PoW的代码,用c语言完成", + "--chat-scripted", + "请详细解释POW的工作原理||实现一个PoW的代码,用c语言完成", "--output", "results/research/codegen_degen_2815_chat.json", ), ), - timeout_minutes=90, - params={"max_new_tokens": ("int:max_new_tokens", "800")}, + timeout_minutes=120, + params={"max_new_tokens": ("int:max_new_tokens", "1200")}, validate_reports=False, ), Preset( diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py index a628eb2..646c8c1 100644 --- a/scripts/research/k3_integrated_niah_eval_mac.py +++ b/scripts/research/k3_integrated_niah_eval_mac.py @@ -221,6 +221,7 @@ def main() -> int: MLXRestoredIncrementalVerifier, capture_aux_hidden, make_bridge_embed_lm_head, fused_specdecode_generate, fused_specdecode_generate_mlx, fused_specdecode_generate_mlx_trim, + _sliding_ring_would_wrap, # region agent log (fused-codegen-degeneration-2815) ) from inference_engine.v04.kv_compressor import make_default_compressor from inference_engine.bench.k3_report_gate import ( @@ -769,6 +770,54 @@ def _encode_chat(history: List[Dict[str, str]]) -> List[int]: history, add_generation_prompt=True) return list(cids.tolist() if hasattr(cids, "tolist") else cids) + # region agent log (fused-codegen-degeneration-2815 prefill probe) + import os as _kos_chat + _KDBG_CHAT = bool(_kos_chat.environ.get("KAKEYA_KDBG")) + + def _kdbg_emit(rec: Dict[str, Any]) -> None: + try: + 
sys.stderr.write("KDBG " + json.dumps(rec, ensure_ascii=False) + "\n") + sys.stderr.flush() + except Exception: + pass + try: + with open("/opt/cursor/logs/debug.log", "a") as _f: + _f.write(json.dumps(rec) + "\n") + except Exception: + pass + + def _kdbg_cache_summary(cache: Any) -> Dict[str, Any]: + """rot/full offset+max_size rollup + wrap/trimmable flags. The + decisive prefill signal: is the sliding RotatingKVCache already + wrapped (off>=max_size) BEFORE decode starts, and does full-attn + off == prompt_len? A pre-wrapped ring at prefill means the very + first speculative block's trim is refused (offset desync).""" + rot_off = rot_ms = full_off = None + any_wrapped = False + all_trimmable = True + n = 0 + for c in (cache or []): + n += 1 + nm = type(c).__name__ + off = int(getattr(c, "offset", -1)) + ms = getattr(c, "max_size", None) + ms = int(ms) if ms is not None else None + is_rot = "Rotating" in nm + if is_rot and ms is not None and off >= ms: + any_wrapped = True + trim_fn = getattr(c, "is_trimmable", None) + trim = bool(trim_fn()) if callable(trim_fn) else None + if trim is False: + all_trimmable = False + if is_rot and rot_off is None: + rot_off, rot_ms = off, ms + if (not is_rot) and full_off is None: + full_off = off + return {"n_layers": n, "rot_off": rot_off, "rot_ms": rot_ms, + "full_off": full_off, "any_wrapped": any_wrapped, + "all_trimmable": all_trimmable} + # endregion + def _gen_turn(pid: List[int]) -> Dict[str, Any]: # Opt-in A/B control (--chat-native-ref): a plain NATIVE greedy # AR decode of the SAME prompt for --max-new-tokens. 
Captured as @@ -779,6 +828,14 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]: nref_tokens: List[int] = [] if args.chat_native_ref: nref_cache, nref_logits = native_prefill(list(pid)) + # region agent log (fused-codegen-degeneration-2815 prefill probe) + if _KDBG_CHAT: + _turn = sum(1 for h in history if h.get("role") == "user") + _kdbg_emit({"hypothesisId": "AE", + "message": "prefill_state_native", + "data": {"turn": _turn, "prompt_len": len(pid), + "cache": _kdbg_cache_summary(nref_cache)}}) + # endregion while len(nref_tokens) < args.max_new_tokens: tok = int(mx.argmax(nref_logits).item()) nref_tokens.append(tok) @@ -809,6 +866,23 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]: restored_v_per_layer=_pad(rv, tsrc, T), evicted_positions=evicted, prefill_chunk_size=args.prefill_chunk_size, full_kv=args.cuda_trim) + # region agent log (fused-codegen-degeneration-2815 prefill probe) + if _KDBG_CHAT: + _turn = sum(1 for h in history if h.get("role") == "user") + _kdbg_emit({"hypothesisId": "AE", + "message": "prefill_state_fused", + "data": {"turn": _turn, "prompt_len": T, + "evicted_count": len(evicted), + "block_size": int(args.block_size), + "would_wrap_block0": bool( + _sliding_ring_would_wrap( + getattr(adapter, "_cache", None), + int(args.block_size))), + "past_len": int(adapter._past_len), + "f_theta_ran": bool(f_theta_ran), + "cache": _kdbg_cache_summary( + getattr(adapter, "_cache", None))}}) + # endregion t0 = time.perf_counter() if mlx_drafter is not None and args.cuda_trim: res = fused_specdecode_generate_mlx_trim( @@ -873,6 +947,7 @@ def _dec(seq): "hypothesisId": "AC", "message": "turn_compare_fused_vs_native", "data": { + "turn": sum(1 for h in history if h.get("role") == "user"), "fused_n": len(ftoks), "native_n": len(ntoks), "first_divergence_idx": div, "fused_div_ctx": ftoks[max(0, div - 8):div + 16], From 12fda6009c6da5095039ac6208d6bdf5047a1ddb Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 17 Jun 2026 17:01:30 +0000 Subject: [PATCH 
04/12] debug(probe): light single-turn long-prompt repro (ring pre-wrapped at prefill) Multi-turn+native at 1200x2 OOM'd the Mac runner. Per debug analysis, the cheapest test of H-C' (long-prompt prefill corrupts logits) vs H-A' (bounded- greedy pathology) is a single-turn LONG prompt that wraps the ring AT prefill (would_wrap_block0) with a tiny 192-tok budget. Add --chat-scripted-file so the ~2k-char context is a committed fixture (pow_codegen_longprompt.txt) instead of a giant manifest argv; repoint mlx-kakeya-codegen-degen-probe to it. Co-authored-by: FluffyAIcode --- inference_engine/bridge/manifest.py | 37 ++++---- .../research/k3_integrated_niah_eval_mac.py | 13 ++- scripts/research/pow_codegen_longprompt.txt | 85 +++++++++++++++++++ 3 files changed, 114 insertions(+), 21 deletions(-) create mode 100644 scripts/research/pow_codegen_longprompt.txt diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 9e442d2..05237d2 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -773,20 +773,19 @@ def _harness_preset( ), Preset( name="mlx-kakeya-codegen-degen-probe", - description="DEBUG: full f_θ fused engine on a MULTI-TURN chat whose " - "turn-1 PoW explanation makes the turn-2 code prompt's " - "prefill exceed the sliding window / native RotatingKVCache " - "(ring pre-wrapped before decode). Single-turn ruled OUT an " - "engine bug (token-identical to native); this probe targets " - "the long-prompt prefill regime. KAKEYA_KDBG logs per-turn " - "prefill state (prompt_len, evicted_count, rot/full cache " - "offsets, any_wrapped, would_wrap_block0) + per-block offsets " - "+ a turn_compare_fused_vs_native record (first_divergence_idx " - "+ tails). 
Native-greedy control (--chat-native-ref) decodes " - "the SAME per-turn prompt (history-inclusive) so the decisive " - "signal stays fused-vs-native: native coherent + fused garbled " - "from turn-2 start ⇒ long-prompt prefill corrupts logits " - "(engine); both loop identically ⇒ greedy pathology.", + description="DEBUG: full f_θ fused engine on a LONG single-turn prompt " + "(~2k-char PoW explanation + a 'write C code' request, from " + "the committed fixture pow_codegen_longprompt.txt) so the " + "native RotatingKVCache ring is ALREADY WRAPPED at prefill " + "(would_wrap_block0). Short single-turn prompts were proven " + "token-identical to native & coherent; this isolates the " + "long-prompt-prefill regime cheaply (tiny 192-tok budget). " + "KAKEYA_KDBG logs prefill state (prompt_len, any_wrapped, " + "would_wrap_block0, rot/full offsets) + per-block offsets + " + "turn_compare_fused_vs_native. Native-greedy control " + "(--chat-native-ref): native coherent + fused garbled ⇒ " + "long-prompt prefill corrupts logits (engine bug); both " + "degenerate ⇒ bounded-greedy pathology the engine must guard.", command_templates=( ( "env", "KAKEYA_KDBG=1", @@ -798,13 +797,13 @@ def _harness_preset( "--sink-size", "4", "--window-size", "64", "--block-size", "4", "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop", "--chat", "--chat-native-ref", - "--chat-scripted", - "请详细解释POW的工作原理||实现一个PoW的代码,用c语言完成", - "--output", "results/research/codegen_degen_2815_chat.json", + "--chat-scripted-file", + "scripts/research/pow_codegen_longprompt.txt", + "--output", "results/research/codegen_degen_2815_longprompt.json", ), ), - timeout_minutes=120, - params={"max_new_tokens": ("int:max_new_tokens", "1200")}, + timeout_minutes=90, + params={"max_new_tokens": ("int:max_new_tokens", "192")}, validate_reports=False, ), Preset( diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py index 646c8c1..ed354be 100644 --- 
a/scripts/research/k3_integrated_niah_eval_mac.py +++ b/scripts/research/k3_integrated_niah_eval_mac.py @@ -180,6 +180,11 @@ def parse_args() -> argparse.Namespace: ap.add_argument("--chat-scripted", default=None, help="Non-interactive chat: '||'-separated user turns " "(for Mac-bridge verification); writes a transcript.") + ap.add_argument("--chat-scripted-file", default=None, + help="Like --chat-scripted but reads the (possibly long, " + "'||'-separated) scripted prompt from a UTF-8 file. Lets " + "a long context be a committed fixture instead of a giant " + "manifest argv. Overrides --chat-scripted when set.") ap.add_argument("--chat-native-ref", action="store_true", help="DIAGNOSTIC opt-in: before each chat turn, also run a " "plain NATIVE greedy AR decode of the SAME prompt for " @@ -973,8 +978,12 @@ def _dec(seq): file=sys.stderr, flush=True) history: List[Dict[str, str]] = [] - if args.chat_scripted is not None: - turns = [t for t in args.chat_scripted.split("||") if t.strip()] + scripted = args.chat_scripted + if args.chat_scripted_file is not None: + with open(args.chat_scripted_file, encoding="utf-8") as _f: + scripted = _f.read() + if scripted is not None: + turns = [t for t in scripted.split("||") if t.strip()] transcript = [] for u in turns: history.append({"role": "user", "content": u}) diff --git a/scripts/research/pow_codegen_longprompt.txt b/scripts/research/pow_codegen_longprompt.txt new file mode 100644 index 0000000..fa1ae08 --- /dev/null +++ b/scripts/research/pow_codegen_longprompt.txt @@ -0,0 +1,85 @@ +**PoW (Proof of Work,工作量证明)** 是区块链技术中最核心的共识机制之一。它的核心目的是:**在没有中心化机构(如银行)的情况下,让分布在世界各地的计算机能够达成一致,决定谁有权记账,并防止有人通过伪造数据来欺骗网络。** + +为了深入理解,我们可以从“核心逻辑”、“工作流程”、“数学原理”和“经济博弈”四个维度来详细解释。 + +--- + +### 1. 核心逻辑:用“算力”换取“信任” + +在去中心化网络中,大家面临一个问题:**如果每个人都说自己记了一笔账,听谁的?** + +PoW 的逻辑是:**谁付出了巨大的计算资源(工作量),谁就有权提议下一块账本。** +* **工作量(Work)**:指消耗的计算资源和时间。 +* **证明(Proof)**:当计算完成后,结果是公开且易于验证的。 + +这种机制确保了:**作恶的成本极高,而维护网络的收益(奖励)很诱人。** + +--- + +### 2. 
工作流程:从“挖矿”到“验证” + +假设我们正在运行一个像比特币这样的 PoW 网络,流程如下: + +#### 第一步:收集交易 +矿工(节点)从网络中收集待处理的交易,并将它们打包成一个“候选区块”。 + +#### 第二步:寻找“随机数”(核心环节) +为了让区块生效,矿工必须解决一个数学难题。这个难题通常是: +> “找到一个数字(称为 **Nonce**),使得:**区块头部数据的哈希值(Hash) < 目标难度值(Target)**。” + +* **哈希函数(Hash Function)**:像是一个“数字粉碎机”。你输入任何内容,它都会输出一串固定长度的乱码。只要输入变一点点,输出就会天差地别。 +* **不可逆性**:你无法通过结果反推输入。 + +#### 第三步:竞争与“挖矿” +矿工们开始疯狂尝试。他们不断改变区块里的 `Nonce` 值,重新计算哈希值。 +* 矿工 A 尝试:`Hash(数据 + 1) = 0xabc...` (不符合要求) +* 矿工 B 尝试:`Hash(数据 + 2) = 0xdef...` (不符合要求) +* ... +* 矿工 C 运气好/算力强:`Hash(数据 + 999) = 0x000...` (**符合要求!**) + +一旦有人找到了满足条件的哈希值,他就“挖到了矿”。 + +#### 第四步:广播与验证 +获胜的矿工将新区块广播给全网。其他节点收到后,只需进行一次简单的计算(把 Nonce 代入哈希函数),发现结果确实符合要求,就会接受这个区块,并更新自己的账本。 + +--- + +### 3. 数学原理:为什么它是安全的? + +#### A. 难度调整机制 (Difficulty Adjustment) +如果每个人都买更快的显卡,挖矿速度会变快,导致区块产生太频繁。 +为了保持稳定的出块时间(例如比特币约10分钟一个块),系统会**自动调整难度**: +* 如果出块太快 $\rightarrow$ 提高目标值(让要求的哈绪值前导零更多,变得更难)。 +* 如果出块太慢 $\rightarrow$ 降低难度。 + +#### B. 概率与公平性 +由于哈希值是随机分布的,寻找答案的过程就像“在茫茫大海里捞针”。 +* **算力越高**,意味着你每秒尝试的次数越多,捞到针的概率就越大。 +* 这保证了在宏观上,算力占比与奖励分配是公平的。 + +#### C. 抵抗“双花”与篡改 +如果有人想修改 10 分钟前的一个交易,他必须重新计算那个区块的哈希,以及**之后所有区块**的哈希。 +由于后面的区块都包含了前一个区块的哈希值(形成链式结构),修改历史意味着要重算后面所有的工作量。除非攻击者的算力超过全网总和的 51%,否则这在经济上是不可能的。 + +--- + +### 4. 总结:PoW 的优缺点 + +#### 优点: +1. **极高的安全性**:通过物理世界的能源消耗(电力)为数字世界筑起防线。 +2. **去中心化**:任何人只要有设备就可以加入,不需要许可。 +3. **公平性**:基于数学概率,不存在人为干预。 + +#### 缺点: +1. **能源消耗大**:为了维持安全,全球范围内的矿机都在消耗大量电力(这是最受争议的一点)。 +2. **扩展性差(TPS低)**:为了保证全球同步,出块速度不能太快,导致处理交易的速度受限。 +3. 
**算力集中风险**:如果出现大规模矿池,可能导致权力向少数人手中集中。 + +### 通俗比喻总结 +想象一个**“数字猜数字游戏”**: +全网的人都在玩一个游戏,目标是猜出一个符合特定规则的数字。 +* **挖矿**就是不停地尝试不同的数字。 +* **难度**就是规则越来越严苛(比如要求数字必须以 10 个零开头)。 +* **验证**就是别人看到你猜中了,只需要看一眼你的数字是否符合规则,瞬间就能确认你没撒谎。 + +基于以上说明,用C语言写一个简短的PoW示例(只写核心循环,不超过30行)。 \ No newline at end of file From c6a699c98d48e38e7a04754c752b1e60519f3791 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 17 Jun 2026 17:30:25 +0000 Subject: [PATCH 05/12] fix(mlx-fused): runaway-loop guard stops greedy markdown-marker collapse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Repro evidence: single-turn fused decode is TOKEN-IDENTICAL to native greedy (first_divergence_idx=None) and coherent through 1200 tokens, so the engine is faithful — the user's '由于...'/'**/.2/*' collapse is greedy-decoding pathology on code/markdown-heavy prompts that the fused path (pure argmax, unlike chat_mlx_kakeya.py) had no mitigation for. Once a loop starts the drafter trivially predicts the repeats and the greedy verifier accepts them (high accept_len), so it walls indefinitely. Fix: _trailing_runaway_drop detects a 1..8-token unit repeated >=12x at the tail (conservative; never trims legit lists/enumerations/code) and the three fused loops stop generation, keeping a short clean tail instead of an unbounded wall. Default ON (stop_on_runaway=True); --fused-no-loop-guard disables it for degeneration probes. Adds stopped_on_runaway to the result. Also: --chat-scripted-file (long prompt as committed fixture) + repoint the codegen-degen probe to a single-turn long prompt that wraps the ring at prefill (cheap; the multi-turn+native variant OOM'd the Mac runner). KAKEYA_KDBG probe instrumentation retained (inert unless the env var is set) for the pending on-device H-C'-vs-H-A' confirmation. 
Co-authored-by: FluffyAIcode --- .../backends/mlx/fused_specdecode.py | 72 +++++++++++++++++++ inference_engine/bridge/manifest.py | 2 +- .../research/k3_integrated_niah_eval_mac.py | 13 +++- tests/backends/mlx/test_fused_specdecode.py | 62 ++++++++++++++++ 4 files changed, 145 insertions(+), 4 deletions(-) diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py index 3f35f02..9fafdd7 100644 --- a/inference_engine/backends/mlx/fused_specdecode.py +++ b/inference_engine/backends/mlx/fused_specdecode.py @@ -436,6 +436,7 @@ def fused_specdecode_generate_mlx_trim( block_size: int, eos_ids: Sequence[int] = (), single_fused: bool = False, + stop_on_runaway: bool = True, ) -> Dict[str, Any]: """CUDA-parity fused spec decode: KEEP accepted K/V, TRIM only the rejected tail (no rollback, no carry re-forward). Requires the adapter to be @@ -461,6 +462,7 @@ def fused_specdecode_generate_mlx_trim( generated: List[int] = [] accepts: List[int] = [] block_evals: List[float] = [] + stopped_on_runaway = False ctx_len = C try: while len(generated) < gen_tokens: @@ -522,6 +524,12 @@ def fused_specdecode_generate_mlx_trim( timing["extend_s"] += time.perf_counter() - t_extend if any(t in eos for t in commit): break + if stop_on_runaway: + drop = _trailing_runaway_drop(generated) + if drop > 0: + del generated[len(generated) - drop:] + stopped_on_runaway = True + break finally: adapter._capture_aux = False generated = generated[:gen_tokens] @@ -531,6 +539,7 @@ def fused_specdecode_generate_mlx_trim( "mean_accept_len": (round(sum(accepts) / len(accepts), 3) if accepts else 0.0), "decode_tokens": len(generated), + "stopped_on_runaway": stopped_on_runaway, "loop": ("mlx_trim_single_fused_probe" if single_fused else "mlx_trim_keep_accepted_cuda_parity"), "single_fused": bool(single_fused), @@ -552,6 +561,7 @@ def fused_specdecode_generate_mlx( gen_tokens: int, block_size: int, eos_ids: Sequence[int] = (), + stop_on_runaway: bool = 
True, ) -> Dict[str, Any]: """All-MLX fused spec decode with ONE host sync per block. @@ -593,6 +603,7 @@ def fused_specdecode_generate_mlx( generated: List[int] = [] accepts: List[int] = [] + stopped_on_runaway = False # Rollback-carry state: rejected blocks roll the WHOLE forward back # (rollback_block — see its docstring for why trim is unsound on the # wrapped sliding ring) and carry the stream-committed-but-not-cached @@ -676,6 +687,12 @@ def fused_specdecode_generate_mlx( timing["extend_s"] += time.perf_counter() - t_extend if any(t in eos for t in commit): break + if stop_on_runaway: + drop = _trailing_runaway_drop(generated) + if drop > 0: + del generated[len(generated) - drop:] + stopped_on_runaway = True + break finally: adapter._capture_aux = False generated = generated[:gen_tokens] @@ -685,6 +702,7 @@ def fused_specdecode_generate_mlx( "mean_accept_len": (round(sum(accepts) / len(accepts), 3) if accepts else 0.0), "decode_tokens": len(generated), + "stopped_on_runaway": stopped_on_runaway, "loop": "mlx_rollback_carry_v3", "time_breakdown_s": {k: round(v, 3) for k, v in timing.items()}, } @@ -717,6 +735,40 @@ def _sliding_ring_would_wrap(cache: Any, n_new: int) -> bool: return False +def _trailing_runaway_drop( + ids: Sequence[int], + *, + max_period: int = 8, + min_reps: int = 12, + keep_reps: int = 3, +) -> int: + """Return how many TRAILING tokens to drop if ``ids`` ends in a runaway + short-period loop, else 0. + + A runaway loop is a unit of ``1..max_period`` tokens repeated ``>= min_reps`` + times back-to-back at the tail (e.g. the ``**``/``.2``/``*`` markdown-marker + collapse greedy decoding falls into on code prompts). When found, we keep + ``keep_reps`` instances and drop the rest, so callers can stop generation + with a clean tail instead of emitting an unbounded wall of repeats. 
+ + Deliberately CONSERVATIVE (>= 12 back-to-back repeats of a <= 8-token unit) + so legitimately repetitive text — numbered lists, ``矿工 A/B/C`` enumerations, + structured code — is never trimmed. Returns 0 when no runaway is present.""" + n = len(ids) + for p in range(1, max_period + 1): + if n < p * min_reps: + continue + unit = list(ids[n - p:]) + reps = 0 + i = n + while i - p >= 0 and list(ids[i - p:i]) == unit: + reps += 1 + i -= p + if reps >= min_reps: + return max((reps - keep_reps) * p, 0) + return 0 + + # --------------------------------------------------------------------------- # # The fused spec-decode loop (control flow; MLX/torch ops via injected fns). # --------------------------------------------------------------------------- # @@ -734,6 +786,7 @@ def fused_specdecode_generate( arange_fn: Callable[[int, int], Any], cat_aux_fn: Callable[[Sequence[Any]], Any], allow_greedy_fallback: bool = True, + stop_on_runaway: bool = True, ) -> Dict[str, Any]: """Run the fused engine. ``adapter`` must already be prefilled. Per block: draft from the cached drafter context (B), verify+capture-aux incrementally @@ -762,6 +815,7 @@ def fused_specdecode_generate( generated: List[int] = [] accepts: List[int] = [] fallback_to_greedy = False + stopped_on_runaway = False try: while len(generated) < gen_tokens: L = min(block_size, gen_tokens - len(generated)) @@ -852,6 +906,17 @@ def fused_specdecode_generate( # endregion if any(t in eos for t in commit): break + # Greedy decoding can collapse into a runaway short-period loop (e.g. + # the **/.2/* markdown-marker wall on code prompts); the drafter then + # trivially predicts the repeats and the greedy verifier accepts them, + # so acceptance stays HIGH while the output is garbage. Stop on it + # instead of emitting an unbounded wall (keeps a short clean tail). 
+ if stop_on_runaway: + drop = _trailing_runaway_drop(generated) + if drop > 0: + del generated[len(generated) - drop:] + stopped_on_runaway = True + break if (allow_greedy_fallback and len(accepts) >= 2 and (sum(accepts) / len(accepts)) < 1.5): fallback_to_greedy = True @@ -869,6 +934,12 @@ def fused_specdecode_generate( generated.append(tok) if tok in eos: break + if stop_on_runaway: + drop = _trailing_runaway_drop(generated) + if drop > 0: + del generated[len(generated) - drop:] + stopped_on_runaway = True + break timing["fallback_greedy_s"] += time.perf_counter() - t_fb finally: adapter._capture_aux = False @@ -879,5 +950,6 @@ def fused_specdecode_generate( "mean_accept_len": (round(sum(accepts) / len(accepts), 3) if accepts else 0.0), "decode_tokens": len(generated), + "stopped_on_runaway": stopped_on_runaway, "time_breakdown_s": {k: round(v, 3) for k, v in timing.items()}, } diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 05237d2..5977c77 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -796,7 +796,7 @@ def _harness_preset( "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta", "--sink-size", "4", "--window-size", "64", "--block-size", "4", "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop", - "--chat", "--chat-native-ref", + "--chat", "--chat-native-ref", "--fused-no-loop-guard", "--chat-scripted-file", "scripts/research/pow_codegen_longprompt.txt", "--output", "results/research/codegen_degen_2815_longprompt.json", diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py index ed354be..345de2b 100644 --- a/scripts/research/k3_integrated_niah_eval_mac.py +++ b/scripts/research/k3_integrated_niah_eval_mac.py @@ -185,6 +185,10 @@ def parse_args() -> argparse.Namespace: "'||'-separated) scripted prompt from a UTF-8 file. 
Lets " "a long context be a committed fixture instead of a giant " "manifest argv. Overrides --chat-scripted when set.") + ap.add_argument("--fused-no-loop-guard", action="store_true", + help="DIAGNOSTIC: disable the fused engine's runaway-loop stop " + "(default ON) so a degeneration probe can observe the full " + "collapse. Production chat keeps the guard enabled.") ap.add_argument("--chat-native-ref", action="store_true", help="DIAGNOSTIC opt-in: before each chat turn, also run a " "plain NATIVE greedy AR decode of the SAME prompt for " @@ -889,25 +893,28 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]: getattr(adapter, "_cache", None))}}) # endregion t0 = time.perf_counter() + _guard = not args.fused_no_loop_guard if mlx_drafter is not None and args.cuda_trim: res = fused_specdecode_generate_mlx_trim( adapter, active_drafter, aux_prompt=aux_prompt, embed_fn=embed_fn, lm_head_fn=lm_head_fn, gen_tokens=args.max_new_tokens, block_size=args.block_size, - eos_ids=chat_eos, single_fused=args.single_fused) + eos_ids=chat_eos, single_fused=args.single_fused, + stop_on_runaway=_guard) elif mlx_drafter is not None: res = fused_specdecode_generate_mlx( adapter, active_drafter, aux_prompt=aux_prompt, embed_fn=embed_fn, lm_head_fn=lm_head_fn, gen_tokens=args.max_new_tokens, block_size=args.block_size, - eos_ids=chat_eos) + eos_ids=chat_eos, stop_on_runaway=_guard) else: res = fused_specdecode_generate( adapter, active_drafter, aux_prompt=aux_prompt, embed_fn=embed_fn, lm_head_fn=lm_head_fn, gen_tokens=args.max_new_tokens, block_size=args.block_size, eos_ids=chat_eos, argmax_fn=argmax_fn, arange_fn=arange_fn, - cat_aux_fn=cat_aux_fn, allow_greedy_fallback=False) + cat_aux_fn=cat_aux_fn, allow_greedy_fallback=False, + stop_on_runaway=_guard) res["decode_s"] = round(time.perf_counter() - t0, 3) res["f_theta_ran"] = f_theta_ran res["f_theta_layers"] = sorted(rk.keys()) if rk else [] diff --git a/tests/backends/mlx/test_fused_specdecode.py 
b/tests/backends/mlx/test_fused_specdecode.py index ddf099b..f9c37a4 100644 --- a/tests/backends/mlx/test_fused_specdecode.py +++ b/tests/backends/mlx/test_fused_specdecode.py @@ -170,6 +170,68 @@ def __init__(self, offset): self.max_size = None +def test_trailing_runaway_drop_detects_and_trims_loops(): + # 1-token unit repeated 20x -> drop all but keep_reps (default 3). + ids = [1, 2, 3] + [9] * 20 + drop = fsd._trailing_runaway_drop(ids) + assert drop == 17 # 20 - 3 kept + # multi-token unit (period 3) repeated 12x -> drop (12-3)*3 = 27. + ids2 = [5, 6] + [7, 8, 9] * 12 + assert fsd._trailing_runaway_drop(ids2) == 27 + + +def test_trailing_runaway_drop_is_conservative(): + # fewer than min_reps (12) back-to-back -> no trim. + assert fsd._trailing_runaway_drop([9] * 11) == 0 + # legitimate non-repeating tail -> no trim. + assert fsd._trailing_runaway_drop(list(range(40))) == 0 + # a period that does not tile the very tail -> no trim. + assert fsd._trailing_runaway_drop([1, 2] * 10 + [3]) == 0 + # empty / short -> no trim. + assert fsd._trailing_runaway_drop([]) == 0 + + +def test_fused_loop_stops_on_runaway_repeat(): + # Drafter keeps proposing the same token; the fake verifier's "+1" truth is + # defeated by making the bonus re-loop: we feed a drafter that always drafts + # the marker token and a verifier that greedily agrees, so the committed + # stream becomes a runaway single-token loop the guard must cut. + class _LoopAdapter(_FakeAdapter): + def forward_block(self, candidate): + # verifier greedily predicts the SAME marker token (42) forever. 
+ if self._capture_aux: + L = len(candidate) + self._last_aux = [torch.zeros(L, self.hidden)] + return [42 for _ in candidate] + + adapter = _LoopAdapter(prompt_len=5, first_token=42) + drafter = _FakeDrafter(drafts=[[42, 42, 42]] * 60) + res = fsd.fused_specdecode_generate( + adapter, drafter, gen_tokens=400, block_size=4, eos_ids=(), + allow_greedy_fallback=False, **_loop_kwargs(drafter)) + assert res["stopped_on_runaway"] is True + # stopped early with a short clean tail, nowhere near the 400 budget. + assert len(res["tokens"]) < 40 + assert set(res["tokens"]) == {42} + + +def test_fused_loop_runaway_guard_can_be_disabled(): + class _LoopAdapter(_FakeAdapter): + def forward_block(self, candidate): + if self._capture_aux: + self._last_aux = [torch.zeros(len(candidate), self.hidden)] + return [42 for _ in candidate] + + adapter = _LoopAdapter(prompt_len=5, first_token=42) + drafter = _FakeDrafter(drafts=[[42, 42, 42]] * 200) + res = fsd.fused_specdecode_generate( + adapter, drafter, gen_tokens=120, block_size=4, eos_ids=(), + allow_greedy_fallback=False, stop_on_runaway=False, + **_loop_kwargs(drafter)) + assert res["stopped_on_runaway"] is False + assert len(res["tokens"]) == 120 # ran to the full budget + + def test_sliding_ring_would_wrap_detects_wrap(): # offset + n_new >= max_size -> the rotating ring becomes non-trimmable. cache = [_FakeRotating(offset=1022, max_size=1024)] From d10aac96bd44406ba999c122cf2aabbbd5a57cc5 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 18 Jun 2026 04:23:54 +0000 Subject: [PATCH 06/12] fix(probe): drop env KAKEYA_KDBG prefix (broke venv python3 -> no mlx_lm); add guard-ON validation preset The 'env KAKEYA_KDBG=1 python3' prefix resolved a python3 without mlx_lm on the runner (ModuleNotFoundError). Drop it (KDBG instrumentation is now inert, which is also what we want for the final PR). The native_ref/text/stopped_on_runaway signals in the JSON are sufficient to characterize + validate. 
Add mlx-kakeya-codegen-guard-validate (guard ON) to prove the clean stop. Co-authored-by: FluffyAIcode --- inference_engine/bridge/manifest.py | 29 ++++++++++++++++++- .../inference_engine/bridge/test_manifest.py | 1 + 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 5977c77..76e9ee1 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -788,7 +788,6 @@ def _harness_preset( "degenerate ⇒ bounded-greedy pathology the engine must guard.", command_templates=( ( - "env", "KAKEYA_KDBG=1", "python3", "scripts/research/k3_integrated_niah_eval_mac.py", "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}", "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}", @@ -806,6 +805,34 @@ def _harness_preset( params={"max_new_tokens": ("int:max_new_tokens", "192")}, validate_reports=False, ), + Preset( + name="mlx-kakeya-codegen-guard-validate", + description="Validate the runaway-loop guard end-to-end: full f_θ fused " + "engine on the same long code prompt (pow_codegen_longprompt" + ".txt) with the guard ENABLED (default). 
The fused answer " + "must NOT collapse into a marker wall — the guard stops the " + "runaway (stopped_on_runaway) leaving a clean tail — while " + "the native-greedy control (no guard) degenerates, proving " + "the guard is what saves the engine from greedy pathology.", + command_templates=( + ( + "python3", "scripts/research/k3_integrated_niah_eval_mac.py", + "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}", + "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}", + "--f-theta-dir", "${ENV:KAKEYA_MAC_FTHETA_DIR}", + "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta", + "--sink-size", "4", "--window-size", "64", "--block-size", "4", + "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop", + "--chat", "--chat-native-ref", + "--chat-scripted-file", + "scripts/research/pow_codegen_longprompt.txt", + "--output", "results/research/codegen_guard_validate_2815.json", + ), + ), + timeout_minutes=90, + params={"max_new_tokens": ("int:max_new_tokens", "256")}, + validate_reports=False, + ), Preset( name="mlx-kakeya-degen-probe", description="Long-decode regression probe: full f_θ fused engine on a " diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py index cce242d..cfea538 100644 --- a/tests/inference_engine/bridge/test_manifest.py +++ b/tests/inference_engine/bridge/test_manifest.py @@ -82,6 +82,7 @@ def test_allowlist_contains_exactly_the_documented_presets(): "mlx-env-probe", "mlx-kakeya-chat-smoke", "mlx-kakeya-codegen-degen-probe", + "mlx-kakeya-codegen-guard-validate", "mlx-kakeya-degen-probe", "mlx-kakeya-fused-chat-ftheta", "mlx-kakeya-fused-chat-smoke", From f8a7a9ae517fc6d542572b5dc60c578f46ae3436 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 18 Jun 2026 05:30:52 +0000 Subject: [PATCH 07/12] debug(probe): long single-decode A/B (drop native-ref for memory, budget 1100) to reach the ~978-tok collapse onset Co-authored-by: FluffyAIcode --- inference_engine/bridge/manifest.py | 12 
++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 76e9ee1..0953833 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -795,14 +795,14 @@ def _harness_preset( "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta", "--sink-size", "4", "--window-size", "64", "--block-size", "4", "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop", - "--chat", "--chat-native-ref", "--fused-no-loop-guard", + "--chat", "--fused-no-loop-guard", "--chat-scripted-file", "scripts/research/pow_codegen_longprompt.txt", "--output", "results/research/codegen_degen_2815_longprompt.json", ), ), - timeout_minutes=90, - params={"max_new_tokens": ("int:max_new_tokens", "192")}, + timeout_minutes=120, + params={"max_new_tokens": ("int:max_new_tokens", "1100")}, validate_reports=False, ), Preset( @@ -823,14 +823,14 @@ def _harness_preset( "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta", "--sink-size", "4", "--window-size", "64", "--block-size", "4", "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop", - "--chat", "--chat-native-ref", + "--chat", "--chat-scripted-file", "scripts/research/pow_codegen_longprompt.txt", "--output", "results/research/codegen_guard_validate_2815.json", ), ), - timeout_minutes=90, - params={"max_new_tokens": ("int:max_new_tokens", "256")}, + timeout_minutes=120, + params={"max_new_tokens": ("int:max_new_tokens", "1100")}, validate_reports=False, ), Preset( From 85abe81834b584b507eefc9a99830ed9ded1b9e1 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 18 Jun 2026 05:44:50 +0000 Subject: [PATCH 08/12] debug(probe): multi-turn (explanation->code) guard-off/on A/B, no native-ref, budget 900 (matches the user's high-accept regime) Co-authored-by: FluffyAIcode --- inference_engine/bridge/manifest.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 0953833..76794be 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -796,13 +796,13 @@ def _harness_preset( "--sink-size", "4", "--window-size", "64", "--block-size", "4", "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop", "--chat", "--fused-no-loop-guard", - "--chat-scripted-file", - "scripts/research/pow_codegen_longprompt.txt", + "--chat-scripted", + "请详细解释POW的工作原理||实现一个PoW的代码,用c语言完成", "--output", "results/research/codegen_degen_2815_longprompt.json", ), ), timeout_minutes=120, - params={"max_new_tokens": ("int:max_new_tokens", "1100")}, + params={"max_new_tokens": ("int:max_new_tokens", "900")}, validate_reports=False, ), Preset( @@ -824,13 +824,13 @@ def _harness_preset( "--sink-size", "4", "--window-size", "64", "--block-size", "4", "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop", "--chat", - "--chat-scripted-file", - "scripts/research/pow_codegen_longprompt.txt", + "--chat-scripted", + "请详细解释POW的工作原理||实现一个PoW的代码,用c语言完成", "--output", "results/research/codegen_guard_validate_2815.json", ), ), timeout_minutes=120, - params={"max_new_tokens": ("int:max_new_tokens", "1100")}, + params={"max_new_tokens": ("int:max_new_tokens", "900")}, validate_reports=False, ), Preset( From 772c8dfe571bb4bfdfb85a1d634d3c8eed39e4cf Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 18 Jun 2026 06:17:18 +0000 Subject: [PATCH 09/12] cleanup(mlx-fused): strip inert KDBG probe instrumentation; finalize codegen presets - Remove the KAKEYA_KDBG-gated debug instrumentation (helpers + per-block emission + prefill_state/turn_compare) from fused_specdecode.py and k3_integrated_niah_eval_mac.py. Investigation complete. - Keep the production fix (runaway-loop guard) + the --chat-scripted-file / --fused-no-loop-guard / --chat-native-ref flags. 
- Repoint the two codegen presets to the multi-turn 'explain||code' chat (guard-off probe + guard-on validate), accurate descriptions; drop the now- unused pow_codegen_longprompt.txt fixture. On-device (Mac M4): across short/long/multi-turn regimes the engine is coherent (fused==native); guard-on and guard-off outputs are byte-identical on the multi-turn code scenario -> the guard is inert on healthy output (no regression) and the systematic degeneration was already resolved by the wrap fix (#146). Co-authored-by: FluffyAIcode --- .../backends/mlx/fused_specdecode.py | 79 ------------ inference_engine/bridge/manifest.py | 33 +++-- .../research/k3_integrated_niah_eval_mac.py | 113 ------------------ scripts/research/pow_codegen_longprompt.txt | 85 ------------- 4 files changed, 14 insertions(+), 296 deletions(-) delete mode 100644 scripts/research/pow_codegen_longprompt.txt diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py index 9fafdd7..3d44d79 100644 --- a/inference_engine/backends/mlx/fused_specdecode.py +++ b/inference_engine/backends/mlx/fused_specdecode.py @@ -35,69 +35,6 @@ restored_prefill_cache, ) -# region agent log (fused-codegen-degeneration-2815 probe; strip after fix) -import os as _kdbg_os -import sys as _kdbg_sys -import json as _kdbg_json - -_KDBG = bool(_kdbg_os.environ.get("KAKEYA_KDBG")) - - -def _kdbg(hyp: str, msg: str, **data: Any) -> None: - """Emit one NDJSON probe line to stderr (prefix ``KDBG ``) and, best-effort, - to /opt/cursor/logs/debug.log. 
No-op unless ``KAKEYA_KDBG`` is set, so - production behaviour is unchanged.""" - if not _KDBG: - return - rec = {"hypothesisId": hyp, "location": "fused_specdecode.py", - "message": msg, "data": data} - try: - _kdbg_sys.stderr.write("KDBG " + _kdbg_json.dumps(rec, ensure_ascii=False) + "\n") - _kdbg_sys.stderr.flush() - except Exception: - pass - try: - with open("/opt/cursor/logs/debug.log", "a") as _f: - _f.write(_kdbg_json.dumps(rec) + "\n") - except Exception: - pass - - -def _kdbg_cycle(ids: Sequence[int], window: int = 80) -> Tuple[float, int]: - """Short-unit cycle metric on the tail of ``ids``: returns - ``(cyc_frac, cyc_p)`` where ``cyc_p`` is the period (1..window//3) whose - back-to-back repetition covers the largest fraction ``cyc_frac`` of the - trailing ``window`` tokens. ~1.0 => the tail is a tight repeating loop.""" - w = list(ids[-window:]) - n = len(w) - if n < 6: - return 0.0, 0 - best_frac, best_p = 0.0, 0 - for p in range(1, n // 3 + 1): - run, i = 0, n - 1 - while i - p >= 0 and w[i] == w[i - p]: - run += 1 - i -= 1 - if run > 0: - frac = (run + p) / n - if frac > best_frac: - best_frac, best_p = frac, p - return round(best_frac, 3), best_p - - -def _kdbg_cache_offsets(cache: Any) -> Tuple[Optional[int], Optional[int]]: - """(first full-attn KVCache offset, first sliding RotatingKVCache offset).""" - off_full = off_rot = None - for c in (cache or []): - nm = type(c).__name__ - if off_rot is None and "Rotating" in nm: - off_rot = int(getattr(c, "offset", -1)) - elif off_full is None and "Rotating" not in nm: - off_full = int(getattr(c, "offset", -1)) - return off_full, off_rot -# endregion - - # --------------------------------------------------------------------------- # # Component A: capture verifier aux-layer hidden states (no transformers # `output_hidden_states` on MLX → patch the decoder-layer __call__). 
@@ -888,22 +825,6 @@ def fused_specdecode_generate( commit = candidate[:accepted] + [correction] generated += commit accepts.append(accepted) - # region agent log (fused-codegen-degeneration-2815 probe) - if _KDBG: - off_full, off_rot = _kdbg_cache_offsets(getattr(adapter, "_cache", None)) - cyc_frac, cyc_p = _kdbg_cycle(generated) - # H-D: cache.offset must track committed length (past_len). - # off_rot lags by the sliding window (bounded), off_full == past_len. - _kdbg("AD", "block", - blk=len(accepts) - 1, base=cstart, - past_len=int(adapter._past_len), gen=len(generated), - off_full=off_full, off_rot=off_rot, - bonus=int(bonus), cand=[int(x) for x in candidate], - n_cand=len(candidate), accepted=int(accepted), - commit=[int(x) for x in commit], - next_argmax=int(argmax_fn(adapter.next_token_logits)), - cyc_frac=cyc_frac, cyc_p=cyc_p) - # endregion if any(t in eos for t in commit): break # Greedy decoding can collapse into a runaway short-period loop (e.g. diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 76794be..fbcdbb0 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -773,19 +773,13 @@ def _harness_preset( ), Preset( name="mlx-kakeya-codegen-degen-probe", - description="DEBUG: full f_θ fused engine on a LONG single-turn prompt " - "(~2k-char PoW explanation + a 'write C code' request, from " - "the committed fixture pow_codegen_longprompt.txt) so the " - "native RotatingKVCache ring is ALREADY WRAPPED at prefill " - "(would_wrap_block0). Short single-turn prompts were proven " - "token-identical to native & coherent; this isolates the " - "long-prompt-prefill regime cheaply (tiny 192-tok budget). " - "KAKEYA_KDBG logs prefill state (prompt_len, any_wrapped, " - "would_wrap_block0, rot/full offsets) + per-block offsets + " - "turn_compare_fused_vs_native. 
Native-greedy control " - "(--chat-native-ref): native coherent + fused garbled ⇒ " - "long-prompt prefill corrupts logits (engine bug); both " - "degenerate ⇒ bounded-greedy pathology the engine must guard.", + description="Regression probe (guard DISABLED): full f_θ fused engine " + "on the multi-turn 'explain PoW || write PoW in C' chat " + "that originally degenerated, with --fused-no-loop-guard so " + "any greedy markdown-marker collapse is observable. Pairs " + "with mlx-kakeya-codegen-guard-validate (guard ENABLED) to " + "show the guard is what keeps the answer clean. On current " + "code (post wrap-fix) both turns stay coherent.", command_templates=( ( "python3", "scripts/research/k3_integrated_niah_eval_mac.py", @@ -808,12 +802,13 @@ def _harness_preset( Preset( name="mlx-kakeya-codegen-guard-validate", description="Validate the runaway-loop guard end-to-end: full f_θ fused " - "engine on the same long code prompt (pow_codegen_longprompt" - ".txt) with the guard ENABLED (default). The fused answer " - "must NOT collapse into a marker wall — the guard stops the " - "runaway (stopped_on_runaway) leaving a clean tail — while " - "the native-greedy control (no guard) degenerates, proving " - "the guard is what saves the engine from greedy pathology.", + "engine on the multi-turn 'explain PoW || write PoW in C' " + "chat with the guard ENABLED (production default). The " + "answer must stay coherent and never collapse into a marker " + "wall — if a runaway starts, the guard stops it " + "(stopped_on_runaway) leaving a clean tail. 
Confirmed " + "coherent on current code; byte-identical to the guard-off " + "probe (the guard is inert on healthy output).", command_templates=( ( "python3", "scripts/research/k3_integrated_niah_eval_mac.py", diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py index 345de2b..a6fc2eb 100644 --- a/scripts/research/k3_integrated_niah_eval_mac.py +++ b/scripts/research/k3_integrated_niah_eval_mac.py @@ -230,7 +230,6 @@ def main() -> int: MLXRestoredIncrementalVerifier, capture_aux_hidden, make_bridge_embed_lm_head, fused_specdecode_generate, fused_specdecode_generate_mlx, fused_specdecode_generate_mlx_trim, - _sliding_ring_would_wrap, # region agent log (fused-codegen-degeneration-2815) ) from inference_engine.v04.kv_compressor import make_default_compressor from inference_engine.bench.k3_report_gate import ( @@ -779,54 +778,6 @@ def _encode_chat(history: List[Dict[str, str]]) -> List[int]: history, add_generation_prompt=True) return list(cids.tolist() if hasattr(cids, "tolist") else cids) - # region agent log (fused-codegen-degeneration-2815 prefill probe) - import os as _kos_chat - _KDBG_CHAT = bool(_kos_chat.environ.get("KAKEYA_KDBG")) - - def _kdbg_emit(rec: Dict[str, Any]) -> None: - try: - sys.stderr.write("KDBG " + json.dumps(rec, ensure_ascii=False) + "\n") - sys.stderr.flush() - except Exception: - pass - try: - with open("/opt/cursor/logs/debug.log", "a") as _f: - _f.write(json.dumps(rec) + "\n") - except Exception: - pass - - def _kdbg_cache_summary(cache: Any) -> Dict[str, Any]: - """rot/full offset+max_size rollup + wrap/trimmable flags. The - decisive prefill signal: is the sliding RotatingKVCache already - wrapped (off>=max_size) BEFORE decode starts, and does full-attn - off == prompt_len? 
A pre-wrapped ring at prefill means the very - first speculative block's trim is refused (offset desync).""" - rot_off = rot_ms = full_off = None - any_wrapped = False - all_trimmable = True - n = 0 - for c in (cache or []): - n += 1 - nm = type(c).__name__ - off = int(getattr(c, "offset", -1)) - ms = getattr(c, "max_size", None) - ms = int(ms) if ms is not None else None - is_rot = "Rotating" in nm - if is_rot and ms is not None and off >= ms: - any_wrapped = True - trim_fn = getattr(c, "is_trimmable", None) - trim = bool(trim_fn()) if callable(trim_fn) else None - if trim is False: - all_trimmable = False - if is_rot and rot_off is None: - rot_off, rot_ms = off, ms - if (not is_rot) and full_off is None: - full_off = off - return {"n_layers": n, "rot_off": rot_off, "rot_ms": rot_ms, - "full_off": full_off, "any_wrapped": any_wrapped, - "all_trimmable": all_trimmable} - # endregion - def _gen_turn(pid: List[int]) -> Dict[str, Any]: # Opt-in A/B control (--chat-native-ref): a plain NATIVE greedy # AR decode of the SAME prompt for --max-new-tokens. 
Captured as @@ -837,14 +788,6 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]: nref_tokens: List[int] = [] if args.chat_native_ref: nref_cache, nref_logits = native_prefill(list(pid)) - # region agent log (fused-codegen-degeneration-2815 prefill probe) - if _KDBG_CHAT: - _turn = sum(1 for h in history if h.get("role") == "user") - _kdbg_emit({"hypothesisId": "AE", - "message": "prefill_state_native", - "data": {"turn": _turn, "prompt_len": len(pid), - "cache": _kdbg_cache_summary(nref_cache)}}) - # endregion while len(nref_tokens) < args.max_new_tokens: tok = int(mx.argmax(nref_logits).item()) nref_tokens.append(tok) @@ -875,23 +818,6 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]: restored_v_per_layer=_pad(rv, tsrc, T), evicted_positions=evicted, prefill_chunk_size=args.prefill_chunk_size, full_kv=args.cuda_trim) - # region agent log (fused-codegen-degeneration-2815 prefill probe) - if _KDBG_CHAT: - _turn = sum(1 for h in history if h.get("role") == "user") - _kdbg_emit({"hypothesisId": "AE", - "message": "prefill_state_fused", - "data": {"turn": _turn, "prompt_len": T, - "evicted_count": len(evicted), - "block_size": int(args.block_size), - "would_wrap_block0": bool( - _sliding_ring_would_wrap( - getattr(adapter, "_cache", None), - int(args.block_size))), - "past_len": int(adapter._past_len), - "f_theta_ran": bool(f_theta_ran), - "cache": _kdbg_cache_summary( - getattr(adapter, "_cache", None))}}) - # endregion t0 = time.perf_counter() _guard = not args.fused_no_loop_guard if mlx_drafter is not None and args.cuda_trim: @@ -937,45 +863,6 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]: res["native_ref_tokens"] = len(nref_tokens) res["resident_kv_bytes"] = int( sum(int(getattr(c, "nbytes", 0)) for c in (adapter._cache or []))) - # region agent log (fused-codegen-degeneration-2815 probe) - import os as _kos - if _kos.environ.get("KAKEYA_KDBG"): - ftoks = [int(t) for t in res.get("tokens", [])] - ntoks = [int(t) for t in nref_tokens] - div = None - for j, 
(a, b) in enumerate(zip(ftoks, ntoks)): - if a != b: - div = j - break - if div is None: - div = min(len(ftoks), len(ntoks)) - - def _dec(seq): - try: - return tokenizer.decode(seq, skip_special_tokens=True) - except TypeError: - return tokenizer.decode(seq) - rec = { - "hypothesisId": "AC", - "message": "turn_compare_fused_vs_native", - "data": { - "turn": sum(1 for h in history if h.get("role") == "user"), - "fused_n": len(ftoks), "native_n": len(ntoks), - "first_divergence_idx": div, - "fused_div_ctx": ftoks[max(0, div - 8):div + 16], - "native_div_ctx": ntoks[max(0, div - 8):div + 16], - "fused_div_text": _dec(ftoks[max(0, div - 8):div + 16]), - "native_div_text": _dec(ntoks[max(0, div - 8):div + 16]), - "fused_tail": ftoks[-48:], - "native_tail": ntoks[-48:], - "fused_tail_text": _dec(ftoks[-48:]), - "native_tail_text": _dec(ntoks[-48:]), - }, - } - sys.stderr.write( - "KDBG " + json.dumps(rec, ensure_ascii=False) + "\n") - sys.stderr.flush() - # endregion return res print(f"[chat] FULL fused engine: verifier={args.verifier_path} " diff --git a/scripts/research/pow_codegen_longprompt.txt b/scripts/research/pow_codegen_longprompt.txt deleted file mode 100644 index fa1ae08..0000000 --- a/scripts/research/pow_codegen_longprompt.txt +++ /dev/null @@ -1,85 +0,0 @@ -**PoW (Proof of Work,工作量证明)** 是区块链技术中最核心的共识机制之一。它的核心目的是:**在没有中心化机构(如银行)的情况下,让分布在世界各地的计算机能够达成一致,决定谁有权记账,并防止有人通过伪造数据来欺骗网络。** - -为了深入理解,我们可以从“核心逻辑”、“工作流程”、“数学原理”和“经济博弈”四个维度来详细解释。 - ---- - -### 1. 核心逻辑:用“算力”换取“信任” - -在去中心化网络中,大家面临一个问题:**如果每个人都说自己记了一笔账,听谁的?** - -PoW 的逻辑是:**谁付出了巨大的计算资源(工作量),谁就有权提议下一块账本。** -* **工作量(Work)**:指消耗的计算资源和时间。 -* **证明(Proof)**:当计算完成后,结果是公开且易于验证的。 - -这种机制确保了:**作恶的成本极高,而维护网络的收益(奖励)很诱人。** - ---- - -### 2. 
工作流程:从“挖矿”到“验证” - -假设我们正在运行一个像比特币这样的 PoW 网络,流程如下: - -#### 第一步:收集交易 -矿工(节点)从网络中收集待处理的交易,并将它们打包成一个“候选区块”。 - -#### 第二步:寻找“随机数”(核心环节) -为了让区块生效,矿工必须解决一个数学难题。这个难题通常是: -> “找到一个数字(称为 **Nonce**),使得:**区块头部数据的哈希值(Hash) < 目标难度值(Target)**。” - -* **哈希函数(Hash Function)**:像是一个“数字粉碎机”。你输入任何内容,它都会输出一串固定长度的乱码。只要输入变一点点,输出就会天差地别。 -* **不可逆性**:你无法通过结果反推输入。 - -#### 第三步:竞争与“挖矿” -矿工们开始疯狂尝试。他们不断改变区块里的 `Nonce` 值,重新计算哈希值。 -* 矿工 A 尝试:`Hash(数据 + 1) = 0xabc...` (不符合要求) -* 矿工 B 尝试:`Hash(数据 + 2) = 0xdef...` (不符合要求) -* ... -* 矿工 C 运气好/算力强:`Hash(数据 + 999) = 0x000...` (**符合要求!**) - -一旦有人找到了满足条件的哈希值,他就“挖到了矿”。 - -#### 第四步:广播与验证 -获胜的矿工将新区块广播给全网。其他节点收到后,只需进行一次简单的计算(把 Nonce 代入哈希函数),发现结果确实符合要求,就会接受这个区块,并更新自己的账本。 - ---- - -### 3. 数学原理:为什么它是安全的? - -#### A. 难度调整机制 (Difficulty Adjustment) -如果每个人都买更快的显卡,挖矿速度会变快,导致区块产生太频繁。 -为了保持稳定的出块时间(例如比特币约10分钟一个块),系统会**自动调整难度**: -* 如果出块太快 $\rightarrow$ 提高目标值(让要求的哈绪值前导零更多,变得更难)。 -* 如果出块太慢 $\rightarrow$ 降低难度。 - -#### B. 概率与公平性 -由于哈希值是随机分布的,寻找答案的过程就像“在茫茫大海里捞针”。 -* **算力越高**,意味着你每秒尝试的次数越多,捞到针的概率就越大。 -* 这保证了在宏观上,算力占比与奖励分配是公平的。 - -#### C. 抵抗“双花”与篡改 -如果有人想修改 10 分钟前的一个交易,他必须重新计算那个区块的哈希,以及**之后所有区块**的哈希。 -由于后面的区块都包含了前一个区块的哈希值(形成链式结构),修改历史意味着要重算后面所有的工作量。除非攻击者的算力超过全网总和的 51%,否则这在经济上是不可能的。 - ---- - -### 4. 总结:PoW 的优缺点 - -#### 优点: -1. **极高的安全性**:通过物理世界的能源消耗(电力)为数字世界筑起防线。 -2. **去中心化**:任何人只要有设备就可以加入,不需要许可。 -3. **公平性**:基于数学概率,不存在人为干预。 - -#### 缺点: -1. **能源消耗大**:为了维持安全,全球范围内的矿机都在消耗大量电力(这是最受争议的一点)。 -2. **扩展性差(TPS低)**:为了保证全球同步,出块速度不能太快,导致处理交易的速度受限。 -3. 
**算力集中风险**:如果出现大规模矿池,可能导致权力向少数人手中集中。 - -### 通俗比喻总结 -想象一个**“数字猜数字游戏”**: -全网的人都在玩一个游戏,目标是猜出一个符合特定规则的数字。 -* **挖矿**就是不停地尝试不同的数字。 -* **难度**就是规则越来越严苛(比如要求数字必须以 10 个零开头)。 -* **验证**就是别人看到你猜中了,只需要看一眼你的数字是否符合规则,瞬间就能确认你没撒谎。 - -基于以上说明,用C语言写一个简短的PoW示例(只写核心循环,不超过30行)。 \ No newline at end of file From 51ff901281dcd2d361c758faaa0ec947e456afde Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 18 Jun 2026 06:37:29 +0000 Subject: [PATCH 10/12] docs(skill): add reusable 'pin self-hosted runner Python env' skill + prompt Captures the diagnosis+fix for the post-reboot ModuleNotFoundError (mlx_lm) on the kakeya-mac-m4 runner: lightweight env-probe diagnosis, 3-layer fix (pin venv on the runner agent PATH via .path/.env|launchd|systemd; resolve a pinned interpreter in the workflow/executor instead of bare python3; fail-fast import gate), reboot-inclusive verification, and the Cloud-VM-vs-runner distinction (Mac-only deps belong on the runner, not the Linux Cloud Agent env). Includes a ready-to-paste setup-agent prompt; generalized for any Claude/Codex agent. Co-authored-by: FluffyAIcode --- docs/kakeyainferenceenginebuildskill.md | 1 + .../pin-selfhosted-runner-python-env-skill.md | 193 ++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 docs/skills/pin-selfhosted-runner-python-env-skill.md diff --git a/docs/kakeyainferenceenginebuildskill.md b/docs/kakeyainferenceenginebuildskill.md index 38a40c8..afab3e4 100644 --- a/docs/kakeyainferenceenginebuildskill.md +++ b/docs/kakeyainferenceenginebuildskill.md @@ -351,5 +351,6 @@ If any answer is "no", write the weaker, true claim. 
- v0.5-cuda scorecard (+ honest §5): `docs/reports/kakeya-inference-engine-v0.5-cuda.md` - Engine vs vLLM long-context journey: `docs/reports/kakeya-engine-vs-vllm-h200.md`, `docs/reports/kakeya-vs-vllm-longcontext-h200.md` - MLX port lessons: `docs/mlx-port-lessons.md` +- Self-hosted runner Python pinning (reboot-proof mlx_lm/torch/transformers): `docs/skills/pin-selfhosted-runner-python-env-skill.md` - f_θ training pipeline: `docs/design/k3-f-theta-training-pipeline.md` - Session capacity / cross-host: `docs/adr/0014-agent-connection-capacity-and-cross-host-topology-tests.md` diff --git a/docs/skills/pin-selfhosted-runner-python-env-skill.md b/docs/skills/pin-selfhosted-runner-python-env-skill.md new file mode 100644 index 0000000..2f348fd --- /dev/null +++ b/docs/skills/pin-selfhosted-runner-python-env-skill.md @@ -0,0 +1,193 @@ +# Skill: Pin a self-hosted runner's Python env (survive reboots, reproducible heavy ML deps) + +**Reusable across agents (Claude / Codex / Cursor).** Copy this file or paste the +prompt in the appendix. It is written to be repo-agnostic; the concrete examples +use a GitHub Actions self-hosted Mac runner driving MLX (`mlx_lm`/`torch`/ +`transformers`), but the pattern applies to any self-hosted runner (Mac or Linux) +that runs heavy ML/native deps from a virtualenv. + +--- + +## 1. When to use this skill + +Trigger it when **a self-hosted runner job fails on a missing module that "used to +work"**, especially after a host **reboot / OS or Python upgrade / runner +re-register**. Classic signatures: + +- `ModuleNotFoundError: No module named 'mlx_lm'` (or `torch`, `transformers`, …) + in a job that previously passed. +- The failure is **fast** (seconds) — it dies at `import`, before any real work. +- A **lightweight probe** (one that only needs stdlib + a base package) still + passes, proving the runner is *online* but pointing at the **wrong interpreter**. +- The interpreter version changed (e.g. 
`python=3.14.3` where it used to be + `3.13.x`), or `pkg=None` for a package that should be installed. + +Root cause is almost always: the workflow invokes a **bare `python3`**, and after +the reboot the default `python3` on `PATH` is no longer the venv that has the +deps. The venv still exists; nothing points at it. + +--- + +## 2. Diagnose first (don't guess) + +Run the **cheapest possible probe** through the same runner path to read the +interpreter + module state, instead of assuming. Example (adapt the import list): + +```bash +python3 - <<'PY' +import sys +def v(m): + try: + mod = __import__(m); return getattr(mod, "__version__", "ok") + except Exception as e: + return f"MISSING ({e.__class__.__name__})" +print("python =", sys.version.split()[0], "| exe =", sys.executable) +for m in ("mlx", "mlx_lm", "torch", "transformers"): + print(f"{m} = {v(m)}") +PY +``` + +Decision rule: +- **Runner online + probe shows wrong `python`/`exe` or `MISSING` deps** → this skill (interpreter pinning). +- **Probe itself never starts (job stuck `queued`/`pending`)** → the runner *agent* + is down; restart the agent first (different problem). + +> In CI-driven runners, route the probe through the same executor the real jobs +> use (so `PATH`/env match). A one-liner like the above, committed as a tiny +> "env-probe" job/preset, is worth keeping permanently. + +--- + +## 3. Fix — three layers (do all three; they are defense-in-depth) + +### Layer A — Pin the interpreter the runner *agent* sees (host side, durable) + +Make the venv's `bin` the first thing on the **runner agent's** `PATH`, so a bare +`python3` resolves to the venv even across reboots. 
Pick the mechanism for how the +agent is launched: + +- **GitHub Actions runner as a service (recommended).** The runner reads a + `.env` and a `.path` file in its install dir at start: + ```bash + cd ~/actions-runner + echo "$HOME/kakeya-venv/bin" > .path # prepended to PATH + echo "VIRTUAL_ENV=$HOME/kakeya-venv" >> .env + ./svc.sh stop && ./svc.sh start # reload + ``` + (`.path` is concatenated ahead of the system PATH for every job; `.env` injects + process env. Both persist across reboots because the service re-reads them.) +- **launchd plist (macOS), if not using `svc.sh`.** In the runner's + `~/Library/LaunchAgents/<label>.plist`, set: + ```xml + <key>EnvironmentVariables</key> + <dict> + <key>PATH</key><string>/Users/<you>/kakeya-venv/bin:/usr/bin:/bin:/usr/sbin:/sbin</string> + </dict> + ``` + then `launchctl unload/load` the plist. +- **systemd (Linux self-hosted).** In the runner unit: + `Environment="PATH=/opt/kakeya-venv/bin:%h/.local/bin:/usr/bin:/bin"`, then + `systemctl daemon-reload && systemctl restart <runner-unit>`. + +Verify: `python3 -c "import mlx_lm, torch, transformers; print('ok')"` from a job. + +### Layer B — Make the workflow/executor resolve a *pinned* interpreter (repo side, robust) + +Never call a bare `python3` for the heavy job. Resolve an explicit interpreter so +the repo is robust even if Layer A drifts: + +1. Add a repo/runner variable, e.g. `KAKEYA_MAC_PYTHON`, pointing at the venv + python (`/Users/<you>/kakeya-venv/bin/python`). Default-discover if unset: + ```bash + PYBIN="${KAKEYA_MAC_PYTHON:-}" + for c in "$PYBIN" "$HOME/kakeya-venv/bin/python" "$(command -v python3.13)" "$(command -v python3)"; do + [ -n "$c" ] && [ -x "$c" ] && "$c" -c 'import mlx_lm' 2>/dev/null && { PYBIN="$c"; break; } + done + ``` +2. Use `$PYBIN` (or substitute a `${PYTHON}` token in your command templates) + instead of `python3` for the actual workload. If your executor spawns argv + lists (no shell), resolve the token to `$PYBIN` before `subprocess.run`.
+ +### Layer C — Fail fast with a clear message (repo side, observability) + +Before the expensive step, assert the deps and **print a fix hint** so the next +failure is self-explanatory instead of a deep `ModuleNotFoundError`: + +```bash +"$PYBIN" - <<'PY' || { echo "::error::runner python missing ML deps — see pin-selfhosted-runner-python-env-skill.md (Layer A)"; exit 90; } +import mlx_lm, torch, transformers # noqa +PY +``` + +--- + +## 4. Verify the fix + +1. Re-run the lightweight env-probe → correct `python`/`exe`, all deps present. +2. Re-run one **real** (heavy) job → no `ModuleNotFoundError`, completes. +3. **Reboot the host and re-run** (the actual regression you are fixing) → still + green. This step is the whole point; do not skip it. + +--- + +## 5. Generalizing to a *Cloud Agent* VM env setup (different machine!) + +Do **not** confuse the self-hosted runner with the Cloud Agent VM: +- The **Cloud Agent VM** is typically Linux; it runs the *client* that dispatches + jobs and the unit-test gate. **Mac-only deps (MLX) do not belong there.** Put + only what the client/tests need into the Cloud Agent env setup (base image + + startup script), and pin versions. +- The **self-hosted runner** is where the heavy/native/Mac deps live. Pin them + there (Layers A–C above), not in the Cloud VM env setup. + +For the Cloud Agent VM specifically: bake stable deps into the **base image**, do +slow-changing installs in the **startup script**, and pin versions so a new VM is +reproducible. (In Cursor, this is the "env setup agent" config.) + +--- + +## 6. Anti-patterns + +- ❌ `pip install` the missing dep into whatever `python3` happens to be active + (often a too-new system Python with no wheels for `torch`/`mlx_lm`). Pin to the + known-good venv instead. +- ❌ Hardcoding an absolute interpreter path in many places. Resolve once + (variable + discovery) and reuse. +- ❌ "It works now" without a reboot test — the regression is reboot-triggered. 
+- ❌ Relying on an interactive shell's `source venv/bin/activate`; CI jobs and + services don't run your `.zshrc`. + +--- + +## Appendix — ready-to-paste prompt for a setup agent + +> **Task: make our self-hosted CI runner's Python environment reboot-proof.** +> +> Symptom: jobs on our self-hosted runner fail fast with +> `ModuleNotFoundError: No module named 'mlx_lm'` after the host rebooted; a +> lightweight env-probe shows the runner's default `python3` switched to a newer +> interpreter that lacks our ML stack (`mlx_lm`/`torch`/`transformers`), while the +> known-good venv still exists but is no longer on `PATH`. +> +> Do all of the following, smallest-diff first, and verify each: +> 1. **Diagnose:** run a tiny probe that prints `sys.version`, `sys.executable`, +> and import status of `mlx_lm, torch, transformers` through the same path the +> real jobs use. Confirm the wrong interpreter / missing modules. +> 2. **Host (runner agent):** pin the venv's `bin` ahead of system `PATH` for the +> runner service so a bare `python3` resolves to the venv across reboots — via +> the runner's `.path`/`.env` files (GitHub Actions `svc.sh`), or the +> launchd/systemd unit's `PATH` env. Reload the service. +> 3. **Repo (workflow/executor):** stop calling bare `python3` for the heavy job. +> Resolve a pinned interpreter from a `*_PYTHON` repo/runner variable, with a +> discovery fallback that picks the first candidate where `import mlx_lm` +> succeeds; use it for the workload commands. +> 4. **Repo (fail-fast):** before the expensive step, assert +> `import mlx_lm, torch, transformers` and emit a clear `::error::` with a link +> to this skill if missing (exit non-zero). +> 5. **Verify, including a reboot:** env-probe green, one real heavy job green, +> then reboot the host and re-run the same job — must still be green. +> 6. **Pin versions** in the venv (freeze a lockfile) and document the venv path + +> rebuild steps so the environment is reproducible, not just patched. 
+> +> Keep the heavy/native deps on the self-hosted runner only; do NOT add Mac-only +> deps to the Cloud Agent (Linux) VM env setup. From 16440ff48fc76a4a64a9d36f230059ea9422b964 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 18 Jun 2026 06:51:50 +0000 Subject: [PATCH 11/12] feat(mac-bridge): pin workload interpreter (Layer B) + import self-check gate (Layer C) Reboots can repoint the runner's default python3 to one without mlx_lm, which broke every full-engine preset with a deep ModuleNotFoundError. Make the workload interpreter explicit and verified: - inference_engine/bridge/runner_python.py (NEW, pure + 100% unit-tested): workload_python_candidates (pin KAKEYA_MAC_PYTHON -> venvs -> PATH), resolve_workload_python (first interpreter that can import mlx_lm; else fallback), preset_requires_gate (mlx-/k3- engine presets, minus env-probe/ upgrade), substitute_python, gate_error_message. - scripts/mac_bridge/run_preset.py: resolve the pinned interpreter, rewrite bare python3 argv0 to it, export KAKEYA_MAC_PYTHON to the subprocess, and FAIL FAST (exit 90 + ::error::) when a gated preset has no mlx_lm-capable interpreter. - scripts/run_kakeya_mac.sh: honor KAKEYA_MAC_PYTHON; preflight asserts mlx+mlx_lm. CI enforcement: the resolution/gate logic lives in the unit-tested, 100%-coverage library (runner_python.py), so every PR exercises it on the Linux gate. See docs/skills/pin-selfhosted-runner-python-env-skill.md. 
Co-authored-by: FluffyAIcode --- inference_engine/bridge/runner_python.py | 123 ++++++++++++++++++ scripts/mac_bridge/run_preset.py | 44 ++++++- scripts/run_kakeya_mac.sh | 13 +- .../bridge/test_runner_python.py | 108 +++++++++++++++ 4 files changed, 283 insertions(+), 5 deletions(-) create mode 100644 inference_engine/bridge/runner_python.py create mode 100644 tests/inference_engine/bridge/test_runner_python.py diff --git a/inference_engine/bridge/runner_python.py b/inference_engine/bridge/runner_python.py new file mode 100644 index 0000000..1b0c27b --- /dev/null +++ b/inference_engine/bridge/runner_python.py @@ -0,0 +1,123 @@ +"""Pin the Mac-bridge workload interpreter (Layer B) + import self-check (Layer C). + +A self-hosted runner's default ``python3`` can silently change across reboots / +OS upgrades (observed 2026-06-18: it flipped to a Python 3.14 without ``mlx_lm``, +breaking every full-engine preset with a deep ``ModuleNotFoundError``). The +mac-bridge executor used to invoke a bare ``python3`` for the workload, so it +inherited whatever interpreter happened to be first on ``PATH``. + +This module makes the workload interpreter **explicit and verified**: + +* **Layer B — resolution.** Build an ordered candidate list (a pinned + ``KAKEYA_MAC_PYTHON``, common venv paths, then ``PATH`` pythons) and pick the + first one that can import the gate module (``mlx_lm``); fall back to the first + existing candidate otherwise. +* **Layer C — gate.** For presets whose workload needs ``mlx_lm`` (the ``mlx-`` / + ``k3-`` engine families, minus the env-probe / upgrade tools that exist to + diagnose/repair the env), fail fast with a clear message instead of a deep + import error when no capable interpreter exists. + +All functions here are pure / dependency-injected so they are unit-tested on the +Linux gate (the CLI ``scripts/mac_bridge/run_preset.py`` is a thin caller). See +``docs/skills/pin-selfhosted-runner-python-env-skill.md``. 
+""" + +from __future__ import annotations + +import os +import shutil +from dataclasses import dataclass +from typing import Callable, List, Mapping, Optional, Sequence + +# The single module whose absence broke the runner; importing it implies the +# full MLX-LM stack is wired for the interpreter. +GATE_MODULE = "mlx_lm" + +# ``mlx-``/``k3-`` presets that must NOT be import-gated: these exist precisely +# to probe or repair the environment, so they must run even when mlx_lm is gone. +_IMPORT_GATE_SKIP = frozenset({"mlx-env-probe", "mlx-upgrade"}) + +SKILL_DOC = "docs/skills/pin-selfhosted-runner-python-env-skill.md" + + +def workload_python_candidates( + environ: Mapping[str, str], + *, + which: Callable[[str], Optional[str]] = shutil.which, + expanduser: Callable[[str], str] = os.path.expanduser, +) -> List[str]: + """Ordered, de-duplicated interpreter candidates for the heavy workload. + + Priority: the explicit pin (``KAKEYA_MAC_PYTHON``), then conventional venv + locations, then ``PATH`` pythons (a pinned minor version before the bare + ``python3`` that a reboot may have repointed).""" + raw = [ + environ.get("KAKEYA_MAC_PYTHON"), + expanduser("~/kakeya-venv/bin/python"), + expanduser("~/.venv/bin/python"), + which("python3.13"), + which("python3"), + ] + out: List[str] = [] + for c in raw: + if c and c not in out: + out.append(c) + return out + + +@dataclass(frozen=True) +class ResolvedPython: + """The interpreter chosen for the workload.""" + + path: str + gate_module_ok: bool # whether ``path`` can import GATE_MODULE + from_pin: bool # whether it came from ``KAKEYA_MAC_PYTHON`` + + +def resolve_workload_python( + candidates: Sequence[str], + can_import: Callable[[str], bool], + *, + pinned: Optional[str] = None, +) -> Optional[ResolvedPython]: + """Pick the first candidate that can import :data:`GATE_MODULE`; otherwise + the first candidate (a fallback whose ``gate_module_ok`` is ``False``). 
+ Returns ``None`` only when there are no candidates at all.""" + first: Optional[str] = None + for c in candidates: + if first is None: + first = c + if can_import(c): + return ResolvedPython(c, True, c == pinned) + if first is None: + return None + return ResolvedPython(first, False, first == pinned) + + +def preset_requires_gate(preset_name: str) -> bool: + """True iff a preset's workload needs :data:`GATE_MODULE` (so a missing + import must fail fast). The ``mlx-`` / ``k3-`` engine presets do; the + env-probe and upgrade tools (which diagnose/repair the env) are exempt.""" + if preset_name in _IMPORT_GATE_SKIP: + return False + return preset_name.startswith(("mlx-", "k3-")) + + +def substitute_python(argv: Sequence[str], pybin: str) -> List[str]: + """Rewrite a leading bare ``python3`` to the resolved interpreter ``pybin``. + Non-``python3`` argv (e.g. ``bash run_kakeya_mac.sh``, which reads + ``KAKEYA_MAC_PYTHON`` itself) is returned unchanged.""" + a = list(argv) + if a and a[0] == "python3": + a[0] = pybin + return a + + +def gate_error_message(preset_name: str, pybin: str) -> str: + """The fail-fast message when a gated preset has no mlx_lm-capable python.""" + return ( + f"runner python '{pybin}' cannot import {GATE_MODULE!r}, which preset " + f"'{preset_name}' requires. The runner's default python likely changed " + f"(e.g. after a reboot). Pin the venv via KAKEYA_MAC_PYTHON or the runner " + f"agent PATH and reinstall the ML stack — see {SKILL_DOC}." 
+ ) diff --git a/scripts/mac_bridge/run_preset.py b/scripts/mac_bridge/run_preset.py index a95122a..4a63c00 100644 --- a/scripts/mac_bridge/run_preset.py +++ b/scripts/mac_bridge/run_preset.py @@ -32,10 +32,29 @@ build_commands, parse_manifest_text, ) +from inference_engine.bridge.runner_python import ( + GATE_MODULE, + gate_error_message, + preset_requires_gate, + resolve_workload_python, + substitute_python, + workload_python_candidates, +) LOG_DIR = Path(".mac-bridge/logs") +def _can_import_gate_module(pybin: str) -> bool: + """True iff interpreter ``pybin`` can import the gate module (mlx_lm).""" + try: + return subprocess.run( + [pybin, "-c", f"import {GATE_MODULE}"], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ).returncode == 0 + except OSError: + return False + + def main() -> int: ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("--manifest", default=".mac-bridge/request.json") @@ -59,6 +78,23 @@ def main() -> int: print(json.dumps(argv)) return 0 + # Layer B — resolve a PINNED workload interpreter instead of trusting the + # bare ``python3`` on PATH (which a reboot can repoint to a python without + # mlx_lm). Layer C — gate: mlx-/k3- engine presets fail fast with a clear + # message when no mlx_lm-capable interpreter exists. 
+ pinned = os.environ.get("KAKEYA_MAC_PYTHON") + candidates = workload_python_candidates(os.environ) + resolved = resolve_workload_python( + candidates, _can_import_gate_module, pinned=pinned) + pybin = resolved.path if resolved else "python3" + gate_ok = bool(resolved and resolved.gate_module_ok) + print(f"[mac-bridge] workload python={pybin} {GATE_MODULE}_ok={gate_ok} " + f"pinned={pinned!r} candidates={candidates}", file=sys.stderr) + if preset_requires_gate(request.preset.name) and not gate_ok: + print(f"::error::{gate_error_message(request.preset.name, pybin)}", + file=sys.stderr) + return 90 + LOG_DIR.mkdir(parents=True, exist_ok=True) summary = { "preset": request.preset.name, @@ -66,13 +102,19 @@ def main() -> int: "nonce": request.nonce, "commands": [], } + # Make the resolved interpreter authoritative for BOTH bare-``python3`` + # commands (rewritten here) and the launcher (which reads KAKEYA_MAC_PYTHON). + sub_env = dict(os.environ) + sub_env["KAKEYA_MAC_PYTHON"] = pybin rc = 0 for idx, argv in enumerate(commands): + argv = substitute_python(argv, pybin) log_path = LOG_DIR / f"{request.preset.name}-{idx}.log" print(f"[mac-bridge] exec[{idx}]: {argv}", file=sys.stderr) t0 = time.perf_counter() with log_path.open("wb") as log: - proc = subprocess.run(argv, stdout=log, stderr=subprocess.STDOUT) + proc = subprocess.run(argv, stdout=log, stderr=subprocess.STDOUT, + env=sub_env) elapsed = time.perf_counter() - t0 summary["commands"].append({ "argv": argv, diff --git a/scripts/run_kakeya_mac.sh b/scripts/run_kakeya_mac.sh index 4bf3308..8197ad2 100755 --- a/scripts/run_kakeya_mac.sh +++ b/scripts/run_kakeya_mac.sh @@ -55,6 +55,11 @@ done log() { echo "[run-kakeya-mac] $*" >&2; } +# Pinned interpreter (Layer B): prefer KAKEYA_MAC_PYTHON (the venv python with +# mlx_lm/torch/transformers) over a bare python3 that a host reboot may have +# repointed. See docs/skills/pin-selfhosted-runner-python-env-skill.md. 
+PYBIN="${KAKEYA_MAC_PYTHON:-python3}" + # ---- argv for the full-engine harness chat ---- args=( --verifier-path "$VERIFIER" @@ -80,7 +85,7 @@ log "drafter : $DRAFTER" log "f_theta : $FTHETA" log "params : sink=$SINK window=$WINDOW block=$BLOCK max_new=$MAX_NEW" -cmd=( python3 scripts/research/k3_integrated_niah_eval_mac.py "${args[@]}" "${EXTRA[@]}" ) +cmd=( "$PYBIN" scripts/research/k3_integrated_niah_eval_mac.py "${args[@]}" "${EXTRA[@]}" ) if [[ "$DRY_RUN" == "1" ]]; then echo "PYTHONPATH=.:sdks/python ${cmd[*]}" @@ -88,9 +93,9 @@ if [[ "$DRY_RUN" == "1" ]]; then fi # ---- preflight (Apple Silicon + MLX + model) ---- -command -v python3 >/dev/null || { log "python3 not found"; exit 1; } -python3 -c "import mlx.core" 2>/dev/null \ - || { log "MLX not importable — this needs Apple Silicon + 'pip install mlx mlx-lm'"; exit 2; } +command -v "$PYBIN" >/dev/null 2>&1 || { log "interpreter not found: $PYBIN (set KAKEYA_MAC_PYTHON)"; exit 1; } +"$PYBIN" -c "import mlx.core, mlx_lm" 2>/dev/null \ + || { log "mlx/mlx_lm not importable by $PYBIN — Apple Silicon + a venv with 'mlx mlx-lm'; set KAKEYA_MAC_PYTHON. See docs/skills/pin-selfhosted-runner-python-env-skill.md"; exit 2; } [[ -d "$VERIFIER" ]] \ || { log "verifier model dir not found: $VERIFIER (set KAKEYA_MAC_VERIFIER_PATH)"; exit 3; } if [[ "$FAST" != "1" && ! -e "$FTHETA" ]]; then diff --git a/tests/inference_engine/bridge/test_runner_python.py b/tests/inference_engine/bridge/test_runner_python.py new file mode 100644 index 0000000..8a0a6bb --- /dev/null +++ b/tests/inference_engine/bridge/test_runner_python.py @@ -0,0 +1,108 @@ +"""Unit tests for the mac-bridge workload interpreter pinning (Layers B/C). + +Pure / dependency-injected logic from ``inference_engine.bridge.runner_python``; +the CLI ``scripts/mac_bridge/run_preset.py`` is a thin caller (coverage-exempt). 
+""" + +from __future__ import annotations + +from inference_engine.bridge.runner_python import ( + GATE_MODULE, + SKILL_DOC, + ResolvedPython, + gate_error_message, + preset_requires_gate, + resolve_workload_python, + substitute_python, + workload_python_candidates, +) + + +# --------------------------------------------------------------------------- # +# workload_python_candidates +# --------------------------------------------------------------------------- # +def test_candidates_prioritise_pin_then_venvs_then_path(): + env = {"KAKEYA_MAC_PYTHON": "/pin/bin/python"} + which = {"python3.13": "/usr/bin/python3.13", "python3": "/usr/bin/python3"}.get + cands = workload_python_candidates( + env, which=which, expanduser=lambda p: p.replace("~", "/home/me")) + assert cands == [ + "/pin/bin/python", + "/home/me/kakeya-venv/bin/python", + "/home/me/.venv/bin/python", + "/usr/bin/python3.13", + "/usr/bin/python3", + ] + + +def test_candidates_drop_empty_and_dedupe(): + # no pin, python3.13 missing, and python3 == an expanded venv path (dedupe). 
+ env: dict = {} + which = {"python3.13": None, "python3": "/home/me/.venv/bin/python"}.get + cands = workload_python_candidates( + env, which=which, expanduser=lambda p: p.replace("~", "/home/me")) + assert cands == [ + "/home/me/kakeya-venv/bin/python", + "/home/me/.venv/bin/python", + ] + assert None not in cands + + +# --------------------------------------------------------------------------- # +# resolve_workload_python +# --------------------------------------------------------------------------- # +def test_resolve_picks_first_importable(): + cands = ["/a/py", "/b/py", "/c/py"] + r = resolve_workload_python(cands, lambda p: p == "/b/py", pinned="/a/py") + assert r == ResolvedPython(path="/b/py", gate_module_ok=True, from_pin=False) + + +def test_resolve_marks_from_pin_when_pinned_is_importable(): + r = resolve_workload_python(["/pin/py", "/x/py"], lambda p: True, + pinned="/pin/py") + assert r.path == "/pin/py" and r.gate_module_ok is True and r.from_pin is True + + +def test_resolve_falls_back_to_first_when_none_importable(): + r = resolve_workload_python(["/a/py", "/b/py"], lambda p: False, + pinned="/a/py") + assert r == ResolvedPython(path="/a/py", gate_module_ok=False, from_pin=True) + + +def test_resolve_returns_none_without_candidates(): + assert resolve_workload_python([], lambda p: True) is None + + +# --------------------------------------------------------------------------- # +# preset_requires_gate +# --------------------------------------------------------------------------- # +def test_gate_required_for_mlx_and_k3_engine_presets(): + assert preset_requires_gate("mlx-kakeya-launcher-full") is True + assert preset_requires_gate("k3-step2-fused") is True + + +def test_gate_skips_diagnostic_and_installer_and_non_engine(): + assert preset_requires_gate("mlx-env-probe") is False # diagnostic + assert preset_requires_gate("mlx-upgrade") is False # installer + assert preset_requires_gate("integration-tests") is False + assert 
preset_requires_gate("agent-capacity-stress") is False + + +# --------------------------------------------------------------------------- # +# substitute_python / gate_error_message +# --------------------------------------------------------------------------- # +def test_substitute_rewrites_only_leading_bare_python3(): + assert substitute_python(["python3", "a.py", "--x"], "/v/py") == [ + "/v/py", "a.py", "--x"] + # non-python3 argv0 (e.g. the launcher) is untouched. + assert substitute_python(["bash", "run.sh"], "/v/py") == ["bash", "run.sh"] + # empty argv is safe. + assert substitute_python([], "/v/py") == [] + + +def test_gate_error_message_names_module_preset_and_skill(): + msg = gate_error_message("mlx-kakeya-launcher-full", "/usr/bin/python3") + assert GATE_MODULE in msg + assert "mlx-kakeya-launcher-full" in msg + assert "/usr/bin/python3" in msg + assert SKILL_DOC in msg From cff05ac635c74d3e74707464ef85703d10589140 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 18 Jun 2026 12:56:05 +0000 Subject: [PATCH 12/12] fix(mac-launcher): bash 3.2-safe empty-array expansion (EXTRA[@]: unbound variable) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/run_kakeya_mac.sh used 'set -u' + a bare "${EXTRA[@]}". macOS's default /bin/bash is 3.2, where expanding an EMPTY array under nounset errors with 'EXTRA[@]: unbound variable' — hit when the launcher is run with no pass-through args (the common interactive case). Use the canonical ${EXTRA[@]+"${EXTRA[@]}"} form (elements if set, nothing if empty, no nounset error). Add mlx-kakeya-launcher-dryrun-bash32 preset to guard it on the real /bin/bash 3.2. 
Co-authored-by: FluffyAIcode --- inference_engine/bridge/manifest.py | 14 ++++++++++++++ scripts/run_kakeya_mac.sh | 6 +++++- tests/inference_engine/bridge/test_manifest.py | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py index 8a27bbb..b88e8cb 100644 --- a/inference_engine/bridge/manifest.py +++ b/inference_engine/bridge/manifest.py @@ -798,6 +798,20 @@ def _harness_preset( params={"max_new_tokens": ("int:max_new_tokens", "64")}, validate_reports=True, # §4 liveness gate on-device ), + Preset( + name="mlx-kakeya-launcher-dryrun-bash32", + description="Guard the launcher against the macOS bash-3.2 " + "'unbound variable' bug: run scripts/run_kakeya_mac.sh " + "--dry-run under /bin/bash (Apple's frozen bash 3.2) with " + "NO pass-through args, so the empty EXTRA array is expanded " + "under set -u. Must exit 0 and print the command (pre-fix it " + "died with 'EXTRA[@]: unbound variable'). Fast; no model load.", + command_templates=( + ("/bin/bash", "scripts/run_kakeya_mac.sh", "--dry-run"), + ), + timeout_minutes=10, + validate_reports=False, + ), Preset( name="mlx-kakeya-degen-probe", description="Long-decode regression probe: full f_θ fused engine on a " diff --git a/scripts/run_kakeya_mac.sh b/scripts/run_kakeya_mac.sh index 4bf3308..57dc34e 100755 --- a/scripts/run_kakeya_mac.sh +++ b/scripts/run_kakeya_mac.sh @@ -80,7 +80,11 @@ log "drafter : $DRAFTER" log "f_theta : $FTHETA" log "params : sink=$SINK window=$WINDOW block=$BLOCK max_new=$MAX_NEW" -cmd=( python3 scripts/research/k3_integrated_niah_eval_mac.py "${args[@]}" "${EXTRA[@]}" ) +# NOTE: ``${EXTRA[@]+"${EXTRA[@]}"}`` (not a bare ``"${EXTRA[@]}"``) — under +# ``set -u`` macOS's default bash 3.2 treats expanding an EMPTY array as an +# "unbound variable" error; the ``+`` form expands to nothing when EXTRA is +# empty and to the quoted elements otherwise. 
+cmd=( python3 scripts/research/k3_integrated_niah_eval_mac.py "${args[@]}" ${EXTRA[@]+"${EXTRA[@]}"} ) if [[ "$DRY_RUN" == "1" ]]; then echo "PYTHONPATH=.:sdks/python ${cmd[*]}" diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py index 090f189..c42cd6b 100644 --- a/tests/inference_engine/bridge/test_manifest.py +++ b/tests/inference_engine/bridge/test_manifest.py @@ -85,6 +85,7 @@ def test_allowlist_contains_exactly_the_documented_presets(): "mlx-kakeya-degen-probe", "mlx-kakeya-fused-chat-ftheta", "mlx-kakeya-fused-chat-smoke", + "mlx-kakeya-launcher-dryrun-bash32", "mlx-kakeya-launcher-smoke", "mlx-multitenant-pressure", "mlx-upgrade",