From f52acfa2c6eef1767c9ffd8929aafe2c314ec9d9 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 11:46:11 +0000
Subject: [PATCH 01/10] debug(mlx-fused): Phase-1 instrumentation for long-gen
 degeneration

Add stderr NDJSON 'KDBG' instrumentation to the fused spec-decode decode
loops and the restored-prefill adapter to characterize the long-generation
degeneration bug:

* prefill: log prompt_len, restored coverage (evicted range + layers),
  full_kv layout, and per-layer cache class/max_size/keep.
* per block (torch f_theta + mlx_trim loops): block idx, generated-token
  count, past_len, accepted, dt_ms, a cheap repetition signal (unique
  fraction + longest single-token run over last 32 tokens), the count of
  sliding-layer positions evicted DURING decode that have no restored K/V
  (lost), and a per-layer cache state summary.

Temporary and debug-only; to be reverted once the fix is verified.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 .../backends/mlx/fused_specdecode.py          | 140 ++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py
index 47e5febc..bd151f03 100644
--- a/inference_engine/backends/mlx/fused_specdecode.py
+++ b/inference_engine/backends/mlx/fused_specdecode.py
@@ -35,6 +35,88 @@
     restored_prefill_cache,
 )
 
+# #region agent log (Phase-1 long-gen degeneration debug; remove after fix)
+import json as _kjson
+import sys as _ksys
+
+
+def _kdbg(ev: str, **kw: Any) -> None:
+    """Emit one compact NDJSON line to stderr (captured by the git-bus bridge)."""
+    try:
+        rec = {"ev": ev, **kw}
+        _ksys.stderr.write("KDBG " + _kjson.dumps(rec, separators=(",", ":")) + "\n")
+        _ksys.stderr.flush()
+    except Exception:
+        pass
+
+
+def _kdbg_rep(toks: List[int], k: int = 32) -> Dict[str, Any]:
+    """Cheap degeneration signal over the last ``k`` generated tokens:
+    unique fraction + longest single-token run (repetition collapse spikes it)."""
+    w = toks[-k:]
+    if not w:
+        return {"win": 0}
+    n = len(w)
+    uniq = len(set(w))
+    run = best = 1
+    for a, b in zip(w, w[1:]):
+        run = run + 1 if a == b else 1
+        if run > best:
+            best = run
+    return {"win": n, "uniq_frac": round(uniq / n, 3),
+            "rep_frac": round(1.0 - uniq / n, 3), "max_run": best}
+
+
+def _kdbg_cache(cache: Any) -> Dict[str, Any]:
+    """Summarize per-layer cache state: pick the first sliding (RotatingKVCache)
+    and first full (KVCache) layer and report global offset, physical resident
+    seq-len, max_size and keep (sink) so we can correlate window-eviction with
+    the restored-coverage boundary. Also returns layer-class counts."""
+    sliding = full = None
+    counts: Dict[str, int] = {}
+    for c in (cache or []):
+        cls = type(c).__name__
+        counts[cls] = counts.get(cls, 0) + 1
+        keys = getattr(c, "keys", None)
+        info = {
+            "cls": cls,
+            "off": int(getattr(c, "offset", 0)),
+            "phys": int(keys.shape[2]) if keys is not None else 0,
+            "ms": (int(getattr(c, "max_size")) if getattr(c, "max_size", None) is not None else None),
+            "keep": (int(getattr(c, "keep")) if getattr(c, "keep", None) is not None else None),
+        }
+        if "Rotating" in cls and sliding is None:
+            sliding = info
+        elif "Rotating" not in cls and full is None:
+            full = info
+    return {"counts": counts, "sliding": sliding, "full": full}
+
+
+def _kdbg_lost(cache: Any, restored: Any, prompt_len: int) -> Optional[Dict[str, Any]]:
+    """Phase-1 Q2: count sliding-layer positions evicted DURING decode that have
+    NO restored K/V. For the first RotatingKVCache: positions [keep, evict_hi)
+    are no longer resident, where evict_hi = offset - (max_size - keep). Of those,
+    any not in the (prompt-only) restored coverage are 'lost' (no K/V anywhere)."""
+    for c in (cache or []):
+        if "Rotating" not in type(c).__name__:
+            continue
+        ms = getattr(c, "max_size", None)
+        if ms is None:
+            return None
+        off = int(getattr(c, "offset", 0))
+        keep = int(getattr(c, "keep", 0) or 0)
+        ms = int(ms)
+        evict_hi = off - (ms - keep)          # exclusive upper bound of evicted region
+        evicted_n = max(0, evict_hi - keep)
+        rset = restored if isinstance(restored, set) else set()
+        lost = sum(1 for p in range(keep, evict_hi) if p not in rset)
+        return {"off": off, "ms": ms, "keep": keep, "evict_hi": evict_hi,
+                "evicted_n": evicted_n, "restored_in_evicted": evicted_n - lost,
+                "lost": lost, "prompt_len": int(prompt_len),
+                "window_slid_off_prompt": bool(evict_hi > prompt_len)}
+    return None
+# #endregion
+
 
 # --------------------------------------------------------------------------- #
 # Component A: capture verifier aux-layer hidden states (no transformers
@@ -193,6 +275,27 @@ def prefill(
             cache_factory=factory,
         )
         self._past_len = len(prompt_ids)
+        # #region agent log (Phase-1)
+        try:
+            ev = sorted(int(p) for p in evicted_positions)
+            rk_layers = sorted(int(k) for k in restored_k_per_layer.keys())
+            # Stash restored coverage for the decode loop's lost-position check.
+            self._dbg_restored_positions = set(ev)
+            self._dbg_prompt_len = int(len(prompt_ids))
+            _kdbg(
+                "prefill",
+                prompt_len=len(prompt_ids),
+                evicted_count=len(ev),
+                evicted_lo=(ev[0] if ev else None),
+                evicted_hi=(ev[-1] if ev else None),
+                restored_layers=rk_layers,
+                restored_layer_count=len(rk_layers),
+                full_kv=bool(self._full_kv),
+                cache=_kdbg_cache(self._cache),
+            )
+        except Exception:
+            pass
+        # #endregion
 
     def forward_block(self, tokens: Sequence[int]) -> Any:
         """Incremental verify of ``tokens`` against the restored cache. Returns
@@ -402,6 +505,7 @@ def fused_specdecode_generate_mlx_trim(
     ctx_len = C
     try:
         while len(generated) < gen_tokens:
+            _kblk_t0 = time.perf_counter()  # agent log (Phase-1)
             L = min(block_size, gen_tokens - len(generated))
             base = adapter._past_len
             t_build = time.perf_counter()
@@ -440,6 +544,23 @@ def fused_specdecode_generate_mlx_trim(
             commit = check[:accepted]
             generated += commit
             accepts.append(accepted)
+            # #region agent log (Phase-1)
+            _kdbg(
+                "block",
+                loop="mlx_trim",
+                blk=len(accepts) - 1,
+                gen=len(generated),
+                past_len=adapter._past_len,
+                accepted=accepted,
+                L=int(check_ids.shape[0]),
+                dt_ms=round((time.perf_counter() - _kblk_t0) * 1e3, 1),
+                rep=_kdbg_rep(generated),
+                lost=_kdbg_lost(adapter._cache,
+                                getattr(adapter, "_dbg_restored_positions", set()),
+                                getattr(adapter, "_dbg_prompt_len", 0)),
+                cache=_kdbg_cache(adapter._cache),
+            )
+            # #endregion
             adapter.next_token_logits = next_row
             aux_rows = adapter._last_aux_mx
             # KEEP accepted (positions base..base+accepted-1), TRIM rejected.
@@ -675,6 +796,7 @@ def fused_specdecode_generate(
     fallback_to_greedy = False
     try:
         while len(generated) < gen_tokens:
+            _kblk_t0 = time.perf_counter()  # agent log (Phase-1)
             L = min(block_size, gen_tokens - len(generated))
             cstart = adapter._past_len
             bonus = int(argmax_fn(adapter.next_token_logits))
@@ -731,6 +853,24 @@ def fused_specdecode_generate(
                 commit = candidate[:accepted] + [correction]
             generated += commit
             accepts.append(accepted)
+            # #region agent log (Phase-1)
+            _kdbg(
+                "block",
+                loop="torch_ftheta",
+                blk=len(accepts) - 1,
+                gen=len(generated),
+                gen_since_prompt=len(generated),
+                past_len=adapter._past_len,
+                accepted=accepted,
+                L=len(candidate),
+                dt_ms=round((time.perf_counter() - _kblk_t0) * 1e3, 1),
+                rep=_kdbg_rep(generated),
+                lost=_kdbg_lost(adapter._cache,
+                                getattr(adapter, "_dbg_restored_positions", set()),
+                                getattr(adapter, "_dbg_prompt_len", 0)),
+                cache=_kdbg_cache(adapter._cache),
+            )
+            # #endregion
             if any(t in eos for t in commit):
                 break
             if (allow_greedy_fallback and len(accepts) >= 2

From 0d1daa7e464e62161204bfe319092bb556631bf2 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 11:48:43 +0000
Subject: [PATCH 02/10] debug(mac-bridge): mlx-kakeya-degen-probe preset
 (Phase-1 long-decode degeneration characterization)

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/manifest.py           | 24 +++++++++++++++++++
 .../inference_engine/bridge/test_manifest.py  |  1 +
 2 files changed, 25 insertions(+)

diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 70bfbade..03c47bde 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -771,6 +771,30 @@ def _harness_preset(
             params={"max_new_tokens": ("int:max_new_tokens", "64")},
             validate_reports=True,  # §4 liveness gate on-device
         ),
+        Preset(
+            name="mlx-kakeya-degen-probe",
+            description="DEBUG (Phase-1): full f_θ fused engine on a LONG prompt "
+                        "(--ignore-turn-stop, default 256 tokens) to characterize "
+                        "the long-decode degeneration onset. Emits KDBG NDJSON to "
+                        "stderr (captured in the bridge log) + transcript JSON. NOT "
+                        "gated — the degeneration is the thing being measured.",
+            command_templates=(
+                (
+                    "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
+                    "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}",
+                    "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}",
+                    "--f-theta-dir", "${ENV:KAKEYA_MAC_FTHETA_DIR}",
+                    "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta",
+                    "--sink-size", "4", "--window-size", "64", "--block-size", "4",
+                    "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
+                    "--chat", "--chat-scripted", "请详细解释POW的工作原理",
+                    "--output", "results/research/phase1_degeneration_chat.json",
+                ),
+            ),
+            timeout_minutes=90,
+            params={"max_new_tokens": ("int:max_new_tokens", "256")},
+            validate_reports=False,
+        ),
     )
 }
 
diff --git a/tests/inference_engine/bridge/test_manifest.py b/tests/inference_engine/bridge/test_manifest.py
index 8b10f261..31ce0ec6 100644
--- a/tests/inference_engine/bridge/test_manifest.py
+++ b/tests/inference_engine/bridge/test_manifest.py
@@ -81,6 +81,7 @@ def test_allowlist_contains_exactly_the_documented_presets():
         "mlx-batched-pad-decode",
         "mlx-env-probe",
         "mlx-kakeya-chat-smoke",
+        "mlx-kakeya-degen-probe",
         "mlx-kakeya-fused-chat-ftheta",
         "mlx-kakeya-fused-chat-smoke",
         "mlx-kakeya-launcher-smoke",

From 91c71adbab2471eaed693543a1278ce71b2d3330 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 12:28:50 +0000
Subject: [PATCH 03/10] debug(mlx-fused): Phase-2 instrumentation +
 native-greedy control for long-decode degeneration

- cyc_frac/cyc_p phrase-cycle detection (max_run was blind to phrase loops)
- _kdbg_sync: per-block sliding-vs-full offset divergence (H2 trim-desync)
- commit_or_truncate trim event logging (short-trim smoking gun)
- final token-id dump for offline divergence comparison
- --chat-native-ref: plain native greedy control on identical prompt (H1 vs engine)
- degen-probe preset now runs the native control

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 .../backends/mlx/fused_specdecode.py          | 67 +++++++++++++++++--
 inference_engine/bridge/manifest.py           |  3 +-
 .../research/k3_integrated_niah_eval_mac.py   | 29 ++++++++
 3 files changed, 93 insertions(+), 6 deletions(-)

diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py
index bd151f03..e89781c8 100644
--- a/inference_engine/backends/mlx/fused_specdecode.py
+++ b/inference_engine/backends/mlx/fused_specdecode.py
@@ -50,9 +50,16 @@ def _kdbg(ev: str, **kw: Any) -> None:
         pass
 
 
-def _kdbg_rep(toks: List[int], k: int = 32) -> Dict[str, Any]:
-    """Cheap degeneration signal over the last ``k`` generated tokens:
-    unique fraction + longest single-token run (repetition collapse spikes it)."""
+def _kdbg_rep(toks: List[int], k: int = 64) -> Dict[str, Any]:
+    """Degeneration signal over the last ``k`` generated tokens.
+
+    Phase-1 widened the window (32→64) AND added phrase-level cycle detection:
+    ``max_run`` only sees single-token runs, so a structural repeat loop
+    ("### 1. ...### 1. ...") reads ``max_run:1`` yet is fully degenerate. We
+    also scan for the period ``p`` in ``[1, n//2]`` that maximises the fraction
+    of positions equal to the token ``p`` steps back (``cyc_frac`` near 1.0 with
+    ``cyc_p>1`` ⇒ phrase/sentence-level repetition the run-length metric misses).
+    """
     w = toks[-k:]
     if not w:
         return {"win": 0}
@@ -63,8 +70,36 @@ def _kdbg_rep(toks: List[int], k: int = 32) -> Dict[str, Any]:
         run = run + 1 if a == b else 1
         if run > best:
             best = run
+    cyc_p, cyc_frac = 0, 0.0
+    for p in range(1, n // 2 + 1):
+        m = sum(1 for i in range(p, n) if w[i] == w[i - p])
+        frac = m / (n - p)
+        if frac > cyc_frac:
+            cyc_frac, cyc_p = frac, p
     return {"win": n, "uniq_frac": round(uniq / n, 3),
-            "rep_frac": round(1.0 - uniq / n, 3), "max_run": best}
+            "rep_frac": round(1.0 - uniq / n, 3), "max_run": best,
+            "cyc_p": cyc_p, "cyc_frac": round(cyc_frac, 3)}
+
+
+def _kdbg_sync(cache: Any, past_len: int) -> Dict[str, Any]:
+    """Phase-1 H2: surface cache desync. The torch_ftheta loop rolls rejections
+    back with ``trim_prompt_cache`` on the NATIVE hybrid cache. If the sliding
+    ``RotatingKVCache`` and the full ``KVCache`` trim by different amounts (or
+    one is non-trimmable after the ring wraps), their ``offset`` diverges from
+    ``_past_len`` and from each other — the position misalignment that would
+    corrupt subsequent logits. Compare both offsets to ``past_len``."""
+    sl = fu = None
+    for c in (cache or []):
+        off = int(getattr(c, "offset", 0))
+        if "Rotating" in type(c).__name__:
+            if sl is None:
+                sl = off
+        elif fu is None:
+            fu = off
+    return {"past_len": int(past_len), "sliding_off": sl, "full_off": fu,
+            "sliding_eq": (sl == past_len) if sl is not None else None,
+            "full_eq": (fu == past_len) if fu is not None else None,
+            "sliding_minus_full": (sl - fu) if (sl is not None and fu is not None) else None}
 
 
 def _kdbg_cache(cache: Any) -> Dict[str, Any]:
@@ -394,7 +429,19 @@ def commit_or_truncate(self, *, forwarded: int, accepted: int) -> None:
         drop = forwarded - accepted
         if drop > 0 and self._cache is not None:
             from mlx_lm.models.cache import trim_prompt_cache  # type: ignore
-            trim_prompt_cache(self._cache, drop)
+            # #region agent log (Phase-1 H2: unsound trim on the hybrid cache)
+            _before = _kdbg_cache(self._cache)
+            trimmed = trim_prompt_cache(self._cache, drop)
+            _kdbg(
+                "trim",
+                drop=int(drop),
+                trimmed=(int(trimmed) if trimmed is not None else None),
+                short=(trimmed is not None and int(trimmed) < int(drop)),
+                past_len=int(self._past_len),
+                before=_before,
+                after=_kdbg_cache(self._cache),
+            )
+            # #endregion
         self._past_len += accepted
 
     def append_token(self, token_id: int) -> Any:
@@ -863,8 +910,10 @@ def fused_specdecode_generate(
                 past_len=adapter._past_len,
                 accepted=accepted,
                 L=len(candidate),
+                commit_ids=[int(t) for t in commit],
                 dt_ms=round((time.perf_counter() - _kblk_t0) * 1e3, 1),
                 rep=_kdbg_rep(generated),
+                sync=_kdbg_sync(adapter._cache, adapter._past_len),
                 lost=_kdbg_lost(adapter._cache,
                                 getattr(adapter, "_dbg_restored_positions", set()),
                                 getattr(adapter, "_dbg_prompt_len", 0)),
@@ -894,6 +943,14 @@ def fused_specdecode_generate(
     finally:
         adapter._capture_aux = False
     generated = generated[:gen_tokens]
+    # #region agent log (Phase-1: full token dump for offline fused-vs-native
+    # divergence comparison + final wide-window degeneration summary)
+    _kdbg("final", loop="torch_ftheta", n=len(generated),
+          blocks=len(accepts),
+          mean_accept_len=(round(sum(accepts) / len(accepts), 3) if accepts else 0.0),
+          rep_w128=_kdbg_rep(generated, k=128),
+          tokens=[int(t) for t in generated])
+    # #endregion
     return {
         "tokens": generated,
         "blocks": len(accepts),
diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 03c47bde..ba34dfc0 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -787,7 +787,8 @@ def _harness_preset(
                     "--s5-exact-full-attn", "--fused-specdecode", "--force-f-theta",
                     "--sink-size", "4", "--window-size", "64", "--block-size", "4",
                     "--max-new-tokens", "{max_new_tokens}", "--ignore-turn-stop",
-                    "--chat", "--chat-scripted", "请详细解释POW的工作原理",
+                    "--chat", "--chat-native-ref",
+                    "--chat-scripted", "请详细解释POW的工作原理",
                     "--output", "results/research/phase1_degeneration_chat.json",
                 ),
             ),
diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py
index b4c621f3..27fea08a 100644
--- a/scripts/research/k3_integrated_niah_eval_mac.py
+++ b/scripts/research/k3_integrated_niah_eval_mac.py
@@ -180,6 +180,14 @@ def parse_args() -> argparse.Namespace:
     ap.add_argument("--chat-scripted", default=None,
                     help="Non-interactive chat: '||'-separated user turns "
                          "(for Mac-bridge verification); writes a transcript.")
+    ap.add_argument("--chat-native-ref", action="store_true",
+                    help="DEBUG (Phase-1): before each scripted-chat turn, also "
+                         "run a plain NATIVE greedy AR decode of the SAME prompt "
+                         "for --max-new-tokens, emitting KDBG block_native/"
+                         "final_native rep metrics. This is the decisive control "
+                         "for greedy-pathology vs fused-engine degeneration: if "
+                         "the native reference degenerates at the same length, the "
+                         "fused engine is not the cause.")
     ap.add_argument("--force-f-theta", action="store_true",
                     help="Run f_θ restoration even under --s5-exact-full-attn "
                          "(bypass the S5 native-prefill short-circuit). On gemma-4 "
@@ -213,6 +221,7 @@ def main() -> int:
         MLXRestoredIncrementalVerifier, capture_aux_hidden,
         make_bridge_embed_lm_head, fused_specdecode_generate,
         fused_specdecode_generate_mlx, fused_specdecode_generate_mlx_trim,
+        _kdbg, _kdbg_rep,  # Phase-1 degeneration instrumentation
     )
     from inference_engine.v04.kv_compressor import make_default_compressor
     from inference_engine.bench.k3_report_gate import (
@@ -762,6 +771,26 @@ def _encode_chat(history: List[Dict[str, str]]) -> List[int]:
                 return list(cids.tolist() if hasattr(cids, "tolist") else cids)
 
             def _gen_turn(pid: List[int]) -> Dict[str, Any]:
+                # #region agent log (Phase-1: native-greedy control on the SAME
+                # prompt — discriminates greedy pathology from fused-engine bug)
+                if args.chat_native_ref:
+                    nref_cache, nref_logits = native_prefill(list(pid))
+                    nref_gen: List[int] = []
+                    while len(nref_gen) < args.max_new_tokens:
+                        tok = int(mx.argmax(nref_logits).item())
+                        nref_gen.append(tok)
+                        if tok in end_ids:
+                            break
+                        out = mlx_model(mx.array([[tok]]), cache=nref_cache)
+                        mx.eval(out)
+                        nref_logits = out[0, -1]
+                        if len(nref_gen) % max(args.block_size, 1) == 0:
+                            _kdbg("block_native", ref="native_greedy",
+                                  gen=len(nref_gen), rep=_kdbg_rep(nref_gen))
+                    _kdbg("final_native", ref="native_greedy", n=len(nref_gen),
+                          rep_w128=_kdbg_rep(nref_gen, k=128),
+                          tokens=[int(t) for t in nref_gen])
+                # #endregion
                 rk, rv, tsrc = build_restoration(pid, prefill_native_s5=True)
                 # f_θ ran iff build_restoration produced restored banks via the
                 # torch drafter+f_θ (under --force-f-theta the S5 short-circuit is

From b56c6b44f0a9dd61a8b41b766239cd95f74e12ec Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 12:47:31 +0000
Subject: [PATCH 04/10] debug(mlx-fused): capture native-ref text for coherence
 A/B in >wrap probe

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 scripts/research/k3_integrated_niah_eval_mac.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py
index 27fea08a..ec83bc0e 100644
--- a/scripts/research/k3_integrated_niah_eval_mac.py
+++ b/scripts/research/k3_integrated_niah_eval_mac.py
@@ -787,8 +787,14 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                         if len(nref_gen) % max(args.block_size, 1) == 0:
                             _kdbg("block_native", ref="native_greedy",
                                   gen=len(nref_gen), rep=_kdbg_rep(nref_gen))
+                    try:
+                        nref_text = tokenizer.decode(
+                            nref_gen, skip_special_tokens=True)
+                    except TypeError:
+                        nref_text = tokenizer.decode(nref_gen)
                     _kdbg("final_native", ref="native_greedy", n=len(nref_gen),
                           rep_w128=_kdbg_rep(nref_gen, k=128),
+                          text=nref_text,
                           tokens=[int(t) for t in nref_gen])
                 # #endregion
                 rk, rv, tsrc = build_restoration(pid, prefill_native_s5=True)

From 56546e559f120bec741334b5481d5da1d3ef0fa8 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 13:04:57 +0000
Subject: [PATCH 05/10] fix(mlx-fused): single-token commits past
 RotatingKVCache wrap to prevent offset desync
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause: once the sliding RotatingKVCache ring (ms=1024) wraps
(offset >= ms), mlx_lm's trim_prompt_cache refuses the rejected-draft
rollback: the trim is all-or-nothing across layers, and is_trimmable
requires offset < max_size. The un-trimmed rejected K/V leave cache.offset
ahead of the committed past_len (+8 observed), misaligning RoPE/mask ->
logit corruption -> runaway repetition (由于由于...), with onset around
generated token 1064.

Fix (Option A, correctness-first): detect when the sliding ring would wrap and
commit single-token blocks (L=1). With L=1 the bonus token is always accepted
(it is argmax(next_token_logits)), so drop==0 and trim is never called while
wrapped -> offset stays == past_len, matching the coherent native AR path.

Validated by the native-greedy control: native (no spec rollback) stays
coherent past 1024; only the fused trim path degenerated.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 .../backends/mlx/fused_specdecode.py          | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py
index e89781c8..9ef0273b 100644
--- a/inference_engine/backends/mlx/fused_specdecode.py
+++ b/inference_engine/backends/mlx/fused_specdecode.py
@@ -796,6 +796,33 @@ def fused_specdecode_generate_mlx(
     }
 
 
+# --------------------------------------------------------------------------- #
+# H2 wrap fix: detect when the sliding RotatingKVCache is at/over its ring
+# capacity so the caller can avoid the unsound (refused) speculative trim.
+# --------------------------------------------------------------------------- #
+def _sliding_ring_would_wrap(cache: Any, n_new: int) -> bool:
+    """True if appending ``n_new`` tokens leaves a sliding ``RotatingKVCache``
+    non-trimmable (``offset + n_new >= max_size``).
+
+    ``mlx_lm.trim_prompt_cache`` is all-or-nothing across the hybrid cache: it
+    trims only when EVERY layer is trimmable, and a ``RotatingKVCache`` reports
+    ``is_trimmable`` only while ``offset < max_size``. Once the ring is at/over
+    ``max_size`` the rejected-draft rollback is REFUSED on every layer (sliding
+    AND full alike), so the un-trimmed rejected K/V linger and ``cache.offset``
+    runs ahead of the committed length — desyncing RoPE/causal alignment and
+    corrupting all subsequent logits (the >ms degeneration). Detecting this lets
+    the loop fall back to single-token commits, which never produce a rejected
+    tail to trim, so ``offset`` stays == committed ``past_len``."""
+    for c in (cache or []):
+        if "Rotating" in type(c).__name__:
+            ms = getattr(c, "max_size", None)
+            if ms is None:
+                continue
+            if int(getattr(c, "offset", 0)) + int(n_new) >= int(ms):
+                return True
+    return False
+
+
 # --------------------------------------------------------------------------- #
 # The fused spec-decode loop (control flow; MLX/torch ops via injected fns).
 # --------------------------------------------------------------------------- #
@@ -845,6 +872,19 @@ def fused_specdecode_generate(
         while len(generated) < gen_tokens:
             _kblk_t0 = time.perf_counter()  # agent log (Phase-1)
             L = min(block_size, gen_tokens - len(generated))
+            # H2 wrap fix (correctness-first): once the sliding RotatingKVCache
+            # would wrap, a multi-token speculative block can leave rejected
+            # draft K/V in the now-non-trimmable ring (trim_prompt_cache refuses
+            # on the wrapped hybrid cache), desyncing cache.offset from the
+            # committed _past_len and corrupting every subsequent step's
+            # RoPE/causal alignment -> degeneration past ms. Forcing L=1 past the
+            # wrap point makes each block commit exactly its always-accepted
+            # bonus token, so there is never a rejected tail to trim and offset
+            # stays == past_len (matching the native greedy path, which stays
+            # coherent past ms). Costs the speculative speedup past ms only.
+            _wrap_l1 = _sliding_ring_would_wrap(adapter._cache, L)
+            if _wrap_l1:
+                L = 1
             cstart = adapter._past_len
             bonus = int(argmax_fn(adapter.next_token_logits))
             t_draft = time.perf_counter()
@@ -910,6 +950,7 @@ def fused_specdecode_generate(
                 past_len=adapter._past_len,
                 accepted=accepted,
                 L=len(candidate),
+                wrap_l1=bool(_wrap_l1),
                 commit_ids=[int(t) for t in commit],
                 dt_ms=round((time.perf_counter() - _kblk_t0) * 1e3, 1),
                 rep=_kdbg_rep(generated),

From 34c2d2f69c8b8f5bb849d5a7bb16ce9853c1c9b2 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 13:26:15 +0000
Subject: [PATCH 06/10] chore(mlx-fused): remove Phase-1 KDBG instrumentation;
 keep wrap fix + --chat-native-ref

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 .../backends/mlx/fused_specdecode.py          | 204 +-----------------
 .../research/k3_integrated_niah_eval_mac.py   |  50 ++---
 2 files changed, 29 insertions(+), 225 deletions(-)

diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py
index 9ef0273b..00cc9432 100644
--- a/inference_engine/backends/mlx/fused_specdecode.py
+++ b/inference_engine/backends/mlx/fused_specdecode.py
@@ -35,123 +35,6 @@
     restored_prefill_cache,
 )
 
-# #region agent log (Phase-1 long-gen degeneration debug; remove after fix)
-import json as _kjson
-import sys as _ksys
-
-
-def _kdbg(ev: str, **kw: Any) -> None:
-    """Emit one compact NDJSON line to stderr (captured by the git-bus bridge)."""
-    try:
-        rec = {"ev": ev, **kw}
-        _ksys.stderr.write("KDBG " + _kjson.dumps(rec, separators=(",", ":")) + "\n")
-        _ksys.stderr.flush()
-    except Exception:
-        pass
-
-
-def _kdbg_rep(toks: List[int], k: int = 64) -> Dict[str, Any]:
-    """Degeneration signal over the last ``k`` generated tokens.
-
-    Phase-1 widened the window (32→64) AND added phrase-level cycle detection:
-    ``max_run`` only sees single-token runs, so a structural repeat loop
-    ("### 1. ...### 1. ...") reads ``max_run:1`` yet is fully degenerate. We
-    also scan for the period ``p`` in ``[1, n//2]`` that maximises the fraction
-    of positions equal to the token ``p`` steps back (``cyc_frac`` near 1.0 with
-    ``cyc_p>1`` ⇒ phrase/sentence-level repetition the run-length metric misses).
-    """
-    w = toks[-k:]
-    if not w:
-        return {"win": 0}
-    n = len(w)
-    uniq = len(set(w))
-    run = best = 1
-    for a, b in zip(w, w[1:]):
-        run = run + 1 if a == b else 1
-        if run > best:
-            best = run
-    cyc_p, cyc_frac = 0, 0.0
-    for p in range(1, n // 2 + 1):
-        m = sum(1 for i in range(p, n) if w[i] == w[i - p])
-        frac = m / (n - p)
-        if frac > cyc_frac:
-            cyc_frac, cyc_p = frac, p
-    return {"win": n, "uniq_frac": round(uniq / n, 3),
-            "rep_frac": round(1.0 - uniq / n, 3), "max_run": best,
-            "cyc_p": cyc_p, "cyc_frac": round(cyc_frac, 3)}
-
-
-def _kdbg_sync(cache: Any, past_len: int) -> Dict[str, Any]:
-    """Phase-1 H2: surface cache desync. The torch_ftheta loop rolls rejections
-    back with ``trim_prompt_cache`` on the NATIVE hybrid cache. If the sliding
-    ``RotatingKVCache`` and the full ``KVCache`` trim by different amounts (or
-    one is non-trimmable after the ring wraps), their ``offset`` diverges from
-    ``_past_len`` and from each other — the position misalignment that would
-    corrupt subsequent logits. Compare both offsets to ``past_len``."""
-    sl = fu = None
-    for c in (cache or []):
-        off = int(getattr(c, "offset", 0))
-        if "Rotating" in type(c).__name__:
-            if sl is None:
-                sl = off
-        elif fu is None:
-            fu = off
-    return {"past_len": int(past_len), "sliding_off": sl, "full_off": fu,
-            "sliding_eq": (sl == past_len) if sl is not None else None,
-            "full_eq": (fu == past_len) if fu is not None else None,
-            "sliding_minus_full": (sl - fu) if (sl is not None and fu is not None) else None}
-
-
-def _kdbg_cache(cache: Any) -> Dict[str, Any]:
-    """Summarize per-layer cache state: pick the first sliding (RotatingKVCache)
-    and first full (KVCache) layer and report global offset, physical resident
-    seq-len, max_size and keep (sink) so we can correlate window-eviction with
-    the restored-coverage boundary. Also returns layer-class counts."""
-    sliding = full = None
-    counts: Dict[str, int] = {}
-    for c in (cache or []):
-        cls = type(c).__name__
-        counts[cls] = counts.get(cls, 0) + 1
-        keys = getattr(c, "keys", None)
-        info = {
-            "cls": cls,
-            "off": int(getattr(c, "offset", 0)),
-            "phys": int(keys.shape[2]) if keys is not None else 0,
-            "ms": (int(getattr(c, "max_size")) if getattr(c, "max_size", None) is not None else None),
-            "keep": (int(getattr(c, "keep")) if getattr(c, "keep", None) is not None else None),
-        }
-        if "Rotating" in cls and sliding is None:
-            sliding = info
-        elif "Rotating" not in cls and full is None:
-            full = info
-    return {"counts": counts, "sliding": sliding, "full": full}
-
-
-def _kdbg_lost(cache: Any, restored: Any, prompt_len: int) -> Optional[Dict[str, Any]]:
-    """Phase-1 Q2: count sliding-layer positions evicted DURING decode that have
-    NO restored K/V. For the first RotatingKVCache: positions [keep, evict_hi)
-    are no longer resident, where evict_hi = offset - (max_size - keep). Of those,
-    any not in the (prompt-only) restored coverage are 'lost' (no K/V anywhere)."""
-    for c in (cache or []):
-        if "Rotating" not in type(c).__name__:
-            continue
-        ms = getattr(c, "max_size", None)
-        if ms is None:
-            return None
-        off = int(getattr(c, "offset", 0))
-        keep = int(getattr(c, "keep", 0) or 0)
-        ms = int(ms)
-        evict_hi = off - (ms - keep)          # exclusive upper bound of evicted region
-        evicted_n = max(0, evict_hi - keep)
-        rset = restored if isinstance(restored, set) else set()
-        lost = sum(1 for p in range(keep, evict_hi) if p not in rset)
-        return {"off": off, "ms": ms, "keep": keep, "evict_hi": evict_hi,
-                "evicted_n": evicted_n, "restored_in_evicted": evicted_n - lost,
-                "lost": lost, "prompt_len": int(prompt_len),
-                "window_slid_off_prompt": bool(evict_hi > prompt_len)}
-    return None
-# #endregion
-
 
 # --------------------------------------------------------------------------- #
 # Component A: capture verifier aux-layer hidden states (no transformers
@@ -310,27 +193,6 @@ def prefill(
             cache_factory=factory,
         )
         self._past_len = len(prompt_ids)
-        # #region agent log (Phase-1)
-        try:
-            ev = sorted(int(p) for p in evicted_positions)
-            rk_layers = sorted(int(k) for k in restored_k_per_layer.keys())
-            # Stash restored coverage for the decode loop's lost-position check.
-            self._dbg_restored_positions = set(ev)
-            self._dbg_prompt_len = int(len(prompt_ids))
-            _kdbg(
-                "prefill",
-                prompt_len=len(prompt_ids),
-                evicted_count=len(ev),
-                evicted_lo=(ev[0] if ev else None),
-                evicted_hi=(ev[-1] if ev else None),
-                restored_layers=rk_layers,
-                restored_layer_count=len(rk_layers),
-                full_kv=bool(self._full_kv),
-                cache=_kdbg_cache(self._cache),
-            )
-        except Exception:
-            pass
-        # #endregion
 
     def forward_block(self, tokens: Sequence[int]) -> Any:
         """Incremental verify of ``tokens`` against the restored cache. Returns
@@ -429,19 +291,7 @@ def commit_or_truncate(self, *, forwarded: int, accepted: int) -> None:
         drop = forwarded - accepted
         if drop > 0 and self._cache is not None:
             from mlx_lm.models.cache import trim_prompt_cache  # type: ignore
-            # #region agent log (Phase-1 H2: unsound trim on the hybrid cache)
-            _before = _kdbg_cache(self._cache)
-            trimmed = trim_prompt_cache(self._cache, drop)
-            _kdbg(
-                "trim",
-                drop=int(drop),
-                trimmed=(int(trimmed) if trimmed is not None else None),
-                short=(trimmed is not None and int(trimmed) < int(drop)),
-                past_len=int(self._past_len),
-                before=_before,
-                after=_kdbg_cache(self._cache),
-            )
-            # #endregion
+            trim_prompt_cache(self._cache, drop)
         self._past_len += accepted
 
     def append_token(self, token_id: int) -> Any:
@@ -552,7 +402,6 @@ def fused_specdecode_generate_mlx_trim(
     ctx_len = C
     try:
         while len(generated) < gen_tokens:
-            _kblk_t0 = time.perf_counter()  # agent log (Phase-1)
             L = min(block_size, gen_tokens - len(generated))
             base = adapter._past_len
             t_build = time.perf_counter()
@@ -591,23 +440,6 @@ def fused_specdecode_generate_mlx_trim(
             commit = check[:accepted]
             generated += commit
             accepts.append(accepted)
-            # #region agent log (Phase-1)
-            _kdbg(
-                "block",
-                loop="mlx_trim",
-                blk=len(accepts) - 1,
-                gen=len(generated),
-                past_len=adapter._past_len,
-                accepted=accepted,
-                L=int(check_ids.shape[0]),
-                dt_ms=round((time.perf_counter() - _kblk_t0) * 1e3, 1),
-                rep=_kdbg_rep(generated),
-                lost=_kdbg_lost(adapter._cache,
-                                getattr(adapter, "_dbg_restored_positions", set()),
-                                getattr(adapter, "_dbg_prompt_len", 0)),
-                cache=_kdbg_cache(adapter._cache),
-            )
-            # #endregion
             adapter.next_token_logits = next_row
             aux_rows = adapter._last_aux_mx
             # KEEP accepted (positions base..base+accepted-1), TRIM rejected.
@@ -870,7 +702,6 @@ def fused_specdecode_generate(
     fallback_to_greedy = False
     try:
         while len(generated) < gen_tokens:
-            _kblk_t0 = time.perf_counter()  # agent log (Phase-1)
             L = min(block_size, gen_tokens - len(generated))
             # H2 wrap fix (correctness-first): once the sliding RotatingKVCache
             # would wrap, a multi-token speculative block can leave rejected
@@ -882,8 +713,8 @@ def fused_specdecode_generate(
             # bonus token, so there is never a rejected tail to trim and offset
             # stays == past_len (matching the native greedy path, which stays
             # coherent past ms). Costs the speculative speedup past ms only.
-            _wrap_l1 = _sliding_ring_would_wrap(adapter._cache, L)
-            if _wrap_l1:
+            wrap_l1 = _sliding_ring_would_wrap(adapter._cache, L)
+            if wrap_l1:
                 L = 1
             cstart = adapter._past_len
             bonus = int(argmax_fn(adapter.next_token_logits))
@@ -940,27 +771,6 @@ def fused_specdecode_generate(
                 commit = candidate[:accepted] + [correction]
             generated += commit
             accepts.append(accepted)
-            # #region agent log (Phase-1)
-            _kdbg(
-                "block",
-                loop="torch_ftheta",
-                blk=len(accepts) - 1,
-                gen=len(generated),
-                gen_since_prompt=len(generated),
-                past_len=adapter._past_len,
-                accepted=accepted,
-                L=len(candidate),
-                wrap_l1=bool(_wrap_l1),
-                commit_ids=[int(t) for t in commit],
-                dt_ms=round((time.perf_counter() - _kblk_t0) * 1e3, 1),
-                rep=_kdbg_rep(generated),
-                sync=_kdbg_sync(adapter._cache, adapter._past_len),
-                lost=_kdbg_lost(adapter._cache,
-                                getattr(adapter, "_dbg_restored_positions", set()),
-                                getattr(adapter, "_dbg_prompt_len", 0)),
-                cache=_kdbg_cache(adapter._cache),
-            )
-            # #endregion
             if any(t in eos for t in commit):
                 break
             if (allow_greedy_fallback and len(accepts) >= 2
@@ -984,14 +794,6 @@ def fused_specdecode_generate(
     finally:
         adapter._capture_aux = False
     generated = generated[:gen_tokens]
-    # #region agent log (Phase-1: full token dump for offline fused-vs-native
-    # divergence comparison + final wide-window degeneration summary)
-    _kdbg("final", loop="torch_ftheta", n=len(generated),
-          blocks=len(accepts),
-          mean_accept_len=(round(sum(accepts) / len(accepts), 3) if accepts else 0.0),
-          rep_w128=_kdbg_rep(generated, k=128),
-          tokens=[int(t) for t in generated])
-    # #endregion
     return {
         "tokens": generated,
         "blocks": len(accepts),
diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py
index ec83bc0e..ecde21dd 100644
--- a/scripts/research/k3_integrated_niah_eval_mac.py
+++ b/scripts/research/k3_integrated_niah_eval_mac.py
@@ -181,12 +181,12 @@ def parse_args() -> argparse.Namespace:
                     help="Non-interactive chat: '||'-separated user turns "
                          "(for Mac-bridge verification); writes a transcript.")
     ap.add_argument("--chat-native-ref", action="store_true",
-                    help="DEBUG (Phase-1): before each scripted-chat turn, also "
-                         "run a plain NATIVE greedy AR decode of the SAME prompt "
-                         "for --max-new-tokens, emitting KDBG block_native/"
-                         "final_native rep metrics. This is the decisive control "
-                         "for greedy-pathology vs fused-engine degeneration: if "
-                         "the native reference degenerates at the same length, the "
+                    help="DIAGNOSTIC opt-in: before each chat turn, also run a "
+                         "plain NATIVE greedy AR decode of the SAME prompt for "
+                         "--max-new-tokens and capture it as native_ref_text in "
+                         "the turn record. This is the decisive A/B control for "
+                         "greedy-pathology vs fused-engine degeneration: if the "
+                         "native reference degenerates at the same length, the "
                          "fused engine is not the cause.")
     ap.add_argument("--force-f-theta", action="store_true",
                     help="Run f_θ restoration even under --s5-exact-full-attn "
@@ -221,7 +221,6 @@ def main() -> int:
         MLXRestoredIncrementalVerifier, capture_aux_hidden,
         make_bridge_embed_lm_head, fused_specdecode_generate,
         fused_specdecode_generate_mlx, fused_specdecode_generate_mlx_trim,
-        _kdbg, _kdbg_rep,  # Phase-1 degeneration instrumentation
     )
     from inference_engine.v04.kv_compressor import make_default_compressor
     from inference_engine.bench.k3_report_gate import (
@@ -771,32 +770,28 @@ def _encode_chat(history: List[Dict[str, str]]) -> List[int]:
                 return list(cids.tolist() if hasattr(cids, "tolist") else cids)
 
             def _gen_turn(pid: List[int]) -> Dict[str, Any]:
-                # #region agent log (Phase-1: native-greedy control on the SAME
-                # prompt — discriminates greedy pathology from fused-engine bug)
+                # Opt-in A/B control (--chat-native-ref): a plain NATIVE greedy
+                # AR decode of the SAME prompt for --max-new-tokens. Captured as
+                # res["native_ref_text"] so the fused answer can be compared
+                # against the native reference for the same turn (the decisive
+                # discriminator between greedy pathology and a fused-engine bug).
+                nref_text: Optional[str] = None
+                nref_tokens: List[int] = []
                 if args.chat_native_ref:
                     nref_cache, nref_logits = native_prefill(list(pid))
-                    nref_gen: List[int] = []
-                    while len(nref_gen) < args.max_new_tokens:
+                    while len(nref_tokens) < args.max_new_tokens:
                         tok = int(mx.argmax(nref_logits).item())
-                        nref_gen.append(tok)
+                        nref_tokens.append(tok)
                         if tok in end_ids:
                             break
                         out = mlx_model(mx.array([[tok]]), cache=nref_cache)
                         mx.eval(out)
                         nref_logits = out[0, -1]
-                        if len(nref_gen) % max(args.block_size, 1) == 0:
-                            _kdbg("block_native", ref="native_greedy",
-                                  gen=len(nref_gen), rep=_kdbg_rep(nref_gen))
                     try:
                         nref_text = tokenizer.decode(
-                            nref_gen, skip_special_tokens=True)
+                            nref_tokens, skip_special_tokens=True)
                     except TypeError:
-                        nref_text = tokenizer.decode(nref_gen)
-                    _kdbg("final_native", ref="native_greedy", n=len(nref_gen),
-                          rep_w128=_kdbg_rep(nref_gen, k=128),
-                          text=nref_text,
-                          tokens=[int(t) for t in nref_gen])
-                # #endregion
+                        nref_text = tokenizer.decode(nref_tokens)
                 rk, rv, tsrc = build_restoration(pid, prefill_native_s5=True)
                 # f_θ ran iff build_restoration produced restored banks via the
                 # torch drafter+f_θ (under --force-f-theta the S5 short-circuit is
@@ -851,6 +846,9 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                     if idx > 0:
                         txt = txt[:idx]
                 res["text"] = txt.strip()
+                if nref_text is not None:
+                    res["native_ref_text"] = nref_text.strip()
+                    res["native_ref_tokens"] = len(nref_tokens)
                 res["resident_kv_bytes"] = int(
                     sum(int(getattr(c, "nbytes", 0)) for c in (adapter._cache or [])))
                 return res
@@ -871,14 +869,18 @@ def _gen_turn(pid: List[int]) -> Dict[str, Any]:
                     history.append({"role": "assistant", "content": res["text"]})
                     tps = (res["decode_tokens"] / res["decode_s"]
                            if res["decode_s"] > 0 else 0.0)
-                    transcript.append({
+                    turn_rec = {
                         "user": u, "text": res["text"],
                         "tokens": res["decode_tokens"], "blocks": res["blocks"],
                         "mean_accept_len": res["mean_accept_len"],
                         "f_theta_ran": res["f_theta_ran"],
                         "f_theta_layers": res["f_theta_layers"],
                         "decode_s": res["decode_s"], "decode_tps": round(tps, 2),
-                        "resident_kv_bytes": res["resident_kv_bytes"]})
+                        "resident_kv_bytes": res["resident_kv_bytes"]}
+                    if "native_ref_text" in res:
+                        turn_rec["native_ref_text"] = res["native_ref_text"]
+                        turn_rec["native_ref_tokens"] = res["native_ref_tokens"]
+                    transcript.append(turn_rec)
                     print(f"[chat] USER {u!r}", file=sys.stderr, flush=True)
                     print(f"[chat] GEMMA-4 {res['text'][:200]!r} (blocks="
                           f"{res['blocks']}, accept_len={res['mean_accept_len']}, "

From f873482503ef8acced9b9e6a13f9ce3d443cc9f2 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 13:31:45 +0000
Subject: [PATCH 07/10] fix(mlx-fused): use getattr for adapter._cache (fixes 3
 control-flow test regressions) + unit-test wrap detector

The wrap guard accessed adapter._cache directly, which AttributeError'd on
adapters/fakes without a cache (regressing the 3 test_fused_loop_* tests).
Use getattr(adapter,'_cache',None); _sliding_ring_would_wrap already treats
None as 'no wrap'. Add focused unit tests for the detector (wrap True/False,
non-rotating, missing max_size, empty/None).

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 .../backends/mlx/fused_specdecode.py          |  3 +-
 tests/backends/mlx/test_fused_specdecode.py   | 34 +++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py
index 00cc9432..c14cfbc6 100644
--- a/inference_engine/backends/mlx/fused_specdecode.py
+++ b/inference_engine/backends/mlx/fused_specdecode.py
@@ -713,7 +713,8 @@ def fused_specdecode_generate(
             # bonus token, so there is never a rejected tail to trim and offset
             # stays == past_len (matching the native greedy path, which stays
             # coherent past ms). Costs the speculative speedup past ms only.
-            wrap_l1 = _sliding_ring_would_wrap(adapter._cache, L)
+            wrap_l1 = _sliding_ring_would_wrap(
+                getattr(adapter, "_cache", None), L)
             if wrap_l1:
                 L = 1
             cstart = adapter._past_len
diff --git a/tests/backends/mlx/test_fused_specdecode.py b/tests/backends/mlx/test_fused_specdecode.py
index de92aa7a..7c1676a1 100644
--- a/tests/backends/mlx/test_fused_specdecode.py
+++ b/tests/backends/mlx/test_fused_specdecode.py
@@ -127,6 +127,40 @@ def test_fused_loop_stops_on_eos():
     assert res["blocks"] == 1
 
 
+# =========================================================================== #
+# 1b) Sliding-ring wrap detector (H2 >ms degeneration fix). Pure-Python; no MLX.
+# =========================================================================== #
+class _FakeRotating:
+    """Minimal stand-in for ``mlx_lm`` ``RotatingKVCache`` (name match only)."""
+    def __init__(self, offset, max_size):
+        self.offset = offset
+        self.max_size = max_size
+
+
+class _FakePlainKV:
+    def __init__(self, offset):
+        self.offset = offset
+        self.max_size = None
+
+
+def test_sliding_ring_would_wrap_detects_wrap():
+    # offset + n_new >= max_size -> the rotating ring becomes non-trimmable.
+    cache = [_FakeRotating(offset=1022, max_size=1024)]
+    assert fsd._sliding_ring_would_wrap(cache, 4) is True
+    assert fsd._sliding_ring_would_wrap(cache, 1) is False  # 1023 < 1024
+
+
+def test_sliding_ring_would_wrap_ignores_non_rotating_and_missing_max():
+    # A plain (non-Rotating) layer never triggers the wrap guard.
+    assert fsd._sliding_ring_would_wrap([_FakePlainKV(offset=5000)], 8) is False
+    # A Rotating layer without a usable max_size is skipped, not crashed.
+    assert fsd._sliding_ring_would_wrap(
+        [_FakeRotating(offset=5000, max_size=None)], 8) is False
+    # Empty / None cache is safe.
+    assert fsd._sliding_ring_would_wrap(None, 4) is False
+    assert fsd._sliding_ring_would_wrap([], 4) is False
+
+
 # =========================================================================== #
 # 2) MLX-touching wrappers with fake mlx / mlx_lm.
 # =========================================================================== #

From dc8169c8ec25400232f880f4cf4d3eeafbc2ae65 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 13:39:32 +0000
Subject: [PATCH 08/10] fix(gate): remove disproven RESTORATION_COVERAGE rule;
 strengthen OUTPUT_DEGENERATE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Runtime evidence (Mac, 1300-tok run) disproved the 'restoration covers only
<= window decode tokens' theory: the decode cache is the native RotatingKVCache
(max_size~1024, not S5 window=64), and a run with 332 evicted-unrestored
positions stayed fully coherent once the trim-desync bug was fixed. So
tokens>window and even evicted>0 are NOT degeneration signals; the old rule was
a pure false-positive (would fail every coherent answer >64 tokens).

- Remove the RESTORATION_COVERAGE token>window rule.
- Add _has_runaway_substring: catches the newline-free 由于由于… collapse the
  line-based _looks_degenerate missed; conservative (>=8x tiled short unit) so
  templated 矿工 A/B/C enumerations do not false-fire.
- OUTPUT_DEGENERATE now = line-wall OR runaway-substring (empirical signal only).
- Update tests (100% coverage) and the self-correction methodology doc with the
  confirmed root cause + the verify-don't-trust-the-comment lesson.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 ...utonomous-iteration-and-self-correction.md | 82 ++++++++++++++-----
 inference_engine/bench/k3_report_gate.py      | 75 ++++++++++++-----
 .../bench/test_k3_report_gate.py              | 68 +++++++++------
 3 files changed, 162 insertions(+), 63 deletions(-)

diff --git a/docs/kakeya-autonomous-iteration-and-self-correction.md b/docs/kakeya-autonomous-iteration-and-self-correction.md
index ef897dae..9223a77c 100644
--- a/docs/kakeya-autonomous-iteration-and-self-correction.md
+++ b/docs/kakeya-autonomous-iteration-and-self-correction.md
@@ -34,7 +34,7 @@ How it slipped through — the **silent-fallback anti-pattern**, in its observed
 | B | f_θ bypassed under S5 ("free lunch" smoke opt) | "restoration engine" | `build_restoration` returns `{}`; no f_θ forward |
 | C | a proxy/plumbing run | "engine validated" | wrong model (Qwen3-4B), no trained f_θ/proposer, prompt inside window |
 | D | a simpler component shipped | "the engine" | verifier-only AR chat presented as the product |
-| E | long-decode degeneration | "the engine works (smoke passed)" | restoration covers only ≤ window decode tokens; a real long answer (780 tok ≫ 64) degenerated to garbage + throughput collapse (0.31 tok/s) — masked because every smoke answer was ≤ window |
+| E | long-decode degeneration | "the engine works (smoke passed)" | a long answer (>1024 tok) degenerated to a `由于由于…` loop — masked because every smoke answer was short. **Root cause (confirmed by debug loop, not the initial guess):** the fused spec-decode rollback's `trim_prompt_cache` silently fails once the native `RotatingKVCache` ring wraps at `max_size`≈1024, desyncing `cache.offset` from `past_len`. Fixed via single-token commits past the wrap. The *initial* hypothesis ("restoration only covers ≤ window=64") was disproved by runtime evidence — see §4b. |
 
 Common root cause: an agent (or optimization) chose the **easy/robust path** and
 **relabeled it as the hard one**, and no automated check asserted the intended
@@ -146,12 +146,26 @@ the output was garbage and throughput collapsed. So the gate **also** asserts th
 
 | Invariant | Manifest field | Gate assertion | Code |
 | --- | --- | --- | --- |
-| restoration covers the generation | `window`, per-turn `tokens` | restored run: `tokens <= window` (beyond it, evicted-during-decode positions are unrestored) | `RESTORATION_COVERAGE` |
-| output is not degenerate | per-turn `text` | no runaway repeat (≥8 identical short lines) | `OUTPUT_DEGENERATE` |
-
-Verified: a PoW-style report (`tokens=780 > window=64`, repeated `*   *   *`) now
-**fails** the walker (CI + on-device) with both codes. **Liveness proves the
-components ran; quality proves they produced a valid result — the gate needs both.**
+| output is not degenerate | per-turn `text` | no runaway repeat — ≥8 identical short lines **or** a 1–8 char unit tiled ≥8× at the tail (catches the newline-free `由于由于…` collapse) | `OUTPUT_DEGENERATE` |
+
+Verified: a PoW-style report (repeated `*   *   *` lines, or `"由于"×120` with no
+line breaks) **fails** the walker (CI + on-device); the real coherent long answer
+and templated `矿工 A/B/C` enumerations **pass**. **Liveness proves the components
+ran; quality proves they produced a valid result — the gate needs both.**
+
+**Correction (2026-06-17) — `RESTORATION_COVERAGE` removed.** An earlier gate
+fired when a restored run generated more tokens than the S5 `window` (=64), on the
+theory that decode-time evicted positions are "unrestored" and the output beyond
+the window must degenerate. **Mac runtime evidence disproved that theory** (see
+the §"long-decode degeneration" root-cause below): the decode cache is the model's
+native hybrid cache (sliding `RotatingKVCache` with `max_size`≈1024, not the S5
+window), so nothing is evicted until ~1024 tokens; and a 1300-token run with **332
+evicted-unrestored positions stayed fully coherent** once the *actual* bug was
+fixed. "tokens > window" and even "evicted > 0" are not degeneration signals, so
+the rule was a pure false-positive (it would have failed every coherent
+answer > 64 tokens). The only trustworthy quality gate is the **empirical** one:
+`OUTPUT_DEGENERATE`. This is itself an instance of the North-Star discipline —
+*verify against runtime, never trust a plausible code comment/hypothesis.*
 
 ---
 
@@ -208,18 +222,48 @@ proposer live (`blocks=2/4`, `accept_len=4.0/3.5`), f_θ live by default
 (`f_theta_ran=TRUE`, 25 sliding layers), correct answers, bounded KV, natural EOS
 stop. One-command launcher: `scripts/run_kakeya_mac.sh`. (PR #144 + this PR.)
 
-**Known limitation (anti-pattern E, found 2026-06-17):** the Mac fused engine's
-restoration is **prefill-amortized for the prompt only** — it covers ≤ `window`
-decode tokens (code comment, `k3_integrated_niah_eval_mac.py` §"Per-sample
-restoration"). Generations longer than the window degenerate (garbage + throughput
-collapse + KV growth). The §4b gate now **fails loud** on it; the *fix* is
-**continuous decode-time restoration** (re-restore positions evicted during decode,
-as the CUDA engine does) — the real open engineering work, not a gate matter.
-
-**Open / next:** (1) continuous decode-time restoration so long generations don't
-degenerate (the engine fix); (2) full-attention model (Qwen/Llama) where f_θ is
-load-bearing for the large memory win. The gate (§4/§4b) now prevents silent
-regression to verifier-only AND silent long-decode degeneration.
+**Long-decode degeneration — root cause found and FIXED (2026-06-17).** The
+originally-hypothesised cause (anti-pattern E: "restoration covers only ≤ `window`
+decode tokens") was **wrong**, and the debug loop disproved it with runtime
+evidence — a textbook case of *verify, don't trust the comment*:
+
+1. **Characterization (128 → 800 → 1300 tokens, Mac M4, prompt "请详细解释POW的工作原理"):**
+   - The decode cache is the model's **native hybrid cache** — sliding layers are
+     `RotatingKVCache` (`max_size`=1024, `keep`=0), full layers are `KVCache`. The
+     S5 `--window-size 64` only feeds the analytical memory math; it does **not**
+     bound the decode cache. So nothing is evicted until ~1024 tokens.
+   - At 128 and 800 tokens the fused output was **fully coherent** (`max_run=1`);
+     `lost=0`; the hypothesis predicted failure at 64 — disproved.
+   - At **1300 tokens** the fused engine **degenerated** into a `由于由于…` loop
+     (`cyc_frac=1.0`) starting at gen≈1064 — *only after the ring wrapped at
+     gen≈1017*. The **native-greedy control on the same prompt stayed coherent**
+     past the wrap (terminated cleanly at gen 1247), proving the model
+     handles >1024 fine and the **fused engine** was at fault.
+2. **Root cause:** once the sliding `RotatingKVCache` ring wraps (`offset ≥ max_size`),
+   `mlx_lm.trim_prompt_cache` is **all-or-nothing and refuses** (a rotating layer is
+   `is_trimmable` only while `offset < max_size`). The fused speculative loop's
+   rejected-draft rollback then silently fails — 15 `trim short:true` events — so
+   `cache.offset` ran **+8 ahead of the committed `past_len`** on every post-wrap
+   block, misaligning RoPE/causal masking → logit corruption → collapse.
+3. **Fix (`fused_specdecode.py`, `_sliding_ring_would_wrap` + `if wrap_l1: L=1`):**
+   detect the impending wrap and commit **single-token blocks** past it. With L=1
+   the bonus token is always accepted (it *is* `argmax(next_token_logits)`), so
+   there is never a rejected tail to trim and `offset` stays `== past_len`.
+4. **Validated (re-run, 1300 tokens):** `trim short:true` 15→0; post-wrap
+   offset-desync 76/76→0; post-wrap `cyc_frac` 1.0→0.158; fused output **coherent**,
+   clean termination at gen 1241 — matching the native control. (Cost: spec-decode
+   speedup is forgone past `max_size`; correctness-first.)
+
+So eviction past `max_size` is **normal and harmless** (it is gemma's native
+sliding-window behavior); "continuous decode-time restoration" is **not** required
+for coherent generation within the model's context length. The §4b gate now keys purely on the empirical
+`OUTPUT_DEGENERATE` signal (above).
+
+**Open / next:** (1) optional perf: a sound *wrapped-ring rollback* (snapshot/restore
+of the rotating cache) to keep speculative speedup past `max_size` — pure throughput,
+not correctness; (2) full-attention model (Qwen/Llama) where f_θ is load-bearing for
+the large memory win. The gate (§4/§4b) prevents silent regression to verifier-only
+AND silent long-decode degeneration.
 
 > Maintenance: append to §7 every iteration; update §4 if new components/
 > invariants appear; never delete the §1 failure record — it is the reason for §0.
diff --git a/inference_engine/bench/k3_report_gate.py b/inference_engine/bench/k3_report_gate.py
index a3ebdfd8..269d2cf1 100644
--- a/inference_engine/bench/k3_report_gate.py
+++ b/inference_engine/bench/k3_report_gate.py
@@ -187,8 +187,18 @@ def assert_liveness(report: Dict[str, Any]) -> List[GateViolation]:
 
 def _looks_degenerate(text: Any) -> bool:
     """True when text has collapsed into a runaway repeat — the long-decode
-    failure mode (e.g. many identical short lines like ``*   *   *``). Strict:
-    >= 8 consecutive identical stripped non-empty lines of <= 12 chars."""
+    failure mode. Two independent, conservative detectors:
+
+    1. Line-level: >= 8 consecutive identical stripped non-empty lines of
+       <= 12 chars (e.g. ``*   *   *`` walls).
+    2. Char-level (``_has_runaway_substring``): a short unit (1..8 chars) tiled
+       >= 8 times consecutively at the tail with no line breaks — the exact
+       ``由于由于由于…`` collapse observed past the KV-cache ring wrap, which the
+       line detector misses because it has no newlines.
+
+    Both require a long, unambiguous run so legitimate templated/parallel text
+    (numbered lists, ``矿工 A / 矿工 B`` enumerations) does NOT trip the gate.
+    """
     if not isinstance(text, str):
         return False
     run = 0
@@ -204,39 +214,64 @@ def _looks_degenerate(text: Any) -> bool:
         else:
             run = 0
             prev = line
+    return _has_runaway_substring(text)
+
+
+def _has_runaway_substring(text: Any) -> bool:
+    """True when the tail is a short unit repeated many times consecutively
+    (e.g. ``"由于" * 100``). Conservative: requires a 1..8 char unit (with at
+    least one non-space char) tiled >= 8 times AND spanning >= 16 chars at the
+    very end of the (stripped) text. Templated prose / parallel enumerations
+    never end in such a run. Known limit: text legitimately ending in a long
+    divider or ellipsis run (``-`` * 16, ``.`` * 16) is flagged as well."""
+    if not isinstance(text, str):
+        return False
+    s = text.strip()
+    if len(s) < 16:
+        return False
+    # ``s`` is stripped, so the tail always ends in a non-whitespace char and a
+    # whitespace-only unit is impossible; no need to guard against it.
+    tail = s[-256:]
+    n = len(tail)
+    for p in range(1, 9):
+        unit = tail[n - p:]
+        k = 0
+        idx = n
+        while idx - p >= 0 and tail[idx - p:idx] == unit:
+            k += 1
+            idx -= p
+        if k >= 8 and p * k >= 16:
+            return True
     return False
 
 
 def assert_quality(report: Dict[str, Any]) -> List[GateViolation]:
-    """§2.4/§2.5 contract — prove the run did not lose intelligence or throughput.
+    """§2.4/§2.5 contract — prove the run did not lose intelligence.
 
     Catches the long-decode failure the liveness gate cannot see (proposer/f_θ
-    ran, yet the output is garbage + throughput collapsed):
-      * RESTORATION_COVERAGE — a restored run generated more tokens than the
-        resident window, beyond which the prefill-amortized restoration does NOT
-        cover the evicted positions (outputs become unrestored/degenerate),
+    ran, yet the output is garbage):
       * OUTPUT_DEGENERATE — a turn's text collapsed into a runaway repeat.
+
+    NOTE on the removed RESTORATION_COVERAGE rule: an earlier version fired when
+    a restored run generated more tokens than the S5 sliding ``window`` (=64),
+    on the theory that decode-time evicted positions are "unrestored" and the
+    output beyond the window must be degenerate. Mac runtime evidence DISPROVED
+    that theory: (a) the decode cache is the model's native hybrid cache
+    (sliding ``RotatingKVCache`` with ``max_size``≈1024, not the S5 window), so
+    nothing is evicted until ~1024 tokens; and (b) a 1300-token run with 332
+    evicted-unrestored positions stayed fully COHERENT once the real bug — a
+    ``trim_prompt_cache`` offset desync on the wrapped ring — was fixed
+    (single-token commits past the wrap). So "tokens > window" and even
+    "evicted positions > 0" are NOT degeneration signals. The only trustworthy
+    quality gate is the empirical one: did the text actually collapse?
     """
     violations: List[GateViolation] = []
     turns = report.get("turns")
     if not isinstance(turns, list) or not turns:
         return violations
-    window = report.get("window")
-    restored = report.get("f_theta_intended") is True
     for i, t in enumerate(turns):
         if not isinstance(t, dict):
             continue
-        toks = t.get("tokens")
-        if (restored and isinstance(window, int) and window > 0
-                and isinstance(toks, (int, float)) and not isinstance(toks, bool)
-                and int(toks) > window):
-            violations.append(GateViolation(
-                "RESTORATION_COVERAGE",
-                f"turn {i} generated {int(toks)} tokens > resident window "
-                f"{window}: the prefill-amortized restoration covers only "
-                "<= window decode tokens; positions evicted during decode are "
-                "UNRESTORED, so the output beyond the window is degenerate",
-            ))
         if _looks_degenerate(t.get("text")):
             violations.append(GateViolation(
                 "OUTPUT_DEGENERATE",
diff --git a/tests/inference_engine/bench/test_k3_report_gate.py b/tests/inference_engine/bench/test_k3_report_gate.py
index b94014f5..3157ced2 100644
--- a/tests/inference_engine/bench/test_k3_report_gate.py
+++ b/tests/inference_engine/bench/test_k3_report_gate.py
@@ -125,7 +125,8 @@ def test_liveness_silent_fallback_report_and_turn_level():
 # §2.4/§2.5 quality contract (degeneration / restoration coverage)
 # ---------------------------------------------------------------------------
 
-from inference_engine.bench.k3_report_gate import assert_quality, _looks_degenerate
+from inference_engine.bench.k3_report_gate import (
+    assert_quality, _looks_degenerate, _has_runaway_substring)
 
 
 def _quality_report(turns, window=64, restored=True):
@@ -141,35 +142,42 @@ def test_quality_passes_clean_short_turn():
     assert assert_quality(rep) == []
 
 
-def test_quality_restoration_coverage_exceeded():
-    # the PoW failure: restored run generated way past the window
-    rep = _quality_report([{"tokens": 780, "text": "ok"}], window=64, restored=True)
-    codes = {v.code for v in assert_quality(rep)}
-    assert "RESTORATION_COVERAGE" in codes
-    # and it surfaces through validate_report (dispatch wires assert_quality)
-    assert any(v.code == "RESTORATION_COVERAGE" for v in validate_report(rep))
-
-
-def test_quality_no_coverage_check_when_not_restored():
-    # all-MLX path (f_θ bypassed): no restoration to exceed.
-    rep = _quality_report([{"tokens": 780, "text": "ok"}], window=64, restored=False)
-    assert all(v.code != "RESTORATION_COVERAGE" for v in assert_quality(rep))
+def test_quality_long_coherent_run_does_not_fire():
+    # Runtime evidence (Mac, 1300 tokens, 332 evicted-unrestored positions) shows
+    # a restored run far past the S5 window stays coherent once the trim-desync
+    # bug is fixed. The gate must NOT fire on token count / eviction alone.
+    long_text = ("Proof of Work is a consensus mechanism. " * 60).strip()
+    rep = _quality_report([{"tokens": 1300, "text": long_text}],
+                          window=64, restored=True)
+    assert assert_quality(rep) == []
 
 
-def test_quality_coverage_skipped_without_window():
-    rep = _quality_report([{"tokens": 780, "text": "ok"}], window=None)
-    assert all(v.code != "RESTORATION_COVERAGE" for v in assert_quality(rep))
+def test_quality_output_degenerate_line_wall():
+    garbage = "Answer:\n" + "\n".join(["*   *   *"] * 12)
+    rep = _quality_report([{"tokens": 50, "text": garbage}])
+    assert any(v.code == "OUTPUT_DEGENERATE" for v in assert_quality(rep))
+    # and it surfaces through validate_report (dispatch wires assert_quality)
+    assert any(v.code == "OUTPUT_DEGENERATE" for v in validate_report(rep))
 
 
-def test_quality_bool_tokens_not_counted():
-    rep = _quality_report([{"tokens": True, "text": "ok"}], window=64)
-    assert all(v.code != "RESTORATION_COVERAGE" for v in assert_quality(rep))
+def test_quality_output_degenerate_runaway_substring_no_newlines():
+    # the exact collapse seen past the ring wrap (> max_size): a short unit
+    # repeated with no line breaks, which the line-level detector cannot see.
+    garbage = "PoW 的核心是" + "由于" * 120
+    rep = _quality_report([{"tokens": 1300, "text": garbage}])
+    assert any(v.code == "OUTPUT_DEGENERATE" for v in assert_quality(rep))
 
 
-def test_quality_output_degenerate_detected():
-    garbage = "Answer:\n" + "\n".join(["*   *   *"] * 12)
-    rep = _quality_report([{"tokens": 50, "text": garbage}])
-    assert any(v.code == "OUTPUT_DEGENERATE" for v in assert_quality(rep))
+def test_quality_templated_enumeration_does_not_fire():
+    # the real PoW answer's templated section (矿工 A/B/C with DIFFERENT nonces)
+    # has high autocorrelation but must NOT be flagged as degenerate.
+    templated = (
+        "矿工 A 尝试：Hash(数据 + 1) = 0xabc (不符合要求)\n"
+        "矿工 B 尝试：Hash(数据 + 2) = 0xdef (不符合要求)\n"
+        "矿工 C 运气好：Hash(数据 + 999) = 0x000 (符合要求！)\n"
+        "一旦有人找到了满足条件的哈希值，他就挖到了矿。")
+    rep = _quality_report([{"tokens": 200, "text": templated}])
+    assert assert_quality(rep) == []
 
 
 def test_quality_empty_and_nondict_turns():
@@ -191,6 +199,18 @@ def test_looks_degenerate_helper():
     assert _looks_degenerate("x\n\ny\nz\nw\nq\nr\ns") is False
 
 
+def test_has_runaway_substring_helper():
+    assert _has_runaway_substring("由于" * 50) is True          # 2-char unit x50
+    assert _has_runaway_substring("ok " + "a" * 40) is True     # 1-char unit
+    assert _has_runaway_substring(123) is False
+    assert _has_runaway_substring("short") is False             # < 16 chars
+    # coherent prose ending differently is not flagged
+    assert _has_runaway_substring(
+        "Proof of Work secures the chain through energy expenditure.") is False
+    # only 5 repeats (< 8) of a multi-char unit do not trip
+    assert _has_runaway_substring("prefix " + "ab" * 5) is False
+
+
 def _valid_report(n: int = MIN_PERF_SAMPLES) -> Dict[str, Any]:
     """A schema-2 report that passes every rule."""
     cross_rows = [

From 72083180634f46d95140da4f8f75eca5103efc4f Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 13:40:57 +0000
Subject: [PATCH 09/10] docs(manifest): degen-probe is now a long-decode
 regression guard (drop stale KDBG mention)

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 inference_engine/bridge/manifest.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index ba34dfc0..8b67da71 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -773,11 +773,14 @@ def _harness_preset(
         ),
         Preset(
             name="mlx-kakeya-degen-probe",
-            description="DEBUG (Phase-1): full f_θ fused engine on a LONG prompt "
-                        "(--ignore-turn-stop, default 256 tokens) to characterize "
-                        "the long-decode degeneration onset. Emits KDBG NDJSON to "
-                        "stderr (captured in the bridge log) + transcript JSON. NOT "
-                        "gated — the degeneration is the thing being measured.",
+            description="Long-decode regression probe: full f_θ fused engine on a "
+                        "LONG generation (--ignore-turn-stop) past the native "
+                        "RotatingKVCache ring wrap (max_size~1024), with a "
+                        "native-greedy control (--chat-native-ref) on the same "
+                        "prompt for an A/B coherence comparison. Guards the "
+                        "wrapped-ring trim-desync fix: at >=1300 tokens the fused "
+                        "output must stay coherent (match native), not collapse "
+                        "into a runaway repeat.",
             command_templates=(
                 (
                     "python3", "scripts/research/k3_integrated_niah_eval_mac.py",

From e62b9bf4bdc5c4f359f453a5e9600c2a429382e4 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 17 Jun 2026 14:43:34 +0000
Subject: [PATCH 10/10] =?UTF-8?q?docs(skill):=20add=20long-decode=20degene?=
 =?UTF-8?q?ration=20debugging=20case=20study=20as=20a=20reusable=20templat?=
 =?UTF-8?q?e=20(=C2=A77)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- §5: add the RotatingKVCache-wrap degeneration bug row.
- New §7: worked hypothesis-driven debugging template (reproduce at increasing
  scale, native-greedy A/B control, instrument the indicated mechanism, fix
  correctness-first, re-validate) + two generalizable lessons (runtime evidence
  overrides plausible hypotheses; gate on observed outcomes not theorized
  proxies).
- Renumber Validation §7->§8 and Pointers §8->§9; update cross-refs.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
---
 docs/kakeyainferenceenginebuildskill.md | 110 +++++++++++++++++++++---
 1 file changed, 98 insertions(+), 12 deletions(-)

diff --git a/docs/kakeyainferenceenginebuildskill.md b/docs/kakeyainferenceenginebuildskill.md
index e9093ea4..38a40c82 100644
--- a/docs/kakeyainferenceenginebuildskill.md
+++ b/docs/kakeyainferenceenginebuildskill.md
@@ -7,8 +7,9 @@ v0.5-cuda) into: what the engine is, where the code lives, how to run/benchmark
 it, the milestone roadmap, the hard-won bugs+fixes, and — most important — the
 **validation honesty standards** (the rules that keep claims defensible).
 
-> If you only read one section, read **§7 Validation & honesty standards**. The
-> most expensive mistakes in this project were *overclaims*, not bugs.
+> If you only read one section, read **§8 Validation & honesty standards**. The
+> most expensive mistakes in this project were *overclaims*, not bugs. For the
+> debugging method, **§7 is a reusable worked template.**
 
 ---
 
@@ -142,7 +143,7 @@ Port lessons: `docs/mlx-port-lessons.md`.
 | **KIE-v1.1.z** (#139) | throughput + N=75 | **N=75 MET** (recall 1.0, 126.7 GB, ~4.8× vLLM; ~31 tok/s aggregate); **decode ≥ vLLM NOT met** (eager 26B-MoE wall) |
 | **KIE-v1.1.z2** | rebuild fused-MoE + graph forward | **abandoned** — superseded by KIE-v2 (run *on* vLLM) |
 | **KIE-v2** (#140) | **Kakeya Attention on vLLM** | decode **≥ vLLM (1.15–1.23×)** @16k, recall 1.0, measured to N=70 — inherits vLLM runtime |
-| **v0.5-cuda** (#141) | release `KakeyaVLLM` + consolidated reports | done (gemma-4 instantiation). Product concurrency claim = **`KakeyaVLLM` N→70 @16k** on vLLM; the **N=75 @62k is the *eager* `KakeyaEngine` substrate**, not the v0.5 product path — do not conflate. See §7 for exact validation scope |
+| **v0.5-cuda** (#141) | release `KakeyaVLLM` + consolidated reports | done (gemma-4 instantiation). Product concurrency claim = **`KakeyaVLLM` N→70 @16k** on vLLM; the **N=75 @62k is the *eager* `KakeyaEngine` substrate**, not the v0.5 product path — do not conflate. See §8 for exact validation scope |
 | **v0.6** (= ADR 0015 KIE-v1.2) | **restoration backend on full-attention models** (Qwen/Llama): train f_θ/proposer + inject restoration at vLLM prefill + graph-capturable quantized-exact kernel | **planned — the real memory differentiator (~6×)** |
 
 > **N=16 vs N=24 (KIE-v1.1 precaution).** The evicting StaticCache alone at the
@@ -168,6 +169,7 @@ Port lessons: `docs/mlx-port-lessons.md`.
 | `torch.compile` attention 6.6× but **0% e2e decode gain** | decode dominated by **eager 26B-MoE full-model forward**, not attention | need fused-MoE + full-forward graph capture → that's vLLM's job → **KIE-v2** |
 | fused-MoE port blocked | HF `kernels` incompatible w/ transformers 5.12; vLLM `fused_moe` cross-venv surgery; from-scratch = multi-week | **run Kakeya ON vLLM** instead of rebuilding it (KIE-v2) |
 | `KakeyaVLLM` crash on text-only model | unconditional `text_config` nesting (gemma multimodal) breaks Qwen/Llama (`num_attention_heads` missing) | **auto-detect** `text_config` via `AutoConfig`: nested for gemma-4, flat for Qwen/Llama |
+| MLX fused engine **long-decode degeneration** (`由于由于…` runaway past ~1024 tok, throughput collapse) | once the native sliding `RotatingKVCache` ring **wraps** (`offset ≥ max_size`≈1024), `mlx_lm.trim_prompt_cache` refuses the spec-decode rejected-draft rollback (all-or-nothing; `is_trimmable` needs `offset < max_size`) → un-trimmed rejects leave `cache.offset` **+8 ahead of `past_len`** → RoPE/mask desync → logit corruption | detect the impending wrap (`_sliding_ring_would_wrap`) and commit **single-token blocks** past it (`L=1`): the bonus is always accepted, so there's no rejected tail to trim and `offset` stays `== past_len`. **Full worked template in §7.** |
 
 ---
 
@@ -189,12 +191,96 @@ Port lessons: `docs/mlx-port-lessons.md`.
 
 ---
 
-## 7. Validation & honesty standards (READ THIS)
+## 7. Worked case study: debugging the long-decode degeneration (a TEMPLATE)
+
+This is the **model example** of how to debug a non-obvious runtime bug in this
+project. Reuse the *shape* of this process for any "it works in smoke tests but
+breaks in the real workload" bug. The actual fix is the `RotatingKVCache`-wrap
+row in §5; what follows is the **method**, written so it transfers.
+
+### 7.A The symptom
+Mac (MLX) fused spec-decode engine produced **garbage on long answers**: a long
+reply (e.g. "请详细解释POW的工作原理") started coherent, then collapsed into a runaway
+repeat (`由于由于由于…`) with throughput falling off. Short answers were fine, so it
+had slipped through every smoke test.
+
+### 7.B The process (the reusable template)
+
+> **Golden rule (this project's §6 principle made concrete): never fix from code
+> alone. Reproduce → instrument → measure → let runtime evidence pick the
+> hypothesis. Be ready to have your first hypothesis killed by the data.**
+
+1. **Write down the initial hypothesis — then try to disprove it, not confirm it.**
+   Initial guess (from a code comment): "restoration only covers ≤ `window`=64
+   decode tokens, so output past 64 is unrestored → degenerate." Plausible, and
+   **wrong**. Treat plausible hypotheses as suspects, not conclusions.
+2. **Reproduce at increasing scale, on the real device, with one fixed prompt.**
+   Drive the Mac M4 via the bridge (`mlx-kakeya-degen-probe` preset). Sweep the
+   one variable that matters (generation length):
+
+   | run | length | result | inference |
+   | --- | --- | --- | --- |
+   | 1 | 128 tok | coherent | kills "fails at window=64"; also reveals the decode cache is the model's **native `RotatingKVCache` (`max_size`≈1024)**, *not* the S5 window |
+   | 2 | 800 tok | coherent | failure is past 800 → keep going |
+   | 3 | 1300 tok | **degenerates** at gen≈1064 | reproduced; onset is right after the ring **wraps** at gen≈1017 |
+
+3. **Add a discriminating control (the single highest-value step).** In run 3,
+   also decode the **same prompt with a plain native-greedy loop** (`--chat-native-ref`)
+   as an A/B. Native stayed **fully coherent** past the wrap (clean stop @ 1247) →
+   *the model handles >1024 fine; the fused engine corrupts it.* A control that
+   isolates "your code" from "the model/library" is worth more than ten more logs.
+4. **Instrument the exact mechanism the data now points at.** NDJSON per-block
+   logs of cache `offset` vs committed `past_len`, and of every `trim_prompt_cache`
+   call. Smoking gun: after the wrap, `offset` ran **+8 ahead of `past_len`** on
+   every block, with **15 "trim refused" events** — only post-wrap.
+5. **State the root cause mechanistically** (see §5 row): wrapped ring →
+   `trim_prompt_cache` refuses → rejected drafts linger → offset/`past_len` desync
+   → RoPE/mask misalignment → logit corruption.
+6. **Fix correctness-first**, then re-run the *identical* probe and show the
+   metrics move the right way:
+
+   | signal | before | after |
+   | --- | --- | --- |
+   | "trim refused" events | 15 | **0** |
+   | post-wrap offset desync | 76/76 blocks | **0/225** |
+   | repetition `cyc_frac` | 1.0 (collapse) | **0.158** |
+   | final text | `由于…` runaway | **coherent**, clean stop @ 1241 (as coherent as the native control) |
+
+### 7.C Two lessons that generalize (the "样板", i.e. template, payload)
+
+- **L1 — runtime evidence overrides plausible hypotheses (and code comments).**
+  The comment-derived "≤ window restoration coverage" theory was disproved by run 1
+  (128 tok coherent) and run 3's native control (332 positions evicted, output still coherent).
+  Eviction past `max_size` is *normal* (native sliding-window behavior), not a
+  degeneration cause. **Always verify the assumption against a run before building
+  on it.**
+- **L2 — a gate built on a wrong hypothesis is a false-positive factory.** A
+  `RESTORATION_COVERAGE` quality gate had shipped that fired on `tokens > window`.
+  Once L1 disproved the theory, that gate was shown to flag **every** coherent
+  restored-run answer > 64 tokens. It was removed; the quality gate now keys only on the
+  **empirical** signal (did the text actually collapse? — `_has_runaway_substring`
+  catches the newline-free `由于…` case, and is conservative enough to *not* trip on
+  legitimate templated text like `矿工 A/B/C` enumerations). **Gate on observed
+  outcomes, not on theorized proxies.**
+
+### 7.D Pointers
+- Fix + control flag: `inference_engine/backends/mlx/fused_specdecode.py`
+  (`_sliding_ring_would_wrap`), `scripts/research/k3_integrated_niah_eval_mac.py`
+  (`--chat-native-ref`).
+- Corrected gate: `inference_engine/bench/k3_report_gate.py`
+  (`assert_quality`, `_has_runaway_substring`).
+- Full narrative + the disproved-hypothesis timeline:
+  `docs/kakeya-autonomous-iteration-and-self-correction.md` (§"long-decode
+  degeneration"). PR #146.
+
+---
+
+## 8. Validation & honesty standards (READ THIS)
 
 The single most damaging error pattern in this project is **overclaiming a
 validation**. Follow these rules rigidly.
 
-### 7.1 What counts as validating "the engine" vs "the plumbing"
+### 8.1 What counts as validating "the engine" vs "the plumbing"
 
 - **Engine/algorithm validation** = the actual claim (recall, memory, throughput)
   measured **on the release model, through the release code path, exercising the
@@ -203,9 +289,9 @@ validation**. Follow these rules rigidly.
   generates" — proves the code runs, proves **nothing** about the algorithm.
 - **Label every artifact as one or the other.** Never let a smoke test masquerade
   as engine validation. (Case study: a Qwen3-4B run of `KakeyaVLLM` was wrongly
-  presented as "end-to-end validation". It was plumbing-only — see §7.3.)
+  presented as "end-to-end validation". It was plumbing-only — see §8.3.)
 
-### 7.2 The Gemma-4 "S5 free lunch" — and why it does NOT generalize
+### 8.2 The Gemma-4 "S5 free lunch" — and why it does NOT generalize
 
 - On **gemma-4-26B-A4B**, recall is **1.0 at `sliding_window=68` with NO
   restoration**, because **5 of 30 layers are native full-attention and carry
@@ -218,7 +304,7 @@ validation**. Follow these rules rigidly.
   recall** — so restoration is the *only* way to bound memory at full recall, and
   vLLM (no restoration) must keep full KV.
 
-### 7.3 HARD RULE: never validate Kakeya Attention on a model without trained f_θ/proposer
+### 8.3 HARD RULE: never validate Kakeya Attention on a model without trained f_θ/proposer
 
 A bounded window **without** trained restoration is **naive truncation, not Kakeya
 Attention.** On a full-attention model with no trained f_θ/proposer:
@@ -228,9 +314,9 @@ Attention.** On a full-attention model with no trained f_θ/proposer:
 
 So you **cannot** demonstrate the engine on such a model. The v0.6 work is exactly
 "train f_θ/proposer for a full-attention model **then** validate". Until then, the
-only defensible engine evidence is gemma-4 (§7.2).
+only defensible engine evidence is gemma-4 (§8.2).
 
-### 7.4 Decode-speed honesty
+### 8.4 Decode-speed honesty
 
 - The **eager `KakeyaEngine`** wins memory/concurrency but is slow at decode
   (~25–31 tok/s aggregate; the eager 26B-MoE forward dominates). Report decode-only
@@ -241,7 +327,7 @@ only defensible engine evidence is gemma-4 (§7.2).
   inherits vLLM's fused-MoE + CUDA graphs + scheduler. Don't claim product decode
   speed from the eager engine.
 
-### 7.5 Checklist before writing "validated" anywhere
+### 8.5 Checklist before writing "validated" anywhere
 
 1. Did the **release code path** run (not a side script that approximates it)?
 2. Was the claim's **mechanism actually exercised** (restoration ran? eviction
@@ -257,7 +343,7 @@ If any answer is "no", write the weaker, true claim.
 
 ---
 
-## 8. Pointers
+## 9. Pointers
 
 - North star + algorithm + milestones: `docs/adr/0015-kakeya-attention-and-engine-substrate.md`
 - Engine architecture: `docs/design/kakeya-inference-engine-architecture.md`