diff --git a/inference_engine/backends/mlx/cross_model_dlm_verifier.py b/inference_engine/backends/mlx/cross_model_dlm_verifier.py
index a487b726..51b42ee8 100644
--- a/inference_engine/backends/mlx/cross_model_dlm_verifier.py
+++ b/inference_engine/backends/mlx/cross_model_dlm_verifier.py
@@ -376,6 +376,7 @@ def restored_prefill_cache(
     restored_v_per_layer: Dict[int, Any],
     evicted_positions: Sequence[int],
     prefill_chunk_size: int = 0,
+    cache_factory: Optional[Callable[[Any], Any]] = None,
 ):
     """Prefill ONCE with restoration, capturing the restored K/V into a
     persistent mlx_lm prompt cache; return ``(cache, last_logits)``.
@@ -397,7 +398,15 @@ def restored_prefill_cache(
     text_model = resolve_mlx_text_model(mlx_model)
     T = len(list(input_ids))
     evicted = set(int(p) for p in evicted_positions if 0 <= int(p) < T)
-    cache = make_prompt_cache(mlx_model)
+    # cache_factory lets the caller swap the model's native hybrid cache for an
+    # all-`KVCache` layout (full store for sliding layers too) so that the
+    # spec-decode accept/reject rollback can use mlx_lm's native, SOUND
+    # `trim_prompt_cache` (keep accepted K/V, drop only rejected) instead of the
+    # full re-forward carry — `RotatingKVCache` is not trimmable once wrapped.
+    # Sliding attention stays byte-exact: the window mask is applied regardless
+    # of cache capacity. (Costs O(T) sliding KV during decode; fine for the
+    # short-context code/agent workloads this targets.)
+    cache = (cache_factory or make_prompt_cache)(mlx_model)
 
     def _slice_restored(a, start: int, end: int):
         if a is None:
diff --git a/inference_engine/backends/mlx/fused_specdecode.py b/inference_engine/backends/mlx/fused_specdecode.py
index d22a0eb3..47e5febc 100644
--- a/inference_engine/backends/mlx/fused_specdecode.py
+++ b/inference_engine/backends/mlx/fused_specdecode.py
@@ -157,6 +157,7 @@ def __init__(
         self._last_aux_mx: Optional[List[Any]] = None
         self._capture_aux = False
         self._block_snapshot: Optional[List[Dict[str, Any]]] = None
+        self._full_kv = False
 
     def reset(self) -> None:
         self._cache = None
@@ -174,16 +175,22 @@ def prefill(
         restored_v_per_layer: Dict[int, Any],
         evicted_positions: Sequence[int],
         prefill_chunk_size: int = 0,
+        full_kv: bool = False,
     ) -> None:
         if not prompt_ids:
             raise ValueError("prompt_ids must be non-empty")
         self.reset()
+        # full_kv=True → all-`KVCache` layout so accept/reject rollback can use
+        # SOUND native trim (keep accepted, drop rejected) with no re-forward.
+        self._full_kv = bool(full_kv)
+        factory = make_full_kv_prompt_cache if full_kv else None
         self._cache, self.next_token_logits = restored_prefill_cache(
             self.mlx_model, list(prompt_ids),
             restored_k_per_layer=restored_k_per_layer,
             restored_v_per_layer=restored_v_per_layer,
             evicted_positions=evicted_positions,
             prefill_chunk_size=prefill_chunk_size,
+            cache_factory=factory,
         )
         self._past_len = len(prompt_ids)
 
@@ -336,6 +343,143 @@ def lm_head_fn(h: Any) -> Any:
 # Single-sync all-MLX fused loop (levers ① ② ③ of the Step-2 throughput plan;
 # docs/mlx-port-lessons.md "Step-2 rescue status").
 # --------------------------------------------------------------------------- #
+def make_full_kv_prompt_cache(mlx_model: Any) -> List[Any]:
+    """Build a prompt cache that uses a full append-only ``KVCache`` for EVERY
+    layer (including the sliding-attention ones, which the model's native
+    ``make_cache`` would give a ``RotatingKVCache``).
+
+    Why: ``RotatingKVCache`` is not trimmable once the ring has wrapped
+    (``is_trimmable`` → ``offset < max_size``), so spec-decode accept/reject
+    rollback cannot keep the accepted K/V via a cheap trim — it must re-forward
+    (the v3 carry penalty). With an all-``KVCache`` layout, ``trim_prompt_cache``
+    is a sound O(1) slice on every layer, so the loop keeps accepted K/V and
+    drops only the rejected tail (CUDA `DynamicCache` parity). Sliding attention
+    remains byte-exact because the per-layer window mask is applied regardless
+    of cache capacity; the only cost is O(T) sliding KV during decode.
+    """
+    from mlx_lm.models.cache import make_prompt_cache, KVCache  # type: ignore
+
+    n = len(make_prompt_cache(mlx_model))
+    return [KVCache() for _ in range(n)]
+
+
+def fused_specdecode_generate_mlx_trim(
+    adapter: "MLXRestoredIncrementalVerifier",
+    drafter: Any,
+    *,
+    aux_prompt: Sequence[Any],
+    embed_fn: Callable[[Any], Any],
+    lm_head_fn: Callable[[Any], Any],
+    gen_tokens: int,
+    block_size: int,
+    eos_ids: Sequence[int] = (),
+    single_fused: bool = False,
+) -> Dict[str, Any]:
+    """CUDA-parity fused spec decode: KEEP accepted K/V, TRIM only the rejected
+    tail (no rollback, no carry re-forward). Requires the adapter to be
+    prefilled with ``full_kv=True`` (all-``KVCache`` layout) so the native
+    ``trim_prompt_cache`` is sound. Levers ①②③ retained (lazy draft+verify
+    single graph, in-graph cumprod acceptance, carried correction).
+
+    Per block: forward ``[bonus + drafts]`` (L tokens) → cache = base+L; accept
+    the leading match count ``k`` (bonus always accepts); ``trim_prompt_cache``
+    drops the L−k rejected tokens; advance ``_past_len`` by ``k``. The accepted
+    tokens' K/V (computed in this forward) stay in the cache — never recomputed.
+    """
+    import mlx.core as mx  # type: ignore
+    from mlx_lm.models.cache import trim_prompt_cache  # type: ignore
+
+    eos = set(int(t) for t in eos_ids)
+    C = adapter._past_len
+    ctx_kv = drafter.make_context_kv(list(aux_prompt), mx.arange(0, C))
+    mx.async_eval([t for kv in ctx_kv for t in kv])
+    timing = {"ctx_kv_build_s": 0.0, "build_s": 0.0, "eval_s": 0.0, "extend_s": 0.0}
+    adapter._capture_aux = True
+
+    generated: List[int] = []
+    accepts: List[int] = []
+    block_evals: List[float] = []
+    ctx_len = C
+    try:
+        while len(generated) < gen_tokens:
+            L = min(block_size, gen_tokens - len(generated))
+            base = adapter._past_len
+            t_build = time.perf_counter()
+            bonus_id = mx.argmax(adapter.next_token_logits)        # lazy scalar
+            n_draft = max(L - 1, 0)
+            if n_draft:
+                drafts = drafter.draft_block_ids(
+                    ctx_kv, bonus_id, embed_fn, lm_head_fn,
+                    n_masks=n_draft, context_len=base)
+                check_ids = mx.concatenate([bonus_id[None], drafts])   # [L]
+                if not single_fused:
+                    mx.eval(check_ids)   # two-phase (drafter graph before 26B)
+                # single_fused=True → leave check_ids LAZY so the drafter and
+                # 26B verify fuse into ONE graph (the path b876 found Metal-
+                # pathological); this probe times it to classify the instability.
+            else:
+                check_ids = bonus_id[None]
+            block_logits = adapter.forward_block_lazy(check_ids[None])  # [L, V]
+            # in-graph greedy acceptance over the check region
+            pred_rows = mx.concatenate(
+                [adapter.next_token_logits[None], block_logits[:max(L - 1, 0)]],
+                axis=0)
+            matches = (mx.argmax(pred_rows, axis=-1) == check_ids)
+            accepted_mx = mx.sum(mx.cumprod(matches.astype(mx.int32)))
+            rows = mx.concatenate(
+                [adapter.next_token_logits[None], block_logits], axis=0)  # [L+1,V]
+            next_row = mx.take(rows, accepted_mx[None], axis=0)[0]        # [V]
+            timing["build_s"] += time.perf_counter() - t_build
+            t_eval = time.perf_counter()
+            mx.eval(accepted_mx, check_ids)
+            blk_eval = time.perf_counter() - t_eval
+            timing["eval_s"] += blk_eval
+            block_evals.append(round(blk_eval, 4))
+            accepted = int(accepted_mx.item())
+            check = [int(x) for x in check_ids.tolist()]
+            commit = check[:accepted]
+            generated += commit
+            accepts.append(accepted)
+            adapter.next_token_logits = next_row
+            aux_rows = adapter._last_aux_mx
+            # KEEP accepted (positions base..base+accepted-1), TRIM rejected.
+            drop = L - accepted
+            if drop > 0:
+                trim_prompt_cache(adapter._cache, drop)
+            adapter._past_len = base + accepted
+            S_new = adapter._past_len
+            lo, hi = ctx_len - base, S_new - base
+            if hi > lo and aux_rows is not None:
+                t_extend = time.perf_counter()
+                new_aux = [a[lo:hi][None] for a in aux_rows]
+                ctx_kv = drafter.extend_context_kv(
+                    ctx_kv,
+                    drafter.make_context_kv(new_aux, mx.arange(ctx_len, S_new)))
+                mx.async_eval([t for kv in ctx_kv for t in kv])
+                ctx_len = S_new
+                timing["extend_s"] += time.perf_counter() - t_extend
+            if any(t in eos for t in commit):
+                break
+    finally:
+        adapter._capture_aux = False
+    generated = generated[:gen_tokens]
+    return {
+        "tokens": generated,
+        "blocks": len(accepts),
+        "mean_accept_len": (round(sum(accepts) / len(accepts), 3)
+                            if accepts else 0.0),
+        "decode_tokens": len(generated),
+        "loop": ("mlx_trim_single_fused_probe" if single_fused
+                 else "mlx_trim_keep_accepted_cuda_parity"),
+        "single_fused": bool(single_fused),
+        "block_eval_s_first8": block_evals[:8],
+        "block_eval_s_max": (round(max(block_evals), 4) if block_evals else None),
+        "block_eval_s_mean": (round(sum(block_evals) / len(block_evals), 4)
+                              if block_evals else None),
+        "time_breakdown_s": {k: round(v, 3) for k, v in timing.items()},
+    }
+
+
 def fused_specdecode_generate_mlx(
     adapter: "MLXRestoredIncrementalVerifier",
     drafter: Any,
diff --git a/inference_engine/bridge/manifest.py b/inference_engine/bridge/manifest.py
index 773965d0..d1224895 100644
--- a/inference_engine/bridge/manifest.py
+++ b/inference_engine/bridge/manifest.py
@@ -235,6 +235,129 @@ def _harness_preset(
             timeout_minutes=45,
             params={"path": ("path:tests", None)},
         ),
+        Preset(
+            name="k3-fused-singlefused-probe",
+            description="PROBE: single-fused (one drafter+26B graph) vs two-phase, "
+                        "to classify the Metal instability. Small (n=2, gen=16) so a "
+                        "pathological per-block eval is bounded. Compare block_eval_s "
+                        "vs k3-fused-allmlx-code-trim (two-phase).",
+            command_templates=(
+                (
+                    "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
+                    "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}",
+                    "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}",
+                    "--f-theta-dir", "${ENV:KAKEYA_MAC_FTHETA_DIR}",
+                    "--s5-exact-full-attn", "--fused-specdecode",
+                    "--all-mlx-drafter", "--code-prompts", "--cuda-trim",
+                    "--single-fused",
+                    "--n-samples", "{n_samples}",
+                    "--max-new-tokens", "{max_new_tokens}",
+                    "--block-size", "{block_size}",
+                    "--prefill-chunk-size", "512",
+                    "--output",
+                    "results/research/k3_mac_bridge_k3_fused_singlefused_probe.json",
+                ),
+            ),
+            timeout_minutes=60,
+            params={
+                "n_samples": ("int:n_samples", "2"),
+                "max_new_tokens": ("int:max_new_tokens", "16"),
+                "block_size": ("int:block_size", "4"),
+            },
+            validate_reports=False,
+        ),
+        Preset(
+            name="k3-fused-allmlx-code-trim",
+            description="CUDA-parity rollback test: all-MLX fused + --cuda-trim "
+                        "(all-KVCache + native trim, keep accepted / drop rejected, "
+                        "no re-forward) on the code-completion workload. Compare "
+                        "decode-only tok/s vs k3-fused-allmlx-code (v3 carry).",
+            command_templates=(
+                (
+                    "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
+                    "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}",
+                    "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}",
+                    "--f-theta-dir", "${ENV:KAKEYA_MAC_FTHETA_DIR}",
+                    "--s5-exact-full-attn", "--fused-specdecode",
+                    "--all-mlx-drafter", "--code-prompts", "--cuda-trim",
+                    "--n-samples", "{n_samples}",
+                    "--max-new-tokens", "{max_new_tokens}",
+                    "--block-size", "{block_size}",
+                    "--prefill-chunk-size", "512",
+                    "--output",
+                    "results/research/k3_mac_bridge_k3_fused_allmlx_code_trim.json",
+                ),
+            ),
+            timeout_minutes=120,
+            params={
+                "n_samples": ("int:n_samples", "8"),
+                "max_new_tokens": ("int:max_new_tokens", "128"),
+                "block_size": ("int:block_size", "4"),
+            },
+            validate_reports=False,
+        ),
+        Preset(
+            name="k3-fused-allmlx-code",
+            description="HONEST spec-decode throughput probe: all-MLX fused on a "
+                        "code-completion workload (naturally-long, predictable gen "
+                        "= the spec-decode sweet spot), natural stop. Reports "
+                        "decode-only tok/s (fused vs oracle AR) + acceptance.",
+            command_templates=(
+                (
+                    "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
+                    "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}",
+                    "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}",
+                    "--f-theta-dir", "${ENV:KAKEYA_MAC_FTHETA_DIR}",
+                    "--s5-exact-full-attn", "--fused-specdecode",
+                    "--all-mlx-drafter", "--code-prompts",
+                    # natural stop (no --ignore-turn-stop); code finishes itself
+                    "--n-samples", "{n_samples}",
+                    "--max-new-tokens", "{max_new_tokens}",
+                    "--block-size", "{block_size}",
+                    "--prefill-chunk-size", "512",
+                    "--output",
+                    "results/research/k3_mac_bridge_k3_fused_allmlx_code.json",
+                ),
+            ),
+            timeout_minutes=120,
+            params={
+                "n_samples": ("int:n_samples", "8"),
+                "max_new_tokens": ("int:max_new_tokens", "128"),
+                "block_size": ("int:block_size", "4"),
+            },
+            validate_reports=False,
+        ),
+        Preset(
+            name="k3-fused-allmlx-natural",
+            description="Acceptance probe: all-MLX fused, NATURAL stop (no "
+                        "--ignore-turn-stop) so generation ends at the real "
+                        "answer. Compare mean_accept_len vs the forced "
+                        "k3-step2-fused-allmlx (which over-generates).",
+            command_templates=(
+                (
+                    "python3", "scripts/research/k3_integrated_niah_eval_mac.py",
+                    "--verifier-path", "${ENV:KAKEYA_MAC_VERIFIER_PATH}",
+                    "--drafter-id", "${ENV:KAKEYA_MAC_DRAFTER_ID}",
+                    "--f-theta-dir", "${ENV:KAKEYA_MAC_FTHETA_DIR}",
+                    "--s5-exact-full-attn", "--fused-specdecode",
+                    "--all-mlx-drafter",
+                    # deliberately NO --ignore-turn-stop (natural stop)
+                    "--n-samples", "{n_samples}",
+                    "--max-new-tokens", "{max_new_tokens}",
+                    "--block-size", "{block_size}",
+                    "--prefill-chunk-size", "512",
+                    "--output",
+                    "results/research/k3_mac_bridge_k3_fused_allmlx_natural.json",
+                ),
+            ),
+            timeout_minutes=120,
+            params={
+                "n_samples": ("int:n_samples", "5"),
+                "max_new_tokens": ("int:max_new_tokens", "48"),
+                "block_size": ("int:block_size", "4"),
+            },
+            validate_reports=False,
+        ),
     )
 }
 
diff --git a/scripts/research/k3_integrated_niah_eval_mac.py b/scripts/research/k3_integrated_niah_eval_mac.py
index 85554696..8025206e 100644
--- a/scripts/research/k3_integrated_niah_eval_mac.py
+++ b/scripts/research/k3_integrated_niah_eval_mac.py
@@ -112,6 +112,21 @@ def parse_args() -> argparse.Namespace:
                          "crossings per block. Requires --s5-exact-full-attn "
                          "(the all-MLX path uses native-S5 injection; the "
                          "f_theta sliding restoration path stays torch).")
+    ap.add_argument("--single-fused", action="store_true",
+                    help="PROBE: with --cuda-trim, fuse drafter+verifier into ONE "
+                         "graph (skip the two-phase eval) to classify the Metal "
+                         "instability (fundamental command-buffer vs fixable SDPA "
+                         "fallback). Reports per-block eval times.")
+    ap.add_argument("--cuda-trim", action="store_true",
+                    help="All-MLX fused with the CUDA-parity rollback: all-KVCache "
+                         "verifier layout + native trim_prompt_cache (keep accepted "
+                         "K/V, drop only rejected) instead of the v3 carry "
+                         "re-forward. Requires --all-mlx-drafter --fused-specdecode.")
+    ap.add_argument("--code-prompts", action="store_true",
+                    help="Replace the NIAH dataset with code-completion prompts "
+                         "(naturally-long, predictable generation = the spec-decode "
+                         "sweet spot). Recall metric is N/A; measures honest "
+                         "decode-only throughput + acceptance on a real workload.")
     ap.add_argument("--ignore-turn-stop", action="store_true",
                     help="Do not include Gemma4 <turn|> as a stop token. "
                          "Useful for throughput evidence runs that require "
@@ -179,7 +194,7 @@ def main() -> int:
     from inference_engine.backends.mlx.fused_specdecode import (
         MLXRestoredIncrementalVerifier, capture_aux_hidden,
         make_bridge_embed_lm_head, fused_specdecode_generate,
-        fused_specdecode_generate_mlx,
+        fused_specdecode_generate_mlx, fused_specdecode_generate_mlx_trim,
     )
     from inference_engine.v04.kv_compressor import make_default_compressor
     from inference_engine.bench.k3_report_gate import (
@@ -391,12 +406,41 @@ def restored_forward(ids: List[int], rk, rv, t_src, *, return_all: bool):
         )
 
     # ---------- Dataset ----------
-    samples: List[NIAHSample] = make_niah_dataset(
-        n_samples=args.n_samples,
-        haystack_min_lines=args.haystack_min_lines,
-        haystack_max_lines=args.haystack_max_lines,
-        seed=args.seed,
-    )
+    if args.code_prompts:
+        _CODE = [
+            "Write a complete Python implementation of a binary search tree class "
+            "with insert, search, and in-order traversal methods. Include type "
+            "hints and docstrings.",
+            "Implement a Python LRU cache class with get and put methods using an "
+            "OrderedDict. Include type hints and docstrings.",
+            "Write a Python function that parses a CSV string into a list of dicts, "
+            "correctly handling quoted fields and embedded commas. Add error handling.",
+            "Implement quicksort in Python with an in-place partition helper. "
+            "Include docstrings and a small example in a __main__ block.",
+            "Write a Python class for a fixed-capacity ring buffer with push, pop, "
+            "and is_full methods, raising on overflow. Include type hints.",
+            "Implement a recursive descent parser in Python for arithmetic "
+            "expressions with + - * / and parentheses. Return the evaluated value.",
+            "Write a Python decorator `retry` that retries a function up to n times "
+            "with exponential backoff on exception. Include type hints and docstring.",
+            "Implement a thread-safe counter class in Python using threading.Lock, "
+            "with increment, decrement, and value methods.",
+        ]
+        n = min(args.n_samples, len(_CODE))
+        samples: List[NIAHSample] = [
+            NIAHSample(prompt_text=p, answer_text="", needle_line_index=0,
+                       needle_text="")
+            for p in _CODE[:n]
+        ]
+        print(f"[mac] CODE-PROMPTS workload: {n} prompts (recall N/A; "
+              f"measuring decode throughput + acceptance)", file=sys.stderr)
+    else:
+        samples = make_niah_dataset(
+            n_samples=args.n_samples,
+            haystack_min_lines=args.haystack_min_lines,
+            haystack_max_lines=args.haystack_max_lines,
+            seed=args.seed,
+        )
 
     def encode(prompt_text: str) -> List[int]:
         if args.direct_answer_prompt:
@@ -681,12 +725,21 @@ def eval_fused_specdecode() -> Tuple[List[str], List[float], List[int]]:
                     restored_k_per_layer=_pad(rk, tsrc, T),
                     restored_v_per_layer=_pad(rv, tsrc, T),
                     evicted_positions=evicted,
-                    prefill_chunk_size=args.prefill_chunk_size)
+                    prefill_chunk_size=args.prefill_chunk_size,
+                    full_kv=args.cuda_trim)
             prefill_s = time.perf_counter() - prefill_t0
             t0 = time.perf_counter()
             if args.force_fused_specdecode:
-                if mlx_drafter is not None:
-                    # Single-sync all-MLX loop (levers ①②③).
+                if mlx_drafter is not None and args.cuda_trim:
+                    # CUDA-parity: keep accepted K/V, trim only rejected.
+                    res = fused_specdecode_generate_mlx_trim(
+                        adapter, active_drafter, aux_prompt=aux_prompt,
+                        embed_fn=embed_fn, lm_head_fn=lm_head_fn,
+                        gen_tokens=args.max_new_tokens,
+                        block_size=args.block_size, eos_ids=end_ids,
+                        single_fused=args.single_fused)
+                elif mlx_drafter is not None:
+                    # Single-sync all-MLX loop (levers ①②③) + v3 carry rollback.
                     res = fused_specdecode_generate_mlx(
                         adapter, active_drafter, aux_prompt=aux_prompt,
                         embed_fn=embed_fn, lm_head_fn=lm_head_fn,
@@ -1036,12 +1089,17 @@ def _mx_peak_mb() -> Optional[float]:
     print(f"\n[mac] DONE. {sut_label}={cross_res.recall:.3f} "
           f"oracle={oracle_res.recall if oracle_res else 'skipped'} "
           f"-> {out_path}", file=sys.stderr)
-    if violations:
+    if violations and args.code_prompts:
+        print("[mac] code-prompts throughput probe: recall is N/A by design; "
+              "evidence gate informational only (not aborting):\n"
+              + summarize_violations(violations), file=sys.stderr)
+    elif violations:
         print("[mac] EVIDENCE GATE FAILED — this report is NOT admissible "
               "as evidence:\n" + summarize_violations(violations),
               file=sys.stderr)
         return 2
-    print("[mac] evidence gate: PASS", file=sys.stderr)
+    else:
+        print("[mac] evidence gate: PASS", file=sys.stderr)
     return 0
 
 
diff --git a/tests/backends/mlx/test_fused_specdecode.py b/tests/backends/mlx/test_fused_specdecode.py
index a3764b8b..de92aa7a 100644
--- a/tests/backends/mlx/test_fused_specdecode.py
+++ b/tests/backends/mlx/test_fused_specdecode.py
@@ -310,6 +310,23 @@ def test_adapter_prefill_rejects_empty_prompt(monkeypatch):
                         evicted_positions=[])
 
 
+def test_make_full_kv_prompt_cache_all_kvcache(monkeypatch):
+    # Fake mlx_lm.models.cache with make_prompt_cache (count) + a KVCache class.
+    import types as _t
+    class _FakeKV:
+        instances = 0
+        def __init__(self): type(self).instances += 1
+    cache_mod = _t.ModuleType("mlx_lm.models.cache")
+    cache_mod.make_prompt_cache = lambda model, **k: ["a", "b", "c", "d"]  # 4 layers
+    cache_mod.KVCache = _FakeKV
+    monkeypatch.setitem(sys.modules, "mlx_lm", _t.ModuleType("mlx_lm"))
+    monkeypatch.setitem(sys.modules, "mlx_lm.models", _t.ModuleType("mlx_lm.models"))
+    monkeypatch.setitem(sys.modules, "mlx_lm.models.cache", cache_mod)
+    out = fsd.make_full_kv_prompt_cache(object())
+    assert len(out) == 4 and all(isinstance(c, _FakeKV) for c in out)
+    assert _FakeKV.instances == 4   # every layer is a fresh full KVCache
+
+
 def test_patched_decoder_layers_empty_is_noop(monkeypatch):
     _install_mlx(monkeypatch)
     tm = _TextModel(0)