FluffyAIcode · FluffyAIcode · Jun 13, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -95,16 +95,17 @@ jobs:
             tests/inference_engine/session/ \
             tests/inference_engine/bench/ \
             tests/inference_engine/setup/ \
+            tests/inference_engine/bridge/ \
             tests/sdk/python/ \
             tests/training/repr_align/ \
             tests/backends/mlx/test_env.py \
             --junitxml=junit.xml \
             -v
           coverage report \
-            --include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/pipeline/*,inference_engine/session/store.py,inference_engine/setup/*,sdks/python/kakeya/__init__.py,sdks/python/kakeya/errors.py,training/repr_align/*' \
+            --include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/bridge/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/pipeline/*,inference_engine/session/store.py,inference_engine/setup/*,sdks/python/kakeya/__init__.py,sdks/python/kakeya/errors.py,training/repr_align/*' \
             --fail-under=100
           coverage xml -o coverage.xml \
-            --include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/pipeline/*,inference_engine/session/store.py,inference_engine/setup/*,sdks/python/kakeya/__init__.py,sdks/python/kakeya/errors.py,training/repr_align/*'
+            --include='inference_engine/server/auth.py,inference_engine/server/config.py,inference_engine/server/errors.py,inference_engine/server/grpc_app.py,inference_engine/server/metrics.py,inference_engine/server/schemas.py,inference_engine/server/proto_gen/**/*.py,inference_engine/memory/*,inference_engine/bridge/*,inference_engine/scheduler/config.py,inference_engine/scheduler/session.py,inference_engine/pipeline/*,inference_engine/session/store.py,inference_engine/setup/*,sdks/python/kakeya/__init__.py,sdks/python/kakeya/errors.py,training/repr_align/*'
 
       - name: Upload coverage artifact
         if: always()
@@ -166,6 +167,8 @@ jobs:
             import kakeya.client; \
             import kakeya.session; \
             import kakeya.errors; \
+            import inference_engine.bridge; \
+            import inference_engine.bridge.manifest; \
             import inference_engine.proposer; \
             import inference_engine.proposer.sparse_logits; \
             import inference_engine.backends.mlx.env; \

diff --git a/.github/workflows/mac-bridge.yaml b/.github/workflows/mac-bridge.yaml
@@ -0,0 +1,142 @@
+name: Mac bridge
+
+# Git-bus executor for cloud-agent access to the self-hosted Apple
+# Silicon node (docs/design/mac-bridge-cloud-agent-access.md §2.1).
+#
+# Protocol: an agent pushes a branch `mac-bridge/<preset>-<nonce>`
+# containing the workload tree + a manifest at .mac-bridge/request.json
+# (created by scripts/mac_bridge/request_run.py). This workflow runs the
+# manifest's ALLOWLISTED preset on the kakeya-mac-m4 runner and pushes
+# logs + result JSONs back to the same branch, where the agent fetches
+# them with plain git (and read-only `gh run list`).
+#
+# Security (design doc §3):
+#   * Command surface = the preset allowlist in
+#     inference_engine/bridge/manifest.py — typed, bounded params; no
+#     manifest string ever reaches a shell. Validation is unit-tested
+#     at 100% coverage on the Linux gate.
+#   * Trigger surface = push permission on mac-bridge/** — the same
+#     population that can already execute code on this runner via the
+#     `needs-mac-m4` PR label (integration.yaml).
+#   * The single Mac is serialized via the concurrency group; every
+#     preset carries its own timeout inside the executor and the job
+#     has a hard cap below.
+#   * K3 acceptance reports produced by a run are validated by the
+#     PR #109 evidence gate ON the runner; a non-conforming report
+#     fails the bridge run itself.
+
+on:
+  push:
+    branches:
+      # Canonical request namespace.
+      - "mac-bridge/**"
+      # Cursor cloud agents are typically constrained to an
+      # AgentMemory/<name>[-suffix] branch template; this pattern lets
+      # them participate without violating their naming policy
+      # (request_run.py --branch-prefix/--branch-suffix).
+      - "AgentMemory/mac-bridge-*"
+
+concurrency:
+  # One Mac: queue bridge runs globally, never cancel a running one
+  # (results are expensive; the requester can cancel from the UI).
+  group: mac-bridge
+  cancel-in-progress: false
+
+permissions:
+  contents: write # commit logs/results back to the request branch
+
+jobs:
+  bridge:
+    name: run allowlisted preset on kakeya-mac-m4
+    runs-on: [self-hosted, macOS, ARM64, kakeya-mac-m4]
+    timeout-minutes: 150
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # Push results back to the request branch.
+          persist-credentials: true
+          # k3-* presets load LFS-tracked checkpoints from the repo
+          # (e.g. results/research/f_theta_v5_s5_sliding/
+          # f_theta_weights.pt). Without lfs:true the workspace holds
+          # pointer files and torch.load fails with the cryptic
+          # "Unsupported operand 118" (ASCII 'v' = the first byte of
+          # an LFS pointer).
+          lfs: true
+
+      - name: Show request
+        run: |
+          echo "=== .mac-bridge/request.json ==="
+          cat .mac-bridge/request.json
+
+      - name: Materialize LFS objects (deterministic)
+        # checkout@v4's lfs:true proved insufficient on a reused
+        # self-hosted workspace: a previous non-LFS checkout left
+        # pointer-content files in the worktree, the blob is unchanged
+        # on the new branch, so git skips re-smudging and the stale
+        # pointer survives (observed live: torch.load failing with
+        # "Unsupported operand 118" = ASCII 'v' of an LFS pointer).
+        # `git lfs pull` force-materializes; the guard fails fast if
+        # any tracked LFS file is still a pointer.
+        run: |
+          git lfs install --local
+          git lfs pull
+          # List the tracked LFS paths in a plain assignment so that a
+          # failing `git lfs ls-files` aborts the step. Feeding the loop
+          # from `< <(...)` would discard that exit status and let the
+          # guard pass with nothing checked.
+          lfs_files="$(git lfs ls-files -n)"
+          bad=""
+          while IFS= read -r f; do
+            # A pointer file is small text whose first bytes contain the
+            # "git-lfs" spec URL; real payloads do not.
+            if [ -f "$f" ] && head -c 40 "$f" | grep -q "git-lfs"; then
+              bad="$bad $f"
+            fi
+          done <<< "$lfs_files"
+          if [ -n "$bad" ]; then
+            echo "::error::LFS pointers not materialized:$bad"
+            exit 1
+          fi
+          echo "all LFS objects materialized"
+
+      - name: Run preset (allowlist-validated executor)
+        env:
+          PYTHONPATH: .:sdks/python
+          # Model locations are machine-local and are supplied by the
+          # runner environment, never by the manifest
+          # (docs/ops/mac-m4-runner-setup.md). Resolution order:
+          #   1. repo Actions variable,
+          #   2. ~/kakeya-models/<name> (the documented stable symlink
+          #      location on the runner),
+          #   3. repo-relative fallback.
+          # The *_VAR values are resolved in the script below because
+          # $HOME needs shell expansion, which plain env defaults
+          # cannot provide.
+          KAKEYA_MAC_VERIFIER_PATH_VAR: ${{ vars.KAKEYA_MAC_VERIFIER_PATH || '' }}
+          KAKEYA_MAC_DRAFTER_ID_VAR: ${{ vars.KAKEYA_MAC_DRAFTER_ID || '' }}
+          KAKEYA_MAC_FTHETA_DIR_VAR: ${{ vars.KAKEYA_MAC_FTHETA_DIR || '' }}
+          HF_HUB_OFFLINE: "1"
+        run: |
+          # Prefer the runner's home-directory model; otherwise fall back
+          # to the repo-relative path.
+          verifier_fallback="$HOME/kakeya-models/gemma-4-26B-A4B-it-mlx-4bit"
+          [ -d "$verifier_fallback" ] || verifier_fallback="models/gemma-4-26B-A4B-it-mlx-4bit"
+          # An empty Actions variable means "unset": use the fallback.
+          KAKEYA_MAC_VERIFIER_PATH="${KAKEYA_MAC_VERIFIER_PATH_VAR:-$verifier_fallback}"
+          KAKEYA_MAC_DRAFTER_ID="${KAKEYA_MAC_DRAFTER_ID_VAR:-z-lab/gemma-4-26B-A4B-it-DFlash}"
+          KAKEYA_MAC_FTHETA_DIR="${KAKEYA_MAC_FTHETA_DIR_VAR:-results/research/f_theta_v5_s5_sliding}"
+          export KAKEYA_MAC_VERIFIER_PATH KAKEYA_MAC_DRAFTER_ID KAKEYA_MAC_FTHETA_DIR
+          echo "verifier=$KAKEYA_MAC_VERIFIER_PATH"
+          python3 scripts/mac_bridge/run_preset.py --manifest .mac-bridge/request.json
+
+      - name: Commit results back to the request branch
+        # Runs even when the preset failed so partial logs still reach
+        # the requester.
+        if: always()
+        run: |
+          git config user.name "kakeya-mac-bridge"
+          git config user.email "mac-bridge@users.noreply.github.com"
+          # Stage each output path on its own. A single `git add` with
+          # several pathspecs aborts without staging anything if any one
+          # of them matches no files (e.g. the preset died before
+          # writing logs), which would drop the results that do exist.
+          # Staging stays best-effort, but a failure is surfaced as a
+          # warning instead of being hidden.
+          for path in .mac-bridge/logs results/research; do
+            git add -A -- "$path" || echo "::warning::could not stage $path"
+          done
+          if git diff --cached --quiet; then
+            echo "no result files to commit"
+          else
+            git commit -m "mac-bridge results: ${GITHUB_REF_NAME}"
+            git push origin "HEAD:${GITHUB_REF_NAME}"
+          fi
+
+      - name: Upload results as artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: mac-bridge-${{ github.run_id }}
+          path: |
+            .mac-bridge/logs/
+            results/research/k3_mac_bridge_*.json
+          if-no-files-found: warn
+          retention-days: 14
diff --git a/.mac-bridge/logs/k3-step1-incremental-0.log b/.mac-bridge/logs/k3-step1-incremental-0.log
@@ -0,0 +1,23 @@
+[mac] loading MLX verifier /Users/fluffy314/kakeya-models/gemma-4-26B-A4B-it-mlx-4bit
+[mac] verifier layers=30 full_attn=[5, 11, 17, 23, 29]
+[mac] loading drafter z-lab/gemma-4-26B-A4B-it-DFlash on cpu
+[mac] 5 samples, prompt len min=4406 max=5810
+[mac] running restored cross-model verifier (s5, free_gen_incremental)
+[mac]   incr 0: T=5810 prefill=25.4s decode=2.8s -> 'BETA-1409<turn|><turn|>thought\n<channel|>The use'
+[mac]   incr 1: T=4911 prefill=19.7s decode=2.9s -> 'DELTA-3286<turn|><turn|>\nthought\n<channel|>The u'
+[mac]   incr 2: T=5594 prefill=19.6s decode=2.9s -> 'BETA-7912<turn|><turn|>\nthought\n<channel|>The us'
+[mac]   incr 3: T=4406 prefill=15.8s decode=2.8s -> 'BETA-4582<turn|><turn|>thought\n<channel|>The use'
+[mac]   incr 4: T=5505 prefill=31.7s decode=4.9s -> 'KAPPA-1434<turn|><turn|>\nthought\n<channel|>The u'
+[mac] restored_cross_model recall = 1.000 (5/5)
+[mac] running oracle
+[mac]   oracle 0: T=5810 -> 'BETA-1409<turn|><turn|>thought\n<channel|>The use'
+[mac]   oracle 1: T=4911 -> 'DELTA-3286<turn|><turn|>\nthought\n<channel|>The u'
+[mac]   oracle 2: T=5594 -> 'BETA-7912<turn|><turn|>\nthought\n<channel|>The us'
+[mac]   oracle 3: T=4406 -> 'BETA-4582<turn|><turn|>thought\n<channel|>The use'
+[mac]   oracle 4: T=5505 -> 'KAPPA-1434<turn|><turn|>\nthought\n<channel|>The u'
+[mac] oracle recall = 1.000
+[mac] KV resident @T=5810: S5=132.92 MB (growth 20.0 KB/tok); naive-full=1308.88 MB
+[mac] cross-model throughput (free_gen_incremental): 2.49 tok/s (320 tok / 128.514 s, 25.703 s/sample)
+
+[mac] DONE. restored_cross_model=1.000 oracle=1.0 -> results/research/k3_mac_bridge_k3_step1_incremental.json
+[mac] evidence gate: PASS
diff --git a/.mac-bridge/logs/summary.json b/.mac-bridge/logs/summary.json
@@ -0,0 +1,41 @@
+{
+  "preset": "k3-step1-incremental",
+  "params": {
+    "n_samples": "5",
+    "max_new_tokens": "64",
+    "block_size": "4"
+  },
+  "nonce": "1781268308-dc400e",
+  "commands": [
+    {
+      "argv": [
+        "python3",
+        "scripts/research/k3_integrated_niah_eval_mac.py",
+        "--verifier-path",
+        "/Users/fluffy314/kakeya-models/gemma-4-26B-A4B-it-mlx-4bit",
+        "--drafter-id",
+        "z-lab/gemma-4-26B-A4B-it-DFlash",
+        "--f-theta-dir",
+        "results/research/f_theta_v5_s5_sliding",
+        "--s5-exact-full-attn",
+        "--incremental",
+        "--ignore-turn-stop",
+        "--n-samples",
+        "5",
+        "--max-new-tokens",
+        "64",
+        "--block-size",
+        "4",
+        "--prefill-chunk-size",
+        "512",
+        "--output",
+        "results/research/k3_mac_bridge_k3_step1_incremental.json"
+      ],
+      "exit_code": 0,
+      "seconds": 370.5,
+      "log": ".mac-bridge/logs/k3-step1-incremental-0.log"
+    }
+  ],
+  "evidence_gate_exit_code": 0,
+  "exit_code": 0
+}
diff --git a/README.md b/README.md
@@ -105,6 +105,66 @@ the binding correctness gate. Mac M4 evidence on `main`:
 
 Raw artifacts: [`results/platform-tests/bench_session_4h_1780332893.json`](results/platform-tests/) (4-h evidence) and the v0.3.0 GA tag's smoke run committed at `6399546`.
 
+## Kakeya Inference Engine for Mac — MLX speculative-decode port (K3 beta baseline)
+
+After the **CUDA** beta (PR #107: f_θ + S5 K/V-restoration verifier, **fused DFlash
+spec-decode at 1.27× AR, recall 1.0 on Gemma-4-26B-A4B / H200**), the engine was
+ported to the **Apple-Silicon MLX** backend. The decode throughput climbed from a
+near-total collapse to **≈AR parity** through a sequence of precisely diagnosed
+fixes. This is the baseline record of that journey (all numbers are decode-only
+tok/s vs the native `mlx_lm` AR oracle on the same model, measured on a Mac M4 via
+the [Mac bridge](#evaluation-environment); ×AR is the ratio).
+
+| Stage | ×AR | Binding problem | Fix |
+| --- | --- | --- | --- |
+| Naïve restored decode | **~0.09×** | **O(T²) collapse** — the restored verifier did a *full-sequence* forward **per generated token** (`restored_logits`); the Mac harness called it once per token. | **Gap-A incremental decode**: prefill **once**, capture the restored K/V into the model's **native** cache, then decode with `mlx_lm.generate_step` (chunked prefill + `mx.async_eval` pipelined) — O(L)/token, never re-forward the sequence. |
+| Hybrid fused spec-decode | **~0.2×** | **Cross-runtime bridge** — MLX verifier + PyTorch/MPS drafter shipped **MB/block of aux-hidden** across runtimes on the critical path; plus a benchmark **forced-over-generation** artifact (`--ignore-turn-stop`) that tanked acceptance. | Recognised the bridge as the bottleneck; moved toward an **all-MLX drafter** (single runtime, zero per-block bridge crossings). |
+| All-MLX + sound rollback | **~0.5×** | **Unsound rollback** — `RotatingKVCache` is not trimmable once the sliding ring wraps (`is_trimmable → offset < max_size`), so the loop **rolled the whole block back and re-forwarded** the carried accepted tokens every partial-accept block (~2 verifier forwards/block). | **CUDA-`DynamicCache` parity**: prefill an **all-`KVCache`** layout (sliding too — byte-exact, the window mask applies regardless of cache capacity) so `trim_prompt_cache` is a sound O(1) slice; **keep accepted K/V, trim only the rejected tail**, never re-forward. |
+| Block-4 CUDA-trim | **~0.7×** | **Per-block Python graph construction** (`build_s` ≈ 50 ms/block building the 26B lazy graph). | Removing the re-forward (above) closed most of it; block-4 lands at **0.68× AR**. |
+| Block-8 tuned | **~1.0×** | **Block size vs the drafter's accept-len plateau.** | Tune to **block-8** (matches the all-MLX drafter's ~4.5 accept-len ceiling); long-code completions reach **~1.0–1.05× AR (parity, best samples just over)**. block-16 is *worse* — `verify(16)` cost is wasted because acceptance plateaus. |
+
+**Honest ceiling & what was *ruled out*.** ≈AR parity is the Mac result on the
+spec-decode sweet spot (short-context, naturally-long *code/agent* generation);
+**a meaningful >AR speedup remains CUDA-favoured** (H200 1.27×) because the binding
+constraint is the **26B `verify(L)` compute per block** — *not* rollback (fixed),
+*not* sync count (a one-graph "single-fused" probe ran stably at ~0.16 s/block and
+was ≈ equal — the b876 single-fused "143 s" pathology is **large-cache-specific**,
+not fundamental), *not* drafter acceptance (a clean ~3–4.5/block on natural
+workloads), *not* verifier quantization (4-bit ≥ bf16; the loop is self-consistent),
+*not* context length (NIAH ≥ general), and *not* a missing alignment asset
+(fc_norms fine-tuning *degraded* held-out acceptance — the base z-lab drafter is
+already near its block-4 ceiling). The earlier "low acceptance / 2.13" numbers were
+a **forced-over-generation benchmark artifact**, reproduced on a clean full-KV bf16
+verifier. The one genuine remaining lever is closing the **drafter accept-len gap
+(~4.5 ours → ~7.7 z-lab reference)** — a port-fidelity / alignment residual.
+
+Recall (the architecture's primary deliverable) is **1.0** throughout, with
+bounded resident KV (**S5**: ~133 MB vs ~1309 MB naïve at 5.8 k ctx, ~90 % saving;
+~48 MB after affine-4). See [ADR 0012](docs/adr/0012-proposer-verifier-value-proposition.md)
+(value is realised on the **memory axis** all-platform + **throughput** on CUDA)
+and [ADR 0013](docs/adr/0013-distributed-inference-topology.md) (what AR
+sequentiality allows for distribution).
+
+### Evaluation environment
+
+The Mac port was developed and benchmarked **remotely from a Linux cloud agent**,
+since MLX runs only on Apple Silicon:
+
+- **Mac bridge** (`scripts/mac_bridge/`): a **git-bus** request/response plane — the
+  agent pushes an allowlisted-preset request branch, a **self-hosted GitHub Actions
+  runner (`kakeya-mac-m4`)** executes it on the Mac and pushes results back. No SSH/
+  VPN — only git push. Presets + param bounds are enforced by
+  `inference_engine/bridge/manifest.py`; this is itself an instance of the
+  multi-host capability plane ([ADR 0009](docs/adr/0009-mlx-distributed-spec-decode-and-capability-exchange.md)).
+- **Evidence gate** (`inference_engine/bench/k3_report_gate.py`): every Mac report is
+  machine-validated — rejects fused runs that didn't execute (`blocks=0`), baseline
+  bypasses claiming recall/speedup, self-comparison speedups, prefill-variance, and
+  decode-token-budget violations — so a number is admissible only if it survives the
+  same rules that caught the earlier artifacts.
+- **GPU side** (vast.ai H200): alignment-training + acceptance-factor experiments
+  (`scripts/research/k3_dflash_alignment_train.py`, `k3_dflash_specdecode_eval.py`)
+  used to rule out the non-levers above.
+
 ## SDKs
 
 ### Python — `sdks/python/kakeya`