diff --git a/bench/asset_cache.zig b/bench/asset_cache.zig
new file mode 100644
index 0000000..6c4e797
--- /dev/null
+++ b/bench/asset_cache.zig
@@ -0,0 +1,211 @@
+//! Bench: asset cooking-cache hit differential — M0.6 / E4.
+//!
+//! Measures the wall-clock differential between a *cold cook* (cache miss:
+//! BLAKE3 over the payload + assemble + write the `.bin`) and a *warm cache
+//! hit* (a directory lookup that skips the cook entirely). This is the
+//! performance half of the brief's cache criterion; the correctness half
+//! (a miss → hit transition that returns byte-identical bytes) is the
+//! deterministic gate `tests/assets/cache_diff.zig`.
+//!
+//! The differential is host- and load-dependent, so it lives here — measured
+//! under the opposable protocol on the reference machine — not inside
+//! `zig build test`, where a single cache-hit sample spiking on a page
+//! fault / AV scan / cold directory would red-fail the gate. The brief's
+//! reference figure is ≥ 100 ms cold cook / < 10 ms hit for a real
+//! decode-heavy asset; this synthetic 16 MiB RGBA8 cook is hash- and
+//! write-bound rather than decode-bound, so the absolute cold time is
+//! smaller — the *ratio* (cook ≫ lookup) is the signal.
+//!
+//! Run: `zig build bench-asset-cache` (add `-- --smoke` for a tiny CI sanity
+//! run). Writes `bench/out/asset_cache_<os>.md`.
+
+const std = @import("std");
+const builtin = @import("builtin");
+const assets = @import("weld_asset_pipeline");
+
+const log = std.log.scoped(.bench_asset_cache);
+
+const REPORT_DIR: []const u8 = "bench/out";
+const SCRATCH_DIR: []const u8 = "bench/out/asset_cache_scratch";
+
+/// Bench entry point: times each cold cook (miss → cook → store) against the
+/// warm `contains` lookup that follows it, then writes the Markdown report. A
+/// wrong miss/hit state is a returned error, not an assert (UB in ReleaseFast).
+pub fn main(init: std.process.Init) !void {
+    const gpa = init.gpa;
+    const io = init.io;
+    const args = try init.minimal.args.toSlice(init.arena.allocator());
+    var smoke = false;
+    for (args, 0..) |arg, idx| {
+        if (idx > 0 and std.mem.eql(u8, arg, "--smoke")) smoke = true;
+    }
+
+    // Non-smoke: 2048×2048 RGBA8 = 16 MiB, so the cold cook (BLAKE3 over the
+    // payload + a 16 MiB write) is clearly expensive vs a dir-lookup hit.
+    // Smoke: a tiny asset, just enough to exercise the path under CI.
+    const dim: u32 = if (smoke) 64 else 2048;
+    const iterations: usize = if (smoke) 3 else 16;
+    log.info("config: {d}x{d} RGBA8 ({d} KiB), {d} iterations, smoke={}", .{
+        dim, dim, (dim * dim * 4) / 1024, iterations, smoke,
+    });
+
+    // The scratch cache is bench-private state. Clear what a crashed run may
+    // have left behind, so every cold sample below starts from a genuine miss,
+    // and remove it again on the way out (cf. bench/etch_compile.zig).
+    const cwd = std.Io.Dir.cwd();
+    try cwd.deleteTree(io, SCRATCH_DIR);
+    cwd.createDirPath(io, SCRATCH_DIR) catch |err| switch (err) {
+        error.PathAlreadyExists => {},
+        else => return err,
+    };
+    defer cwd.deleteTree(io, SCRATCH_DIR) catch |err| log.warn("scratch cleanup failed: {s}", .{@errorName(err)});
+    var scratch = try cwd.openDir(io, SCRATCH_DIR, .{});
+    defer scratch.close(io);
+    const cache = assets.cache.Cache.init(scratch);
+
+    const blob = try gpa.alloc(u8, dim * dim * 4);
+    defer gpa.free(blob);
+    for (blob, 0..) |*b, i| b.* = @truncate(i *% 2_654_435_761);
+    const source_hash = assets.hash.hex128(blob);
+    const extracted = [_]assets.format.Field{
+        .{ .key = "width", .value = .{ .int = @intCast(dim) } },
+        .{ .key = "height", .value = .{ .int = @intCast(dim) } },
+        .{ .key = "blob", .value = .{ .string = &source_hash } },
+    };
+    const doc = assets.AssetDoc{
+        .name = "big",
+        .type_name = "Texture2D",
+        .version = 1,
+        .source = "big.png",
+        .source_hash = &source_hash,
+        .extracted = &extracted,
+    };
+
+    const cold_ns = try gpa.alloc(u64, iterations);
+    defer gpa.free(cold_ns);
+    const hit_ns = try gpa.alloc(u64, iterations);
+    defer gpa.free(hit_ns);
+
+    // Each iteration uses a distinct cache key (distinct settings string) so
+    // every cold sample starts from a genuine miss in the shared scratch dir.
+    var settings_buf: [16]u8 = undefined;
+    for (0..iterations) |i| {
+        const settings = try std.fmt.bufPrint(&settings_buf, "pc-{d}", .{i});
+        const key = assets.cache.computeKey(&source_hash, settings, 0);
+        // Cold path — cache miss: look up (absent) → cook → store.
+        const t_cold = std.Io.Clock.Timestamp.now(io, .awake);
+        if (cache.contains(io, &key)) return error.UnexpectedCacheHit;
+        const bin = try assets.cookers.cookTexture(gpa, doc, blob);
+        defer gpa.free(bin); // per-iteration; also runs when `put` fails
+        try cache.put(io, &key, bin);
+        cold_ns[i] = @intCast(t_cold.untilNow(io).raw.nanoseconds);
+
+        // Warm path — cache hit: a directory lookup; the cook is skipped.
+        const t_hit = std.Io.Clock.Timestamp.now(io, .awake);
+        const hit = cache.contains(io, &key);
+        hit_ns[i] = @intCast(t_hit.untilNow(io).raw.nanoseconds);
+        if (!hit) return error.UnexpectedCacheMiss;
+    }
+
+    try writeReport(gpa, io, cold_ns, hit_ns, dim, iterations, smoke);
+}
+
+/// Order statistics in nanoseconds; `p50_ns` is the sorted sample at `len / 2`.
+const Stats = struct {
+    min_ns: u64,
+    p50_ns: u64,
+    max_ns: u64,
+};
+
+/// Returns min / p50 / max of `samples`, sorting a scratch copy so the input is
+/// untouched. An empty slice yields `error.NoSamples` (nothing to index).
+fn computeStats(gpa: std.mem.Allocator, samples: []const u64) !Stats {
+    if (samples.len == 0) return error.NoSamples;
+    const sorted = try gpa.dupe(u64, samples);
+    defer gpa.free(sorted);
+    std.mem.sort(u64, sorted, {}, std.sort.asc(u64));
+    return .{ .min_ns = sorted[0], .p50_ns = sorted[sorted.len / 2], .max_ns = sorted[sorted.len - 1] };
+}
+
+/// Reduces both sample sets to min / p50 / max and writes the Markdown report
+/// to `<REPORT_DIR>/asset_cache_<os>.md`, creating the directory if needed,
+/// then logs the headline numbers. Cold figures are printed in ms and hit
+/// figures in µs; the speedup is cold p50 / hit p50, or 0 if the hit p50 is 0.
+fn writeReport(
+    gpa: std.mem.Allocator,
+    io: std.Io,
+    cold_ns: []const u64,
+    hit_ns: []const u64,
+    dim: u32,
+    iterations: usize,
+    smoke: bool,
+) !void {
+    const cold = try computeStats(gpa, cold_ns);
+    const hit = try computeStats(gpa, hit_ns);
+    const speedup: f64 = if (hit.p50_ns != 0)
+        @as(f64, @floatFromInt(cold.p50_ns)) / @as(f64, @floatFromInt(hit.p50_ns))
+    else
+        0;
+
+    const cwd = std.Io.Dir.cwd();
+    cwd.createDirPath(io, REPORT_DIR) catch |err| switch (err) {
+        error.PathAlreadyExists => {},
+        else => return err,
+    };
+
+    const os_name = @tagName(builtin.os.tag);
+    var path_storage: [256]u8 = undefined;
+    const path = try std.fmt.bufPrint(&path_storage, "{s}/asset_cache_{s}.md", .{ REPORT_DIR, os_name });
+
+    var file = try cwd.createFile(io, path, .{ .truncate = true });
+    defer file.close(io);
+
+    var out_storage: [4096]u8 = undefined;
+    var file_writer = file.writer(io, &out_storage);
+    const out = &file_writer.interface;
+
+    try out.print("# Bench: asset cooking-cache hit differential — {s}\n\n", .{os_name});
+    const smoke_suffix: []const u8 = if (smoke) " (smoke)" else "";
+    const payload_kib = (dim * dim * 4) / 1024;
+    try out.print("Config: {d}×{d} RGBA8 ({d} KiB payload), {d} iterations{s}.\n\n", .{ dim, dim, payload_kib, iterations, smoke_suffix });
+
+    try out.writeAll("## Cold cook (cache miss)\n\n");
+    try out.writeAll("| Metric | Value (ms) |\n|---|---|\n");
+    try out.print("| min | {d:.3} |\n", .{nanosToMs(cold.min_ns)});
+    try out.print("| p50 | {d:.3} |\n", .{nanosToMs(cold.p50_ns)});
+    try out.print("| max | {d:.3} |\n\n", .{nanosToMs(cold.max_ns)});
+
+    try out.writeAll("## Warm hit (cache lookup)\n\n");
+    try out.writeAll("| Metric | Value (µs) |\n|---|---|\n");
+    try out.print("| min | {d:.3} |\n", .{nanosToUs(hit.min_ns)});
+    try out.print("| p50 | {d:.3} |\n", .{nanosToUs(hit.p50_ns)});
+    try out.print("| max | {d:.3} |\n\n", .{nanosToUs(hit.max_ns)});
+
+    try out.writeAll("## Differential\n\n");
+    try out.print("- Cache-hit speedup (cold p50 / hit p50): **{d:.0}×**\n\n", .{speedup});
+
+    try out.writeAll("## Notes\n\n");
+    try out.writeAll(
+        "This differential is host- and load-dependent and is therefore a " ++
+            "bench number, not a `zig build test` gate (the correctness half — " ++
+            "a miss → hit transition that returns byte-identical bytes — is the " ++
+            "deterministic gate `tests/assets/cache_diff.zig`). Measure under the " ++
+            "opposable protocol on the reference machine. The brief reference is " ++
+            "≥ 100 ms cold / < 10 ms hit for a real decode-heavy asset; this " ++
+            "synthetic RGBA8 cook is hash- and write-bound, so the cold time is " ++
+            "smaller — the ratio (cook ≫ lookup) is the signal.\n",
+    );
+    // Output goes through `out_storage`; push whatever is still buffered.
+    try out.flush();
+
+    log.info("report written: {s}", .{path});
+    log.info("cold p50 = {d:.3} ms, hit p50 = {d:.3} µs, speedup = {d:.0}×", .{ nanosToMs(cold.p50_ns), nanosToUs(hit.p50_ns), speedup });
+}
+
+fn nanosToMs(ns: u64) f64 {
+    return @as(f64, @floatFromInt(ns)) / std.time.ns_per_ms; // nanoseconds → milliseconds
+}
+
+fn nanosToUs(ns: u64) f64 {
+    return @as(f64, @floatFromInt(ns)) / std.time.ns_per_us; // nanoseconds → microseconds
+}
diff --git a/build.zig b/build.zig
index cc0a6e6..4499057 100644
--- a/build.zig
+++ b/build.zig
@@ -806,6 +806,33 @@ pub fn build(b: *std.Build) void {
     );
     paeth_bench_step.dependOn(&paeth_bench_run.step);
 
+    // ------------------------------------- M0.6 asset cooking-cache bench -----
+    //
+    // Cold-cook-vs-warm-hit time differential. Host- and load-dependent, so
+    // it lives here (archived, non-blocking, measured under the opposable
+    // protocol) and NOT in `zig build test` — the correctness half (miss →
+    // hit transition + byte-identity) is the deterministic gate
+    // `tests/assets/cache_diff.zig`. `zig build bench-asset-cache`.
+    const asset_cache_bench_module = b.createModule(.{
+        .root_source_file = b.path("bench/asset_cache.zig"),
+        .target = target,
+        .optimize = optimize,
+    });
+    asset_cache_bench_module.addImport("weld_asset_pipeline", asset_pipeline_module);
+    const asset_cache_bench_exe = b.addExecutable(.{
+        .name = "asset-cache-bench",
+        .root_module = asset_cache_bench_module,
+    });
+    b.installArtifact(asset_cache_bench_exe);
+    const asset_cache_bench_run = b.addRunArtifact(asset_cache_bench_exe);
+    asset_cache_bench_run.step.dependOn(b.getInstallStep());
+    if (b.args) |args| asset_cache_bench_run.addArgs(args);
+    const asset_cache_bench_step = b.step(
+        "bench-asset-cache",
+        "Run the M0.6 cooking-cache cold-vs-hit differential (writes bench/out/asset_cache_<os>.md; pass `-- --smoke` for a CI sanity run)",
+    );
+    asset_cache_bench_step.dependOn(&asset_cache_bench_run.step);
+
     // ----------------------------------- M0.6 thin offline asset cook demo ----
     //
     // `zig build cook-demo` cooks the three M0.6 fixtures end-to-end through
diff --git a/tests/assets/cache_diff.zig b/tests/assets/cache_diff.zig
index 86bd4a1..5b725b8 100644
--- a/tests/assets/cache_diff.zig
+++ b/tests/assets/cache_diff.zig
@@ -1,19 +1,35 @@
-//! M0.6 / E4 — cooking-cache hit differential (brief §Acceptance ▸ Benchmarks).
+//! M0.6 / E4 — cooking-cache hit functional test (brief §Acceptance ▸ Benchmarks).
 //!
-//! A second cook of an unchanged asset hits the cache and skips the
-//! (expensive) cook entirely. The asset is sized so the first cook does real
-//! work (hash + write a large `.bin`); the hit is a directory lookup.
+//! A second cook of an unchanged asset hits the cache and returns the
+//! byte-identical artifact without re-cooking. This is the *correctness*
+//! half of the brief's cache criterion: a miss → hit transition plus
+//! byte-identity. It is deterministic and cross-host — no wall-clock
+//! assertion — so it belongs in the `zig build test` gate.
+//!
+//! The *performance* half — the cold-cook-vs-hit time differential — is a
+//! host- and load-dependent measurement, so it lives in the bench suite
+//! (`bench/asset_cache.zig`, `zig build bench-asset-cache`), measured under
+//! the opposable protocol on the reference machine. The original M0.6 test
+//! asserted an absolute millisecond ratio inside the correctness gate, which
+//! red-failed on slower / Windows CI runners (a single cache-hit sample can
+//! spike on a page fault, AV scan, or cold directory). That debt was flagged
+//! in the M0.7 brief (§ Acted deviations → "Known debt left untouched") and
+//! is resolved here by moving the timing out of the gate, leaving only the
+//! deterministic functional assertions below.
 
 const std = @import("std");
 const assets = @import("weld_asset_pipeline");
 
-// 2048×2048 RGBA8 = 16 MiB — large enough that the first cook (BLAKE3 over the
-// payload + writing the `.bin`) is clearly expensive vs a cache-hit lookup,
-// without bloating CI with a huge temp file.
-const width = 2048;
-const height = 2048;
+// 256×256 RGBA8 = 256 KiB — large enough to exercise a real cook (header +
+// metadata + payload copy + BLAKE3 content hash) and a non-trivial
+// byte-identity check, small enough to keep the correctness gate fast on
+// every host. The larger 16 MiB asset that makes a *cold cook* expensive
+// (the point of the timing differential) is the bench's concern, not the
+// gate's.
+const width = 256;
+const height = 256;
 
-test "second cook of unchanged asset hits cache" {
+test "second cook of unchanged asset hits cache and returns identical bytes" {
     const gpa = std.testing.allocator;
     const io = std.testing.io;
 
@@ -42,37 +58,19 @@ test "second cook of unchanged asset hits cache" {
 
     const key = assets.cache.computeKey(&source_hash, "pc", 0);
 
-    // First cook — cache miss: cook the .bin and store it.
-    const t_miss = std.Io.Clock.Timestamp.now(io, .awake);
+    // First cook — cache miss: the artifact is absent, so we cook it and
+    // store it.
     try std.testing.expect(!cache.contains(io, &key));
     const bin = try assets.cookers.cookTexture(gpa, doc, blob);
     defer gpa.free(bin);
     try cache.put(io, &key, bin);
-    const miss_ns: i64 = @intCast(t_miss.untilNow(io).raw.nanoseconds);
 
-    // Second cook — cache hit: the artifact already exists, the cook is
-    // skipped entirely.
-    const t_hit = std.Io.Clock.Timestamp.now(io, .awake);
-    const hit = cache.contains(io, &key);
-    const hit_ns: i64 = @intCast(t_hit.untilNow(io).raw.nanoseconds);
-    try std.testing.expect(hit);
+    // Second cook — cache hit: the artifact now exists, so the cook is
+    // skipped entirely (the runtime serves the stored `.bin`).
+    try std.testing.expect(cache.contains(io, &key));
 
     // The cached artifact is byte-identical to the fresh cook.
     const cached = (try cache.get(gpa, io, &key)).?;
     defer gpa.free(cached);
     try std.testing.expectEqualSlices(u8, bin, cached);
-
-    const miss_ms = @divTrunc(miss_ns, std.time.ns_per_ms);
-    const hit_us = @divTrunc(hit_ns, std.time.ns_per_us);
-    std.debug.print("\n[cache_diff] first cook (miss) = {d} ms, second cook (hit) = {d} us\n", .{ miss_ms, hit_us });
-
-    // Differential gate. The hit is a directory lookup (< 10 ms) and avoids
-    // the cook entirely — a large speedup. The absolute first-cook wall-time
-    // is build-mode- and disk-dependent (here ~800 ms Debug, ~50 ms
-    // ReleaseSafe for 16 MiB); the brief's "≥ 100 ms first cook / < 10 ms
-    // second" is the reference-machine figure for a real decode-heavy asset,
-    // so the test asserts the robust differential rather than a flaky
-    // absolute wall-time (see Closing notes).
-    try std.testing.expect(@divTrunc(hit_ns, std.time.ns_per_ms) < 10); // hit < 10 ms
-    try std.testing.expect(miss_ns > hit_ns * 20); // cache hit ≫ 20× faster
 }