diff --git a/.github/workflows/nightly-fuzz.yml b/.github/workflows/nightly-fuzz.yml new file mode 100644 index 0000000..08858cd --- /dev/null +++ b/.github/workflows/nightly-fuzz.yml @@ -0,0 +1,64 @@ +name: Nightly IPC fuzz + +# 1 h IPC fuzz over the full message catalogue (tests/ipc/fuzz_1h.zig), +# promoted to nightly CI at M0.7 / E4. Runs on Linux + Windows and +# archives the stdout digest as an artifact (G3 gate). Scheduled runs +# only fire from the default branch (GitHub rule), so this activates once +# the M0.7 branch is squash-merged to `main`; `workflow_dispatch` lets it +# be triggered manually from the Actions tab in the meantime. +on: + schedule: + # 04:00 UTC daily — off-peak for the shared runner pool. + - cron: '0 4 * * *' + workflow_dispatch: + inputs: + duration_ms: + description: 'Fuzz duration in ms (default 3600000 = 1 h)' + required: false + default: '3600000' + +concurrency: + group: nightly-fuzz-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + fuzz: + strategy: + fail-fast: false + matrix: + os: [ubuntu-24.04, windows-2025] + runs-on: ${{ matrix.os }} + # 1 h fuzz + ReleaseSafe build (~3 min on the 2-vCPU Windows runner) + + # overhead. 90 min leaves headroom without risking a runaway hang + # masquerading as a pass. + timeout-minutes: 90 + steps: + - uses: actions/checkout@v6 + + # Pinned to 0.16.0 exact — same rationale as ci.yml (the action takes + # `0.16.x` literally and 404s on the mirrors). 
+ - uses: mlugg/setup-zig@v2 + with: + version: 0.16.0 + + - name: zig build (ReleaseSafe) + run: zig build -Doptimize=ReleaseSafe + + - name: Run 1 h IPC fuzz over the full catalogue + shell: bash + run: | + set -euo pipefail + DURATION="${{ github.event.inputs.duration_ms || '3600000' }}" + zig build test-ipc-fuzz-1h -Doptimize=ReleaseSafe -- \ + --duration-ms="${DURATION}" 2>&1 | tee "fuzz-${{ matrix.os }}.txt" + + - name: Upload fuzz digest artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: nightly-fuzz-digest-${{ matrix.os }} + path: fuzz-${{ matrix.os }}.txt + retention-days: 30 diff --git a/.gitignore b/.gitignore index 72b7593..d527966 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,11 @@ zig-pkg/ # Weld caches (shader compile cache, asset cooking cache, etc.) .weld-cache/ +# IPC runtime artifacts — minimal scene snapshots written on SaveProject +# (engine-ipc.md §7.1) and any leftover from crash-recovery tests +weld-snap-*.bin +weld-snapshot-*.bin + # Bench outputs (per-machine, regenerated by `zig build bench-*`) bench/out/*.md diff --git a/briefs/M0.7-ipc-scm-rights-windows-fuzz.md b/briefs/M0.7-ipc-scm-rights-windows-fuzz.md new file mode 100644 index 0000000..3ffdab4 --- /dev/null +++ b/briefs/M0.7-ipc-scm-rights-windows-fuzz.md @@ -0,0 +1,728 @@ + + +# M0.7 — IPC complete (SCM_RIGHTS primary attach + Windows + replay + nightly fuzz) + +> **Status:** CLOSED +> **Phase:** 0.7 +> **Branch:** `phase-0/ipc/scm-rights-and-windows-and-fuzz` +> **Planned tag:** `v0.7.0-M0.7-ipc` +> **Dependencies:** M0.3 (extended platform layer + Win32 thread safety). Historical baseline: spike S6 (`v0.0.7-S6-ipc-round-trip`). +> **Open date:** 2026-06-05 +> **Close date:** 2026-06-06 + +--- + +# FROZEN SECTION + +*Produced by Claude.ai. Not modifiable by Claude Code outside a Claude.ai round-trip (cf. 
§ Acted deviations).* + +## Context + +M0.7 ships no new surface: it completes and hardens the IPC skeleton delivered at spike S6 and absorbs five Phase −1 debts. The central architecture pivot — POSIX cross-process `shm_open(O_RDWR)` replaced by `SCM_RIGHTS` fd-passing as the **primary attach** — was decided at S6 closure (triggered by the macOS BSD shm quirk: cross-process `shm_open(O_RDWR)` returns `EACCES` systematically). This milestone implements that pivot, the full Windows editor path, best-effort replay after `kill -9`, a 1 h IPC fuzz in nightly CI, and the message catalogue extension. It advances criterion **C0.4 (editor↔runtime IPC stable)**. IPC is **Tier 0**: no widening of the Tier 0 surface beyond what the pivot strictly requires. + +The milestone is split into **four steps (E1→E4) with a review checkpoint between each** (M0.5/M0.6 model). Ordering is dependency-driven: E2 (`ProjectSaved`) precedes E4 (replay); E3 (Windows) precedes E4 (the `crash_recovery` and `fuzz_1h` criteria must be green on Windows). + +## Scope + +### E1 — SCM_RIGHTS primary attach (POSIX) + +- Migrate POSIX to `SCM_RIGHTS` fd-passing as the **primary attach**. The editor creates each region via `ShmRegion.create` and **keeps the fd**. At handshake (after `ProtocolHelloAck`), the editor passes the region fds to the runtime via `IpcSocket.sendWithHandles`, in a **`ShmRegionsHandoff`** message (editor→runtime). The runtime `mmap`s each received fd directly via a new **`ShmRegion.fromFd(fd, size)`** API; it **never** calls cross-process `shm_open(O_RDWR)` again. +- `ShmRegion.open(name)` demoted: **intra-process** discovery/use only. Cross-process `O_RDWR` disappears from production POSIX code. +- Bump `WELD_IPC_PROTOCOL_VERSION`: `2` → `3` (new attach semantics + new mandatory messages from E2). Strictly incompatible versions (cf. `engine-ipc.md §5.2`). 
+- **EINTR non-regression** check: `sendmsg`/`recvmsg`/`*WithHandles` (fixed in M0.5) retain the EINTR retry now that they are on the primary attach path. M0.7 does **not** re-implement EINTR (M0.5 owns it); it attests non-regression. +- Validate startup orphan cleanup on **Linux** (regions + sockets with PID in the name, `platform.process.is_alive` check). + +### E2 — Extended message catalogue + +- Implement (codegen + handlers) the messages **already specified** in `engine-ipc.md §3.3` but not shipped at S6: `Play`, `Pause`, `Stop`, `LoadScene`, `HotReloadScript`. +- Add to the catalogue **`SaveProject`** (editor→runtime command, project granularity: all dirty scenes + project settings + modified prefabs) and its ack **`ProjectSaved`** (runtime→editor, same `seq_id`, transactional §3.4). +- Add to the catalogue **`RuntimeError`** (runtime→editor event, non-fatal recoverable error; distinct from `CrashReport`, which stays reserved for the fatal case). +- `SaveScene` (already §3.3) stays declared **with no wired handler** in M0.7 (scene granularity — wiring deferred until the scene serialization pipeline lands, out of Phase 0). + +### E3 — Windows editor path + +- `src/editor/main.zig` no longer returns `error.Unimplemented` on Windows: `CreateProcessW` (spawn runtime) + named pipe (message transport) + reuse of the S2 Win32 window backend. +- Validate startup orphan cleanup on **Windows**. + +### E4 — Best-effort replay + nightly fuzz + +- Editor `CommandLog` (`engine-tools-editor.md §2.7.3`): 1024-entry ring, `last_clean_line` set on receipt of the `ProjectSaved` ack. +- `SaveProject` persists a **minimal binary snapshot** of the active scene as the replay reload point (option 1 acted). No `.scene.etch` writer, no full project-settings serialization (out of Phase 0). 
+- Best-effort replay after crash detection + restart (`engine-tools-editor.md §2.7.4`, `engine-ipc.md §7`): replay the `since(last_clean_line)` non-acked entries, short 500 ms/command timeout, hard stop on nack/timeout. +- `tests/ipc/fuzz_1h.zig` triggered by a nightly GitHub Actions cron on Linux **and** Windows, result archived as an artifact. + +## Out-of-scope + +- Windows `sendWithHandles` via `DuplicateHandle` (Phase 3, coupled with GPU shared framebuffer §4.7). +- GPU shared framebuffer / `VK_KHR_external_memory` (§4.7, Phase 3). +- macOS window backend (Phase 2). NB: the SCM_RIGHTS pivot makes the POSIX paths (Linux + macOS) green, but no macOS window backend is delivered here. +- `.scene.etch` writer and full scene serialization pipeline (belongs to `engine-scene-serialization.md`, out of Phase 0). +- Wiring the `SaveScene` handler (scene granularity). +- shm regions other than `viewport_framebuffer` (debug_overlays, profiler_samples, selection_snapshot, log_stream) beyond what S6 delivered — their full wiring belongs to the consuming modules, not M0.7. +- Islandz editor beyond the IPC stub (Phase 2). +- Idempotence of replayed commands (`engine-ipc.md §7.3`: explicitly not guaranteed). + +## Documents to read first + +1. `engine-ipc.md` — full, prioritizing §2 (transport, incl. §2.3 `ShmRegion`/`IpcSocket` wrapper and §2.4 lifetimes), §3.3 (catalogue), §4 (shm regions incl. §4.8 fd-passing primary attach), §5 (handshake + versioning), §7 (replay). +2. `engine-tools-editor.md` — full §2.7 (crash detection, `CommandLog` §2.7.3, best-effort replay §2.7.4, `RuntimeStatus` §2.7.7). +3. `engine-zig-conventions.md` — §13 (surface coverage / mandatory module rooting — S6 reminder of the `90/92` false positive), §19 (table). +4. `engine-development-workflow.md` — §2 (milestone workflow), §4.3 (Conventional Commits), §4.6 (squash), closing language criterion. +5. `engine-phase-0-criteria.md` — C0.4. 
+ +## Files to create or modify + +(NB Claude Code: the paths below are derived from the S6 specs. Reconcile against the real tree before writing; any path divergence goes in the journal, not into scope creep. The docs diverge between `src/ipc/` (§3.3) and `src/core/ipc/` (M0.5 EINTR debt) — use the repo's effective path.) + +- `src/core/ipc/transport_posix.zig` — modify — `sendmsg`/`recvmsg`/`*WithHandles`, attach from received fd; EINTR non-regression. +- `<platform>/ipc.zig` (`IpcSocket`/`ShmRegion` wrapper, cf. §2.3) — modify — add `ShmRegion.fromFd`, demote `open` to intra-process. +- `src/runtime/main.zig` — modify — attach from received fd (no cross-process `shm_open`); handlers `Play/Pause/Stop/LoadScene/HotReloadScript`, `SaveProject`→`ProjectSaved`, emit `RuntimeError`; minimal binary snapshot persistence. +- `src/editor/main.zig` — modify — fd handoff (`ShmRegionsHandoff`); Windows path (`CreateProcessW` + named pipe, out of `error.Unimplemented`); `CommandLog`, `last_clean_line`, replay sequence. +- `src/ipc/ipc_messages.etch` (catalogue, cf. §3.3) — modify — `ShmRegionsHandoff`, `SaveProject`, `ProjectSaved`, `RuntimeError`; activate `Play/Pause/Stop/LoadScene/HotReloadScript`. +- `tests/ipc/crash_recovery.zig` — modify — detection < 100 ms, replay < 500 ms aggregate, on Linux + Windows. +- `tests/ipc/fuzz_1h.zig` — modify — promoted to a nightly target (already present, run manually at S6). +- `tests/ipc/handoff_fd.zig` — create — `ShmRegionsHandoff` handoff: the editor creates a region, passes the fd, the runtime `mmap`s via `fromFd` and reads/writes; no cross-process `shm_open(O_RDWR)`. +- `.github/workflows/<nightly>.yml` — create or modify — nightly cron `fuzz_1h` on Linux + Windows, upload artifact. + +(Note: `engine-ipc.md` is patched by Claude.ai in the KB — out of the repo — and is NOT a file Claude Code edits.)
+ +## Acceptance criteria + +### Tests + +- `tests/ipc/handoff_fd.zig` — `test "shm attach via received fd"` — region created on side A, fd passed via `sendWithHandles`, `ShmRegion.fromFd` on side B `mmap`s and writes; coherent read on side A. Green Linux + macOS. +- `tests/ipc/crash_recovery.zig` — `test "kill -9 + best-effort replay"` — `kill -9` runtime, detection < 100 ms, restart + re-handshake, replay the `since(last_clean_line)` commands < 500 ms aggregate. Green Linux + Windows. +- `tests/ipc/fuzz_1h.zig` — `test "1h random command fuzz"` — no deadlock, no `magic` desync, no leak (`std.testing.allocator`). Green in nightly Linux + Windows. +- Catalogue tests: round-trip `Play/Pause/Stop`, `LoadScene`, `HotReloadScript`, `SaveProject`→`ProjectSaved` (ack same `seq_id`), `RuntimeError` (unidirectional event received). Green. +- Non-regression: S6 tests (`magic`/version/`schema_hash` fatal paths, G7 fd loopback) green with `WELD_IPC_PROTOCOL_VERSION = 3`. + +### Benchmarks + +- Round-trip latency < 1 ms (non-regression of the S6 RTT bench; Apple Silicon ReleaseSafe baseline p50 0.006 ms). Archived in CI. + +### Observable behavior + +- macOS dev-primary tests G3/G4/G5/G6 green via SCM_RIGHTS (the BSD shm quirk is bypassed — no cross-process `EACCES` left). +- `grep` of production POSIX code: no cross-process `shm_open(...O_RDWR...)` left. +- Editor launch on **Windows**: spawns runtime via `CreateProcessW`, handshake green, viewport displayed (no more `error.Unimplemented`). +- Orphan cleanup: after a `kill -9`, an editor relaunch removes orphan regions/sockets (verified Linux + Windows). +- Nightly `fuzz_1h` artifact present and green on both OSes in the Actions tab. + +### CI + +- `zig build` clean, zero warnings, on the matrix (ubuntu-24.04 + windows-2025 × {Debug, ReleaseSafe}). +- `zig build test` green (Debug + ReleaseSafe). +- `zig fmt --check` green. +- `zig build lint` green. +- `commit-msg` hook green on every commit. 
+- Nightly cron job `fuzz_1h` configured Linux + Windows, artifact uploaded. +- Closing language criterion (cf. `engine-development-workflow.md`): 100% of repo artifacts in English. + +## Conventions + +- **Branch**: `phase-0/ipc/scm-rights-and-windows-and-fuzz` +- **Final tag**: `v0.7.0-M0.7-ipc` +- **PR title**: `Phase 0 / IPC / IPC complete (SCM_RIGHTS primary attach + Windows + replay + nightly fuzz)` +- **Commit convention**: Conventional Commits (cf. `engine-development-workflow.md §4.3`) +- **Merge strategy**: squash-and-merge (cf. `engine-development-workflow.md §4.6`) + +## Notes + +- **E1→E4 split with checkpoints.** At the end of each Ei, signal Guy for review before continuing. Hard dependencies: E2 before E4 (`ProjectSaved` anchors `last_clean_line`); E3 before E4 (`crash_recovery`/`fuzz_1h` criteria must be green on Windows). E1 is independent of the others (can open). +- **The SCM_RIGHTS pivot is acted, not to be reopened.** Decided at S6 closure (cf. `engine-spec.md` S6 results). E1 implements it; it does not re-debate the choice. +- **No new Tier 0 surface beyond the pivot.** `ShmRegion.fromFd` and the `ShmRegionsHandoff` message are the minimal materialization of the pivot (the editor keeps the fd, sends it, the runtime mmaps it). `sendWithHandles`/`recvWithHandles` have existed since S6 (G7-validated) — no addition. +- **EINTR belongs to M0.5.** Do not re-fix it; only verify the retry holds on `sendmsg`/`recvmsg`/`*WithHandles` once promoted to the primary attach path. +- **Minimal binary snapshot (option 1).** `SaveProject` does not write a `.scene.etch`. It persists just enough for replay to reload a reference point. The text writer and full project-settings serialization are out of Phase 0. +- **Mandatory module rooting (S6 reminder).** Inline `test`s of a module not transitively referenced from the test target root are silently skipped (S6 `90/92` false positive). Verify every touched module is rooted (cf. `engine-zig-conventions.md §13`). 
+- **`schema_hash` fatal on mismatch.** The 2→3 bump makes S6 editor / M0.7 runtime strictly incompatible — expected behavior (§5.2, no negotiation). +- **Conduit coherence.** `scene.save` (Conduit §3.2) maps scene granularity; `SaveProject` is the project granularity of replay. Do not conflate the two in handlers. + +--- + +# LIVING SECTION + +*Maintained by Claude Code during the milestone.* + +## Specs read + +- [x] `engine-ipc.md` (§2, §3.3, §4, §5, §7) — read 2026-06-05 05:58 +- [x] `engine-tools-editor.md` (§2.7) — read 2026-06-05 05:58 +- [x] `engine-zig-conventions.md` (§13, §19) — read 2026-06-05 05:58 +- [x] `engine-development-workflow.md` (§2, §4.3, §4.6) — read 2026-06-05 05:58 +- [x] `engine-phase-0-criteria.md` (C0.4) — read 2026-06-05 05:58 + +## Execution journal + +### 2026-06-05 — Step 1 (branch + brief) and Step 2 (spec ingestion) + +- Branch `phase-0/ipc/scm-rights-and-windows-and-fuzz` created from `main` up to + date with `origin/main` (HEAD `b732541`, M0.6). Brief copied verbatim + (byte-identical, 13182 bytes) and committed as first commit. +- Brief verified 100% English before commit (French function-word grep empty), + per `engine-development-workflow.md §3` safety net. +- All five specs read in full at the scope the brief defines. Scoping notes: + - `engine-tools-editor.md` is a 3283-line Phase 2+ document. The brief scopes + it to "full §2.7". Read §1–§2.7 in full (lines 1–961, §2.7 ends at 808); + §3+ (Selection/Search/Clipboard/gizmos/panels) is out of M0.7 scope. + Key anchors absorbed: §2.7.3 CommandLog (ring 1024 + `last_clean_line`), + §2.7.4 best-effort replay (500 ms/command timeout, hard stop on nack/timeout), + §2.7.7 `RuntimeStatus` enum. + - `engine-zig-conventions.md` read fully; §13 (module rooting S6 "90/92" + reminder + surface coverage + external-resource test ≤5 s timeout) and §19 + (rules table) are the load-bearing sections for this milestone. 
+ +### 2026-06-05 — E1 reconnaissance: path reconciliation vs real tree + +Per the brief's "Files to create or modify" caveat, the real tree was +reconciled against the S6-derived paths before any production write. Every +brief-named symbol was confirmed by reading the code directly (anti-hallucination +E1 discipline, CLAUDE.md). + +Path reconciliation (brief path → effective repo path): + +| Brief path | Effective repo path | Note | +|---|---|---| +| `src/core/ipc/transport_posix.zig` | same | EINTR retry present (M0.5): `.INTR => continue` in `send`/`recv`/`sendWithHandles`/`recvWithHandles` (l. 231/244/294/336). Attest non-regression; do not touch. | +| `/ipc.zig` (IpcSocket/ShmRegion wrapper §2.3) | split: `src/core/ipc/shm.zig` (ShmRegion) + `src/core/ipc/transport.zig` (IpcSocket, OsHandle) + backends `shm_{posix,windows}.zig` / `transport_{posix,windows}.zig` | No single `platform/ipc.zig` file exists. `ShmRegion.fromFd`/`fd()` to add in `shm.zig` + `shm_posix.zig`; `open` demoted there. | +| `src/ipc/ipc_messages.etch` (catalogue §3.3) | `src/core/ipc/messages.zig` | **No `.etch` IPC catalogue and no codegen pipeline exist.** The catalogue is hand-written `extern struct` POD. New messages are added as Zig structs following the existing pattern — building an `.etch` codegen pipeline is out of scope (and out of Phase 0 IPC). | +| `tests/ipc/handoff_fd.zig` (create) | new, distinct from existing `tests/ipc/fd_passing.zig` | `fd_passing.zig` is the S6 G7 raw-socket fd loopback (pipe fd via `sendWithHandles`). `handoff_fd.zig` is the higher-level `ShmRegion.fromFd` end-to-end. Must be registered in `build.zig` `ipc_test_paths`. | +| `.github/workflows/.yml` (create) | new; only `.github/workflows/ci.yml` exists today | `fuzz_1h.zig` already builds as exe `ipc-fuzz-1h` (step `test-ipc-fuzz-1h`, manual at S6). Nightly cron promotes it. | +| `src/runtime/main.zig`, `src/editor/main.zig` | same | confirmed present. 
| + +Additional files touched (justified, not scope creep): + +- `src/core/ipc/protocol.zig` — holds `WELD_IPC_PROTOCOL_VERSION` (currently `2`); + E1 bumps to `3` (brief § Scope E1). +- `src/core/ipc/{framing,messages,connection,server,client}.zig` — `MsgType` range, + `msgTypeOf`, handshake handoff helpers for the new catalogue (E1/E2). +- `src/core/root.zig` — re-exports + comptime-pins the 9 `ipc/*.zig` files + (`pub const ipc = struct {…}` + `_ = ipc.X;`). Any NEW `src/core/ipc/*.zig` + with inline tests must be added here (§13 module rooting, S6 "90/92" reminder). +- `build.zig` — register `tests/ipc/handoff_fd.zig` in `ipc_test_paths`; nightly wiring. + +Symbol confirmation: + +- `ShmRegion.create`/`open(name,size)`/`close` exist; `fromFd`/`fd()` do **not** (to add). + Note: real `open` takes `(name, size)`, not §2.3's `open(name)` — minor signature divergence. +- `IpcSocket.sendWithHandles`/`recvWithHandles` exist; POSIX implemented (G7), Windows + returns `error.Unimplemented` (Phase 3 — out of scope, leave untouched). +- Catalogue messages `Play/Pause/Stop/LoadScene/HotReloadScript`, `SaveScene`, + `SaveProject`/`ProjectSaved`/`RuntimeError`, `ShmRegionsHandoff` are absent from + `messages.zig` (`MsgType` 1..13) — all to add in E1/E2. +- Runtime stub `src/runtime/main.zig:94` also returns `error.Unimplemented` on Windows. + A green Windows handshake (E3) + Windows `crash_recovery`/`fuzz_1h` (E4) require the + runtime to connect+handshake on Windows (named-pipe transport + by-name shm attach + both already exist). Flagged as an E3 dependency, not new scope — it is implied by the + E3/E4 acceptance criteria ("handshake green on Windows"). Confirmed at E3 start. + +No blocker. Path divergences are exactly those the brief anticipated. 
+### 2026-06-05 — E1 implemented (SCM_RIGHTS primary attach) + +Delivered (commits `f94dc92`, `e781554`): + +- `ShmRegion.fromFd(fd, size)` + `ShmRegion.fd()` (`shm.zig` + `shm_posix.zig`); + POSIX `Backend.fromFd` mmaps the received fd with no `shm_open`, `name_z` made + optional for the nameless attach. Windows `fromFd` → `error.Unimplemented` + (attach stays by name, §4.8). `ShmViewport.fromFd` / `ShmViewport.fd()` wrap it. +- `ShmRegionsHandoff` + `ShmRegionDesc` messages (`msg_type 14`), `MAX_SHM_REGIONS = 8`. + `IpcConnection.recvFrameWithHandles` is the receive counterpart of the existing + `sendMessageWithHandles` (the minimal surface the pivot needs). +- `WELD_IPC_PROTOCOL_VERSION` 2 → 3. +- Editor sends `ShmRegionsHandoff` (viewport fd) right after `ProtocolHelloAck`; + runtime attaches via `ShmViewport.fromFd` — the by-name cross-process `open` is gone + from production. `open` demoted to intra-process / Windows-by-name only. +- `tests/ipc/handoff_fd.zig` (new, registered in `build.zig`): create → fd → send → + `fromFd` → write → coherent read. Failure-injection confirmed it actually runs + (not a silent §13 skip). +- Startup orphan reaping `src/core/ipc/cleanup.zig` (`reapOrphans`): scans `/tmp` + (`weld-<pid>.sock`, Linux + macOS) and `/dev/shm` (`weld-shm-*-<pid>`, Linux), + removing entries whose PID is dead (`process.is_alive`); live PIDs untouched. + Wired into `src/editor/main.zig` startup. Tested (socket reap POSIX, shm reap Linux, + pure PID-parse units). + +Verifications: + +- **EINTR non-regression (attested, not re-implemented)**: `.INTR => continue` retained + on all four POSIX paths incl. the now-primary `sendWithHandles`/`recvWithHandles` + (`transport_posix.zig:231/244/294/336`). No EINTR code changed.
+- **No cross-process `shm_open(O_RDWR)` in production POSIX**: grep shows the only + `O_RDWR` `shm_open`s are the editor's `create` (creation, not attach) and the demoted + intra-process `open`; no `src/editor`/`src/runtime` call site attaches cross-process by + name. The runtime uses `fromFd`. +- **macOS G3/G4/G5 green via SCM_RIGHTS**: `tests/ipc/crash_recovery.zig` un-gated from + Linux-only to POSIX — the pivot removes the BSD shm `EACCES`, so G4/G5 now run on macOS + dev (verified by failure-injection that all 3 run). `test-ipc` stable 3/3 on macOS. +- **Pre-existing macOS full-suite flakiness (not introduced here)**: under `zig build test` + parallel execution, a few non-IPC exes (plugin_loader/dlopen, events/threads, + etch-cache/fileIO) intermittently report `failed command`, yet pass 5/5 standalone and + the overall build exits 0. Unrelated to IPC; CI authority is Linux + Windows (macOS not + in the matrix). + +E1 deliverables complete. **Checkpoint: awaiting Guy's review before E2.** + +### 2026-06-05 — E1 review addendum (Claude.ai, pre-E2) + +Two review points applied on the same branch (no new checkpoint). + +**(1) Hardened `ShmRegionsHandoff` validation (§8.3)** — commit `ebdfcf7`. +The runtime previously accepted any handoff with `hf.handles >= 1` and +`region_count >= 1`. New `connection.acceptShmHandoff` enforces the §8.3 +contract: `region_count` in `[1, MAX_SHM_REGIONS]` **and** +`fd_count == region_count` exactly, else `error.InvalidHandoff`. On any +rejection every received fd is closed; on success only the viewport +(`regions[0]`) is retained and every further region fd is closed +(`transport.closeHandle`) — no descriptor leak from a malformed or +multi-region handoff. Negative tests added to `handoff_fd.zig` (count +mismatch, over-cap, multi-region success), with the fd-closure +assertions confirmed real by failure-injection. Pre-hardens the E4 +catalogue fuzz. 
+ +**(2) macOS full-suite flakiness confirmed pre-existing.** Replayed +`zig build test` on `main` (commit `b732541`, in an isolated +`git worktree`) 4×: every run exits 0 but emits exactly **3** +`failed command` lines from the same three non-IPC exes — `plugin_loader` +(dlopen), `etch` `cache_diff` (file I/O timing), `events` (drop +saturation). This is **identical** to the branch behaviour (same count, +same exes, same exit 0). Conclusion: the flakiness is a pre-existing +parallel-execution artefact of the macOS dev full suite, **not** +introduced by `reapOrphans()` or the `crash_recovery` macOS un-gate. +Static analysis agrees: `reapOrphans` is invoked during `zig build test` +only by `cleanup.zig`'s own inline tests (the editor binary is never +launched by the suite), and it removes only `weld-<pid>.sock` / +`weld-shm-*-<pid>` with a dead PID — the suite's sockets use the +non-matching `weld-crashtest-*` / `weld-restart-*` / `weld-g5-*` shapes +with live PIDs, so there is no interference with the global `/tmp` scan. +CI authority remains Linux + Windows (macOS is out of the CI matrix). + +### 2026-06-05 — E2 implemented (extended message catalogue) + +Delivered (commit `8d05120`). Nine messages added to `messages.zig` as +hand-written `extern struct` POD (no `.etch` codegen in-repo — see the +reconnaissance entry), `MsgType` range now `1..23`: + +- `Play` / `Pause` / `Stop` (fire-and-forget): the runtime tracks a + `play_state` atomic; the render loop advances the mire (test pattern) only while + playing. Default `playing`, so S6 behaviour and `crash_recovery` are + unchanged (a separate `iter` counter bounds the lifetime via `--frames`). +- `LoadScene` / `HotReloadScript` (fire-and-forget): handlers decode and + act. An empty `LoadScene` path emits a non-fatal `RuntimeError` event + (recoverable non-transactional command failure, §3.3 / §8.3). +- `SaveScene`: declared with **no wired handler** (scene granularity, + falls through to the reader's `else`) — per brief § Out-of-scope.
+- `SaveProject` → `ProjectSaved`: transactional (§3.4), ack carries the + same `seq_id`. (The minimal binary snapshot it will persist is E4.) +- `RuntimeError`: runtime→editor non-fatal event, distinct from `CrashReport`. +- `ErrorSeverity` enum (warning/err). + +Tests — `tests/ipc/catalogue.zig` (new, registered in `build.zig` + +runtime-install dependency like `crash_recovery`): +- pure framing round-trips (encode→decode parity) for every new message; +- end-to-end against the spawned runtime (POSIX, macOS too via the pivot): + `SaveProject`→`ProjectSaved` same `seq_id`, `LoadScene`-empty→`RuntimeError`, + `Play`/`Pause`/`Stop` accepted without desync (Echo after still round-trips). +- `schema_hash` uniqueness extended to all 23 messages — confirms no wire + collision (the identical-layout `Play`/`Pause`/`Stop`/`SaveProject` and + `LoadScene`/`SaveScene` are disambiguated by `typeName` in the RTTI hash). + +Bug found + fixed during E2 testing: returning an `IpcServer` by value +broke its internal `conn.socket` pointer (`&self.client.?`), causing +`BrokenPipe`. The catalogue fixture now drives a caller-owned, stable +`IpcServer` (helper takes `*IpcServer`); `ShmViewport`/`Process` have no +self-references and are returned by value safely. + +Two stale boundary tests updated as the catalogue grew (`MsgType.isKnown` +range checks) — caught by the suite, the intended guard behaviour. + +Validation: `zig build` + `zig build test` + `test-ipc` green in **debug +and ReleaseSafe**; `zig fmt --check` + `zig build lint` green. + +E2 deliverables complete. **Checkpoint: awaiting Guy's review before E3.** + +### 2026-06-05 — E3 implemented (Windows editor + runtime path) + +Delivered (commit `f76d9b7`). Both binaries leave `error.Unimplemented` +on Windows; the round-trip runs over the existing named-pipe transport +(`transport_windows.zig`, S6) + by-name shm attach (`shm_windows.zig`). 
+ +- `platform/process.zig`: `spawn_process` implements `CreateProcessW` + (UTF-16 command line, `STARTUPINFOW` + `PROCESS_INFORMATION`), + replacing the Windows stub. +- `editor/main.zig`: early `error.Unimplemented` removed; OS-correct + endpoints via `transport.buildSocketPath` (`\\.\pipe\weld-<pid>`) + + Windows shm name `Local\weld-shm-viewport-<pid>` (§2.2); the SCM_RIGHTS + handoff is gated to POSIX (Windows attaches by name — no fd to pass). +- `runtime/main.zig`: early `error.Unimplemented` removed; viewport + attach branches POSIX (`fromFd` via handoff) vs Windows (`open` by name). +- **Review point 1 — reader buffer sizing**: `scratch` is now sized over + the FULL incoming editor→runtime catalogue (explicit enumeration), not + `@max(Echo, LoadScene)`. Observed frame sizes (`24 + @sizeOf`): + Heartbeat 32, Shutdown 25, Echo 88, SpawnEntity 28, ModifyComponent 80, + Play/Pause/Stop 25, LoadScene 280, SaveScene 280, HotReloadScript 32, + SaveProject 25 → **max 280** (LoadScene/SaveScene). No undersize today; + the enumeration is regression-proof against a future larger message. +- **Windows orphan cleanup**: `reapOrphans` is a documented no-op on + Windows — named pipes and named file mappings are refcounted kernel + objects that vanish with their last handle, so a crashed editor leaves + no orphan to unlink and a relaunch (fresh PID-keyed names) never + conflicts. The editor calls `reapOrphans` at startup on all OSes (no-op + on Windows). Relaunch cleanliness is validated by E4's Windows + `crash_recovery` (un-gate) + manual hardware. + +Windows validation method: the full `zig build -Dtarget=x86_64-windows` +is blocked by a native-tool toolchain quirk on this macOS box (`etch_cook` +gets x86 features on an arm64 host build — pre-existing, not E3 code), so +each binary is cross-compiled directly: +`zig build-exe --dep weld_core [--dep shaders] -Mroot=src/<bin>/main.zig +… -target x86_64-windows -lc`. Both editor + runtime compile **and link** +for Windows.
Runtime behaviour on Windows is validated by CI (windows-2025 +build) + E4's `crash_recovery` un-gate; the interactive "viewport +displayed" gate is manual (Win11 box, currently down per CLAUDE.md). +POSIX behaviour is unchanged (`test-ipc` green, debug + ReleaseSafe). + +E3 deliverables complete. **Checkpoint: awaiting Guy's review before E4.** + +### 2026-06-05 — E3 review addendum (Claude.ai): Windows cmdline quoting + +Commit `765f3e4`. The E3 `spawn_process` built the Windows command line +with naive `"arg"` wrapping — broken for any argument ending in +backslashes (a `\` glued to the closing `"` is read as an escaped quote, +so the argument swallows the next one) or containing a `"`. Fixed at the +Tier 0 primitive, not at the call site: + +- `platform/process.zig`: new `pub fn quoteArg` implements the standard + MSVCRT `ArgvQuote` / `CommandLineToArgvW` algorithm (verbatim when free + of whitespace/quote; else wrap in `"` and double runs of backslashes + that precede a `"`, literal or closing). UTF-8, ASCII-transparent. + `spawn_process` quotes every arg through it (argv[0] included, per the + brief). +- `tests/ipc/process.zig`: golden cases + a round-trip through a + reference `CommandLineToArgvW` parser over the tricky inputs + (trailing backslashes ± space, internal quote, backslash+quote, tab). + Pure + cross-platform (no Windows needed); failure-injection confirmed + the golden assertions run. Editor still cross-compiles to + `x86_64-windows`. + +### 2026-06-05 — E3 addendum 2 (Guy's Windows PC): 2 portability fixes + +Guy's real Windows build caught two compile errors my cross-compile had +reported green. **Root cause of the false green** (important): a +`zig build-exe -target x86_64-windows` with `-target` placed AFTER the +`-M` module definitions compiles the modules for the **native (macOS)** +target and only emits a Windows-format shell — so every Windows-gated +path went unanalysed. 
Proven by an `OSPROBE`: `builtin.os.tag` was not +`.windows` with `-target` last, but is `.windows` with `-target` first. +A control (broken `Iterator.init`) is now correctly rejected only with +the corrected invocation. **All prior "cross GREEN" claims for E3 were +macOS builds — invalid for Windows.** Going forward the reliable form is +`zig build-exe -target x86_64-windows -lc --dep … -Mroot=… -M…` (target +BEFORE modules); the real `zig build` on Windows stays the authority. + +Fixes (commit `7f9faa3`): +1. `parseArgs` (editor + runtime): `std.process.Args.Iterator.init` is a + `@compileError` on Windows — switched to `Iterator.initAllocator( + init.args, gpa)` (allocator variant; `deinit` frees the Windows + buffer; works on POSIX too). +2. `spawn_process`: `utf8ToUtf16LeAllocZ` returns `error.InvalidUtf8`, + outside the process `Error` set — new `utf8ToUtf16Z` remaps it to + `error.InvalidArgument` (invalid argv input, not an engine fault). + +Re-verified: both binaries compile clean for `x86_64-windows` with the +corrected (target-first) invocation; POSIX `test-ipc` + `fmt` + `lint` +green. Guy re-tests on the Windows PC (native build + interactive run) +before any E4 green-light. + +### 2026-06-05 — E3 addendum 3 (Guy's Windows PC): CreateProcessW runtime fix + +Commit `1c3e31d`. First real Windows run: `CreateProcessW` → +`error.SpawnFailed` (the editor failed to spawn the runtime). Compile +checks could not catch this — it is a **runtime** failure, only the real +Windows run exposes it. + +1. **Diagnostic** (done first, per the review): on `ok == 0` the spawn + now logs `path` + `GetLastError()` before `return error.SpawnFailed`. + I cannot obtain the code myself (no Windows host); expected cause is + `2 = ERROR_FILE_NOT_FOUND` (missing `.exe` / wrong path). The fix + below should make Guy's re-run succeed; if it still fails, the logged + code now pinpoints why. **Code to confirm on Guy's re-run.** +2. 
**Root fix**: the default `runtime_path` was the POSIX-inherited + `zig-out/bin/weld-runtime` — no `.exe`, CWD-relative. `CreateProcessW` + with `lpApplicationName` needs the exact path and neither appends + `.exe` nor searches PATH. The editor now derives it from its own + executable directory (`std.process.executableDirPathAlloc`, the 0.16 + replacement for the removed `std.fs.selfExeDirPath*` — needs `io`) + + `weld-runtime` + `.exe` on Windows. `--runtime=` override kept. The + editor switched to the full `std.process.Init` (Juicy Main for dev + tools, conventions §2) to obtain `init.io`; `init.arena` replaces the + manual arena. +3. **Anti-regression**: native Windows `spawn_process` test + (`cmd.exe /c exit 0` → `wait_nonblock == 0`), gated Windows + (the prior spawn tests were POSIX-gated). + +Verified: editor + runtime + `tests/ipc/process.zig` compile clean for +`x86_64-windows` (corrected target-first cross method, fresh cache, +`zig test --test-no-exec` for the test). POSIX `test-ipc` + `fmt` + +`lint` green. Pending Guy's live Windows re-run (build + interactive +editor → animated mire window + two processes). + +### 2026-06-05 — E4 implemented (best-effort replay + nightly fuzz) + +Commits `8bced76` (command log, prior session), `553f105`, `4590370`, +`fab104f`, `8c34093`, `3d56e67`. + +**Replay building blocks.** +- `src/core/ipc/command_log.zig` (new, `8bced76`) — editor-side `CommandLog` + ring (capacity 1024, ≤ 512 B/frame). `Entry { seq_id, msg_type, status + (pending/acked/nacked), issued_at/acked_at, frame bytes }`. `markCleanLine` + records `last_clean_line`; `replaySince()` iterates the still-pending + entries since it. 6 inline tests, rooted in `root.zig`. +- `src/core/ipc/snapshot.zig` (new, `553f105`) — minimal fixed-size binary + snapshot (`magic "WSNP"` + `version` + `frame_id` marker), the §7.1 reload + point. **Not** a `.scene.etch` writer, no project-settings serialization + (option 1, out of Phase 0). 
`write`/`read` go through `std.Io.Dir` + `io` + (the 0.16 filesystem API — `std.fs.cwd()` is gone). `SaveProject` persists + it then acks; a write failure surfaces via `ProjectSaved.ok = 0` so the + editor does not advance its clean line on a save that did not persist. The + runtime reloads the marker on restart (`start_frame`). 3 inline tests. +- `src/core/ipc/connection.zig` `replayCommands` (`4590370`) — re-sends each + pending frame verbatim (same `seq_id`) and awaits a matching reply; the + first nack / timeout / seq desync stops the pass hard (`complete = false`, + no idempotence, §7.2/§7.3). Never raises. + +**`crash_recovery.zig` un-gated to Windows + replay case** (`fab104f`). The +4 tests now compile + run on Windows too (was POSIX-only): per-OS differences +isolated in `spawnAndHandshake` (POSIX SCM_RIGHTS handoff vs Windows +open-by-name) and the cleanup helpers. **0.16 migration:** +`std.time.milliTimestamp` and `std.Thread.sleep` are both gone — clock/sleep +now route through the cross-platform `platform.time` wrapper +(`nowNanos` / `sleepPrecise(io, …)`), so each test owns a `std.Io.Threaded`. +New fourth case: `SaveProject` sets a clean line, 3 post-save commands queue +unacked, the runtime is `kill -9`'d (EOF detected < 100 ms), then a restart + +`replayCommands` re-acks all 3 in < 500 ms aggregate. The persisted snapshot +`.bin` is cleaned up via `io` and the pattern is gitignored. + +**`fuzz_1h.zig` → full catalogue + leak detection + nightly** (`8c34093`, +`3d56e67`). The harness sent only `Echo`; it now picks a random message type +each iteration over a 15-type catalogue slice (incl. `ShmRegionsHandoff` and +every E2 command). Interleaving heterogeneous frame *sizes* is what actually +stresses the length-prefixed framing's delimiting (the no-desync gate) — a +fixed-size `Echo` stream never did. 
A `CountingAllocator` over +`page_allocator` turns any leak into a non-zero exit (`alloc == free` + byte +tallies balance); `sent == recv` + a clean reader prove no desync. A +deterministic bad-magic teardown sentinel (published after the `stop` flag) +unblocks the reader's blocking `recv` rather than relying on a recv timeout. +Duration is overridable via argv (env vars are gone in 0.16): +`zig build test-ipc-fuzz-1h -- --duration-ms=N`. New nightly workflow +`.github/workflows/nightly-fuzz.yml` (cron 04:00 UTC + `workflow_dispatch`) +runs the 1 h fuzz in ReleaseSafe on `ubuntu-24.04` + `windows-2025` and +uploads each digest as an artifact (G3). + +**Verified (POSIX, dev Apple Silicon).** `test-ipc` green incl. all 4 +`crash_recovery` tests (the spawning ones run the real `weld-runtime`). +3 s fuzz smoke: `sent=1070183 recv=1070183 fault=0 alloc=free +bytes_alloc=bytes_freed` — perfect delimiting over 1.07 M heterogeneous +frames, zero leak. `fmt` + `lint` green. `crash_recovery.zig` + `fuzz_1h.zig` +compile clean for `x86_64-windows` (corrected target-first cross method). + +**Windows validation discipline (E3 lesson).** The cross-compile check is +**not** a Windows proof. Pending Guy on the real Windows PC: +1. `zig build test-ipc` (the un-gated `crash_recovery` must be green — + detection < 100 ms, replay < 500 ms aggregate); +2. `zig build test-ipc-fuzz-1h -- --duration-ms=60000` (≈ 1 min smoke; digest + must show `fault=0`, `sent == recv`, balanced alloc/free); +3. the nightly cron activates automatically once the branch is squash-merged + to `main` (scheduled runs fire only from the default branch); it can also + be triggered early via `workflow_dispatch` from the Actions tab. + +### 2026-06-06 — E4 review addendum (Claude.ai): harden fuzz teardown + +Commit `3b53414`. The E4 fuzz teardown used a bad-magic sentinel and +reclassified `error.InvalidMagic` as benign once `stop == 1`. 
That opened a +narrow **masking window**: a real framing desync on the last backlog data +frames — sent before the writer published `stop` but received after — would +be scored as teardown instead of a fault, blunting the no-desync gate. +Narrow but unacceptable for that gate. + +Fix (harness only — prod transport unchanged): the sentinel is now a +**well-formed** frame of a catalogue type kept out of `fuzz_types` +(`ShutdownAck`), recognised by the reader via its `msg_type`. The clean +end-of-stream is therefore a valid frame, never an error — so `InvalidMagic` +/ `UnknownMsgType` / version / size mismatches are **unconditional faults at +any time**, regardless of `stop`. Only a post-`stop` socket EOF stays +tolerated (outside the targeted bug class). A `comptime` guard keeps the +sentinel type out of `fuzz_types`. Verified: 60 s smoke +`sent == recv == 20 440 075`, `fault=0`, alloc/free + bytes balanced; `fmt` +clean; clean `x86_64-windows` cross-compile (not a Windows proof). Pending +Guy's 60 s smoke on the real Windows PC. + +## Acted deviations + +### Files touched outside the brief's "Files to create or modify" list + +- **`src/core/ipc/cleanup.zig` (new)** — the brief's file list did not anticipate a + dedicated module for the §2.4 startup orphan reaping (an explicit E1 deliverable: + "Validate startup orphan cleanup on Linux"). A self-contained, rooted, unit-tested + module is cleaner than inlining a raw `opendir` scan into `src/editor/main.zig`. + Rooted in `src/core/root.zig` per §13. +- **`src/core/ipc/{protocol,messages,connection}.zig`, `src/core/root.zig`** — these are + the concrete repo locations of the brief's `/ipc.zig` wrapper and + `src/ipc/ipc_messages.etch` catalogue (no `.etch` codegen exists; the catalogue is + hand-written Zig). Reconciliation logged in the E1 reconnaissance entry above. 
+- **`tests/ipc/crash_recovery.zig` modified in E1** (the brief assigns it to E4) — E1 + changed the runtime contract (a handoff is now required after the handshake), so the + three runtime-spawning tests had to send the handoff to stay valid, and the file was + un-gated to macOS. The deeper E4 rework (detection < 100 ms / replay < 500 ms timing + on Linux + Windows) is unchanged and still owned by E4. +- **`tests/ipc/schema_hash.zig` modified** — added `ShmRegionsHandoff` to the + hash-uniqueness set (was "13 S6 messages"); keeps the wire-collision guard complete. +- **`src/runtime/main.zig` + `src/core/platform/process.zig` Windows paths (E3)** — the + brief's E3 "Files" centre on `src/editor/main.zig`, but a green Windows handshake (the + Windows observable-behavior criterion) requires the runtime to also leave + `error.Unimplemented` AND `platform.process.spawn_process` to implement `CreateProcessW` + (it was a Windows stub returning `error.SpawnFailed`). Confirmed acted with Guy in the + E3 green-light message — a direct consequence of the Windows observable-behavior + criterion, not new scope. +- **`src/core/ipc/command_log.zig` + `src/core/ipc/snapshot.zig` (new, E4)** — the brief + placed `CommandLog` inside `src/editor/main.zig` and the snapshot persistence inside + `src/runtime/main.zig`. Both are instead self-contained, rooted, unit-tested modules + (6 + 3 inline tests), same rationale as `cleanup.zig` in E1: a ring buffer and a + binary file format are reusable Tier 0 primitives, cleaner tested in isolation than + inlined into a `main`. Rooted in `root.zig` per §13. No surface widening — they are + internal to the `ipc` namespace. +- **`src/core/ipc/connection.zig` `replayCommands` (E4)** — the brief assigned the + "replay sequence" to `src/editor/main.zig`. The reusable send-pending-and-await-ack + primitive lives at the connection layer (which already owns `sendMessage`/`recvFrame`) + rather than in the editor `main`; the editor would call it. 
Keeps the replay logic + unit-testable (exercised by the `crash_recovery` replay case) and out of the window/IPC + glue. +- **`.gitignore` (modify, E4)** — added `weld-snap-*.bin` / `weld-snapshot-*.bin` so the + runtime's `SaveProject` snapshot artifacts (and crash-recovery test leftovers) never + enter the work tree. Pure hygiene, not in the brief's file list. + +### Known debt left untouched — `tests/assets/cache_diff.zig` (M0.6) + +`tests/assets/cache_diff.zig` (the M0.6 asset-cook cache perf test) asserts an +**absolute** cook-time ratio, which is fragile across hosts and may fail under +`zig build test` on Windows. Per Guy's E3 addendum, this is **out of M0.7 scope** and +must NOT be touched here: it is an M0.6 debt to address separately (make the assertion +tolerant, or move it out of the gate). If it red-fails on the Windows CI / PC, it is a +pre-existing M0.6 issue, not an M0.7 regression. Flagged for a dedicated follow-up. + +## Blockers encountered + +## Closing notes + +**M0.7 — IPC complete — CLOSED 2026-06-06.** The milestone completed and +hardened the S6 IPC skeleton and absorbed the carried-forward Phase −1 IPC +debts. No Tier 0 surface widening beyond what the SCM_RIGHTS pivot required. +Criterion **C0.4 (editor↔runtime IPC stable)** is advanced. + +### What shipped (E1→E4) + +- **E1 — SCM_RIGHTS primary attach (POSIX).** Cross-process + `shm_open(O_RDWR)` (refused `EACCES` for non-creator siblings on macOS BSD + shm) is gone. The editor creates each region, keeps the fd, and hands the + viewport fd to the runtime via `IpcSocket.sendWithHandles` in a + `ShmRegionsHandoff` message right after `ProtocolHelloAck`; the runtime + maps it with `ShmRegion.fromFd` / `ShmViewport.fromFd`. `open` is demoted + to intra-process. Protocol bumped to **v3**. Startup orphan reaping + (`cleanup.reapOrphans`) added. EINTR loops attested non-regressed + (M0.5-owned, not re-implemented). 
+- **E2 — extended catalogue.** `ShmRegionsHandoff`/`ShmRegionDesc` (E1), + `Play/Pause/Stop/LoadScene/HotReloadScript`, `SaveScene`, and + `SaveProject`→`ProjectSaved` (transactional, same `seq_id`) + `RuntimeError` + (non-fatal event). `msgTypeOf` + `isKnown` extended to `1..23`; + `schema_hash` uniqueness guard kept complete. `SaveScene` is declared but + intentionally not wired (out of scope). +- **E3 — Windows editor path.** `src/editor/main.zig` out of + `error.Unimplemented`: `CreateProcessW` spawn + named pipe + the S2 Win32 + window. Editor switched to full `std.process.Init` (Juicy Main) for + `init.io`/`init.arena`. +- **E4 — best-effort replay + nightly fuzz.** Editor `CommandLog` ring + (1024); `last_clean_line` set on the `ProjectSaved` ack. `SaveProject` + persists a minimal binary snapshot (option 1; no `.scene.etch` writer), + reloaded on restart. `connection.replayCommands` re-sends pending + post-clean-line frames and awaits matching acks, stopping hard on + nack/timeout/desync (no idempotence, §7.2/§7.3). `crash_recovery.zig` + un-gated to Windows (detection < 100 ms, replay < 500 ms aggregate). + `fuzz_1h.zig` promoted to a nightly cron (Linux + Windows, artifact), + rewritten to fuzz the full catalogue with leak detection. + +### Review addendums acted + +- **E3-a (handoff §8.3 hardening, `8859ff9`)** — `acceptShmHandoff` rejects + `region_count == 0`, `> MAX_SHM_REGIONS`, or an fd-count mismatch; closes + every received fd on rejection and every non-viewport fd on success, so no + malformed handoff leaks descriptors. Negative tests added. 
+- **E3-b/c/d (Windows portability, `32e4fbb` + `1c3e31d`)** — surfaced by + Guy's real Windows runs, invisible to compile checks: + `parseArgs` → `Args.Iterator.initAllocator` (`.init` is a `@compileError` + on Windows); `spawn_process` error set remapped `InvalidUtf8` → + `InvalidArgument` via a `utf8ToUtf16Z` helper; `CreateProcessW` + `error.SpawnFailed` fixed by resolving the runtime path from the editor's + own exe dir + `.exe` (with a `GetLastError` diagnostic); MSVCRT-correct + command-line quoting (`ArgvQuote`, `34a057e`). +- **E4-a (fuzz teardown, `3b53414`)** — the bad-magic teardown sentinel had + reclassified `InvalidMagic` as benign once `stop == 1`, a narrow window + that could mask a real desync on the last backlog frames. Replaced by a + *well-formed* sentinel frame of a type kept out of `fuzz_types` + (`ShutdownAck`); `InvalidMagic` / `UnknownMsgType` / size-mismatch are now + unconditional faults at any time. Harness only — prod transport unchanged. + +### Validation (3 platforms) + +- **POSIX — macOS dev (Apple Silicon, ReleaseSafe/Debug).** `zig build test` + / `test-ipc` green incl. all 4 `crash_recovery` cases (the spawning ones + drive the real `weld-runtime`). Fuzz 60 s smoke: `sent == recv == + 20 440 075`, `fault=0`, alloc/free + byte tallies balanced. `fmt` + `lint` + + pre-push (build + test + test-release + tsan-wayland) green. +- **POSIX — Linux CI** (`ubuntu-24.04`, Debug + ReleaseSafe): green. +- **Windows — real PC (Guy).** Native `zig build` + `test-ipc` + `test` + green; interactive editor run OK ("ipc demo completed cleanly": + `CreateProcessW` + handshake + round-trip + clean shutdown); + `crash_recovery` green; fuzz 60 s smoke green. (`tests/assets/cache_diff.zig` + flakes intermittently — pre-existing **M0.6** debt, out of M0.7 scope.) +- **Windows CI** (`windows-2025`, Debug + ReleaseSafe): green. 
+- **Nightly fuzz cron** (`.github/workflows/nightly-fuzz.yml`): activates on + squash-merge to `main` (scheduled runs fire only from the default branch); + `workflow_dispatch` available meanwhile. + +### Acted deviations (recap; details above in § Acted deviations) + +- **Path reconciliation** — the spec's `src/ipc/ipc_messages.etch` (no `.etch` + codegen exists) → hand-written `src/core/ipc/messages.zig`; the + `/ipc.zig` wrapper → `src/core/ipc/{shm,shm_posix,shm_windows, + transport,transport_windows,viewport}.zig`. +- **Dedicated rooted modules** instead of inlining into the `main`s: + `cleanup.zig` (E1 orphan reaping), `command_log.zig` + `snapshot.zig` (E4) + — reusable, unit-tested Tier 0 primitives; rooted in `root.zig` per §13. +- **`connection.replayCommands`** placed at the connection layer (not in the + editor `main`) — keeps replay unit-testable. +- **`src/runtime/main.zig` + `src/core/platform/process.zig` Windows paths** + touched beyond the E3 Files list — a green Windows handshake required the + runtime to leave `error.Unimplemented` and `spawn_process` to implement + `CreateProcessW`. Confirmed acted with Guy. +- **`.gitignore`** — snapshot artifact patterns (hygiene). **`build.zig`** — + `handoff_fd.zig`/`catalogue.zig` registration + nightly arg forwarding (in + the brief's Notes). **`schema_hash.zig`/`process.zig`** — guard + native + Windows spawn test. + +### Out-of-repo (Claude.ai KB, NOT in this PR) + +`engine-ipc.md` (§4.7/§4.8 fd-passing, §7 replay) and `engine-phase-0-plan.md` +are patched by Claude.ai in the knowledge base — out of the repo, not files +Claude Code edits. + +### Known follow-ups + +- `tests/assets/cache_diff.zig` absolute-ratio assertion — fragile across + hosts; **M0.6** debt to make tolerant or move out of the gate (a chip was + spawned). Not an M0.7 regression. +- Phase 3: `transport_windows.sendWithHandles` (`DuplicateHandle`) lands with + the GPU shared framebuffer (§4.7); `SaveScene` handler wiring. 
diff --git a/build.zig b/build.zig index 4499057..c7eb279 100644 --- a/build.zig +++ b/build.zig @@ -621,6 +621,8 @@ pub fn build(b: *std.Build) void { "tests/ipc/viewport_cases/wrong_width.zig", "tests/ipc/viewport_cases/no_tearing_1000_frames.zig", "tests/ipc/fd_passing.zig", + "tests/ipc/handoff_fd.zig", + "tests/ipc/catalogue.zig", "tests/ipc/process.zig", "tests/ipc/handshake.zig", "tests/ipc/crash_recovery.zig", @@ -642,17 +644,18 @@ pub fn build(b: *std.Build) void { t_mod.addImport("weld_core", core_module); const t = b.addTest(.{ .root_module = t_mod }); const run_t = b.addRunArtifact(t); - // `tests/ipc/crash_recovery.zig` spawns - // `zig-out/bin/weld-runtime` to exercise the editor↔runtime - // termination contract (G4 + G5). The path is relative to - // the project root which is the cwd when `zig build test` - // dispatches the test binary; the runtime exe must already - // be installed for `posix_spawnp` to find it. Bare - // `b.addRunArtifact(t).step.dependOn(b.getInstallStep())` - // would gate the test on every install step (including the - // S5 etch_cook), so we wire the dependency narrowly to the - // runtime install step alone. - if (std.mem.eql(u8, p, "tests/ipc/crash_recovery.zig")) { + // `tests/ipc/crash_recovery.zig` and `tests/ipc/catalogue.zig` + // spawn `zig-out/bin/weld-runtime` to exercise the editor↔runtime + // contract end-to-end (G4 + G5; M0.7 / E2 catalogue handlers). The + // path is relative to the project root which is the cwd when + // `zig build test` dispatches the test binary; the runtime exe must + // already be installed for `posix_spawnp` to find it. Bare + // `b.addRunArtifact(t).step.dependOn(b.getInstallStep())` would gate + // the test on every install step (including the S5 etch_cook), so we + // wire the dependency narrowly to the runtime install step alone. 
+ if (std.mem.eql(u8, p, "tests/ipc/crash_recovery.zig") or + std.mem.eql(u8, p, "tests/ipc/catalogue.zig")) + { run_t.step.dependOn(&b.addInstallArtifact(runtime_exe, .{}).step); } test_step.dependOn(&run_t.step); @@ -698,9 +701,13 @@ pub fn build(b: *std.Build) void { b.installArtifact(fuzz_1h_exe); const fuzz_1h_run = b.addRunArtifact(fuzz_1h_exe); fuzz_1h_run.step.dependOn(b.getInstallStep()); + // Forward `-- <args>` so the duration can be overridden for a local + // smoke run, e.g. `zig build test-ipc-fuzz-1h -- --duration-ms=3000`. + // The nightly workflow always passes `--duration-ms` explicitly (3600000 ms = 1 h unless overridden via `workflow_dispatch`). + if (b.args) |forwarded| fuzz_1h_run.addArgs(forwarded); const fuzz_1h_step = b.step( "test-ipc-fuzz-1h", - "Run the S6 1 h IPC fuzz harness (manual; output digest archived in validation/s6-go-nogo.md)", + "Run the IPC fuzz harness over the full catalogue (1 h default; nightly CI). Override: -- --duration-ms=N", ); fuzz_1h_step.dependOn(&fuzz_1h_run.step); diff --git a/src/core/ipc/cleanup.zig b/src/core/ipc/cleanup.zig new file mode 100644 index 0000000..f9e2bd1 --- /dev/null +++ b/src/core/ipc/cleanup.zig @@ -0,0 +1,203 @@ +//! Startup orphan reaping for IPC endpoints (`engine-ipc.md` §2.4 + +//! §6.3). The editor owns its Unix socket file (`/tmp/weld-<pid>.sock`) +//! and its POSIX shm regions (`/weld-shm-<name>-<pid>`); a `kill -9` +//! of the editor leaves both behind, named with the dead editor's PID. +//! The next editor calls `reapOrphans` at startup to remove any such +//! orphan whose embedded PID is no longer alive (`process.is_alive`). +//! +//! Safety: an endpoint is removed **only** when its PID is dead, so a +//! second editor running concurrently (live PID) never has its +//! endpoints reaped. The reap is best-effort — every failure is +//! swallowed (it is startup hygiene, not a correctness gate). +//! +//! Implementation note: raw `opendir`/`readdir` via `extern "c"`, +//! consistent with the rest of the IPC module (`shm_posix.zig`, +//!
`transport_posix.zig`) which binds libc directly to stay decoupled +//! from the evolving `std.fs` / `std.Io.Dir` signatures across Zig +//! 0.16 patches. Windows is a no-op: named pipes and named file +//! mappings are refcounted kernel objects that vanish with their last +//! handle, so there is nothing to unlink. shm orphan scanning uses the +//! Linux `/dev/shm` tmpfs listing (macOS POSIX shm objects are not +//! filesystem-visible, so only the socket reap runs there). + +const std = @import("std"); +const builtin = @import("builtin"); + +const process = @import("../platform/process.zig"); + +const is_linux = builtin.os.tag == .linux; +const is_posix = builtin.os.tag == .linux or builtin.os.tag == .macos; + +// `struct dirent` layout. Linux glibc and macOS (arm64, 64-bit inode) +// differ in field order/width before `d_name`; only the `d_name` +// offset matters here (we read it as a NUL-terminated slice). +const dirent = if (is_linux) extern struct { + d_ino: u64, + d_off: i64, + d_reclen: u16, + d_type: u8, + d_name: [256]u8, +} else extern struct { + d_ino: u64, + d_seekoff: u64, + d_reclen: u16, + d_namlen: u16, + d_type: u8, + d_name: [1024]u8, +}; + +const DIR = opaque {}; + +const sys = struct { + extern "c" fn opendir(name: [*:0]const u8) ?*DIR; + extern "c" fn readdir(dir: *DIR) ?*dirent; + extern "c" fn closedir(dir: *DIR) c_int; + extern "c" fn unlink(path: [*:0]const u8) c_int; + extern "c" fn shm_unlink(name: [*:0]const u8) i32; +}; + +/// Removes orphan IPC endpoints left by crashed editors. Scans +/// `/tmp` for `weld-<pid>.sock` sockets (Linux + macOS) and, on Linux, +/// `/dev/shm` for `weld-shm-*-<pid>` regions, unlinking each whose +/// `<pid>` is no longer alive. Best-effort and side-effect-safe for a +/// concurrently-running editor (live PIDs are kept). No-op on Windows.
+pub fn reapOrphans() void { + if (comptime !is_posix) return; + reapSocketOrphans(); + if (comptime is_linux) reapShmOrphans(); +} + +/// Parses the PID out of an editor socket name `weld-<pid>.sock`. +/// Returns `null` for any name that is not exactly that shape (so +/// test sockets like `weld-crashtest-<pid>.sock` are left untouched). The PID must be plain ASCII digits: `parseInt` alone would also accept a leading sign or `_` separators. +fn pidFromSocketName(name: []const u8) ?process.Pid { + const prefix = "weld-"; + const suffix = ".sock"; + if (!std.mem.startsWith(u8, name, prefix)) return null; + if (!std.mem.endsWith(u8, name, suffix)) return null; + const mid = name[prefix.len .. name.len - suffix.len]; + if (mid.len == 0 or std.mem.indexOfNone(u8, mid, "0123456789") != null) return null; + return std.fmt.parseInt(process.Pid, mid, 10) catch null; +} + +/// Parses the trailing PID out of a shm region name +/// `weld-shm-<name>-<pid>`. Returns `null` when the name does not +/// start with `weld-shm-` or has no numeric trailing segment (plain ASCII digits only — no sign, no `_` separators). +fn pidFromShmName(name: []const u8) ?process.Pid { + const prefix = "weld-shm-"; + if (!std.mem.startsWith(u8, name, prefix)) return null; + const last_dash = std.mem.lastIndexOfScalar(u8, name, '-') orelse return null; + const pid_str = name[last_dash + 1 ..]; + if (pid_str.len == 0 or std.mem.indexOfNone(u8, pid_str, "0123456789") != null) return null; + return std.fmt.parseInt(process.Pid, pid_str, 10) catch null; +} + +fn reapSocketOrphans() void { + const d = sys.opendir("/tmp") orelse return; + defer _ = sys.closedir(d); + while (sys.readdir(d)) |ent| { + const name = std.mem.sliceTo(&ent.d_name, 0); + const pid = pidFromSocketName(name) orelse continue; + if (process.is_alive(pid)) continue; + var buf: [320]u8 = undefined; + const full = std.fmt.bufPrintZ(&buf, "/tmp/{s}", .{name}) catch continue; + _ = sys.unlink(full.ptr); + } +} + +fn reapShmOrphans() void { + const d = sys.opendir("/dev/shm") orelse return; + defer _ = sys.closedir(d); + while (sys.readdir(d)) |ent| { + const name = std.mem.sliceTo(&ent.d_name, 0); + const pid = pidFromShmName(name) orelse continue; + if (process.is_alive(pid)) continue; + // `shm_unlink` takes the name as
passed to `shm_open` (leading + // slash), which maps to `/dev/shm/<name>` on Linux. + var buf: [320]u8 = undefined; + const shm_name = std.fmt.bufPrintZ(&buf, "/{s}", .{name}) catch continue; + _ = sys.shm_unlink(shm_name.ptr); + } +} + +// ---------------------------------------------------------------- tests -- + +extern "c" fn getpid() process.Pid; +extern "c" fn creat(path: [*:0]const u8, mode: c_uint) c_int; +extern "c" fn close(fd: c_int) c_int; +extern "c" fn access(path: [*:0]const u8, mode: c_int) c_int; + +test "pidFromSocketName parses weld-.sock only" { + try std.testing.expectEqual(@as(?process.Pid, 1234), pidFromSocketName("weld-1234.sock")); + try std.testing.expectEqual(@as(?process.Pid, null), pidFromSocketName("weld-crashtest-1234.sock")); + try std.testing.expectEqual(@as(?process.Pid, null), pidFromSocketName("weld-.sock")); + try std.testing.expectEqual(@as(?process.Pid, null), pidFromSocketName("other-1234.sock")); + try std.testing.expectEqual(@as(?process.Pid, null), pidFromSocketName("weld-1234.txt")); +} + +test "pidFromShmName parses the trailing pid of weld-shm--" { + try std.testing.expectEqual(@as(?process.Pid, 77), pidFromShmName("weld-shm-viewport-77")); + try std.testing.expectEqual(@as(?process.Pid, 5), pidFromShmName("weld-shm-overlays-5")); + try std.testing.expectEqual(@as(?process.Pid, null), pidFromShmName("weld-viewport-9")); + try std.testing.expectEqual(@as(?process.Pid, null), pidFromShmName("weld-shm-noprefixmatch")); +} + +test "reapOrphans removes a dead-pid socket and keeps a live-pid one" { + if (comptime !is_posix) return error.SkipZigTest; + + // A PID far above any real one — `kill(pid, 0)` returns ESRCH, so + // `is_alive` is false. Deterministic on Linux + macOS.
+ const dead_pid: process.Pid = 0x7FFF_FFFF; + const my_pid = getpid(); + try std.testing.expect(!process.is_alive(dead_pid)); + try std.testing.expect(process.is_alive(my_pid)); + + var dead_buf: [64]u8 = undefined; + var live_buf: [64]u8 = undefined; + const dead_path = try std.fmt.bufPrintZ(&dead_buf, "/tmp/weld-{d}.sock", .{dead_pid}); + const live_path = try std.fmt.bufPrintZ(&live_buf, "/tmp/weld-{d}.sock", .{my_pid}); + + _ = sys.unlink(dead_path.ptr); + _ = sys.unlink(live_path.ptr); + const fd_dead = creat(dead_path.ptr, 0o600); + try std.testing.expect(fd_dead >= 0); + _ = close(fd_dead); + const fd_live = creat(live_path.ptr, 0o600); + try std.testing.expect(fd_live >= 0); + _ = close(fd_live); + defer _ = sys.unlink(live_path.ptr); + defer _ = sys.unlink(dead_path.ptr); + + reapOrphans(); + + // The dead-pid orphan is gone; the live-pid endpoint survives. + try std.testing.expect(access(dead_path.ptr, 0) != 0); + try std.testing.expect(access(live_path.ptr, 0) == 0); +} + +test "reapOrphans removes a dead-pid shm region on Linux" { + if (comptime !is_linux) return error.SkipZigTest; + + const dead_pid: process.Pid = 0x7FFF_FFFF; + var name_buf: [64]u8 = undefined; + const shm_name = try std.fmt.bufPrintZ(&name_buf, "/weld-shm-reaptest-{d}", .{dead_pid}); + const O_RDWR: i32 = 0x0002; + const O_CREAT: i32 = 0x0040; + const shm = struct { + extern "c" fn shm_open(name: [*:0]const u8, oflag: i32, mode: u32) i32; + }; + _ = sys.shm_unlink(shm_name.ptr); // clear a prior run's leftover + const fd = shm.shm_open(shm_name.ptr, O_RDWR | O_CREAT, 0o600); + try std.testing.expect(fd >= 0); + _ = close(fd); + defer _ = sys.shm_unlink(shm_name.ptr); // belt-and-suspenders + + // `/dev/shm/weld-shm-reaptest-<pid>` now exists.
+ var path_buf: [64]u8 = undefined; + const dev_path = try std.fmt.bufPrintZ(&path_buf, "/dev/shm/weld-shm-reaptest-{d}", .{dead_pid}); + try std.testing.expect(access(dev_path.ptr, 0) == 0); + + reapOrphans(); + + try std.testing.expect(access(dev_path.ptr, 0) != 0); +} diff --git a/src/core/ipc/command_log.zig b/src/core/ipc/command_log.zig new file mode 100644 index 0000000..aafb6af --- /dev/null +++ b/src/core/ipc/command_log.zig @@ -0,0 +1,264 @@ +//! Editor-side command log for best-effort replay after a runtime crash +//! (`engine-ipc.md` §7, `engine-tools-editor.md` §2.7.3). A fixed-capacity +//! ring of the editor→runtime commands sent, each retaining its encoded +//! frame so it can be re-sent verbatim to a freshly restarted runtime. +//! +//! `last_clean_line` is advanced to the current head when a `SaveProject` +//! ack (`ProjectSaved`) arrives — everything appended up to that point is +//! durable on disk (the runtime's minimal snapshot, §7.1) and need not be +//! replayed. After a crash + restart the editor replays the entries since +//! `last_clean_line` that the runtime never acked (§7.2). No idempotence +//! is attempted (§7.3): a replay that nacks or times out stops hard. +//! +//! M0.7 scope: this is the IPC-replay materialization the brief E4 calls +//! for. The richer Islandz `Command` model (`engine-tools-editor.md` +//! §2.4) is Phase 2 — here an entry is just the wire frame + status. + +const std = @import("std"); + +/// Ring capacity (`engine-tools-editor.md` §2.7.3). The oldest entry is +/// FIFO-dropped once exceeded. +pub const capacity: usize = 1024; + +/// Max encoded-frame bytes retained per entry. The largest editor→runtime +/// command frame is `LoadScene` / `SaveScene` at `16 + 8 + 256 = 280` B; +/// 512 leaves headroom for future commands without resizing the ring. +pub const max_frame_bytes: usize = 512; + +/// Lifecycle of a logged command relative to the runtime. 
+pub const EntryStatus = enum(u8) { pending, acked, nacked }; + +/// One logged command. `frame` holds the full encoded frame exactly as +/// sent, so replay re-sends it byte-for-byte (same `seq_id`). +pub const Entry = struct { + seq_id: u32, + msg_type: u16, + status: EntryStatus, + issued_at_us: u64, + /// 0 until `markAcked`. + acked_at_us: u64, + frame_len: u32, + frame: [max_frame_bytes]u8, + + /// The encoded frame as sent, ready to re-send verbatim on replay. + pub fn frameBytes(self: *const Entry) []const u8 { + return self.frame[0..self.frame_len]; + } +}; + +/// Errors raised by `CommandLog` operations. +pub const Error = error{FrameTooLarge} || std.mem.Allocator.Error; + +/// Ring of `capacity` command entries plus the `last_clean_line` anchor. +/// `head` is the monotone count of commands ever appended; ring slot is +/// `head % capacity`. Entries older than `head - capacity` are gone. +pub const CommandLog = struct { + gpa: std.mem.Allocator, + entries: []Entry, + head: u64 = 0, + last_clean_line: u64 = 0, + + /// Allocate the ring. Caller owns it; pair with `deinit`. + pub fn init(gpa: std.mem.Allocator) Error!CommandLog { + const entries = try gpa.alloc(Entry, capacity); + return .{ .gpa = gpa, .entries = entries }; + } + + /// Free the ring and poison the value. + pub fn deinit(self: *CommandLog) void { + self.gpa.free(self.entries); + self.* = undefined; + } + + /// Record a sent command. FIFO: once `capacity` is exceeded the oldest + /// entry is overwritten. `frame` is the full encoded frame as sent. 
+ pub fn append( + self: *CommandLog, + seq_id: u32, + msg_type: u16, + frame: []const u8, + now_us: u64, + ) Error!void { + if (frame.len > max_frame_bytes) return error.FrameTooLarge; + const e = &self.entries[self.head % capacity]; + e.seq_id = seq_id; + e.msg_type = msg_type; + e.status = .pending; + e.issued_at_us = now_us; + e.acked_at_us = 0; + e.frame_len = @intCast(frame.len); + @memcpy(e.frame[0..frame.len], frame); + self.head += 1; + } + + /// Index of the oldest still-retained entry; entries before it were + /// FIFO-dropped. + fn oldestRetained(self: *const CommandLog) u64 { + return if (self.head > capacity) self.head - capacity else 0; + } + + /// Mark the pending entry carrying `seq_id` as acked. No-op if it is + /// not in the retained window (already dropped / unknown seq). + pub fn markAcked(self: *CommandLog, seq_id: u32, now_us: u64) void { + var i = self.oldestRetained(); + while (i < self.head) : (i += 1) { + const e = &self.entries[i % capacity]; + if (e.seq_id == seq_id and e.status == .pending) { + e.status = .acked; + e.acked_at_us = now_us; + return; + } + } + } + + /// Mark the pending entry carrying `seq_id` as nacked (runtime refused). + pub fn markNacked(self: *CommandLog, seq_id: u32) void { + var i = self.oldestRetained(); + while (i < self.head) : (i += 1) { + const e = &self.entries[i % capacity]; + if (e.seq_id == seq_id and e.status == .pending) { + e.status = .nacked; + return; + } + } + } + + /// Advance `last_clean_line` to the current head — call when the + /// `ProjectSaved` ack arrives (§7.1). Everything appended so far is + /// now durable and excluded from replay. + pub fn markCleanLine(self: *CommandLog) void { + self.last_clean_line = self.head; + } + + /// True when a command appended after the last clean line has been + /// FIFO-dropped (the ring overflowed since the last save). Those are + /// unrecoverable for replay; the editor warns the user (§7.1). 
+ pub fn droppedUnsaved(self: *const CommandLog) bool { + return self.oldestRetained() > self.last_clean_line; + } + + /// Iterator over the entries to replay after a crash: those appended + /// since `last_clean_line` and still `pending` (never acked/nacked), + /// in send order, clamped to the retained window. + pub const ReplayIterator = struct { + log: *const CommandLog, + i: u64, + + /// Next pending entry to replay, or `null` when exhausted. + pub fn next(self: *ReplayIterator) ?*const Entry { + while (self.i < self.log.head) { + const e = &self.log.entries[self.i % capacity]; + self.i += 1; + if (e.status == .pending) return e; + } + return null; + } + }; + + /// Iterator over the commands to replay after a crash (pending, + /// appended since `last_clean_line`, in send order). + pub fn replaySince(self: *const CommandLog) ReplayIterator { + return .{ .log = self, .i = @max(self.last_clean_line, self.oldestRetained()) }; + } +}; + +// ---------------------------------------------------------------- tests -- + +fn dummyFrame(seq: u32) [16]u8 { + var f: [16]u8 = undefined; + std.mem.writeInt(u32, f[0..4], 0x57454C44, .little); // 'WELD' + std.mem.writeInt(u32, f[4..8], seq, .little); + @memset(f[8..], 0); + return f; +} + +test "append records a pending entry with the frame bytes" { + var log = try CommandLog.init(std.testing.allocator); + defer log.deinit(); + + const f = dummyFrame(7); + try log.append(7, 5, &f, 1000); + try std.testing.expectEqual(@as(u64, 1), log.head); + + var it = log.replaySince(); + const e = it.next().?; + try std.testing.expectEqual(@as(u32, 7), e.seq_id); + try std.testing.expectEqual(@as(u16, 5), e.msg_type); + try std.testing.expectEqual(EntryStatus.pending, e.status); + try std.testing.expectEqualSlices(u8, &f, e.frameBytes()); + try std.testing.expect(it.next() == null); +} + +test "append rejects an over-large frame" { + var log = try CommandLog.init(std.testing.allocator); + defer log.deinit(); + const big = [_]u8{0} ** 
(max_frame_bytes + 1); + try std.testing.expectError(error.FrameTooLarge, log.append(1, 1, &big, 0)); +} + +test "acked entries are excluded from replay" { + var log = try CommandLog.init(std.testing.allocator); + defer log.deinit(); + + const f1 = dummyFrame(1); + const f2 = dummyFrame(2); + try log.append(1, 5, &f1, 100); + try log.append(2, 5, &f2, 200); + log.markAcked(1, 150); + + var it = log.replaySince(); + const e = it.next().?; + try std.testing.expectEqual(@as(u32, 2), e.seq_id); // only the unacked one + try std.testing.expect(it.next() == null); +} + +test "markCleanLine excludes pre-save commands from replay" { + var log = try CommandLog.init(std.testing.allocator); + defer log.deinit(); + + const a = dummyFrame(1); + const b = dummyFrame(2); + const c = dummyFrame(3); + try log.append(1, 5, &a, 1); // before save (e.g. acked by ProjectSaved chain) + try log.append(2, 21, &b, 2); // SaveProject itself + log.markCleanLine(); // ProjectSaved ack arrived + try log.append(3, 5, &c, 3); // after save — the only replayable one + + var it = log.replaySince(); + const e = it.next().?; + try std.testing.expectEqual(@as(u32, 3), e.seq_id); + try std.testing.expect(it.next() == null); +} + +test "ring overflow FIFO-drops oldest and flags dropped-unsaved" { + var log = try CommandLog.init(std.testing.allocator); + defer log.deinit(); + + // Fill past capacity: append capacity + 5 commands, never saving. + var i: u32 = 0; + while (i < capacity + 5) : (i += 1) { + const f = dummyFrame(i); + try log.append(i, 5, &f, i); + } + try std.testing.expectEqual(@as(u64, capacity + 5), log.head); + // last_clean_line is still 0 but the first 5 entries were dropped. + try std.testing.expect(log.droppedUnsaved()); + + // Replay yields exactly the retained `capacity` entries, oldest first. 
+ var it = log.replaySince(); + const first = it.next().?; + try std.testing.expectEqual(@as(u32, 5), first.seq_id); // 0..4 dropped + var count: usize = 1; + while (it.next()) |_| count += 1; + try std.testing.expectEqual(capacity, count); +} + +test "nacked entries are excluded from replay" { + var log = try CommandLog.init(std.testing.allocator); + defer log.deinit(); + const f = dummyFrame(9); + try log.append(9, 5, &f, 0); + log.markNacked(9); + var it = log.replaySince(); + try std.testing.expect(it.next() == null); +} diff --git a/src/core/ipc/connection.zig b/src/core/ipc/connection.zig index e7757d2..0564ea5 100644 --- a/src/core/ipc/connection.zig +++ b/src/core/ipc/connection.zig @@ -29,6 +29,7 @@ const framing = @import("framing.zig"); const messages = @import("messages.zig"); const protocol = @import("protocol.zig"); const transport = @import("transport.zig"); +const command_log = @import("command_log.zig"); /// All errors a connection method can raise. Union of the transport /// errors (socket I/O), framing errors (invalid header / schema @@ -45,6 +46,16 @@ pub const Frame = struct { payload_bytes: []const u8, }; +/// A frame received alongside out-of-band OS handles +/// (`recvFrameWithHandles`). `handles` is the number of descriptors +/// written into the caller's `handles_out` slot vector — the order +/// matches the sender's `sendMessageWithHandles` handle order. +pub const FrameWithHandles = struct { + header: framing.Header, + payload_bytes: []const u8, + handles: usize, +}; + /// One IPC connection. `socket` is borrowed — the caller owns the /// `IpcSocket` lifetime. pub const IpcConnection = struct { @@ -130,6 +141,53 @@ pub const IpcConnection = struct { }; } + /// Receive one frame together with the out-of-band OS handles the + /// sender attached via `sendMessageWithHandles` (POSIX + /// `SCM_RIGHTS`). 
The ancillary fds are delivered by the kernel + /// with the first byte chunk; this reads that chunk via + /// `recvWithHandles` (capturing the handles), then tops up any + /// short read with plain `recv` to complete the frame — the + /// handles have already arrived. Used for `ShmRegionsHandoff` + /// (`engine-ipc.md` §4.8). `buf` should be sized to exactly + /// `framing.frameSizeOf(T)` so the first read cannot pull bytes of + /// a following frame. Returns `error.Unimplemented` on Windows + /// (the named-pipe backend has no `recvWithHandles` in M0.7). + pub fn recvFrameWithHandles( + self: *IpcConnection, + buf: []u8, + handles_out: []transport.OsHandle, + ) Error!FrameWithHandles { + if (buf.len < @sizeOf(framing.Header)) return error.UnexpectedEof; + + const first = try self.socket.recvWithHandles(buf, handles_out); + if (first.bytes == 0) return error.UnexpectedEof; + var got: usize = first.bytes; + + // Top up the header if the first chunk was short — the fds + // already rode in with `first`, so plain `recv` is correct here. + while (got < @sizeOf(framing.Header)) { + const n = try self.socket.recv(buf[got..]); + if (n == 0) return error.UnexpectedEof; + got += n; + } + const header = try framing.parseHeader(buf[0..@sizeOf(framing.Header)]); + + const payload_len: usize = @intCast(header.payload_len); + const total = @sizeOf(framing.Header) + payload_len; + if (total > buf.len) return error.PayloadTooLarge; + while (got < total) { + const n = try self.socket.recv(buf[got..]); + if (n == 0) return error.UnexpectedEof; + got += n; + } + + return .{ + .header = header, + .payload_bytes = buf[@sizeOf(framing.Header)..total], + .handles = first.handles, + }; + } + /// Convenience helper — receive a frame and decode it as `T` in /// one shot. The caller must size `scratch` to at least /// `framing.frameSizeOf(T)`. 
Returns `error.UnknownMsgType` if @@ -152,3 +210,93 @@ fn readExact(socket: *transport.IpcSocket, dst: []u8) transport.Error!void { got += n; } } + +/// Raised by `acceptShmHandoff` when a `ShmRegionsHandoff` is malformed. +pub const HandoffError = error{InvalidHandoff}; + +/// Validate a decoded `ShmRegionsHandoff` against the fds delivered +/// out-of-band and select the viewport fd to map (`engine-ipc.md` +/// §8.3). `handles` is the populated prefix of the receiver's handle +/// vector (i.e. `handoff_handles[0..recv_result.handles]`). +/// +/// Rules (any violation ⇒ `error.InvalidHandoff`): +/// - `region_count` is in `[1, MAX_SHM_REGIONS]`; +/// - the fd count equals `region_count` exactly. +/// +/// On a violation, **every** received fd is closed before returning so +/// a malformed handoff cannot leak descriptors into the runtime. On +/// success, M0.7 maps only `regions[0]` (`viewport_framebuffer`); the +/// fds of any further declared regions are closed here, and the +/// viewport fd (`handles[0]`, now owned by the caller) is returned. +pub fn acceptShmHandoff( + handoff: *const messages.ShmRegionsHandoff, + handles: []const transport.OsHandle, +) HandoffError!transport.OsHandle { + const region_count: usize = handoff.region_count; + if (region_count == 0 or + region_count > messages.MAX_SHM_REGIONS or + handles.len != region_count) + { + for (handles) |h| transport.closeHandle(h); + return error.InvalidHandoff; + } + // Map only the viewport (regions[0]); close every other region fd so + // a multi-region handoff cannot leak descriptors into the runtime. + for (handles[1..]) |h| transport.closeHandle(h); + return handles[0]; +} + +/// Outcome of a `replayCommands` pass. +pub const ReplayResult = struct { + /// Commands successfully re-sent and acked. + replayed: usize, + /// True when every pending command replayed; false when a nack / + /// timeout / desync stopped the pass early (§7.2). 
+ complete: bool, +}; + +/// Best-effort replay after a crash + restart (`engine-ipc.md` §7.2, +/// `engine-tools-editor.md` §2.7.4). For each command in `log` since the +/// last clean line still pending, re-send its frame verbatim over `conn` +/// and await a reply carrying the same `seq_id`; on a match, mark it +/// acked and continue. The first nack, timeout, or desync stops the pass +/// hard — no idempotence is attempted (§7.3). The caller arms the +/// per-command timeout via a socket recv timeout (`SO_RCVTIMEO` on +/// POSIX); a recv error (timeout / EOF) ends the pass. `scratch` must +/// hold one full reply frame. Never raises — failures end the pass with +/// `complete = false`. +/// +/// **Invariant — `seq_id` safety (§3.4).** This pass is *synchronous* and +/// drains each replayed command fully — `send` → `recvFrame` of the ack +/// carrying the same `seq_id` → `markAcked` — before advancing to the next +/// entry, and the caller (the editor) MUST NOT resume emitting new commands +/// until this function returns. That strict serialization is what +/// guarantees a replayed `seq_id` can never coexist with a freshly-issued +/// one in the editor's `seq_id`→callback map: each replayed id is retired +/// (acked) one at a time, before any new id is minted. Making the replay +/// asynchronous, or pipelining it (issuing the next frame before the prior +/// ack lands, or overlapping it with normal traffic), would BREAK this +/// guarantee and reopen the collision window. Keep it strictly serial. +pub fn replayCommands( + conn: *IpcConnection, + log: *command_log.CommandLog, + scratch: []u8, + now_us: u64, +) ReplayResult { + var it = log.replaySince(); + var replayed: usize = 0; + while (it.next()) |entry| { + const seq = entry.seq_id; + // Re-send the original frame byte-for-byte (same seq_id). `seq` + // is captured before any mutation; `entry` is not read after the + // `markAcked` below (forward-only iteration, no revisit). 
+ // Synchronous drain: block on THIS command's ack before the next + // send — never pipeline (see the seq_id-safety invariant above). + conn.socket.send(entry.frameBytes()) catch return .{ .replayed = replayed, .complete = false }; + const frame = conn.recvFrame(scratch) catch return .{ .replayed = replayed, .complete = false }; + if (frame.header.seq_id != seq) return .{ .replayed = replayed, .complete = false }; + log.markAcked(seq, now_us); + replayed += 1; + } + return .{ .replayed = replayed, .complete = true }; +} diff --git a/src/core/ipc/messages.zig b/src/core/ipc/messages.zig index 89ee806..2a1eb98 100644 --- a/src/core/ipc/messages.zig +++ b/src/core/ipc/messages.zig @@ -1,8 +1,15 @@ -//! Catalogue of the 13 IPC messages used in S6, defined as `extern -//! struct` POD per `engine-ipc.md` §3.2 + brief § Scope. Every payload -//! is written/read byte-for-byte across the socket, preceded by an -//! 8-byte `schema_hash` that detects build-version drift between the -//! editor and the runtime. +//! Catalogue of the IPC messages, defined as `extern struct` POD per +//! `engine-ipc.md` §3.2 + brief § Scope. Every payload is written/read +//! byte-for-byte across the socket, preceded by an 8-byte +//! `schema_hash` that detects build-version drift between the editor +//! and the runtime. +//! +//! S6 shipped 13 message types; M0.7 / E1 adds `ShmRegionsHandoff` +//! (the POSIX fd handoff, §3.3 + §4.8). M0.7 / E2 extends the +//! catalogue further (`Play`/`Pause`/`Stop`, `LoadScene`, +//! `HotReloadScript`, `SaveScene`, `SaveProject`/`ProjectSaved`, +//! `RuntimeError`). The `WELD_IPC_PROTOCOL_VERSION` 2→3 bump covers +//! the whole M0.7 catalogue + attach-semantics change. //! //! The S6 brief acknowledges a triple count inconsistency in its own //! 
text — the catalogue is described as "exactly 11 message types", @@ -28,10 +35,9 @@ const std = @import("std"); const rtti = @import("../rtti/root.zig"); /// Message-type discriminator written in the framing header -/// (`framing.zig` `Header.msg_type: u16`). Values are stable across -/// the protocol version `WELD_IPC_PROTOCOL_VERSION = 1`; reordering -/// or renumbering is a breaking change that bumps the protocol -/// version. +/// (`framing.zig` `Header.msg_type: u16`). Discriminant values are +/// stable for a given `WELD_IPC_PROTOCOL_VERSION`; reordering or +/// renumbering is a breaking change that bumps the protocol version. pub const MsgType = enum(u16) { /// Runtime → Editor — handshake (first message after connect). protocol_hello = 1, @@ -62,13 +68,39 @@ pub const MsgType = enum(u16) { shutdown_ack = 12, /// Runtime → Editor — unidirectional log event (no ack). log_message = 13, + /// Editor → Runtime — POSIX shm fd handoff (M0.7 / E1, + /// `engine-ipc.md` §3.3 + §4.8). Sent right after the handshake + /// via `sendWithHandles`; the fds ride as ancillary data. + shm_regions_handoff = 14, + /// Editor → Runtime — start simulation (fire-and-forget, §3.4). + play = 15, + /// Editor → Runtime — pause simulation (fire-and-forget). + pause = 16, + /// Editor → Runtime — stop simulation (fire-and-forget). + stop = 17, + /// Editor → Runtime — load a scene by path (fire-and-forget). + load_scene = 18, + /// Editor → Runtime — hot-reload a script by asset handle. + hot_reload_script = 19, + /// Editor → Runtime — save ONE scene by path (scene granularity). + /// Declared in M0.7 with **no wired handler** — wiring deferred to + /// the scene serialization pipeline (out of Phase 0, brief § Out-of-scope). + save_scene = 20, + /// Editor → Runtime — save the whole project (transactional, §3.4). + /// The runtime replies with `project_saved` carrying the same `seq_id`. + save_project = 21, + /// Runtime → Editor — ack of `save_project` (same `seq_id`). 
+ project_saved = 22, + /// Runtime → Editor — non-fatal recoverable error event (no ack). + /// Distinct from `CrashReport` (reserved for the fatal case). + runtime_error = 23, /// Returns true when the raw `u16` from a frame header maps to a /// declared variant. Used by `framing.validate` to fail fast on /// unknown discriminants. pub fn isKnown(raw: u16) bool { return switch (raw) { - 1...13 => true, + 1...23 => true, else => false, }; } @@ -93,6 +125,13 @@ pub const LogLevel = enum(u32) { err = 4, }; +/// Severity carried by `RuntimeError.severity` (`engine-ipc.md` §3.3). +/// Numeric values are stable across the protocol version. +pub const ErrorSeverity = enum(u32) { + warning = 0, + err = 1, +}; + /// Runtime → Editor. First message of the handshake (cf. /// `engine-ipc.md` §5.1). The editor replies with `ProtocolHelloAck` /// to accept or reject. @@ -210,6 +249,120 @@ pub const LogMessage = extern struct { text: [256]u8, }; +/// NUL-terminated capacity for a `ShmRegionDesc.logical_name`. +/// `"viewport_framebuffer"` (20 bytes) is the longest name M0.7 hands +/// off; 32 leaves headroom for the §4.1 names (`debug_overlays`, +/// `profiler_samples`, `selection_snapshot`, `log_stream`). +pub const SHM_LOGICAL_NAME_LEN: usize = 32; + +/// Maximum shm regions carried by one `ShmRegionsHandoff`. M0.7 hands +/// off only `viewport_framebuffer`; the §4.1 catalogue tops out at 5 +/// regions. 8 is comfortable headroom and keeps the frame small +/// (`8 × 40 + 8 = 328` payload bytes). +pub const MAX_SHM_REGIONS: usize = 8; + +/// One shm-region descriptor inside a `ShmRegionsHandoff` +/// (`engine-ipc.md` §3.3). The fd travels out-of-band via +/// `SCM_RIGHTS`; this struct carries only the logical name and size so +/// the runtime can pair each received fd with its role and `mmap` the +/// right length via `ShmRegion.fromFd`. +pub const ShmRegionDesc = extern struct { + /// NUL-terminated logical role, e.g. `"viewport_framebuffer"`. 
+ logical_name: [SHM_LOGICAL_NAME_LEN]u8, + /// Region size in bytes — the `mmap` length on the runtime side. + size: u64, +}; + +/// Editor → Runtime, POSIX (M0.7 / E1). Hands the runtime the file +/// descriptors of the shm regions the editor created +/// (`engine-ipc.md` §4.8 + §3.3). Sent immediately after +/// `ProtocolHelloAck` through `IpcSocket.sendWithHandles`: the fds +/// ride as `SCM_RIGHTS` ancillary data in the same order as +/// `regions[0..region_count]`. The runtime maps each via +/// `ShmRegion.fromFd` and **never** calls cross-process `shm_open`. +/// The receiver validates that the ancillary fd count equals +/// `region_count` (`engine-ipc.md` §8.3). +pub const ShmRegionsHandoff = extern struct { + /// Number of valid entries in `regions` (and of fds in the + /// ancillary data). `1` in M0.7 (`viewport_framebuffer` only). + region_count: u32, + _pad0: u32 = 0, + /// Fixed-capacity descriptor table; only the first `region_count` + /// entries are meaningful. Fixed size keeps the frame an + /// `extern struct` POD like every other catalogue message. + regions: [MAX_SHM_REGIONS]ShmRegionDesc, +}; + +/// Editor → Runtime. Start the simulation. Fire-and-forget (§3.4) — no +/// ack. The single reserved byte keeps it a non-zero-sized extern POD. +pub const Play = extern struct { + _reserved: u8 = 0, +}; + +/// Editor → Runtime. Pause the simulation. Fire-and-forget. +pub const Pause = extern struct { + _reserved: u8 = 0, +}; + +/// Editor → Runtime. Stop the simulation. Fire-and-forget. +pub const Stop = extern struct { + _reserved: u8 = 0, +}; + +/// Editor → Runtime. Load a scene by filesystem path. Fire-and-forget. +pub const LoadScene = extern struct { + /// NUL-terminated scene path. Longer paths truncate at the sender. + path: [256]u8, +}; + +/// Editor → Runtime. Hot-reload a script identified by its stable asset +/// handle (`AssetHandle` = `u64` per §3.2). 
pub const HotReloadScript = extern struct {
    /// Asset handle of the script to reload.
    script_handle: u64,
};

/// Editor → Runtime. Request to save a single scene, addressed by path
/// (scene granularity, maps Conduit `scene.save`). M0.7 only declares
/// the message: **no handler is wired** (see `MsgType.save_scene`); the
/// wiring is deferred to the scene serialization pipeline (out of
/// Phase 0).
pub const SaveScene = extern struct {
    /// Scene path, NUL-terminated.
    path: [256]u8,
};

/// Editor → Runtime. Request to save the whole project (all dirty
/// scenes + project settings + modified prefabs). Transactional (§3.4):
/// the runtime answers with `ProjectSaved` carrying the same `seq_id`,
/// and that ack anchors the editor `CommandLog.last_clean_line` (§7,
/// wired in E4). Project granularity carries no path, hence no body
/// beyond the reserved byte.
pub const SaveProject = extern struct {
    _reserved: u8 = 0,
};

/// Runtime → Editor. Acknowledges `SaveProject`; the frame header
/// repeats the request's `seq_id`. A failed save (`ok == 0`) comes with
/// a human-readable `reason`.
pub const ProjectSaved = extern struct {
    /// Outcome flag: 1 = saved, 0 = failed. Stored as `u8` because
    /// `bool` is not legal in an `extern struct` in Zig 0.16.
    ok: u8,
    _pad0: [3]u8 = [_]u8{0} ** 3,
    /// Failure reason, NUL-terminated. Empty when `ok == 1`.
    reason: [128]u8,
};

/// Runtime → Editor. Reports an error the runtime recovered from
/// (failed non-transactional command, missing asset, …) so the editor
/// can show a toast or list it in the "Replay Errors" panel. One-way —
/// no ack. Not to be confused with `CrashReport`, which is reserved for
/// the fatal signal + stacktrace case.
pub const RuntimeError = extern struct {
    /// An `ErrorSeverity` value stored as `u32` — extern struct can't
    /// embed Zig enums.
    severity: u32,
    /// Name of the module that raised the error, NUL-terminated.
    source: [64]u8,
    /// Error text, UTF-8, NUL-terminated.
    text: [256]u8,
};

/// Returns the `MsgType` discriminator for a given message struct.
/// Used by callers to fill the framing header without manually
/// keeping the type↔enum mapping in sync at each call site.
@@ -228,6 +381,16 @@ pub fn msgTypeOf(comptime T: type) MsgType { Shutdown => .shutdown, ShutdownAck => .shutdown_ack, LogMessage => .log_message, + ShmRegionsHandoff => .shm_regions_handoff, + Play => .play, + Pause => .pause, + Stop => .stop, + LoadScene => .load_scene, + HotReloadScript => .hot_reload_script, + SaveScene => .save_scene, + SaveProject => .save_project, + ProjectSaved => .project_saved, + RuntimeError => .runtime_error, else => @compileError("msgTypeOf: not a Weld IPC message type: " ++ @typeName(T)), }; } @@ -274,7 +437,12 @@ test "every message type is extern with non-zero size" { ModifyComponent, ModifyAck, Heartbeat, HeartbeatAck, Shutdown, ShutdownAck, - LogMessage, + LogMessage, ShmRegionsHandoff, + Play, Pause, + Stop, LoadScene, + HotReloadScript, SaveScene, + SaveProject, ProjectSaved, + RuntimeError, }) |T| { try std.testing.expect(@sizeOf(T) > 0); } @@ -288,9 +456,10 @@ test "msgTypeOf maps every message to its discriminator" { test "MsgType.isKnown rejects out-of-range values" { try std.testing.expect(MsgType.isKnown(1)); - try std.testing.expect(MsgType.isKnown(13)); + try std.testing.expect(MsgType.isKnown(14)); // shm_regions_handoff (M0.7 / E1) + try std.testing.expect(MsgType.isKnown(23)); // runtime_error (M0.7 / E2) try std.testing.expect(!MsgType.isKnown(0)); - try std.testing.expect(!MsgType.isKnown(14)); + try std.testing.expect(!MsgType.isKnown(24)); try std.testing.expect(!MsgType.isKnown(65535)); } @@ -302,7 +471,12 @@ test "schemaHash is non-zero for every message type" { ModifyComponent, ModifyAck, Heartbeat, HeartbeatAck, Shutdown, ShutdownAck, - LogMessage, + LogMessage, ShmRegionsHandoff, + Play, Pause, + Stop, LoadScene, + HotReloadScript, SaveScene, + SaveProject, ProjectSaved, + RuntimeError, }) |T| { try std.testing.expect(schemaHash(T) != 0); } diff --git a/src/core/ipc/protocol.zig b/src/core/ipc/protocol.zig index 20fa137..249103b 100644 --- a/src/core/ipc/protocol.zig +++ b/src/core/ipc/protocol.zig @@ -35,7 
+35,17 @@ pub const MAGIC: u32 = 0x57454C44; /// algorithms produce different bytes on the wire and /// `engine-ipc.md` §5.2 forbids negotiation, hence the version bump. /// Cf. `briefs/M0.2-rtti-resources-events-bindgen.md` E2. -pub const WELD_IPC_PROTOCOL_VERSION: u16 = 2; +/// +/// Bumped M0.7 (2 → 3) — two conjoint breaking changes +/// (`engine-ipc.md` §5.2): (1) POSIX shm attach becomes `SCM_RIGHTS` +/// fd-passing via `ShmRegionsHandoff` after the handshake (attach +/// semantics, §4.8 / §5.1); (2) the catalogue gains mandatory messages +/// (`ShmRegionsHandoff`, `SaveProject`, `ProjectSaved`, `RuntimeError`) +/// and activates `Play`/`Pause`/`Stop`, `LoadScene`, `HotReloadScript`. +/// An S6/M0.2 editor and an M0.7 runtime are strictly incompatible — +/// expected behavior, no negotiation. Cf. +/// `briefs/M0.7-ipc-scm-rights-windows-fuzz.md`. +pub const WELD_IPC_PROTOCOL_VERSION: u16 = 3; /// Maximum payload size in bytes (`payload_len` ceiling per /// `engine-ipc.md` §3.1). Frames with `payload_len > MAX_PAYLOAD_LEN` diff --git a/src/core/ipc/shm.zig b/src/core/ipc/shm.zig index 622676e..62b7fc3 100644 --- a/src/core/ipc/shm.zig +++ b/src/core/ipc/shm.zig @@ -6,9 +6,13 @@ //! //! Lifetime: the editor side calls `create(name, size)` to allocate //! the region and `close()` to release it (POSIX also `shm_unlink`s -//! the name). The runtime side calls `open(name)` to attach to an -//! existing region and `close()` to detach (no `shm_unlink` on the -//! attached side). +//! the name). On POSIX the runtime side attaches via `fromFd(fd, size)` +//! on the descriptor the editor passes over the socket (`SCM_RIGHTS`, +//! the **primary cross-process attach** per `engine-ipc.md` §4.8); +//! `open(name)` is demoted to intra-process discovery (a single +//! process re-attaching a region it created) and is no longer used +//! for the cross-process runtime attach. On Windows the attach stays +//! by name (`open`) — the named mapping has no BSD shm quirk. //! //! 
Naming convention per `engine-ipc.md` §2: //! - POSIX : `/weld-shm--` @@ -27,6 +31,14 @@ const backend = switch (builtin.os.tag) { else => @compileError("Weld IPC shm: unsupported OS"), }; +const transport = @import("transport.zig"); + +/// OS-native handle backing a region: `std.posix.fd_t` (i32) on +/// Linux/macOS, `std.os.windows.HANDLE` on Windows. Same alias the +/// transport layer passes to `IpcSocket.sendWithHandles`, so the +/// editor can forward `ShmRegion.fd()` directly without a cast. +pub const OsHandle = transport.OsHandle; + /// Error set for shared-memory segment operations (create, open, /// resize, unlink). Backends translate native errors into this set. pub const Error = error{ @@ -40,6 +52,9 @@ pub const Error = error{ ShmTruncateFailed, ShmMapFailed, ShmOpenFailed, + /// `fromFd` on Windows: CPU shm attach there is by name, not by + /// descriptor (the `SCM_RIGHTS` pivot is POSIX-only, §4.8). + Unimplemented, } || std.mem.Allocator.Error; /// One shared-memory region. Both the creator (editor) and the @@ -71,7 +86,11 @@ pub const ShmRegion = struct { }; } - /// Runtime side. Attaches to an already-created region. + /// Intra-process attach by name. Demoted in M0.7: it is **no + /// longer** the cross-process runtime attach (that is `fromFd`, + /// §4.8). Reserved for a single process re-attaching a region it + /// created, and for the Windows attach path (named mapping, no + /// BSD shm quirk). pub fn open(name: []const u8, size: usize) Error!ShmRegion { const impl = try backend.Backend.open(name, size); return .{ @@ -82,6 +101,30 @@ pub const ShmRegion = struct { }; } + /// Runtime side, POSIX. Attaches to a region from a file descriptor + /// received over the IPC socket (`SCM_RIGHTS`). The **primary + /// cross-process attach** per `engine-ipc.md` §4.8 — no `shm_open`, + /// no name. The fd ownership transfers to the region (closed in + /// `close()`); the region is never the owner, so `close()` does not + /// `shm_unlink`. 
Returns `error.Unimplemented` on Windows. + pub fn fromFd(handle: OsHandle, size: usize) Error!ShmRegion { + const impl = try backend.Backend.fromFd(handle, size); + return .{ + .impl = impl, + .ptr = impl.ptr, + .size = size, + .is_owner = false, + }; + } + + /// The OS handle backing this region (POSIX fd / Windows mapping + /// handle), to forward to the runtime via + /// `IpcSocket.sendWithHandles` (`engine-ipc.md` §4.8). Only the + /// POSIX fd is used for the cross-process attach in M0.7. + pub fn fd(self: *const ShmRegion) OsHandle { + return self.impl.handle(); + } + /// Unmap + (creator only) unlink the underlying name. The /// kernel keeps the backing pages alive while any process still /// has the region mapped, so the close order between creator diff --git a/src/core/ipc/shm_posix.zig b/src/core/ipc/shm_posix.zig index e4fa196..53ad052 100644 --- a/src/core/ipc/shm_posix.zig +++ b/src/core/ipc/shm_posix.zig @@ -91,11 +91,14 @@ const Error = shm.Error; /// POSIX `shm_open` + `mmap` backend for the IPC viewport shared /// memory segment. Embedded inside `shm.Segment.impl` on Linux/macOS. pub const Backend = struct { - name_z: [:0]u8, + /// `null` for a `fromFd` attach (the received fd has no name in + /// this process — cross-process attach is by fd, not by name, + /// per `engine-ipc.md` §4.8). Non-null for `create`/`open`. + name_z: ?[:0]u8 = null, gpa: std.mem.Allocator, - /// `shm_open` fd. Kept open for the lifetime of the `Backend` - /// per the macOS quirk documented in the file header. Closed in - /// `close()`. + /// `shm_open` fd (create/open) or the fd received via SCM_RIGHTS + /// (`fromFd`). Kept open for the lifetime of the `Backend` per the + /// macOS quirk documented in the file header. Closed in `close()`. 
fd: i32, ptr: [*]align(std.heap.pageSize()) u8, size: usize, @@ -162,15 +165,44 @@ pub const Backend = struct { }; } + /// Attach to a shm region from a file descriptor received over the + /// IPC socket via `SCM_RIGHTS` (`IpcSocket.recvWithHandles`). This + /// is the **primary cross-process attach** on POSIX + /// (`engine-ipc.md` §4.8): no `shm_open`, no name. The fd ownership + /// transfers to the `Backend` and is closed in `close()`. The + /// region is never the owner (the editor that created it via + /// `create` keeps ownership), so `close()` never `shm_unlink`s. + pub fn fromFd(fd: i32, size: usize) Error!Backend { + const raw = sys.mmap(null, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (raw == null or @intFromPtr(raw.?) == MAP_FAILED_RAW) return error.ShmMapFailed; + + const ptr: [*]align(std.heap.pageSize()) u8 = @ptrCast(@alignCast(raw.?)); + return Backend{ + .name_z = null, + .gpa = std.heap.page_allocator, + .fd = fd, + .ptr = ptr, + .size = size, + }; + } + + /// The backing fd, to transmit to the runtime via + /// `IpcSocket.sendWithHandles` (`engine-ipc.md` §4.8). Named + /// `handle` rather than `fd` to avoid shadowing the `fd` field. + pub fn handle(self: *const Backend) i32 { + return self.fd; + } + pub fn close(self: *Backend, is_owner: bool) void { _ = sys.munmap(@ptrCast(self.ptr), self.size); _ = sys.close(self.fd); - if (is_owner) _ = sys.shm_unlink(self.name_z.ptr); - self.gpa.free(self.name_z); + if (self.name_z) |nz| { + if (is_owner) _ = sys.shm_unlink(nz.ptr); + self.gpa.free(nz); + } self.fd = -1; self.size = 0; - // `name_z` is left dangling — close() is single-shot, the - // caller must drop the Backend value after. 
+ self.name_z = null; } }; diff --git a/src/core/ipc/shm_windows.zig b/src/core/ipc/shm_windows.zig index dd4ba01..0b3c2b9 100644 --- a/src/core/ipc/shm_windows.zig +++ b/src/core/ipc/shm_windows.zig @@ -111,6 +111,25 @@ pub const Backend = struct { return Backend{ .mapping = mapping, .ptr = ptr, .size = size }; } + /// Windows CPU shm attach stays by name (`open`) — the named + /// mapping has no BSD shm quirk, so the `SCM_RIGHTS`/`fromFd` pivot + /// (`engine-ipc.md` §4.8) is POSIX-only. Handle passing on Windows + /// (`DuplicateHandle`) is reserved for the Phase 3 GPU shared + /// framebuffer (§4.7). Returns `error.Unimplemented` so a caller + /// that mistakenly routes the Windows attach through `fromFd` + /// fails loudly instead of silently. + pub fn fromFd(handle_in: shm.OsHandle, size: usize) Error!Backend { + _ = .{ handle_in, size }; + return error.Unimplemented; + } + + /// The mapping kernel-object handle. Unused by the M0.7 Windows + /// attach path (which is by name); present for API symmetry with + /// the POSIX backend. + pub fn handle(self: *const Backend) Handle { + return self.mapping; + } + pub fn close(self: *Backend, is_owner: bool) void { _ = is_owner; // Windows refcounts the mapping kernel object — // no `unlink` step distinct from the unmap+close pair. diff --git a/src/core/ipc/snapshot.zig b/src/core/ipc/snapshot.zig new file mode 100644 index 0000000..37538df --- /dev/null +++ b/src/core/ipc/snapshot.zig @@ -0,0 +1,105 @@ +//! Minimal binary scene snapshot persisted by the runtime on +//! `SaveProject`, reloaded on restart as the best-effort-replay +//! reference point (`engine-ipc.md` §7.1, brief E4 "option 1"). This is +//! **not** a `.scene.etch` writer and carries no project-settings +//! serialization (out of Phase 0): for the M0.7 runtime stub the +//! "active scene" is the mire, so the snapshot records a single marker +//! (`frame_id`) — enough to give replay a concrete reload point on the +//! restarted runtime. +//! +//! 
File I/O goes through `std.Io.Dir` + `io` (the 0.16 filesystem API); +//! `std.fs.cwd()` no longer exists. The runtime supplies `io` from its +//! `std.process.Init`. + +const std = @import("std"); + +/// `"WSNP"` little-endian — distinct from the framing/viewport magics. +pub const magic: u32 = 0x57534E50; +/// Snapshot layout revision; a mismatch makes `read` return `null`. +pub const version: u16 = 1; + +/// On-disk snapshot record (fixed size, little-endian extern layout). +pub const Snapshot = extern struct { + magic: u32, + version: u16, + _pad: u16 = 0, + /// The active scene's minimal state. M0.7 stub: a save marker (the + /// `SaveProject` seq_id) standing in for the reloadable scene state. + frame_id: u64, +}; + +/// Persist `snap` to `path`, overwriting any prior snapshot. Called by +/// the runtime when it acks `SaveProject`. `magic`/`version` are stamped +/// here so callers need only fill `frame_id`. +pub fn write(io: std.Io, path: []const u8, snap: Snapshot) !void { + var rec = snap; + rec.magic = magic; + rec.version = version; + const f = try std.Io.Dir.cwd().createFile(io, path, .{ .truncate = true }); + defer f.close(io); + try f.writeStreamingAll(io, std.mem.asBytes(&rec)); +} + +/// Read the snapshot at `path`. Returns `null` when absent or malformed +/// (a fresh runtime with no prior save starts clean — `engine-ipc.md` +/// §7.2: no save ⇒ no reload point). 
+pub fn read(io: std.Io, path: []const u8) ?Snapshot { + const f = std.Io.Dir.cwd().openFile(io, path, .{}) catch return null; + defer f.close(io); + + var scratch: [64]u8 = undefined; + var reader = f.reader(io, &scratch); + var dst: [@sizeOf(Snapshot)]u8 = undefined; + var got: usize = 0; + while (got < dst.len) { + const n = reader.interface.readSliceShort(dst[got..]) catch return null; + if (n == 0) break; + got += n; + } + if (got != @sizeOf(Snapshot)) return null; + + var snap: Snapshot = undefined; + @memcpy(std.mem.asBytes(&snap), &dst); + if (snap.magic != magic or snap.version != version) return null; + return snap; +} + +// ---------------------------------------------------------------- tests -- + +test "snapshot write then read round-trips" { + var threaded = std.Io.Threaded.init(std.testing.allocator, .{}); + defer threaded.deinit(); + const io = threaded.io(); + + const path = "weld-snapshot-unittest.bin"; + std.Io.Dir.cwd().deleteFile(io, path) catch {}; + defer std.Io.Dir.cwd().deleteFile(io, path) catch {}; + + try write(io, path, .{ .magic = 0, .version = 0, .frame_id = 4242 }); + const got = read(io, path) orelse return error.SnapshotMissing; + try std.testing.expectEqual(@as(u32, magic), got.magic); + try std.testing.expectEqual(@as(u16, version), got.version); + try std.testing.expectEqual(@as(u64, 4242), got.frame_id); +} + +test "snapshot read of an absent path returns null" { + var threaded = std.Io.Threaded.init(std.testing.allocator, .{}); + defer threaded.deinit(); + const io = threaded.io(); + try std.testing.expect(read(io, "weld-snapshot-absent-xyz.bin") == null); +} + +test "snapshot read rejects a wrong-magic file" { + var threaded = std.Io.Threaded.init(std.testing.allocator, .{}); + defer threaded.deinit(); + const io = threaded.io(); + + const path = "weld-snapshot-badmagic.bin"; + std.Io.Dir.cwd().deleteFile(io, path) catch {}; + defer std.Io.Dir.cwd().deleteFile(io, path) catch {}; + + const f = try 
std.Io.Dir.cwd().createFile(io, path, .{ .truncate = true }); + try f.writeStreamingAll(io, &[_]u8{ 0xDE, 0xAD, 0xBE, 0xEF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }); + f.close(io); + try std.testing.expect(read(io, path) == null); +} diff --git a/src/core/ipc/transport.zig b/src/core/ipc/transport.zig index 6250165..6c09e9a 100644 --- a/src/core/ipc/transport.zig +++ b/src/core/ipc/transport.zig @@ -46,6 +46,15 @@ pub const OsHandle = backend.OsHandle; /// Sentinel marking an absent handle in a slot. pub const invalid_handle: OsHandle = backend.invalid_handle; +/// Close a single OS handle (POSIX `close` / Windows `CloseHandle`). +/// Used to release an fd received via `recvWithHandles` that the +/// receiver will not retain — e.g. a shm region the runtime declines +/// to map — so a malformed or multi-region handoff cannot leak +/// descriptors (`engine-ipc.md` §8.3). +pub fn closeHandle(h: OsHandle) void { + backend.closeHandle(h); +} + /// Result returned by `recvWithHandles`. pub const RecvResult = struct { bytes: usize, diff --git a/src/core/ipc/transport_posix.zig b/src/core/ipc/transport_posix.zig index 66a6f74..d7a6701 100644 --- a/src/core/ipc/transport_posix.zig +++ b/src/core/ipc/transport_posix.zig @@ -140,6 +140,12 @@ pub const OsHandle = std.posix.fd_t; /// `invalid_handle` re-export. pub const invalid_handle: OsHandle = -1; +/// Close a bare fd received via `recvWithHandles` that the receiver +/// will not retain (avoids leaking descriptors from a handoff). +pub fn closeHandle(h: OsHandle) void { + _ = sys.close(h); +} + const Error = transport.Error; /// Backend struct embedded inside `IpcSocket.impl`. The single field diff --git a/src/core/ipc/transport_windows.zig b/src/core/ipc/transport_windows.zig index c192ba1..fc5835a 100644 --- a/src/core/ipc/transport_windows.zig +++ b/src/core/ipc/transport_windows.zig @@ -89,6 +89,13 @@ pub const OsHandle = std.os.windows.HANDLE; /// re-export. 
pub const invalid_handle: OsHandle = INVALID_HANDLE_VALUE; +/// Close a bare handle received out-of-band that the receiver will not +/// retain. (M0.7: the Windows handoff path is by-name, so this is only +/// exercised on POSIX; present for cross-platform parity.) +pub fn closeHandle(h: OsHandle) void { + _ = sys.CloseHandle(h); +} + const Error = transport.Error; /// Win32 named-pipe backend for `IpcSocket`. Embedded inside diff --git a/src/core/ipc/viewport.zig b/src/core/ipc/viewport.zig index 7744ab4..a5cb1da 100644 --- a/src/core/ipc/viewport.zig +++ b/src/core/ipc/viewport.zig @@ -162,20 +162,43 @@ pub const ShmViewport = struct { return .{ .region = region, .width = width, .height = height }; } - /// Runtime side. Attaches to an existing region and validates - /// the header. + /// Runtime side, intra-process / Windows. Attaches by name and + /// validates the header. On POSIX the cross-process runtime attach + /// is `fromFd` (§4.8); `open` is retained for Windows (named + /// mapping) and intra-process re-attach. pub fn open(name: []const u8, width: u32, height: u32) Error!ShmViewport { const size = regionSize(width, height); var region = try shm.ShmRegion.open(name, size); errdefer region.close(); + try validateHeader(&region, width, height); + return .{ .region = region, .width = width, .height = height }; + } + /// Runtime side, POSIX. Attaches from a descriptor received over + /// the socket (`SCM_RIGHTS`) — the primary cross-process attach + /// (`engine-ipc.md` §4.8) — and validates the header. + pub fn fromFd(handle: shm.OsHandle, width: u32, height: u32) Error!ShmViewport { + const size = regionSize(width, height); + var region = try shm.ShmRegion.fromFd(handle, size); + errdefer region.close(); + try validateHeader(&region, width, height); + return .{ .region = region, .width = width, .height = height }; + } + + /// The fd of the backing region, for the editor to forward to the + /// runtime via `IpcSocket.sendWithHandles` (§4.8). 
+ pub fn fd(self: *const ShmViewport) shm.OsHandle { + return self.region.fd(); + } + + /// Validates the viewport header of an attached region. Shared by + /// `open` and `fromFd`. + fn validateHeader(region: *const shm.ShmRegion, width: u32, height: u32) Error!void { const hdr: *Header = @ptrCast(@alignCast(region.ptr)); if (hdr.magic != HEADER_MAGIC) return error.InvalidHeader; if (hdr.version != HEADER_VERSION) return error.InvalidHeader; if (hdr.width != width or hdr.height != height) return error.InvalidHeader; if (hdr.slot_count != slot_count) return error.InvalidHeader; - - return .{ .region = region, .width = width, .height = height }; } pub fn close(self: *ShmViewport) void { diff --git a/src/core/platform/process.zig b/src/core/platform/process.zig index 683e4a7..b533ea7 100644 --- a/src/core/platform/process.zig +++ b/src/core/platform/process.zig @@ -89,6 +89,40 @@ const win = struct { extern "kernel32" fn GetExitCodeProcess(hProcess: *anyopaque, lpExitCode: *u32) callconv(.winapi) i32; extern "kernel32" fn CloseHandle(hObject: *anyopaque) callconv(.winapi) i32; extern "kernel32" fn OpenProcess(dwDesiredAccess: u32, bInheritHandle: i32, dwProcessId: u32) callconv(.winapi) ?*anyopaque; + extern "kernel32" fn GetLastError() callconv(.winapi) u32; +}; + +/// `STARTUPINFOW` — `cb` must be `@sizeOf(STARTUPINFOW)`; the rest is +/// zeroed for a plain console-less spawn (we inherit nothing and pipe +/// nothing — stdio piping is Phase 0.3 per the file header). 
+const STARTUPINFOW = extern struct { + cb: u32, + lpReserved: ?[*:0]u16, + lpDesktop: ?[*:0]u16, + lpTitle: ?[*:0]u16, + dwX: u32, + dwY: u32, + dwXSize: u32, + dwYSize: u32, + dwXCountChars: u32, + dwYCountChars: u32, + dwFillAttribute: u32, + dwFlags: u32, + wShowWindow: u16, + cbReserved2: u16, + lpReserved2: ?[*]u8, + hStdInput: ?*anyopaque, + hStdOutput: ?*anyopaque, + hStdError: ?*anyopaque, +}; + +/// `PROCESS_INFORMATION` — filled by `CreateProcessW` with the child's +/// process + primary-thread handles and ids. +const PROCESS_INFORMATION = extern struct { + hProcess: ?*anyopaque, + hThread: ?*anyopaque, + dwProcessId: u32, + dwThreadId: u32, }; // `posix_spawnp` needs the parent process's `envp` pointer. The @@ -106,6 +140,67 @@ fn currentEnvp() [*]const ?[*:0]const u8 { }; } +/// Quotes a single argument for a Windows command line per the MSVCRT / +/// `CommandLineToArgvW` rules, so the spawned process reconstructs +/// `argv[i]` byte-for-byte — including the tricky cases the naive +/// `"arg"` wrapping gets wrong (a path ending in one or more `\`, or an +/// argument containing `"`). Caller owns the returned slice. +/// +/// Operates on UTF-8: every metacharacter (` `, `\t`, `\n`, vertical +/// tab, `"`, `\`) is ASCII and UTF-8 is ASCII-transparent, so byte-wise +/// quoting matches the wide-char algorithm `CreateProcessW` will parse. +/// +/// Algorithm (Daniel Colascione's `ArgvQuote`): emit the argument +/// verbatim when it is non-empty and contains no whitespace or `"`; +/// otherwise wrap in `"` and, scanning runs of backslashes, double them +/// before a `"` (literal or the closing one) and leave them as-is +/// elsewhere. 
+pub fn quoteArg(gpa: std.mem.Allocator, arg: []const u8) std.mem.Allocator.Error![]u8 { + var out: std.ArrayList(u8) = .empty; + errdefer out.deinit(gpa); + + if (arg.len != 0 and std.mem.indexOfAny(u8, arg, " \t\n\x0B\"") == null) { + try out.appendSlice(gpa, arg); + return out.toOwnedSlice(gpa); + } + + try out.append(gpa, '"'); + var i: usize = 0; + while (i < arg.len) { + var backslashes: usize = 0; + while (i < arg.len and arg[i] == '\\') : (i += 1) backslashes += 1; + if (i == arg.len) { + // Trailing backslashes precede the closing quote — double them + // so the quote stays a delimiter, not an escaped literal. + try out.appendNTimes(gpa, '\\', backslashes * 2); + break; + } else if (arg[i] == '"') { + // Escape the run of backslashes AND the embedded quote. + try out.appendNTimes(gpa, '\\', backslashes * 2 + 1); + try out.append(gpa, '"'); + i += 1; + } else { + // Backslashes are literal away from a quote. + try out.appendNTimes(gpa, '\\', backslashes); + try out.append(gpa, arg[i]); + i += 1; + } + } + try out.append(gpa, '"'); + return out.toOwnedSlice(gpa); +} + +/// UTF-8 → NUL-terminated UTF-16LE for the Win32 wide APIs, remapping a +/// non-UTF-8 input to `error.InvalidArgument` (it is invalid caller +/// input, not an engine fault) so the process `Error` set stays free of +/// a Unicode member. Caller owns the returned slice. +fn utf8ToUtf16Z(gpa: std.mem.Allocator, s: []const u8) error{ InvalidArgument, OutOfMemory }![:0]u16 { + return std.unicode.utf8ToUtf16LeAllocZ(gpa, s) catch |e| switch (e) { + error.InvalidUtf8 => error.InvalidArgument, + error.OutOfMemory => error.OutOfMemory, + }; +} + /// Spawns a child process running `path` with the supplied /// `argv`. The caller's environment is inherited as-is. 
The /// returned `Process` must be passed to `wait_nonblock` / @@ -147,14 +242,54 @@ pub fn spawn_process( return .{ .pid = pid }; }, .windows => { - // Windows path is wired in S6 only at the API-surface - // level — the editor + runtime binaries are exercised on - // Linux/macOS for S6 acceptance. A real CreateProcessW - // implementation lands when Win11 hardware validation is - // added in Phase 0.6 (consistent with the S3/S4 inherited- - // debt pattern for Windows-only paths). - _ = .{ gpa, path, argv }; - return error.SpawnFailed; + // Build a UTF-8 command line (each arg quoted), convert to + // UTF-16, and spawn via CreateProcessW. `lpApplicationName` + // pins the binary; argv[0] stays in the command line by + // convention. M0.7 / E3 — wires the Windows editor path. + var cmd: std.ArrayList(u8) = .empty; + defer cmd.deinit(gpa); + for (argv, 0..) |a, i| { + if (i != 0) try cmd.append(gpa, ' '); + const quoted = try quoteArg(gpa, a); + defer gpa.free(quoted); + try cmd.appendSlice(gpa, quoted); + } + const cmd_w = try utf8ToUtf16Z(gpa, cmd.items); + defer gpa.free(cmd_w); + const path_w = try utf8ToUtf16Z(gpa, path); + defer gpa.free(path_w); + + var si: STARTUPINFOW = std.mem.zeroes(STARTUPINFOW); + si.cb = @sizeOf(STARTUPINFOW); + var pi: PROCESS_INFORMATION = std.mem.zeroes(PROCESS_INFORMATION); + + const ok = win.CreateProcessW( + path_w.ptr, + cmd_w.ptr, + null, + null, + 0, // bInheritHandles = FALSE + 0, // dwCreationFlags + null, + null, + @ptrCast(&si), + @ptrCast(&pi), + ); + if (ok == 0) { + // Surface the Win32 last-error so a spawn failure is + // diagnosable (e.g. 2 = ERROR_FILE_NOT_FOUND when the + // exe path is wrong / missing the `.exe` suffix) instead + // of an opaque `error.SpawnFailed`. + std.log.scoped(.process).err( + "CreateProcessW failed: path='{s}' GetLastError={d}", + .{ path, win.GetLastError() }, + ); + return error.SpawnFailed; + } + // The primary-thread handle is unused; close it now. 
The + // process handle is retained for `wait_nonblock` / `kill`. + if (pi.hThread) |h| _ = win.CloseHandle(h); + return .{ .pid = pi.dwProcessId, .handle = pi.hProcess }; }, else => @compileError("spawn_process: unsupported OS"), } diff --git a/src/core/root.zig b/src/core/root.zig index 2b4113a..a8b5891 100644 --- a/src/core/root.zig +++ b/src/core/root.zig @@ -73,6 +73,9 @@ pub const ipc = struct { pub const connection = @import("ipc/connection.zig"); pub const server = @import("ipc/server.zig"); pub const client = @import("ipc/client.zig"); + pub const cleanup = @import("ipc/cleanup.zig"); + pub const command_log = @import("ipc/command_log.zig"); + pub const snapshot = @import("ipc/snapshot.zig"); }; /// RTTI namespace — Tier 0 reflection runtime (M0.2 / E1). Comptime @@ -119,6 +122,9 @@ comptime { _ = ipc.connection; _ = ipc.server; _ = ipc.client; + _ = ipc.cleanup; + _ = ipc.command_log; + _ = ipc.snapshot; // Same guard for the M0.1 identity module — `entity.zig`'s inline // tests must be reachable from the core test target's root. _ = ecs.entity; diff --git a/src/editor/main.zig b/src/editor/main.zig index e871cfb..ab808c6 100644 --- a/src/editor/main.zig +++ b/src/editor/main.zig @@ -41,7 +41,10 @@ const vk_blit = @import("vk_blit.zig"); const is_posix = builtin.os.tag == .linux or builtin.os.tag == .macos; const Args = struct { - runtime_path: []const u8 = "zig-out/bin/weld-runtime", + /// Empty = auto-derive from the editor's own executable directory + /// (`/weld-runtime[.exe]`), set after parsing. `--runtime=` + /// overrides it (e.g. tests passing an explicit path). 
+ runtime_path: []const u8 = "", frames: u64 = 3600, no_heartbeat: bool = false, no_spawn: bool = false, @@ -49,7 +52,10 @@ const Args = struct { fn parseArgs(gpa: std.mem.Allocator, init: std.process.Init.Minimal) !Args { var a = Args{}; - var it = std.process.Args.Iterator.init(init.args); + // `Iterator.init` is a `@compileError` on Windows (no POSIX argv) — + // the allocator variant parses the wide command line. `init.args` + // (Juicy Main) is preserved; `deinit` frees the Windows buffer. + var it = try std.process.Args.Iterator.initAllocator(init.args, gpa); defer it.deinit(); _ = it.skip(); while (it.next()) |s| { @@ -79,21 +85,47 @@ fn sleepMs(ms: u64) void { _ = nanosleep(&ts, null); } -pub fn main(init: std.process.Init.Minimal) !void { - if (!is_posix) { - std.debug.print("editor stub: Windows path not implemented in S6 (cf. brief)\n", .{}); - return error.Unimplemented; - } +pub fn main(init: std.process.Init) !void { + // Full Juicy Main (engine-zig-conventions §2 — `Init` for dev tools): + // `init.arena` is process-lifetime + auto-cleaned; `init.io` drives + // the executable-directory lookup used to resolve the runtime path. + const gpa = init.arena.allocator(); + const io = init.io; - var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); - defer arena.deinit(); - const gpa = arena.allocator(); + const args = try parseArgs(gpa, init.minimal); - const args = try parseArgs(gpa, init); + // Resolve the runtime binary. Without `--runtime=`, derive it from + // the editor's own executable directory + `weld-runtime[.exe]` — + // robust against the CWD and OS-correct (CreateProcessW with + // lpApplicationName needs the exact path, incl. the `.exe` suffix, + // and does not search PATH). 
+ const runtime_path: []const u8 = if (args.runtime_path.len != 0) + args.runtime_path + else blk: { + const dir = try std.process.executableDirPathAlloc(io, gpa); + const exe_name = if (builtin.os.tag == .windows) "weld-runtime.exe" else "weld-runtime"; + break :blk try std.fs.path.join(gpa, &.{ dir, exe_name }); + }; const my_pid = getpid(); - const socket_path = try std.fmt.allocPrint(gpa, "/tmp/weld-{d}.sock", .{my_pid}); - const shm_name = try std.fmt.allocPrint(gpa, "/weld-shm-viewport-{d}", .{my_pid}); + // OS-correct endpoints. Socket: `/tmp/weld-.sock` (POSIX Unix + // socket) vs `\\.\pipe\weld-` (Windows named pipe), via + // `transport.buildSocketPath`. Shm name: POSIX `/weld-shm-...` vs + // Windows session-local `Local\weld-shm-...` (engine-ipc.md §2.2). + var ep_name_buf: [64]u8 = undefined; + const ep_name = try std.fmt.bufPrint(&ep_name_buf, "weld-{d}", .{my_pid}); + var sock_path_buf: [128]u8 = undefined; + const socket_path: []const u8 = try ipc.transport.buildSocketPath(&sock_path_buf, ep_name); + const shm_name = if (is_posix) + try std.fmt.allocPrint(gpa, "/weld-shm-viewport-{d}", .{my_pid}) + else + try std.fmt.allocPrint(gpa, "Local\\weld-shm-viewport-{d}", .{my_pid}); + + // ---- Reap orphan sockets / shm regions from any previously + // crashed editor (engine-ipc.md §2.4). Runs before we create our + // own endpoints; only dead-PID orphans are removed, so a second + // live editor is never disturbed. ---- + ipc.cleanup.reapOrphans(); // ---- shm region (created before everything else; runtime // attaches to it once spawned) ---- @@ -128,26 +160,31 @@ pub fn main(init: std.process.Init.Minimal) !void { const socket_arg = try std.fmt.allocPrint(gpa, "--socket={s}", .{socket_path}); const shm_arg = try std.fmt.allocPrint(gpa, "--shm={s}", .{shm_name}); const pid_arg = try std.fmt.allocPrint(gpa, "--editor-pid={d}", .{my_pid}); + // Snapshot path for SaveProject persistence / replay reload (§7.1). 
+ // CWD-relative so it resolves identically in the spawned runtime + // (which inherits this process's CWD) on both POSIX and Windows. + const snapshot_arg = try std.fmt.allocPrint(gpa, "--snapshot=weld-snapshot-{d}.bin", .{my_pid}); var spawn_argv = std.ArrayList([]const u8).empty; defer spawn_argv.deinit(gpa); - try spawn_argv.append(gpa, args.runtime_path); + try spawn_argv.append(gpa, runtime_path); try spawn_argv.append(gpa, socket_arg); try spawn_argv.append(gpa, shm_arg); try spawn_argv.append(gpa, pid_arg); const frames_arg = try std.fmt.allocPrint(gpa, "--frames={d}", .{args.frames}); try spawn_argv.append(gpa, frames_arg); + try spawn_argv.append(gpa, snapshot_arg); var proc_opt: ?platform_process.Process = null; if (args.no_spawn) { std.debug.print( "[editor] --no-spawn: launch the runtime manually with:\n {s}", - .{args.runtime_path}, + .{runtime_path}, ); for (spawn_argv.items[1..]) |a| std.debug.print(" {s}", .{a}); std.debug.print("\n[editor] waiting for runtime to connect on {s} ...\n", .{socket_path}); } else { - proc_opt = try platform_process.spawn_process(gpa, args.runtime_path, spawn_argv.items); + proc_opt = try platform_process.spawn_process(gpa, runtime_path, spawn_argv.items); } try server.acceptOne(); @@ -163,6 +200,30 @@ pub fn main(init: std.process.Init.Minimal) !void { return error.HandshakeRejected; } + // ---- POSIX shm fd handoff (engine-ipc.md §4.8) ---- + // Hand the runtime the viewport region's fd via SCM_RIGHTS so it + // maps the framebuffer with ShmRegion.fromFd — never cross-process + // shm_open. The editor stays the region owner; its own mapping is + // untouched by the transfer. Windows skips this: the runtime opens + // the named mapping by name (§2.2), so there is no fd to pass. 
+ if (is_posix) { + var handoff = messages.ShmRegionsHandoff{ + .region_count = 1, + .regions = std.mem.zeroes([messages.MAX_SHM_REGIONS]messages.ShmRegionDesc), + }; + messages.writeFixedString(&handoff.regions[0].logical_name, "viewport_framebuffer"); + handoff.regions[0].size = viewport.regionSize( + viewport.default_resolution.width, + viewport.default_resolution.height, + ); + try server.connection().sendMessageWithHandles( + messages.ShmRegionsHandoff, + 0, + &handoff, + &[_]ipc.transport.OsHandle{vp.fd()}, + ); + } + // ---- Render loop ---- var frame: u64 = 0; var should_close = false; diff --git a/src/runtime/main.zig b/src/runtime/main.zig index cc478c3..c149f53 100644 --- a/src/runtime/main.zig +++ b/src/runtime/main.zig @@ -30,7 +30,9 @@ const ipc = weld_core.ipc; const framing = ipc.framing; const messages = ipc.messages; const protocol = ipc.protocol; +const transport = ipc.transport; const viewport = ipc.viewport; +const snapshot = ipc.snapshot; const is_posix = builtin.os.tag == .linux or builtin.os.tag == .macos; @@ -39,11 +41,17 @@ const Args = struct { shm: []const u8 = "", editor_pid: i64 = 0, frames: ?u64 = null, + /// Path of the minimal scene snapshot (engine-ipc.md §7.1): written + /// on `SaveProject`, reloaded on restart. Empty = no persistence. + snapshot: []const u8 = "", }; fn parseArgs(gpa: std.mem.Allocator, init: std.process.Init.Minimal) !Args { var args = Args{}; - var it = std.process.Args.Iterator.init(init.args); + // `Iterator.init` is a `@compileError` on Windows (no POSIX argv) — + // the allocator variant parses the wide command line. `init.args` + // (Juicy Main) is preserved; `deinit` frees the Windows buffer. 
+ var it = try std.process.Args.Iterator.initAllocator(init.args, gpa); defer it.deinit(); _ = it.skip(); // argv[0] (binary path) @@ -56,6 +64,8 @@ fn parseArgs(gpa: std.mem.Allocator, init: std.process.Init.Minimal) !Args { args.editor_pid = try std.fmt.parseInt(i64, a["--editor-pid=".len..], 10); } else if (std.mem.startsWith(u8, a, "--frames=")) { args.frames = try std.fmt.parseInt(u64, a["--frames=".len..], 10); + } else if (std.mem.startsWith(u8, a, "--snapshot=")) { + args.snapshot = try gpa.dupe(u8, a["--snapshot=".len..]); } } if (args.socket.len == 0) return error.MissingSocketArg; @@ -91,26 +101,26 @@ fn sleepMs(ms: u64) void { } pub fn main(init: std.process.Init.Minimal) !void { - if (!is_posix) { - std.debug.print("runtime stub: Windows path not implemented in S6 (cf. brief)\n", .{}); - return error.Unimplemented; - } - var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); defer arena.deinit(); const gpa = arena.allocator(); const args = try parseArgs(gpa, init); + // Default I/O for the snapshot file ops. The runtime stays + // `Init.Minimal` per convention (it will bind a custom Io on the job + // system in Phase 1); a local `Threaded` covers M0.7. `page_allocator` + // is threadsafe, satisfying Threaded's async-allocator contract, and + // the `io` is safe to share with the reader thread. + var threaded = std.Io.Threaded.init(std.heap.page_allocator, .{}); + defer threaded.deinit(); + const io = threaded.io(); + var client = ipc.client.IpcClient.init(gpa); defer client.deinit(); try client.connect(args.socket); - // Attach the viewport shm region the editor created. - var vp = try viewport.ShmViewport.open(args.shm, viewport.default_resolution.width, viewport.default_resolution.height); - defer vp.close(); - - // Send ProtocolHello. + // Send ProtocolHello and await the editor's acceptance. 
try client.sendHello("0.0.7-S6", "deadbee", 0); var ack_buf: [framing.frameSizeOf(messages.ProtocolHelloAck)]u8 = undefined; @@ -121,17 +131,53 @@ pub fn main(init: std.process.Init.Minimal) !void { return error.HandshakeRejected; } + // Attach the viewport shm. POSIX: the editor passes the region fd + // out-of-band (SCM_RIGHTS) in a `ShmRegionsHandoff` right after the + // handshake — map it with `fromFd`, never cross-process `shm_open` + // (engine-ipc.md §4.8). Windows: no fd-passing — `open` the named + // mapping the editor created (§2.2), whose name arrived on argv. + var vp = if (is_posix) blk: { + var handoff_buf: [framing.frameSizeOf(messages.ShmRegionsHandoff)]u8 = undefined; + var handoff_handles: [messages.MAX_SHM_REGIONS]transport.OsHandle = undefined; + @memset(&handoff_handles, transport.invalid_handle); + const hf = try client.connection().recvFrameWithHandles(&handoff_buf, &handoff_handles); + const handoff = try framing.decode(messages.ShmRegionsHandoff, hf.header, hf.payload_bytes); + // Validate §8.3 (count in range + strict fd/descriptor equality); + // `acceptShmHandoff` closes every excess / unmapped region fd so a + // malformed handoff cannot leak descriptors. + const viewport_fd = try ipc.connection.acceptShmHandoff(&handoff, handoff_handles[0..hf.handles]); + break :blk try viewport.ShmViewport.fromFd(viewport_fd, viewport.default_resolution.width, viewport.default_resolution.height); + } else blk: { + break :blk try viewport.ShmViewport.open(args.shm, viewport.default_resolution.width, viewport.default_resolution.height); + }; + defer vp.close(); + + // Reload point: if the editor's last SaveProject persisted a snapshot, + // resume the mire from it (engine-ipc.md §7.2). Absent ⇒ start clean. 
+ var start_frame: u64 = 0; + if (args.snapshot.len != 0) { + if (snapshot.read(io, args.snapshot)) |snap| start_frame = snap.frame_id; + } + // Spawn the dedicated IPC reader thread per brief § Scope — // the main loop renders the mire at ~60 Hz while the reader // drains the socket and replies to transactional messages. - var reader_state = ReaderState{ .client = &client, .shutdown_requested = std.atomic.Value(u8).init(0), .read_failed = std.atomic.Value(u8).init(0) }; + var reader_state = ReaderState{ + .client = &client, + .shutdown_requested = std.atomic.Value(u8).init(0), + .read_failed = std.atomic.Value(u8).init(0), + .play_state = std.atomic.Value(u8).init(play_playing), + .io = io, + .snapshot_path = args.snapshot, + }; const reader = try std.Thread.spawn(.{}, readerLoop, .{&reader_state}); defer reader.join(); - var frame: u64 = 0; + var frame: u64 = start_frame; // mire animation parameter — advances only while playing + var iter: u64 = 0; // loop iterations — bounds the lifetime via --frames while (true) { if (args.frames) |max| { - if (frame >= max) break; + if (iter >= max) break; } if (reader_state.shutdown_requested.load(.acquire) != 0) break; if (reader_state.read_failed.load(.acquire) != 0) break; @@ -139,23 +185,60 @@ pub fn main(init: std.process.Init.Minimal) !void { const slot = vp.nextWriteSlot(); renderMire(&vp, slot, frame); vp.commit(slot); + // Play/Pause/Stop gate the animation: advance the mire only while + // playing; paused/stopped re-commit the held frame so the viewport + // stays live (G6 visual) without animating. + if (reader_state.play_state.load(.acquire) == play_playing) frame += 1; sleepMs(16); // ~60 Hz - frame += 1; + iter += 1; } } +/// Play-state driven by the `Play` / `Pause` / `Stop` commands +/// (`engine-ipc.md` §3.3). Default `playing` so the S6 mire renders +/// immediately when no control command is sent (e.g. the crash-recovery +/// tests). 
The reader thread sets it; the render loop reads it to gate +/// the mire's frame advance. +const play_stopped: u8 = 0; +const play_playing: u8 = 1; +const play_paused: u8 = 2; + const ReaderState = struct { client: *ipc.client.IpcClient, shutdown_requested: std.atomic.Value(u8), read_failed: std.atomic.Value(u8), + /// `play_stopped` / `play_playing` / `play_paused`. + play_state: std.atomic.Value(u8), + /// I/O for the `SaveProject` snapshot write (shared with main). + io: std.Io, + /// Snapshot path, or empty to skip persistence. + snapshot_path: []const u8, }; fn readerLoop(state: *ReaderState) void { - const max_frame_buf_size = comptime @max( - @max(framing.frameSizeOf(messages.Heartbeat), framing.frameSizeOf(messages.Shutdown)), - framing.frameSizeOf(messages.Echo), - ); - var scratch: [@as(usize, max_frame_buf_size) + 256]u8 = undefined; + // Sized to the largest frame the editor can send the runtime — + // computed over the FULL incoming set (every editor→runtime type the + // reader reads, whether or not it decodes it: `recvFrame` buffers the + // whole frame before the switch). Runtime→editor types (RuntimeError, + // the acks, …) are never received here and do not size this buffer. + // Current max: LoadScene / SaveScene at 280 B (16 + 8 + 256). Relying + // on @max(Echo, LoadScene) would silently undersize if a future + // incoming message grew past 256 B — so enumerate them explicitly. 
+ const max_incoming_frame = comptime blk: { + var m: usize = 0; + for (.{ + messages.Heartbeat, messages.Shutdown, + messages.Echo, messages.SpawnEntity, + messages.ModifyComponent, messages.Play, + messages.Pause, messages.Stop, + messages.LoadScene, messages.HotReloadScript, + messages.SaveScene, messages.SaveProject, + }) |T| { + m = @max(m, framing.frameSizeOf(T)); + } + break :blk m; + }; + var scratch: [max_incoming_frame]u8 = undefined; while (true) { const fr = state.client.connection().recvFrame(&scratch) catch { state.read_failed.store(1, .release); @@ -190,6 +273,55 @@ fn readerLoop(state: *ReaderState) void { const ack = messages.ModifyAck{ .success = 1 }; state.client.connection().sendMessage(messages.ModifyAck, fr.header.seq_id, &ack) catch return; }, + .play => state.play_state.store(play_playing, .release), + .pause => state.play_state.store(play_paused, .release), + .stop => state.play_state.store(play_stopped, .release), + .load_scene => { + const ls = framing.decode(messages.LoadScene, fr.header, fr.payload_bytes) catch return; + const path = messages.readFixedString(&ls.path); + if (path.len == 0) { + // Recoverable, non-transactional command failure → a + // non-fatal RuntimeError event (§3.3), not a protocol + // fatal. Surfaced for the editor's "Replay Errors" panel. + var re = messages.RuntimeError{ + .severity = @intFromEnum(messages.ErrorSeverity.warning), + .source = std.mem.zeroes([64]u8), + .text = std.mem.zeroes([256]u8), + }; + messages.writeFixedString(&re.source, "runtime"); + messages.writeFixedString(&re.text, "load_scene: empty path"); + state.client.connection().sendMessage(messages.RuntimeError, 0, &re) catch return; + } + // A non-empty path is accepted (the stub has no scene to load). + }, + .hot_reload_script => { + // Stub: decode to validate the frame. The real reload + the + // ScriptHotReloadComplete event land with the script pipeline + // (out of M0.7 scope). 
+ _ = framing.decode(messages.HotReloadScript, fr.header, fr.payload_bytes) catch return; + }, + .save_project => { + // Transactional (§3.4): persist the minimal scene snapshot + // (the replay reload point, §7.1) THEN reply `ProjectSaved` + // with the same seq_id. The stub records the SaveProject + // seq_id as the scene marker. A snapshot write failure is + // surfaced via `ok = 0` so the editor does not advance its + // clean line on a save that did not persist. + var ok: u8 = 1; + if (state.snapshot_path.len != 0) { + snapshot.write(state.io, state.snapshot_path, .{ + .magic = 0, + .version = 0, + .frame_id = fr.header.seq_id, + }) catch { + ok = 0; + }; + } + const ps = messages.ProjectSaved{ .ok = ok, .reason = std.mem.zeroes([128]u8) }; + state.client.connection().sendMessage(messages.ProjectSaved, fr.header.seq_id, &ps) catch return; + }, + // `save_scene` (scene granularity) is declared with no wired + // handler in M0.7 — it falls through to `else` and is ignored. else => { // Unilateral / unsupported types — ignore at the stub level. }, diff --git a/tests/ipc/catalogue.zig b/tests/ipc/catalogue.zig new file mode 100644 index 0000000..c889af8 --- /dev/null +++ b/tests/ipc/catalogue.zig @@ -0,0 +1,256 @@ +//! M0.7 / E2 — extended message-catalogue tests (brief § Acceptance +//! criteria › Tests). Two layers: +//! +//! 1. Pure framing round-trips (`encode` → `decode` parity) for every +//! message added in M0.7 — portable, no runtime, validates the +//! wire format + schema_hash of each new type. +//! 2. End-to-end handler behaviour against the real `weld-runtime` +//! binary (POSIX-gated, like `crash_recovery.zig`; the SCM_RIGHTS +//! pivot makes the cross-process attach work on macOS too): +//! `SaveProject` → `ProjectSaved` (same seq_id), `LoadScene` with +//! an empty path → `RuntimeError` event, and `Play`/`Pause`/`Stop` +//! accepted without desync (an `Echo` after them still round-trips). +//! +//! 
External-resource discipline (engine-zig-conventions.md §13): the +//! accepted socket gets a 5 s `SO_RCVTIMEO` so a missing reply fails the +//! test instead of hanging the suite. + +const std = @import("std"); +const builtin = @import("builtin"); + +const weld_core = @import("weld_core"); +const ipc = weld_core.ipc; +const framing = ipc.framing; +const messages = ipc.messages; +const transport = ipc.transport; +const viewport = ipc.viewport; +const platform_process = weld_core.platform.process; + +const is_posix = builtin.os.tag == .linux or builtin.os.tag == .macos; + +// ----------------------------------------------- pure framing round-trips -- + +/// Encode a message, parse its header, decode it back, and assert the +/// bytes survive the round-trip. Exercises the wire format + schema_hash +/// for `T`. +fn roundTrip(comptime T: type, msg: T) !void { + const gpa = std.testing.allocator; + const buf = try framing.encode(gpa, T, 123, &msg); + defer gpa.free(buf); + + const header = try framing.parseHeader(buf); + try std.testing.expectEqual(@as(u16, @intFromEnum(messages.msgTypeOf(T))), header.msg_type); + try std.testing.expectEqual(@as(u32, 123), header.seq_id); + + const decoded = try framing.decode(T, header, buf[@sizeOf(framing.Header)..]); + try std.testing.expectEqualSlices(u8, std.mem.asBytes(&msg), std.mem.asBytes(&decoded)); +} + +test "catalogue messages round-trip through encode/decode" { + try roundTrip(messages.Play, .{}); + try roundTrip(messages.Pause, .{}); + try roundTrip(messages.Stop, .{}); + try roundTrip(messages.SaveProject, .{}); + try roundTrip(messages.SaveScene, .{ .path = std.mem.zeroes([256]u8) }); + try roundTrip(messages.HotReloadScript, .{ .script_handle = 0xDEADBEEF_CAFEBABE }); + + var load = messages.LoadScene{ .path = std.mem.zeroes([256]u8) }; + messages.writeFixedString(&load.path, "scenes/level1.scene.etch"); + try roundTrip(messages.LoadScene, load); + + var saved = messages.ProjectSaved{ .ok = 1, .reason = 
std.mem.zeroes([128]u8) }; + messages.writeFixedString(&saved.reason, ""); + try roundTrip(messages.ProjectSaved, saved); + + var err = messages.RuntimeError{ + .severity = @intFromEnum(messages.ErrorSeverity.warning), + .source = std.mem.zeroes([64]u8), + .text = std.mem.zeroes([256]u8), + }; + messages.writeFixedString(&err.source, "runtime"); + messages.writeFixedString(&err.text, "load_scene: empty path"); + try roundTrip(messages.RuntimeError, err); +} + +// --------------------------------------------------- end-to-end fixtures -- + +extern "c" fn unlink(path: [*:0]const u8) c_int; +extern "c" fn shm_unlink(name: [*:0]const u8) i32; +extern "c" fn setsockopt(sockfd: c_int, level: c_int, optname: c_int, optval: *const anyopaque, optlen: u32) c_int; +const timespec_t = extern struct { tv_sec: i64, tv_nsec: i64 }; +extern "c" fn nanosleep(req: *const timespec_t, rem: ?*timespec_t) c_int; +extern "c" fn getpid() i32; + +const timeval = extern struct { tv_sec: i64, tv_usec: i32, _pad: i32 = 0 }; +const SOL_SOCKET: c_int = if (builtin.os.tag == .linux) 1 else 0xFFFF; +const SO_RCVTIMEO: c_int = if (builtin.os.tag == .linux) 20 else 0x1006; + +fn sleepMs(ms: u64) void { + var ts = timespec_t{ .tv_sec = @intCast(ms / 1000), .tv_nsec = @intCast((ms % 1000) * std.time.ns_per_ms) }; + _ = nanosleep(&ts, null); +} + +/// The viewport + child process produced by `spawnRuntime`. The +/// `IpcServer` is **not** returned — it is caller-owned and stable, so +/// its internal `conn.socket` pointer (set by `acceptOne` to +/// `&server.client.?`) survives. `ShmViewport` / `Process` have no +/// self-references, so returning them by value is safe. +const Spawned = struct { + vp: viewport.ShmViewport, + proc: platform_process.Process, +}; + +/// Spawn `weld-runtime`, run the handshake, and hand off the viewport fd +/// (mirroring `src/editor/main.zig`), driving the caller-owned `server`. 
+/// The caller owns the `socket_path` / `shm_name` buffers and unlinks +/// them, and must call `teardown` + `server.deinit()`. +fn spawnRuntime( + server: *ipc.server.IpcServer, + gpa: std.mem.Allocator, + socket_path: [:0]const u8, + shm_name: [:0]const u8, +) !Spawned { + var vp = try viewport.ShmViewport.create(shm_name, viewport.default_resolution.width, viewport.default_resolution.height); + errdefer vp.close(); + + try server.listen(socket_path); + + const socket_arg = try std.fmt.allocPrint(gpa, "--socket={s}", .{socket_path}); + defer gpa.free(socket_arg); + const shm_arg = try std.fmt.allocPrint(gpa, "--shm={s}", .{shm_name}); + defer gpa.free(shm_arg); + const pid_arg = try std.fmt.allocPrint(gpa, "--editor-pid={d}", .{getpid()}); + defer gpa.free(pid_arg); + const argv = [_][]const u8{ "zig-out/bin/weld-runtime", socket_arg, shm_arg, pid_arg }; + + const proc = try platform_process.spawn_process(gpa, "zig-out/bin/weld-runtime", &argv); + try server.acceptOne(); + + // 5 s recv timeout on the accepted socket (engine-zig-conventions §13). + var tv = timeval{ .tv_sec = 5, .tv_usec = 0 }; + _ = setsockopt(server.client.?.impl.fd, SOL_SOCKET, SO_RCVTIMEO, &tv, @sizeOf(timeval)); + + var hello_buf: [framing.frameSizeOf(messages.ProtocolHello)]u8 = undefined; + _ = try server.recvHello(&hello_buf); + try server.sendHelloAck(true, ""); + + // Viewport fd handoff (engine-ipc.md §4.8). 
+ var handoff = messages.ShmRegionsHandoff{ .region_count = 1, .regions = std.mem.zeroes([messages.MAX_SHM_REGIONS]messages.ShmRegionDesc) }; + messages.writeFixedString(&handoff.regions[0].logical_name, "viewport_framebuffer"); + handoff.regions[0].size = viewport.regionSize(viewport.default_resolution.width, viewport.default_resolution.height); + try server.connection().sendMessageWithHandles(messages.ShmRegionsHandoff, 0, &handoff, &[_]transport.OsHandle{vp.fd()}); + + return .{ .vp = vp, .proc = proc }; +} + +/// Graceful teardown: `Shutdown` → `ShutdownAck` → reap the runtime. +fn teardown(server: *ipc.server.IpcServer, proc: *platform_process.Process) void { + const sd = messages.Shutdown{}; + server.connection().sendMessage(messages.Shutdown, 0, &sd) catch {}; + var sa_buf: [framing.frameSizeOf(messages.ShutdownAck)]u8 = undefined; + _ = server.connection().recvMessage(messages.ShutdownAck, &sa_buf) catch {}; + var attempts: usize = 0; + while (attempts < 50) : (attempts += 1) { + if (platform_process.wait_nonblock(proc) catch null) |_| break; + sleepMs(10); + } +} + +test "SaveProject is acked by ProjectSaved with the same seq_id" { + if (!is_posix) return error.SkipZigTest; + const gpa = std.testing.allocator; + const pid = getpid(); + var sock_buf: [64]u8 = undefined; + var shm_buf: [64]u8 = undefined; + const socket_path = try std.fmt.bufPrintZ(&sock_buf, "/tmp/weld-cat-save-{d}.sock", .{pid}); + const shm_name = try std.fmt.bufPrintZ(&shm_buf, "/weld-shm-cat-save-{d}", .{pid}); + _ = unlink(socket_path.ptr); + _ = shm_unlink(shm_name.ptr); + defer _ = unlink(socket_path.ptr); + defer _ = shm_unlink(shm_name.ptr); + + var server = ipc.server.IpcServer.init(gpa); + defer server.deinit(); + var sp = try spawnRuntime(&server, gpa, socket_path, shm_name); + defer sp.vp.close(); + defer teardown(&server, &sp.proc); + + const seq: u32 = 4242; + const save = messages.SaveProject{}; + try server.connection().sendMessage(messages.SaveProject, seq, &save); + + var 
buf: [framing.frameSizeOf(messages.ProjectSaved)]u8 = undefined; + const frame = try server.connection().recvFrame(&buf); + try std.testing.expectEqual(@as(u16, @intFromEnum(messages.MsgType.project_saved)), frame.header.msg_type); + try std.testing.expectEqual(seq, frame.header.seq_id); + const ack = try framing.decode(messages.ProjectSaved, frame.header, frame.payload_bytes); + try std.testing.expectEqual(@as(u8, 1), ack.ok); +} + +test "LoadScene with an empty path yields a RuntimeError event" { + if (!is_posix) return error.SkipZigTest; + const gpa = std.testing.allocator; + const pid = getpid(); + var sock_buf: [64]u8 = undefined; + var shm_buf: [64]u8 = undefined; + const socket_path = try std.fmt.bufPrintZ(&sock_buf, "/tmp/weld-cat-load-{d}.sock", .{pid}); + const shm_name = try std.fmt.bufPrintZ(&shm_buf, "/weld-shm-cat-load-{d}", .{pid}); + _ = unlink(socket_path.ptr); + _ = shm_unlink(shm_name.ptr); + defer _ = unlink(socket_path.ptr); + defer _ = shm_unlink(shm_name.ptr); + + var server = ipc.server.IpcServer.init(gpa); + defer server.deinit(); + var sp = try spawnRuntime(&server, gpa, socket_path, shm_name); + defer sp.vp.close(); + defer teardown(&server, &sp.proc); + + const load = messages.LoadScene{ .path = std.mem.zeroes([256]u8) }; // empty path + try server.connection().sendMessage(messages.LoadScene, 0, &load); + + var buf: [framing.frameSizeOf(messages.RuntimeError)]u8 = undefined; + const frame = try server.connection().recvFrame(&buf); + try std.testing.expectEqual(@as(u16, @intFromEnum(messages.MsgType.runtime_error)), frame.header.msg_type); + const re = try framing.decode(messages.RuntimeError, frame.header, frame.payload_bytes); + try std.testing.expectEqual(@as(u32, @intFromEnum(messages.ErrorSeverity.warning)), re.severity); + try std.testing.expectEqualStrings("runtime", messages.readFixedString(&re.source)); +} + +test "Play/Pause/Stop are accepted without desync" { + if (!is_posix) return error.SkipZigTest; + const gpa = 
std.testing.allocator; + const pid = getpid(); + var sock_buf: [64]u8 = undefined; + var shm_buf: [64]u8 = undefined; + const socket_path = try std.fmt.bufPrintZ(&sock_buf, "/tmp/weld-cat-play-{d}.sock", .{pid}); + const shm_name = try std.fmt.bufPrintZ(&shm_buf, "/weld-shm-cat-play-{d}", .{pid}); + _ = unlink(socket_path.ptr); + _ = shm_unlink(shm_name.ptr); + defer _ = unlink(socket_path.ptr); + defer _ = shm_unlink(shm_name.ptr); + + var server = ipc.server.IpcServer.init(gpa); + defer server.deinit(); + var sp = try spawnRuntime(&server, gpa, socket_path, shm_name); + defer sp.vp.close(); + defer teardown(&server, &sp.proc); + + // Fire-and-forget control messages — no ack expected. + const pause = messages.Pause{}; + try server.connection().sendMessage(messages.Pause, 0, &pause); + const play = messages.Play{}; + try server.connection().sendMessage(messages.Play, 0, &play); + const stop = messages.Stop{}; + try server.connection().sendMessage(messages.Stop, 0, &stop); + + // An Echo after the control burst must still round-trip — proves the + // runtime consumed the three frames without losing socket sync. + var echo = messages.Echo{ .payload = std.mem.zeroes([64]u8) }; + for (&echo.payload, 0..) |*b, i| b.* = @intCast(i & 0xFF); + try server.connection().sendMessage(messages.Echo, 7, &echo); + + var buf: [framing.frameSizeOf(messages.EchoReply)]u8 = undefined; + const reply = try server.connection().recvMessage(messages.EchoReply, &buf); + try std.testing.expectEqualSlices(u8, &echo.payload, &reply.payload); +} diff --git a/tests/ipc/crash_recovery.zig b/tests/ipc/crash_recovery.zig index 6d25340..49fe5a5 100644 --- a/tests/ipc/crash_recovery.zig +++ b/tests/ipc/crash_recovery.zig @@ -1,28 +1,18 @@ -//! S6 crash-recovery test (G4 + G5). Exercises both directions of -//! the abrupt-termination contract. +//! Crash-recovery + best-effort-replay tests (C0.4; brief E4). Drives +//! the real `weld-runtime` binary end-to-end. **Un-gated to Windows in +//! 
M0.7 / E4** (was POSIX-only): the per-OS differences are isolated in +//! `spawnAndHandshake` (POSIX hands the viewport fd off via SCM_RIGHTS; +//! Windows opens the named mapping by name, §2.2) and in the cleanup +//! helpers. Clock/sleep use cross-platform `std` (no POSIX externs). //! -//! G4 — runtime kill -9 → editor detects + restarts: -//! The test process plays the editor (creates shm, listens), -//! spawns the runtime binary, handshakes, then `SIGKILL`s the -//! runtime. Two tests : detect latency < 100 ms, restart succeeds -//! + first post-restart Echo round-trips OK. +//! - kill -9 runtime → editor detects EOF < 100 ms. +//! - kill -9 → editor restarts + the first post-restart Echo round-trips. +//! - editor close → runtime detects EOF + exits clean (code 0). +//! - kill -9 + best-effort replay → after restart, the post-save +//! pending commands replay < 500 ms aggregate (engine-ipc.md §7.2). //! -//! G5 — editor kill -9 → runtime detects + exits clean: -//! Test plays the editor again, spawns the runtime, handshakes, -//! then **abruptly closes the server-side socket** via -//! `IpcServer.deinit` without sending a `Shutdown` message. This -//! is a faithful simulation of a real editor `kill -9`: in both -//! cases the kernel tears the editor's socket down, and the -//! runtime sees an EOF on its next `recv`. The runtime's reader -//! thread sets `read_failed`, the main loop observes the flag, -//! `defer`s run, the process exits with code 0. Asserts the runtime -//! exits within < 500 ms of the close (16 ms main-loop tick + scope -//! teardown) and that `exit_code == 0`. -//! -//! Linux-gated because the shared shm region cross-process pattern -//! is unreliable on macOS (see `src/core/ipc/shm_posix.zig` file -//! header). The full G4/G5 verdict lives in -//! `validation/s6-go-nogo.md` and is generated by a Linux CI run. +//! Windows behaviour is validated on Guy's PC + CI; macOS dev exercises +//! the same paths thanks to the SCM_RIGHTS pivot (E1). 
const std = @import("std"); const builtin = @import("builtin"); @@ -31,235 +21,328 @@ const weld_core = @import("weld_core"); const ipc = weld_core.ipc; const framing = ipc.framing; const messages = ipc.messages; +const transport = ipc.transport; +const command_log = ipc.command_log; const platform_process = weld_core.platform.process; +const platform_time = weld_core.platform.time; const viewport = ipc.viewport; -const is_linux = builtin.os.tag == .linux; +const is_windows = builtin.os.tag == .windows; +const W = viewport.default_resolution.width; +const H = viewport.default_resolution.height; +extern "c" fn getpid() i32; extern "c" fn unlink(path: [*:0]const u8) c_int; extern "c" fn shm_unlink(name: [*:0]const u8) i32; -extern "c" fn clock_gettime(clk_id: i32, tp: *timespec_t) c_int; -extern "c" fn nanosleep(req: *const timespec_t, rem: ?*timespec_t) c_int; -extern "c" fn getpid() i32; -const CLOCK_MONOTONIC: i32 = if (builtin.os.tag == .linux) 1 else 6; -const timespec_t = extern struct { tv_sec: i64, tv_nsec: i64 }; +/// Monotonic milliseconds via the cross-platform platform-time wrapper +/// (`QueryPerformanceCounter` / `clock_gettime`) — `std.time.milliTimestamp` +/// no longer exists in 0.16. fn nowMs() i64 { - var ts = timespec_t{ .tv_sec = 0, .tv_nsec = 0 }; - _ = clock_gettime(CLOCK_MONOTONIC, &ts); - return ts.tv_sec * 1000 + @divFloor(ts.tv_nsec, std.time.ns_per_ms); + return @intCast(platform_time.nowNanos() / std.time.ns_per_ms); } -fn sleepMs(ms: u64) void { - var ts = timespec_t{ - .tv_sec = @intCast(ms / 1_000), - .tv_nsec = @intCast((ms % 1_000) * std.time.ns_per_ms), - }; - _ = nanosleep(&ts, null); +/// Cross-platform sleep via the platform-time wrapper (`Sleep` / +/// `nanosleep`); `std.Thread.sleep` is gone in 0.16. Needs an `io`. 
+fn sleepMs(io: std.Io, ms: u64) void { + platform_time.sleepPrecise(io, ms * std.time.ns_per_ms) catch {}; } -test "runtime kill -9 → editor detects EOF in <100ms" { - if (!is_linux) return error.SkipZigTest; +/// The runtime binary path, relative to the project root (the cwd when +/// `zig build test` dispatches the test). `.exe` on Windows so +/// `CreateProcessW` resolves it. +const runtime_exe = if (is_windows) "zig-out/bin/weld-runtime.exe" else "zig-out/bin/weld-runtime"; + +/// POSIX shm name `/weld-shm-{tag}-{pid}` vs Windows session-local +/// `Local\weld-shm-{tag}-{pid}` (§2.2). Written into `buf`. +fn shmName(buf: []u8, tag: []const u8, pid: i32) ![]const u8 { + return if (is_windows) + std.fmt.bufPrintZ(buf, "Local\\weld-shm-{s}-{d}", .{ tag, pid }) + else + std.fmt.bufPrintZ(buf, "/weld-shm-{s}-{d}", .{ tag, pid }); +} - const gpa = std.testing.allocator; - const pid = getpid(); - var sock_buf: [64]u8 = undefined; - const socket_path = try std.fmt.bufPrintZ(&sock_buf, "/tmp/weld-crashtest-{d}.sock", .{pid}); - var shm_buf: [64]u8 = undefined; - const shm_name = try std.fmt.bufPrintZ(&shm_buf, "/weld-shm-crashtest-{d}", .{pid}); +/// Best-effort removal of a POSIX socket file + shm region. No-op on +/// Windows (named pipes + named mappings are refcounted kernel objects +/// that vanish with their last handle). +fn cleanupPosix(socket_path: [:0]const u8, shm: [:0]const u8) void { + if (comptime is_windows) return; _ = unlink(socket_path.ptr); - _ = shm_unlink(shm_name.ptr); - defer _ = unlink(socket_path.ptr); - defer _ = shm_unlink(shm_name.ptr); + _ = shm_unlink(shm.ptr); +} - var vp = try viewport.ShmViewport.create(shm_name, viewport.default_resolution.width, viewport.default_resolution.height); - defer vp.close(); +/// Editor-side viewport fd handoff (POSIX only) — mirrors +/// `src/editor/main.zig`. On Windows the runtime opens the mapping by +/// name, so no handoff is sent. 
+fn sendViewportHandoff(server: *ipc.server.IpcServer, vp: *const viewport.ShmViewport) !void { + var handoff = messages.ShmRegionsHandoff{ + .region_count = 1, + .regions = std.mem.zeroes([messages.MAX_SHM_REGIONS]messages.ShmRegionDesc), + }; + messages.writeFixedString(&handoff.regions[0].logical_name, "viewport_framebuffer"); + handoff.regions[0].size = viewport.regionSize(W, H); + try server.connection().sendMessageWithHandles( + messages.ShmRegionsHandoff, + 0, + &handoff, + &[_]transport.OsHandle{vp.fd()}, + ); +} + +const Spawned = struct { vp: viewport.ShmViewport, proc: platform_process.Process }; + +/// Spawn the runtime against the caller-owned (stable) `server`, run the +/// handshake, and attach the viewport — POSIX handoff vs Windows by-name. +/// Returns the created viewport + child. `socket_path` / `shm_name` / +/// `snapshot_path` are caller-owned. +fn spawnAndHandshake( + server: *ipc.server.IpcServer, + gpa: std.mem.Allocator, + socket_path: []const u8, + shm_name: []const u8, + snapshot_path: []const u8, +) !Spawned { + var vp = try viewport.ShmViewport.create(shm_name, W, H); + errdefer vp.close(); - var server = ipc.server.IpcServer.init(gpa); - defer server.deinit(); try server.listen(socket_path); + const pid = getpid(); const socket_arg = try std.fmt.allocPrint(gpa, "--socket={s}", .{socket_path}); defer gpa.free(socket_arg); const shm_arg = try std.fmt.allocPrint(gpa, "--shm={s}", .{shm_name}); defer gpa.free(shm_arg); const pid_arg = try std.fmt.allocPrint(gpa, "--editor-pid={d}", .{pid}); defer gpa.free(pid_arg); - const argv = [_][]const u8{ "zig-out/bin/weld-runtime", socket_arg, shm_arg, pid_arg }; + const snap_arg = try std.fmt.allocPrint(gpa, "--snapshot={s}", .{snapshot_path}); + defer gpa.free(snap_arg); + const argv = [_][]const u8{ runtime_exe, socket_arg, shm_arg, pid_arg, snap_arg }; - var proc = try platform_process.spawn_process(gpa, "zig-out/bin/weld-runtime", &argv); + const proc = try platform_process.spawn_process(gpa, 
runtime_exe, &argv); try server.acceptOne(); var hello_buf: [framing.frameSizeOf(messages.ProtocolHello)]u8 = undefined; _ = try server.recvHello(&hello_buf); try server.sendHelloAck(true, ""); + if (comptime !is_windows) try sendViewportHandoff(server, &vp); + + return .{ .vp = vp, .proc = proc }; +} - // Sleep a beat to let the runtime settle, then kill. - sleepMs(50); +fn reap(io: std.Io, proc: *platform_process.Process) void { + var attempts: usize = 0; + while (attempts < 200) : (attempts += 1) { + if (platform_process.wait_nonblock(proc) catch null) |_| return; + sleepMs(io, 10); + } +} + +test "runtime kill -9 → editor detects EOF in <100ms" { + const gpa = std.testing.allocator; + var threaded = std.Io.Threaded.init(gpa, .{}); + defer threaded.deinit(); + const io = threaded.io(); + const pid = getpid(); + var sock_buf: [96]u8 = undefined; + const socket_path = try ipc.transport.buildSocketPath(&sock_buf, "weld-crashtest"); + var shm_buf: [64]u8 = undefined; + const shm = try shmName(&shm_buf, "crashtest", pid); + var snap_buf: [64]u8 = undefined; + const snap = try std.fmt.bufPrint(&snap_buf, "weld-snap-crashtest-{d}.bin", .{pid}); + cleanupPosix(socket_path, @ptrCast(shm)); + defer cleanupPosix(socket_path, @ptrCast(shm)); + + var server = ipc.server.IpcServer.init(gpa); + defer server.deinit(); + var sp = try spawnAndHandshake(&server, gpa, socket_path, shm, snap); + defer sp.vp.close(); + + sleepMs(io, 50); // let the runtime settle into its loops const t0 = nowMs(); - try platform_process.kill(&proc); + try platform_process.kill(&sp.proc); - // Detect EOF on the editor side by sending a probe message and - // expecting `error.UnexpectedEof` from the next `recvFrame`. var scratch: [256]u8 = undefined; const detect_res = server.connection().recvFrame(&scratch); const detect_ms = nowMs() - t0; try std.testing.expect(detect_ms < 100); try std.testing.expectError(error.UnexpectedEof, detect_res); - // Reap. 
- var reap_attempts: usize = 0; - while (reap_attempts < 50) : (reap_attempts += 1) { - if (try platform_process.wait_nonblock(&proc)) |_| break; - sleepMs(10); - } + reap(io, &sp.proc); } test "runtime kill -9 → editor restarts + first post-restart Echo OK" { - if (!is_linux) return error.SkipZigTest; - // Smoke-shape: the runtime is restarted by repeating the - // spawn_process call; we verify the new connection delivers an - // EchoReply for an Echo we send. const gpa = std.testing.allocator; + var threaded = std.Io.Threaded.init(gpa, .{}); + defer threaded.deinit(); + const io = threaded.io(); const pid = getpid(); - var sock_buf: [64]u8 = undefined; - const socket_path = try std.fmt.bufPrintZ(&sock_buf, "/tmp/weld-restart-{d}.sock", .{pid}); + var sock_buf: [96]u8 = undefined; + const socket_path = try ipc.transport.buildSocketPath(&sock_buf, "weld-restart"); var shm_buf: [64]u8 = undefined; - const shm_name = try std.fmt.bufPrintZ(&shm_buf, "/weld-shm-restart-{d}", .{pid}); - _ = unlink(socket_path.ptr); - _ = shm_unlink(shm_name.ptr); - defer _ = unlink(socket_path.ptr); - defer _ = shm_unlink(shm_name.ptr); - - var vp = try viewport.ShmViewport.create(shm_name, viewport.default_resolution.width, viewport.default_resolution.height); - defer vp.close(); - - var server = ipc.server.IpcServer.init(gpa); - defer server.deinit(); - try server.listen(socket_path); - - const socket_arg = try std.fmt.allocPrint(gpa, "--socket={s}", .{socket_path}); - defer gpa.free(socket_arg); - const shm_arg = try std.fmt.allocPrint(gpa, "--shm={s}", .{shm_name}); - defer gpa.free(shm_arg); - const pid_arg = try std.fmt.allocPrint(gpa, "--editor-pid={d}", .{pid}); - defer gpa.free(pid_arg); - const argv = [_][]const u8{ "zig-out/bin/weld-runtime", socket_arg, shm_arg, pid_arg }; + const shm = try shmName(&shm_buf, "restart", pid); + var snap_buf: [64]u8 = undefined; + const snap = try std.fmt.bufPrint(&snap_buf, "weld-snap-restart-{d}.bin", .{pid}); + cleanupPosix(socket_path, 
@ptrCast(shm)); + defer cleanupPosix(socket_path, @ptrCast(shm)); // First spawn + handshake + kill. - var proc = try platform_process.spawn_process(gpa, "zig-out/bin/weld-runtime", &argv); - try server.acceptOne(); - var hbuf: [framing.frameSizeOf(messages.ProtocolHello)]u8 = undefined; - _ = try server.recvHello(&hbuf); - try server.sendHelloAck(true, ""); - try platform_process.kill(&proc); + var server = ipc.server.IpcServer.init(gpa); + var sp1 = try spawnAndHandshake(&server, gpa, socket_path, shm, snap); + try platform_process.kill(&sp1.proc); var scratch: [256]u8 = undefined; _ = server.connection().recvFrame(&scratch) catch {}; - // Tear down the first connection so we can accept the second. + sp1.vp.close(); server.deinit(); - var reap_attempts: usize = 0; - while (reap_attempts < 50) : (reap_attempts += 1) { - if (try platform_process.wait_nonblock(&proc)) |_| break; - sleepMs(10); - } + reap(io, &sp1.proc); // Second spawn + handshake + Echo round-trip. - server = ipc.server.IpcServer.init(gpa); - try server.listen(socket_path); - var proc2 = try platform_process.spawn_process(gpa, "zig-out/bin/weld-runtime", &argv); - try server.acceptOne(); - _ = try server.recvHello(&hbuf); - try server.sendHelloAck(true, ""); + var server2 = ipc.server.IpcServer.init(gpa); + defer server2.deinit(); + var sp2 = try spawnAndHandshake(&server2, gpa, socket_path, shm, snap); + defer sp2.vp.close(); var echo = messages.Echo{ .payload = std.mem.zeroes([64]u8) }; for (&echo.payload, 0..) 
|*b, i| b.* = @intCast(i & 0xFF); - try server.connection().sendMessage(messages.Echo, 0, &echo); + try server2.connection().sendMessage(messages.Echo, 0, &echo); var rep_buf: [framing.frameSizeOf(messages.EchoReply)]u8 = undefined; - const reply = try server.connection().recvMessage(messages.EchoReply, &rep_buf); + const reply = try server2.connection().recvMessage(messages.EchoReply, &rep_buf); try std.testing.expectEqualSlices(u8, &echo.payload, &reply.payload); - // Graceful shutdown of the second runtime. const sd = messages.Shutdown{}; - try server.connection().sendMessage(messages.Shutdown, 0, &sd); + try server2.connection().sendMessage(messages.Shutdown, 0, &sd); var sa_buf: [framing.frameSizeOf(messages.ShutdownAck)]u8 = undefined; - _ = try server.connection().recvMessage(messages.ShutdownAck, &sa_buf); - var attempts: usize = 0; - while (attempts < 50) : (attempts += 1) { - if (try platform_process.wait_nonblock(&proc2)) |_| break; - sleepMs(10); - } + _ = server2.connection().recvMessage(messages.ShutdownAck, &sa_buf) catch {}; + reap(io, &sp2.proc); } test "editor close → runtime detects EOF + exits clean code 0" { - if (!is_linux) return error.SkipZigTest; - - // G5 — see file header. The test process IS the editor. We - // create the shm, listen, accept the runtime, handshake, then - // call `server.deinit()` without sending `Shutdown` — the - // kernel tears the socket down exactly the way it would after - // an editor SIGKILL. The runtime's reader thread sees EOF on - // its next recv, the main loop trips `read_failed`, the - // process exits with code 0. 
- const gpa = std.testing.allocator; + var threaded = std.Io.Threaded.init(gpa, .{}); + defer threaded.deinit(); + const io = threaded.io(); const pid = getpid(); - var sock_buf: [64]u8 = undefined; - const socket_path = try std.fmt.bufPrintZ(&sock_buf, "/tmp/weld-g5-{d}.sock", .{pid}); + var sock_buf: [96]u8 = undefined; + const socket_path = try ipc.transport.buildSocketPath(&sock_buf, "weld-g5"); var shm_buf: [64]u8 = undefined; - const shm_name = try std.fmt.bufPrintZ(&shm_buf, "/weld-shm-g5-{d}", .{pid}); - _ = unlink(socket_path.ptr); - _ = shm_unlink(shm_name.ptr); - defer _ = unlink(socket_path.ptr); - defer _ = shm_unlink(shm_name.ptr); - - var vp = try viewport.ShmViewport.create(shm_name, viewport.default_resolution.width, viewport.default_resolution.height); - defer vp.close(); + const shm = try shmName(&shm_buf, "g5", pid); + var snap_buf: [64]u8 = undefined; + const snap = try std.fmt.bufPrint(&snap_buf, "weld-snap-g5-{d}.bin", .{pid}); + cleanupPosix(socket_path, @ptrCast(shm)); + defer cleanupPosix(socket_path, @ptrCast(shm)); var server = ipc.server.IpcServer.init(gpa); - defer server.deinit(); - try server.listen(socket_path); - - const socket_arg = try std.fmt.allocPrint(gpa, "--socket={s}", .{socket_path}); - defer gpa.free(socket_arg); - const shm_arg = try std.fmt.allocPrint(gpa, "--shm={s}", .{shm_name}); - defer gpa.free(shm_arg); - const pid_arg = try std.fmt.allocPrint(gpa, "--editor-pid={d}", .{pid}); - defer gpa.free(pid_arg); - const argv = [_][]const u8{ "zig-out/bin/weld-runtime", socket_arg, shm_arg, pid_arg }; - - var proc = try platform_process.spawn_process(gpa, "zig-out/bin/weld-runtime", &argv); - try server.acceptOne(); - - var hello_buf: [framing.frameSizeOf(messages.ProtocolHello)]u8 = undefined; - _ = try server.recvHello(&hello_buf); - try server.sendHelloAck(true, ""); - - // Let the runtime settle into its main render + reader loops. 
- sleepMs(50); + var sp = try spawnAndHandshake(&server, gpa, socket_path, shm, snap); + defer sp.vp.close(); - // Simulate editor SIGKILL: abrupt server-side teardown, no - // `Shutdown` message. Kernel sends FIN to the runtime end; - // runtime sees `recv == 0` → `error.UnexpectedEof`. + sleepMs(io, 50); + // Simulate editor SIGKILL: abrupt server teardown, no Shutdown. The + // runtime sees EOF on its next recv and exits 0. const t0 = nowMs(); server.deinit(); - // Poll for runtime exit. Target wall-clock < 500 ms (16 ms - // main-loop tick × small handful of iterations + scope - // teardown). The brief's < 100 ms gate is for the detection - // itself; the wider 500 ms here covers the runtime's full - // exit path. var exit_code: ?i32 = null; var poll: usize = 0; - while (poll < 100) : (poll += 1) { - if (try platform_process.wait_nonblock(&proc)) |code| { + while (poll < 200) : (poll += 1) { + if (try platform_process.wait_nonblock(&sp.proc)) |code| { exit_code = code; break; } - sleepMs(10); + sleepMs(io, 10); } const exit_ms = nowMs() - t0; - try std.testing.expect(exit_code != null); try std.testing.expectEqual(@as(i32, 0), exit_code.?); try std.testing.expect(exit_ms < 500); } + +test "kill -9 + best-effort replay of post-save commands" { + const gpa = std.testing.allocator; + var threaded = std.Io.Threaded.init(gpa, .{}); + defer threaded.deinit(); + const io = threaded.io(); + const pid = getpid(); + var sock_buf: [96]u8 = undefined; + const socket_path = try ipc.transport.buildSocketPath(&sock_buf, "weld-replay"); + var shm_buf: [64]u8 = undefined; + const shm = try shmName(&shm_buf, "replay", pid); + var snap_buf: [64]u8 = undefined; + const snap = try std.fmt.bufPrint(&snap_buf, "weld-snap-replay-{d}.bin", .{pid}); + cleanupPosix(socket_path, @ptrCast(shm)); + defer cleanupPosix(socket_path, @ptrCast(shm)); + // SaveProject persists this snapshot; remove it on the way out so the + // marker file does not leak into the work tree (cross-platform via io). 
+ defer std.Io.Dir.cwd().deleteFile(io, snap) catch {}; + + var log = try command_log.CommandLog.init(gpa); + defer log.deinit(); + + var scratch: [256]u8 = undefined; + + // ---- First session: establish a clean line, then queue pending + // post-save commands, then crash. ---- + { + var server = ipc.server.IpcServer.init(gpa); + var sp = try spawnAndHandshake(&server, gpa, socket_path, shm, snap); + + // SaveProject → ProjectSaved: the runtime writes the snapshot; + // the clean line advances on the ack. + const save = messages.SaveProject{}; + try server.connection().sendMessage(messages.SaveProject, 100, &save); + const saved = try server.connection().recvMessage(messages.ProjectSaved, &scratch); + try std.testing.expectEqual(@as(u8, 1), saved.ok); + log.markCleanLine(); + + // Queue 3 post-save transactional commands, logging each frame + // but NOT reading their acks — they stay pending for replay. + var seq: u32 = 101; + while (seq <= 103) : (seq += 1) { + const cmd = messages.SpawnEntity{ .archetype_hint = seq }; + const frame = try framing.encode(gpa, messages.SpawnEntity, seq, &cmd); + defer gpa.free(frame); + try server.connection().socket.send(frame); + try log.append(seq, @intFromEnum(messages.MsgType.spawn_entity), frame, 0); + } + + // Crash the runtime, then drain any buffered acks until EOF — + // detection must be < 100 ms. + const t0 = nowMs(); + try platform_process.kill(&sp.proc); + while (true) { + _ = server.connection().recvFrame(&scratch) catch break; // EOF/broken + } + try std.testing.expect(nowMs() - t0 < 100); + + sp.vp.close(); + server.deinit(); + reap(io, &sp.proc); + } + + // 3 commands appended after the clean line, none acked. + var pending: usize = 0; + var it = log.replaySince(); + while (it.next()) |_| pending += 1; + try std.testing.expectEqual(@as(usize, 3), pending); + + // ---- Restart + replay. The fresh runtime reloads from the snapshot + // and re-acks the replayed commands. 
---- + { + var server = ipc.server.IpcServer.init(gpa); + defer server.deinit(); + var sp = try spawnAndHandshake(&server, gpa, socket_path, shm, snap); + defer sp.vp.close(); + + const t0 = nowMs(); + const result = ipc.connection.replayCommands(server.connection(), &log, &scratch, 0); + const replay_ms = nowMs() - t0; + + try std.testing.expect(result.complete); + try std.testing.expectEqual(@as(usize, 3), result.replayed); + try std.testing.expect(replay_ms < 500); // aggregate budget + + const sd = messages.Shutdown{}; + try server.connection().sendMessage(messages.Shutdown, 0, &sd); + var sa_buf: [framing.frameSizeOf(messages.ShutdownAck)]u8 = undefined; + _ = server.connection().recvMessage(messages.ShutdownAck, &sa_buf) catch {}; + reap(io, &sp.proc); + } +} diff --git a/tests/ipc/fuzz_1h.zig b/tests/ipc/fuzz_1h.zig index c0a76ba..720402d 100644 --- a/tests/ipc/fuzz_1h.zig +++ b/tests/ipc/fuzz_1h.zig @@ -1,14 +1,21 @@ -//! S6 long fuzz harness (1 hour). Manual invocation only — -//! not added to `zig build test` because it would dominate every -//! CI run for the lifetime of Phase −1/0. +//! S6 long fuzz harness — promoted to a nightly target at M0.7 / E4 +//! (was manual-only at S6). Stresses the **whole message catalogue**, not +//! just `Echo`: each iteration the writer picks a random message type +//! (incl. `ShmRegionsHandoff`, hardened in E1, and every E2 command) and +//! sends a well-formed frame. Interleaving heterogeneous frame *sizes* is +//! the real test — it exercises the length-prefixed framing's delimiting +//! over tens of millions of back-to-back frames (the "no magic desync" +//! gate). A counting allocator wraps `page_allocator` so any leak fails +//! the run; `sent == recv` and a clean reader confirm no desync. //! -//! Run via `zig build test-ipc-fuzz-1h`. Result digest goes into -//! `validation/s6-go-nogo.md` for the G3 gate. +//! Run via `zig build test-ipc-fuzz-1h` (1 h default). Pass a shorter +//! 
duration for a local smoke run: //! -//! Identical harness shape to `tests/ipc/fuzz_short.zig`, scaled -//! to 1 hour. Counting allocator wraps `std.heap.page_allocator` -//! so any leak fails the test immediately. Cross-platform — runs -//! on Linux / macOS / Windows; pick whichever box is available. +//! zig build test-ipc-fuzz-1h -- --duration-ms=3000 +//! +//! Cross-platform — runs on Linux / macOS / Windows. The nightly cron +//! (`.github/workflows/nightly-fuzz.yml`) runs it on Linux + Windows and +//! archives the stdout digest as an artifact (G3 gate). const std = @import("std"); const builtin = @import("builtin"); @@ -17,6 +24,7 @@ const weld_core = @import("weld_core"); const ipc = weld_core.ipc; const framing = ipc.framing; const messages = ipc.messages; +const CountingAllocator = weld_core.testing.alloc_counting.CountingAllocator; const can_unlink = builtin.os.tag == .linux or builtin.os.tag == .macos; extern "c" fn unlink(path: [*:0]const u8) c_int; @@ -53,46 +61,164 @@ fn nowMs() i64 { }; } +// The fuzzed slice of the catalogue. A deliberate spread of payload sizes +// — tiny (`Play`/`Pause`/`Stop`), mid (`Echo`/`Heartbeat`), and large +// (`LoadScene`/`SaveScene`/`ShmRegionsHandoff`) — so the stream constantly +// changes frame length and the reader's delimiting is genuinely stressed. +const fuzz_types = [_]type{ + messages.Echo, messages.Heartbeat, messages.SpawnEntity, + messages.ModifyComponent, messages.LogMessage, messages.Play, + messages.Pause, messages.Stop, messages.LoadScene, + messages.HotReloadScript, messages.SaveScene, messages.SaveProject, + messages.ProjectSaved, messages.RuntimeError, messages.ShmRegionsHandoff, +}; + +// The clean end-of-stream marker: a well-formed frame of a catalogue type +// deliberately kept OUT of `fuzz_types`, so it can only ever appear on the +// wire as the teardown sentinel, never as fuzz traffic. 
Using a *valid* +// frame (not a bad-magic blob) is what lets `error.InvalidMagic` stay an +// UNCONDITIONAL fault: a real framing desync on the last backlog frames — +// after the writer has already set `stop` — can no longer be misread as a +// benign teardown. `ShutdownAck` fits semantically (end of session). +const sentinel_type = messages.ShutdownAck; +const sentinel_msg_type: u16 = @intFromEnum(messages.msgTypeOf(sentinel_type)); +comptime { + for (fuzz_types) |T| { + if (T == sentinel_type) @compileError( + "sentinel_type must stay out of fuzz_types — otherwise the teardown " ++ + "marker is indistinguishable from fuzz traffic", + ); + } +} + +// Reader scratch must hold the largest frame it can receive — the biggest +// `fuzz_types` entry or the sentinel — else a big frame trips +// `error.PayloadTooLarge` and reads as a (false) desync. +const max_frame_size = blk: { + var m: usize = framing.frameSizeOf(sentinel_type); + for (fuzz_types) |T| m = @max(m, framing.frameSizeOf(T)); + break :blk m; +}; + +// Encode a well-formed frame for `fuzz_types[idx]` with a zeroed body. The +// body content is irrelevant to the framing/transport stress; the type +// (hence the frame length + `msg_type`) is what varies. +fn encodeRandom(gpa: std.mem.Allocator, idx: usize, seq: u32) ![]u8 { + inline for (fuzz_types, 0..) |T, i| { + if (i == idx) { + const msg: T = std.mem.zeroes(T); + return framing.encode(gpa, T, seq, &msg); + } + } + unreachable; +} + const FuzzCtx = struct { server_sock: *ipc.transport.IpcSocket, client_sock: *ipc.transport.IpcSocket, duration_ms: i64, - /// Outgoing `seq_id`. Matches the protocol-level `framing.Header.seq_id` - /// width (cf. `framing.zig`). 1 h × 10 000 msg/s ≈ 36 M, well under - /// `u32` max (~4.3 B), so the wraparound `+%` is theoretical here. + /// Writer-owned. Outgoing `seq_id` source; 1 h × ~1 M msg/s ≈ 3.6 B, + /// ~84 % of `u32` max (~4.3 B): `+%` is theoretical, but headroom is thin. sent: u32 = 0, - /// Reader-side counter.
Same width as `sent` for symmetry — - /// drives the post-run sanity check that recv == sent. + /// Reader-owned. Compared to `sent` after join — equality proves every + /// frame was received intact and in order (no desync, no drop). recv: u32 = 0, + /// Reader sets it on any framing desync: `InvalidMagic` / + /// `UnknownMsgType` / version / size mismatch, or EOF before `stop` (only + /// post-`stop` EOF is exempt). Writer sets it on encode/send failure. fault: std.atomic.Value(u8) = std.atomic.Value(u8).init(0), + /// Published by the writer (release) before the teardown sentinel; gates + /// only the benign tolerance of the post-teardown socket EOF. stop: std.atomic.Value(u8) = std.atomic.Value(u8).init(0), }; fn writerLoop(ctx: *FuzzCtx, gpa: std.mem.Allocator) void { - const t = nowMs(); - while (nowMs() - t < ctx.duration_ms) { - const echo = messages.Echo{ .payload = std.mem.zeroes([64]u8) }; - const buf = framing.encode(gpa, messages.Echo, ctx.sent +% 1, &echo) catch return; + // Pre-encode the well-formed teardown sentinel once, before any traffic, + // so an (essentially impossible) allocation failure is handled up front + // rather than leaving the reader to block on its recv forever.
+ const sentinel_body = std.mem.zeroes(sentinel_type); + const sentinel_frame = framing.encode(gpa, sentinel_type, 0xFFFF_FFFF, &sentinel_body) catch { + ctx.fault.store(1, .release); + ctx.stop.store(1, .release); // let the reader's eventual EOF be benign + return; + }; + defer gpa.free(sentinel_frame); + + const t_start = nowMs(); + var prng = std.Random.DefaultPrng.init(0xCAFEBABE); + const rng = prng.random(); + while (nowMs() - t_start < ctx.duration_ms) { + const idx = rng.intRangeLessThan(usize, 0, fuzz_types.len); + const buf = encodeRandom(gpa, idx, ctx.sent +% 1) catch { + ctx.fault.store(1, .release); + break; + }; defer gpa.free(buf); - ctx.client_sock.send(buf) catch return; + ctx.client_sock.send(buf) catch { + ctx.fault.store(1, .release); + break; + }; ctx.sent += 1; } + // Teardown. Publish `stop` (release), then send the sentinel — a *valid* + // frame the reader recognises by its `msg_type`, not by any error. The + // reader therefore needs no error reclassification, so `InvalidMagic` + // stays an unconditional fault (see `readerLoop`). `stop` only gates the + // benign tolerance of the post-teardown socket EOF. ctx.stop.store(1, .release); + ctx.client_sock.send(sentinel_frame) catch {}; } fn readerLoop(ctx: *FuzzCtx, gpa: std.mem.Allocator) void { var connection = ipc.connection.IpcConnection.init(gpa, ctx.server_sock); - var scratch: [framing.frameSizeOf(messages.Echo) + 256]u8 = undefined; - while (ctx.stop.load(.acquire) == 0) { - _ = connection.recvFrame(&scratch) catch return; + var scratch: [max_frame_size]u8 = undefined; + while (true) { + const frame = connection.recvFrame(&scratch) catch |e| { + // Clean end-of-stream is the well-formed sentinel frame (handled + // below), never an error. The ONLY tolerated errors are a socket + // torn down AFTER teardown (`UnexpectedEof` / `BrokenPipe` once + // `stop` is set). 
Everything else — `InvalidMagic`, + // `UnknownMsgType`, version / payload-size mismatch, at any time — + // is an unconditional framing desync ⇒ fault. This closes the + // window where a desync on the last backlog frames (writer + // already at `stop == 1`) could be misread as a benign teardown. + const tolerated = switch (e) { + error.UnexpectedEof, error.BrokenPipe => ctx.stop.load(.acquire) == 1, + else => false, + }; + if (!tolerated) ctx.fault.store(1, .release); + return; + }; + // The sentinel marks the deliberate end of the run; its type never + // appears as fuzz traffic. Not counted, so `sent == recv` stays a + // pure data-frame identity. + if (frame.header.msg_type == sentinel_msg_type) return; ctx.recv += 1; } } -pub fn main() !void { - var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); - defer arena.deinit(); - const gpa = arena.allocator(); +pub fn main(init: std.process.Init.Minimal) !void { + // Duration override: `zig build test-ipc-fuzz-1h -- --duration-ms=N`. + // Env vars (`std.posix.getenv` / `hasEnvVarConstant`) were removed in + // 0.16, so argv is the portable knob. The nightly cron uses the 1 h + // default; a local smoke run passes a few seconds. A separate + // `page_allocator` parses argv so it cannot pollute the leak counters. + var duration_ms: i64 = 60 * 60 * 1000; + { + var it = try std.process.Args.Iterator.initAllocator(init.args, std.heap.page_allocator); + defer it.deinit(); + _ = it.skip(); + while (it.next()) |a| { + if (std.mem.startsWith(u8, a, "--duration-ms=")) { + duration_ms = std.fmt.parseInt(i64, a["--duration-ms=".len..], 10) catch duration_ms; + } + } + } + + // Leak detection: counting allocator over page_allocator. After join, + // alloc_count must equal free_count and the byte tallies must balance. 
+ var counter = CountingAllocator.init(std.heap.page_allocator); + const gpa = counter.allocator(); var path_buf: [128]u8 = undefined; const path = try ipc.transport.buildSocketPath(&path_buf, "weld-fuzz-1h"); @@ -109,12 +235,29 @@ pub fn main() !void { var ctx = FuzzCtx{ .server_sock = &server, .client_sock = &client, - .duration_ms = 60 * 60 * 1000, + .duration_ms = duration_ms, }; const reader = try std.Thread.spawn(.{}, readerLoop, .{ &ctx, gpa }); const writer = try std.Thread.spawn(.{}, writerLoop, .{ &ctx, gpa }); writer.join(); reader.join(); - std.debug.print("fuzz_1h: sent={d} recv={d} fault={d}\n", .{ ctx.sent, ctx.recv, ctx.fault.load(.acquire) }); + const snap = counter.snapshot(); + const leaked = snap.alloc_count != snap.free_count or + snap.bytes_allocated != snap.bytes_freed; + std.debug.print( + "fuzz_1h: duration_ms={d} types={d} sent={d} recv={d} fault={d} " ++ + "alloc={d} free={d} bytes_alloc={d} bytes_freed={d}\n", + .{ + duration_ms, fuzz_types.len, ctx.sent, + ctx.recv, ctx.fault.load(.acquire), snap.alloc_count, + snap.free_count, snap.bytes_allocated, snap.bytes_freed, + }, + ); + + // Non-zero exit on any failure so the nightly job goes red. + if (ctx.fault.load(.acquire) != 0) return error.FuzzReaderFault; + if (ctx.sent == 0) return error.FuzzNoTraffic; + if (ctx.sent != ctx.recv) return error.FuzzSentRecvMismatch; + if (leaked) return error.FuzzLeak; } diff --git a/tests/ipc/handoff_fd.zig b/tests/ipc/handoff_fd.zig new file mode 100644 index 0000000..cfadb5c --- /dev/null +++ b/tests/ipc/handoff_fd.zig @@ -0,0 +1,207 @@ +//! M0.7 / E1 — shm attach via a received fd (`ShmRegion.fromFd`). +//! +//! Exercises the SCM_RIGHTS primary-attach pivot (`engine-ipc.md` +//! §4.8) at the `ShmRegion` level, one rung above the raw-socket fd +//! loopback of `tests/ipc/fd_passing.zig` (the S6 G7 test): +//! +//! 1. Side A (editor) creates a region with `ShmRegion.create` and +//! keeps its fd via `ShmRegion.fd()`. +//! 2. 
A sends the fd to side B over an `AF_UNIX` socket via +//! `sendWithHandles` (the bytes payload stands in for the +//! `ShmRegionsHandoff` descriptor; the fd rides as ancillary +//! data). +//! 3. Side B (runtime) maps the received fd with `ShmRegion.fromFd` +//! — **no `shm_open`** — and writes a known pattern. +//! 4. Side A reads the same pattern back: both ends share the same +//! physical pages. +//! +//! Because `fromFd` never calls `shm_open(O_RDWR)`, this runs +//! intra-process even on macOS — the whole point of the pivot is that +//! it sidesteps the BSD shm cross-process `EACCES` quirk that forces +//! the `tests/ipc/shm_cases/*` split. Green on Linux + macOS. +//! +//! Windows: `error.SkipZigTest` — the Windows CPU shm attach stays by +//! name (`open`), the fd-passing pivot is POSIX-only (§4.8). The +//! `ShmRegion.fromFd` Windows path is asserted to return +//! `error.Unimplemented` instead. +//! +//! External-resource discipline (engine-zig-conventions.md §13): a +//! 5 s `SO_RCVTIMEO` is installed on both endpoints so a lost cmsg +//! cannot hang the suite. + +const std = @import("std"); +const builtin = @import("builtin"); + +const weld_core = @import("weld_core"); +const shm = weld_core.ipc.shm; +const transport = weld_core.ipc.transport; +const connection = weld_core.ipc.connection; +const messages = weld_core.ipc.messages; + +const is_posix = builtin.os.tag == .linux or builtin.os.tag == .macos; + +extern "c" fn close(fd: c_int) c_int; +extern "c" fn unlink(path: [*:0]const u8) c_int; +extern "c" fn dup(fd: c_int) c_int; +extern "c" fn fcntl(fd: c_int, cmd: c_int) c_int; + +/// `F_GETFD` — same value (1) on Linux and macOS. `fcntl(fd, F_GETFD)` +/// returns -1 (EBADF) for a closed fd, ≥ 0 for an open one. +const F_GETFD: c_int = 1; + +/// True if `fd` is still an open descriptor in this process. 
+fn fdOpen(fd: transport.OsHandle) bool { + return fcntl(fd, F_GETFD) != -1; +} +extern "c" fn setsockopt( + sockfd: c_int, + level: c_int, + optname: c_int, + optval: *const anyopaque, + optlen: u32, +) c_int; + +const timeval = extern struct { + tv_sec: i64, + tv_usec: i32, + _pad: i32 = 0, +}; + +const SOL_SOCKET: c_int = if (builtin.os.tag == .linux) 1 else 0xFFFF; +const SO_RCVTIMEO: c_int = if (builtin.os.tag == .linux) 20 else 0x1006; + +fn installRecvTimeout(sock: *transport.IpcSocket) void { + if (comptime !is_posix) return; + var tv = timeval{ .tv_sec = 5, .tv_usec = 0 }; + _ = setsockopt(sock.impl.fd, SOL_SOCKET, SO_RCVTIMEO, &tv, @sizeOf(timeval)); +} + +test "shm attach via received fd" { + if (!is_posix) return error.SkipZigTest; + + const region_size: usize = 4096; + const region_name: []const u8 = "/weld-test-handoff"; + + const sock_path: [:0]const u8 = "/tmp/weld-test-handoff.sock"; + _ = unlink(sock_path.ptr); + defer _ = unlink(sock_path.ptr); + + // ---- Side A (editor): create the region, keep the fd. ---- + var region_a = try shm.ShmRegion.create(region_name, region_size); + defer region_a.close(); + + // Editor zeroes the region as it would before any handoff. + @memset(region_a.bytes(), 0); + + var listener = try transport.IpcSocket.listen(sock_path); + defer listener.close(); + var client = try transport.IpcSocket.connect(sock_path); + defer client.close(); + var server = try listener.accept(); + defer server.close(); + installRecvTimeout(&server); + installRecvTimeout(&client); + + // ---- Handoff: editor → runtime, fd in ancillary data. ---- + // The 1-byte payload stands in for the ShmRegionsHandoff frame; + // SCM_RIGHTS requires at least one regular byte alongside the fd. 
+ try client.sendWithHandles(&[_]u8{1}, &[_]transport.OsHandle{region_a.fd()}); + + var recv_buf: [16]u8 = undefined; + var recv_handles: [1]transport.OsHandle = .{transport.invalid_handle}; + const result = try server.recvWithHandles(&recv_buf, &recv_handles); + try std.testing.expectEqual(@as(usize, 1), result.bytes); + try std.testing.expectEqual(@as(usize, 1), result.handles); + try std.testing.expect(recv_handles[0] >= 0); + + // ---- Side B (runtime): map the received fd, NO shm_open. ---- + var region_b = try shm.ShmRegion.fromFd(recv_handles[0], region_size); + defer region_b.close(); + + // Runtime writes a known pattern into its mapping. + const pattern = "weld-shm-handoff-roundtrip"; + @memcpy(region_b.bytes()[0..pattern.len], pattern); + + // ---- Side A reads the same physical pages back. ---- + try std.testing.expectEqualSlices( + u8, + pattern, + region_a.bytes()[0..pattern.len], + ); + + // A trailing byte the runtime did not touch stays zero — proves we + // mapped the same region, not a private copy. + try std.testing.expectEqual(@as(u8, 0), region_a.bytes()[pattern.len]); +} + +test "fromFd is unimplemented on Windows (attach stays by name)" { + if (is_posix) return error.SkipZigTest; + // The Windows CPU shm attach is by name (`open`); the fd-passing + // pivot is POSIX-only (§4.8). `fromFd` must fail loudly. + try std.testing.expectError( + error.Unimplemented, + shm.ShmRegion.fromFd(transport.invalid_handle, 4096), + ); +} + +fn zeroRegions() [messages.MAX_SHM_REGIONS]messages.ShmRegionDesc { + return std.mem.zeroes([messages.MAX_SHM_REGIONS]messages.ShmRegionDesc); +} + +test "acceptShmHandoff rejects fd/region_count mismatch and closes every fd" { + if (!is_posix) return error.SkipZigTest; + + // Two disposable fds, but a handoff that claims a single region — + // §8.3 requires fd count == region_count, so this is rejected. 
+ const fd0 = dup(2); + const fd1 = dup(2); + try std.testing.expect(fd0 >= 0 and fd1 >= 0); + + const handoff = messages.ShmRegionsHandoff{ .region_count = 1, .regions = zeroRegions() }; + const handles = [_]transport.OsHandle{ fd0, fd1 }; + try std.testing.expectError( + error.InvalidHandoff, + connection.acceptShmHandoff(&handoff, &handles), + ); + + // Both received fds were closed — no descriptor leak on rejection. + try std.testing.expect(!fdOpen(fd0)); + try std.testing.expect(!fdOpen(fd1)); +} + +test "acceptShmHandoff rejects region_count above MAX_SHM_REGIONS" { + if (!is_posix) return error.SkipZigTest; + + const fd0 = dup(2); + try std.testing.expect(fd0 >= 0); + + const handoff = messages.ShmRegionsHandoff{ + .region_count = @as(u32, @intCast(messages.MAX_SHM_REGIONS)) + 1, + .regions = zeroRegions(), + }; + const handles = [_]transport.OsHandle{fd0}; + try std.testing.expectError( + error.InvalidHandoff, + connection.acceptShmHandoff(&handoff, &handles), + ); + try std.testing.expect(!fdOpen(fd0)); +} + +test "acceptShmHandoff returns the viewport fd and closes unmapped region fds" { + if (!is_posix) return error.SkipZigTest; + + // A well-formed two-region handoff: the runtime maps only the + // viewport (regions[0]); the second region's fd must be closed. 
+ const fd0 = dup(2); + const fd1 = dup(2); + try std.testing.expect(fd0 >= 0 and fd1 >= 0); + + const handoff = messages.ShmRegionsHandoff{ .region_count = 2, .regions = zeroRegions() }; + const handles = [_]transport.OsHandle{ fd0, fd1 }; + const viewport_fd = try connection.acceptShmHandoff(&handoff, &handles); + + try std.testing.expectEqual(fd0, viewport_fd); // handles[0] returned + try std.testing.expect(fdOpen(fd0)); // caller owns it — still open + try std.testing.expect(!fdOpen(fd1)); // unmapped region fd closed + _ = close(fd0); // caller cleans up the viewport fd +} diff --git a/tests/ipc/process.zig b/tests/ipc/process.zig index 4689d45..ab6e288 100644 --- a/tests/ipc/process.zig +++ b/tests/ipc/process.zig @@ -1,7 +1,8 @@ -//! S6 process tests — `platform.process.spawn_process` + `wait_nonblock` +//! Process tests — `platform.process.spawn_process` + `wait_nonblock` //! + `is_alive` against the real `/bin/true` and `/bin/sleep` binaries -//! (POSIX). Windows is `skipNow` because `CreateProcessW` is stubbed in -//! S6 (cf. `src/core/platform/process.zig`). +//! (POSIX-gated). Plus `quoteArg` — the M0.7 / E3 Windows command-line +//! quoter — tested cross-platform (no Windows needed) via golden cases +//! and a round-trip through a reference `CommandLineToArgvW` parser. const std = @import("std"); const builtin = @import("builtin"); @@ -87,3 +88,159 @@ test "spawn-then-kill terminates a long-running child" { } return error.ChildNeverDied; } + +test "spawn_process runs a Windows binary and reaps exit 0" { + if (builtin.os.tag != .windows) return error.SkipZigTest; + // Anti-regression for the M0.7 / E3 addendum: the first real Windows + // run hit `CreateProcessW` → `error.SpawnFailed`. Exercise the path + // with a binary guaranteed present (`cmd.exe /c exit 0`). 
+ const gpa = std.testing.allocator; + const exe = "C:\\Windows\\System32\\cmd.exe"; + const argv = [_][]const u8{ exe, "/c", "exit 0" }; + + var proc = try process.spawn_process(gpa, exe, &argv); + var attempts: usize = 0; + while (attempts < 200) : (attempts += 1) { + if (try process.wait_nonblock(&proc)) |code| { + try std.testing.expectEqual(@as(i32, 0), code); + return; + } + sleepMs(10); + } + return error.ChildNeverExited; +} + +// ------------------------------------------------------- quoteArg tests -- +// +// `quoteArg` is pure and cross-platform, so these run on every host. + +/// Reference re-implementation of `CommandLineToArgvW`, UTF-8 (the +/// metacharacters are all ASCII). Used to prove `quoteArg` output parses +/// back to the original argument. Faithful to the documented rules: +/// argv[0] is delimited by quotes only (backslashes literal); the rest +/// apply the `2n`/`2n+1` backslash-before-quote rules and toggle the +/// in-quotes state. Caller owns the returned slices. +fn refParseCommandLine(gpa: std.mem.Allocator, cmdline: []const u8) ![][]u8 { + var args: std.ArrayList([]u8) = .empty; + errdefer { + for (args.items) |a| gpa.free(a); + args.deinit(gpa); + } + var i: usize = 0; + + // argv[0]: quotes delimit; backslashes are literal; no escaping. + while (i < cmdline.len and (cmdline[i] == ' ' or cmdline[i] == '\t')) i += 1; + if (i < cmdline.len) { + var a0: std.ArrayList(u8) = .empty; + errdefer a0.deinit(gpa); + if (cmdline[i] == '"') { + i += 1; + while (i < cmdline.len and cmdline[i] != '"') : (i += 1) try a0.append(gpa, cmdline[i]); + if (i < cmdline.len) i += 1; // consume closing quote + } else { + while (i < cmdline.len and cmdline[i] != ' ' and cmdline[i] != '\t') : (i += 1) try a0.append(gpa, cmdline[i]); + } + try args.append(gpa, try a0.toOwnedSlice(gpa)); + } + + // Remaining args: standard backslash/quote rules. 
+ while (true) { + while (i < cmdline.len and (cmdline[i] == ' ' or cmdline[i] == '\t')) i += 1; + if (i >= cmdline.len) break; + var arg: std.ArrayList(u8) = .empty; + errdefer arg.deinit(gpa); + var in_quotes = false; + while (i < cmdline.len) { + const c = cmdline[i]; + if (!in_quotes and (c == ' ' or c == '\t')) break; + if (c == '\\') { + var bs: usize = 0; + while (i < cmdline.len and cmdline[i] == '\\') : (i += 1) bs += 1; + if (i < cmdline.len and cmdline[i] == '"') { + try arg.appendNTimes(gpa, '\\', bs / 2); + if (bs % 2 == 1) { + try arg.append(gpa, '"'); // escaped literal quote + i += 1; + } + // even: leave the '"' for the quote branch next loop + } else { + try arg.appendNTimes(gpa, '\\', bs); // backslashes literal + } + } else if (c == '"') { + if (in_quotes and i + 1 < cmdline.len and cmdline[i + 1] == '"') { + try arg.append(gpa, '"'); // "" inside quotes → literal " + i += 2; + } else { + in_quotes = !in_quotes; + i += 1; + } + } else { + try arg.append(gpa, c); + i += 1; + } + } + try args.append(gpa, try arg.toOwnedSlice(gpa)); + } + return args.toOwnedSlice(gpa); +} + +test "quoteArg golden cases" { + const gpa = std.testing.allocator; + const Case = struct { arg: []const u8, want: []const u8 }; + const cases = [_]Case{ + .{ .arg = "", .want = "\"\"" }, // empty must be quoted + .{ .arg = "simple", .want = "simple" }, // no metachar → verbatim + .{ .arg = "My Game", .want = "\"My Game\"" }, // space → quoted + // Trailing backslash, no space → verbatim (NOT `"C:\dir\"`, which + // the naive quoter would emit, escaping the closing quote). + .{ .arg = "C:\\dir\\", .want = "C:\\dir\\" }, + // Space + trailing backslash → quoted with the backslash doubled. 
+ .{ .arg = "C:\\My Dir\\", .want = "\"C:\\My Dir\\\\\"" }, + }; + for (cases) |c| { + const got = try process.quoteArg(gpa, c.arg); + defer gpa.free(got); + try std.testing.expectEqualStrings(c.want, got); + } +} + +test "quoteArg round-trips through a CommandLineToArgvW reference parser" { + const gpa = std.testing.allocator; + const cases = [_][]const u8{ + "", + "simple", + "My Game", + "C:\\dir\\", // trailing backslash, no space + "C:\\Program Files\\", // space + trailing backslash + "a\"b", // internal quote + "x\\\"y", // backslash + quote + "\\\\", // backslashes only + "tab\there", // embedded tab + "trailing\\\\\\", // three trailing backslashes + }; + + var cmd: std.ArrayList(u8) = .empty; + defer cmd.deinit(gpa); + const argv0 = "prog"; // simple argv[0] — round-trips trivially + const q0 = try process.quoteArg(gpa, argv0); + defer gpa.free(q0); + try cmd.appendSlice(gpa, q0); + for (cases) |c| { + try cmd.append(gpa, ' '); + const q = try process.quoteArg(gpa, c); + defer gpa.free(q); + try cmd.appendSlice(gpa, q); + } + + const parsed = try refParseCommandLine(gpa, cmd.items); + defer { + for (parsed) |p| gpa.free(p); + gpa.free(parsed); + } + + try std.testing.expectEqual(cases.len + 1, parsed.len); + try std.testing.expectEqualStrings(argv0, parsed[0]); + for (cases, 0..) |c, idx| { + try std.testing.expectEqualStrings(c, parsed[idx + 1]); + } +} diff --git a/tests/ipc/schema_hash.zig b/tests/ipc/schema_hash.zig index 9c054d6..cfaf53e 100644 --- a/tests/ipc/schema_hash.zig +++ b/tests/ipc/schema_hash.zig @@ -48,9 +48,10 @@ test "renaming a field changes schemaHash" { try std.testing.expect(h_orig != h_renamed); } -test "schemaHash distinguishes every S6 message type" { +test "schemaHash distinguishes every message type" { // A subtle hash collision between two message types would mask - // the schema-mismatch detection. Verify all 13 hashes are unique. + // the schema-mismatch detection. 
Verify all 23 hashes are unique + // (13 S6 messages + `ShmRegionsHandoff` (E1) + 9 catalogue messages (E2)). const hashes = [_]u64{ messages.schemaHash(messages.ProtocolHello), messages.schemaHash(messages.ProtocolHelloAck), @@ -65,6 +66,16 @@ test "schemaHash distinguishes every S6 message type" { messages.schemaHash(messages.Shutdown), messages.schemaHash(messages.ShutdownAck), messages.schemaHash(messages.LogMessage), + messages.schemaHash(messages.ShmRegionsHandoff), + messages.schemaHash(messages.Play), + messages.schemaHash(messages.Pause), + messages.schemaHash(messages.Stop), + messages.schemaHash(messages.LoadScene), + messages.schemaHash(messages.HotReloadScript), + messages.schemaHash(messages.SaveScene), + messages.schemaHash(messages.SaveProject), + messages.schemaHash(messages.ProjectSaved), + messages.schemaHash(messages.RuntimeError), }; for (hashes, 0..) |a, i| { for (hashes[i + 1 ..]) |b| {