diff --git a/CHANGELOG.md b/CHANGELOG.md index b88c0498..86d20746 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,7 @@ All notable changes to vouch are documented here. Format follows KB under `eval/fixture-kb/`, and an `eval` workflow gating retrieval changes (#226). ### Fixed +- `audit.log_event` now holds an exclusive cross-process lock around read-prev-hash → derive → append, closing a TOCTOU race where two concurrent writers observed the same `prev_hash` and forked the chain — `verify_chain` then reported "previous hash mismatch" at the second concurrent event forever, breaking the tamper-evidence guarantee from #244 under ordinary multi-writer usage (`vouch serve` + concurrent CLI, multiple agents on JSONL, scripted backgrounded approvals). Uses `fcntl.flock` on POSIX and `msvcrt.locking` on Windows against a sibling `audit.log.jsonl.lock` file so the audit log itself is never opened in a mode that could truncate it. Fixes #262. - `parse_since` (the `--since` parser behind `vouch metrics`/`vouch audit`) now raises a clean `MetricsError` for a duration too large to represent (e.g. `--since 1000000000000d`), instead of letting an uncaught `OverflowError` traceback escape — restoring the documented "clean error, not a traceback" contract. - `sync_apply` now loads the sync source exactly once and passes the same `_SyncSource` instance into `sync_check`, closing a TOCTOU window where a bundle replaced on disk between the two `_load_source` calls could cause the validation and write phases to operate on different snapshots. Also eliminates redundant directory walks (KB sources) and triple tarball opens (bundle sources). Fixes #217. - `vault_to_kb` now passes `slug_hint=page_id` to `propose_page` so vault edit proposals target the existing page id from frontmatter instead of a slugified copy of the title (fixes #219). diff --git a/src/vouch/audit.py b/src/vouch/audit.py index e39703ba..a37bb1ca 100644 --- a/src/vouch/audit.py +++ b/src/vouch/audit.py @@ -7,6 +7,7 @@ from __future__ import annotations +import contextlib import hashlib import json import os @@ -23,6 +24,7 @@ from .storage import KBStore AUDIT_FILENAME = "audit.log.jsonl" +AUDIT_LOCKFILE = AUDIT_FILENAME + ".lock" GENESIS_HASH = "0" * 64 @@ -30,6 +32,45 @@ def _audit_path(kb_dir: Path) -> Path: return kb_dir / AUDIT_FILENAME +def _audit_lockfile(kb_dir: Path) -> Path: + return kb_dir / AUDIT_LOCKFILE + + +@contextlib.contextmanager +def _audit_lock(kb_dir: Path) -> Iterator[None]: + """Hold an exclusive cross-process lock for the duration of a log_event. + + Serialises the read-then-append sequence in `log_event` so two concurrent + writers cannot both observe the same `prev_hash` and fork the chain. + Lock is held on a sibling `audit.log.jsonl.lock` file so the audit log + itself is never opened in a mode that could truncate it. Blocks until + acquired and is always released, including on exceptions. + """ + lockfile = _audit_lockfile(kb_dir) + lockfile.parent.mkdir(parents=True, exist_ok=True) + fd = os.open(lockfile, os.O_RDWR | os.O_CREAT, 0o644) + try: + if os.name == "posix": + fcntl = __import__("fcntl") + fcntl.flock(fd, fcntl.LOCK_EX) + try: + yield + finally: + fcntl.flock(fd, fcntl.LOCK_UN) + else: + msvcrt = __import__("msvcrt") + os.lseek(fd, 0, os.SEEK_SET) + msvcrt.locking(fd, msvcrt.LK_LOCK, 1) + try: + yield + finally: + os.lseek(fd, 0, os.SEEK_SET) + with contextlib.suppress(OSError): + msvcrt.locking(fd, msvcrt.LK_UNLCK, 1) + finally: + os.close(fd) + + def new_event_id() -> str: return uuid.uuid4().hex @@ -75,28 +116,35 @@ def log_event( reversible: bool = True, data: dict[str, Any] | None = None, ) -> AuditEvent: - """Append one AuditEvent. Returns the persisted event.""" - prev_hash = _last_hash(kb_dir) - ev = AuditEvent( - id=new_event_id(), - event=event, - actor=actor, - object_ids=object_ids or [], - dry_run=dry_run, - reversible=reversible, - data=data or {}, - prev_hash=prev_hash, - ) - ev.hash = _compute_hash(prev_hash, _event_payload_for_hash(ev)) + """Append one AuditEvent. Returns the persisted event. + + Holds an exclusive cross-process lock around read-prev-hash → derive → + append so concurrent writers can't fork the chain. Without the lock, two + log_event calls racing on the same KB observe the same prev_hash and both + chain off it — verify_chain then reports "previous hash mismatch" forever. + """ path = _audit_path(kb_dir) path.parent.mkdir(parents=True, exist_ok=True) - line = _canonical_json(ev.model_dump(mode="json")) - # Open-write-close for crash safety — if the process dies mid-append the - # log is still parseable up to the last newline. - with path.open("a", encoding="utf-8") as f: - f.write(line + "\n") - f.flush() - os.fsync(f.fileno()) + with _audit_lock(kb_dir): + prev_hash = _last_hash(kb_dir) + ev = AuditEvent( + id=new_event_id(), + event=event, + actor=actor, + object_ids=object_ids or [], + dry_run=dry_run, + reversible=reversible, + data=data or {}, + prev_hash=prev_hash, + ) + ev.hash = _compute_hash(prev_hash, _event_payload_for_hash(ev)) + line = _canonical_json(ev.model_dump(mode="json")) + # Open-write-close for crash safety — if the process dies mid-append + # the log is still parseable up to the last newline. + with path.open("a", encoding="utf-8") as f: + f.write(line + "\n") + f.flush() + os.fsync(f.fileno()) return ev diff --git a/tests/test_audit.py b/tests/test_audit.py index 47246af4..d4a85d56 100644 --- a/tests/test_audit.py +++ b/tests/test_audit.py @@ -94,3 +94,79 @@ def test_audit_chain_detects_legacy_events(store: KBStore) -> None: assert not status.ok assert status.line == 1 assert status.reason == "legacy event is not hash-chained" + + +def _writer_worker(kb_dir_str: str, actor: str, count: int) -> None: + """Subprocess entry point for the concurrency test — module level so + multiprocessing's spawn context can pickle it.""" + from pathlib import Path as _Path + + from vouch import audit as _audit + kb_dir = _Path(kb_dir_str) + for i in range(count): + _audit.log_event( + kb_dir, event="claim.create", actor=actor, + object_ids=[f"{actor}-{i}"], + ) + + +def test_audit_chain_survives_concurrent_writers(store: KBStore) -> None: + """Concurrent processes must not fork the chain. + + Without the file lock around _last_hash → derive → append, two workers + observe the same prev_hash and both chain off it. verify_chain then + reports "previous hash mismatch" at the second concurrent event. Use + multiprocessing.Process — the GIL serialises Python code and hides the + file-level race that threads would expose. + """ + import multiprocessing as mp + + n_workers = 4 + per_worker = 20 + + ctx = mp.get_context("spawn") + procs = [ + ctx.Process(target=_writer_worker, + args=(str(store.kb_dir), f"agent-{i}", per_worker)) + for i in range(n_workers) + ] + for p in procs: + p.start() + for p in procs: + p.join(timeout=120) + for p in procs: + assert p.exitcode == 0, f"worker exit={p.exitcode}" + + status = audit.verify_chain(store.kb_dir) + assert status.ok, f"chain broken at line {status.line}: {status.reason}" + assert audit.count_events(store.kb_dir) == n_workers * per_worker + + +def test_audit_lock_released_on_exception( + store: KBStore, monkeypatch: pytest.MonkeyPatch, +) -> None: + """A writer raising mid-event releases the lock so the next call lands.""" + import os as _os + + audit.log_event(store.kb_dir, event="x.first", actor="u") + + original_fsync = _os.fsync + remaining = {"raises": 1} + + def _flaky_fsync(fd: int) -> None: + if remaining["raises"] > 0: + remaining["raises"] -= 1 + raise OSError("simulated I/O failure") + original_fsync(fd) + + monkeypatch.setattr(_os, "fsync", _flaky_fsync) + with pytest.raises(OSError, match="simulated I/O failure"): + audit.log_event(store.kb_dir, event="x.boom", actor="u") + + monkeypatch.setattr(_os, "fsync", original_fsync) + audit.log_event(store.kb_dir, event="x.third", actor="u") + events = list(audit.read_events(store.kb_dir)) + # x.first and x.third land; x.boom may have written its line before the + # fsync raised. In either case the surviving entries chain forward. + assert events[0].event == "x.first" + assert events[-1].event == "x.third"