From ce30276b73ceee7a3267e7f1207f8d96a930cb62 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 23:02:06 +0000 Subject: [PATCH 01/25] docs(moq-archive): add design proposal for track archival crate Design-only first pass for a new moq-archive binary/library that records a single track to tiered storage (RAM -> disk -> S3 via object_store) and serves old groups back through the moq-lite-05 FETCH path (TrackDynamic). Covers the segment/index on-disk format, out-of-order group handling, per-tier optional retention measured by media timestamp with wall-clock fallback, the public API sketch, and open questions. No implementation yet. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-archive/DESIGN.md | 325 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 rs/moq-archive/DESIGN.md diff --git a/rs/moq-archive/DESIGN.md b/rs/moq-archive/DESIGN.md new file mode 100644 index 000000000..66577318b --- /dev/null +++ b/rs/moq-archive/DESIGN.md @@ -0,0 +1,325 @@ +# moq-archive (design) + +> Status: design proposal, no implementation yet. Targets the `dev` branch because it +> builds on the moq-lite-05 FETCH API and media timestamps, both of which only exist there. + +`moq-archive` saves a live MoQ track to durable storage and serves it back on demand. It +sits beside `moq-relay` in the stack: the relay keeps a small in-memory cache for live +fan-out, while the archive is the long-term tier that answers FETCH requests for groups +that have long since aged out of the relay's cache. + +## Goals + +- Record a single `TrackConsumer` to disk and/or object storage, losslessly, frames intact. +- Serve any previously recorded group back through the normal FETCH path, so an unmodified + consumer can re-request old groups without knowing storage exists. 
+- Tier data across RAM, local disk, and remote object storage (S3/GCS/Azure), with an + independent, optional retention duration per tier. +- Survive out-of-order group delivery (groups arrive on independent QUIC streams and can + complete in any order). +- Avoid one-file-per-group: audio makes a group per frame, so a naive layout would create + thousands of tiny files. Groups are batched into larger segment objects. + +## Non-goals (v1) + +- Whole-broadcast archival. There is currently no generic way to enumerate every track in a + broadcast, so v1 records one track at a time. A broadcast-level wrapper (one recorder per + track, shared root) is a follow-up once track discovery exists. +- Transcoding, repackaging, or media awareness. The archive treats frames as opaque sized + payloads plus an optional timestamp, exactly like the relay. The catalog, container, and + codec layers stay in `hang`. +- Live VOD playback / DVR scrubbing UX. v1 serves groups via FETCH; building a seekable + player on top is downstream work. + +## Prior art + +There is no existing crate for this. A crates.io scan turns up only the `moq-dev` crates +(`moq-net`, `moq-relay`, `moq-mux`, ...), none of which persist tracks; the relay cache is +deliberately RAM-only and ephemeral. The IETF MoQ drafts leave durable storage to the +application. So we build it, but we do **not** hand-roll the storage backend: +[`object_store`](https://crates.io/crates/object_store) (0.13, Apache Arrow project) already +provides one trait over AWS S3, GCS, Azure Blob, local filesystem, in-memory, and HTTP, with +first-class byte-range GETs (`get_opts` / `GetRange`) and multipart uploads. That is exactly +the disk + S3 abstraction we need, so both the "disk" and the "S3" tiers are just two +`Arc` instances and serving a group is one ranged GET. 
+ +## Background: the moq-net model we build on + +The relevant `moq-net` (dev) API, recapped so the design is self-contained: + +**Recording side (read a track):** +- `TrackConsumer::subscribe(None) -> TrackSubscriber`. +- `TrackSubscriber::recv_group() -> Result>` yields every group in + *arrival* order (preserves out-of-order delivery; `next_group()` would skip late arrivals, + which we do not want for an archive). +- `GroupConsumer { sequence }`, `GroupConsumer::read_frame() -> Result>` drains + frames in order; `Frame { size, timestamp: Option }` carries the media timestamp + when present (moq-lite-05+). +- `GroupConsumer::finished() -> Result` resolves once the group is complete, returning + the final frame count. This is our "safe to flush" signal. + +**Serving side (produce a track + answer fetches):** +- `TrackProducer::dynamic() -> TrackDynamic`. +- `TrackDynamic::requested_group() -> Result` blocks until a consumer FETCHes a + group that is not in the producer's cache. While any `TrackDynamic` handle is alive, the + miss waits to be served instead of failing fast with `NotFound`. +- `GroupRequest::sequence() -> u64`, `GroupRequest::accept(info) -> GroupProducer`. We fill the + returned `GroupProducer` with `create_frame` / `write` / `finish` from storage, then + `GroupProducer::finish()`. +- To expose the track over a session, wrap the producer in a `BroadcastProducer` and publish + via `OriginProducer::publish_broadcast`, then connect with `moq-native` (same as `moq-cli`). + +The two directions are deliberately decoupled: recording needs a `TrackConsumer`, serving +needs a `TrackProducer` + `TrackDynamic`. They share only the storage layer, so an archive +node can do either or both. 
+ +## Architecture + +``` + record (TrackConsumer) serve (TrackProducer + TrackDynamic) + | ^ + v | + +------------------------------------+ +------------------------------+ + | Writer | | Reader | + | - drain groups in arrival order | | - on GroupRequest(seq): | + | - buffer in RAM until finished() | | look up seq in Index | + | - batch completed groups | | ranged GET the segment | + | - flush segment + index entries | | parse frames, stream out | + +------------------------------------+ +------------------------------+ + | ^ + v | + +-------------------------------------------------------------------------------------+ + | Storage (tiered) | + | RAM ring --(flush)--> disk store --(age + aggregate)--> S3 store | + | Index: group seq -> (tier, object key, byte offset, length, frame count, ts span) | + +-------------------------------------------------------------------------------------+ +``` + +Two halves, joined by a `Storage` abstraction and an `Index`: + +### Writer (ingest) + +1. Subscribe to the source `TrackConsumer` and loop on `recv_group()`. +2. For each group, spawn/track a buffer that drains `read_frame()` into an in-RAM + `BufferedGroup { sequence, frames: Vec<(Option, Bytes)> }`. Because groups + arrive concurrently, several buffers are open at once, keyed by sequence. +3. When a group's `finished()` resolves, mark it flushable. Incomplete groups never leave RAM + (we cannot serve a half-group). +4. A flusher batches flushable groups and writes them as one **segment object** plus appended + **index entries** when either threshold trips: a byte-size target (e.g. a few MB) or a time + window (the RAM retention duration). Batching is what keeps audio from making a file per + frame. +5. Tier maintenance runs on a timer: promote aged segments disk -> S3 (optionally aggregating + several small disk segments into one larger S3 object), and delete objects past each tier's + retention. + +### Reader (egress / serve) + +1. 
Hold a `TrackProducer` for the recorded track plus a `TrackDynamic`. Publish the broadcast + into an origin/session so consumers can reach it. +2. Loop on `TrackDynamic::requested_group()`. For each `GroupRequest(seq)`: + - Look up `seq` in the `Index` to find `(store, object key, offset, length)`. + - `store.get_opts(key, GetRange::Bounded(offset..offset+length))` -> the segment slice for + that one group (a single range request, S3-friendly). + - Parse the group's frames, `request.accept()`, and stream them into the `GroupProducer` + (honoring `frame_start` by skipping the first N frames; see Open questions). + - On a miss (seq never recorded or already evicted) reject with `Error::NotFound`. + +Reader and Writer are independent tasks sharing `Storage`; an archive process can run one or +both. The in-RAM tier doubles as a serving cache: a FETCH for a still-buffered recent group is +served from memory without touching disk. + +## Storage layout + +Everything is an `object_store` key, so the same code paths work for a local dir +(`LocalFileSystem`) and a bucket (`AmazonS3`). Proposed key scheme, rooted at a configurable +prefix and namespaced by broadcast/track: + +``` +///segments/ # concatenated groups +///index/.idx # entries for that segment (or one rolling index) +``` + +### Segment format + +A segment is a concatenation of groups. Each group is self-delimiting so a ranged GET of just +its slice is independently parseable: + +``` +group := group_header frame* +group_header := varint(sequence) varint(frame_count) +frame := varint(size) flags ts? payload[size] + flags: 1 byte; bit0 = timestamp present + ts: varint(zigzag delta vs previous frame ts in this group) # when bit0 set +``` + +This mirrors moq-net's own frame coding (size-prefixed, optional zigzag-delta timestamp) so +there is no information loss across a record/serve round-trip. 
The varint/zigzag helpers are +small; if moq-net's `coding` module is exported as `pub` we reuse it, otherwise a +~30-line local copy (the wire format is stable). Frame payloads are stored verbatim; +compression is a later option. + +### Index format + +The index maps group sequence to its physical location. One entry per group: + +```rust +struct IndexEntry { + sequence: u64, // group sequence (NOT necessarily contiguous or sorted) + segment: SegmentId, // which segment object + offset: u64, // byte offset of the group within the segment + length: u64, // byte length of the group + frames: u32, // frame count (lets us validate / size the GroupProducer) + ts_first: Option, // media timestamp span, used for retention + future seeking + ts_last: Option, + received: u64, // wall-clock ms at completion, retention fallback when ts absent +} +``` + +Because groups complete out of order, entries are appended in completion order, not sequence +order. The reader loads them into a `BTreeMap` (or per-segment index files +merged on startup) for O(log n) seq lookup. For v1 the index is JSON Lines: append-friendly, +trivially debuggable, and small relative to media. If it grows hot we switch the on-disk form +to `postcard`/`bincode` behind the same `Index` type. The in-RAM `BTreeMap` is the source of +truth at runtime; index objects are how we rebuild it after a restart. + +## Tiering and retention + +Three tiers, each optional, each with an optional retention `Duration`: + +| Tier | Backed by | `retain: Option` meaning | +|------|-----------|-----------------------------------| +| RAM | in-process buffers | how long *completed* groups linger in memory after flush (serving cache window). `None` -> drop right after flush. Incomplete groups always stay regardless. | +| Disk | `object_store` `LocalFileSystem` | how long segments stay on local disk before promotion to S3 (and deletion locally). `None` -> stay until evicted by the final-tier rule. 
| +| S3 | `object_store` `AmazonS3` (or GCS/Azure) | how long aggregated objects stay in the cloud. `None` -> keep forever. | + +Rules: +- A tier is *enabled* when its store is configured. Durations only cap retention within an + enabled tier; enabling/disabling a tier is separate from its duration (so "S3 forever" is + `s3.store = Some, s3.retain = None`). +- Data flows strictly downward: RAM -> disk -> S3. Disabling the middle tier (no disk store) + flushes RAM straight to S3. +- Pure RAM mode (no disk, no S3) is a bounded in-memory ring buffer, an ephemeral DVR window. +- Retention clock: prefer the group's **media timestamp** (`ts_last`) when present + (moq-lite-05), else fall back to the **wall-clock `received` time**. Evict a group from a + tier once `now - clock(group) > retain`, deleting the segment/index objects once every group + they contain has aged out (segments are evicted whole, so the flush batch granularity bounds + how long a single live group pins a segment). + +All three live in a `Storage` struct so the writer, reader, and the maintenance timer share +one view. Tier maintenance is a single periodic task: promote, aggregate, delete. + +## Public API sketch + +Smallest surface that does the job, per the repo's public-API guidance. One insulated entry +point per direction, plus a `#[non_exhaustive]` config built via `Default`. + +```rust +/// Where and how long to retain each tier. Build via `Config::default()` then set fields. +#[derive(Clone, Debug, Default)] +#[non_exhaustive] +pub struct Config { + pub disk: Option<TierConfig>, // LocalFileSystem store + retention + pub s3: Option<TierConfig>, // remote object_store + retention + pub ram: Option<Duration>, // completed-group memory window + pub flush: FlushConfig, // batch size + interval thresholds +} + +#[derive(Clone, Debug)] +#[non_exhaustive] +pub struct TierConfig { + pub store: Arc<dyn ObjectStore>, + pub prefix: object_store::path::Path, + pub retain: Option<Duration>, +} + +/// An archive for a single track, over shared tiered storage. 
+pub struct Archive { /* Storage + Index */ } + +impl Archive { + pub fn new(config: Config) -> Result; + + /// Record a live track until it ends or errors. Drains groups, batches, flushes. + pub async fn record(&self, track: TrackConsumer) -> Result<()>; + + /// Serve recorded groups: answers `TrackDynamic` FETCH requests from storage. + /// Pass the producer side of the track you publish into an origin. + pub async fn serve(&self, track: TrackProducer) -> Result<()>; +} +``` + +`serve` takes the `TrackProducer` (it calls `.dynamic()` internally and owns the request +loop), which matches the user's instinct that the serving side is really about `TrackDynamic`. +The caller still owns publishing the broadcast into a session, keeping `moq-archive` free of +networking policy. + +## Binary + +`moq-archive` (the binary) wires the library to a relay, mirroring `moq-cli`: + +- clap config, TOML-loadable. Every `#[arg]` field is `Option` so the TOML->CLI merge does + not clobber file values with `Default` (repo rule; add the regression test like + `moq-relay`). Durations use `humantime-serde`. +- Subcommands: `record --url --broadcast --track ` connects, subscribes, + and records; `serve ...` connects, publishes, and answers fetches. A combined mode runs both. +- Storage flags map onto `object_store` builders: `--disk `, `--s3-url s3://bucket/prefix` + (+ standard AWS env for creds), `--ram 30s --disk 1h --s3 30d`. + +## Out-of-order handling (why it is first-class) + +Groups ride independent QUIC streams, so sequence 7 can finish before sequence 5. The writer +therefore keeps a map of open buffers and only flushes a group on its own `finished()`; it +never assumes contiguity. The index is keyed by sequence but appended in completion order, and +the reader's `BTreeMap` makes lookup order-independent. FETCH is inherently random-access +(consumer asks for an arbitrary old seq), so the read path has no ordering assumptions either. 
+Sequence gaps (a group that was lost upstream and never recorded) are legal: a FETCH for a gap +returns `NotFound`. + +## Open questions + +1. **`frame_start` granularity.** moq-lite-05 FETCH can request "group N starting at frame K". + The cheap path: ranged-GET the whole group, parse, skip K frames in memory (groups are + bounded at 32 MB, so this is fine). The optimization: store per-frame offsets in the index + for a partial ranged GET. Recommend the cheap path for v1, add per-frame offsets only if + profiling demands it. +2. **Restart/recovery.** Rebuild the `BTreeMap` by listing + reading index objects on startup. + Need a crash-consistency story: write the segment object first, then its index entries, so a + half-written segment is simply never indexed (and is GC'd by a startup sweep of unindexed + segments). +3. **Aggregation/compaction shape.** When promoting disk -> S3, do we copy segments 1:1 or + concatenate many small disk segments into one big S3 object (rewriting offsets in the + index)? Concatenation is better for S3 request economics but adds a rewrite step. Lean + toward 1:1 in v1, compaction later. +4. **Serving the *latest* group / live edge.** v1 answers FETCH for past groups. Should the + archive also serve a live `subscribe` (replay newest groups as they land) so it can stand in + for a departed origin? That is closer to DVR and probably a follow-up. +5. **Index for a hot, long archive.** A multi-day archive has a large index. JSONL + in-RAM + `BTreeMap` is fine for v1; a segmented/columnar index (or sqlite) may be needed at scale. +6. **Backpressure.** If storage is slower than ingest, do we drop oldest buffered groups + (bounded memory, lossy) or apply backpressure to the subscription? Recommend a bounded RAM + budget that drops oldest *completed-but-unflushed* groups and records the gap, never blocking + live ingest. 
+ +## Testing plan + +- Unit: segment encode/decode round-trip, including absent vs present timestamps and + zigzag-delta edges; index append + reload; out-of-order completion ordering. +- Storage: run the full record -> flush -> serve loop against `object_store`'s in-memory and + `LocalFileSystem` backends (no network needed). Use `tokio::time::pause()` for retention/tier + timers per the repo's async-test rule. +- Integration: record a synthetic track (audio-shaped: one frame per group), serve it back via + FETCH, assert byte-exact frames and timestamps. Confirm a FETCH for an evicted/gap sequence + returns `NotFound`. +- Config: TOML<->CLI merge regression test (the `Option` flag rule). + +## Cross-package sync + +Per the repo's sync table, a new standalone crate that only *consumes* `moq-net`'s public API +needs no wire/catalog changes. Touch points: + +- Add `rs/moq-archive` to the workspace `members` / `default-members` in the root `Cargo.toml`. +- New docs page under `doc/bin/` for the binary (and a `doc/concept/` note on the archive tier + relative to the relay cache). +- If we end up needing moq-net's `coding` varint/zigzag helpers, that is a small additive + `pub` export in `moq-net` (non-breaking), to avoid duplicating the wire codec. +``` From 470420230eb801bd66535c64ca0552aec39a9733 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 23:38:49 +0000 Subject: [PATCH 02/25] docs(moq-archive): revise design per review Address the review on the planning doc: - serve() takes a TrackDynamic, not a TrackProducer; archive is a composable link in a cache chain (relay -> archive -> origin) and forwards storage misses to an upstream TrackConsumer. Note the open question of folding this into moq_net core types. - Tiering reworked around progressive rollup: each step concatenates multiple units from the tier above (RAM 30s -> 10s disk segments -> 1m S3 objects), so fragmentation drops downward. Resolves the 1:1-vs-concatenate question. 
- Eviction is LRU + size budget, not just age; always keep the latest group in RAM; use moq-net used()/unused() to flush unused groups early. - Nail down the index format: per-segment postcard footer + per-track manifest (Parquet/Iceberg shaped), replacing the JSONL sketch. - Percent-encode broadcast/track names since they contain slashes. - Drop sub-group frame_start (likely removed from moq-lite-05; unsupported by the current API); serve whole groups. - Add a prior-art survey (BookKeeper, Kafka KIP-405, Haystack/Bitcask, Parquet/Iceberg) and why no single embeddable crate covers batch+index+S3. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-archive/DESIGN.md | 291 +++++++++++++++++++++++++++------------ 1 file changed, 202 insertions(+), 89 deletions(-) diff --git a/rs/moq-archive/DESIGN.md b/rs/moq-archive/DESIGN.md index 66577318b..795aae2af 100644 --- a/rs/moq-archive/DESIGN.md +++ b/rs/moq-archive/DESIGN.md @@ -43,6 +43,32 @@ first-class byte-range GETs (`get_opts` / `GetRange`) and multipart uploads. Tha the disk + S3 abstraction we need, so both the "disk" and the "S3" tiers are just two `Arc` instances and serving a group is one ranged GET. +### Approaches we borrow from + +"Many small finite append-only streams, batched into larger objects with an index, tiered to +object storage" is a well-trodden pattern. We are reassembling established techniques, not +inventing one: + +- **Apache BookKeeper "ledgers"** (Pulsar) are the closest match: a ledger is a finite, + append-only, sealed sequence of entries, exactly a MoQ group; Pulsar offloads sealed ledgers + to S3. A track is a chain of ledgers. +- **Kafka log segments + tiered storage (KIP-405)** already solved "don't make a file per + record": batch records into large segments, each with an offset index, and offload old + segments to S3. 
Our segment + per-group offset table is the same shape; the lesson stolen is + a *sparse* index for very large archives (open question 5). +- **Facebook Haystack / f4** and **Bitcask** are the small-object fix: pack many small blobs + into few large files with an in-memory `key -> (file, offset, len)` index. Our in-RAM + `BTreeMap` is a Bitcask keydir. +- **Parquet footer + Iceberg manifest** shape our index: a self-describing per-segment footer + plus a per-track manifest that routes a sequence to a segment. + +No single embeddable crate does all of (batch + index + S3 tiering) for opaque streams: the +systems that do (BookKeeper, Kafka, Pravega) are servers, not libraries. An embedded LSM +(`fjall`, `redb`, or RocksDB BlobDB) would give batching, an index, and compaction for free but +is local-disk only with **no S3 tier**, which is the whole point here. So `object_store` for +the tiers plus our own thin segment/manifest format is the smallest thing that fits; an +embedded LSM as the *disk-tier engine* remains a viable future swap behind the `Storage` trait. + ## Background: the moq-net model we build on The relevant `moq-net` (dev) API, recapped so the design is self-contained: @@ -58,7 +84,7 @@ The relevant `moq-net` (dev) API, recapped so the design is self-contained: - `GroupConsumer::finished() -> Result` resolves once the group is complete, returning the final frame count. This is our "safe to flush" signal. -**Serving side (produce a track + answer fetches):** +**Serving side (answer fetches):** - `TrackProducer::dynamic() -> TrackDynamic`. - `TrackDynamic::requested_group() -> Result` blocks until a consumer FETCHes a group that is not in the producer's cache. While any `TrackDynamic` handle is alive, the @@ -66,12 +92,35 @@ The relevant `moq-net` (dev) API, recapped so the design is self-contained: - `GroupRequest::sequence() -> u64`, `GroupRequest::accept(info) -> GroupProducer`. 
We fill the returned `GroupProducer` with `create_frame` / `write` / `finish` from storage, then `GroupProducer::finish()`. -- To expose the track over a session, wrap the producer in a `BroadcastProducer` and publish - via `OriginProducer::publish_broadcast`, then connect with `moq-native` (same as `moq-cli`). -The two directions are deliberately decoupled: recording needs a `TrackConsumer`, serving -needs a `TrackProducer` + `TrackDynamic`. They share only the storage layer, so an archive -node can do either or both. +The archive's serving entry point is therefore a **`TrackDynamic`, not a `TrackProducer`**: +the caller owns the `TrackProducer` (and publishing the broadcast into a session via +`BroadcastProducer` / `OriginProducer::publish_broadcast`), calls `.dynamic()`, and hands the +archive the request side. This makes the archive one composable link in a fallback chain +rather than a thing that owns the track. + +**The archive is a link in a cache chain.** The caller decides where it sits. For example +`moq-relay` would try the archive first for any dynamic request, and the archive answers from +RAM/disk/S3. On a miss, the request must fall through to the **original publisher** (the live +origin), because a group might never have reached storage. So the archive needs both an +incoming `TrackDynamic` (requests from downstream) *and* an upstream handle to forward misses +to (its own `TrackDynamic` over the upstream track, which it also records). The chain is: + +``` +consumer FETCH -> relay cache -> moq-archive (RAM -> disk -> S3) -> origin publisher +``` + +> **Open architectural question (from review):** this cache-fallback-plus-record behavior +> could instead live *inside* `moq_net::TrackProducer` / `TrackConsumer` themselves. That is a +> friendlier API (the archive becomes a storage backend you attach, not a chain you wire) and +> moq-net would then know precisely when a group is evicted from its RAM cache, which is the +> natural trigger to flush. 
The cost is baking storage concerns into the core wire types, which +> feels out of place there. v1 keeps the logic in `moq-archive` and treats moq-net integration +> as a follow-up; flagged here because it shapes the public API. + +The two directions stay decoupled: recording needs a `TrackConsumer`, serving needs a +`TrackDynamic` (plus an upstream handle for miss fallback). They share only the storage layer, +so an archive node can do either or both. ## Architecture @@ -105,24 +154,26 @@ Two halves, joined by a `Storage` abstraction and an `Index`: arrive concurrently, several buffers are open at once, keyed by sequence. 3. When a group's `finished()` resolves, mark it flushable. Incomplete groups never leave RAM (we cannot serve a half-group). -4. A flusher batches flushable groups and writes them as one **segment object** plus appended - **index entries** when either threshold trips: a byte-size target (e.g. a few MB) or a time - window (the RAM retention duration). Batching is what keeps audio from making a file per - frame. -5. Tier maintenance runs on a timer: promote aged segments disk -> S3 (optionally aggregating - several small disk segments into one larger S3 object), and delete objects past each tier's - retention. +4. A flusher batches flushable groups into one **segment object** (footer included) when a + threshold trips: a byte-size target, the RAM time window, or a group going `unused()` early. + Batching is what keeps audio from making a file per frame. The latest group and any `used()` + groups stay in RAM. +5. Tier maintenance runs on a timer: roll up (concatenate several disk segments into one larger + S3 object, rewriting the manifest), then LRU/age-evict and delete objects past each tier's + budget. ### Reader (egress / serve) -1. Hold a `TrackProducer` for the recorded track plus a `TrackDynamic`. Publish the broadcast - into an origin/session so consumers can reach it. +1. 
Take the incoming `TrackDynamic` (the caller owns the `TrackProducer` and publishes it). + Optionally hold an upstream handle to the origin for miss fallback. 2. Loop on `TrackDynamic::requested_group()`. For each `GroupRequest(seq)`: - Look up `seq` in the `Index` to find `(store, object key, offset, length)`. - `store.get_opts(key, GetRange::Bounded(offset..offset+length))` -> the segment slice for that one group (a single range request, S3-friendly). - - Parse the group's frames, `request.accept()`, and stream them into the `GroupProducer` - (honoring `frame_start` by skipping the first N frames; see Open questions). + - Parse the group's frames, `request.accept()`, and stream them into the `GroupProducer`. + - On a storage miss (not in RAM/disk/S3), forward the request to the upstream origin if present, + relaying (and recording) the result. Only when no upstream is configured, or nothing upstream + can satisfy it, does the request fall through to the final `NotFound` rejection below. - On a miss (seq never recorded or already evicted) reject with `Error::NotFound`. Reader and Writer are independent tasks sharing `Storage`; an archive process can run one or @@ -137,9 +188,16 @@ prefix and namespaced by broadcast/track: ``` ///segments/ # concatenated groups -///index/.idx # entries for that segment (or one rolling index) +///manifest # append-only list of segments (see Index format) ``` +**Broadcast and track names contain slashes** (they are themselves path-shaped, e.g. +`room/alice/camera`). `object_store` paths are `/`-delimited, so the raw name would explode +into spurious directory levels and collide (`a/b` + `c` vs `a` + `b/c`). Percent-encode each +name as a single, reversible path segment before use (encode `/` and any other delimiter), so +`<broadcast>` / `<track>` are opaque components. This keeps `list`-by-prefix working per +broadcast and lets us recover the original names on restart. + ### Segment format A segment is a concatenation of groups. 
Each group is self-delimiting so a ranged GET of just @@ -159,55 +217,100 @@ small; if moq-net's `coding` module is made `pub(crate)`-exportable we reuse it, ~30-line local copy (the wire format is stable). Frame payloads are stored verbatim; compression is a later option. -### Index format +### Index format (nailed down) + +Two levels, modeled on Parquet's self-describing footer plus an Iceberg-style manifest. This +avoids a separate `.idx` object per segment (which would reintroduce the small-object problem) +and makes each segment independently recoverable. -The index maps group sequence to its physical location. One entry per group: +**1. Per-segment footer.** Each segment object ends with its own group table plus a fixed +trailer, so the segment is self-describing: given only the object you can find every group in +it. The table is one record per group: ```rust -struct IndexEntry { +struct GroupEntry { sequence: u64, // group sequence (NOT necessarily contiguous or sorted) - segment: SegmentId, // which segment object - offset: u64, // byte offset of the group within the segment + offset: u64, // byte offset of the group within this segment length: u64, // byte length of the group - frames: u32, // frame count (lets us validate / size the GroupProducer) - ts_first: Option, // media timestamp span, used for retention + future seeking + frames: u32, // frame count (validates / sizes the GroupProducer) + ts_first: Option, // media timestamp span (retention + future seeking) ts_last: Option, - received: u64, // wall-clock ms at completion, retention fallback when ts absent + received: u64, // wall-clock ms at completion; retention fallback when ts absent } + +// segment := group* footer +// footer := postcard(Vec) u32(footer_len) u32(magic) ``` -Because groups complete out of order, entries are appended in completion order, not sequence -order. The reader loads them into a `BTreeMap` (or per-segment index files -merged on startup) for O(log n) seq lookup. 
For v1 the index is JSON Lines: append-friendly, -trivially debuggable, and small relative to media. If it grows hot we switch the on-disk form -to `postcard`/`bincode` behind the same `Index` type. The in-RAM `BTreeMap` is the source of -truth at runtime; index objects are how we rebuild it after a restart. +**2. Per-track manifest.** One append-only object per track listing its segments, so the +reader can route a sequence to a segment without opening every segment footer: + +```rust +struct ManifestEntry { + segment: SegmentId, // object key (relative to the track prefix) + tier: Tier, // Disk | S3 (RAM segments are not in the manifest) + seq_min: u64, seq_max: u64, // sequence range covered (groups out of order, so a range, not a set) + ts_min: Option, ts_max: Option, +} +``` + +**Encoding:** `postcard` for both (compact, `serde`, no schema server; chosen over JSONL so +the footer is fixed-shape and the manifest stays small for multi-day archives). The trailer's +`footer_len` + `magic` let the reader fetch the footer with one tail range GET +(`GetRange::Suffix`) without knowing its size up front. + +**Runtime + recovery:** on startup the reader reads each track manifest, building an in-RAM +`BTreeMap` for O(log n) seq lookup (segment footers are fetched +and cached lazily on first hit). Because groups complete out of order, footer entries are in +completion order; the `BTreeMap` makes lookup order-independent. The manifest is the routing +index; segment footers are the source of truth and let us rebuild a manifest by `list` + +tail-read if one is ever lost. ## Tiering and retention -Three tiers, each optional, each with an optional retention `Duration`: - -| Tier | Backed by | `retain: Option` meaning | -|------|-----------|-----------------------------------| -| RAM | in-process buffers | how long *completed* groups linger in memory after flush (serving cache window). `None` -> drop right after flush. Incomplete groups always stay regardless. 
| -| Disk | `object_store` `LocalFileSystem` | how long segments stay on local disk before promotion to S3 (and deletion locally). `None` -> stay until evicted by the final-tier rule. | -| S3 | `object_store` `AmazonS3` (or GCS/Azure) | how long aggregated objects stay in the cloud. `None` -> keep forever. | - -Rules: -- A tier is *enabled* when its store is configured. Durations only cap retention within an - enabled tier; enabling/disabling a tier is separate from its duration (so "S3 forever" is - `s3.store = Some, s3.retain = None`). -- Data flows strictly downward: RAM -> disk -> S3. Disabling the middle tier (no disk store) - flushes RAM straight to S3. -- Pure RAM mode (no disk, no S3) is a bounded in-memory ring buffer, an ephemeral DVR window. -- Retention clock: prefer the group's **media timestamp** (`ts_last`) when present - (moq-lite-05), else fall back to the **wall-clock `received` time**. Evict a group from a - tier once `now - clock(group) > retain`, deleting the segment/index objects once every group - they contain has aged out (segments are evicted whole, so the flush batch granularity bounds - how long a single live group pins a segment). - -All three live in a `Storage` struct so the writer, reader, and the maintenance timer share -one view. Tier maintenance is a single periodic task: promote, aggregate, delete. +Three tiers, RAM -> disk -> S3, each optional. The key idea (from review): **each rollup step +merges multiple units from the tier above into one larger object, so fragmentation decreases +as data moves down.** RAM can be highly fragmented (one buffer per group, audio makes many); +disk segments coalesce a window of groups; S3 objects coalesce a window of disk segments. + +| Tier | Backed by | Holds for | Flushes downward in | +|------|-----------|-----------|---------------------| +| RAM | in-process buffers | up to e.g. 30s | 10s segments to disk | +| Disk | `object_store` `LocalFileSystem` | up to e.g. 
5m | 1m batches to S3 | +| S3 | `object_store` `AmazonS3` (GCS/Azure) | up to e.g. 30d (or forever) | n/a (final tier) | + +So a group is written many times to RAM individually, rewritten once into a 10s disk segment, +then several disk segments are concatenated into one 1m S3 object. This directly resolves the +old "1:1 copy vs concatenate" open question in favor of **concatenate at every rollup**. + +### Eviction: LRU + size budget, not just age + +Each tier has both a **max age** and a **size budget**. Within a tier, evict **least-recently- +used** first (an LRU keyed by group/segment), capped by the budget; the max age is an upper +bound layered on top. LRU is the right default for both RAM and disk because serving traffic +is bursty and skewed (a re-fetched group is likely to be re-fetched again). Pure-RAM mode (no +disk, no S3) is then a bounded LRU/DVR window rather than a strict ring buffer. + +Two refinements from review: + +- **Always keep the latest group in RAM**, exempt from LRU/age eviction. New subscribers and + the live edge need it immediately, and it is the one group most likely to be requested next. +- **Use moq-net's `used()` / `unused()` group state to flush early.** A group that has gone + `unused` (no active consumer interested) earns nothing by staying in RAM, so fold it into the + next disk flush instead of waiting out the full RAM window. `used()` groups stay hot. This + makes the RAM age a cap, not a fixed delay, and reclaims memory under churn. + +### Retention clock + +For the max-age bound, prefer the group's **media timestamp** (`ts_last`) when present +(moq-lite-05), else fall back to the **wall-clock `received` time**. A tier is *enabled* when +its store is configured; "S3 forever" is `s3.store = Some` with no max age. Data flows strictly +downward, so disabling the middle tier flushes RAM straight to S3. 
Segments/objects are deleted +whole once every group they contain has aged out, so the rollup batch granularity bounds how +long one live group pins an object. + +All tiers live in a `Storage` struct shared by the writer, reader, and a single periodic +maintenance task that does the three jobs: roll up (merge + promote), LRU/age evict, delete. ## Public API sketch @@ -215,14 +318,13 @@ Smallest surface that does the job, per the repo's public-API guidance. One insu point per direction, plus a `#[non_exhaustive]` config built via `Default`. ```rust -/// Where and how long to retain each tier. Build via `Config::default()` then set fields. +/// Per-tier sizing. Build via `Config::default()` then set fields. #[derive(Clone, Debug, Default)] #[non_exhaustive] pub struct Config { - pub disk: Option<TierConfig>, // LocalFileSystem store + retention - pub s3: Option<TierConfig>, // remote object_store + retention - pub ram: Option<Duration>, // completed-group memory window - pub flush: FlushConfig, // batch size + interval thresholds + pub ram: RamConfig, // memory window + budget; always keeps the latest group + pub disk: Option<TierConfig>, // LocalFileSystem store + pub s3: Option<TierConfig>, // remote object_store; no max_age -> keep forever } #[derive(Clone, Debug)] @@ -230,7 +332,9 @@ pub struct Config { pub struct TierConfig { pub store: Arc<dyn ObjectStore>, pub prefix: object_store::path::Path, - pub retain: Option<Duration>, + pub max_age: Option<Duration>, // upper bound; None on S3 means forever + pub budget: Option<u64>, // byte budget, LRU-evicted when exceeded + pub rollup: Duration, // window of upstream units merged into one object here } /// An archive for a single track, over shared tiered storage. @@ -242,16 +346,23 @@ impl Archive { /// Record a live track until it ends or errors. Drains groups, batches, flushes. pub async fn record(&self, track: TrackConsumer) -> Result<()>; - /// Serve recorded groups: answers `TrackDynamic` FETCH requests from storage. - /// Pass the producer side of the track you publish into an origin.
- pub async fn serve(&self, track: TrackProducer) -> Result<()>; + /// Answer FETCH requests from storage. Takes the request side of a track the caller owns + /// and publishes; on a storage miss, forwards to `upstream` (the live origin) if given, + /// otherwise resolves the request to `NotFound`. + pub async fn serve( + &self, + requests: TrackDynamic, + upstream: impl Into<Option<TrackConsumer>>, + ) -> Result<()>; } ``` -`serve` takes the `TrackProducer` (it calls `.dynamic()` internally and owns the request -loop), which matches the user's instinct that the serving side is really about `TrackDynamic`. -The caller still owns publishing the broadcast into a session, keeping `moq-archive` free of -networking policy. +`serve` takes a `TrackDynamic`, not a `TrackProducer`: the caller owns the producer and +publishes the broadcast into a session, so the archive is a composable link in the cache chain +(relay -> archive -> origin) rather than the owner of the track. `upstream` is the miss- +fallback handle and, when recording the same track, the source the writer drains. Per the repo +convention it is `impl Into<Option<TrackConsumer>>`, so callers pass the consumer or `None`. +Folding this into `moq_net` directly is the open architectural question noted above. ## Binary @@ -263,7 +374,8 @@ networking policy. - Subcommands: `record --url <url> --broadcast <broadcast> --track <track>` connects, subscribes, and records; `serve ...` connects, publishes, and answers fetches. A combined mode runs both. - Storage flags map onto `object_store` builders: `--disk <path>`, `--s3-url s3://bucket/prefix` - (+ standard AWS env for creds), `--ram 30s --disk 1h --s3 30d`. + (+ standard AWS env for creds). Each tier takes a max age, a byte budget, and a rollup window, + e.g. `--ram-age 30s --disk-age 5m --disk-rollup 10s --s3-age 30d --s3-rollup 1m`. ## Out-of-order handling (why it is first-class) @@ -277,28 +389,29 @@ returns `NotFound`. ## Open questions -1. **`frame_start` granularity.** moq-lite-05 FETCH can request "group N starting at frame K".
- The cheap path: ranged-GET the whole group, parse, skip K frames in memory (groups are - bounded at 32 MB, so this is fine). The optimization: store per-frame offsets in the index - for a partial ranged GET. Recommend the cheap path for v1, add per-frame offsets only if - profiling demands it. -2. **Restart/recovery.** Rebuild the `BTreeMap` by listing + reading index objects on startup. - Need a crash-consistency story: write the segment object first, then its index entries, so a - half-written segment is simply never indexed (and is GC'd by a startup sweep of unindexed - segments). -3. **Aggregation/compaction shape.** When promoting disk -> S3, do we copy segments 1:1 or - concatenate many small disk segments into one big S3 object (rewriting offsets in the - index)? Concatenation is better for S3 request economics but adds a rewrite step. Lean - toward 1:1 in v1, compaction later. -4. **Serving the *latest* group / live edge.** v1 answers FETCH for past groups. Should the - archive also serve a live `subscribe` (replay newest groups as they land) so it can stand in - for a departed origin? That is closer to DVR and probably a follow-up. -5. **Index for a hot, long archive.** A multi-day archive has a large index. JSONL + in-RAM - `BTreeMap` is fine for v1; a segmented/columnar index (or sqlite) may be needed at scale. +1. **moq-net integration (the big one).** Should the cache-fallback-plus-record behavior live + inside `moq_net::TrackProducer` / `TrackConsumer` instead of being wired as a chain by the + caller? Friendlier API and moq-net would know exactly when a group leaves its RAM cache (the + natural flush trigger), at the cost of putting storage concerns in the core wire types. See + the architecture section. Decide before locking the public API. +2. **Restart/recovery.** Rebuild the in-RAM `BTreeMap` from each track manifest on startup; + refetch segment footers lazily. 
Crash-consistency: write the segment object (footer last) + before appending to the manifest, so a half-written segment is simply unreferenced and a + startup `list` sweep GCs any segment missing from the manifest. +3. **Sub-group FETCH.** Earlier drafts let a FETCH start at frame K within a group. This is + likely being **removed from moq-lite-05** and the current API does not support it, so the + archive serves whole groups only. If it returns, the cheap path (ranged-GET the group, skip + K frames in memory; groups are bounded at 32 MB) suffices before adding per-frame offsets. +4. **Serving the live edge.** Keeping the latest group in RAM plus upstream miss-fallback lets + the archive answer recent FETCHes and stand in for a departed origin. A full live + `subscribe` replay (DVR-style) is a possible follow-up beyond FETCH. +5. **Very large archives.** The manifest + lazily-cached footers handle a multi-day archive, + but a months-long one may want manifest sharding (per time bucket) so startup does not read + the whole thing. Defer until the single-manifest form is measured. 6. **Backpressure.** If storage is slower than ingest, do we drop oldest buffered groups (bounded memory, lossy) or apply backpressure to the subscription? Recommend a bounded RAM - budget that drops oldest *completed-but-unflushed* groups and records the gap, never blocking - live ingest. + budget that LRU-drops oldest *completed-but-unflushed* groups and records the gap, never + blocking live ingest. ## Testing plan From 9304642057bd9584bf972008f8e3e0d2e7fc3d3d Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 23:46:27 +0000 Subject: [PATCH 03/25] docs(moq-archive): add usage mockups, show the relay gap Add a Usage section with concrete call sites: - Scenario A: a standalone VOD node serving a recorded broadcast back; the per-track serve(TrackDynamic) API fits because the archive is the publisher. - Scenario B: why moq-relay cannot use the per-track API as-is. 
The relay only holds an OriginProducer and forwards whole broadcasts; the per-track TrackProducer + dynamic() is created inside moq-net's subscriber fan-out, so there is no seam to hand the archive a TrackDynamic. - Scenario C: the recommended moq-net seam, a pluggable Cache trait consulted on a miss and notified on eviction, attached to the origin, reducing the relay integration to one line. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-archive/DESIGN.md | 75 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/rs/moq-archive/DESIGN.md b/rs/moq-archive/DESIGN.md index 795aae2af..b296ef732 100644 --- a/rs/moq-archive/DESIGN.md +++ b/rs/moq-archive/DESIGN.md @@ -362,7 +362,80 @@ publishes the broadcast into a session, so the archive is a composable link in t (relay -> archive -> origin) rather than the owner of the track. `upstream` is the miss- fallback handle and, when recording the same track, the source the writer drains. Per the repo convention it is `impl Into>`, so callers pass the consumer or `None`. -Folding this into `moq_net` directly is the open architectural question noted above. + +## Usage mockups + +These compile-in-spirit sketches answer "how is this actually called?" and surface a real gap: +the per-track API above serves a **standalone / VOD** node cleanly, but `moq-relay` has no +seam to use it, which is the argument for the moq-net integration below. + +### A. Standalone VOD node (works with the per-track API) + +A node that recorded a broadcast earlier and now serves it back. Here the archive *is* the +publisher: the live broadcast is gone, so there is no path collision and the per-track API +fits. Recording is symmetric (`origin.consume()` -> `BroadcastConsumer::track` -> +`Archive::record`). + +```rust +let archive = Archive::new(config)?; + +// Publish a broadcast; its tracks answer FETCH from storage. 
+let broadcast = BroadcastInfo::new().produce(); +origin.publish_broadcast("vod/room-alice", broadcast.consume())?; + +// For each track a downstream subscriber asks for, serve it from storage. +let mut tracks = broadcast.dynamic(); +while let Ok(request) = tracks.requested_track().await { + let producer = request.accept(TrackInfo::default())?; // caller owns the producer + let requests = producer.dynamic(); // archive gets the request side + tokio::spawn(archive.serve(requests, None)); // no upstream: pure VOD +} +``` + +### B. Why `moq-relay` cannot use the per-track API as-is + +The relay is built entirely around a single `OriginProducer` (`Cluster::origin`). Remote +publishers `publish_broadcast` into it; downstream sessions read `origin.consume()`. The relay +code never constructs a `TrackProducer` and never calls `.dynamic()` on a track. That happens +*inside* moq-net's session fan-out (`lite::subscriber` / `ietf::subscriber`), which creates the +per-track producer and its `TrackDynamic` to forward a downstream cache-miss FETCH upstream. + +So there is no point in `moq-relay` where you could write `archive.serve(track_dynamic, ...)`: +the relay operates one layer up, at the broadcast/origin granularity, and the track objects the +archive needs only exist transiently deep inside moq-net. Wiring the per-track API into the +relay would mean interposing on every track of every forwarded broadcast (republishing each +broadcast through an archive-owned `BroadcastProducer`, re-`accept`ing every `requested_track`, +re-`subscribe`ing upstream), i.e. reimplementing the relay's fan-out around the archive. That +is the "wire a fallback chain" cost, and it is large. + +### C. The moq-net seam (what actually makes the relay one line) + +Give moq-net a pluggable cache backend that it consults on a miss and notifies on eviction, +attached where it already owns the per-track RAM cache (the origin, flowing down to each +`TrackProducer`). 
The archive implements the trait; the relay attaches it once. + +```rust +// moq-net (new): the one hook the relay can't get from outside is *when* a group is evicted. +pub trait Cache: Send + Sync + 'static { + /// A group aged out of the RAM cache. Persist it (called with the finished group). + fn store(&self, track: &TrackInfo, group: GroupConsumer); + + /// A consumer fetched a group not in RAM. Produce it from storage into `request`, + /// or return it unserved so moq-net falls through to the upstream wire FETCH. + async fn fetch(&self, track: &TrackInfo, request: GroupRequest); +} + +impl moq_net::Cache for Archive { /* store -> writer, fetch -> reader */ } + +// moq-relay: the entire integration. +let origin = Origin::random().produce().with_cache(archive); +``` + +Now the relay keeps working at the origin level, the per-track plumbing stays inside moq-net, +and the archive transparently catches evictions (the natural flush trigger) and serves misses +before they cost an upstream round-trip. This is the recommended shape; it makes the public +`Archive` a `Cache` impl plus the standalone `record`/`serve` helpers from scenario A, rather +than the chain wiring. Decision needed before the API is locked (see open question 1). ## Binary From d156900ad3e09f3a3f978af115b83aa6555df9bc Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 14:59:08 +0000 Subject: [PATCH 04/25] docs(moq-net): add per-track cache spike; reconcile archive doc Add rs/moq-net/CACHE.md: a spike for a per-track group cache owned by TrackProducer. Concrete CacheConfig value (no trait, no callback), per-track [min, max] bounds on size and duration, watermark flush that batches the max-min band into one segment (which an LRU cannot do), RAM -> disk -> remote tiers via object_store behind a feature flag, served by ranged read with no fault-in. Removes the wire-visible TrackInfo.cache in favor of local, producer-owned policy. Adds an interval flush backstop for low-rate tracks. 
Reconcile rs/moq-archive/DESIGN.md scenario C and open question 1 to the concrete cache (drop the rejected Cache trait sketch) and cross-link the spike. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-archive/DESIGN.md | 54 ++++++------ rs/moq-net/CACHE.md | 184 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+), 28 deletions(-) create mode 100644 rs/moq-net/CACHE.md diff --git a/rs/moq-archive/DESIGN.md b/rs/moq-archive/DESIGN.md index b296ef732..903149887 100644 --- a/rs/moq-archive/DESIGN.md +++ b/rs/moq-archive/DESIGN.md @@ -408,34 +408,32 @@ broadcast through an archive-owned `BroadcastProducer`, re-`accept`ing every `re re-`subscribe`ing upstream), i.e. reimplementing the relay's fan-out around the archive. That is the "wire a fallback chain" cost, and it is large. -### C. The moq-net seam (what actually makes the relay one line) +### C. The moq-net seam: a concrete per-track cache (no trait, no callback) -Give moq-net a pluggable cache backend that it consults on a miss and notifies on eviction, -attached where it already owns the per-track RAM cache (the origin, flowing down to each -`TrackProducer`). The archive implements the trait; the relay attaches it once. +Rather than a `Cache` trait moq-net calls back into (rejected in review: no inversion of +control), the storage lives in a concrete `CacheConfig` value owned by each `TrackProducer`. +moq-net retains and serves groups itself, spilling to disk or remote object storage per the +config. The relay attaches a config; the archive crate just provides the tier setup and reads +the spilled segments. See [`../moq-net/CACHE.md`](../moq-net/CACHE.md) for the full spike. ```rust -// moq-net (new): the one hook the relay can't get from outside is *when* a group is evicted. -pub trait Cache: Send + Sync + 'static { - /// A group aged out of the RAM cache. Persist it (called with the finished group). 
- fn store(&self, track: &TrackInfo, group: GroupConsumer); - - /// A consumer fetched a group not in RAM. Produce it from storage into `request`, - /// or return it unserved so moq-net falls through to the upstream wire FETCH. - async fn fetch(&self, track: &TrackInfo, request: GroupRequest); -} - -impl moq_net::Cache for Archive { /* store -> writer, fetch -> reader */ } - -// moq-relay: the entire integration. -let origin = Origin::random().produce().with_cache(archive); +// moq-net (new): local cache policy, per-track, never on the wire. +let config = moq_net::CacheConfig { + ram: bounds(20.s(), 30.s()), // keep 20-30s in RAM + disk: Some(disk_tier(path, 4.m(), 5.m())), + remote: Some(remote_tier(s3, 30.days())), + ..Default::default() +}; + +// each track the relay creates: producer.with_cache(config.clone()) ``` -Now the relay keeps working at the origin level, the per-track plumbing stays inside moq-net, -and the archive transparently catches evictions (the natural flush trigger) and serves misses -before they cost an upstream round-trip. This is the recommended shape; it makes the public -`Archive` a `Cache` impl plus the standalone `record`/`serve` helpers from scenario A, rather -than the chain wiring. Decision needed before the API is locked (see open question 1). +Retention is per-track `[min, max]` bounds with a watermark flush (the `max - min` band becomes +one segment, so audio does not produce a file per frame). There is no eviction callback: the +archive learns what to persist because moq-net writes the spilled segments directly in the +shared on-tier format. This keeps the relay declarative (config, not loops) and keeps storage +concerns out of moq-net's behavior surface. Decision settled; the open API work is threading one +config onto the tracks moq-net auto-creates during fan-out (the Origin follow-up). ## Binary @@ -462,11 +460,11 @@ returns `NotFound`. ## Open questions -1. 
**moq-net integration (the big one).** Should the cache-fallback-plus-record behavior live - inside `moq_net::TrackProducer` / `TrackConsumer` instead of being wired as a chain by the - caller? Friendlier API and moq-net would know exactly when a group leaves its RAM cache (the - natural flush trigger), at the cost of putting storage concerns in the core wire types. See - the architecture section. Decide before locking the public API. +1. **moq-net integration.** Settled on a concrete per-track `CacheConfig` owned by + `TrackProducer` (no trait, no callback); see scenario C and [`../moq-net/CACHE.md`](../moq-net/CACHE.md). + The remaining work is on the moq-net side: threading one config onto the tracks moq-net + auto-creates during fan-out, removing `TrackInfo.cache`, and (separately) the Origin split + that lets a relay register dynamic broadcast/track handlers. 2. **Restart/recovery.** Rebuild the in-RAM `BTreeMap` from each track manifest on startup; refetch segment footers lazily. Crash-consistency: write the segment object (footer last) before appending to the manifest, so a half-written segment is simply unreferenced and a diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md new file mode 100644 index 000000000..a4ce529ca --- /dev/null +++ b/rs/moq-net/CACHE.md @@ -0,0 +1,184 @@ +# moq-net track cache (spike) + +> Status: design spike, no implementation yet. Targets `dev`: it removes a public/wire field +> (`TrackInfo.cache`) and adds local API to `TrackProducer`. + +A per-track group cache owned by a `TrackProducer`. It lets a relay or edge retain recent +groups past the live window and serve them back on a FETCH, optionally spilling to local disk +or remote object storage. This is the moq-net mechanism the `moq-archive` crate builds on (see +`../moq-archive/DESIGN.md`); the on-tier byte format is shared with that crate. 
+ +## Principles + +These come from design review and pin down the shape: + +- **Owned by `TrackProducer` only.** The cache is local policy set by whoever holds the + producer (the relay or edge), never by the original publisher and never carried on the wire. + This is why `TrackInfo.cache` goes away (see "Removing TrackInfo.cache"). +- **Per-track bounds, no shared LRU.** Each track keeps a `[min, max]` window of its own recent + groups. There is no cross-track accounting, so no shared lock and no contention. The cost is + that there is no global RAM ceiling; total footprint is the sum of per-track `max` across + live tracks. A global backstop, if ever needed, is additive and not part of v1. +- **No traits, no callbacks.** `CacheConfig` is a concrete value you configure and attach. moq-net + owns all behavior. The disk and remote backends are an internal, configured `object_store`, + not a consumer-implemented extension point. +- **Watermark flush, not per-item eviction.** Groups accumulate to the high watermark, then a + whole band (the `max - min` worth) is flushed as one segment. This is the property an LRU + cannot provide: an LRU evicts one group the instant the budget trips, producing one tiny + object per group, which is fatal for audio (a group per frame). The watermark is what creates + batches. + +## Bounds + +Per track, on both size and duration, whichever trips first: + +```rust +/// Local cache policy for a single TrackProducer. Not on the wire, not in TrackInfo. +#[derive(Clone, Debug, Default)] +#[non_exhaustive] +pub struct CacheConfig { + pub ram: Bounds, // keep >= min in RAM; flush the band once > max + pub disk: Option<Tier>, // local object_store + its own Bounds + pub remote: Option<Tier>, // remote object_store + its own Bounds + pub interval: Option<Duration>, // max wall-clock before a partial band flushes anyway +} + +/// A low/high watermark. The gap (max - min) is the flush batch size.
+pub struct Bounds { pub min: Limit, pub max: Limit } + +/// A bound expressed as a duration, a byte count, or both (first to trip wins). +pub struct Limit { pub duration: Option<Duration>, pub bytes: Option<u64> } + +/// A persistent tier: an object_store plus its own retention bounds. +pub struct Tier { pub store: /* path or url */ (), pub bounds: Bounds } +``` + +The flush batch is implicitly `max - min`, so the bounds map straight onto the tiering the +archive doc describes: + +| Want | Set | +|---|---| +| keep 30s in RAM, flush 10s segments to disk | `ram.min = 20s`, `ram.max = 30s` | +| keep 5m on disk, flush 1m objects to remote | `disk.min = 4m`, `disk.max = 5m` | + +At 30s the buffer drains back to 20s, emitting a 10s segment, then refills over the next 10s. +No explicit batch size: the band is the batch. + +`interval` is a backstop so a low data-rate track still flushes eventually instead of holding a +half-full band for a long time. A duration-based `max` already covers most of this (the oldest +group ages past `max` even with little data), so `interval` matters chiefly when the bounds are +byte-only.
+ +## State and flush + +Each track owns a small buffer plus an index of what has been flushed where: + +```rust +struct TrackCache { + ram: VecDeque, // recent groups, ordered by sequence + ram_bytes: u64, + flushed: BTreeMap, // sequence -> (tier, object key, offset) for serving + last_flush: Instant, +} +``` + +Flush runs on group completion and on a timer: + +```text +if over(ram.max) || ram.interval elapsed with a flushable band: + batch = drain oldest completed, unpinned groups until back to ram.min + match disk: + Some(d) => segment = serialize(batch) // archive segment format + d.put(key, segment) + for g in batch { flushed[g.seq] = Disk(key, offset) } + None => drop(batch) // RAM-only cache: just evict +// the disk tier runs the same watermark loop against disk.max, concatenating several +// small segments into one larger remote object (the rollup) and updating `flushed`. +``` + +## Serving + +```text +get(seq): + if let Some(g) = ram.find(seq) -> serve(g) // RAM hit, pin while read + if let Some(loc) = flushed.get(seq) -> stream_from_tier(loc) // ranged GET, no fault-back + else -> None // miss: upstream / Unroutable +``` + +A lower-tier hit streams straight from disk or remote via a ranged read. There is no fault-in +and no re-population of RAM, so a group lives in exactly one tier and is served from there. This +is what makes the watermark model simpler than an LRU, which needs to move items back up on +access. + +## Always-latest and pinning + +- **The latest group is never evicted.** It sits inside `ram.min` by construction, so this is + free, and it is the group a new subscriber needs first. +- **A pinned (actively read) group is never flushed.** A `GroupConsumer` handed out from the + cache holds a pin (hooked into the group's existing refcount); the flush skips pinned groups + and emits the rest of the band. Old groups are rarely pinned, so segments stay contiguous in + practice. 
If strictly contiguous segments are ever required, hold the batch until the pin + clears instead. + +## Tiers + +RAM is always present and dependency-free. disk and remote are `object_store`, behind a +`cache-tiered` feature flag so RAM-only native builds (and any wasm consumers) do not pull the +cloud stack. The on-tier bytes reuse the `moq-archive` segment plus manifest format, so the +cache and the archive crate agree byte-for-byte and a relay's spilled data is directly readable +by an archive node. + +## Integration with TrackProducer / TrackState + +Today `TrackState.groups` is the inline per-track cache, bounded by the `TrackInfo.cache` +duration. With a `CacheConfig` attached: + +- finished groups beyond `ram.min` move from the inline buffer into the cache's RAM tier; +- a `get_group` or `dynamic()` miss consults the cache (RAM, then disk, then remote) before + failing with `NotFound`; +- nothing reads `TrackInfo.cache` any more. + +The attach point is one local method: + +```rust +impl TrackProducer { + /// Attach a local cache. Retains and serves groups per `config`, independent of any + /// retention the original publisher set. + pub fn with_cache(self, config: CacheConfig) -> Self; +} +``` + +## Removing TrackInfo.cache + +`TrackInfo.cache` is a producer-set, wire-serialized duration. It conflates "how long the +publisher keeps groups for late subscribers" with "cache policy," and a relay should not +inherit the publisher's number to size its own cache. Since the cache here is local and fully +independent: + +- stop using `TrackInfo.cache` to size anything; +- remove the field from `TrackInfo`. This is a public-API and wire change, hence the `dev` + target. If a producer-side retention knob is still wanted, it stays internal to the producer + rather than on the shared `TrackInfo`. + +## Per-binary use + +- **moq-cli:** no cache, or a small RAM-only `CacheConfig` for a single track. 
+- **moq-relay:** one `CacheConfig` template applied to every track it creates. Threading that + config onto the tracks moq-net auto-creates during fan-out is the Origin follow-up; here it is + just `TrackProducer::with_cache(config)`. A relay RAM cache that spills to disk or S3 becomes + configuration, not code. +- **moq-edge:** the same, plus its own dynamic-handler business logic on top. + +## Open questions + +1. **object_store in moq-net.** Feature-gate `cache-tiered`; RAM-only stays dependency-free. + This is the one heavy dependency decision, since moq-net is the core wire crate. +2. **Async get.** RAM hits must stay synchronous (serve under the lock); only disk and remote + faults are async. The return type needs a "ready now or pending" shape, matching moq-net's + existing `kio::Pending`. +3. **Default bounds.** With `TrackInfo.cache` gone, pick a conservative RAM-only default so an + unconfigured `TrackProducer` behaves like today: a small recent window, no spill. +4. **Footprint.** Per-track bounds mean total RAM is the sum of `ram.max` across live tracks. + Keep the default modest and document footprint = bound times track count. +5. **Pinned groups mid-band.** Skip and flush around them, or hold the batch until unpinned. + Skipping is simpler and old groups are rarely pinned; revisit only if it bites. From 17368b5bf1db4a27e1989814ab17ebaa5e6e75cf Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 16:24:39 +0000 Subject: [PATCH 05/25] docs(moq-net): add moq-cli cache flag design to spike Sketch the CacheArgs clap group for `moq serve` / `moq accept`: --cache-ram (+ -min), --cache-disk(-age), --cache-remote(-age), --cache-interval, mapping onto the per-track [min, max] bounds and the RAM -> disk -> remote cascade. Absent --cache-ram leaves caching off. Design only; wiring waits on the moq_net::CacheConfig API. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/CACHE.md | 75 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index a4ce529ca..57675a947 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -162,13 +162,86 @@ independent: ## Per-binary use -- **moq-cli:** no cache, or a small RAM-only `CacheConfig` for a single track. +- **moq-cli:** no cache, or a small RAM-only `CacheConfig` for a single track. See "moq-cli + flags" below for the concrete surface. - **moq-relay:** one `CacheConfig` template applied to every track it creates. Threading that config onto the tracks moq-net auto-creates during fan-out is the Origin follow-up; here it is just `TrackProducer::with_cache(config)`. A relay RAM cache that spills to disk or S3 becomes configuration, not code. - **moq-edge:** the same, plus its own dynamic-handler business logic on top. +## moq-cli flags + +The cache is most useful on the commands that run a local origin and serve a broadcast back +(`moq serve`, `moq accept`), so a flattened `CacheArgs` group lands on those. The flags map onto +the `[min, max]` bounds and the tier cascade; an absent `--cache-ram` means no cache (today's +behavior). This is the proposed surface; wiring it waits on the `moq_net::CacheConfig` API. + +```rust +/// Retain recent groups so late subscribers and FETCHes get old content. +/// Absent `--cache-ram` leaves caching off. +#[derive(clap::Args, Clone, Default)] +pub struct CacheArgs { + /// Keep up to this much of each track's recent groups in RAM (high watermark). + /// Setting it enables the cache. e.g. `30s`. + #[arg(long, value_parser = humantime::parse_duration)] + pub cache_ram: Option, + + /// RAM low watermark; a flush drains down to this, and the band between the two + /// becomes one segment. Defaults to two-thirds of `--cache-ram`. 
+ #[arg(long, value_parser = humantime::parse_duration)] + pub cache_ram_min: Option, + + /// Also retain on local disk at this path (spill from RAM). + #[arg(long)] + pub cache_disk: Option, + + /// How long to keep groups on disk before rolling up to remote (or dropping if + /// no remote tier). e.g. `5m`. + #[arg(long, value_parser = humantime::parse_duration)] + pub cache_disk_age: Option, + + /// Also retain in remote object storage, e.g. `s3://bucket/prefix`. + #[arg(long)] + pub cache_remote: Option, + + /// How long to keep groups in remote storage. Omit to keep forever. + #[arg(long, value_parser = humantime::parse_duration)] + pub cache_remote_age: Option, + + /// Flush a partial RAM band after this long even below the high watermark, so a + /// low data-rate track still spills. Mostly redundant with a duration `--cache-ram`. + #[arg(long, value_parser = humantime::parse_duration)] + pub cache_interval: Option, +} +``` + +`CacheArgs` flattens into `Serve` and `Accept` (the relay-running commands), e.g. + +```text +moq serve --broadcast bbb --cache-ram 30s --cache-disk /var/cache/moq --cache-disk-age 5m \ + --cache-remote s3://moq-archive/bbb --cache-remote-age 30d fmp4 < bbb.mp4 +``` + +and converts to the config the producer takes: + +```rust +impl CacheArgs { + /// None when `--cache-ram` is unset (caching disabled). + pub fn config(&self) -> Option { /* map flags -> bounds + tiers */ } +} + +// in run_serve / run_accept, for each track produced: +if let Some(config) = cache.config() { + producer = producer.with_cache(config.clone()); +} +``` + +Notes: byte-budget variants (`--cache-ram-bytes`, etc.) are additive later; duration bounds +cover the common case. moq-cli parses straight from clap (no TOML merge), so plain +`Option` is fine here. The relay (`rs/moq-relay`), which does merge TOML, would carry +the same flags under its `Option` clobber rule. + ## Open questions 1. 
**object_store in moq-net.** Feature-gate `cache-tiered`; RAM-only stays dependency-free. From 18894baa43d6ac3de7c54cc10e2e710dd11f9abb Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 16:27:56 +0000 Subject: [PATCH 06/25] docs(moq-net): make the cache shareable across producer and consumer The cache becomes a cloneable `Cache` handle (built from CacheConfig) whose clone shares the same tiers, so one cache can back both a track's TrackProducer and its TrackConsumer. Add TrackConsumer::with_cache and spell out the consumer fetch semantics: fetch_group / get_group resolve from the cache first (RAM sync, disk/remote after a ranged read), miss falls through to the wire and populates the cache, and live subscribe groups populate it too. A cache-backed consumer with no upstream answers FETCH straight from storage (the archive serve path). Inserts dedup by sequence so sharing one cache across both sides is safe. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/CACHE.md | 81 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index 57675a947..db0ac1a87 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -12,9 +12,11 @@ or remote object storage. This is the moq-net mechanism the `moq-archive` crate These come from design review and pin down the shape: -- **Owned by `TrackProducer` only.** The cache is local policy set by whoever holds the - producer (the relay or edge), never by the original publisher and never carried on the wire. - This is why `TrackInfo.cache` goes away (see "Removing TrackInfo.cache"). +- **Local, not on the wire.** The cache is local policy set by whoever holds a track endpoint + (the relay or edge), never by the original publisher and never carried on the wire. This is + why `TrackInfo.cache` goes away (see "Removing TrackInfo.cache"). 
The handle is **shareable**: + one cache can back both a track's `TrackProducer` and its `TrackConsumer` (see "Attaching to a + producer or a consumer"). - **Per-track bounds, no shared LRU.** Each track keeps a `[min, max]` window of its own recent groups. There is no cross-track accounting, so no shared lock and no contention. The cost is that there is no global RAM ceiling; total footprint is the sum of per-track `max` across @@ -128,26 +130,62 @@ cloud stack. The on-tier bytes reuse the `moq-archive` segment plus manifest for cache and the archive crate agree byte-for-byte and a relay's spilled data is directly readable by an archive node. -## Integration with TrackProducer / TrackState +## Attaching to a producer or a consumer -Today `TrackState.groups` is the inline per-track cache, bounded by the `TrackInfo.cache` -duration. With a `CacheConfig` attached: - -- finished groups beyond `ram.min` move from the inline buffer into the cache's RAM tier; -- a `get_group` or `dynamic()` miss consults the cache (RAM, then disk, then remote) before - failing with `NotFound`; -- nothing reads `TrackInfo.cache` any more. - -The attach point is one local method: +The cache is a `Cache` handle built from a `CacheConfig`. It is cloneable and a clone shares the +same store (RAM tier and disk/remote tiers), so **one cache can back both a track's producer and +its consumer**. ```rust +/// A live, shareable per-track store. Clone shares the underlying tiers. +#[derive(Clone)] +pub struct Cache { /* Arc inside */ } +impl Cache { pub fn new(config: CacheConfig) -> Self; } +impl From for Cache { /* Cache::new */ } + impl TrackProducer { - /// Attach a local cache. Retains and serves groups per `config`, independent of any - /// retention the original publisher set. - pub fn with_cache(self, config: CacheConfig) -> Self; + /// Retain and serve groups this producer creates, per the cache. Independent of any + /// retention the original publisher set. 
Accepts a `Cache` or a `CacheConfig`. + pub fn with_cache(self, cache: impl Into) -> Self; } + +impl TrackConsumer { + /// Back this consumer's `fetch_group` / `get_group` with the cache: hits resolve locally, + /// and groups read off the wire populate it. + pub fn with_cache(self, cache: impl Into) -> Self; +} +``` + +Sharing one store across both endpoints of a track: + +```rust +let cache = Cache::new(config); +let producer = producer.with_cache(cache.clone()); +let consumer = consumer.with_cache(cache); // same groups, one set of tiers ``` +### Producer side +`TrackState.groups` (today's inline buffer, bounded by the now-removed `TrackInfo.cache`) is +backed by the cache: finished groups beyond `ram.min` move into the RAM tier, and a `get_group` +or `dynamic()` miss consults the cache (RAM, then disk, then remote) before `NotFound`. + +### Consumer side (fetch) +`TrackConsumer::fetch_group(seq)` and `get_group(seq)` check the cache first: + +- **hit** (RAM, disk, or remote) resolves the `kio::Pending` locally with no wire FETCH (RAM + synchronously, disk/remote after the ranged read); +- **miss** falls through to the normal wire FETCH, and the result is inserted into the cache so a + repeat is local. + +Groups arriving on a live `subscribe` also populate the cache, so a consumer that watched a +track can later fetch its recent groups without going back upstream. This is exactly the +archive's serve path: a cache-backed `TrackConsumer`, with no live upstream, answers FETCH +straight from disk/remote. + +A shared cache makes the two directions symmetric: groups a producer creates are fetchable +through a consumer of the same track, and groups a consumer pulled off the wire are servable by +the producer. Inserts dedup by sequence, so attaching one cache to both sides is safe. + ## Removing TrackInfo.cache `TrackInfo.cache` is a producer-set, wire-serialized duration. 
It conflates "how long the @@ -228,12 +266,15 @@ and converts to the config the producer takes: ```rust impl CacheArgs { /// None when `--cache-ram` is unset (caching disabled). - pub fn config(&self) -> Option { /* map flags -> bounds + tiers */ } + pub fn cache(&self) -> Option { /* flags -> CacheConfig -> Cache::new */ } } -// in run_serve / run_accept, for each track produced: -if let Some(config) = cache.config() { - producer = producer.with_cache(config.clone()); +// in run_serve / run_accept: build the shared store once, then hand a clone to each endpoint. +let cache = args.cache(); +if let Some(cache) = &cache { + producer = producer.with_cache(cache.clone()); + // a TrackConsumer of the same track also takes cache.clone(), so its fetch_group is + // served locally (RAM, then disk/remote). } ``` From 8dd14bc89b4cf6cf2967f98fa3dba28a688bcbfe Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 16:39:57 +0000 Subject: [PATCH 07/25] feat(moq-net): implement RAM tier of the per-track cache Add the moq_net::cache module: a per-track bounded group cache with the produce/consume split. cache::Config::produce() yields a cache::Producer (write half, not Clone) and Producer::consume() a cache::Consumer (read half, Clone), sharing one store so a cache backs both a track's producer and consumer. Eviction is a high/low watermark, not an LRU: an insert over the high watermark drains the oldest groups down to the low watermark and returns them as one Batch (the caller persists it to the next tier), which is what lets a group-per-frame audio track avoid one tiny object per group. Bounds are per-track on bytes and duration (media-timestamp span), the latest group is never evicted, and inserts dedup by sequence so sharing one cache across both endpoints is safe. 14 unit tests cover get/miss, dedup, byte and duration watermarks, batch contents, always-keep-latest, hysteresis within the band, unbounded and min-unset edges, and out-of-order inserts. 
Disk/remote tiers and the TrackProducer/TrackConsumer with_cache wiring remain design; CACHE.md is reconciled to the implemented names and marks what is built. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/CACHE.md | 130 +++++---- rs/moq-net/src/model/cache.rs | 483 ++++++++++++++++++++++++++++++++++ rs/moq-net/src/model/mod.rs | 4 + 3 files changed, 562 insertions(+), 55 deletions(-) create mode 100644 rs/moq-net/src/model/cache.rs diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index db0ac1a87..06976972c 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -1,12 +1,18 @@ # moq-net track cache (spike) -> Status: design spike, no implementation yet. Targets `dev`: it removes a public/wire field -> (`TrackInfo.cache`) and adds local API to `TrackProducer`. +> Status: the RAM tier and eviction policy are implemented in `src/model/cache.rs` (module +> `moq_net::cache`, with unit tests). The disk/remote tiers and the `TrackProducer` / +> `TrackConsumer` wiring are still design. Targets `dev`: it removes a public/wire field +> (`TrackInfo.cache`) and adds local API to the track endpoints. -A per-track group cache owned by a `TrackProducer`. It lets a relay or edge retain recent -groups past the live window and serve them back on a FETCH, optionally spilling to local disk -or remote object storage. This is the moq-net mechanism the `moq-archive` crate builds on (see -`../moq-archive/DESIGN.md`); the on-tier byte format is shared with that crate. +A per-track group cache. It lets a relay or edge retain recent groups past the live window and +serve them back on a FETCH, optionally spilling to local disk or remote object storage. This is +the moq-net mechanism the `moq-archive` crate builds on (see `../moq-archive/DESIGN.md`); the +on-tier byte format is shared with that crate. 
+ +The implemented surface follows moq-net's produce/consume split: `cache::Config::produce()` +yields a `cache::Producer` (the write half, not `Clone`), and `Producer::consume()` yields a +`cache::Consumer` (the read half, `Clone`). Names below are the real ones. ## Principles @@ -21,7 +27,7 @@ These come from design review and pin down the shape: groups. There is no cross-track accounting, so no shared lock and no contention. The cost is that there is no global RAM ceiling; total footprint is the sum of per-track `max` across live tracks. A global backstop, if ever needed, is additive and not part of v1. -- **No traits, no callbacks.** `Cache` is a concrete value you configure and attach. moq-net +- **No traits, no callbacks.** The cache is concrete values you configure and attach (`cache::Producer` / `cache::Consumer`). moq-net owns all behavior. The disk and remote backends are an internal, configured `object_store`, not a consumer-implemented extension point. - **Watermark flush, not per-item eviction.** Groups accumulate to the high watermark, then a @@ -35,24 +41,31 @@ These come from design review and pin down the shape: Per track, on both size and duration, whichever trips first: ```rust -/// Local cache policy for a single TrackProducer. Not on the wire, not in TrackInfo. +// module moq_net::cache + +/// Local cache policy for a single track. Not on the wire, not in TrackInfo. #[derive(Clone, Debug, Default)] #[non_exhaustive] -pub struct CacheConfig { +pub struct Config { pub ram: Bounds, // keep >= min in RAM; flush the band once > max - pub disk: Option, // local object_store + its own Bounds - pub remote: Option, // remote object_store + its own Bounds - pub interval: Option, // max wall-clock before a partial band flushes anyway + // forthcoming: disk + remote tiers (object_store, feature-gated) and an interval backstop. } /// A low/high watermark. The gap (max - min) is the flush batch size. 
pub struct Bounds { pub min: Limit, pub max: Limit } /// A bound expressed as a duration, a byte count, or both (first to trip wins). +/// All-None means unbounded as a high watermark, floor-zero as a low watermark. pub struct Limit { pub duration: Option, pub bytes: Option } +``` -/// A persistent tier: an object_store plus its own retention bounds. +The implemented `cache::Config` has only `ram` so far (it is `#[non_exhaustive]`, so adding +`disk` / `remote` / `interval` later is additive). The forthcoming tier shape: + +```rust +// forthcoming pub struct Tier { pub store: /* path or url */ (), pub bounds: Bounds } +// Config gains: disk: Option, remote: Option, interval: Option ``` The flush batch is implicitly `max - min`, so the bounds map straight onto the tiering the @@ -77,7 +90,7 @@ Each track owns a small buffer plus an index of what has been flushed where: ```rust struct TrackCache { - ram: VecDeque, // recent groups, ordered by sequence + ram: BTreeMap, // recent groups, keyed by sequence ram_bytes: u64, flushed: BTreeMap, // sequence -> (tier, object key, offset) for serving last_flush: Instant, @@ -132,55 +145,62 @@ by an archive node. ## Attaching to a producer or a consumer -The cache is a `Cache` handle built from a `CacheConfig`. It is cloneable and a clone shares the -same store (RAM tier and disk/remote tiers), so **one cache can back both a track's producer and -its consumer**. +The cache splits into a write half and a read half, like the rest of moq-net. `cache::Producer` +fills the cache and is **not `Clone`** (a single writer); `cache::Consumer` is `Clone` and shares +the same store. `Producer::consume()` derives a reader, so **one cache backs both a track's +producer and its consumer**. ```rust -/// A live, shareable per-track store. Clone shares the underlying tiers. 
-#[derive(Clone)] -pub struct Cache { /* Arc inside */ } -impl Cache { pub fn new(config: CacheConfig) -> Self; } -impl From for Cache { /* Cache::new */ } +// implemented (RAM tier) in moq_net::cache +let writer: cache::Producer = config.produce(); // not Clone +let reader: cache::Consumer = writer.consume(); // Clone; shares the store + +writer.insert(group); // -> Option (band to persist to the next tier) +reader.get(sequence); // -> Option +``` + +The forthcoming track wiring hands each endpoint the matching half: +```rust +// forthcoming impl TrackProducer { - /// Retain and serve groups this producer creates, per the cache. Independent of any - /// retention the original publisher set. Accepts a `Cache` or a `CacheConfig`. - pub fn with_cache(self, cache: impl Into) -> Self; + /// Fill `cache` with groups this producer creates and serve them on a miss. + pub fn with_cache(self, cache: cache::Producer) -> Self; } - impl TrackConsumer { - /// Back this consumer's `fetch_group` / `get_group` with the cache: hits resolve locally, - /// and groups read off the wire populate it. - pub fn with_cache(self, cache: impl Into) -> Self; + /// Back fetch_group / get_group with `cache`: hits resolve locally. + pub fn with_cache(self, cache: cache::Consumer) -> Self; } ``` Sharing one store across both endpoints of a track: ```rust -let cache = Cache::new(config); -let producer = producer.with_cache(cache.clone()); -let consumer = consumer.with_cache(cache); // same groups, one set of tiers +let writer = config.produce(); +let reader = writer.consume(); +let producer = producer.with_cache(writer); // fills the cache +let consumer = consumer.with_cache(reader); // fetches from it, same groups ``` +`cache::Producer` being non-`Clone` is also a deliberate step toward making `TrackProducer` +non-`Clone`: a single writer per track. 
+ ### Producer side `TrackState.groups` (today's inline buffer, bounded by the now-removed `TrackInfo.cache`) is backed by the cache: finished groups beyond `ram.min` move into the RAM tier, and a `get_group` or `dynamic()` miss consults the cache (RAM, then disk, then remote) before `NotFound`. -### Consumer side (fetch) -`TrackConsumer::fetch_group(seq)` and `get_group(seq)` check the cache first: - -- **hit** (RAM, disk, or remote) resolves the `kio::Pending` locally with no wire FETCH (RAM - synchronously, disk/remote after the ranged read); -- **miss** falls through to the normal wire FETCH, and the result is inserted into the cache so a - repeat is local. +### Consumer side (fetch vs populate) +Reading and populating are different halves, which is what the produce/consume split buys: -Groups arriving on a live `subscribe` also populate the cache, so a consumer that watched a -track can later fetch its recent groups without going back upstream. This is exactly the -archive's serve path: a cache-backed `TrackConsumer`, with no live upstream, answers FETCH -straight from disk/remote. +- A `TrackConsumer` given a `cache::Consumer` (read half) checks the cache first on + `fetch_group(seq)` / `get_group(seq)`: a **hit** resolves the `kio::Pending` locally with no + wire FETCH (RAM synchronously, disk/remote after the ranged read); a **miss** falls through to + the wire. +- To *populate* the cache (insert groups read off the wire or off a live `subscribe`), a consumer + takes a `cache::Producer` (write half) instead. This is the archive's record-and-serve path: a + cache-backed consumer with no live upstream fills tiers as it reads and answers FETCH straight + from them. 
A shared cache makes the two directions symmetric: groups a producer creates are fetchable through a consumer of the same track, and groups a consumer pulled off the wire are servable by @@ -200,11 +220,11 @@ independent: ## Per-binary use -- **moq-cli:** no cache, or a small RAM-only `CacheConfig` for a single track. See "moq-cli +- **moq-cli:** no cache, or a small RAM-only `cache::Config` for a single track. See "moq-cli flags" below for the concrete surface. -- **moq-relay:** one `CacheConfig` template applied to every track it creates. Threading that +- **moq-relay:** one `cache::Config` template applied to every track it creates. Threading that config onto the tracks moq-net auto-creates during fan-out is the Origin follow-up; here it is - just `TrackProducer::with_cache(config)`. A relay RAM cache that spills to disk or S3 becomes + just `TrackProducer::with_cache(writer)`. A relay RAM cache that spills to disk or S3 becomes configuration, not code. - **moq-edge:** the same, plus its own dynamic-handler business logic on top. @@ -213,7 +233,7 @@ independent: The cache is most useful on the commands that run a local origin and serve a broadcast back (`moq serve`, `moq accept`), so a flattened `CacheArgs` group lands on those. The flags map onto the `[min, max]` bounds and the tier cascade; an absent `--cache-ram` means no cache (today's -behavior). This is the proposed surface; wiring it waits on the `moq_net::CacheConfig` API. +behavior). This is the proposed surface; wiring it waits on the track-endpoint `with_cache` API. ```rust /// Retain recent groups so late subscribers and FETCHes get old content. 
@@ -261,20 +281,20 @@ moq serve --broadcast bbb --cache-ram 30s --cache-disk /var/cache/moq --cache-di --cache-remote s3://moq-archive/bbb --cache-remote-age 30d fmp4 < bbb.mp4 ``` -and converts to the config the producer takes: +and converts to a `cache::Config` whose halves go to each endpoint: ```rust impl CacheArgs { /// None when `--cache-ram` is unset (caching disabled). - pub fn cache(&self) -> Option { /* flags -> CacheConfig -> Cache::new */ } + pub fn config(&self) -> Option { /* flags -> bounds (+ tiers) */ } } -// in run_serve / run_accept: build the shared store once, then hand a clone to each endpoint. -let cache = args.cache(); -if let Some(cache) = &cache { - producer = producer.with_cache(cache.clone()); - // a TrackConsumer of the same track also takes cache.clone(), so its fetch_group is - // served locally (RAM, then disk/remote). +// in run_serve / run_accept: produce the writer once, derive a reader, hand one to each endpoint. +if let Some(config) = args.config() { + let writer = config.produce(); + let reader = writer.consume(); // same store; serves fetch_group locally + producer = producer.with_cache(writer); + // a TrackConsumer of the same track takes `reader`. } ``` diff --git a/rs/moq-net/src/model/cache.rs b/rs/moq-net/src/model/cache.rs new file mode 100644 index 000000000..75e3b12b5 --- /dev/null +++ b/rs/moq-net/src/model/cache.rs @@ -0,0 +1,483 @@ +//! Per-track group cache: a bounded RAM window that evicts in batches. +//! +//! A cache is local policy attached to a single track, independent of any retention the original +//! publisher set (it is never carried on the wire). It keeps a `[min, max]` window of recent +//! groups in RAM. When an insert pushes the window past the high watermark (`max`), the oldest +//! groups down to the low watermark (`min`) are drained as one [`Batch`], which the caller hands +//! to the next tier (disk or remote object storage). Draining a whole band at once is what keeps +//! 
a low-latency track (audio makes a group per frame) from producing one tiny object per group; +//! an LRU, which evicts a single item the instant the budget trips, cannot batch. +//! +//! The cache is split into a write half ([`Producer`]) and a read half ([`Consumer`]), mirroring +//! the rest of moq-net. [`Producer`] is intentionally not `Clone` (a single writer fills the +//! cache); [`Consumer`] is `Clone` and shares the same store, so one cache backs both a track's +//! producer and its consumer. +//! +//! The disk and remote tiers and the [`crate::TrackProducer`] / [`crate::TrackConsumer`] wiring +//! are not implemented yet; see `rs/moq-net/CACHE.md`. This module is the RAM tier and the +//! eviction policy it builds on. + +use std::collections::BTreeMap; +use std::sync::{Arc, Mutex}; +use std::time::Duration; + +use bytes::Bytes; + +use super::Timestamp; + +/// A cache bound, as a duration, a byte count, or both (the first to trip wins). +/// +/// All-`None` means "no threshold": as a high watermark that is unbounded (never flush), as a low +/// watermark that is a floor of zero (drain everything but the latest group). +#[derive(Clone, Copy, Debug, Default)] +pub struct Limit { + /// Bound on the span between the oldest and newest buffered group's media timestamps. + pub duration: Option<Duration>, + /// Bound on the total bytes of buffered group frames. + pub bytes: Option<u64>, +} + +impl Limit { + /// A duration-only limit. + pub fn duration(duration: Duration) -> Self { + Self { + duration: Some(duration), + bytes: None, + } + } + + /// A byte-only limit. + pub fn bytes(bytes: u64) -> Self { + Self { + duration: None, + bytes: Some(bytes), + } + } + + /// Whether both thresholds are unset (so the limit imposes no bound at all). + fn is_unset(&self) -> bool { + self.duration.is_none() && self.bytes.is_none() + } +} + +/// A low/high watermark pair. The gap between them is the flush batch size.
+#[derive(Clone, Copy, Debug, Default)] +pub struct Bounds { + /// Low watermark: a flush drains down to this. + pub min: Limit, + /// High watermark: exceeding it triggers a flush. + pub max: Limit, +} + +impl Bounds { + /// Build bounds from a low and high watermark. + pub fn new(min: Limit, max: Limit) -> Self { + Self { min, max } + } +} + +/// Local cache policy for a single track. Not carried on the wire. +#[derive(Clone, Debug, Default)] +#[non_exhaustive] +pub struct Config { + /// Bounds on the RAM tier. + pub ram: Bounds, + // Disk and remote tiers are forthcoming (object_store-backed, feature-gated). +} + +impl Config { + /// Build a [`Config`] with the given RAM bounds. + pub fn new(ram: Bounds) -> Self { + Self { ram } + } + + /// Start an empty cache with this policy, returning its write half. + pub fn produce(self) -> Producer { + Producer::new(self) + } +} + +/// One cached group: enough to re-serve it or serialize it to a lower tier. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Group { + /// The group's sequence number within its track. + pub sequence: u64, + /// The group's frames, in order. + pub frames: Vec<Bytes>, + /// Media timestamp of the first frame, if known. Drives the duration bound. + pub ts_first: Option<Timestamp>, + /// Media timestamp of the last frame, if known. Drives the duration bound. + pub ts_last: Option<Timestamp>, +} + +impl Group { + /// Total size of the group's frame payloads in bytes. + pub fn size(&self) -> u64 { + self.frames.iter().map(|f| f.len() as u64).sum() + } +} + +/// A band of groups drained from a tier in one flush, oldest first. The caller persists it to the +/// next tier as a single segment. +pub type Batch = Vec<Group>; + +/// The shared store behind a [`Producer`] and its [`Consumer`]s. +struct State { + config: Config, + /// Groups keyed by sequence, so the first entry is the oldest and the last is the latest.
+ ram: BTreeMap<u64, Group>, + ram_bytes: u64, +} + +impl State { + /// The time span between the oldest group's first frame and the newest group's last frame. + /// Zero unless both ends carry a timestamp, so a track without media timestamps applies no + /// duration pressure (byte bounds still apply). + fn span(&self) -> Duration { + let first = self.ram.values().next().and_then(|g| g.ts_first); + let last = self.ram.values().next_back().and_then(|g| g.ts_last); + match (first, last) { + (Some(a), Some(b)) => Duration::from(b).saturating_sub(Duration::from(a)), + _ => Duration::ZERO, + } + } + + /// Whether the current contents exceed `limit`. An unset limit is treated as a floor of zero + /// (any content exceeds it), which is what makes a flush with no `min` drain to just the + /// latest group. + fn exceeds(&self, limit: Limit) -> bool { + if limit.is_unset() { + return !self.ram.is_empty(); + } + limit.bytes.is_some_and(|b| self.ram_bytes > b) || limit.duration.is_some_and(|d| self.span() > d) + } + + /// Whether the high watermark is tripped. An unset high watermark is unbounded (never trips). + fn over_max(&self) -> bool { + !self.config.ram.max.is_unset() && self.exceeds(self.config.ram.max) + } + + fn insert(&mut self, group: Group) -> Option<Batch> { + let size = group.size(); + if let Some(old) = self.ram.insert(group.sequence, group) { + self.ram_bytes -= old.size(); + } + self.ram_bytes += size; + self.flush() + } + + /// If over the high watermark, drain the oldest groups down to the low watermark, keeping the + /// latest group always. Returns the drained band, oldest first, or `None` if nothing flushed. + fn flush(&mut self) -> Option<Batch> { + if !self.over_max() { + return None; + } + + let mut batch = Batch::new(); + // Drain oldest-first while still above the low watermark, but never the latest group: a + // new subscriber and the live edge need it, and it is the likeliest next fetch.
+ while self.ram.len() > 1 && self.exceeds(self.config.ram.min) { + let oldest = *self.ram.keys().next().expect("non-empty"); + let latest = *self.ram.keys().next_back().expect("non-empty"); + if oldest == latest { + break; + } + let group = self.ram.remove(&oldest).expect("just observed"); + self.ram_bytes -= group.size(); + batch.push(group); + } + + (!batch.is_empty()).then_some(batch) + } +} + +/// The write half of a track cache. Insert finished groups; not `Clone` (a single writer fills +/// the cache). Call [`consume`](Self::consume) for a read handle. +pub struct Producer { + state: Arc<Mutex<State>>, +} + +impl Producer { + fn new(config: Config) -> Self { + Self { + state: Arc::new(Mutex::new(State { + config, + ram: BTreeMap::new(), + ram_bytes: 0, + })), + } + } + + /// Insert a finished group. + /// + /// Returns a [`Batch`] when this insert pushed the RAM tier over its high watermark: the band + /// drained down to the low watermark, which the caller persists to the next tier. `None` when + /// nothing was evicted. A RAM-only cache ignores the return (the band is simply dropped). + pub fn insert(&mut self, group: Group) -> Option<Batch> { + self.state.lock().expect("cache poisoned").insert(group) + } + + /// A read handle sharing this cache's store. + pub fn consume(&self) -> Consumer { + Consumer { + state: self.state.clone(), + } + } + + /// The highest sequence currently buffered in RAM, if any. + pub fn latest(&self) -> Option<u64> { + self.state + .lock() + .expect("cache poisoned") + .ram + .keys() + .next_back() + .copied() + } + + /// The number of groups currently buffered in RAM. + pub fn len(&self) -> usize { + self.state.lock().expect("cache poisoned").ram.len() + } + + /// Whether the RAM tier is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// The read half of a track cache. `Clone` shares the same store, so several readers (and a +/// matching [`Producer`]) cache the same groups. Backs a track's `fetch`.
+#[derive(Clone)] +pub struct Consumer { + state: Arc>, +} + +impl Consumer { + /// Fetch a cached group by sequence, or `None` if it is not in the RAM tier. + /// + /// The returned [`Group`] is an owned copy (frame `Bytes` are reference-counted, so this is + /// cheap), so a later eviction never invalidates a fetch already in flight. + pub fn get(&self, sequence: u64) -> Option { + self.state.lock().expect("cache poisoned").ram.get(&sequence).cloned() + } + + /// Whether a group with this sequence is currently in the RAM tier. + pub fn contains(&self, sequence: u64) -> bool { + self.state.lock().expect("cache poisoned").ram.contains_key(&sequence) + } + + /// The highest sequence currently buffered in RAM, if any. + pub fn latest(&self) -> Option { + self.state + .lock() + .expect("cache poisoned") + .ram + .keys() + .next_back() + .copied() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// A group of `count` frames of `each` bytes, at sequence `seq`, spanning `[ts0, ts1]` micros. + fn group(seq: u64, count: usize, each: usize, ts: Option<(u64, u64)>) -> Group { + Group { + sequence: seq, + frames: vec![Bytes::from(vec![0u8; each]); count], + ts_first: ts.map(|(a, _)| Timestamp::from_micros(a).unwrap()), + ts_last: ts.map(|(_, b)| Timestamp::from_micros(b).unwrap()), + } + } + + /// A small group with no timestamps at the given sequence. + fn plain(seq: u64, bytes: usize) -> Group { + group(seq, 1, bytes, None) + } + + #[test] + fn size_sums_frame_bytes() { + let g = group(0, 3, 10, None); + assert_eq!(g.size(), 30); + } + + #[test] + fn insert_and_get() { + let mut producer = Config::default().produce(); + let consumer = producer.consume(); + + assert!(consumer.get(5).is_none()); + producer.insert(plain(5, 100)); + assert_eq!(consumer.get(5).map(|g| g.size()), Some(100)); + assert!(consumer.get(6).is_none()); + } + + #[test] + fn consumer_sees_producer_inserts() { + // A cloned consumer observes inserts on the shared store. 
+ let mut producer = Config::default().produce(); + let a = producer.consume(); + let b = a.clone(); + + producer.insert(plain(1, 10)); + assert!(a.contains(1)); + assert!(b.contains(1)); + } + + #[test] + fn dedup_by_sequence() { + // Re-inserting a sequence replaces it and keeps byte accounting correct. + let mut producer = Config::default().produce(); + let consumer = producer.consume(); + + producer.insert(plain(1, 100)); + producer.insert(plain(1, 30)); + assert_eq!(producer.len(), 1); + assert_eq!(consumer.get(1).map(|g| g.size()), Some(30)); + } + + #[test] + fn unbounded_when_no_max_never_flushes() { + let mut producer = Config::default().produce(); + let mut flushed = None; + for seq in 0..100 { + flushed = flushed.or(producer.insert(plain(seq, 1000))); + } + assert!(flushed.is_none()); + assert_eq!(producer.len(), 100); + } + + #[test] + fn byte_high_watermark_flushes_batch_to_low() { + // Keep 60 bytes, flush once over 100. Groups of 20 bytes: the 6th insert (120 bytes) + // trips the high watermark and drains the three oldest down to the 60-byte low watermark. + let bounds = Bounds::new(Limit::bytes(60), Limit::bytes(100)); + let mut producer = Config::new(bounds).produce(); + + let mut batches: Vec = Vec::new(); + for seq in 0..=5 { + if let Some(batch) = producer.insert(plain(seq, 20)) { + batches.push(batch); + } + } + + // Exactly one flush, draining the three oldest groups as one oldest-first band. + assert_eq!(batches.len(), 1); + let drained: Vec = batches[0].iter().map(|g| g.sequence).collect(); + assert_eq!(drained, vec![0, 1, 2]); + // The low watermark (60 bytes = 3 groups) is retained, latest included. + assert_eq!(producer.len(), 3); + assert_eq!(producer.latest(), Some(5)); + } + + #[test] + fn settles_within_the_band() { + // Steady state stays between the low and high watermarks (hysteresis), never above max. 
+ let bounds = Bounds::new(Limit::bytes(60), Limit::bytes(100)); + let mut producer = Config::new(bounds).produce(); + for seq in 0..50 { + producer.insert(plain(seq, 20)); + assert!(producer.len() <= 5, "exceeded high watermark: {}", producer.len()); + } + assert!(producer.len() >= 3, "below low watermark: {}", producer.len()); + assert_eq!(producer.latest(), Some(49)); + } + + #[test] + fn flush_keeps_latest_even_when_oversized() { + // A single group larger than the whole budget is still retained (never evict the latest). + let bounds = Bounds::new(Limit::bytes(10), Limit::bytes(50)); + let mut producer = Config::new(bounds).produce(); + + let batch = producer.insert(plain(0, 1000)); + assert!(batch.is_none()); + assert_eq!(producer.len(), 1); + assert_eq!(producer.latest(), Some(0)); + } + + #[test] + fn min_unset_drains_to_just_the_latest() { + // High watermark set, low watermark unset -> flush keeps only the latest group. + let bounds = Bounds::new(Limit::default(), Limit::bytes(50)); + let mut producer = Config::new(bounds).produce(); + + for seq in 0..5 { + producer.insert(plain(seq, 20)); + } + assert_eq!(producer.len(), 1); + assert_eq!(producer.latest(), Some(4)); + } + + #[test] + fn duration_high_watermark_evicts_by_timespan() { + // Keep 2s, flush down to 1s. Each group spans 1s of media time. + let bounds = Bounds::new( + Limit::duration(Duration::from_secs(1)), + Limit::duration(Duration::from_secs(2)), + ); + let mut producer = Config::new(bounds).produce(); + let consumer = producer.consume(); + + // seq 0: [0,1]s, seq 1: [1,2]s, seq 2: [2,3]s, seq 3: [3,4]s + for seq in 0..4u64 { + let t0 = seq * 1_000_000; + producer.insert(group(seq, 1, 10, Some((t0, t0 + 1_000_000)))); + } + + // The window cannot span more than ~2s, so the oldest groups were evicted. 
+ assert!(consumer.contains(3), "latest kept"); + assert!(!consumer.contains(0), "oldest evicted"); + assert!(producer.len() <= 2, "len was {}", producer.len()); + } + + #[test] + fn no_duration_pressure_without_timestamps() { + // A duration bound with timestamp-less groups never flushes (byte bounds would still). + let bounds = Bounds::new( + Limit::duration(Duration::from_secs(1)), + Limit::duration(Duration::from_secs(2)), + ); + let mut producer = Config::new(bounds).produce(); + for seq in 0..20 { + assert!(producer.insert(plain(seq, 1000)).is_none()); + } + assert_eq!(producer.len(), 20); + } + + #[test] + fn latest_tracks_highest_sequence_out_of_order() { + let mut producer = Config::default().produce(); + producer.insert(plain(5, 1)); + producer.insert(plain(2, 1)); + producer.insert(plain(9, 1)); + producer.insert(plain(7, 1)); + assert_eq!(producer.latest(), Some(9)); + } + + #[test] + fn out_of_order_old_insert_can_flush_immediately() { + // Inserting a stale (low) sequence into a full cache evicts it (or an older one) at once. + let bounds = Bounds::new(Limit::bytes(40), Limit::bytes(50)); + let mut producer = Config::new(bounds).produce(); + for seq in 10..14 { + producer.insert(plain(seq, 20)); + } + // Now insert a much older sequence; the band drains oldest-first. 
+ let batch = producer.insert(plain(0, 20)); + assert!(batch.is_some()); + assert_eq!(producer.latest(), Some(13)); + assert!(!producer.consume().contains(0), "stale insert flushed first"); + } + + #[test] + fn is_empty_and_len() { + let mut producer = Config::default().produce(); + assert!(producer.is_empty()); + producer.insert(plain(0, 1)); + assert!(!producer.is_empty()); + assert_eq!(producer.len(), 1); + } +} diff --git a/rs/moq-net/src/model/mod.rs b/rs/moq-net/src/model/mod.rs index 27258d847..b6556cfa0 100644 --- a/rs/moq-net/src/model/mod.rs +++ b/rs/moq-net/src/model/mod.rs @@ -8,6 +8,10 @@ mod subscription; mod time; mod track; +/// Per-track group cache (RAM tier and eviction policy). Namespaced: `cache::Producer`, +/// `cache::Consumer`, `cache::Config`. +pub mod cache; + pub use bandwidth::*; pub use broadcast::*; pub use compression::*; From 0da1ab0b4dc50f8ea6bcbe962a5ec6d6823e6916 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 16:43:36 +0000 Subject: [PATCH 08/25] fix(moq-net): plain code spans in cache module doc to fix rustdoc The module-level //! doc used intra-doc links to same-module items (Batch, Producer, Consumer), which rustdoc cannot resolve from an inner module doc (even with self:: paths), failing `cargo doc -D warnings` in CI. Use plain code spans there; item-level links are unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/src/model/cache.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/rs/moq-net/src/model/cache.rs b/rs/moq-net/src/model/cache.rs index 75e3b12b5..c125d9489 100644 --- a/rs/moq-net/src/model/cache.rs +++ b/rs/moq-net/src/model/cache.rs @@ -3,15 +3,15 @@ //! A cache is local policy attached to a single track, independent of any retention the original //! publisher set (it is never carried on the wire). It keeps a `[min, max]` window of recent //! groups in RAM. 
When an insert pushes the window past the high watermark (`max`), the oldest -//! groups down to the low watermark (`min`) are drained as one [`Batch`], which the caller hands -//! to the next tier (disk or remote object storage). Draining a whole band at once is what keeps -//! a low-latency track (audio makes a group per frame) from producing one tiny object per group; -//! an LRU, which evicts a single item the instant the budget trips, cannot batch. +//! groups down to the low watermark (`min`) are drained as one `Batch`, which the caller hands to +//! the next tier (disk or remote object storage). Draining a whole band at once is what keeps a +//! low-latency track (audio makes a group per frame) from producing one tiny object per group; an +//! LRU, which evicts a single item the instant the budget trips, cannot batch. //! -//! The cache is split into a write half ([`Producer`]) and a read half ([`Consumer`]), mirroring -//! the rest of moq-net. [`Producer`] is intentionally not `Clone` (a single writer fills the -//! cache); [`Consumer`] is `Clone` and shares the same store, so one cache backs both a track's -//! producer and its consumer. +//! The cache is split into a write half (`Producer`) and a read half (`Consumer`), mirroring the +//! rest of moq-net. `Producer` is intentionally not `Clone` (a single writer fills the cache); +//! `Consumer` is `Clone` and shares the same store, so one cache backs both a track's producer and +//! its consumer. //! //! The disk and remote tiers and the [`crate::TrackProducer`] / [`crate::TrackConsumer`] wiring //! are not implemented yet; see `rs/moq-net/CACHE.md`. 
This module is the RAM tier and the From dea43d75321d13b4ce4957dc6d11c41bbd704038 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 17:04:53 +0000 Subject: [PATCH 09/25] feat(moq-net): cache segment disk format and rollup compaction Add cache::segment: the on-disk byte format for the cache's disk/remote tiers and the rollup that compacts small segments into one larger object. A segment is one band of groups (a Batch) serialized as group blobs back to back, a footer offset table, then a fixed 8-byte trailer (footer length + magic). The trailer being last and fixed-size lets a reader fetch it with one tail-ranged GET, parse the footer, then fetch just the byte range of the wanted group. Each blob is self-delimiting (frame count, then length-prefixed frames carrying their optional media timestamp). Timestamps store raw value+scale, so a non-micro timescale (e.g. 90kHz video) round-trips exactly. Reuses the QUIC VarInt codec. rollup copies group blobs verbatim and rewrites offsets, so it is lossless and does not re-encode frames. To serialize losslessly, cache::Group now carries per-frame timestamps (cache::Frame { timestamp, payload }) with ts_first()/ts_last() derived for the duration bound; the RAM tier and its tests are updated accordingly. The module becomes a directory (cache/mod.rs + cache/segment.rs). 12 new segment tests (batch/single round-trip, footer summary, lossless non-micro scale, mixed/absent timestamps, empty group and empty batch, missing sequence, bad magic, truncation, rollup concat + offsets + single-segment + corrupt-input). All 415 moq-net lib tests pass; clippy and rustdoc clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/CACHE.md | 14 +- .../src/model/{cache.rs => cache/mod.rs} | 86 +++- rs/moq-net/src/model/cache/segment.rs | 483 ++++++++++++++++++ 3 files changed, 554 insertions(+), 29 deletions(-) rename rs/moq-net/src/model/{cache.rs => cache/mod.rs} (85%) create mode 100644 rs/moq-net/src/model/cache/segment.rs diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index 06976972c..f7d31cbd1 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -1,9 +1,15 @@ # moq-net track cache (spike) -> Status: the RAM tier and eviction policy are implemented in `src/model/cache.rs` (module -> `moq_net::cache`, with unit tests). The disk/remote tiers and the `TrackProducer` / -> `TrackConsumer` wiring are still design. Targets `dev`: it removes a public/wire field -> (`TrackInfo.cache`) and adds local API to the track endpoints. +> Status: implemented in `src/model/cache/` (module `moq_net::cache`, with unit tests): +> - the RAM tier and watermark eviction policy (`mod.rs`); +> - the on-disk **segment byte format** and **rollup** compaction (`segment.rs`): lossless +> per-frame encode/decode (raw timestamp value+scale, so any timescale round-trips), a +> self-describing footer offset table read from a fixed trailer, and `rollup` to concatenate +> small segments into one larger object. +> +> Still design: the tier I/O (object_store `put`/`get_range` + the disk-tier watermark and the +> seq->location index) and the `TrackProducer` / `TrackConsumer` wiring. Targets `dev`: it +> removes a public/wire field (`TrackInfo.cache`) and adds local API to the track endpoints. A per-track group cache. It lets a relay or edge retain recent groups past the live window and serve them back on a FETCH, optionally spilling to local disk or remote object storage. 
This is diff --git a/rs/moq-net/src/model/cache.rs b/rs/moq-net/src/model/cache/mod.rs similarity index 85% rename from rs/moq-net/src/model/cache.rs rename to rs/moq-net/src/model/cache/mod.rs index c125d9489..60c99dbe3 100644 --- a/rs/moq-net/src/model/cache.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -13,9 +13,11 @@ //! `Consumer` is `Clone` and shares the same store, so one cache backs both a track's producer and //! its consumer. //! -//! The disk and remote tiers and the [`crate::TrackProducer`] / [`crate::TrackConsumer`] wiring -//! are not implemented yet; see `rs/moq-net/CACHE.md`. This module is the RAM tier and the -//! eviction policy it builds on. +//! The `segment` submodule is the on-disk byte format used by the disk and remote tiers (a band +//! of groups serialized as one self-describing object) plus the rollup that concatenates several +//! small segments into one larger object. The tier I/O (object_store) and the +//! [`crate::TrackProducer`] / [`crate::TrackConsumer`] wiring are not implemented yet; see +//! `rs/moq-net/CACHE.md`. use std::collections::BTreeMap; use std::sync::{Arc, Mutex}; @@ -25,6 +27,8 @@ use bytes::Bytes; use super::Timestamp; +pub mod segment; + /// A cache bound, as a duration, a byte count, or both (the first to trip wins). /// /// All-`None` means "no threshold": as a high watermark that is unbounded (never flush), as a low @@ -54,7 +58,7 @@ impl Limit { } } - /// Whether either set threshold is unset (so the limit imposes no ceiling). + /// Whether both thresholds are unset (so the limit imposes no ceiling). fn is_unset(&self) -> bool { self.duration.is_none() && self.bytes.is_none() } @@ -97,23 +101,38 @@ impl Config { } } -/// One cached group: enough to re-serve it or serialize it to a lower tier. +/// One frame within a cached group: its optional media timestamp and its payload. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Frame { + /// The frame's media timestamp, if the track carries them. 
+ pub timestamp: Option, + /// The frame's payload bytes. + pub payload: Bytes, +} + +/// One cached group: its sequence and frames, enough to re-serve it or serialize it to a tier. #[derive(Clone, Debug, PartialEq, Eq)] pub struct Group { /// The group's sequence number within its track. pub sequence: u64, /// The group's frames, in order. - pub frames: Vec, - /// Media timestamp of the first frame, if known. Drives the duration bound. - pub ts_first: Option, - /// Media timestamp of the last frame, if known. Drives the duration bound. - pub ts_last: Option, + pub frames: Vec, } impl Group { /// Total size of the group's frame payloads in bytes. pub fn size(&self) -> u64 { - self.frames.iter().map(|f| f.len() as u64).sum() + self.frames.iter().map(|f| f.payload.len() as u64).sum() + } + + /// The first frame's media timestamp, if any. Used as the group's lower time bound. + pub fn ts_first(&self) -> Option { + self.frames.first().and_then(|f| f.timestamp) + } + + /// The last frame's media timestamp, if any. Used as the group's upper time bound. + pub fn ts_last(&self) -> Option { + self.frames.last().and_then(|f| f.timestamp) } } @@ -134,8 +153,8 @@ impl State { /// Zero unless both ends carry a timestamp, so a track without media timestamps applies no /// duration pressure (byte bounds still apply). fn span(&self) -> Duration { - let first = self.ram.values().next().and_then(|g| g.ts_first); - let last = self.ram.values().next_back().and_then(|g| g.ts_last); + let first = self.ram.values().next().and_then(|g| g.ts_first()); + let last = self.ram.values().next_back().and_then(|g| g.ts_last()); match (first, last) { (Some(a), Some(b)) => Duration::from(b).saturating_sub(Duration::from(a)), _ => Duration::ZERO, @@ -283,27 +302,46 @@ impl Consumer { mod tests { use super::*; - /// A group of `count` frames of `each` bytes, at sequence `seq`, spanning `[ts0, ts1]` micros. 
- fn group(seq: u64, count: usize, each: usize, ts: Option<(u64, u64)>) -> Group { + /// A frame of `bytes` zero bytes at an optional micros timestamp. + fn frame(bytes: usize, ts_micros: Option) -> Frame { + Frame { + timestamp: ts_micros.map(|t| Timestamp::from_micros(t).unwrap()), + payload: Bytes::from(vec![0u8; bytes]), + } + } + + /// A one-frame group with no timestamp at the given sequence. + fn plain(seq: u64, bytes: usize) -> Group { Group { sequence: seq, - frames: vec![Bytes::from(vec![0u8; each]); count], - ts_first: ts.map(|(a, _)| Timestamp::from_micros(a).unwrap()), - ts_last: ts.map(|(_, b)| Timestamp::from_micros(b).unwrap()), + frames: vec![frame(bytes, None)], } } - /// A small group with no timestamps at the given sequence. - fn plain(seq: u64, bytes: usize) -> Group { - group(seq, 1, bytes, None) + /// A two-frame group spanning `[t0, t1]` micros, total `bytes`. + fn timed(seq: u64, bytes: usize, t0: u64, t1: u64) -> Group { + Group { + sequence: seq, + frames: vec![frame(bytes / 2, Some(t0)), frame(bytes - bytes / 2, Some(t1))], + } } #[test] fn size_sums_frame_bytes() { - let g = group(0, 3, 10, None); + let g = Group { + sequence: 0, + frames: vec![frame(10, None), frame(10, None), frame(10, None)], + }; assert_eq!(g.size(), 30); } + #[test] + fn ts_first_and_last() { + let g = timed(0, 8, 100, 900); + assert_eq!(g.ts_first(), Some(Timestamp::from_micros(100).unwrap())); + assert_eq!(g.ts_last(), Some(Timestamp::from_micros(900).unwrap())); + } + #[test] fn insert_and_get() { let mut producer = Config::default().produce(); @@ -424,10 +462,9 @@ mod tests { // seq 0: [0,1]s, seq 1: [1,2]s, seq 2: [2,3]s, seq 3: [3,4]s for seq in 0..4u64 { let t0 = seq * 1_000_000; - producer.insert(group(seq, 1, 10, Some((t0, t0 + 1_000_000)))); + producer.insert(timed(seq, 10, t0, t0 + 1_000_000)); } - // The window cannot span more than ~2s, so the oldest groups were evicted. 
assert!(consumer.contains(3), "latest kept"); assert!(!consumer.contains(0), "oldest evicted"); assert!(producer.len() <= 2, "len was {}", producer.len()); @@ -465,7 +502,6 @@ mod tests { for seq in 10..14 { producer.insert(plain(seq, 20)); } - // Now insert a much older sequence; the band drains oldest-first. let batch = producer.insert(plain(0, 20)); assert!(batch.is_some()); assert_eq!(producer.latest(), Some(13)); diff --git a/rs/moq-net/src/model/cache/segment.rs b/rs/moq-net/src/model/cache/segment.rs new file mode 100644 index 000000000..a2eab2e84 --- /dev/null +++ b/rs/moq-net/src/model/cache/segment.rs @@ -0,0 +1,483 @@ +//! On-disk byte format for the cache's disk and remote tiers. +//! +//! A *segment* is one band of groups ([`super::Batch`]) serialized as a single self-describing +//! object: the group blobs back to back, then a footer holding a per-group offset table, then an +//! 8-byte trailer (footer length + magic). Because the trailer is last and fixed-size, a reader +//! can fetch it with one tail-ranged GET, parse the footer, then fetch just the byte range of the +//! group it wants. Each group blob is itself self-delimiting (frame count, then length-prefixed +//! frames carrying their optional media timestamp), so frames round-trip losslessly. +//! +//! `rollup` concatenates several small segments into one larger object, rewriting the offset +//! table. It copies group blobs verbatim (no frame re-encoding), so it is cheap and lossless; it +//! is how the disk tier compacts into one remote object. + +use bytes::{Buf, BufMut, Bytes, BytesMut}; + +use super::{Frame, Group}; +use crate::{DecodeError, EncodeError, Timescale, Timestamp, VarInt}; + +/// Magic trailer identifying a cache segment ("MOQS"). +const MAGIC: u32 = 0x4D4F_5153; + +/// Fixed trailer size: a little-endian u32 footer length followed by the u32 magic. +const TRAILER: usize = 8; + +/// An error decoding or encoding a [`Segment`]. 
+#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum Error { + /// The data is shorter than a declared length or the trailer. + #[error("segment truncated")] + Truncated, + /// The trailing magic did not match, so this is not a cache segment. + #[error("bad segment magic")] + BadMagic, + /// A varint or field failed to decode. + #[error(transparent)] + Decode(#[from] DecodeError), + /// A varint failed to encode. + #[error(transparent)] + Encode(#[from] EncodeError), + /// A value (varint or timestamp) was out of the representable range. + #[error("value out of range")] + Value, +} + +/// One row of a segment's footer: where a group lives and its summary, without decoding the blob. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct GroupEntry { + /// The group's sequence number within its track. + pub sequence: u64, + /// Byte offset of the group blob within the segment. + pub offset: u64, + /// Byte length of the group blob. + pub length: u64, + /// Number of frames in the group. + pub frames: u64, + /// Media timestamp of the group's first frame, if any. + pub ts_first: Option, + /// Media timestamp of the group's last frame, if any. + pub ts_last: Option, +} + +/// Serialize a band of groups into one segment. +pub fn encode(batch: &[Group]) -> Result { + let mut buf = BytesMut::new(); + let mut entries = Vec::with_capacity(batch.len()); + + for group in batch { + let offset = buf.len() as u64; + put_group(&mut buf, group)?; + let length = buf.len() as u64 - offset; + entries.push(GroupEntry { + sequence: group.sequence, + offset, + length, + frames: group.frames.len() as u64, + ts_first: group.ts_first(), + ts_last: group.ts_last(), + }); + } + + write_footer(&mut buf, &entries)?; + Ok(buf.freeze()) +} + +/// Concatenate several segments into one, rewriting offsets. Group blobs are copied verbatim, so +/// this is lossless and does not re-encode frames. 
Entries keep their original order across the +/// inputs (segments are expected to cover disjoint, ascending sequence ranges). +pub fn rollup(segments: &[Bytes]) -> Result { + let mut buf = BytesMut::new(); + let mut entries = Vec::new(); + + for bytes in segments { + let segment = Segment::open(bytes.clone())?; + for entry in segment.entries() { + let blob = segment.blob(entry)?; + let offset = buf.len() as u64; + buf.extend_from_slice(&blob); + entries.push(GroupEntry { + offset, + ..entry.clone() + }); + } + } + + write_footer(&mut buf, &entries)?; + Ok(buf.freeze()) +} + +/// A parsed segment: the raw bytes plus its decoded footer. Cheap to clone (the bytes are shared). +#[derive(Clone)] +pub struct Segment { + data: Bytes, + entries: Vec, +} + +impl Segment { + /// Parse a segment from its full bytes. Reads the trailer, validates the magic, and decodes + /// the footer; group blobs are decoded lazily by [`group`](Self::group). + pub fn open(data: Bytes) -> Result { + let n = data.len(); + if n < TRAILER { + return Err(Error::Truncated); + } + + let trailer = &data[n - TRAILER..]; + let footer_len = u32::from_le_bytes(trailer[0..4].try_into().expect("4 bytes")) as usize; + let magic = u32::from_le_bytes(trailer[4..8].try_into().expect("4 bytes")); + if magic != MAGIC { + return Err(Error::BadMagic); + } + + let footer_end = n - TRAILER; + let footer_start = footer_end.checked_sub(footer_len).ok_or(Error::Truncated)?; + let entries = read_footer(data.slice(footer_start..footer_end))?; + + Ok(Self { data, entries }) + } + + /// The footer's offset table. + pub fn entries(&self) -> &[GroupEntry] { + &self.entries + } + + /// Number of groups in the segment. + pub fn len(&self) -> usize { + self.entries.len() + } + + /// Whether the segment holds no groups. + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + /// Decode the group with this sequence, or `None` if the segment does not contain it. 
+ pub fn group(&self, sequence: u64) -> Option> { + let entry = self.entries.iter().find(|e| e.sequence == sequence)?; + Some(decode_group(entry.sequence, self.blob(entry))) + } + + /// Decode the group at the given footer index. + pub fn group_at(&self, index: usize) -> Option> { + let entry = self.entries.get(index)?; + Some(decode_group(entry.sequence, self.blob(entry))) + } + + /// The raw blob bytes for an entry, bounds-checked against the data. + fn blob(&self, entry: &GroupEntry) -> Result { + let start = entry.offset as usize; + let end = start.checked_add(entry.length as usize).ok_or(Error::Truncated)?; + if end > self.data.len() { + return Err(Error::Truncated); + } + Ok(self.data.slice(start..end)) + } +} + +/// Decode a group blob (frame count, then frames) at a known sequence. +fn decode_group(sequence: u64, blob: Result) -> Result { + let mut blob = blob?; + let count = get_varint(&mut blob)? as usize; + let mut frames = Vec::with_capacity(count.min(8192)); + for _ in 0..count { + frames.push(get_frame(&mut blob)?); + } + Ok(Group { sequence, frames }) +} + +fn put_group(buf: &mut BytesMut, group: &Group) -> Result<(), Error> { + put_varint(buf, group.frames.len() as u64)?; + for frame in &group.frames { + put_frame(buf, frame)?; + } + Ok(()) +} + +fn put_frame(buf: &mut BytesMut, frame: &Frame) -> Result<(), Error> { + put_varint(buf, frame.payload.len() as u64)?; + let flags = u8::from(frame.timestamp.is_some()); + buf.put_u8(flags); + if let Some(ts) = frame.timestamp { + put_timestamp(buf, ts)?; + } + buf.extend_from_slice(&frame.payload); + Ok(()) +} + +fn get_frame(buf: &mut Bytes) -> Result { + let len = get_varint(buf)? as usize; + let flags = get_u8(buf)?; + let timestamp = if flags & 1 != 0 { + Some(get_timestamp(buf)?) 
+ } else { + None + }; + if buf.remaining() < len { + return Err(Error::Truncated); + } + let payload = buf.copy_to_bytes(len); + Ok(Frame { timestamp, payload }) +} + +fn write_footer(buf: &mut BytesMut, entries: &[GroupEntry]) -> Result<(), Error> { + let start = buf.len(); + put_varint(buf, entries.len() as u64)?; + for entry in entries { + put_varint(buf, entry.sequence)?; + put_varint(buf, entry.offset)?; + put_varint(buf, entry.length)?; + put_varint(buf, entry.frames)?; + let flags = u8::from(entry.ts_first.is_some()) | (u8::from(entry.ts_last.is_some()) << 1); + buf.put_u8(flags); + if let Some(ts) = entry.ts_first { + put_timestamp(buf, ts)?; + } + if let Some(ts) = entry.ts_last { + put_timestamp(buf, ts)?; + } + } + let footer_len = (buf.len() - start) as u32; + buf.put_u32_le(footer_len); + buf.put_u32_le(MAGIC); + Ok(()) +} + +fn read_footer(mut body: Bytes) -> Result, Error> { + let count = get_varint(&mut body)? as usize; + let mut entries = Vec::with_capacity(count.min(65536)); + for _ in 0..count { + let sequence = get_varint(&mut body)?; + let offset = get_varint(&mut body)?; + let length = get_varint(&mut body)?; + let frames = get_varint(&mut body)?; + let flags = get_u8(&mut body)?; + let ts_first = if flags & 1 != 0 { + Some(get_timestamp(&mut body)?) + } else { + None + }; + let ts_last = if flags & 2 != 0 { + Some(get_timestamp(&mut body)?) + } else { + None + }; + entries.push(GroupEntry { + sequence, + offset, + length, + frames, + ts_first, + ts_last, + }); + } + Ok(entries) +} + +fn put_timestamp(buf: &mut BytesMut, ts: Timestamp) -> Result<(), Error> { + // Store the raw (value, scale) so any timescale (e.g. 90kHz video) round-trips exactly. 
+ put_varint(buf, ts.value())?; + put_varint(buf, ts.scale().as_u64())?; + Ok(()) +} + +fn get_timestamp(buf: &mut impl Buf) -> Result { + let value = get_varint(buf)?; + let scale = get_varint(buf)?; + let scale = Timescale::try_from(scale).map_err(|_| Error::Value)?; + Timestamp::new(value, scale).map_err(|_| Error::Value) +} + +fn put_varint(buf: &mut BytesMut, value: u64) -> Result<(), Error> { + VarInt::try_from(value).map_err(|_| Error::Value)?.encode_quic(buf)?; + Ok(()) +} + +fn get_varint(buf: &mut impl Buf) -> Result { + Ok(VarInt::decode_quic(buf)?.into()) +} + +fn get_u8(buf: &mut impl Buf) -> Result { + if buf.remaining() < 1 { + return Err(Error::Truncated); + } + Ok(buf.get_u8()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn ts(value: u64, scale: u64) -> Timestamp { + Timestamp::from_scale(value, scale).unwrap() + } + + fn frame(payload: &[u8], timestamp: Option) -> Frame { + Frame { + timestamp, + payload: Bytes::copy_from_slice(payload), + } + } + + fn group(sequence: u64, frames: Vec) -> Group { + Group { sequence, frames } + } + + /// A small group whose frames carry 90kHz timestamps (a non-micro scale). + fn video_group(sequence: u64, base: u64) -> Group { + group( + sequence, + vec![ + frame(b"keyframe", Some(ts(base, 90_000))), + frame(b"delta", Some(ts(base + 3000, 90_000))), + ], + ) + } + + #[test] + fn round_trip_single_group() { + let g = video_group(7, 0); + let bytes = encode(std::slice::from_ref(&g)).unwrap(); + let segment = Segment::open(bytes).unwrap(); + + assert_eq!(segment.len(), 1); + let decoded = segment.group(7).unwrap().unwrap(); + assert_eq!(decoded, g); + } + + #[test] + fn round_trip_batch_and_entries() { + let batch = vec![video_group(0, 0), video_group(1, 6000), video_group(2, 12000)]; + let bytes = encode(&batch).unwrap(); + let segment = Segment::open(bytes).unwrap(); + + assert_eq!(segment.len(), 3); + // Footer summarizes each group. 
+ for (entry, g) in segment.entries().iter().zip(&batch) { + assert_eq!(entry.sequence, g.sequence); + assert_eq!(entry.frames, g.frames.len() as u64); + assert_eq!(entry.ts_first, g.ts_first()); + assert_eq!(entry.ts_last, g.ts_last()); + } + // Every group decodes back to the original, by sequence and by index. + for (i, g) in batch.iter().enumerate() { + assert_eq!(&segment.group(g.sequence).unwrap().unwrap(), g); + assert_eq!(&segment.group_at(i).unwrap().unwrap(), g); + } + } + + #[test] + fn timestamps_lossless_at_any_scale() { + // A 90kHz tick is not an integer number of micros; raw (value, scale) must survive. + let bytes = encode(&[video_group(0, 1)]).unwrap(); + let segment = Segment::open(bytes).unwrap(); + let decoded = segment.group(0).unwrap().unwrap(); + + let t = decoded.frames[0].timestamp.unwrap(); + assert_eq!(t.value(), 1); + assert_eq!(t.scale().as_u64(), 90_000); + } + + #[test] + fn mixed_and_absent_timestamps() { + let g = group( + 3, + vec![ + frame(b"a", None), + frame(b"b", Some(ts(500, 1_000_000))), + frame(b"c", None), + ], + ); + let bytes = encode(std::slice::from_ref(&g)).unwrap(); + let segment = Segment::open(bytes).unwrap(); + assert_eq!(segment.group(3).unwrap().unwrap(), g); + // ts_first is absent (first frame), ts_last is absent (last frame). + assert_eq!(segment.entries()[0].ts_first, None); + assert_eq!(segment.entries()[0].ts_last, None); + } + + #[test] + fn empty_group_and_empty_batch() { + // A group with no frames, and a segment with no groups, both round-trip. 
+ let g = group(9, vec![]); + let segment = Segment::open(encode(std::slice::from_ref(&g)).unwrap()).unwrap(); + assert_eq!(segment.group(9).unwrap().unwrap(), g); + + let empty = Segment::open(encode(&[]).unwrap()).unwrap(); + assert!(empty.is_empty()); + assert!(empty.group(0).is_none()); + } + + #[test] + fn missing_sequence_is_none() { + let segment = Segment::open(encode(&[video_group(5, 0)]).unwrap()).unwrap(); + assert!(segment.group(6).is_none()); + assert!(segment.group_at(1).is_none()); + } + + #[test] + fn bad_magic_is_rejected() { + let mut bytes = encode(&[video_group(0, 0)]).unwrap().to_vec(); + let n = bytes.len(); + bytes[n - 1] ^= 0xFF; // corrupt the magic + assert!(matches!(Segment::open(Bytes::from(bytes)), Err(Error::BadMagic))); + } + + #[test] + fn truncated_is_rejected() { + let bytes = encode(&[video_group(0, 0)]).unwrap(); + // Drop the trailer entirely. + assert!(Segment::open(bytes.slice(0..4)).is_err()); + // Keep the trailer but lie about the footer length by chopping the middle. + let short = bytes.slice(0..bytes.len() - TRAILER - 1); + assert!(matches!( + Segment::open(short), + Err(Error::Truncated) | Err(Error::BadMagic) + )); + } + + #[test] + fn rollup_concatenates_and_preserves_groups() { + let first = encode(&[video_group(0, 0), video_group(1, 6000)]).unwrap(); + let second = encode(&[video_group(2, 12000), video_group(3, 18000)]).unwrap(); + + let rolled = rollup(&[first, second]).unwrap(); + let segment = Segment::open(rolled).unwrap(); + + // All four groups present, in order, decoding identically to the originals. + assert_eq!(segment.len(), 4); + let expected = [ + video_group(0, 0), + video_group(1, 6000), + video_group(2, 12000), + video_group(3, 18000), + ]; + for (i, g) in expected.iter().enumerate() { + assert_eq!(&segment.group_at(i).unwrap().unwrap(), g); + assert_eq!(&segment.group(g.sequence).unwrap().unwrap(), g); + } + + // Offsets are rewritten to be ascending and non-overlapping in the merged object. 
+ let entries = segment.entries(); + for pair in entries.windows(2) { + assert!(pair[1].offset >= pair[0].offset + pair[0].length); + } + } + + #[test] + fn rollup_of_one_segment_round_trips() { + let batch = vec![video_group(0, 0), video_group(1, 6000)]; + let single = encode(&batch).unwrap(); + let rolled = Segment::open(rollup(std::slice::from_ref(&single)).unwrap()).unwrap(); + for g in &batch { + assert_eq!(&rolled.group(g.sequence).unwrap().unwrap(), g); + } + } + + #[test] + fn rollup_rejects_corrupt_input() { + let good = encode(&[video_group(0, 0)]).unwrap(); + let bad = Bytes::from_static(b"not a segment!!!"); + assert!(rollup(&[good, bad]).is_err()); + } +} From 8d0a720a3631d6e710e79bf8c90698893ffd1a23 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 20:24:07 +0000 Subject: [PATCH 10/25] feat(moq-net): multi-tier cache index and promotion orchestration Add cache::index: the storage-agnostic layer that ties the segment format and rollup into serving across tiers. It maps each group sequence to a Location (tier + segment + byte range), so a fetch is "locate, then ranged-read that segment." It tracks per-tier byte and duration totals, and drives promotion: Index::promotion picks the oldest disk segments once the disk tier is over its high watermark (draining to the low watermark, oldest first), and Index::apply_promotion registers the rolled-up remote segment, repoints those sequences at the remote tier, and drops the promoted disk segments. The index holds only metadata, never group bytes, so it is the part that stays in memory while bytes live on disk/remote. Add segment::group_from_blob, the ranged-read decode entry point (decode one group from just its blob bytes), and Segment::byte_len for tier accounting. The remaining object_store put/get_range glue is a thin layer over these decisions. 
7 index tests including an end-to-end check: encode segments, locate, ranged-read via group_from_blob, then promote (rollup) and confirm every group still decodes identically through the remote segment. Full moq-net suite 422 pass; clippy and rustdoc clean. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/CACHE.md | 15 +- rs/moq-net/src/model/cache/index.rs | 391 ++++++++++++++++++++++++++ rs/moq-net/src/model/cache/mod.rs | 1 + rs/moq-net/src/model/cache/segment.rs | 17 +- 4 files changed, 414 insertions(+), 10 deletions(-) create mode 100644 rs/moq-net/src/model/cache/index.rs diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index f7d31cbd1..954bacbe5 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -4,12 +4,17 @@ > - the RAM tier and watermark eviction policy (`mod.rs`); > - the on-disk **segment byte format** and **rollup** compaction (`segment.rs`): lossless > per-frame encode/decode (raw timestamp value+scale, so any timescale round-trips), a -> self-describing footer offset table read from a fixed trailer, and `rollup` to concatenate -> small segments into one larger object. +> self-describing footer offset table read from a fixed trailer, `rollup` to concatenate small +> segments into one larger object, and `group_from_blob` (the ranged-read decode path); +> - the storage-agnostic **multi-tier index + promotion** (`index.rs`): `sequence -> Location` +> (tier + segment + byte range), per-tier byte/duration accounting, `promotion` to pick the +> oldest disk segments over the high watermark, and `apply_promotion` to repoint them at the +> remote tier after a rollup. > -> Still design: the tier I/O (object_store `put`/`get_range` + the disk-tier watermark and the -> seq->location index) and the `TrackProducer` / `TrackConsumer` wiring. Targets `dev`: it -> removes a public/wire field (`TrackInfo.cache`) and adds local API to the track endpoints.
+> Still design: the tier **I/O** (object_store `put`/`get_range`/`delete` wiring the index and +> rollup to real storage, feature-gated) and the `TrackProducer` / `TrackConsumer` wiring. +> Targets `dev`: it removes a public/wire field (`TrackInfo.cache`) and adds local API to the +> track endpoints. A per-track group cache. It lets a relay or edge retain recent groups past the live window and serve them back on a FETCH, optionally spilling to local disk or remote object storage. This is diff --git a/rs/moq-net/src/model/cache/index.rs b/rs/moq-net/src/model/cache/index.rs new file mode 100644 index 000000000..551677386 --- /dev/null +++ b/rs/moq-net/src/model/cache/index.rs @@ -0,0 +1,391 @@ +//! Multi-tier index: which segment, in which tier, holds each group, and which segments to promote. +//! +//! This is the storage-agnostic orchestration the disk and remote tiers run on top of the +//! [`segment`](super::segment) format. It records, per group sequence, a [`Location`] (tier + +//! segment + byte range), so a fetch is "look up the location, ranged-read that segment." It also +//! drives **promotion**: when the disk tier grows past its bound, [`Index::promotion`] picks the +//! oldest disk segments to compact, and after the caller rolls them into one remote object +//! ([`segment::rollup`](super::segment::rollup)) [`Index::apply_promotion`] repoints those +//! sequences at the remote tier and drops the disk segments. +//! +//! The index holds only metadata (offsets, sizes, timestamps), never group bytes, so it is the +//! piece that stays in memory while the bytes live on disk or in remote storage. The actual I/O +//! (object_store `put` / `get_range` / `delete`) is a thin layer that calls these methods for its +//! decisions; nothing here blocks or allocates per byte. 
+
+use std::collections::{BTreeMap, HashSet};
+use std::time::Duration;
+
+use super::segment::Segment;
+use super::{Bounds, Limit};
+
+/// Identifier for a stored segment, assigned in creation order (so a lower id is older).
+pub type SegmentId = u64;
+
+/// Which durable tier a segment lives in.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Tier {
+    /// Local disk: the staging tier that batches flushed RAM bands.
+    Disk,
+    /// Remote object storage: the long-term tier disk segments roll up into.
+    Remote,
+}
+
+/// Where a group's bytes live: which tier and segment, and the byte range within that segment.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Location {
+    /// The tier holding the segment.
+    pub tier: Tier,
+    /// The segment within that tier.
+    pub segment: SegmentId,
+    /// Byte offset of the group blob within the segment object.
+    pub offset: u64,
+    /// Byte length of the group blob.
+    pub length: u64,
+}
+
+/// Per-segment bookkeeping for tier accounting and promotion.
+#[derive(Debug)]
+struct Meta {
+    /// Tier the segment currently lives in.
+    tier: Tier,
+    /// Size of the whole segment object in bytes.
+    bytes: u64,
+    /// Timestamp extent, as durations (a common scale), so cross-timescale segments compare.
+    ts_min: Option<Duration>,
+    ts_max: Option<Duration>,
+}
+
+/// A map from group sequence to its [`Location`], plus per-segment metadata for promotion.
+#[derive(Debug, Default)]
+pub struct Index {
+    /// Group sequence -> where that group's blob lives.
+    groups: BTreeMap<u64, Location>,
+    /// Segment id -> bookkeeping. Ordered by id, which is creation order.
+    segments: BTreeMap<SegmentId, Meta>,
+    /// Id handed to the next segment that is registered.
+    next_id: SegmentId,
+}
+
+impl Index {
+    /// An empty index.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Record a freshly written `segment` on `tier`, returning its new id. Each group in the
+    /// segment becomes locatable; an already-present sequence is repointed to this segment (this
+    /// is how [`apply_promotion`](Self::apply_promotion) moves sequences to the remote tier).
+    pub fn add(&mut self, tier: Tier, segment: &Segment) -> SegmentId {
+        let id = self.next_id;
+        self.next_id += 1;
+
+        // Timestamp extent of the whole segment, folded over every entry that carries one.
+        let mut ts_min: Option<Duration> = None;
+        let mut ts_max: Option<Duration> = None;
+
+        for entry in segment.entries() {
+            // `insert` overwrites an existing location, which is what repoints a sequence here.
+            self.groups.insert(
+                entry.sequence,
+                Location {
+                    tier,
+                    segment: id,
+                    offset: entry.offset,
+                    length: entry.length,
+                },
+            );
+            if let Some(t) = entry.ts_first {
+                let d = Duration::from(t);
+                ts_min = Some(ts_min.map_or(d, |m| m.min(d)));
+            }
+            if let Some(t) = entry.ts_last {
+                let d = Duration::from(t);
+                ts_max = Some(ts_max.map_or(d, |m| m.max(d)));
+            }
+        }
+
+        self.segments.insert(
+            id,
+            Meta {
+                tier,
+                bytes: segment.byte_len() as u64,
+                ts_min,
+                ts_max,
+            },
+        );
+        id
+    }
+
+    /// Where the group with this sequence lives, or `None` if it is not in any tier.
+    pub fn locate(&self, sequence: u64) -> Option<Location> {
+        self.groups.get(&sequence).copied()
+    }
+
+    /// Total bytes stored in `tier`.
+    pub fn bytes(&self, tier: Tier) -> u64 {
+        self.segments.values().filter(|m| m.tier == tier).map(|m| m.bytes).sum()
+    }
+
+    /// Number of segments in `tier`.
+    pub fn segment_count(&self, tier: Tier) -> usize {
+        self.segments.values().filter(|m| m.tier == tier).count()
+    }
+
+    /// Segment ids in `tier`, oldest first.
+    fn tier_segments(&self, tier: Tier) -> Vec<SegmentId> {
+        // BTreeMap iterates by id, which is creation order, i.e. oldest first.
+        self.segments
+            .iter()
+            .filter(|(_, m)| m.tier == tier)
+            .map(|(id, _)| *id)
+            .collect()
+    }
+
+    /// Total bytes and timestamp span across a set of segments.
+    fn stats(&self, ids: &[SegmentId]) -> (u64, Duration) {
+        let mut bytes = 0;
+        let mut lo: Option<Duration> = None;
+        let mut hi: Option<Duration> = None;
+        for id in ids {
+            // An id with no metadata contributes nothing.
+            let Some(m) = self.segments.get(id) else { continue };
+            bytes += m.bytes;
+            if let Some(d) = m.ts_min {
+                lo = Some(lo.map_or(d, |x| x.min(d)));
+            }
+            if let Some(d) = m.ts_max {
+                hi = Some(hi.map_or(d, |x| x.max(d)));
+            }
+        }
+        // No span unless both ends are known; saturate so an inverted extent reads as zero.
+        let span = match (lo, hi) {
+            (Some(a), Some(b)) => b.saturating_sub(a),
+            _ => Duration::ZERO,
+        };
+        (bytes, span)
+    }
+
+    /// Whether `(bytes, span)` trips a high watermark. An all-unset limit is unbounded.
+    fn over_max(stats: (u64, Duration), max: Limit) -> bool {
+        !max.is_unset() && Self::over(stats, max)
+    }
+
+    /// Whether `(bytes, span)` is still above a low watermark. An all-unset limit is a floor of
+    /// zero, so any non-empty content is above it.
+    fn above_min(stats: (u64, Duration), min: Limit) -> bool {
+        if min.is_unset() {
+            return stats.0 > 0;
+        }
+        Self::over(stats, min)
+    }
+
+    /// Whether either set component of `limit` is strictly exceeded (the first to trip wins).
+    fn over((bytes, span): (u64, Duration), limit: Limit) -> bool {
+        limit.bytes.is_some_and(|b| bytes > b) || limit.duration.is_some_and(|d| span > d)
+    }
+
+    /// The oldest disk segments to promote so the disk tier returns within `bounds`. Empty unless
+    /// the disk tier is over its high watermark; otherwise the oldest segments are selected until
+    /// what remains is within the low watermark, oldest first (the order to roll them up in).
+    pub fn promotion(&self, bounds: Bounds) -> Vec<SegmentId> {
+        let mut disk = self.tier_segments(Tier::Disk);
+        if !Self::over_max(self.stats(&disk), bounds.max) {
+            return Vec::new();
+        }
+
+        // Advance a cursor past the oldest segments rather than popping the front of the Vec:
+        // `remove(0)` shifts the whole tail on every step. `disk[promoted..]` is what would
+        // remain on disk, re-measured after each segment is selected.
+        let mut promoted = 0;
+        while promoted < disk.len() && Self::above_min(self.stats(&disk[promoted..]), bounds.min) {
+            promoted += 1;
+        }
+        disk.truncate(promoted);
+        disk
+    }
+
+    /// Register `remote` (the rollup of `promoted`) on the remote tier, repoint its sequences, and
+    /// drop the promoted disk segments. Returns the new remote segment id. `remote` must contain
+    /// exactly the groups of `promoted`; any sequence missing from it is dropped from the index.
+    pub fn apply_promotion(&mut self, promoted: &[SegmentId], remote: &Segment) -> SegmentId {
+        let new_id = self.add(Tier::Remote, remote);
+
+        let promoted: HashSet<SegmentId> = promoted.iter().copied().collect();
+        // `add` already repointed every sequence in `remote` to `new_id`; anything still pointing
+        // at a promoted segment was not in the rollup, so drop it.
+        self.groups.retain(|_, loc| !promoted.contains(&loc.segment));
+        self.segments.retain(|id, _| !promoted.contains(id));
+        new_id
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::super::segment;
+    use super::super::{Frame, Group};
+    use super::*;
+    use crate::Timestamp;
+    use bytes::Bytes;
+    use std::collections::HashMap;
+
+    /// A one-frame group of `bytes` bytes at `secs` seconds, so segments carry a timestamp span.
+    fn group(sequence: u64, bytes: usize, secs: u64) -> Group {
+        Group {
+            sequence,
+            frames: vec![Frame {
+                timestamp: Some(Timestamp::from_secs(secs).unwrap()),
+                payload: Bytes::from(vec![7u8; bytes]),
+            }],
+        }
+    }
+
+    fn encoded(groups: &[Group]) -> Bytes {
+        segment::encode(groups).unwrap()
+    }
+
+    /// A tiny stand-in for the eventual object_store: segment id -> bytes. Mirrors what the I/O
+    /// layer will do (put on add, get_range on locate), so the index logic is exercised end to end.
+    #[derive(Default)]
+    struct Store {
+        objects: HashMap<SegmentId, Bytes>,
+    }
+
+    impl Store {
+        /// Read a group as the real tier will: ranged-read `[offset, offset+length)`, decode the
+        /// blob with `group_from_blob`. No footer, no full-segment parse.
+ fn read(&self, sequence: u64, loc: Location) -> Group { + let bytes = &self.objects[&loc.segment]; + let blob = bytes.slice(loc.offset as usize..(loc.offset + loc.length) as usize); + segment::group_from_blob(sequence, blob).unwrap() + } + } + + #[test] + fn add_and_locate_disk() { + let mut index = Index::new(); + let seg = Segment::open(encoded(&[group(0, 10, 0), group(1, 10, 1)])).unwrap(); + let id = index.add(Tier::Disk, &seg); + + let loc0 = index.locate(0).unwrap(); + assert_eq!(loc0.tier, Tier::Disk); + assert_eq!(loc0.segment, id); + assert!(index.locate(2).is_none()); + // The footer entry and the index agree on the byte range. + assert_eq!( + (loc0.offset, loc0.length), + (seg.entries()[0].offset, seg.entries()[0].length) + ); + } + + #[test] + fn tier_byte_accounting() { + let mut index = Index::new(); + let a = Segment::open(encoded(&[group(0, 100, 0)])).unwrap(); + let b = Segment::open(encoded(&[group(1, 50, 1)])).unwrap(); + index.add(Tier::Disk, &a); + index.add(Tier::Disk, &b); + assert_eq!(index.bytes(Tier::Disk), a.byte_len() as u64 + b.byte_len() as u64); + assert_eq!(index.segment_count(Tier::Disk), 2); + assert_eq!(index.bytes(Tier::Remote), 0); + } + + #[test] + fn promotion_empty_within_bounds() { + let mut index = Index::new(); + index.add(Tier::Disk, &Segment::open(encoded(&[group(0, 10, 0)])).unwrap()); + // A high watermark well above the single small segment: nothing to promote. + let bounds = Bounds::new(Limit::bytes(0), Limit::bytes(1_000_000)); + assert!(index.promotion(bounds).is_empty()); + } + + #[test] + fn promotion_selects_oldest_over_high_watermark() { + let mut index = Index::new(); + let mut ids = Vec::new(); + for seq in 0..5u64 { + let seg = Segment::open(encoded(&[group(seq, 100, seq)])).unwrap(); + ids.push(index.add(Tier::Disk, &seg)); + } + // Each segment is >100 bytes; keep ~150 bytes, flush over ~350. 
+ let bounds = Bounds::new(Limit::bytes(150), Limit::bytes(350)); + let promote = index.promotion(bounds); + + // Oldest-first, leaving the remainder within the low watermark. + assert_eq!(&promote[..], &ids[..promote.len()]); + assert!(!promote.is_empty()); + let remaining: Vec = ids[promote.len()..].to_vec(); + assert!(index.bytes(Tier::Disk) > 0); + // What remains must be within the low watermark (<= 150 bytes worth of segments). + let remaining_bytes: u64 = remaining.iter().map(|id| index.segments[id].bytes).sum(); + assert!(remaining_bytes <= 150, "remaining {remaining_bytes} over low watermark"); + } + + #[test] + fn promotion_duration_watermark() { + let mut index = Index::new(); + // Segments at 0s, 1s, 2s, 3s; keep 1s, flush over 2s of span. + for seq in 0..4u64 { + index.add(Tier::Disk, &Segment::open(encoded(&[group(seq, 10, seq)])).unwrap()); + } + let bounds = Bounds::new( + Limit::duration(Duration::from_secs(1)), + Limit::duration(Duration::from_secs(2)), + ); + let promote = index.promotion(bounds); + assert!(!promote.is_empty(), "3s span should exceed the 2s high watermark"); + } + + #[test] + fn apply_promotion_repoints_to_remote() { + let mut index = Index::new(); + let g0 = group(0, 100, 0); + let g1 = group(1, 100, 1); + let g2 = group(2, 100, 2); + let s0 = index.add(Tier::Disk, &Segment::open(encoded(&[g0.clone()])).unwrap()); + let s1 = index.add(Tier::Disk, &Segment::open(encoded(&[g1.clone()])).unwrap()); + index.add(Tier::Disk, &Segment::open(encoded(&[g2.clone()])).unwrap()); + + // Roll up the two oldest disk segments into one remote object. + let promoted = [s0, s1]; + let rolled = segment::rollup(&[encoded(&[g0]), encoded(&[g1])]).unwrap(); + let remote = Segment::open(rolled).unwrap(); + let new_id = index.apply_promotion(&promoted, &remote); + + // Sequences 0 and 1 now live remotely in one segment; the disk segments are gone. 
+ assert_eq!(index.locate(0).unwrap().tier, Tier::Remote); + assert_eq!(index.locate(1).unwrap().tier, Tier::Remote); + assert_eq!(index.locate(0).unwrap().segment, new_id); + assert_eq!(index.locate(1).unwrap().segment, new_id); + // Sequence 2 is untouched on disk. + assert_eq!(index.locate(2).unwrap().tier, Tier::Disk); + // Disk dropped the two promoted segments; remote gained one. + assert_eq!(index.segment_count(Tier::Disk), 1); + assert_eq!(index.segment_count(Tier::Remote), 1); + } + + #[test] + fn end_to_end_locate_then_read_through_promotion() { + // Build disk segments, store their bytes, and verify a located group decodes correctly + // both before and after promotion (the rollup repoints offsets, the read still matches). + let mut index = Index::new(); + let mut store = Store::default(); + + let groups = [group(0, 40, 0), group(1, 40, 1), group(2, 40, 2)]; + for g in &groups { + let bytes = encoded(std::slice::from_ref(g)); + let id = index.add(Tier::Disk, &Segment::open(bytes.clone()).unwrap()); + store.objects.insert(id, bytes); + } + + // Before promotion: each group reads back identically from its disk location. + for g in &groups { + let loc = index.locate(g.sequence).unwrap(); + assert_eq!(&store.read(g.sequence, loc), g); + } + + // Promote sequences 0 and 1 into one remote object. + let promoted = [index.locate(0).unwrap().segment, index.locate(1).unwrap().segment]; + let rolled = segment::rollup(&[encoded(&[groups[0].clone()]), encoded(&[groups[1].clone()])]).unwrap(); + let remote_id = index.apply_promotion(&promoted, &Segment::open(rolled.clone()).unwrap()); + store.objects.insert(remote_id, rolled); + + // After promotion: every group still reads back identically, now via the remote segment. 
+ for g in &groups { + let loc = index.locate(g.sequence).unwrap(); + assert_eq!( + &store.read(g.sequence, loc), + g, + "sequence {} mismatched after promotion", + g.sequence + ); + } + } +} diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index 60c99dbe3..48c462771 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -27,6 +27,7 @@ use bytes::Bytes; use super::Timestamp; +pub mod index; pub mod segment; /// A cache bound, as a duration, a byte count, or both (the first to trip wins). diff --git a/rs/moq-net/src/model/cache/segment.rs b/rs/moq-net/src/model/cache/segment.rs index a2eab2e84..52f868b3b 100644 --- a/rs/moq-net/src/model/cache/segment.rs +++ b/rs/moq-net/src/model/cache/segment.rs @@ -152,16 +152,21 @@ impl Segment { self.entries.is_empty() } + /// Total size of the segment object in bytes (blobs, footer, and trailer). + pub fn byte_len(&self) -> usize { + self.data.len() + } + /// Decode the group with this sequence, or `None` if the segment does not contain it. pub fn group(&self, sequence: u64) -> Option> { let entry = self.entries.iter().find(|e| e.sequence == sequence)?; - Some(decode_group(entry.sequence, self.blob(entry))) + Some(self.blob(entry).and_then(|b| group_from_blob(entry.sequence, b))) } /// Decode the group at the given footer index. pub fn group_at(&self, index: usize) -> Option> { let entry = self.entries.get(index)?; - Some(decode_group(entry.sequence, self.blob(entry))) + Some(self.blob(entry).and_then(|b| group_from_blob(entry.sequence, b))) } /// The raw blob bytes for an entry, bounds-checked against the data. @@ -175,9 +180,11 @@ impl Segment { } } -/// Decode a group blob (frame count, then frames) at a known sequence. -fn decode_group(sequence: u64, blob: Result) -> Result { - let mut blob = blob?; +/// Decode one group from just its blob bytes and known sequence. 
+/// +/// This is the ranged-read decode path: the disk/remote tier reads `[offset, offset+length)` for +/// a group (from the index) and decodes those bytes without the surrounding segment or footer. +pub fn group_from_blob(sequence: u64, mut blob: Bytes) -> Result { let count = get_varint(&mut blob)? as usize; let mut frames = Vec::with_capacity(count.min(8192)); for _ in 0..count { From 3f493c8ed2c76b8a353a03034ef64aa40c64f3b3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 20:35:44 +0000 Subject: [PATCH 11/25] fix(moq-net): use slice::from_ref in cache index tests for clippy 1.96 The cloned_ref_to_slice_refs lint (rust 1.96) flags &[x.clone()]; use std::slice::from_ref(&x) instead. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/src/model/cache/index.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rs/moq-net/src/model/cache/index.rs b/rs/moq-net/src/model/cache/index.rs index 551677386..80b8a1ef2 100644 --- a/rs/moq-net/src/model/cache/index.rs +++ b/rs/moq-net/src/model/cache/index.rs @@ -329,9 +329,9 @@ mod tests { let g0 = group(0, 100, 0); let g1 = group(1, 100, 1); let g2 = group(2, 100, 2); - let s0 = index.add(Tier::Disk, &Segment::open(encoded(&[g0.clone()])).unwrap()); - let s1 = index.add(Tier::Disk, &Segment::open(encoded(&[g1.clone()])).unwrap()); - index.add(Tier::Disk, &Segment::open(encoded(&[g2.clone()])).unwrap()); + let s0 = index.add(Tier::Disk, &Segment::open(encoded(std::slice::from_ref(&g0))).unwrap()); + let s1 = index.add(Tier::Disk, &Segment::open(encoded(std::slice::from_ref(&g1))).unwrap()); + index.add(Tier::Disk, &Segment::open(encoded(std::slice::from_ref(&g2))).unwrap()); // Roll up the two oldest disk segments into one remote object. let promoted = [s0, s1]; @@ -373,7 +373,7 @@ mod tests { // Promote sequences 0 and 1 into one remote object. 
let promoted = [index.locate(0).unwrap().segment, index.locate(1).unwrap().segment]; - let rolled = segment::rollup(&[encoded(&[groups[0].clone()]), encoded(&[groups[1].clone()])]).unwrap(); + let rolled = segment::rollup(&[encoded(std::slice::from_ref(&groups[0])), encoded(std::slice::from_ref(&groups[1]))]).unwrap(); let remote_id = index.apply_promotion(&promoted, &Segment::open(rolled.clone()).unwrap()); store.objects.insert(remote_id, rolled); From 38b64045f95390c139d2678099fd8b4ee45d47f4 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 20:38:10 +0000 Subject: [PATCH 12/25] style(moq-net): rustfmt cache index test Wrap the long rollup line flagged by cargo fmt --check in CI. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/src/model/cache/index.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rs/moq-net/src/model/cache/index.rs b/rs/moq-net/src/model/cache/index.rs index 80b8a1ef2..a74bc36b8 100644 --- a/rs/moq-net/src/model/cache/index.rs +++ b/rs/moq-net/src/model/cache/index.rs @@ -373,7 +373,11 @@ mod tests { // Promote sequences 0 and 1 into one remote object. 
let promoted = [index.locate(0).unwrap().segment, index.locate(1).unwrap().segment]; - let rolled = segment::rollup(&[encoded(std::slice::from_ref(&groups[0])), encoded(std::slice::from_ref(&groups[1]))]).unwrap(); + let rolled = segment::rollup(&[ + encoded(std::slice::from_ref(&groups[0])), + encoded(std::slice::from_ref(&groups[1])), + ]) + .unwrap(); let remote_id = index.apply_promotion(&promoted, &Segment::open(rolled.clone()).unwrap()); store.objects.insert(remote_id, rolled); From 65972c6180a034bff5e7b4465cea3dfa1ea70ddd Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 20:38:10 +0000 Subject: [PATCH 13/25] feat(moq-net): cache <-> live group bridges Add cache::Group::read (async: drain a live GroupConsumer into a cache::Group, reading each frame's payload and timestamp, resolving on group finish) and cache::Group::produce (sync: rebuild a live GroupConsumer from a cache::Group, validating frame timestamps against the track timescale). These are the bridge the TrackProducer populate path and TrackConsumer serve path both use. Two async round-trip tests (timed and untimed groups): live -> cache -> live -> cache preserves sequence, payloads, and per-frame timestamps. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/src/model/cache/mod.rs | 78 ++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index 48c462771..2ddad2403 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -25,7 +25,7 @@ use std::time::Duration; use bytes::Bytes; -use super::Timestamp; +use super::{Timescale, Timestamp}; pub mod index; pub mod segment; @@ -135,6 +135,42 @@ impl Group { pub fn ts_last(&self) -> Option { self.frames.last().and_then(|f| f.timestamp) } + + /// Drain a live [`GroupConsumer`](crate::GroupConsumer) into a cached group, reading every + /// frame's payload and timestamp. Resolves once the group is finished, so this is how the + /// producer side snapshots a finished group for caching. + pub async fn read(mut group: crate::GroupConsumer) -> Result { + let sequence = group.sequence; + let mut frames = Vec::new(); + while let Some(mut frame) = group.next_frame().await? { + let timestamp = frame.timestamp; + let payload = frame.read_all().await?; + frames.push(Frame { timestamp, payload }); + } + Ok(Self { sequence, frames }) + } + + /// Rebuild a live [`GroupConsumer`](crate::GroupConsumer) from this cached group, for serving a + /// fetch. `timescale` must match the track's: each frame timestamp is validated against it. 
+ pub fn produce(&self, timescale: impl Into>) -> Result { + let mut producer = crate::GroupProducer::new( + crate::Group { + sequence: self.sequence, + }, + timescale.into(), + ); + for frame in &self.frames { + let info = crate::Frame { + size: frame.payload.len() as u64, + timestamp: frame.timestamp, + }; + let mut chunk = producer.create_frame(info)?; + chunk.write(frame.payload.clone())?; + chunk.finish()?; + } + producer.finish()?; + Ok(producer.consume()) + } } /// A band of groups drained from a tier in one flush, oldest first. The caller persists it to the @@ -517,4 +553,44 @@ mod tests { assert!(!producer.is_empty()); assert_eq!(producer.len(), 1); } + + #[tokio::test] + async fn bridge_round_trips_a_live_group() { + // Build a live timed group, drain it into a cached group, rebuild a live one, drain again, + // and confirm the two cached snapshots match (payloads and per-frame timestamps survive). + let scale = Timescale::new(1_000_000).unwrap(); + let mut live = crate::GroupProducer::new(crate::Group { sequence: 4 }, Some(scale)); + for (i, payload) in [b"hello".as_slice(), b"world".as_slice()].into_iter().enumerate() { + let info = crate::Frame { + size: payload.len() as u64, + timestamp: Some(Timestamp::new(i as u64 * 1000, scale).unwrap()), + }; + let mut frame = live.create_frame(info).unwrap(); + frame.write(Bytes::copy_from_slice(payload)).unwrap(); + frame.finish().unwrap(); + } + live.finish().unwrap(); + + let cached = Group::read(live.consume()).await.unwrap(); + assert_eq!(cached.sequence, 4); + assert_eq!(cached.frames.len(), 2); + assert_eq!(cached.frames[0].payload, Bytes::from_static(b"hello")); + assert_eq!(cached.frames[1].timestamp, Some(Timestamp::new(1000, scale).unwrap())); + + let rebuilt = Group::read(cached.produce(scale).unwrap()).await.unwrap(); + assert_eq!(cached, rebuilt); + } + + #[tokio::test] + async fn bridge_untimed_group() { + // An untimed track (no timescale, no frame timestamps) round-trips too. 
+ let mut live = crate::GroupProducer::new(crate::Group { sequence: 0 }, None); + live.write_frame(Bytes::from_static(b"data")).unwrap(); + live.finish().unwrap(); + + let cached = Group::read(live.consume()).await.unwrap(); + assert_eq!(cached.frames.len(), 1); + assert_eq!(cached.frames[0].timestamp, None); + assert_eq!(Group::read(cached.produce(None).unwrap()).await.unwrap(), cached); + } } From 407b5b5c916c11043c31de5d9cf78ff9f910bfd5 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 20:46:43 +0000 Subject: [PATCH 14/25] feat(moq-net): wire the cache into TrackProducer and TrackConsumer TrackProducer::with_cache(cache::Producer) spawns an internal subscriber that drains each finished group into the cache (producer fills). The subscription keeps the track active while caching, independent of downstream demand. TrackConsumer::with_cache(cache::Consumer) attaches a read-through cache: get_group and fetch_group resolve from it on a live-state miss, rebuilding the group at the track's timescale. fetch_group serves from the cache before failing with NotFound or waiting on a TrackDynamic, via a pre-resolved branch added to TrackFetch. So a consumer sharing the producer's cache supports fetch without a wire round-trip. Three tests: producer fills the cache and a shared reader sees the group; get_group and fetch_group fall through to the cache and read back byte-for-byte. Full moq-net suite 427 pass; clippy and rustdoc clean. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/CACHE.md | 11 ++- rs/moq-net/src/model/cache/mod.rs | 5 +- rs/moq-net/src/model/track.rs | 158 ++++++++++++++++++++++++++++-- 3 files changed, 158 insertions(+), 16 deletions(-) diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index 954bacbe5..887f01a69 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -9,12 +9,15 @@ > - the storage-agnostic **multi-tier index + promotion** (`index.rs`): `sequence -> Location` > (tier + segment + byte range), per-tier byte/duration accounting, `promotion` to pick the > oldest disk segments over the high watermark, and `apply_promotion` to repoint them at the -> remote tier after a rollup. +> remote tier after a rollup; +> - the **track wiring**: `cache::Group::read` / `produce` bridge a cached group to/from the live +> group model; `TrackProducer::with_cache(cache::Producer)` spawns a subscriber that drains +> finished groups into the cache; `TrackConsumer::with_cache(cache::Consumer)` makes `get_group` +> and `fetch_group` resolve from the cache on a live miss. > > Still design: the tier **I/O** (object_store `put`/`get_range`/`delete` wiring the index and -> rollup to real storage, feature-gated) and the `TrackProducer` / `TrackConsumer` wiring. -> Targets `dev`: it removes a public/wire field (`TrackInfo.cache`) and adds local API to the -> track endpoints. +> rollup to real storage, feature-gated) and removing the wire field `TrackInfo.cache`. Targets +> `dev`. A per-track group cache. It lets a relay or edge retain recent groups past the live window and serve them back on a FETCH, optionally spilling to local disk or remote object storage. This is diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index 2ddad2403..a8c457c83 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -15,8 +15,9 @@ //! //! 
The `segment` submodule is the on-disk byte format used by the disk and remote tiers (a band //! of groups serialized as one self-describing object) plus the rollup that concatenates several -//! small segments into one larger object. The tier I/O (object_store) and the -//! [`crate::TrackProducer`] / [`crate::TrackConsumer`] wiring are not implemented yet; see +//! small segments into one larger object. `Group::read` / `Group::produce` bridge a cached group +//! to and from the live group model, and `TrackProducer::with_cache` / `TrackConsumer::with_cache` +//! wire the cache into the track types. The tier I/O (object_store) is the remaining piece; see //! `rs/moq-net/CACHE.md`. use std::collections::BTreeMap; diff --git a/rs/moq-net/src/model/track.rs b/rs/moq-net/src/model/track.rs index 173df8668..23e268b53 100644 --- a/rs/moq-net/src/model/track.rs +++ b/rs/moq-net/src/model/track.rs @@ -15,7 +15,7 @@ use crate::{Error, Result, Subscription, Timescale, coding}; -use super::{Fetch, Group, GroupConsumer, GroupProducer}; +use super::{Fetch, Group, GroupConsumer, GroupProducer, cache}; use std::{ collections::{HashSet, VecDeque}, @@ -661,6 +661,7 @@ impl TrackProducer { TrackConsumer { name: self.name.clone(), state: self.state.consume(), + cache: None, } } @@ -691,6 +692,26 @@ impl TrackProducer { } } + /// Fill `cache` with this track's groups as they are produced, so a [`TrackConsumer`] sharing + /// the cache (via [`cache::Producer::consume`]) can serve them without a wire fetch. + /// + /// Spawns an internal subscriber that drains each finished group into the cache. That + /// subscription keeps the track active while caching, so the track stays alive until it ends + /// or the cache is dropped, independent of downstream demand. + pub fn with_cache(self, mut cache: cache::Producer) -> Self { + let mut subscriber = self.subscribe(None); + web_async::spawn(async move { + // Groups are drained in arrival order; `Group::read` resolves once each finishes. 
+ // A drained band (over the RAM watermark) is dropped here until a disk tier consumes it. + while let Ok(Some(group)) = subscriber.recv_group().await { + if let Ok(group) = cache::Group::read(group).await { + let _ = cache.insert(group); + } + } + }); + self + } + /// Block until the aggregate subscription changes, then return the new value. /// /// Yields the most demanding request across all live subscribers, or `None` @@ -901,6 +922,7 @@ impl TrackWeak { TrackConsumer { name: self.name.clone(), state: self.state.consume(), + cache: None, } } @@ -964,6 +986,9 @@ impl TrackDemand { pub struct TrackConsumer { name: Arc, state: kio::Consumer, + /// Optional read-through cache (RAM tier). A `get_group` / `fetch_group` miss on the live + /// state falls through to this before failing or waiting on a `TrackDynamic`. + cache: Option, } impl TrackConsumer { @@ -972,6 +997,19 @@ impl TrackConsumer { &self.name } + /// Attach a read-through cache: `get_group` / `fetch_group` resolve locally on a cache hit. + /// Share the [`cache::Producer::consume`] handle of the cache a [`TrackProducer`] fills to + /// serve a track's recent groups without a wire fetch. + pub fn with_cache(mut self, cache: cache::Consumer) -> Self { + self.cache = Some(cache); + self + } + + /// The track's negotiated timescale, needed to rebuild a cached group. + fn timescale(&self) -> Option { + self.state.read().info.as_ref().and_then(|info| info.timescale) + } + pub(crate) fn weak(&self) -> TrackWeak { TrackWeak { name: self.name.clone(), @@ -999,7 +1037,13 @@ impl TrackConsumer { /// the cache. Use [`Self::fetch_group`] to wait for a group that a [`TrackDynamic`] /// will serve on demand. pub fn get_group(&self, sequence: u64) -> Option { - self.state.read().cached_group(sequence) + if let Some(group) = self.state.read().cached_group(sequence) { + return Some(group); + } + // Live miss: fall through to the read-through cache, rebuilding the group at the track's + // timescale. 
A cache decode/rebuild error is treated as a miss. + let cached = self.cache.as_ref()?.get(sequence)?; + cached.produce(self.timescale()).ok() } /// Fetch a single past group, without holding a live subscription. @@ -1020,21 +1064,38 @@ impl TrackConsumer { .write() .map_err(|s| s.abort.clone().unwrap_or(Error::Dropped))?; match state.poll_fetch(sequence) { - // Cached: the pending resolves immediately, no handler needed. + // Cached live: the pending resolves immediately from state, no handler needed. Poll::Ready(Ok(_)) => {} - // Unservable (NotFound) or already aborted: report it synchronously. - Poll::Ready(Err(err)) => return Err(err), - // A handler exists but the group isn't cached yet: queue it. - Poll::Pending => state.fetches.push_back(GroupRequested { - sequence, - priority: options.priority, - }), + // Live miss. Serve from the read-through cache if present (faster than waiting on a + // handler, and the only option when there is none); otherwise keep the live behavior. + other => { + if let Some(cached) = self.cache.as_ref().and_then(|c| c.get(sequence)) { + let timescale = state.info.as_ref().and_then(|info| info.timescale); + drop(state); + return Ok(kio::Pending::new(TrackFetch { + state: self.state.clone(), + sequence, + cached: Some((cached, timescale)), + })); + } + match other { + // Unservable (NotFound) or already aborted: report it synchronously. + Poll::Ready(Err(err)) => return Err(err), + // A handler exists but the group isn't cached yet: queue it. + Poll::Pending => state.fetches.push_back(GroupRequested { + sequence, + priority: options.priority, + }), + Poll::Ready(Ok(_)) => unreachable!("handled above"), + } + } } drop(state); Ok(kio::Pending::new(TrackFetch { state: self.state.clone(), sequence, + cached: None, })) } @@ -1167,12 +1228,19 @@ impl GroupRequest { pub struct TrackFetch { state: kio::Consumer, sequence: u64, + /// A group pre-resolved from the read-through cache, with the track's timescale to rebuild it. 
+ /// When set, the fetch resolves from here instead of polling the live state. + cached: Option<(cache::Group, Option)>, } impl kio::Future for TrackFetch { type Output = Result; fn poll(&self, waiter: &kio::Waiter) -> Poll { + // A cache hit resolves immediately, rebuilding the group at the track's timescale. + if let Some((group, timescale)) = &self.cached { + return Poll::Ready(group.produce(*timescale)); + } // `poll_fetch` already yields a `Result` (group, or NotFound / // abort); the outer error is the channel closing without one. Poll::Ready( @@ -1425,6 +1493,7 @@ impl TrackRequest { TrackConsumer { name: self.name.clone(), state: self.state.consume(), + cache: None, } } @@ -1624,6 +1693,75 @@ mod test { } } + #[test] + fn get_group_falls_through_to_cache() { + let producer = TrackProducer::new("test", None); + // The live track has no groups; a read-through cache holds sequence 42. + let mut writer = cache::Config::default().produce(); + writer.insert(cache::Group { + sequence: 42, + frames: vec![cache::Frame { + timestamp: None, + payload: bytes::Bytes::from_static(b"hi"), + }], + }); + + // Without the cache the group is a miss; with it, it resolves. + assert!(producer.consume().get_group(42).is_none()); + let group = producer.consume().with_cache(writer.consume()).get_group(42); + assert_eq!(group.expect("served from cache").sequence, 42); + } + + #[tokio::test] + async fn fetch_group_serves_from_cache() { + let producer = TrackProducer::new("test", None); + let mut writer = cache::Config::default().produce(); + writer.insert(cache::Group { + sequence: 7, + frames: vec![cache::Frame { + timestamp: None, + payload: bytes::Bytes::from_static(b"data"), + }], + }); + let consumer = producer.consume().with_cache(writer.consume()); + + // No live group 7 and no TrackDynamic: the fetch is served from the cache instead of + // failing with NotFound, and the frame reads back byte-for-byte. 
+ let mut group = consumer.fetch_group(7, None).unwrap().await.unwrap(); + assert_eq!(group.sequence, 7); + assert_eq!( + group.read_frame().await.unwrap().unwrap(), + bytes::Bytes::from_static(b"data") + ); + } + + #[tokio::test] + async fn producer_populates_cache() { + // A producer with a cache drains its finished groups into it; a reader sharing the cache + // then sees them. Producer fills, consumer reads, end to end. + let writer = cache::Config::default().produce(); + let reader = writer.consume(); + let mut producer = TrackProducer::new("test", None).with_cache(writer); + + let mut group = producer.append_group().unwrap(); // seq 0 + group.write_frame(bytes::Bytes::from_static(b"hello")).unwrap(); + group.finish().unwrap(); + + // Let the spawned populate task drain the finished group into the cache. + let mut cached = None; + for _ in 0..100 { + if let Some(group) = reader.get(0) { + cached = Some(group); + break; + } + tokio::task::yield_now().await; + } + + let cached = cached.expect("group populated into cache"); + assert_eq!(cached.frames.len(), 1); + assert_eq!(cached.frames[0].payload, bytes::Bytes::from_static(b"hello")); + } + #[tokio::test] async fn no_eviction_when_fresh() { tokio::time::pause(); From f3dcd611f325905a0863b620013384e8cb151720 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 21:09:40 +0000 Subject: [PATCH 15/25] feat(moq-net): disk and remote cache tiers over object_store Add the cache-tiered feature (off by default so RAM-only and wasm builds stay dependency-free) and cache::store::Store, the object_store glue over the index: - flush(batch): segment::encode the band, put it as one disk segment, record it in the index, then compact. - get(seq): index.locate -> get_range the blob -> segment::group_from_blob. 
- compact(): when the disk tier is over its bounds, read the oldest segments, segment::rollup them into one remote object, apply_promotion to repoint the index, and delete the disk objects; with no remote tier, evict them instead. object_store is added default-features = false (core + memory + local fs, no cloud SDKs). Add Index::evict for the no-remote eviction path. 5 tests against object_store::memory::InMemory: flush/get round-trip, promotion to remote preserving all groups, eviction of the oldest without a remote, plus a non-gated Index::evict unit test. Default suite 428 pass; with cache-tiered, 39 cache tests pass; clippy/rustdoc/fmt and --no-default-features clean. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- Cargo.toml | 1 + rs/moq-net/CACHE.md | 11 +- rs/moq-net/Cargo.toml | 4 + rs/moq-net/src/model/cache/index.rs | 20 +++ rs/moq-net/src/model/cache/mod.rs | 4 + rs/moq-net/src/model/cache/store.rs | 196 ++++++++++++++++++++++++++++ 6 files changed, 232 insertions(+), 4 deletions(-) create mode 100644 rs/moq-net/src/model/cache/store.rs diff --git a/Cargo.toml b/Cargo.toml index c337843dc..a0912031f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,6 +72,7 @@ moq-token = { version = "0.6", path = "rs/moq-token" } # Standalone crate (moq-dev/vaapi); vendored from cros-libva + cros-codecs. 
moq-vaapi = "0.0.2" moq-video = { version = "0.0.4", path = "rs/moq-video" } +object_store = { version = "0.12", default-features = false } qmux = { version = "0.1.3", default-features = false } serde = { version = "1", features = ["derive"] } tokio = "1.48" diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index 887f01a69..c4926b9e6 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -13,11 +13,14 @@ > - the **track wiring**: `cache::Group::read` / `produce` bridge a cached group to/from the live > group model; `TrackProducer::with_cache(cache::Producer)` spawns a subscriber that drains > finished groups into the cache; `TrackConsumer::with_cache(cache::Consumer)` makes `get_group` -> and `fetch_group` resolve from the cache on a live miss. +> and `fetch_group` resolve from the cache on a live miss; +> - the **disk/remote tier I/O** (`store.rs`, behind the `cache-tiered` feature): `cache::store::Store` +> over an `object_store` disk tier and optional remote tier. `flush` encodes a band and `put`s it +> as a disk segment; `get` ranged-reads a located blob; `compact` rolls the oldest disk segments +> up into one remote object (or evicts them with no remote), driven by the index. > -> Still design: the tier **I/O** (object_store `put`/`get_range`/`delete` wiring the index and -> rollup to real storage, feature-gated) and removing the wire field `TrackInfo.cache`. Targets -> `dev`. +> Still design: removing the wire field `TrackInfo.cache`, and threading a `CacheConfig` onto the +> tracks a relay auto-creates (the Origin follow-up). Targets `dev`. A per-track group cache. It lets a relay or edge retain recent groups past the live window and serve them back on a FETCH, optionally spilling to local disk or remote object storage. 
This is diff --git a/rs/moq-net/Cargo.toml b/rs/moq-net/Cargo.toml index f7ca2afba..4f23a8483 100644 --- a/rs/moq-net/Cargo.toml +++ b/rs/moq-net/Cargo.toml @@ -13,6 +13,9 @@ keywords = ["quic", "http3", "webtransport", "media", "live"] categories = ["multimedia", "network-programming", "web-programming"] [features] +# Disk and remote cache tiers (object_store). Off by default so RAM-only and +# wasm builds stay dependency-free. +cache-tiered = ["dep:object_store"] # Legacy no-op: serde is now unconditional (stats publishing requires it). serde = [] @@ -22,6 +25,7 @@ flate2 = { workspace = true } futures = "0.3" kio = { workspace = true } num_enum = "0.7" +object_store = { workspace = true, optional = true } rand = "0.10.1" serde = { workspace = true, features = ["rc"] } serde_json = "1" diff --git a/rs/moq-net/src/model/cache/index.rs b/rs/moq-net/src/model/cache/index.rs index a74bc36b8..d30d927a6 100644 --- a/rs/moq-net/src/model/cache/index.rs +++ b/rs/moq-net/src/model/cache/index.rs @@ -205,6 +205,14 @@ impl Index { self.segments.retain(|id, _| !promoted.contains(id)); new_id } + + /// Drop a set of segments and the group locations pointing at them. Used to evict from the + /// disk tier when there is no remote tier to promote into. 
+ pub fn evict(&mut self, segments: &[SegmentId]) { + let drop: HashSet = segments.iter().copied().collect(); + self.groups.retain(|_, loc| !drop.contains(&loc.segment)); + self.segments.retain(|id, _| !drop.contains(id)); + } } #[cfg(test)] @@ -351,6 +359,18 @@ mod tests { assert_eq!(index.segment_count(Tier::Remote), 1); } + #[test] + fn evict_drops_segments_and_their_locations() { + let mut index = Index::new(); + let a = index.add(Tier::Disk, &Segment::open(encoded(&[group(0, 10, 0)])).unwrap()); + index.add(Tier::Disk, &Segment::open(encoded(&[group(1, 10, 1)])).unwrap()); + + index.evict(&[a]); + assert!(index.locate(0).is_none(), "evicted segment's groups are gone"); + assert!(index.locate(1).is_some(), "other segment untouched"); + assert_eq!(index.segment_count(Tier::Disk), 1); + } + #[test] fn end_to_end_locate_then_read_through_promotion() { // Build disk segments, store their bytes, and verify a located group decodes correctly diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index a8c457c83..1bf7f0944 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -31,6 +31,10 @@ use super::{Timescale, Timestamp}; pub mod index; pub mod segment; +/// Disk and remote tiers backed by object_store. Requires the `cache-tiered` feature. +#[cfg(feature = "cache-tiered")] +pub mod store; + /// A cache bound, as a duration, a byte count, or both (the first to trip wins). /// /// All-`None` means "no threshold": as a high watermark that is unbounded (never flush), as a low diff --git a/rs/moq-net/src/model/cache/store.rs b/rs/moq-net/src/model/cache/store.rs new file mode 100644 index 000000000..e13063f42 --- /dev/null +++ b/rs/moq-net/src/model/cache/store.rs @@ -0,0 +1,196 @@ +//! Disk and remote cache tiers backed by [`object_store`]. +//! +//! A `Store` persists flush bands from the RAM tier as segments (one object each), serves a group +//! 
by ranged-reading its blob, and compacts: once the disk tier is over its bounds, the oldest +//! segments roll up into one remote object (or are evicted if there is no remote tier). All the +//! decisions live in the `index` module; this module is the object_store glue. + +use std::ops::Range; +use std::sync::Arc; + +use object_store::{ObjectStore, PutPayload, path::Path}; + +use super::index::{Index, SegmentId, Tier}; +use super::segment::{self, Segment}; +use super::{Batch, Bounds, Group}; + +/// An error from a tiered [`Store`]. +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum Error { + /// A segment failed to encode or decode. + #[error(transparent)] + Segment(#[from] segment::Error), + /// The backing object store failed. + #[error(transparent)] + Store(#[from] object_store::Error), +} + +/// A tiered durable store: a disk object store, an optional remote one, and the index mapping +/// group sequences to their location. Bands flushed from the RAM tier land here; old disk segments +/// roll up into the remote tier, or are evicted when there is none. +pub struct Store { + disk: Arc, + remote: Option>, + bounds: Bounds, + prefix: Path, + index: Index, +} + +impl Store { + /// Create a store over `disk` with an optional `remote` tier, keyed under `prefix`. `bounds` + /// caps the disk tier; exceeding the high watermark promotes (or evicts) the oldest segments. 
+ pub fn new(disk: Arc, remote: Option>, prefix: Path, bounds: Bounds) -> Self { + Self { + disk, + remote, + bounds, + prefix, + index: Index::new(), + } + } + + fn store_of(&self, tier: Tier) -> &Arc { + match tier { + Tier::Remote => self.remote.as_ref().unwrap_or(&self.disk), + Tier::Disk => &self.disk, + } + } + + fn key(&self, tier: Tier, id: SegmentId) -> Path { + let dir = match tier { + Tier::Disk => "disk", + Tier::Remote => "remote", + }; + self.prefix.child(dir).child(id.to_string()) + } + + /// Persist a flushed band as one disk segment, then compact if the disk tier is over budget. + pub async fn flush(&mut self, batch: Batch) -> Result<(), Error> { + if batch.is_empty() { + return Ok(()); + } + let bytes = segment::encode(&batch)?; + let segment = Segment::open(bytes.clone())?; + let id = self.index.add(Tier::Disk, &segment); + self.disk + .put(&self.key(Tier::Disk, id), PutPayload::from_bytes(bytes)) + .await?; + self.compact().await + } + + /// Fetch a group by sequence: locate it, ranged-read its blob, decode it. `None` if not stored. + pub async fn get(&self, sequence: u64) -> Result, Error> { + let Some(loc) = self.index.locate(sequence) else { + return Ok(None); + }; + let range: Range = loc.offset..loc.offset + loc.length; + let bytes = self + .store_of(loc.tier) + .get_range(&self.key(loc.tier, loc.segment), range) + .await?; + Ok(Some(segment::group_from_blob(sequence, bytes)?)) + } + + /// Bring the disk tier within bounds: roll the oldest segments up into one remote object, or + /// evict them when there is no remote tier. A no-op when the disk tier is within bounds. + pub async fn compact(&mut self) -> Result<(), Error> { + let promoted = self.index.promotion(self.bounds); + if promoted.is_empty() { + return Ok(()); + } + + match self.remote.clone() { + Some(remote) => { + // Read the promoted disk segments whole, roll them into one, write it remotely, + // repoint the index, then delete the disk objects. 
+ let mut segments = Vec::with_capacity(promoted.len()); + for id in &promoted { + let bytes = self.disk.get(&self.key(Tier::Disk, *id)).await?.bytes().await?; + segments.push(bytes); + } + let rolled = segment::rollup(&segments)?; + let new_id = self.index.apply_promotion(&promoted, &Segment::open(rolled.clone())?); + remote + .put(&self.key(Tier::Remote, new_id), PutPayload::from_bytes(rolled)) + .await?; + for id in &promoted { + self.disk.delete(&self.key(Tier::Disk, *id)).await?; + } + } + None => { + // No remote tier: drop the oldest disk segments outright. + for id in &promoted { + self.disk.delete(&self.key(Tier::Disk, *id)).await?; + } + self.index.evict(&promoted); + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::super::{Frame, Limit}; + use super::*; + use bytes::Bytes; + use object_store::memory::InMemory; + + /// A one-frame untimed group of `bytes` bytes at the given sequence. + fn group(sequence: u64, bytes: usize) -> Group { + Group { + sequence, + frames: vec![Frame { + timestamp: None, + payload: Bytes::from(vec![sequence as u8; bytes]), + }], + } + } + + fn memory() -> Arc { + Arc::new(InMemory::new()) + } + + #[tokio::test] + async fn flush_and_get_from_disk() { + let mut store = Store::new(memory(), None, Path::from("cache"), Bounds::default()); + store.flush(vec![group(0, 10), group(1, 20)]).await.unwrap(); + + assert_eq!(store.get(0).await.unwrap().unwrap(), group(0, 10)); + assert_eq!(store.get(1).await.unwrap().unwrap(), group(1, 20)); + assert!(store.get(99).await.unwrap().is_none()); + } + + #[tokio::test] + async fn promotes_to_remote_over_budget() { + // Segments are ~1 KB; keep ~1 in disk (min 1100), promote at 2 (max 2000). 
+ let bounds = Bounds::new(Limit::bytes(1100), Limit::bytes(2000)); + let mut store = Store::new(memory(), Some(memory()), Path::from("cache"), bounds); + + for seq in 0..5 { + store.flush(vec![group(seq, 1000)]).await.unwrap(); + } + + // Every group is still readable, whether it stayed on disk or rolled up to remote. + for seq in 0..5 { + assert_eq!(store.get(seq).await.unwrap().unwrap(), group(seq, 1000)); + } + // Some bytes ended up in the remote tier. + assert!(store.index.bytes(Tier::Remote) > 0); + } + + #[tokio::test] + async fn evicts_oldest_without_remote() { + let bounds = Bounds::new(Limit::bytes(1100), Limit::bytes(2000)); + let mut store = Store::new(memory(), None, Path::from("cache"), bounds); + + for seq in 0..5 { + store.flush(vec![group(seq, 1000)]).await.unwrap(); + } + + // The newest group is retained; the oldest was evicted (no remote to promote into). + assert_eq!(store.get(4).await.unwrap().unwrap(), group(4, 1000)); + assert!(store.get(0).await.unwrap().is_none()); + } +} From b769c662fc01291dd72ee465987fb97878c437c9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 21:09:53 +0000 Subject: [PATCH 16/25] build: update Cargo.lock for object_store Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- Cargo.lock | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 445e2d918..288585479 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4226,6 +4226,7 @@ dependencies = [ "futures", "kio", "num_enum", + "object_store", "rand 0.10.1", "serde", "serde_json", @@ -5345,6 +5346,29 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http", + "humantime", + "itertools 0.14.0", + 
"parking_lot", + "percent-encoding", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "octets" version = "0.3.5" From 71deaf25e9d994edf5222198a4e8f7c72d33a143 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 22:48:45 +0000 Subject: [PATCH 17/25] fix(moq-net): address cache review findings Correctness: - index::promotion never selects the single newest disk segment (mirrors the RAM "keep latest" rule). Without it, an unset low watermark drained the whole disk tier on one over-max trip -> data loss of recent groups on the no-remote eviction path. - store crash-consistency: write the object before mutating the index. flush puts the disk segment under index.next_id() then add()s; compact uploads the rolled remote object before apply_promotion. A failed put/upload now leaves the index (and the disk segments it points at) intact instead of stranding sequences on a nonexistent object. Added Index::next_id() for the peek. - store::get guards offset+length with checked_add (a corrupt footer could overflow u64 and produce a bad range), matching segment::blob. - TrackConsumer::fetch_group rebuilds the cached group synchronously and treats a rebuild error as a miss (falling through to the live path), consistent with get_group. Previously a produce() error (e.g. timescale mismatch on a not-yet-accepted wire consumer) surfaced as a hard fetch failure. TrackFetch now holds the rebuilt GroupConsumer. Cleanup: - store_of expects a configured remote tier for a remote location instead of silently falling back to disk. - index is internal orchestration: gate it behind cache-tiered (its only user is the gated store) and drop it from the public surface; segment::GroupEntry gets #[non_exhaustive]. - Document the two with_cache caveats the review surfaced (the internal subscriber disables demand teardown; a stalled group head-of-line-blocks caching of later finished groups). 
Default suite 427 pass; with cache-tiered, 39 cache tests pass. clippy (both feature sets), rustdoc, and fmt clean. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/src/model/cache/index.rs | 22 +++++++++++--- rs/moq-net/src/model/cache/mod.rs | 6 +++- rs/moq-net/src/model/cache/segment.rs | 1 + rs/moq-net/src/model/cache/store.rs | 27 +++++++++++++---- rs/moq-net/src/model/track.rs | 43 +++++++++++++++++---------- 5 files changed, 73 insertions(+), 26 deletions(-) diff --git a/rs/moq-net/src/model/cache/index.rs b/rs/moq-net/src/model/cache/index.rs index d30d927a6..9b88fe84f 100644 --- a/rs/moq-net/src/model/cache/index.rs +++ b/rs/moq-net/src/model/cache/index.rs @@ -67,6 +67,13 @@ impl Index { Self::default() } + /// The id the next [`add`](Self::add) (or [`apply_promotion`](Self::apply_promotion)) will + /// assign. A caller can put the object under this key *before* recording it, so a failed write + /// leaves the index unchanged. + pub fn next_id(&self) -> SegmentId { + self.next_id + } + /// Record a freshly written `segment` on `tier`, returning its new id. Each group in the /// segment becomes locatable; an already-present sequence is repointed to this segment (this /// is how [`apply_promotion`](Self::apply_promotion) moves sequences to the remote tier). @@ -114,12 +121,14 @@ impl Index { self.groups.get(&sequence).copied() } - /// Total bytes stored in `tier`. + /// Total bytes stored in `tier`. Test-only accounting (promotion uses `stats`). + #[cfg(test)] pub fn bytes(&self, tier: Tier) -> u64 { self.segments.values().filter(|m| m.tier == tier).map(|m| m.bytes).sum() } - /// Number of segments in `tier`. + /// Number of segments in `tier`. Test-only. 
+ #[cfg(test)] pub fn segment_count(&self, tier: Tier) -> usize { self.segments.values().filter(|m| m.tier == tier).count() } @@ -177,6 +186,11 @@ impl Index { /// The oldest disk segments to promote so the disk tier returns within `bounds`. Empty unless /// the disk tier is over its high watermark; otherwise the oldest segments are selected until /// what remains is within the low watermark, oldest first (the order to roll them up in). + /// + /// The single newest disk segment is never selected, mirroring the RAM tier's "keep the latest + /// group" rule. Without this, an unset low watermark (a floor of zero) would drain the whole + /// disk tier on one over-max trip, which for the no-remote eviction path is data loss of even + /// the most recent flushed groups. pub fn promotion(&self, bounds: Bounds) -> Vec { let disk = self.tier_segments(Tier::Disk); if !Self::over_max(self.stats(&disk), bounds.max) { @@ -185,8 +199,8 @@ impl Index { let mut promote = Vec::new(); let mut remaining = disk; - while !remaining.is_empty() && Self::above_min(self.stats(&remaining), bounds.min) { - // Promote the oldest; recompute against what is left. + // Keep at least the newest segment (`len() > 1`), oldest-first. + while remaining.len() > 1 && Self::above_min(self.stats(&remaining), bounds.min) { promote.push(remaining.remove(0)); } promote diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index 1bf7f0944..f6cbd4cc0 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -28,7 +28,11 @@ use bytes::Bytes; use super::{Timescale, Timestamp}; -pub mod index; +// Internal orchestration for the disk/remote tiers; not part of the public surface, and only +// needed (and only compiled) with those tiers. +#[cfg(feature = "cache-tiered")] +mod index; + pub mod segment; /// Disk and remote tiers backed by object_store. Requires the `cache-tiered` feature. 
diff --git a/rs/moq-net/src/model/cache/segment.rs b/rs/moq-net/src/model/cache/segment.rs index 52f868b3b..d84509a66 100644 --- a/rs/moq-net/src/model/cache/segment.rs +++ b/rs/moq-net/src/model/cache/segment.rs @@ -45,6 +45,7 @@ pub enum Error { /// One row of a segment's footer: where a group lives and its summary, without decoding the blob. #[derive(Clone, Debug, PartialEq, Eq)] +#[non_exhaustive] pub struct GroupEntry { /// The group's sequence number within its track. pub sequence: u64, diff --git a/rs/moq-net/src/model/cache/store.rs b/rs/moq-net/src/model/cache/store.rs index e13063f42..cfec6f680 100644 --- a/rs/moq-net/src/model/cache/store.rs +++ b/rs/moq-net/src/model/cache/store.rs @@ -52,7 +52,12 @@ impl Store { fn store_of(&self, tier: Tier) -> &Arc { match tier { - Tier::Remote => self.remote.as_ref().unwrap_or(&self.disk), + // A remote location is only ever recorded when a remote tier is configured (see + // `compact`), so this never falls back. + Tier::Remote => self + .remote + .as_ref() + .expect("a remote location implies a configured remote tier"), Tier::Disk => &self.disk, } } @@ -72,10 +77,13 @@ impl Store { } let bytes = segment::encode(&batch)?; let segment = Segment::open(bytes.clone())?; - let id = self.index.add(Tier::Disk, &segment); + // Write the object before recording it, so a failed put leaves the index unchanged. 
+ let id = self.index.next_id(); self.disk .put(&self.key(Tier::Disk, id), PutPayload::from_bytes(bytes)) .await?; + let added = self.index.add(Tier::Disk, &segment); + debug_assert_eq!(added, id, "index id drifted from the written key"); self.compact().await } @@ -84,7 +92,8 @@ impl Store { let Some(loc) = self.index.locate(sequence) else { return Ok(None); }; - let range: Range = loc.offset..loc.offset + loc.length; + let end = loc.offset.checked_add(loc.length).ok_or(segment::Error::Truncated)?; + let range: Range = loc.offset..end; let bytes = self .store_of(loc.tier) .get_range(&self.key(loc.tier, loc.segment), range) @@ -102,18 +111,24 @@ impl Store { match self.remote.clone() { Some(remote) => { - // Read the promoted disk segments whole, roll them into one, write it remotely, - // repoint the index, then delete the disk objects. + // Read the promoted disk segments whole and roll them into one. let mut segments = Vec::with_capacity(promoted.len()); for id in &promoted { let bytes = self.disk.get(&self.key(Tier::Disk, *id)).await?.bytes().await?; segments.push(bytes); } let rolled = segment::rollup(&segments)?; - let new_id = self.index.apply_promotion(&promoted, &Segment::open(rolled.clone())?); + let rolled_segment = Segment::open(rolled.clone())?; + // Upload the remote object before repointing the index, so a failed put leaves the + // index (still pointing at the disk segments) intact. + let new_id = self.index.next_id(); remote .put(&self.key(Tier::Remote, new_id), PutPayload::from_bytes(rolled)) .await?; + let applied = self.index.apply_promotion(&promoted, &rolled_segment); + debug_assert_eq!(applied, new_id, "index id drifted from the uploaded key"); + // Best-effort cleanup; an index now pointing at remote makes any leftover disk + // objects orphans, not inconsistency. 
for id in &promoted { self.disk.delete(&self.key(Tier::Disk, *id)).await?; } diff --git a/rs/moq-net/src/model/track.rs b/rs/moq-net/src/model/track.rs index 23e268b53..f86e9b01a 100644 --- a/rs/moq-net/src/model/track.rs +++ b/rs/moq-net/src/model/track.rs @@ -695,13 +695,18 @@ impl TrackProducer { /// Fill `cache` with this track's groups as they are produced, so a [`TrackConsumer`] sharing /// the cache (via [`cache::Producer::consume`]) can serve them without a wire fetch. /// - /// Spawns an internal subscriber that drains each finished group into the cache. That - /// subscription keeps the track active while caching, so the track stays alive until it ends - /// or the cache is dropped, independent of downstream demand. + /// Spawns an internal subscriber that drains each finished group into the cache. Two caveats + /// follow from that, both being addressed in the cache design (see `rs/moq-net/CACHE.md`): + /// + /// - The subscriber counts as a consumer, so [`unused`](Self::unused) never resolves while a + /// cache is attached. A relay that drops idle tracks via demand will not drop a cached one; + /// it stays alive until it ends or the producer is dropped. Intended for "keep recording + /// when idle", but it disables demand-driven teardown for this track. + /// - Groups are drained in arrival order and [`cache::Group::read`] resolves only once a group + /// finishes, so a stalled group head-of-line-blocks the caching of later finished ones. pub fn with_cache(self, mut cache: cache::Producer) -> Self { let mut subscriber = self.subscribe(None); web_async::spawn(async move { - // Groups are drained in arrival order; `Group::read` resolves once each finishes. // A drained band (over the RAM watermark) is dropped here until a disk tier consumes it. 
while let Ok(Some(group)) = subscriber.recv_group().await { if let Ok(group) = cache::Group::read(group).await { @@ -1066,16 +1071,23 @@ impl TrackConsumer { match state.poll_fetch(sequence) { // Cached live: the pending resolves immediately from state, no handler needed. Poll::Ready(Ok(_)) => {} - // Live miss. Serve from the read-through cache if present (faster than waiting on a - // handler, and the only option when there is none); otherwise keep the live behavior. + // Live miss. Serve from the read-through cache if it holds the group and it rebuilds + // at the track's timescale (faster than waiting on a handler, and the only option when + // there is none). A rebuild error is treated as a miss, consistent with `get_group`, + // falling through to the live behavior. other => { - if let Some(cached) = self.cache.as_ref().and_then(|c| c.get(sequence)) { - let timescale = state.info.as_ref().and_then(|info| info.timescale); + let timescale = state.info.as_ref().and_then(|info| info.timescale); + if let Some(group) = self + .cache + .as_ref() + .and_then(|c| c.get(sequence)) + .and_then(|g| g.produce(timescale).ok()) + { drop(state); return Ok(kio::Pending::new(TrackFetch { state: self.state.clone(), sequence, - cached: Some((cached, timescale)), + cached: Some(group), })); } match other { @@ -1228,18 +1240,19 @@ impl GroupRequest { pub struct TrackFetch { state: kio::Consumer, sequence: u64, - /// A group pre-resolved from the read-through cache, with the track's timescale to rebuild it. - /// When set, the fetch resolves from here instead of polling the live state. - cached: Option<(cache::Group, Option)>, + /// A group already rebuilt from the read-through cache. When set, the fetch resolves from it + /// instead of polling the live state. + cached: Option, } impl kio::Future for TrackFetch { type Output = Result; fn poll(&self, waiter: &kio::Waiter) -> Poll { - // A cache hit resolves immediately, rebuilding the group at the track's timescale. 
- if let Some((group, timescale)) = &self.cached { - return Poll::Ready(group.produce(*timescale)); + // A cache hit resolves immediately. `poll` returns `Ready` on first call, so the clone + // happens once. + if let Some(group) = &self.cached { + return Poll::Ready(Ok(group.clone())); } // `poll_fetch` already yields a `Result` (group, or NotFound / // abort); the outer error is the channel closing without one. From c3f35dca9477570932c82801177b913e45e7447c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 02:32:07 +0000 Subject: [PATCH 18/25] refactor(moq-net): move cache fully into moq-net; target-gate the tiers Delete rs/moq-archive (a design doc only, never a crate); the cache lives in moq-net now. Scrub the archive references from CACHE.md. Replace the `cache-tiered` feature with target-gating: object_store is a server-side library that doesn't build for wasm, but it needn't be opt-in. Move it to [target.'cfg(not(target_arch = "wasm32"))'.dependencies] and gate the index/store modules with cfg(not(target_arch = "wasm32")). Native builds now get the disk/remote tiers automatically with no feature flag, wasm drops them automatically, and the tier tests run in the default `cargo test` on the host. Default suite 438 pass; clippy/rustdoc/fmt clean. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-archive/DESIGN.md | 509 ------------------------------ rs/moq-net/CACHE.md | 22 +- rs/moq-net/Cargo.toml | 10 +- rs/moq-net/src/model/cache/mod.rs | 8 +- 4 files changed, 20 insertions(+), 529 deletions(-) delete mode 100644 rs/moq-archive/DESIGN.md diff --git a/rs/moq-archive/DESIGN.md b/rs/moq-archive/DESIGN.md deleted file mode 100644 index 903149887..000000000 --- a/rs/moq-archive/DESIGN.md +++ /dev/null @@ -1,509 +0,0 @@ -# moq-archive (design) - -> Status: design proposal, no implementation yet. 
Targets the `dev` branch because it -> builds on the moq-lite-05 FETCH API and media timestamps, both of which only exist there. - -`moq-archive` saves a live MoQ track to durable storage and serves it back on demand. It -sits beside `moq-relay` in the stack: the relay keeps a small in-memory cache for live -fan-out, while the archive is the long-term tier that answers FETCH requests for groups -that have long since aged out of the relay's cache. - -## Goals - -- Record a single `TrackConsumer` to disk and/or object storage, losslessly, frames intact. -- Serve any previously recorded group back through the normal FETCH path, so an unmodified - consumer can re-request old groups without knowing storage exists. -- Tier data across RAM, local disk, and remote object storage (S3/GCS/Azure), with an - independent, optional retention duration per tier. -- Survive out-of-order group delivery (groups arrive on independent QUIC streams and can - complete in any order). -- Avoid one-file-per-group: audio makes a group per frame, so a naive layout would create - thousands of tiny files. Groups are batched into larger segment objects. - -## Non-goals (v1) - -- Whole-broadcast archival. There is currently no generic way to enumerate every track in a - broadcast, so v1 records one track at a time. A broadcast-level wrapper (one recorder per - track, shared root) is a follow-up once track discovery exists. -- Transcoding, repackaging, or media awareness. The archive treats frames as opaque sized - payloads plus an optional timestamp, exactly like the relay. The catalog, container, and - codec layers stay in `hang`. -- Live VOD playback / DVR scrubbing UX. v1 serves groups via FETCH; building a seekable - player on top is downstream work. - -## Prior art - -There is no existing crate for this. A crates.io scan turns up only the `moq-dev` crates -(`moq-net`, `moq-relay`, `moq-mux`, ...), none of which persist tracks; the relay cache is -deliberately RAM-only and ephemeral. 
The IETF MoQ drafts leave durable storage to the -application. So we build it, but we do **not** hand-roll the storage backend: -[`object_store`](https://crates.io/crates/object_store) (0.13, Apache Arrow project) already -provides one trait over AWS S3, GCS, Azure Blob, local filesystem, in-memory, and HTTP, with -first-class byte-range GETs (`get_opts` / `GetRange`) and multipart uploads. That is exactly -the disk + S3 abstraction we need, so both the "disk" and the "S3" tiers are just two -`Arc` instances and serving a group is one ranged GET. - -### Approaches we borrow from - -"Many small finite append-only streams, batched into larger objects with an index, tiered to -object storage" is a well-trodden pattern. We are reassembling established techniques, not -inventing one: - -- **Apache BookKeeper "ledgers"** (Pulsar) are the closest match: a ledger is a finite, - append-only, sealed sequence of entries, exactly a MoQ group; Pulsar offloads sealed ledgers - to S3. A track is a chain of ledgers. -- **Kafka log segments + tiered storage (KIP-405)** already solved "don't make a file per - record": batch records into large segments, each with an offset index, and offload old - segments to S3. Our segment + per-group offset table is the same shape; the lesson stolen is - a *sparse* index for very large archives (open question 5). -- **Facebook Haystack / f4** and **Bitcask** are the small-object fix: pack many small blobs - into few large files with an in-memory `key -> (file, offset, len)` index. Our in-RAM - `BTreeMap` is a Bitcask keydir. -- **Parquet footer + Iceberg manifest** shape our index: a self-describing per-segment footer - plus a per-track manifest that routes a sequence to a segment. - -No single embeddable crate does all of (batch + index + S3 tiering) for opaque streams: the -systems that do (BookKeeper, Kafka, Pravega) are servers, not libraries. 
An embedded LSM -(`fjall`, `redb`, or RocksDB BlobDB) would give batching, an index, and compaction for free but -is local-disk only with **no S3 tier**, which is the whole point here. So `object_store` for -the tiers plus our own thin segment/manifest format is the smallest thing that fits; an -embedded LSM as the *disk-tier engine* remains a viable future swap behind the `Storage` trait. - -## Background: the moq-net model we build on - -The relevant `moq-net` (dev) API, recapped so the design is self-contained: - -**Recording side (read a track):** -- `TrackConsumer::subscribe(None) -> TrackSubscriber`. -- `TrackSubscriber::recv_group() -> Result>` yields every group in - *arrival* order (preserves out-of-order delivery; `next_group()` would skip late arrivals, - which we do not want for an archive). -- `GroupConsumer { sequence }`, `GroupConsumer::read_frame() -> Result>` drains - frames in order; `Frame { size, timestamp: Option }` carries the media timestamp - when present (moq-lite-05+). -- `GroupConsumer::finished() -> Result` resolves once the group is complete, returning - the final frame count. This is our "safe to flush" signal. - -**Serving side (answer fetches):** -- `TrackProducer::dynamic() -> TrackDynamic`. -- `TrackDynamic::requested_group() -> Result` blocks until a consumer FETCHes a - group that is not in the producer's cache. While any `TrackDynamic` handle is alive, the - miss waits to be served instead of failing fast with `NotFound`. -- `GroupRequest::sequence() -> u64`, `GroupRequest::accept(info) -> GroupProducer`. We fill the - returned `GroupProducer` with `create_frame` / `write` / `finish` from storage, then - `GroupProducer::finish()`. - -The archive's serving entry point is therefore a **`TrackDynamic`, not a `TrackProducer`**: -the caller owns the `TrackProducer` (and publishing the broadcast into a session via -`BroadcastProducer` / `OriginProducer::publish_broadcast`), calls `.dynamic()`, and hands the -archive the request side. 
This makes the archive one composable link in a fallback chain -rather than a thing that owns the track. - -**The archive is a link in a cache chain.** The caller decides where it sits. For example -`moq-relay` would try the archive first for any dynamic request, and the archive answers from -RAM/disk/S3. On a miss, the request must fall through to the **original publisher** (the live -origin), because a group might never have reached storage. So the archive needs both an -incoming `TrackDynamic` (requests from downstream) *and* a downstream handle to forward misses -to (its own `TrackDynamic` over the upstream track, which it also records). The chain is: - -``` -consumer FETCH -> relay cache -> moq-archive (RAM -> disk -> S3) -> origin publisher -``` - -> **Open architectural question (from review):** this cache-fallback-plus-record behavior -> could instead live *inside* `moq_net::TrackProducer` / `TrackConsumer` themselves. That is a -> friendlier API (the archive becomes a storage backend you attach, not a chain you wire) and -> moq-net would then know precisely when a group is evicted from its RAM cache, which is the -> natural trigger to flush. The cost is baking storage concerns into the core wire types, which -> feels out of place there. v1 keeps the logic in `moq-archive` and treats moq-net integration -> as a follow-up; flagged here because it shapes the public API. - -The two directions stay decoupled: recording needs a `TrackConsumer`, serving needs a -`TrackDynamic` (plus an upstream handle for miss fallback). They share only the storage layer, -so an archive node can do either or both. 
- -## Architecture - -``` - record (TrackConsumer) serve (TrackProducer + TrackDynamic) - | ^ - v | - +------------------------------------+ +------------------------------+ - | Writer | | Reader | - | - drain groups in arrival order | | - on GroupRequest(seq): | - | - buffer in RAM until finished() | | look up seq in Index | - | - batch completed groups | | ranged GET the segment | - | - flush segment + index entries | | parse frames, stream out | - +------------------------------------+ +------------------------------+ - | ^ - v | - +-------------------------------------------------------------------------------------+ - | Storage (tiered) | - | RAM ring --(flush)--> disk store --(age + aggregate)--> S3 store | - | Index: group seq -> (tier, object key, byte offset, length, frame count, ts span) | - +-------------------------------------------------------------------------------------+ -``` - -Two halves, joined by a `Storage` abstraction and an `Index`: - -### Writer (ingest) - -1. Subscribe to the source `TrackConsumer` and loop on `recv_group()`. -2. For each group, spawn/track a buffer that drains `read_frame()` into an in-RAM - `BufferedGroup { sequence, frames: Vec<(Option, Bytes)> }`. Because groups - arrive concurrently, several buffers are open at once, keyed by sequence. -3. When a group's `finished()` resolves, mark it flushable. Incomplete groups never leave RAM - (we cannot serve a half-group). -4. A flusher batches flushable groups into one **segment object** (footer included) when a - threshold trips: a byte-size target, the RAM time window, or a group going `unused()` early. - Batching is what keeps audio from making a file per frame. The latest group and any `used()` - groups stay in RAM. -5. Tier maintenance runs on a timer: roll up (concatenate several disk segments into one larger - S3 object, rewriting the manifest), then LRU/age-evict and delete objects past each tier's - budget. - -### Reader (egress / serve) - -1. 
Take the incoming `TrackDynamic` (the caller owns the `TrackProducer` and publishes it). - Optionally hold a downstream handle to the upstream origin for miss fallback. -2. Loop on `TrackDynamic::requested_group()`. For each `GroupRequest(seq)`: - - Look up `seq` in the `Index` to find `(store, object key, offset, length)`. - - `store.get_opts(key, GetRange::Bounded(offset..offset+length))` -> the segment slice for - that one group (a single range request, S3-friendly). - - Parse the group's frames, `request.accept()`, and stream them into the `GroupProducer`. - - On a miss (not in RAM/disk/S3), forward the request to the upstream origin if present, - relaying (and recording) the result. Only when nothing upstream can satisfy it does the - request resolve to `NotFound`. - - On a miss (seq never recorded or already evicted) reject with `Error::NotFound`. - -Reader and Writer are independent tasks sharing `Storage`; an archive process can run one or -both. The in-RAM tier doubles as a serving cache: a FETCH for a still-buffered recent group is -served from memory without touching disk. - -## Storage layout - -Everything is an `object_store` key, so the same code paths work for a local dir -(`LocalFileSystem`) and a bucket (`AmazonS3`). Proposed key scheme, rooted at a configurable -prefix and namespaced by broadcast/track: - -``` -///segments/ # concatenated groups -///manifest # append-only list of segments (see Index format) -``` - -**Broadcast and track names contain slashes** (they are themselves path-shaped, e.g. -`room/alice/camera`). `object_store` paths are `/`-delimited, so the raw name would explode -into spurious directory levels and collide (`a/b` + `c` vs `a` + `b/c`). Percent-encode each -name as a single, reversible path segment before use (encode `/` and any other delimiter), so -`` / `` are opaque components. This keeps `list`-by-prefix working per -broadcast and lets us recover the original names on restart. 
- -### Segment format - -A segment is a concatenation of groups. Each group is self-delimiting so a ranged GET of just -its slice is independently parseable: - -``` -group := group_header frame* -group_header := varint(sequence) varint(frame_count) -frame := varint(size) flags ts? payload[size] - flags: 1 byte; bit0 = timestamp present - ts: varint(zigzag delta vs previous frame ts in this group) # when bit0 set -``` - -This mirrors moq-net's own frame coding (size-prefixed, optional zigzag-delta timestamp) so -there is no information loss across a record/serve round-trip. The varint/zigzag helpers are -small; if moq-net's `coding` module is made `pub(crate)`-exportable we reuse it, otherwise a -~30-line local copy (the wire format is stable). Frame payloads are stored verbatim; -compression is a later option. - -### Index format (nailed down) - -Two levels, modeled on Parquet's self-describing footer plus an Iceberg-style manifest. This -avoids a separate `.idx` object per segment (which would reintroduce the small-object problem) -and makes each segment independently recoverable. - -**1. Per-segment footer.** Each segment object ends with its own group table plus a fixed -trailer, so the segment is self-describing: given only the object you can find every group in -it. The table is one record per group: - -```rust -struct GroupEntry { - sequence: u64, // group sequence (NOT necessarily contiguous or sorted) - offset: u64, // byte offset of the group within this segment - length: u64, // byte length of the group - frames: u32, // frame count (validates / sizes the GroupProducer) - ts_first: Option, // media timestamp span (retention + future seeking) - ts_last: Option, - received: u64, // wall-clock ms at completion; retention fallback when ts absent -} - -// segment := group* footer -// footer := postcard(Vec) u32(footer_len) u32(magic) -``` - -**2. 
Per-track manifest.** One append-only object per track listing its segments, so the -reader can route a sequence to a segment without opening every segment footer: - -```rust -struct ManifestEntry { - segment: SegmentId, // object key (relative to the track prefix) - tier: Tier, // Disk | S3 (RAM segments are not in the manifest) - seq_min: u64, seq_max: u64, // sequence range covered (groups out of order, so a range, not a set) - ts_min: Option, ts_max: Option, -} -``` - -**Encoding:** `postcard` for both (compact, `serde`, no schema server; chosen over JSONL so -the footer is fixed-shape and the manifest stays small for multi-day archives). The trailer's -`footer_len` + `magic` let the reader fetch the footer with one tail range GET -(`GetRange::Suffix`) without knowing its size up front. - -**Runtime + recovery:** on startup the reader reads each track manifest, building an in-RAM -`BTreeMap` for O(log n) seq lookup (segment footers are fetched -and cached lazily on first hit). Because groups complete out of order, footer entries are in -completion order; the `BTreeMap` makes lookup order-independent. The manifest is the routing -index; segment footers are the source of truth and let us rebuild a manifest by `list` + -tail-read if one is ever lost. - -## Tiering and retention - -Three tiers, RAM -> disk -> S3, each optional. The key idea (from review): **each rollup step -merges multiple units from the tier above into one larger object, so fragmentation decreases -as data moves down.** RAM can be highly fragmented (one buffer per group, audio makes many); -disk segments coalesce a window of groups; S3 objects coalesce a window of disk segments. - -| Tier | Backed by | Holds for | Flushes downward in | -|------|-----------|-----------|---------------------| -| RAM | in-process buffers | up to e.g. 30s | 10s segments to disk | -| Disk | `object_store` `LocalFileSystem` | up to e.g. 5m | 1m batches to S3 | -| S3 | `object_store` `AmazonS3` (GCS/Azure) | up to e.g. 
30d (or forever) | n/a (final tier) | - -So a group is written many times to RAM individually, rewritten once into a 10s disk segment, -then several disk segments are concatenated into one 1m S3 object. This directly resolves the -old "1:1 copy vs concatenate" open question in favor of **concatenate at every rollup**. - -### Eviction: LRU + size budget, not just age - -Each tier has both a **max age** and a **size budget**. Within a tier, evict **least-recently- -used** first (an LRU keyed by group/segment), capped by the budget; the max age is an upper -bound layered on top. LRU is the right default for both RAM and disk because serving traffic -is bursty and skewed (a re-fetched group is likely to be re-fetched again). Pure-RAM mode (no -disk, no S3) is then a bounded LRU/DVR window rather than a strict ring buffer. - -Two refinements from review: - -- **Always keep the latest group in RAM**, exempt from LRU/age eviction. New subscribers and - the live edge need it immediately, and it is the one group most likely to be requested next. -- **Use moq-net's `used()` / `unused()` group state to flush early.** A group that has gone - `unused` (no active consumer interested) earns nothing by staying in RAM, so fold it into the - next disk flush instead of waiting out the full RAM window. `used()` groups stay hot. This - makes the RAM age a cap, not a fixed delay, and reclaims memory under churn. - -### Retention clock - -For the max-age bound, prefer the group's **media timestamp** (`ts_last`) when present -(moq-lite-05), else fall back to the **wall-clock `received` time**. A tier is *enabled* when -its store is configured; "S3 forever" is `s3.store = Some` with no max age. Data flows strictly -downward, so disabling the middle tier flushes RAM straight to S3. Segments/objects are deleted -whole once every group they contain has aged out, so the rollup batch granularity bounds how -long one live group pins an object. 
- -All tiers live in a `Storage` struct shared by the writer, reader, and a single periodic -maintenance task that does the three jobs: roll up (merge + promote), LRU/age evict, delete. - -## Public API sketch - -Smallest surface that does the job, per the repo's public-API guidance. One insulated entry -point per direction, plus a `#[non_exhaustive]` config built via `Default`. - -```rust -/// Per-tier sizing. Build via `Config::default()` then set fields. -#[derive(Clone, Debug, Default)] -#[non_exhaustive] -pub struct Config { - pub ram: RamConfig, // memory window + budget; always keeps the latest group - pub disk: Option, // LocalFileSystem store - pub s3: Option, // remote object_store; no max_age -> keep forever -} - -#[derive(Clone, Debug)] -#[non_exhaustive] -pub struct TierConfig { - pub store: Arc, - pub prefix: object_store::path::Path, - pub max_age: Option, // upper bound; None on S3 means forever - pub budget: Option, // byte budget, LRU-evicted when exceeded - pub rollup: Duration, // window of upstream units merged into one object here -} - -/// An archive for a single track, over shared tiered storage. -pub struct Archive { /* Storage + Index */ } - -impl Archive { - pub fn new(config: Config) -> Result; - - /// Record a live track until it ends or errors. Drains groups, batches, flushes. - pub async fn record(&self, track: TrackConsumer) -> Result<()>; - - /// Answer FETCH requests from storage. Takes the request side of a track the caller owns - /// and publishes; on a storage miss, forwards to `upstream` (the live origin) if given, - /// otherwise resolves the request to `NotFound`. - pub async fn serve( - &self, - requests: TrackDynamic, - upstream: impl Into>, - ) -> Result<()>; -} -``` - -`serve` takes a `TrackDynamic`, not a `TrackProducer`: the caller owns the producer and -publishes the broadcast into a session, so the archive is a composable link in the cache chain -(relay -> archive -> origin) rather than the owner of the track. 
`upstream` is the miss- -fallback handle and, when recording the same track, the source the writer drains. Per the repo -convention it is `impl Into>`, so callers pass the consumer or `None`. - -## Usage mockups - -These compile-in-spirit sketches answer "how is this actually called?" and surface a real gap: -the per-track API above serves a **standalone / VOD** node cleanly, but `moq-relay` has no -seam to use it, which is the argument for the moq-net integration below. - -### A. Standalone VOD node (works with the per-track API) - -A node that recorded a broadcast earlier and now serves it back. Here the archive *is* the -publisher: the live broadcast is gone, so there is no path collision and the per-track API -fits. Recording is symmetric (`origin.consume()` -> `BroadcastConsumer::track` -> -`Archive::record`). - -```rust -let archive = Archive::new(config)?; - -// Publish a broadcast; its tracks answer FETCH from storage. -let broadcast = BroadcastInfo::new().produce(); -origin.publish_broadcast("vod/room-alice", broadcast.consume())?; - -// For each track a downstream subscriber asks for, serve it from storage. -let mut tracks = broadcast.dynamic(); -while let Ok(request) = tracks.requested_track().await { - let producer = request.accept(TrackInfo::default())?; // caller owns the producer - let requests = producer.dynamic(); // archive gets the request side - tokio::spawn(archive.serve(requests, None)); // no upstream: pure VOD -} -``` - -### B. Why `moq-relay` cannot use the per-track API as-is - -The relay is built entirely around a single `OriginProducer` (`Cluster::origin`). Remote -publishers `publish_broadcast` into it; downstream sessions read `origin.consume()`. The relay -code never constructs a `TrackProducer` and never calls `.dynamic()` on a track. 
That happens -*inside* moq-net's session fan-out (`lite::subscriber` / `ietf::subscriber`), which creates the -per-track producer and its `TrackDynamic` to forward a downstream cache-miss FETCH upstream. - -So there is no point in `moq-relay` where you could write `archive.serve(track_dynamic, ...)`: -the relay operates one layer up, at the broadcast/origin granularity, and the track objects the -archive needs only exist transiently deep inside moq-net. Wiring the per-track API into the -relay would mean interposing on every track of every forwarded broadcast (republishing each -broadcast through an archive-owned `BroadcastProducer`, re-`accept`ing every `requested_track`, -re-`subscribe`ing upstream), i.e. reimplementing the relay's fan-out around the archive. That -is the "wire a fallback chain" cost, and it is large. - -### C. The moq-net seam: a concrete per-track cache (no trait, no callback) - -Rather than a `Cache` trait moq-net calls back into (rejected in review: no inversion of -control), the storage lives in a concrete `CacheConfig` value owned by each `TrackProducer`. -moq-net retains and serves groups itself, spilling to disk or remote object storage per the -config. The relay attaches a config; the archive crate just provides the tier setup and reads -the spilled segments. See [`../moq-net/CACHE.md`](../moq-net/CACHE.md) for the full spike. - -```rust -// moq-net (new): local cache policy, per-track, never on the wire. -let config = moq_net::CacheConfig { - ram: bounds(20.s(), 30.s()), // keep 20-30s in RAM - disk: Some(disk_tier(path, 4.m(), 5.m())), - remote: Some(remote_tier(s3, 30.days())), - ..Default::default() -}; - -// each track the relay creates: producer.with_cache(config.clone()) -``` - -Retention is per-track `[min, max]` bounds with a watermark flush (the `max - min` band becomes -one segment, so audio does not produce a file per frame). 
There is no eviction callback: the -archive learns what to persist because moq-net writes the spilled segments directly in the -shared on-tier format. This keeps the relay declarative (config, not loops) and keeps storage -concerns out of moq-net's behavior surface. Decision settled; the open API work is threading one -config onto the tracks moq-net auto-creates during fan-out (the Origin follow-up). - -## Binary - -`moq-archive` (the binary) wires the library to a relay, mirroring `moq-cli`: - -- clap config, TOML-loadable. Every `#[arg]` field is `Option` so the TOML->CLI merge does - not clobber file values with `Default` (repo rule; add the regression test like - `moq-relay`). Durations use `humantime-serde`. -- Subcommands: `record --url --broadcast --track ` connects, subscribes, - and records; `serve ...` connects, publishes, and answers fetches. A combined mode runs both. -- Storage flags map onto `object_store` builders: `--disk `, `--s3-url s3://bucket/prefix` - (+ standard AWS env for creds). Each tier takes a max age, a byte budget, and a rollup window, - e.g. `--ram-age 30s --disk-age 5m --disk-rollup 10s --s3-age 30d --s3-rollup 1m`. - -## Out-of-order handling (why it is first-class) - -Groups ride independent QUIC streams, so sequence 7 can finish before sequence 5. The writer -therefore keeps a map of open buffers and only flushes a group on its own `finished()`; it -never assumes contiguity. The index is keyed by sequence but appended in completion order, and -the reader's `BTreeMap` makes lookup order-independent. FETCH is inherently random-access -(consumer asks for an arbitrary old seq), so the read path has no ordering assumptions either. -Sequence gaps (a group that was lost upstream and never recorded) are legal: a FETCH for a gap -returns `NotFound`. - -## Open questions - -1. 
**moq-net integration.** Settled on a concrete per-track `CacheConfig` owned by - `TrackProducer` (no trait, no callback); see scenario C and [`../moq-net/CACHE.md`](../moq-net/CACHE.md). - The remaining work is on the moq-net side: threading one config onto the tracks moq-net - auto-creates during fan-out, removing `TrackInfo.cache`, and (separately) the Origin split - that lets a relay register dynamic broadcast/track handlers. -2. **Restart/recovery.** Rebuild the in-RAM `BTreeMap` from each track manifest on startup; - refetch segment footers lazily. Crash-consistency: write the segment object (footer last) - before appending to the manifest, so a half-written segment is simply unreferenced and a - startup `list` sweep GCs any segment missing from the manifest. -3. **Sub-group FETCH.** Earlier drafts let a FETCH start at frame K within a group. This is - likely being **removed from moq-lite-05** and the current API does not support it, so the - archive serves whole groups only. If it returns, the cheap path (ranged-GET the group, skip - K frames in memory; groups are bounded at 32 MB) suffices before adding per-frame offsets. -4. **Serving the live edge.** Keeping the latest group in RAM plus upstream miss-fallback lets - the archive answer recent FETCHes and stand in for a departed origin. A full live - `subscribe` replay (DVR-style) is a possible follow-up beyond FETCH. -5. **Very large archives.** The manifest + lazily-cached footers handle a multi-day archive, - but a months-long one may want manifest sharding (per time bucket) so startup does not read - the whole thing. Defer until the single-manifest form is measured. -6. **Backpressure.** If storage is slower than ingest, do we drop oldest buffered groups - (bounded memory, lossy) or apply backpressure to the subscription? Recommend a bounded RAM - budget that LRU-drops oldest *completed-but-unflushed* groups and records the gap, never - blocking live ingest. 
- -## Testing plan - -- Unit: segment encode/decode round-trip, including absent vs present timestamps and - zigzag-delta edges; index append + reload; out-of-order completion ordering. -- Storage: run the full record -> flush -> serve loop against `object_store`'s in-memory and - `LocalFileSystem` backends (no network needed). Use `tokio::time::pause()` for retention/tier - timers per the repo's async-test rule. -- Integration: record a synthetic track (audio-shaped: one frame per group), serve it back via - FETCH, assert byte-exact frames and timestamps. Confirm a FETCH for an evicted/gap sequence - returns `NotFound`. -- Config: TOML<->CLI merge regression test (the `Option` flag rule). - -## Cross-package sync - -Per the repo's sync table, a new standalone crate that only *consumes* `moq-net`'s public API -needs no wire/catalog changes. Touch points: - -- Add `rs/moq-archive` to the workspace `members` / `default-members` in the root `Cargo.toml`. -- New docs page under `doc/bin/` for the binary (and a `doc/concept/` note on the archive tier - relative to the relay cache). -- If we end up needing moq-net's `coding` varint/zigzag helpers, that is a small additive - `pub` export in `moq-net` (non-breaking), to avoid duplicating the wire codec. -``` diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index c4926b9e6..c4b0a969c 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -14,7 +14,7 @@ > group model; `TrackProducer::with_cache(cache::Producer)` spawns a subscriber that drains > finished groups into the cache; `TrackConsumer::with_cache(cache::Consumer)` makes `get_group` > and `fetch_group` resolve from the cache on a live miss; -> - the **disk/remote tier I/O** (`store.rs`, behind the `cache-tiered` feature): `cache::store::Store` +> - the **disk/remote tier I/O** (`store.rs`, native-only via target-gating): `cache::store::Store` > over an `object_store` disk tier and optional remote tier. 
`flush` encodes a band and `put`s it > as a disk segment; `get` ranged-reads a located blob; `compact` rolls the oldest disk segments > up into one remote object (or evicts them with no remote), driven by the index. @@ -23,9 +23,8 @@ > tracks a relay auto-creates (the Origin follow-up). Targets `dev`. A per-track group cache. It lets a relay or edge retain recent groups past the live window and -serve them back on a FETCH, optionally spilling to local disk or remote object storage. This is -the moq-net mechanism the `moq-archive` crate builds on (see `../moq-archive/DESIGN.md`); the -on-tier byte format is shared with that crate. +serve them back on a FETCH, optionally spilling to local disk or remote object storage. It lives +in `moq-net` so any consumer of a track (relay, edge, archiver) gets durable caching for free. The implemented surface follows moq-net's produce/consume split: `cache::Config::produce()` yields a `cache::Producer` (the write half, not `Clone`), and `Producer::consume()` yields a @@ -154,11 +153,10 @@ access. ## Tiers -RAM is always present and dependency-free. disk and remote are `object_store`, behind a -`cache-tiered` feature flag so RAM-only native builds (and any wasm consumers) do not pull the -cloud stack. The on-tier bytes reuse the `moq-archive` segment plus manifest format, so the -cache and the archive crate agree byte-for-byte and a relay's spilled data is directly readable -by an archive node. +RAM is always present and dependency-free. disk and remote are `object_store`, target-gated to +non-wasm targets (`cfg(not(target_arch = "wasm32"))`) so native builds get the tiers with no flag +and wasm builds drop the server-side cloud stack automatically. The on-tier byte format is the +`segment` module's, so a relay's spilled data is directly readable by anything using this cache. 
## Attaching to a producer or a consumer @@ -295,7 +293,7 @@ pub struct CacheArgs { ```text moq serve --broadcast bbb --cache-ram 30s --cache-disk /var/cache/moq --cache-disk-age 5m \ - --cache-remote s3://moq-archive/bbb --cache-remote-age 30d fmp4 < bbb.mp4 + --cache-remote s3://moq-cache/bbb --cache-remote-age 30d fmp4 < bbb.mp4 ``` and converts to a `cache::Config` whose halves go to each endpoint: @@ -322,8 +320,8 @@ the same flags under its `Option` clobber rule. ## Open questions -1. **object_store in moq-net.** Feature-gate `cache-tiered`; RAM-only stays dependency-free. - This is the one heavy dependency decision, since moq-net is the core wire crate. +1. **object_store in moq-net.** Resolved: target-gated to `cfg(not(target_arch = "wasm32"))`, so + native always has the tiers and wasm drops them, with no opt-in feature. 2. **Async get.** RAM hits must stay synchronous (serve under the lock); only disk and remote faults are async. The return type needs a "ready now or pending" shape, matching moq-net's existing `kio::Pending`. diff --git a/rs/moq-net/Cargo.toml b/rs/moq-net/Cargo.toml index 4f23a8483..721dcd528 100644 --- a/rs/moq-net/Cargo.toml +++ b/rs/moq-net/Cargo.toml @@ -13,9 +13,6 @@ keywords = ["quic", "http3", "webtransport", "media", "live"] categories = ["multimedia", "network-programming", "web-programming"] [features] -# Disk and remote cache tiers (object_store). Off by default so RAM-only and -# wasm builds stay dependency-free. -cache-tiered = ["dep:object_store"] # Legacy no-op: serde is now unconditional (stats publishing requires it). 
serde = [] @@ -25,7 +22,6 @@ flate2 = { workspace = true } futures = "0.3" kio = { workspace = true } num_enum = "0.7" -object_store = { workspace = true, optional = true } rand = "0.10.1" serde = { workspace = true, features = ["rc"] } serde_json = "1" @@ -35,6 +31,12 @@ tracing = "0.1" web-async = { workspace = true } web-transport-trait = { workspace = true } +# The disk/remote cache tiers use object_store, a server-side library that doesn't build for +# wasm (browsers don't spill to disk/S3). Target-gated rather than feature-gated so native builds +# get the tiers automatically with no flag, and wasm builds drop them automatically. +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +object_store = { workspace = true } + [dev-dependencies] # test-util (tokio::time::pause/advance) is test-only and is NOT supported on # wasm, so it must not leak into the normal dependency feature set. diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index f6cbd4cc0..87ddf911e 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -29,14 +29,14 @@ use bytes::Bytes; use super::{Timescale, Timestamp}; // Internal orchestration for the disk/remote tiers; not part of the public surface, and only -// needed (and only compiled) with those tiers. -#[cfg(feature = "cache-tiered")] +// needed (and only buildable) where object_store is available. +#[cfg(not(target_arch = "wasm32"))] mod index; pub mod segment; -/// Disk and remote tiers backed by object_store. Requires the `cache-tiered` feature. -#[cfg(feature = "cache-tiered")] +/// Disk and remote tiers backed by object_store. Native-only (object_store doesn't build on wasm). +#[cfg(not(target_arch = "wasm32"))] pub mod store; /// A cache bound, as a duration, a byte count, or both (the first to trip wins). 
From 9e53d9f18a48cdb12ee67d70165dccb2e3363aa6 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 02:39:13 +0000 Subject: [PATCH 19/25] feat(moq-net): wire the disk/remote tiers into the cache Config cache::Config gains an optional `disk: Disk` tier (native-only): an object_store, a key prefix, retention bounds, and an optional remote rollup store. Build with Disk::new(...).with_remote(...). Config::produce() now spawns a background task (when a disk tier is set) that drains RAM-evicted bands through an mpsc channel into a shared store::Store (behind a tokio RwLock so reads run concurrently with the flusher). Producer::insert hands each evicted band to that task instead of returning it. Consumer gains an async fetch() that reads RAM, then disk, then remote (get() stays a sync RAM-only lookup). State now holds the RAM Bounds directly rather than the whole Config. So a cache with a disk tier now actually spills to and serves from disk/remote with no extra wiring, on native; wasm builds drop the tier fields entirely. Test: a Producer with an InMemory disk tier, insert past the RAM watermark, and the consumer fetches the evicted group back from disk. Default suite 439 pass; clippy/rustdoc/fmt and --no-default-features clean. CACHE.md updated to match. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/CACHE.md | 49 +++++--- rs/moq-net/src/model/cache/mod.rs | 185 ++++++++++++++++++++++++++---- 2 files changed, 190 insertions(+), 44 deletions(-) diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index c4b0a969c..727a3b680 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -17,10 +17,17 @@ > - the **disk/remote tier I/O** (`store.rs`, native-only via target-gating): `cache::store::Store` > over an `object_store` disk tier and optional remote tier. 
`flush` encodes a band and `put`s it > as a disk segment; `get` ranged-reads a located blob; `compact` rolls the oldest disk segments -> up into one remote object (or evicts them with no remote), driven by the index. +> up into one remote object (or evicts them with no remote), driven by the index; +> - the **tier wiring**: `cache::Config` carries an optional `disk: Disk` tier (an `object_store`, +> a key prefix, bounds, and an optional remote rollup store; native-only). `Config::produce` +> spawns a background task that flushes RAM-evicted bands to the disk tier, and +> `Consumer::fetch` (async) reads across RAM -> disk -> remote (`Consumer::get` stays a sync +> RAM-only lookup). > -> Still design: removing the wire field `TrackInfo.cache`, and threading a `CacheConfig` onto the -> tracks a relay auto-creates (the Origin follow-up). Targets `dev`. +> Still design: removing the wire field `TrackInfo.cache`; threading a `Config` onto the tracks a +> relay auto-creates (the Origin follow-up); and serving disk/remote through a track's +> `fetch_group` (today the sync track API serves RAM; the relay serve loop uses `Consumer::fetch` +> for the lower tiers). Targets `dev`. A per-track group cache. It lets a relay or edge retain recent groups past the live window and serve them back on a FETCH, optionally spilling to local disk or remote object storage. It lives @@ -63,8 +70,18 @@ Per track, on both size and duration, whichever trips first: #[derive(Clone, Debug, Default)] #[non_exhaustive] pub struct Config { - pub ram: Bounds, // keep >= min in RAM; flush the band once > max - // forthcoming: disk + remote tiers (object_store, feature-gated) and an interval backstop. + pub ram: Bounds, // keep >= min in RAM; flush the band once > max + #[cfg(not(target_arch = "wasm32"))] + pub disk: Option, // optional spill tier (native-only) +} + +/// The disk spill tier: an object store, key prefix, bounds, and an optional remote rollup store. 
+#[cfg(not(target_arch = "wasm32"))] +pub struct Disk { + pub store: Arc, + pub prefix: object_store::path::Path, + pub bounds: Bounds, // disk retention; over it rolls up to `remote` + pub remote: Option>, } /// A low/high watermark. The gap (max - min) is the flush batch size. @@ -75,17 +92,12 @@ pub struct Bounds { pub min: Limit, pub max: Limit } pub struct Limit { pub duration: Option, pub bytes: Option } ``` -The implemented `cache::Config` has only `ram` so far (it is `#[non_exhaustive]`, so adding -`disk` / `remote` / `interval` later is additive). The forthcoming tier shape: - -```rust -// forthcoming -pub struct Tier { pub store: /* path or url */ (), pub bounds: Bounds } -// Config gains: disk: Option, remote: Option, interval: Option -``` +`Config::produce()` builds the RAM tier and, when `disk` is set, spawns a background task that +flushes evicted bands to it. `Producer::insert` hands evicted bands to that task; `Consumer::fetch` +(async) reads RAM then disk then remote. An `interval` flush backstop is still a possible addition +(the `#[non_exhaustive]` Config keeps it additive). -The flush batch is implicitly `max - min`, so the bounds map straight onto the tiering the -archive doc describes: +The flush batch is implicitly `max - min`, so the bounds map straight onto the tiering: | Want | Set | |---|---| @@ -174,16 +186,15 @@ writer.insert(group); // -> Option (band to persist to the nex reader.get(sequence); // -> Option ``` -The forthcoming track wiring hands each endpoint the matching half: +The track wiring hands each endpoint the matching half: ```rust -// forthcoming impl TrackProducer { - /// Fill `cache` with groups this producer creates and serve them on a miss. + /// Fill `cache` with groups this producer creates (spawns a populate subscriber). pub fn with_cache(self, cache: cache::Producer) -> Self; } impl TrackConsumer { - /// Back fetch_group / get_group with `cache`: hits resolve locally. 
+ /// Back fetch_group / get_group with `cache`: RAM hits resolve locally. pub fn with_cache(self, cache: cache::Consumer) -> Self; } ``` diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index 87ddf911e..6d0cfe061 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -28,6 +28,9 @@ use bytes::Bytes; use super::{Timescale, Timestamp}; +#[cfg(not(target_arch = "wasm32"))] +use object_store::{ObjectStore, path::Path}; + // Internal orchestration for the disk/remote tiers; not part of the public surface, and only // needed (and only buildable) where object_store is available. #[cfg(not(target_arch = "wasm32"))] @@ -96,18 +99,71 @@ impl Bounds { pub struct Config { /// Bounds on the RAM tier. pub ram: Bounds, - // Disk and remote tiers are forthcoming (object_store-backed, feature-gated). + /// Optional disk (and remote) spill tier. When set, bands evicted from RAM are flushed here. + /// Native-only: the field is absent on wasm, where object_store does not build. + #[cfg(not(target_arch = "wasm32"))] + pub disk: Option, } impl Config { - /// Build a [`Config`] with the given RAM bounds. + /// Build a [`Config`] with the given RAM bounds and no spill tier. pub fn new(ram: Bounds) -> Self { - Self { ram } + Self { + ram, + #[cfg(not(target_arch = "wasm32"))] + disk: None, + } } - /// Start an empty cache with this policy, returning its write half. + /// Start an empty cache with this policy, returning its write half. If a disk tier is + /// configured, spawns a background task that flushes evicted RAM bands to it. 
pub fn produce(self) -> Producer { - Producer::new(self) + let state = Arc::new(Mutex::new(State { + bounds: self.ram, + ram: BTreeMap::new(), + ram_bytes: 0, + })); + Producer { + state, + #[cfg(not(target_arch = "wasm32"))] + tiers: self.disk.map(Tiers::spawn), + } + } +} + +/// The disk spill tier: an object store, a key prefix, retention bounds, and an optional remote +/// store the disk tier rolls up into. Native-only (`object_store` does not build on wasm). Build +/// with [`Disk::new`], optionally [`with_remote`](Disk::with_remote). +#[cfg(not(target_arch = "wasm32"))] +#[derive(Clone, Debug)] +#[non_exhaustive] +pub struct Disk { + /// The object store for the disk tier (e.g. a `LocalFileSystem`). + pub store: Arc, + /// Key prefix under which segments are written. + pub prefix: Path, + /// Retention bounds on the disk tier; exceeding them rolls up to `remote` (or evicts). + pub bounds: Bounds, + /// Optional remote store the disk tier rolls up into when over its bounds. + pub remote: Option>, +} + +#[cfg(not(target_arch = "wasm32"))] +impl Disk { + /// A disk tier over `store`, writing under `prefix`, capped by `bounds`. No remote rollup. + pub fn new(store: Arc, prefix: Path, bounds: Bounds) -> Self { + Self { + store, + prefix, + bounds, + remote: None, + } + } + + /// Set the remote store the disk tier rolls up into. + pub fn with_remote(mut self, remote: Arc) -> Self { + self.remote = Some(remote); + self } } @@ -186,9 +242,9 @@ impl Group { /// next tier as a single segment. pub type Batch = Vec; -/// The shared store behind a [`Producer`] and its [`Consumer`]s. +/// The shared RAM tier behind a [`Producer`] and its [`Consumer`]s. struct State { - config: Config, + bounds: Bounds, /// Groups keyed by sequence, so the first entry is the oldest and the last is the latest. ram: BTreeMap, ram_bytes: u64, @@ -219,7 +275,7 @@ impl State { /// Whether the high watermark is tripped. An unset high watermark is unbounded (never trips). 
fn over_max(&self) -> bool { - !self.config.ram.max.is_unset() && self.exceeds(self.config.ram.max) + !self.bounds.max.is_unset() && self.exceeds(self.bounds.max) } fn insert(&mut self, group: Group) -> Option { @@ -241,7 +297,7 @@ impl State { let mut batch = Batch::new(); // Drain oldest-first while still above the low watermark, but never the latest group: a // new subscriber and the live edge need it, and it is the likeliest next fetch. - while self.ram.len() > 1 && self.exceeds(self.config.ram.min) { + while self.ram.len() > 1 && self.exceeds(self.bounds.min) { let oldest = *self.ram.keys().next().expect("non-empty"); let latest = *self.ram.keys().next_back().expect("non-empty"); if oldest == latest { @@ -256,36 +312,64 @@ impl State { } } +/// The disk/remote tier handle held by a [`Producer`] and shared with its [`Consumer`]s: a sender +/// to the background flush task, and the store itself for reads. +#[cfg(not(target_arch = "wasm32"))] +struct Tiers { + flush: tokio::sync::mpsc::UnboundedSender, + store: Arc>, +} + +#[cfg(not(target_arch = "wasm32"))] +impl Tiers { + /// Build the store and spawn the background flush task draining evicted bands into it. + fn spawn(disk: Disk) -> Self { + let store = store::Store::new(disk.store, disk.remote, disk.prefix, disk.bounds); + let store = Arc::new(tokio::sync::RwLock::new(store)); + let (flush, mut rx) = tokio::sync::mpsc::unbounded_channel::(); + let writer = store.clone(); + web_async::spawn(async move { + while let Some(band) = rx.recv().await { + if let Err(err) = writer.write().await.flush(band).await { + tracing::warn!(%err, "cache disk flush failed"); + } + } + }); + Self { flush, store } + } +} + /// The write half of a track cache. Insert finished groups; not `Clone` (a single writer fills /// the cache). Call [`consume`](Self::consume) for a read handle. 
pub struct Producer { state: Arc>, + #[cfg(not(target_arch = "wasm32"))] + tiers: Option, } impl Producer { - fn new(config: Config) -> Self { - Self { - state: Arc::new(Mutex::new(State { - config, - ram: BTreeMap::new(), - ram_bytes: 0, - })), - } - } - /// Insert a finished group. /// - /// Returns a [`Batch`] when this insert pushed the RAM tier over its high watermark: the band - /// drained down to the low watermark, which the caller persists to the next tier. `None` when - /// nothing was evicted. A RAM-only cache ignores the return (the band is simply dropped). + /// If this insert pushed the RAM tier over its high watermark, the drained band is flushed to + /// the disk tier (when one is configured) and `None` is returned. Without a disk tier the band + /// is returned for a RAM-only caller to handle (or drop); `None` means nothing was evicted. pub fn insert(&mut self, group: Group) -> Option { - self.state.lock().expect("cache poisoned").insert(group) + let band = self.state.lock().expect("cache poisoned").insert(group)?; + #[cfg(not(target_arch = "wasm32"))] + if let Some(tiers) = &self.tiers { + // Hand the evicted band to the background flush task; the disk tier owns it now. + let _ = tiers.flush.send(band); + return None; + } + Some(band) } - /// A read handle sharing this cache's store. + /// A read handle sharing this cache's RAM tier (and disk/remote store, if any). pub fn consume(&self) -> Consumer { Consumer { state: self.state.clone(), + #[cfg(not(target_arch = "wasm32"))] + store: self.tiers.as_ref().map(|t| t.store.clone()), } } @@ -316,10 +400,13 @@ impl Producer { #[derive(Clone)] pub struct Consumer { state: Arc>, + #[cfg(not(target_arch = "wasm32"))] + store: Option>>, } impl Consumer { - /// Fetch a cached group by sequence, or `None` if it is not in the RAM tier. + /// Fetch a cached group by sequence from the **RAM** tier, or `None` if it is not there. + /// Synchronous. Use [`fetch`](Self::fetch) to also consult the disk/remote tiers. 
/// /// The returned [`Group`] is an owned copy (frame `Bytes` are reference-counted, so this is /// cheap), so a later eviction never invalidates a fetch already in flight. @@ -327,6 +414,19 @@ impl Consumer { self.state.lock().expect("cache poisoned").ram.get(&sequence).cloned() } + /// Fetch a group from any tier: RAM first, then (native only) the disk/remote store. Async + /// because the lower tiers do I/O. A tier read error is treated as a miss. + pub async fn fetch(&self, sequence: u64) -> Option { + if let Some(group) = self.get(sequence) { + return Some(group); + } + #[cfg(not(target_arch = "wasm32"))] + if let Some(store) = &self.store { + return store.read().await.get(sequence).await.unwrap_or_default(); + } + None + } + /// Whether a group with this sequence is currently in the RAM tier. pub fn contains(&self, sequence: u64) -> bool { self.state.lock().expect("cache poisoned").ram.contains_key(&sequence) @@ -602,4 +702,39 @@ mod tests { assert_eq!(cached.frames[0].timestamp, None); assert_eq!(Group::read(cached.produce(None).unwrap()).await.unwrap(), cached); } + + #[cfg(not(target_arch = "wasm32"))] + #[tokio::test] + async fn spills_to_disk_and_fetches_back() { + use object_store::memory::InMemory; + use object_store::path::Path; + + // RAM keeps ~1 group (flush over 250 bytes, down to 100); disk is unbounded so it keeps + // everything it is handed. + let disk = Disk::new(Arc::new(InMemory::new()), Path::from("cache"), Bounds::default()); + let mut config = Config::new(Bounds::new(Limit::bytes(100), Limit::bytes(250))); + config.disk = Some(disk); + + let mut producer = config.produce(); + let consumer = producer.consume(); + + for seq in 0..5 { + producer.insert(plain(seq, 100)); + } + + // The oldest group was evicted from RAM and flushed to disk by the background task. 
+ let mut from_disk = None; + for _ in 0..200 { + if let Some(group) = consumer.fetch(0).await { + from_disk = Some(group); + break; + } + tokio::task::yield_now().await; + } + + assert!(consumer.get(0).is_none(), "group 0 is no longer in the RAM tier"); + assert_eq!(from_disk.expect("group 0 fetched from disk").sequence, 0); + // A recent group is still served from RAM. + assert!(consumer.fetch(4).await.is_some()); + } } From 33fc0c2c4fd9c14af0da955609f65cf3f5bd9d44 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 03:49:41 +0000 Subject: [PATCH 20/25] refactor(moq-net): move the cache into TrackState, reuse the live group window The cache is no longer a separate Producer/Consumer pair bolted onto each track endpoint. It lives on the shared TrackState, so the RAM tier is the track's own live `groups` buffer and the disk/remote tiers hang off the same state. One store backs the producer and every consumer automatically. - RAM = `TrackState.groups`. A group is serialized to `cache::Group` and handed to the disk tier only when it ages out of the live window, not via a spawned populate subscriber (removed). Inserting a group is caching it. - Two retention gates in `evict_expired`, both sized by `TrackInfo::cache`: a wall-clock gate on the receive `Instant` (the hard memory backstop) and a media-time gate on the last frame timestamp vs the live media edge (bounds a startup stampede). A group is evicted when it trips either. - `fetch_group` serves a live miss from the disk/remote tiers via an async lookup spawned on the first poll; a hit resolves the fetch, a miss falls through to the live decision. `get_group` stays synchronous and live-only. - `TrackProducer::with_cache` now takes a `cache::Disk` (native-only) and stores the spilled tiers on the state; `TrackConsumer::with_cache` and the read-through `cache::Producer`/`Consumer`/`Config`/RAM-watermark types are gone. 
The disk byte format (segment.rs), multi-tier index (index.rs), and object_store glue (store.rs) are unchanged. Updated CACHE.md to match. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/CACHE.md | 420 ++++++------------------ rs/moq-net/src/model/cache/mod.rs | 529 +++++------------------------- rs/moq-net/src/model/group.rs | 7 + rs/moq-net/src/model/mod.rs | 4 +- rs/moq-net/src/model/track.rs | 319 ++++++++++-------- 5 files changed, 368 insertions(+), 911 deletions(-) diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index 727a3b680..7087ebc1b 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -1,344 +1,114 @@ -# moq-net track cache (spike) +# moq-net track cache -> Status: implemented in `src/model/cache/` (module `moq_net::cache`, with unit tests): -> - the RAM tier and watermark eviction policy (`mod.rs`); -> - the on-disk **segment byte format** and **rollup** compaction (`segment.rs`): lossless -> per-frame encode/decode (raw timestamp value+scale, so any timescale round-trips), a -> self-describing footer offset table read from a fixed trailer, `rollup` to concatenate small -> segments into one larger object, and `group_from_blob` (the ranged-read decode path); -> - the storage-agnostic **multi-tier index + promotion** (`index.rs`): `sequence -> Location` -> (tier + segment + byte range), per-tier byte/duration accounting, `promotion` to pick the -> oldest disk segments over the high watermark, and `apply_promotion` to repoint them at the -> remote tier after a rollup; -> - the **track wiring**: `cache::Group::read` / `produce` bridge a cached group to/from the live -> group model; `TrackProducer::with_cache(cache::Producer)` spawns a subscriber that drains -> finished groups into the cache; `TrackConsumer::with_cache(cache::Consumer)` makes `get_group` -> and `fetch_group` resolve from the cache on a live miss; -> - the **disk/remote tier I/O** 
(`store.rs`, native-only via target-gating): `cache::store::Store` -> over an `object_store` disk tier and optional remote tier. `flush` encodes a band and `put`s it -> as a disk segment; `get` ranged-reads a located blob; `compact` rolls the oldest disk segments -> up into one remote object (or evicts them with no remote), driven by the index; -> - the **tier wiring**: `cache::Config` carries an optional `disk: Disk` tier (an `object_store`, -> a key prefix, bounds, and an optional remote rollup store; native-only). `Config::produce` -> spawns a background task that flushes RAM-evicted bands to the disk tier, and -> `Consumer::fetch` (async) reads across RAM -> disk -> remote (`Consumer::get` stays a sync -> RAM-only lookup). -> -> Still design: removing the wire field `TrackInfo.cache`; threading a `Config` onto the tracks a -> relay auto-creates (the Origin follow-up); and serving disk/remote through a track's -> `fetch_group` (today the sync track API serves RAM; the relay serve loop uses `Consumer::fetch` -> for the lower tiers). Targets `dev`. - -A per-track group cache. It lets a relay or edge retain recent groups past the live window and -serve them back on a FETCH, optionally spilling to local disk or remote object storage. It lives +A per-track durable cache. It lets a relay or edge keep recent groups past the live window and +serve them back on a FETCH, spilling to local disk and optionally remote object storage. It lives in `moq-net` so any consumer of a track (relay, edge, archiver) gets durable caching for free. -The implemented surface follows moq-net's produce/consume split: `cache::Config::produce()` -yields a `cache::Producer` (the write half, not `Clone`), and `Producer::consume()` yields a -`cache::Consumer` (the read half, `Clone`). Names below are the real ones. 
- -## Principles - -These come from design review and pin down the shape: - -- **Local, not on the wire.** The cache is local policy set by whoever holds a track endpoint - (the relay or edge), never by the original publisher and never carried on the wire. This is - why `TrackInfo.cache` goes away (see "Removing TrackInfo.cache"). The handle is **shareable**: - one cache can back both a track's `TrackProducer` and its `TrackConsumer` (see "Attaching to a - producer or a consumer"). -- **Per-track bounds, no shared LRU.** Each track keeps a `[min, max]` window of its own recent - groups. There is no cross-track accounting, so no shared lock and no contention. The cost is - that there is no global RAM ceiling; total footprint is the sum of per-track `max` across - live tracks. A global backstop, if ever needed, is additive and not part of v1. -- **No traits, no callbacks.** The cache is concrete values you configure and attach (`cache::Producer` / `cache::Consumer`). moq-net - owns all behavior. The disk and remote backends are an internal, configured `object_store`, - not a consumer-implemented extension point. -- **Watermark flush, not per-item eviction.** Groups accumulate to the high watermark, then a - whole band (the `max - min` worth) is flushed as one segment. This is the property an LRU - cannot provide: an LRU evicts one group the instant the budget trips, producing one tiny - object per group, which is fatal for audio (a group per frame). The watermark is what creates - batches. +## Shape -## Bounds - -Per track, on both size and duration, whichever trips first: +The cache is **not** a separate handle you wire onto both endpoints. It lives on the shared track +state (`TrackState`), so the RAM tier is the track's own live `groups` buffer and the disk/remote +tiers hang off the same state. One store therefore backs the track's `TrackProducer` and every +`TrackConsumer` automatically; a fetch is served from whichever tier holds the group. 
```rust -// module moq_net::cache - -/// Local cache policy for a single track. Not on the wire, not in TrackInfo. -#[derive(Clone, Debug, Default)] -#[non_exhaustive] -pub struct Config { - pub ram: Bounds, // keep >= min in RAM; flush the band once > max - #[cfg(not(target_arch = "wasm32"))] - pub disk: Option, // optional spill tier (native-only) -} - -/// The disk spill tier: an object store, key prefix, bounds, and an optional remote rollup store. -#[cfg(not(target_arch = "wasm32"))] -pub struct Disk { - pub store: Arc, - pub prefix: object_store::path::Path, - pub bounds: Bounds, // disk retention; over it rolls up to `remote` - pub remote: Option>, -} - -/// A low/high watermark. The gap (max - min) is the flush batch size. -pub struct Bounds { pub min: Limit, pub max: Limit } +// module moq_net::cache (native-only types are target-gated to non-wasm) -/// A bound expressed as a duration, a byte count, or both (first to trip wins). -/// All-None means unbounded as a high watermark, floor-zero as a low watermark. -pub struct Limit { pub duration: Option, pub bytes: Option } -``` - -`Config::produce()` builds the RAM tier and, when `disk` is set, spawns a background task that -flushes evicted bands to it. `Producer::insert` hands evicted bands to that task; `Consumer::fetch` -(async) reads RAM then disk then remote. An `interval` flush backstop is still a possible addition -(the `#[non_exhaustive]` Config keeps it additive). - -The flush batch is implicitly `max - min`, so the bounds map straight onto the tiering: - -| Want | Set | -|---|---| -| keep 30s in RAM, flush 10s segments to disk | `ram.min = 20s`, `ram.max = 30s` | -| keep 5m on disk, flush 1m objects to remote | `disk.min = 4m`, `disk.max = 5m` | - -At 30s the buffer drains back to 20s, emitting a 10s segment, then refills over the next 10s. -No explicit batch size: the band is the batch. 
+let disk = cache::Disk::new(store, prefix, bounds) // object_store + key prefix + bounds + .with_remote(remote); // optional rollup target -`interval` is a backstop so a low data-rate track still flushes eventually instead of holding a -half-full band for a long time. A duration-based `max` already covers most of this (the oldest -group ages past `max` even with little data), so `interval` matters chiefly when the bounds are -byte-only. - -## State and flush - -Each track owns a small buffer plus an index of what has been flushed where: - -```rust -struct TrackCache { - ram: BTreeMap, // recent groups, keyed by sequence - ram_bytes: u64, - flushed: BTreeMap, // sequence -> (tier, object key, offset) for serving - last_flush: Instant, -} +let producer = TrackProducer::new(name, info).with_cache(disk); +let consumer = producer.consume(); // shares the same store ``` -Flush runs on group completion and on a timer: - -```text -if over(ram.max) || ram.interval elapsed with a flushable band: - batch = drain oldest completed, unpinned groups until back to ram.min - match disk: - Some(d) => segment = serialize(batch) // archive segment format - d.put(key, segment) - for g in batch { flushed[g.seq] = Disk(key, offset) } - None => drop(batch) // RAM-only cache: just evict -// the disk tier runs the same watermark loop against disk.max, concatenating several -// small segments into one larger remote object (the rollup) and updating `flushed`. -``` +## Principles -## Serving +- **Local, not on the wire.** The cache is local policy set by whoever holds a track endpoint (the + relay or edge), never by the original publisher and never carried on the wire. +- **RAM is the live window.** There is no second in-memory copy of recent groups: the cache reuses + `TrackState.groups`, the buffer the track already keeps for live subscribers. A group is + serialized (to `cache::Group`) and handed to the disk tier only when it ages out of that window. 
+- **No traits, no callbacks.** The cache is concrete values you configure and attach. moq-net owns + all behavior; the disk and remote backends are a configured `object_store`, not a + consumer-implemented extension point. +- **Per-track, no shared LRU.** Each track keeps its own recent window; there is no cross-track + accounting, so no shared lock. Footprint is the sum of per-track windows across live tracks. + +## Retention: two gates + +A group is evicted from the live window (`TrackState::evict_expired`) when it trips **either** of +two gates, both sized by `TrackInfo::cache` (the publisher's retention duration). The newest group +(`max_sequence`) is never evicted. + +- **Wall-clock** — the group was *received* more than the window ago. The receive time is an + `Instant` stamped when the group lands in `groups`; it is never sent over the wire or set by the + publisher. This is the hard memory backstop: a publisher can't pin RAM by lying about media + timestamps. +- **Media-time** — the group's last frame timestamp is more than the window behind the live media + edge (the newest frame timestamp buffered). This bounds a startup stampede, where a burst of + buffered media arrives at once (all "received now", so the wall-clock gate alone would keep it + all) and a fresh subscriber would otherwise be flooded. + +In steady state, where media time advances with wall-clock time, the two gates coincide. They +diverge only under a stampede (media-time trims it) or timestamp abuse (wall-clock trims it). 
+ +## Spill and serve ```text -get(seq): - if let Some(g) = ram.find(seq) -> serve(g) // RAM hit, pin while read - if let Some(loc) = flushed.get(seq) -> stream_from_tier(loc) // ranged GET, no fault-back - else -> None // miss: upstream / Unroutable +evict_expired: (synchronous, under the state lock) + for each group outside the window (not max_sequence): + tombstone it in `groups` + if a cache is attached: hand its live GroupConsumer to the flush task + +flush task: (one background task per cached track) + per eviction pass: drain the groups into cache::Group, write ONE disk segment, + then compact (roll the oldest disk segments up into one remote object, or evict + them when there is no remote tier) + +fetch_group(seq): + live hit in `groups` -> serve immediately + live miss, cache attached -> spawn an async disk/remote lookup; a hit + resolves the fetch, a miss falls through + live miss, no cache -> queue for a TrackDynamic, or NotFound ``` -A lower-tier hit streams straight from disk or remote via a ranged read. There is no fault-in -and no re-population of RAM, so a group lives in exactly one tier and is served from there. This -is what makes the watermark model simpler than an LRU, which needs to move items back up on -access. - -## Always-latest and pinning +`get_group(seq)` stays synchronous and only consults the live window; a spilled group is reachable +only through the async `fetch_group`. -- **The latest group is never evicted.** It sits inside `ram.min` by construction, so this is - free, and it is the group a new subscriber needs first. -- **A pinned (actively read) group is never flushed.** A `GroupConsumer` handed out from the - cache holds a pin (hooked into the group's existing refcount); the flush skips pinned groups - and emits the rest of the band. Old groups are rarely pinned, so segments stay contiguous in - practice. If strictly contiguous segments are ever required, hold the batch until the pin - clears instead. 
+Batching the disk write per eviction pass keeps a stampede-trim (many groups evicted at once) to a +single object. A steady-state single eviction still writes one small disk segment per group; the +remote tier is where rollup (`segment::rollup`) concatenates those into large objects, so a +per-frame (audio) track does not litter object storage with tiny remote objects. -## Tiers +## Tiers and the byte format -RAM is always present and dependency-free. disk and remote are `object_store`, target-gated to +RAM is always present and dependency-free. Disk and remote are `object_store`, target-gated to non-wasm targets (`cfg(not(target_arch = "wasm32"))`) so native builds get the tiers with no flag -and wasm builds drop the server-side cloud stack automatically. The on-tier byte format is the -`segment` module's, so a relay's spilled data is directly readable by anything using this cache. - -## Attaching to a producer or a consumer - -The cache splits into a write half and a read half, like the rest of moq-net. `cache::Producer` -fills the cache and is **not `Clone`** (a single writer); `cache::Consumer` is `Clone` and shares -the same store. `Producer::consume()` derives a reader, so **one cache backs both a track's -producer and its consumer**. - -```rust -// implemented (RAM tier) in moq_net::cache -let writer: cache::Producer = config.produce(); // not Clone -let reader: cache::Consumer = writer.consume(); // Clone; shares the store - -writer.insert(group); // -> Option (band to persist to the next tier) -reader.get(sequence); // -> Option -``` - -The track wiring hands each endpoint the matching half: - -```rust -impl TrackProducer { - /// Fill `cache` with groups this producer creates (spawns a populate subscriber). - pub fn with_cache(self, cache: cache::Producer) -> Self; -} -impl TrackConsumer { - /// Back fetch_group / get_group with `cache`: RAM hits resolve locally. 
- pub fn with_cache(self, cache: cache::Consumer) -> Self; -} -``` - -Sharing one store across both endpoints of a track: - -```rust -let writer = config.produce(); -let reader = writer.consume(); -let producer = producer.with_cache(writer); // fills the cache -let consumer = consumer.with_cache(reader); // fetches from it, same groups -``` - -`cache::Producer` being non-`Clone` is also a deliberate step toward making `TrackProducer` -non-`Clone`: a single writer per track. - -### Producer side -`TrackState.groups` (today's inline buffer, bounded by the now-removed `TrackInfo.cache`) is -backed by the cache: finished groups beyond `ram.min` move into the RAM tier, and a `get_group` -or `dynamic()` miss consults the cache (RAM, then disk, then remote) before `NotFound`. - -### Consumer side (fetch vs populate) -Reading and populating are different halves, which is what the produce/consume split buys: - -- A `TrackConsumer` given a `cache::Consumer` (read half) checks the cache first on - `fetch_group(seq)` / `get_group(seq)`: a **hit** resolves the `kio::Pending` locally with no - wire FETCH (RAM synchronously, disk/remote after the ranged read); a **miss** falls through to - the wire. -- To *populate* the cache (insert groups read off the wire or off a live `subscribe`), a consumer - takes a `cache::Producer` (write half) instead. This is the archive's record-and-serve path: a - cache-backed consumer with no live upstream fills tiers as it reads and answers FETCH straight - from them. - -A shared cache makes the two directions symmetric: groups a producer creates are fetchable -through a consumer of the same track, and groups a consumer pulled off the wire are servable by -the producer. Inserts dedup by sequence, so attaching one cache to both sides is safe. - -## Removing TrackInfo.cache - -`TrackInfo.cache` is a producer-set, wire-serialized duration. 
It conflates "how long the -publisher keeps groups for late subscribers" with "cache policy," and a relay should not -inherit the publisher's number to size its own cache. Since the cache here is local and fully -independent: - -- stop using `TrackInfo.cache` to size anything; -- remove the field from `TrackInfo`. This is a public-API and wire change, hence the `dev` - target. If a producer-side retention knob is still wanted, it stays internal to the producer - rather than on the shared `TrackInfo`. - -## Per-binary use - -- **moq-cli:** no cache, or a small RAM-only `cache::Config` for a single track. See "moq-cli - flags" below for the concrete surface. -- **moq-relay:** one `cache::Config` template applied to every track it creates. Threading that - config onto the tracks moq-net auto-creates during fan-out is the Origin follow-up; here it is - just `TrackProducer::with_cache(writer)`. A relay RAM cache that spills to disk or S3 becomes - configuration, not code. -- **moq-edge:** the same, plus its own dynamic-handler business logic on top. - -## moq-cli flags - -The cache is most useful on the commands that run a local origin and serve a broadcast back -(`moq serve`, `moq accept`), so a flattened `CacheArgs` group lands on those. The flags map onto -the `[min, max]` bounds and the tier cascade; an absent `--cache-ram` means no cache (today's -behavior). This is the proposed surface; wiring it waits on the track-endpoint `with_cache` API. - -```rust -/// Retain recent groups so late subscribers and FETCHes get old content. -/// Absent `--cache-ram` leaves caching off. -#[derive(clap::Args, Clone, Default)] -pub struct CacheArgs { - /// Keep up to this much of each track's recent groups in RAM (high watermark). - /// Setting it enables the cache. e.g. `30s`. - #[arg(long, value_parser = humantime::parse_duration)] - pub cache_ram: Option, - - /// RAM low watermark; a flush drains down to this, and the band between the two - /// becomes one segment. 
Defaults to two-thirds of `--cache-ram`. - #[arg(long, value_parser = humantime::parse_duration)] - pub cache_ram_min: Option, - - /// Also retain on local disk at this path (spill from RAM). - #[arg(long)] - pub cache_disk: Option, - - /// How long to keep groups on disk before rolling up to remote (or dropping if - /// no remote tier). e.g. `5m`. - #[arg(long, value_parser = humantime::parse_duration)] - pub cache_disk_age: Option, - - /// Also retain in remote object storage, e.g. `s3://bucket/prefix`. - #[arg(long)] - pub cache_remote: Option, - - /// How long to keep groups in remote storage. Omit to keep forever. - #[arg(long, value_parser = humantime::parse_duration)] - pub cache_remote_age: Option, - - /// Flush a partial RAM band after this long even below the high watermark, so a - /// low data-rate track still spills. Mostly redundant with a duration `--cache-ram`. - #[arg(long, value_parser = humantime::parse_duration)] - pub cache_interval: Option, -} -``` - -`CacheArgs` flattens into `Serve` and `Accept` (the relay-running commands), e.g. - -```text -moq serve --broadcast bbb --cache-ram 30s --cache-disk /var/cache/moq --cache-disk-age 5m \ - --cache-remote s3://moq-cache/bbb --cache-remote-age 30d fmp4 < bbb.mp4 -``` - -and converts to a `cache::Config` whose halves go to each endpoint: - -```rust -impl CacheArgs { - /// None when `--cache-ram` is unset (caching disabled). - pub fn config(&self) -> Option { /* flags -> bounds (+ tiers) */ } -} - -// in run_serve / run_accept: produce the writer once, derive a reader, hand one to each endpoint. -if let Some(config) = args.config() { - let writer = config.produce(); - let reader = writer.consume(); // same store; serves fetch_group locally - producer = producer.with_cache(writer); - // a TrackConsumer of the same track takes `reader`. -} -``` - -Notes: byte-budget variants (`--cache-ram-bytes`, etc.) are additive later; duration bounds -cover the common case. 
moq-cli parses straight from clap (no TOML merge), so plain -`Option` is fine here. The relay (`rs/moq-relay`), which does merge TOML, would carry -the same flags under its `Option` clobber rule. - -## Open questions - -1. **object_store in moq-net.** Resolved: target-gated to `cfg(not(target_arch = "wasm32"))`, so - native always has the tiers and wasm drops them, with no opt-in feature. -2. **Async get.** RAM hits must stay synchronous (serve under the lock); only disk and remote - faults are async. The return type needs a "ready now or pending" shape, matching moq-net's - existing `kio::Pending`. -3. **Default bounds.** With `TrackInfo.cache` gone, pick a conservative RAM-only default so an - unconfigured `TrackProducer` behaves like today: a small recent window, no spill. -4. **Footprint.** Per-track bounds mean total RAM is the sum of `ram.max` across live tracks. - Keep the default modest and document footprint = bound times track count. -5. **Pinned groups mid-band.** Skip and flush around them, or hold the batch until unpinned. - Skipping is simpler and old groups are rarely pinned; revisit only if it bites. +and wasm builds drop the server-side cloud stack automatically. + +The on-disk format lives in `segment.rs`: a band of groups serialized as one self-describing +object (a footer offset table read from a fixed trailer), lossless per-frame timestamps (raw +value + scale, so any timescale round-trips), `rollup` to concatenate small segments into one +larger object, and `group_from_blob` for the ranged-read decode path. `index.rs` is the +storage-agnostic multi-tier index (`sequence -> (tier, segment, byte range)`), per-tier byte and +duration accounting, and the promotion that picks the oldest disk segments over the disk high +watermark. `store.rs` is the `object_store` glue tying them together. The disk `Bounds` (a +low/high watermark) govern when disk segments roll up to remote, independent of the RAM retention +window above. 
+ +## Bridging live <-> cached + +`cache::Group::read` drains a finished live `GroupConsumer` into the serializable `cache::Group` +(done on the flush task, off the state lock). `cache::Group::produce` rebuilds a live +`GroupConsumer` from a stored group at the track's timescale, for serving a fetch. + +## Still design + +- **Disk-then-upstream fetch.** A track with a disk cache serves old content from the store; it + does not also chain to a `TrackDynamic` (wire FETCH) on a store miss. A relay that wants "disk, + then upstream" would queue the dynamic fetch after the store lookup misses; additive later. +- **Removing `TrackInfo::cache`.** The retention window is still read from the wire-carried + `TrackInfo::cache`. Making retention purely local policy (and dropping the wire field) is a + separate wire change. +- **moq-cli / moq-relay flags.** Surfacing `with_cache` as CLI/TOML configuration (a disk path, a + remote URL, bounds) is follow-up work; the model API is in place. diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index 6d0cfe061..a72b5434d 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -1,27 +1,23 @@ -//! Per-track group cache: a bounded RAM window that evicts in batches. +//! Per-track durable cache: the disk/remote spill tiers behind a track's live RAM window. //! -//! A cache is local policy attached to a single track, independent of any retention the original -//! publisher set (it is never carried on the wire). It keeps a `[min, max]` window of recent -//! groups in RAM. When an insert pushes the window past the high watermark (`max`), the oldest -//! groups down to the low watermark (`min`) are drained as one `Batch`, which the caller hands to -//! the next tier (disk or remote object storage). Draining a whole band at once is what keeps a -//! low-latency track (audio makes a group per frame) from producing one tiny object per group; an -//! 
LRU, which evicts a single item the instant the budget trips, cannot batch. +//! The RAM tier is the track's own live group buffer ([`crate::TrackProducer`]'s `groups`); this +//! module is everything below it. When a group ages out of that window (see the two retention gates +//! in `track.rs`), it is serialized through `Group` and handed to the disk tier; a fetch that misses +//! the live window then reads it back from disk (or remote) instead of failing. //! -//! The cache is split into a write half (`Producer`) and a read half (`Consumer`), mirroring the -//! rest of moq-net. `Producer` is intentionally not `Clone` (a single writer fills the cache); -//! `Consumer` is `Clone` and shares the same store, so one cache backs both a track's producer and -//! its consumer. +//! A cache is local policy attached to a single track, independent of any retention the original +//! publisher set (it is never carried on the wire). Attach one with +//! [`crate::TrackProducer::with_cache`]; the disk tier and an optional remote rollup target are +//! described by `Disk`. Because the cache lives on the shared track state, the same store backs +//! the track's producer and every consumer, so a fetch is served from whichever tier holds the +//! group. //! -//! The `segment` submodule is the on-disk byte format used by the disk and remote tiers (a band -//! of groups serialized as one self-describing object) plus the rollup that concatenates several -//! small segments into one larger object. `Group::read` / `Group::produce` bridge a cached group -//! to and from the live group model, and `TrackProducer::with_cache` / `TrackConsumer::with_cache` -//! wire the cache into the track types. The tier I/O (object_store) is the remaining piece; see -//! `rs/moq-net/CACHE.md`. - -use std::collections::BTreeMap; -use std::sync::{Arc, Mutex}; +//! The `segment` submodule is the on-disk byte format (a band of groups serialized as one +//! 
self-describing object) plus the rollup that concatenates several small segments into one larger +//! object. `Group::read` / `Group::produce` bridge a cached group to and from the live group model. +//! The `store` submodule is the object_store glue (native-only). + +use std::sync::Arc; use std::time::Duration; use bytes::Bytes; @@ -72,7 +68,7 @@ impl Limit { } /// Whether both thresholds are unset (so the limit imposes no ceiling). - fn is_unset(&self) -> bool { + pub(crate) fn is_unset(&self) -> bool { self.duration.is_none() && self.bytes.is_none() } } @@ -93,47 +89,10 @@ impl Bounds { } } -/// Local cache policy for a single track. Not carried on the wire. -#[derive(Clone, Debug, Default)] -#[non_exhaustive] -pub struct Config { - /// Bounds on the RAM tier. - pub ram: Bounds, - /// Optional disk (and remote) spill tier. When set, bands evicted from RAM are flushed here. - /// Native-only: the field is absent on wasm, where object_store does not build. - #[cfg(not(target_arch = "wasm32"))] - pub disk: Option, -} - -impl Config { - /// Build a [`Config`] with the given RAM bounds and no spill tier. - pub fn new(ram: Bounds) -> Self { - Self { - ram, - #[cfg(not(target_arch = "wasm32"))] - disk: None, - } - } - - /// Start an empty cache with this policy, returning its write half. If a disk tier is - /// configured, spawns a background task that flushes evicted RAM bands to it. - pub fn produce(self) -> Producer { - let state = Arc::new(Mutex::new(State { - bounds: self.ram, - ram: BTreeMap::new(), - ram_bytes: 0, - })); - Producer { - state, - #[cfg(not(target_arch = "wasm32"))] - tiers: self.disk.map(Tiers::spawn), - } - } -} - /// The disk spill tier: an object store, a key prefix, retention bounds, and an optional remote /// store the disk tier rolls up into. Native-only (`object_store` does not build on wasm). Build -/// with [`Disk::new`], optionally [`with_remote`](Disk::with_remote). 
+/// with [`Disk::new`], optionally [`with_remote`](Disk::with_remote), then attach via +/// [`crate::TrackProducer::with_cache`]. #[cfg(not(target_arch = "wasm32"))] #[derive(Clone, Debug)] #[non_exhaustive] @@ -202,8 +161,8 @@ impl Group { } /// Drain a live [`GroupConsumer`](crate::GroupConsumer) into a cached group, reading every - /// frame's payload and timestamp. Resolves once the group is finished, so this is how the - /// producer side snapshots a finished group for caching. + /// frame's payload and timestamp. Resolves once the group is finished, so this is how an evicted + /// group is snapshotted before it is written to a tier. pub async fn read(mut group: crate::GroupConsumer) -> Result { let sequence = group.sequence; let mut frames = Vec::new(); @@ -238,209 +197,71 @@ impl Group { } } -/// A band of groups drained from a tier in one flush, oldest first. The caller persists it to the -/// next tier as a single segment. +/// A band of groups serialized to a tier in one flush, oldest first. pub type Batch = Vec; -/// The shared RAM tier behind a [`Producer`] and its [`Consumer`]s. -struct State { - bounds: Bounds, - /// Groups keyed by sequence, so the first entry is the oldest and the last is the latest. - ram: BTreeMap, - ram_bytes: u64, -} - -impl State { - /// The time span between the oldest group's first frame and the newest group's last frame. - /// Zero unless both ends carry a timestamp, so a track without media timestamps applies no - /// duration pressure (byte bounds still apply). - fn span(&self) -> Duration { - let first = self.ram.values().next().and_then(|g| g.ts_first()); - let last = self.ram.values().next_back().and_then(|g| g.ts_last()); - match (first, last) { - (Some(a), Some(b)) => Duration::from(b).saturating_sub(Duration::from(a)), - _ => Duration::ZERO, - } - } - - /// Whether the current contents exceed `limit`. 
An unset limit is treated as a floor of zero - /// (any content exceeds it), which is what makes a flush with no `min` drain to just the - /// latest group. - fn exceeds(&self, limit: Limit) -> bool { - if limit.is_unset() { - return !self.ram.is_empty(); - } - limit.bytes.is_some_and(|b| self.ram_bytes > b) || limit.duration.is_some_and(|d| self.span() > d) - } - - /// Whether the high watermark is tripped. An unset high watermark is unbounded (never trips). - fn over_max(&self) -> bool { - !self.bounds.max.is_unset() && self.exceeds(self.bounds.max) - } - - fn insert(&mut self, group: Group) -> Option { - let size = group.size(); - if let Some(old) = self.ram.insert(group.sequence, group) { - self.ram_bytes -= old.size(); - } - self.ram_bytes += size; - self.flush() - } - - /// If over the high watermark, drain the oldest groups down to the low watermark, keeping the - /// latest group always. Returns the drained band, oldest first, or `None` if nothing flushed. - fn flush(&mut self) -> Option { - if !self.over_max() { - return None; - } - - let mut batch = Batch::new(); - // Drain oldest-first while still above the low watermark, but never the latest group: a - // new subscriber and the live edge need it, and it is the likeliest next fetch. - while self.ram.len() > 1 && self.exceeds(self.bounds.min) { - let oldest = *self.ram.keys().next().expect("non-empty"); - let latest = *self.ram.keys().next_back().expect("non-empty"); - if oldest == latest { - break; - } - let group = self.ram.remove(&oldest).expect("just observed"); - self.ram_bytes -= group.size(); - batch.push(group); - } - - (!batch.is_empty()).then_some(batch) - } -} - -/// The disk/remote tier handle held by a [`Producer`] and shared with its [`Consumer`]s: a sender -/// to the background flush task, and the store itself for reads. +/// The disk/remote spill handle held on a track's shared state. 
+/// +/// Holds a sender to a background task that drains evicted groups to the disk tier, and the store +/// itself for fetch reads. Native-only (`object_store` does not build on wasm). Constructed by +/// [`crate::TrackProducer::with_cache`]. #[cfg(not(target_arch = "wasm32"))] -struct Tiers { - flush: tokio::sync::mpsc::UnboundedSender, +pub(crate) struct Tiers { + /// Hands each batch of evicted live groups to the background flush task. + flush: tokio::sync::mpsc::UnboundedSender>, + /// The disk/remote store, shared with the flush task; used to serve fetch misses. store: Arc>, } #[cfg(not(target_arch = "wasm32"))] impl Tiers { - /// Build the store and spawn the background flush task draining evicted bands into it. - fn spawn(disk: Disk) -> Self { + /// Build the store and spawn the background task that serializes evicted groups into it. + pub(crate) fn spawn(disk: Disk) -> Self { let store = store::Store::new(disk.store, disk.remote, disk.prefix, disk.bounds); let store = Arc::new(tokio::sync::RwLock::new(store)); - let (flush, mut rx) = tokio::sync::mpsc::unbounded_channel::(); + let (flush, mut rx) = tokio::sync::mpsc::unbounded_channel::>(); let writer = store.clone(); web_async::spawn(async move { - while let Some(band) = rx.recv().await { - if let Err(err) = writer.write().await.flush(band).await { + // Each message is one eviction pass. Drain its groups and write them as a single + // segment, so a stampede-trim (many groups at once) is one object rather than many. + while let Some(consumers) = rx.recv().await { + let mut batch = Batch::with_capacity(consumers.len()); + for consumer in consumers { + match Group::read(consumer).await { + Ok(group) => batch.push(group), + // A group torn down before we drained it (e.g. abort) is dropped, not cached. 
+ Err(err) => tracing::debug!(%err, "skipped uncacheable evicted group"), + } + } + if batch.is_empty() { + continue; + } + if let Err(err) = writer.write().await.flush(batch).await { tracing::warn!(%err, "cache disk flush failed"); } } }); Self { flush, store } } -} - -/// The write half of a track cache. Insert finished groups; not `Clone` (a single writer fills -/// the cache). Call [`consume`](Self::consume) for a read handle. -pub struct Producer { - state: Arc>, - #[cfg(not(target_arch = "wasm32"))] - tiers: Option, -} - -impl Producer { - /// Insert a finished group. - /// - /// If this insert pushed the RAM tier over its high watermark, the drained band is flushed to - /// the disk tier (when one is configured) and `None` is returned. Without a disk tier the band - /// is returned for a RAM-only caller to handle (or drop); `None` means nothing was evicted. - pub fn insert(&mut self, group: Group) -> Option { - let band = self.state.lock().expect("cache poisoned").insert(group)?; - #[cfg(not(target_arch = "wasm32"))] - if let Some(tiers) = &self.tiers { - // Hand the evicted band to the background flush task; the disk tier owns it now. - let _ = tiers.flush.send(band); - return None; - } - Some(band) - } - - /// A read handle sharing this cache's RAM tier (and disk/remote store, if any). - pub fn consume(&self) -> Consumer { - Consumer { - state: self.state.clone(), - #[cfg(not(target_arch = "wasm32"))] - store: self.tiers.as_ref().map(|t| t.store.clone()), - } - } - - /// The highest sequence currently buffered in RAM, if any. - pub fn latest(&self) -> Option { - self.state - .lock() - .expect("cache poisoned") - .ram - .keys() - .next_back() - .copied() - } - - /// The number of groups currently buffered in RAM. - pub fn len(&self) -> usize { - self.state.lock().expect("cache poisoned").ram.len() - } - - /// Whether the RAM tier is empty. - pub fn is_empty(&self) -> bool { - self.len() == 0 - } -} - -/// The read half of a track cache. 
`Clone` shares the same store, so several readers (and a -/// matching [`Producer`]) cache the same groups. Backs a track's `fetch`. -#[derive(Clone)] -pub struct Consumer { - state: Arc>, - #[cfg(not(target_arch = "wasm32"))] - store: Option>>, -} - -impl Consumer { - /// Fetch a cached group by sequence from the **RAM** tier, or `None` if it is not there. - /// Synchronous. Use [`fetch`](Self::fetch) to also consult the disk/remote tiers. - /// - /// The returned [`Group`] is an owned copy (frame `Bytes` are reference-counted, so this is - /// cheap), so a later eviction never invalidates a fetch already in flight. - pub fn get(&self, sequence: u64) -> Option { - self.state.lock().expect("cache poisoned").ram.get(&sequence).cloned() - } - /// Fetch a group from any tier: RAM first, then (native only) the disk/remote store. Async - /// because the lower tiers do I/O. A tier read error is treated as a miss. - pub async fn fetch(&self, sequence: u64) -> Option { - if let Some(group) = self.get(sequence) { - return Some(group); + /// Hand a batch of evicted live groups to the flush task. Dropped silently once the task is gone. + pub(crate) fn evict(&self, groups: Vec) { + if !groups.is_empty() { + let _ = self.flush.send(groups); } - #[cfg(not(target_arch = "wasm32"))] - if let Some(store) = &self.store { - return store.read().await.get(sequence).await.unwrap_or_default(); - } - None } - /// Whether a group with this sequence is currently in the RAM tier. - pub fn contains(&self, sequence: u64) -> bool { - self.state.lock().expect("cache poisoned").ram.contains_key(&sequence) + /// A handle to the shared disk/remote store, for serving a fetch off the track's poll path. + pub(crate) fn store_handle(&self) -> Arc> { + self.store.clone() } - /// The highest sequence currently buffered in RAM, if any. 
- pub fn latest(&self) -> Option { - self.state - .lock() - .expect("cache poisoned") - .ram - .keys() - .next_back() - .copied() + /// Fetch a group from the disk/remote tiers, rebuilt at `timescale`. `None` on a miss or any + /// tier read / rebuild error (a fetch falls through to the live path). + #[cfg(test)] + pub(crate) async fn fetch(&self, sequence: u64, timescale: Option) -> Option { + let group = self.store.read().await.get(sequence).await.ok()??; + group.produce(timescale).ok() } } @@ -456,14 +277,6 @@ mod tests { } } - /// A one-frame group with no timestamp at the given sequence. - fn plain(seq: u64, bytes: usize) -> Group { - Group { - sequence: seq, - frames: vec![frame(bytes, None)], - } - } - /// A two-frame group spanning `[t0, t1]` micros, total `bytes`. fn timed(seq: u64, bytes: usize, t0: u64, t1: u64) -> Group { Group { @@ -488,181 +301,6 @@ mod tests { assert_eq!(g.ts_last(), Some(Timestamp::from_micros(900).unwrap())); } - #[test] - fn insert_and_get() { - let mut producer = Config::default().produce(); - let consumer = producer.consume(); - - assert!(consumer.get(5).is_none()); - producer.insert(plain(5, 100)); - assert_eq!(consumer.get(5).map(|g| g.size()), Some(100)); - assert!(consumer.get(6).is_none()); - } - - #[test] - fn consumer_sees_producer_inserts() { - // A cloned consumer observes inserts on the shared store. - let mut producer = Config::default().produce(); - let a = producer.consume(); - let b = a.clone(); - - producer.insert(plain(1, 10)); - assert!(a.contains(1)); - assert!(b.contains(1)); - } - - #[test] - fn dedup_by_sequence() { - // Re-inserting a sequence replaces it and keeps byte accounting correct. 
- let mut producer = Config::default().produce(); - let consumer = producer.consume(); - - producer.insert(plain(1, 100)); - producer.insert(plain(1, 30)); - assert_eq!(producer.len(), 1); - assert_eq!(consumer.get(1).map(|g| g.size()), Some(30)); - } - - #[test] - fn unbounded_when_no_max_never_flushes() { - let mut producer = Config::default().produce(); - let mut flushed = None; - for seq in 0..100 { - flushed = flushed.or(producer.insert(plain(seq, 1000))); - } - assert!(flushed.is_none()); - assert_eq!(producer.len(), 100); - } - - #[test] - fn byte_high_watermark_flushes_batch_to_low() { - // Keep 60 bytes, flush once over 100. Groups of 20 bytes: the 6th insert (120 bytes) - // trips the high watermark and drains the three oldest down to the 60-byte low watermark. - let bounds = Bounds::new(Limit::bytes(60), Limit::bytes(100)); - let mut producer = Config::new(bounds).produce(); - - let mut batches: Vec = Vec::new(); - for seq in 0..=5 { - if let Some(batch) = producer.insert(plain(seq, 20)) { - batches.push(batch); - } - } - - // Exactly one flush, draining the three oldest groups as one oldest-first band. - assert_eq!(batches.len(), 1); - let drained: Vec = batches[0].iter().map(|g| g.sequence).collect(); - assert_eq!(drained, vec![0, 1, 2]); - // The low watermark (60 bytes = 3 groups) is retained, latest included. - assert_eq!(producer.len(), 3); - assert_eq!(producer.latest(), Some(5)); - } - - #[test] - fn settles_within_the_band() { - // Steady state stays between the low and high watermarks (hysteresis), never above max. 
- let bounds = Bounds::new(Limit::bytes(60), Limit::bytes(100)); - let mut producer = Config::new(bounds).produce(); - for seq in 0..50 { - producer.insert(plain(seq, 20)); - assert!(producer.len() <= 5, "exceeded high watermark: {}", producer.len()); - } - assert!(producer.len() >= 3, "below low watermark: {}", producer.len()); - assert_eq!(producer.latest(), Some(49)); - } - - #[test] - fn flush_keeps_latest_even_when_oversized() { - // A single group larger than the whole budget is still retained (never evict the latest). - let bounds = Bounds::new(Limit::bytes(10), Limit::bytes(50)); - let mut producer = Config::new(bounds).produce(); - - let batch = producer.insert(plain(0, 1000)); - assert!(batch.is_none()); - assert_eq!(producer.len(), 1); - assert_eq!(producer.latest(), Some(0)); - } - - #[test] - fn min_unset_drains_to_just_the_latest() { - // High watermark set, low watermark unset -> flush keeps only the latest group. - let bounds = Bounds::new(Limit::default(), Limit::bytes(50)); - let mut producer = Config::new(bounds).produce(); - - for seq in 0..5 { - producer.insert(plain(seq, 20)); - } - assert_eq!(producer.len(), 1); - assert_eq!(producer.latest(), Some(4)); - } - - #[test] - fn duration_high_watermark_evicts_by_timespan() { - // Keep 2s, flush down to 1s. Each group spans 1s of media time. 
- let bounds = Bounds::new( - Limit::duration(Duration::from_secs(1)), - Limit::duration(Duration::from_secs(2)), - ); - let mut producer = Config::new(bounds).produce(); - let consumer = producer.consume(); - - // seq 0: [0,1]s, seq 1: [1,2]s, seq 2: [2,3]s, seq 3: [3,4]s - for seq in 0..4u64 { - let t0 = seq * 1_000_000; - producer.insert(timed(seq, 10, t0, t0 + 1_000_000)); - } - - assert!(consumer.contains(3), "latest kept"); - assert!(!consumer.contains(0), "oldest evicted"); - assert!(producer.len() <= 2, "len was {}", producer.len()); - } - - #[test] - fn no_duration_pressure_without_timestamps() { - // A duration bound with timestamp-less groups never flushes (byte bounds would still). - let bounds = Bounds::new( - Limit::duration(Duration::from_secs(1)), - Limit::duration(Duration::from_secs(2)), - ); - let mut producer = Config::new(bounds).produce(); - for seq in 0..20 { - assert!(producer.insert(plain(seq, 1000)).is_none()); - } - assert_eq!(producer.len(), 20); - } - - #[test] - fn latest_tracks_highest_sequence_out_of_order() { - let mut producer = Config::default().produce(); - producer.insert(plain(5, 1)); - producer.insert(plain(2, 1)); - producer.insert(plain(9, 1)); - producer.insert(plain(7, 1)); - assert_eq!(producer.latest(), Some(9)); - } - - #[test] - fn out_of_order_old_insert_can_flush_immediately() { - // Inserting a stale (low) sequence into a full cache evicts it (or an older one) at once. 
- let bounds = Bounds::new(Limit::bytes(40), Limit::bytes(50)); - let mut producer = Config::new(bounds).produce(); - for seq in 10..14 { - producer.insert(plain(seq, 20)); - } - let batch = producer.insert(plain(0, 20)); - assert!(batch.is_some()); - assert_eq!(producer.latest(), Some(13)); - assert!(!producer.consume().contains(0), "stale insert flushed first"); - } - - #[test] - fn is_empty_and_len() { - let mut producer = Config::default().produce(); - assert!(producer.is_empty()); - producer.insert(plain(0, 1)); - assert!(!producer.is_empty()); - assert_eq!(producer.len(), 1); - } - #[tokio::test] async fn bridge_round_trips_a_live_group() { // Build a live timed group, drain it into a cached group, rebuild a live one, drain again, @@ -705,36 +343,37 @@ mod tests { #[cfg(not(target_arch = "wasm32"))] #[tokio::test] - async fn spills_to_disk_and_fetches_back() { + async fn tiers_evict_then_fetch_back() { use object_store::memory::InMemory; use object_store::path::Path; - // RAM keeps ~1 group (flush over 250 bytes, down to 100); disk is unbounded so it keeps - // everything it is handed. + // Disk is unbounded so it keeps everything handed to it. let disk = Disk::new(Arc::new(InMemory::new()), Path::from("cache"), Bounds::default()); - let mut config = Config::new(Bounds::new(Limit::bytes(100), Limit::bytes(250))); - config.disk = Some(disk); - - let mut producer = config.produce(); - let consumer = producer.consume(); - - for seq in 0..5 { - producer.insert(plain(seq, 100)); + let tiers = Tiers::spawn(disk); + + // Build three finished live groups and hand them to the flush task as one eviction pass. 
+ let mut consumers = Vec::new(); + for seq in 0..3u64 { + let mut live = crate::GroupProducer::new(crate::Group { sequence: seq }, None); + live.write_frame(Bytes::from(vec![seq as u8; 100])).unwrap(); + live.finish().unwrap(); + consumers.push(live.consume()); } + tiers.evict(consumers); - // The oldest group was evicted from RAM and flushed to disk by the background task. - let mut from_disk = None; + // The background task writes them to disk; fetch reads them back. + let mut fetched = None; for _ in 0..200 { - if let Some(group) = consumer.fetch(0).await { - from_disk = Some(group); + if let Some(group) = tiers.fetch(0, None).await { + fetched = Some(group); break; } tokio::task::yield_now().await; } - - assert!(consumer.get(0).is_none(), "group 0 is no longer in the RAM tier"); - assert_eq!(from_disk.expect("group 0 fetched from disk").sequence, 0); - // A recent group is still served from RAM. - assert!(consumer.fetch(4).await.is_some()); + let mut group = fetched.expect("group 0 fetched from disk"); + assert_eq!(group.sequence, 0); + assert_eq!(group.read_frame().await.unwrap().unwrap(), Bytes::from(vec![0u8; 100])); + assert!(tiers.fetch(2, None).await.is_some()); + assert!(tiers.fetch(99, None).await.is_none()); } } diff --git a/rs/moq-net/src/model/group.rs b/rs/moq-net/src/model/group.rs index 470bca67d..a16a04142 100644 --- a/rs/moq-net/src/model/group.rs +++ b/rs/moq-net/src/model/group.rs @@ -185,6 +185,13 @@ impl GroupProducer { self.timescale } + /// The media timestamp of the last buffered frame, for the track cache's media-time + /// retention gate. `None` when the group has no frames yet or the track is untimed. + /// Reads the producer's own buffer, so it never blocks. + pub(crate) fn last_timestamp(&self) -> Option { + self.state.read().frames.back().and_then(|f| f.timestamp) + } + /// A helper method to write a frame from a single byte buffer. /// /// If you want to write multiple chunks, use [Self::create_frame] to get a frame producer. 
diff --git a/rs/moq-net/src/model/mod.rs b/rs/moq-net/src/model/mod.rs index b6556cfa0..2cd55b13c 100644 --- a/rs/moq-net/src/model/mod.rs +++ b/rs/moq-net/src/model/mod.rs @@ -8,8 +8,8 @@ mod subscription; mod time; mod track; -/// Per-track group cache (RAM tier and eviction policy). Namespaced: `cache::Producer`, -/// `cache::Consumer`, `cache::Config`. +/// Per-track durable cache: the disk/remote spill tiers below a track's live RAM window. +/// Attached via [`TrackProducer::with_cache`]. Namespaced: `cache::Disk`, `cache::Group`. pub mod cache; pub use bandwidth::*; diff --git a/rs/moq-net/src/model/track.rs b/rs/moq-net/src/model/track.rs index f86e9b01a..3fcbd1406 100644 --- a/rs/moq-net/src/model/track.rs +++ b/rs/moq-net/src/model/track.rs @@ -187,6 +187,13 @@ struct TrackState { // uncached groups, so a cache-miss `fetch` on an accepted track fails fast // instead of blocking forever (mirrors `BroadcastState::dynamic`). dynamic: usize, + + // Optional durable spill below the live `groups` window: groups aged out of `groups` are + // serialized to the disk tier, and a fetch that misses `groups` reads them back. Shared by the + // producer and every consumer through this state. Native-only (object_store doesn't build on + // wasm). Set via `TrackProducer::with_cache`. + #[cfg(not(target_arch = "wasm32"))] + cache: Option, } impl TrackState { @@ -384,13 +391,32 @@ impl TrackState { } } - /// Evict groups older than `max_age`, never evicting the max_sequence group. + /// Evict groups that fall outside the retention window by either gate, never evicting the + /// max_sequence group. Evicted groups are handed to the cache (when one is attached) so a later + /// fetch can read them back from disk. + /// + /// Two gates, both sized by `max_age`; a group is evicted when it trips either: + /// - **wall-clock**: it was received more than `max_age` ago. The hard memory backstop, so a + /// publisher can't pin RAM by lying about media timestamps. 
+ /// - **media-time**: its last frame's media timestamp is more than `max_age` behind the live + /// media edge. Bounds a startup stampede where a burst of buffered media arrives at once + /// (all "received now", so the wall-clock gate alone would keep it all). /// - /// Groups are in arrival order, so we can stop early when we hit a non-expired, - /// non-max_sequence group (everything after it arrived even later). - /// When max_sequence is at the front, we skip past it and tombstone expired groups - /// behind it. + /// Groups arrive in wall-clock order, but a late out-of-order group can be media-expired + /// anywhere in the deque, so this scans the whole (small) window rather than breaking early. fn evict_expired(&mut self, now: web_async::time::Instant, max_age: Duration) { + // The live media edge: the newest frame timestamp across buffered groups. + let media_now = self + .groups + .iter() + .flatten() + .filter_map(|(group, _)| group.last_timestamp()) + .max(); + + let mut removed = Vec::new(); + #[cfg(not(target_arch = "wasm32"))] + let mut evicted = Vec::new(); + for slot in self.groups.iter_mut() { let Some((group, created_at)) = slot else { continue }; @@ -398,14 +424,31 @@ impl TrackState { continue; } - if now.duration_since(*created_at) <= max_age { - break; + let wall_expired = now.duration_since(*created_at) > max_age; + let media_expired = match (media_now, group.last_timestamp()) { + (Some(latest), Some(ts)) => Duration::from(latest).saturating_sub(Duration::from(ts)) > max_age, + _ => false, + }; + if !wall_expired && !media_expired { + continue; } - self.duplicates.remove(&group.sequence); + removed.push(group.sequence); + #[cfg(not(target_arch = "wasm32"))] + if self.cache.is_some() { + evicted.push(group.consume()); + } *slot = None; } + for sequence in removed { + self.duplicates.remove(&sequence); + } + #[cfg(not(target_arch = "wasm32"))] + if let Some(cache) = &self.cache { + cache.evict(evicted); + } + // Trim leading tombstones to 
advance the offset. while let Some(None) = self.groups.front() { self.groups.pop_front(); @@ -661,7 +704,6 @@ impl TrackProducer { TrackConsumer { name: self.name.clone(), state: self.state.consume(), - cache: None, } } @@ -692,28 +734,18 @@ impl TrackProducer { } } - /// Fill `cache` with this track's groups as they are produced, so a [`TrackConsumer`] sharing - /// the cache (via [`cache::Producer::consume`]) can serve them without a wire fetch. - /// - /// Spawns an internal subscriber that drains each finished group into the cache. Two caveats - /// follow from that, both being addressed in the cache design (see `rs/moq-net/CACHE.md`): + /// Attach a durable disk (and optional remote) cache below this track's live window. /// - /// - The subscriber counts as a consumer, so [`unused`](Self::unused) never resolves while a - /// cache is attached. A relay that drops idle tracks via demand will not drop a cached one; - /// it stays alive until it ends or the producer is dropped. Intended for "keep recording - /// when idle", but it disables demand-driven teardown for this track. - /// - Groups are drained in arrival order and [`cache::Group::read`] resolves only once a group - /// finishes, so a stalled group head-of-line-blocks the caching of later finished ones. - pub fn with_cache(self, mut cache: cache::Producer) -> Self { - let mut subscriber = self.subscribe(None); - web_async::spawn(async move { - // A drained band (over the RAM watermark) is dropped here until a disk tier consumes it. - while let Ok(Some(group)) = subscriber.recv_group().await { - if let Ok(group) = cache::Group::read(group).await { - let _ = cache.insert(group); - } - } - }); + /// Groups aged out of the live `groups` window are serialized to `disk` instead of just + /// dropped, and a [`fetch_group`](TrackConsumer::fetch_group) that misses the live window reads + /// them back from disk (or the rolled-up remote tier). 
The cache lives on the shared track + /// state, so every [`TrackConsumer`] of this track serves from it automatically. Native-only + /// (`object_store` does not build on wasm). + #[cfg(not(target_arch = "wasm32"))] + pub fn with_cache(self, disk: cache::Disk) -> Self { + if let Ok(mut state) = self.modify() { + state.cache = Some(cache::Tiers::spawn(disk)); + } self } @@ -927,7 +959,6 @@ impl TrackWeak { TrackConsumer { name: self.name.clone(), state: self.state.consume(), - cache: None, } } @@ -991,9 +1022,6 @@ impl TrackDemand { pub struct TrackConsumer { name: Arc, state: kio::Consumer, - /// Optional read-through cache (RAM tier). A `get_group` / `fetch_group` miss on the live - /// state falls through to this before failing or waiting on a `TrackDynamic`. - cache: Option, } impl TrackConsumer { @@ -1002,19 +1030,6 @@ impl TrackConsumer { &self.name } - /// Attach a read-through cache: `get_group` / `fetch_group` resolve locally on a cache hit. - /// Share the [`cache::Producer::consume`] handle of the cache a [`TrackProducer`] fills to - /// serve a track's recent groups without a wire fetch. - pub fn with_cache(mut self, cache: cache::Consumer) -> Self { - self.cache = Some(cache); - self - } - - /// The track's negotiated timescale, needed to rebuild a cached group. - fn timescale(&self) -> Option { - self.state.read().info.as_ref().and_then(|info| info.timescale) - } - pub(crate) fn weak(&self) -> TrackWeak { TrackWeak { name: self.name.clone(), @@ -1038,28 +1053,22 @@ impl TrackConsumer { })) } - /// Return a cached group by sequence without blocking, or `None` if it isn't in - /// the cache. Use [`Self::fetch_group`] to wait for a group that a [`TrackDynamic`] - /// will serve on demand. + /// Return a live-cached group by sequence without blocking, or `None` if it isn't in the live + /// window. A group spilled to the durable cache is only reachable via the async + /// [`Self::fetch_group`], as is one a [`TrackDynamic`] serves on demand. 
pub fn get_group(&self, sequence: u64) -> Option { - if let Some(group) = self.state.read().cached_group(sequence) { - return Some(group); - } - // Live miss: fall through to the read-through cache, rebuilding the group at the track's - // timescale. A cache decode/rebuild error is treated as a miss. - let cached = self.cache.as_ref()?.get(sequence)?; - cached.produce(self.timescale()).ok() + self.state.read().cached_group(sequence) } /// Fetch a single past group, without holding a live subscription. /// /// Returns a [`kio::Pending`] that resolves to the [`GroupConsumer`]: - /// immediately if the group is cached, otherwise once a [`TrackDynamic`] serves - /// the request (a wire FETCH for a relay). `options` accepts `None`, a [`Fetch`], - /// or `Fetch::default()`. + /// immediately if the group is in the live window, otherwise once it is read back from the + /// durable cache (when one is attached) or a [`TrackDynamic`] serves the request (a wire FETCH + /// for a relay). `options` accepts `None`, a [`Fetch`], or `Fetch::default()`. /// /// Fails synchronously with [`Error::NotFound`] when the group can never be served - /// (past the final sequence, or no [`TrackDynamic`] on the track), or the track's + /// (past the final sequence, or no cache and no [`TrackDynamic`] on the track), or the track's /// abort error if it's already closed. pub fn fetch_group(&self, sequence: u64, options: impl Into>) -> Result> { let options = options.into().unwrap_or_default(); @@ -1069,25 +1078,21 @@ impl TrackConsumer { .write() .map_err(|s| s.abort.clone().unwrap_or(Error::Dropped))?; match state.poll_fetch(sequence) { - // Cached live: the pending resolves immediately from state, no handler needed. + // Cached live: the pending resolves immediately from state, no lookup needed. Poll::Ready(Ok(_)) => {} - // Live miss. 
Serve from the read-through cache if it holds the group and it rebuilds - // at the track's timescale (faster than waiting on a handler, and the only option when - // there is none). A rebuild error is treated as a miss, consistent with `get_group`, - // falling through to the live behavior. + // Live miss. If a durable cache is attached, spawn an async lookup across its + // disk/remote tiers; the returned `TrackFetch` resolves from it on a hit and falls + // through to the live decision on a miss. other => { - let timescale = state.info.as_ref().and_then(|info| info.timescale); - if let Some(group) = self - .cache - .as_ref() - .and_then(|c| c.get(sequence)) - .and_then(|g| g.produce(timescale).ok()) - { + #[cfg(not(target_arch = "wasm32"))] + if let Some(tiers) = &state.cache { + let timescale = state.info.as_ref().and_then(|info| info.timescale); + let lookup = spawn_cache_lookup(tiers, sequence, timescale); drop(state); return Ok(kio::Pending::new(TrackFetch { state: self.state.clone(), sequence, - cached: Some(group), + lookup: Some(lookup), })); } match other { @@ -1107,7 +1112,8 @@ impl TrackConsumer { Ok(kio::Pending::new(TrackFetch { state: self.state.clone(), sequence, - cached: None, + #[cfg(not(target_arch = "wasm32"))] + lookup: None, })) } @@ -1232,27 +1238,66 @@ impl GroupRequest { } } +/// A pending durable-cache lookup spawned for a [`TrackFetch`] on a live miss. The background task +/// writes [`Done`](CacheLookup::Done) once the disk/remote tiers resolve (the group, or `None` on a +/// miss). Native-only (the durable cache doesn't build on wasm). +#[cfg(not(target_arch = "wasm32"))] +enum CacheLookup { + /// The lookup task is still reading the tiers. + Pending, + /// The lookup finished: the rebuilt group, or `None` on a miss. + Done(Option), +} + +/// Spawn a background task that reads `sequence` from the cache's disk/remote tiers, rebuilds it at +/// `timescale`, and publishes the result through the returned slot. 
+#[cfg(not(target_arch = "wasm32"))] +fn spawn_cache_lookup(tiers: &cache::Tiers, sequence: u64, timescale: Option) -> kio::Consumer { + let slot = kio::Producer::new(CacheLookup::Pending); + let consumer = slot.consume(); + let store = tiers.store_handle(); + web_async::spawn(async move { + let group = match store.read().await.get(sequence).await { + Ok(Some(group)) => group.produce(timescale).ok(), + // Miss, tier read error, or rebuild failure: report a miss so the fetch falls through. + Ok(None) | Err(_) => None, + }; + if let Ok(mut slot) = slot.write() { + *slot = CacheLookup::Done(group); + } + }); + consumer +} + /// The pollable state of a [`TrackConsumer::fetch_group`]. /// -/// Awaited via the [`kio::Pending`] wrapper; resolves to the -/// [`GroupConsumer`] once the group lands in the track's cache (already present, -/// or produced after a wire FETCH), or [`Error::NotFound`] if it can never exist. +/// Awaited via the [`kio::Pending`] wrapper; resolves to the [`GroupConsumer`] once the group is +/// read back from the durable cache, lands in the live window (e.g. after a wire FETCH), or +/// [`Error::NotFound`] if it can never exist. pub struct TrackFetch { state: kio::Consumer, sequence: u64, - /// A group already rebuilt from the read-through cache. When set, the fetch resolves from it - /// instead of polling the live state. - cached: Option, + /// A durable-cache lookup spawned on a live miss. On a hit it resolves the fetch; on a miss the + /// poll falls through to the live state. Native-only. + #[cfg(not(target_arch = "wasm32"))] + lookup: Option>, } impl kio::Future for TrackFetch { type Output = Result; fn poll(&self, waiter: &kio::Waiter) -> Poll { - // A cache hit resolves immediately. `poll` returns `Ready` on first call, so the clone - // happens once. - if let Some(group) = &self.cached { - return Poll::Ready(Ok(group.clone())); + // A durable-cache lookup, if one was spawned, resolves the fetch on a hit. 
On a miss (or if + // the task died without publishing) fall through to the live state below. + #[cfg(not(target_arch = "wasm32"))] + if let Some(lookup) = &self.lookup { + let resolved = ready!(lookup.poll(waiter, |slot| match &**slot { + CacheLookup::Pending => Poll::Pending, + CacheLookup::Done(group) => Poll::Ready(group.clone()), + })); + if let Ok(Some(group)) = resolved { + return Poll::Ready(Ok(group)); + } } // `poll_fetch` already yields a `Result` (group, or NotFound / // abort); the outer error is the channel closing without one. @@ -1506,7 +1551,6 @@ impl TrackRequest { TrackConsumer { name: self.name.clone(), state: self.state.consume(), - cache: None, } } @@ -1706,73 +1750,70 @@ mod test { } } - #[test] - fn get_group_falls_through_to_cache() { - let producer = TrackProducer::new("test", None); - // The live track has no groups; a read-through cache holds sequence 42. - let mut writer = cache::Config::default().produce(); - writer.insert(cache::Group { - sequence: 42, - frames: vec![cache::Frame { - timestamp: None, - payload: bytes::Bytes::from_static(b"hi"), - }], - }); + /// A disk-backed cache over an in-memory object store, retaining 1s of live groups. + #[cfg(not(target_arch = "wasm32"))] + fn disk_cached_producer() -> TrackProducer { + use object_store::memory::InMemory; + use object_store::path::Path; + let disk = cache::Disk::new(Arc::new(InMemory::new()), Path::from("test"), cache::Bounds::default()); + TrackProducer::new("test", TrackInfo::default().with_cache(Duration::from_secs(1))).with_cache(disk) + } - // Without the cache the group is a miss; with it, it resolves. - assert!(producer.consume().get_group(42).is_none()); - let group = producer.consume().with_cache(writer.consume()).get_group(42); - assert_eq!(group.expect("served from cache").sequence, 42); + /// Write+finish a single-frame group at the next sequence. 
+ fn write_group(producer: &mut TrackProducer, payload: &'static [u8]) { + let mut group = producer.append_group().unwrap(); + group.write_frame(bytes::Bytes::from_static(payload)).unwrap(); + group.finish().unwrap(); } + #[cfg(not(target_arch = "wasm32"))] #[tokio::test] - async fn fetch_group_serves_from_cache() { - let producer = TrackProducer::new("test", None); - let mut writer = cache::Config::default().produce(); - writer.insert(cache::Group { - sequence: 7, - frames: vec![cache::Frame { - timestamp: None, - payload: bytes::Bytes::from_static(b"data"), - }], - }); - let consumer = producer.consume().with_cache(writer.consume()); + async fn get_group_does_not_read_disk() { + tokio::time::pause(); + let mut producer = disk_cached_producer(); + write_group(&mut producer, b"hello"); // seq 0 - // No live group 7 and no TrackDynamic: the fetch is served from the cache instead of - // failing with NotFound, and the frame reads back byte-for-byte. - let mut group = consumer.fetch_group(7, None).unwrap().await.unwrap(); - assert_eq!(group.sequence, 7); - assert_eq!( - group.read_frame().await.unwrap().unwrap(), - bytes::Bytes::from_static(b"data") - ); + // seq 0 is live: get_group sees it synchronously. + assert!(producer.consume().get_group(0).is_some()); + + // Age seq 0 out of the live window; it spills to disk. + tokio::time::advance(Duration::from_secs(2)).await; + write_group(&mut producer, b"world"); // seq 1, evicts seq 0 + + // get_group is sync and never reads disk, so an evicted group is a miss there. + assert!(producer.consume().get_group(0).is_none()); + assert!(producer.consume().get_group(1).is_some()); } + #[cfg(not(target_arch = "wasm32"))] #[tokio::test] - async fn producer_populates_cache() { - // A producer with a cache drains its finished groups into it; a reader sharing the cache - // then sees them. Producer fills, consumer reads, end to end. 
- let writer = cache::Config::default().produce(); - let reader = writer.consume(); - let mut producer = TrackProducer::new("test", None).with_cache(writer); + async fn fetch_group_serves_evicted_group_from_disk() { + tokio::time::pause(); + let mut producer = disk_cached_producer(); + write_group(&mut producer, b"hello"); // seq 0 - let mut group = producer.append_group().unwrap(); // seq 0 - group.write_frame(bytes::Bytes::from_static(b"hello")).unwrap(); - group.finish().unwrap(); + // Age seq 0 out of the live window so it is flushed to the disk tier. + tokio::time::advance(Duration::from_secs(2)).await; + write_group(&mut producer, b"world"); // seq 1, evicts seq 0 + + let consumer = producer.consume(); - // Let the spawned populate task drain the finished group into the cache. - let mut cached = None; - for _ in 0..100 { - if let Some(group) = reader.get(0) { - cached = Some(group); + // The background flush is async; retry the fetch until the disk write lands. With no + // TrackDynamic, a disk miss resolves to NotFound, so a failed fetch just means "not yet". 
+ let mut served = None; + for _ in 0..500 { + if let Ok(group) = consumer.fetch_group(0, None).unwrap().await { + served = Some(group); break; } tokio::task::yield_now().await; } - - let cached = cached.expect("group populated into cache"); - assert_eq!(cached.frames.len(), 1); - assert_eq!(cached.frames[0].payload, bytes::Bytes::from_static(b"hello")); + let mut group = served.expect("evicted group served from disk"); + assert_eq!(group.sequence, 0); + assert_eq!( + group.read_frame().await.unwrap().unwrap(), + bytes::Bytes::from_static(b"hello") + ); } #[tokio::test] From 8f4d39f34189e94c188ed66dbf7ac6d7786e323c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 04:51:40 +0000 Subject: [PATCH 21/25] feat(moq-net): chain upstream on a cache miss A track with a durable cache attached now falls through to a TrackDynamic (the wire FETCH) when the disk/remote tiers miss, instead of dead-ending in NotFound. The store-lookup task, on a miss, queues the request for a TrackDynamic when one exists; the TrackFetch then resolves once upstream serves the group into the live window. Queuing only after the store misses keeps the store the fast path and avoids a redundant upstream fetch when the group is already cached. With no handler, a miss is still NotFound (now via the async lookup rather than synchronously). A fetch past the final sequence or on an aborted track skips the cache and reports synchronously as before. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/CACHE.md | 11 +++-- rs/moq-net/src/model/track.rs | 89 ++++++++++++++++++++++++++++++----- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/rs/moq-net/CACHE.md b/rs/moq-net/CACHE.md index 7087ebc1b..d7b4b158f 100644 --- a/rs/moq-net/CACHE.md +++ b/rs/moq-net/CACHE.md @@ -68,13 +68,19 @@ flush task: (one background task per cached trac fetch_group(seq): live hit in `groups` -> serve immediately live miss, cache attached -> spawn an async disk/remote lookup; a hit - resolves the fetch, a miss falls through + resolves the fetch, a miss chains upstream + (queues for a TrackDynamic), else NotFound live miss, no cache -> queue for a TrackDynamic, or NotFound ``` `get_group(seq)` stays synchronous and only consults the live window; a spilled group is reachable only through the async `fetch_group`. +On a tier miss the lookup task chains upstream: it queues the request for a `TrackDynamic` (a wire +FETCH for a relay) when one exists, so the fetch then resolves once upstream serves the group into +the live window. Queuing only *after* the store misses keeps the store the fast path and avoids a +redundant upstream fetch when the group is already cached. With no handler, a miss is `NotFound`. + Batching the disk write per eviction pass keeps a stampede-trim (many groups evicted at once) to a single object. A steady-state single eviction still writes one small disk segment per group; the remote tier is where rollup (`segment::rollup`) concatenates those into large objects, so a @@ -104,9 +110,6 @@ window above. ## Still design -- **Disk-then-upstream fetch.** A track with a disk cache serves old content from the store; it - does not also chain to a `TrackDynamic` (wire FETCH) on a store miss. A relay that wants "disk, - then upstream" would queue the dynamic fetch after the store lookup misses; additive later. 
- **Removing `TrackInfo::cache`.** The retention window is still read from the wire-carried `TrackInfo::cache`. Making retention purely local policy (and dropping the wire field) is a separate wire change. diff --git a/rs/moq-net/src/model/track.rs b/rs/moq-net/src/model/track.rs index 3fcbd1406..93a51cff0 100644 --- a/rs/moq-net/src/model/track.rs +++ b/rs/moq-net/src/model/track.rs @@ -1080,20 +1080,27 @@ impl TrackConsumer { match state.poll_fetch(sequence) { // Cached live: the pending resolves immediately from state, no lookup needed. Poll::Ready(Ok(_)) => {} - // Live miss. If a durable cache is attached, spawn an async lookup across its - // disk/remote tiers; the returned `TrackFetch` resolves from it on a hit and falls - // through to the live decision on a miss. + // Live miss. If a durable cache is attached and the group could still exist, spawn an + // async lookup across its disk/remote tiers. The returned `TrackFetch` resolves from it + // on a hit; on a miss the lookup task chains upstream (queues for a `TrackDynamic`) and + // the fetch falls through to that live decision. other => { #[cfg(not(target_arch = "wasm32"))] - if let Some(tiers) = &state.cache { - let timescale = state.info.as_ref().and_then(|info| info.timescale); - let lookup = spawn_cache_lookup(tiers, sequence, timescale); - drop(state); - return Ok(kio::Pending::new(TrackFetch { - state: self.state.clone(), - sequence, - lookup: Some(lookup), - })); + { + // A group past the final sequence can never exist in any tier, and an aborted + // track is terminal, so skip the cache and report those synchronously below. 
+ let exhausted = state.abort.is_some() || state.final_sequence.is_some_and(|fin| sequence >= fin); + if !exhausted && let Some(tiers) = &state.cache { + let timescale = state.info.as_ref().and_then(|info| info.timescale); + let lookup = + spawn_cache_lookup(tiers, self.state.clone(), sequence, options.priority, timescale); + drop(state); + return Ok(kio::Pending::new(TrackFetch { + state: self.state.clone(), + sequence, + lookup: Some(lookup), + })); + } } match other { // Unservable (NotFound) or already aborted: report it synchronously. @@ -1251,8 +1258,19 @@ enum CacheLookup { /// Spawn a background task that reads `sequence` from the cache's disk/remote tiers, rebuilds it at /// `timescale`, and publishes the result through the returned slot. +/// +/// On a tier miss the task chains upstream: it queues the request for a [`TrackDynamic`] (a wire +/// FETCH for a relay) when one exists, so the [`TrackFetch`] then resolves once upstream serves it. +/// Queuing only after the store misses keeps the store the fast path and avoids a redundant +/// upstream fetch when the group is already cached. #[cfg(not(target_arch = "wasm32"))] -fn spawn_cache_lookup(tiers: &cache::Tiers, sequence: u64, timescale: Option) -> kio::Consumer { +fn spawn_cache_lookup( + tiers: &cache::Tiers, + state: kio::Consumer, + sequence: u64, + priority: u8, + timescale: Option, +) -> kio::Consumer { let slot = kio::Producer::new(CacheLookup::Pending); let consumer = slot.consume(); let store = tiers.store_handle(); @@ -1262,6 +1280,13 @@ fn spawn_cache_lookup(tiers: &cache::Tiers, sequence: u64, timescale: Option None, }; + if group.is_none() + && let Ok(mut state) = state.write() + && state.dynamic > 0 + { + // Cache miss: chain upstream so a handler fetches the group into the live window. 
+ state.fetches.push_back(GroupRequested { sequence, priority }); + } if let Ok(mut slot) = slot.write() { *slot = CacheLookup::Done(group); } @@ -1816,6 +1841,44 @@ mod test { ); } + #[cfg(not(target_arch = "wasm32"))] + #[tokio::test] + async fn fetch_chains_upstream_on_cache_miss() { + // A group that is neither live nor in the cache must still reach a TrackDynamic: the cache + // miss chains upstream rather than dead-ending in NotFound. + let producer = disk_cached_producer(); + let dynamic = producer.dynamic(); + let consumer = producer.consume(); + + let fetch = consumer.fetch_group(5, None).unwrap(); + + // The async cache miss queues the request, which the handler then receives and serves. + let request = dynamic.requested_group().await.unwrap(); + assert_eq!(request.sequence(), 5); + let mut group = request.accept(None).unwrap(); + group.write_frame(bytes::Bytes::from_static(b"upstream")).unwrap(); + group.finish().unwrap(); + + let mut served = fetch.await.unwrap(); + assert_eq!(served.sequence, 5); + assert_eq!( + served.read_frame().await.unwrap().unwrap(), + bytes::Bytes::from_static(b"upstream") + ); + } + + #[cfg(not(target_arch = "wasm32"))] + #[tokio::test] + async fn fetch_cache_miss_without_dynamic_is_not_found() { + // A cache miss with no handler to chain to resolves NotFound, not a hang. + let producer = disk_cached_producer(); + let consumer = producer.consume(); + assert!(matches!( + consumer.fetch_group(5, None).unwrap().await, + Err(Error::NotFound) + )); + } + #[tokio::test] async fn no_eviction_when_fresh() { tokio::time::pause(); From 3db189c187a4bff01220380a40ea735b7757a936 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 18:14:21 +0000 Subject: [PATCH 22/25] fix(moq-net): bound the cache flush backlog and only spill finished groups Three robustness fixes from review of the durable cache: - Bound the flush channel (was unbounded). 
A queued eviction pass pins its groups' frame buffers, so an unbounded channel let a slow disk migrate the RAM the live tier just freed into the channel backlog, defeating the memory bound eviction exists to enforce. `evict` now `try_send`s (it runs under the track state lock and must not block) and drops on a full backlog, since the cache is best-effort: a hole beats unbounded RAM growth. - Only spill finished groups. Draining an open group via `Group::read` parks the flush task until the group completes (or forever if the writer stalled). An unfinished evicted group is dropped from the live tier as before, just not cached. - Coalesce every queued eviction pass into one segment in the flush task (drain the channel with try_recv after each recv), so a backlog or a stampede-trim becomes one disk object instead of one per pass, and sort the batch by sequence. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/src/model/cache/mod.rs | 46 +++++++++++++++++++++++-------- rs/moq-net/src/model/group.rs | 6 ++++ rs/moq-net/src/model/track.rs | 32 ++++++++++++++++++++- 3 files changed, 72 insertions(+), 12 deletions(-) diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index a72b5434d..f0adebaab 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -200,6 +200,13 @@ impl Group { /// A band of groups serialized to a tier in one flush, oldest first. pub type Batch = Vec; +/// Backlog of eviction passes the flush task may fall behind before evicted groups are dropped +/// rather than queued. A queued pass pins its groups' frame buffers, so an unbounded queue would +/// let a slow disk migrate the RAM the live tier just freed into the channel. The cache is +/// best-effort, so on overflow we drop (creating a hole) instead of growing memory. 
+#[cfg(not(target_arch = "wasm32"))] +const FLUSH_BACKLOG: usize = 256; + /// The disk/remote spill handle held on a track's shared state. /// /// Holds a sender to a background task that drains evicted groups to the disk tier, and the store @@ -207,8 +214,9 @@ pub type Batch = Vec; /// [`crate::TrackProducer::with_cache`]. #[cfg(not(target_arch = "wasm32"))] pub(crate) struct Tiers { - /// Hands each batch of evicted live groups to the background flush task. - flush: tokio::sync::mpsc::UnboundedSender>, + /// Hands each batch of evicted live groups to the background flush task. Bounded (see + /// [`FLUSH_BACKLOG`]); a full channel drops rather than blocks the eviction path. + flush: tokio::sync::mpsc::Sender>, /// The disk/remote store, shared with the flush task; used to serve fetch misses. store: Arc>, } @@ -219,14 +227,19 @@ impl Tiers { pub(crate) fn spawn(disk: Disk) -> Self { let store = store::Store::new(disk.store, disk.remote, disk.prefix, disk.bounds); let store = Arc::new(tokio::sync::RwLock::new(store)); - let (flush, mut rx) = tokio::sync::mpsc::unbounded_channel::>(); + let (flush, mut rx) = tokio::sync::mpsc::channel::>(FLUSH_BACKLOG); let writer = store.clone(); web_async::spawn(async move { - // Each message is one eviction pass. Drain its groups and write them as a single - // segment, so a stampede-trim (many groups at once) is one object rather than many. - while let Some(consumers) = rx.recv().await { - let mut batch = Batch::with_capacity(consumers.len()); - for consumer in consumers { + while let Some(first) = rx.recv().await { + // Coalesce every eviction pass already queued into one segment, so a backlog (or a + // stampede-trim) becomes one disk object rather than one per pass. 
+ let mut passes = vec![first]; + while let Ok(more) = rx.try_recv() { + passes.push(more); + } + + let mut batch = Batch::new(); + for consumer in passes.into_iter().flatten() { match Group::read(consumer).await { Ok(group) => batch.push(group), // A group torn down before we drained it (e.g. abort) is dropped, not cached. @@ -236,6 +249,8 @@ impl Tiers { if batch.is_empty() { continue; } + // Keep groups in ascending sequence so the segment's footer is ordered. + batch.sort_by_key(|group| group.sequence); if let Err(err) = writer.write().await.flush(batch).await { tracing::warn!(%err, "cache disk flush failed"); } @@ -244,10 +259,19 @@ impl Tiers { Self { flush, store } } - /// Hand a batch of evicted live groups to the flush task. Dropped silently once the task is gone. + /// Hand a batch of evicted live groups to the flush task. Non-blocking (the caller holds the + /// track state lock): a full backlog or a gone task drops the batch rather than waiting, leaving + /// a hole in the best-effort cache instead of stalling eviction or growing RAM. pub(crate) fn evict(&self, groups: Vec) { - if !groups.is_empty() { - let _ = self.flush.send(groups); + if groups.is_empty() { + return; + } + if let Err(err) = self.flush.try_send(groups) { + let dropped = match &err { + tokio::sync::mpsc::error::TrySendError::Full(g) => g.len(), + tokio::sync::mpsc::error::TrySendError::Closed(g) => g.len(), + }; + tracing::warn!(dropped, "cache flush backlog full; dropping evicted groups"); } } diff --git a/rs/moq-net/src/model/group.rs b/rs/moq-net/src/model/group.rs index a16a04142..b9c63eb69 100644 --- a/rs/moq-net/src/model/group.rs +++ b/rs/moq-net/src/model/group.rs @@ -192,6 +192,12 @@ impl GroupProducer { self.state.read().frames.back().and_then(|f| f.timestamp) } + /// Whether the group has been finished (no more frames will be written). The track cache only + /// spills finished groups: draining an open one would park the flush task until it completes. 
+ pub(crate) fn is_finished(&self) -> bool { + self.state.read().fin + } + /// A helper method to write a frame from a single byte buffer. /// /// If you want to write multiple chunks, use [Self::create_frame] to get a frame producer. diff --git a/rs/moq-net/src/model/track.rs b/rs/moq-net/src/model/track.rs index 93a51cff0..b6a8eacd8 100644 --- a/rs/moq-net/src/model/track.rs +++ b/rs/moq-net/src/model/track.rs @@ -434,8 +434,11 @@ impl TrackState { } removed.push(group.sequence); + // Only spill finished groups: draining an open one would park the flush task until it + // completes (or forever, if the writer stalled). An unfinished evicted group is dropped + // from the live tier as before, just not cached. #[cfg(not(target_arch = "wasm32"))] - if self.cache.is_some() { + if self.cache.is_some() && group.is_finished() { evicted.push(group.consume()); } *slot = None; @@ -1879,6 +1882,33 @@ mod test { )); } + #[cfg(not(target_arch = "wasm32"))] + #[tokio::test] + async fn unfinished_evicted_group_is_not_spilled() { + tokio::time::pause(); + let mut producer = disk_cached_producer(); + + // An open (never-finished) group, kept open by holding its producer handle. + let mut open = producer.append_group().unwrap(); // seq 0 + open.write_frame(bytes::Bytes::from_static(b"partial")).unwrap(); + + // Age it out of the live window; it is dropped, not handed to the flush task (draining an + // open group would park the task forever). + tokio::time::advance(Duration::from_secs(2)).await; + write_group(&mut producer, b"next"); // seq 1, evicts the open seq 0 + + let consumer = producer.consume(); + for _ in 0..50 { + tokio::task::yield_now().await; + } + // seq 0 is gone from RAM and was never spilled, and there is no dynamic: a clean NotFound, + // not a hang. 
+ assert!(matches!( + consumer.fetch_group(0, None).unwrap().await, + Err(Error::NotFound) + )); + } + #[tokio::test] async fn no_eviction_when_fresh() { tokio::time::pause(); From 29f1054d09c04daf7d656b355cafd660ef44295a Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 18:17:56 +0000 Subject: [PATCH 23/25] build: bump quinn-proto to 0.11.15 (RUSTSEC-2026-0185) cargo-deny flagged RUSTSEC-2026-0185, a remote memory-exhaustion vulnerability in quinn-proto 0.11.14 (unbounded out-of-order stream reassembly). Pulled in transitively via quinn -> web-transport-quinn -> moq-native. 0.11.15 is the fix release and a drop-in patch (same dependency tree), so this is a surgical lockfile bump. Unrelated to the cache feature; it just lands here to get the PR's CI green. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 275ec9d97..5b01e8f23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6104,9 +6104,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.14" +version = "0.11.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +checksum = "4fcb935c5bec503c2f0e306bdd3e58bb9029dcb14fa8d9ac76e3a5256ac0763e" dependencies = [ "aws-lc-rs", "bytes", From 9b8e1b9396c6892788124475a954b1a7c76ef7d6 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 19:43:13 +0000 Subject: [PATCH 24/25] feat(moq-net): abort stale arrivals; run cache rollup upload off-lock Two review follow-ups: - Media-time gate now drops stale arrivals outright instead of archiving them. 
A group evicted by the media gate (media already past the window the instant it lands: a startup burst or a lagging publisher) is not spilled to the cache, and if it is still open it is aborted (Error::Old) so a producer still downloading a group too stale to keep stops wasting bandwidth and releases its buffers. Only a finished group aged out by the wall-clock gate is archived. A finished media-stale group (e.g. a deliberately fetched old group) is dropped without aborting, so a consumer still reading it is unaffected. - compact no longer holds the store lock across the remote upload. Split into plan_compaction (locked: snapshot the rollup, reading disk bytes and building the rolled object) -> Rollup::upload (unlocked: the slow remote put) -> apply_compaction (locked: repoint the index, delete disk). The rolled object is refcounted Bytes, so the snapshot is cheap, and the disk segments stay indexed until apply, so a concurrent fetch is unaffected. Store::compact keeps the all-in-one path for tests. The flush task binds each phase to its own statement so the lock guard drops at the `;`; holding it across the match also self-deadlocked the re-entrant write() in apply. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-net/src/model/cache/mod.rs | 65 ++++++++++++++ rs/moq-net/src/model/cache/store.rs | 128 ++++++++++++++++++++-------- rs/moq-net/src/model/track.rs | 78 ++++++++++++++--- 3 files changed, 224 insertions(+), 47 deletions(-) diff --git a/rs/moq-net/src/model/cache/mod.rs b/rs/moq-net/src/model/cache/mod.rs index f0adebaab..12c212383 100644 --- a/rs/moq-net/src/model/cache/mod.rs +++ b/rs/moq-net/src/model/cache/mod.rs @@ -253,6 +253,26 @@ impl Tiers { batch.sort_by_key(|group| group.sequence); if let Err(err) = writer.write().await.flush(batch).await { tracing::warn!(%err, "cache disk flush failed"); + continue; + } + + // Compact in phases so the slow remote upload runs without the store lock that + // fetches need: plan (locked) snapshots the rollup, upload (unlocked) does the remote + // put, apply (locked) repoints the index. Bind each phase to its own statement so the + // lock guard drops at the `;` rather than being held (a held guard would also + // deadlock the re-entrant `write()` in apply). + let planned = writer.write().await.plan_compaction().await; + match planned { + Ok(Some(rollup)) => { + if let Err(err) = rollup.upload().await { + // The index still points at the intact disk segments; safe to leave. 
+ tracing::warn!(%err, "cache remote rollup upload failed"); + } else if let Err(err) = writer.write().await.apply_compaction(rollup).await { + tracing::warn!(%err, "cache rollup apply failed"); + } + } + Ok(None) => {} + Err(err) => tracing::warn!(%err, "cache compaction planning failed"), } } }); @@ -400,4 +420,49 @@ mod tests { assert!(tiers.fetch(2, None).await.is_some()); assert!(tiers.fetch(99, None).await.is_none()); } + + #[cfg(not(target_arch = "wasm32"))] + #[tokio::test] + async fn tiers_roll_up_to_remote_and_fetch() { + use object_store::memory::InMemory; + use object_store::path::Path; + + // Disk keeps ~1 segment (promote over budget); the rolled-up bytes go to the remote tier. + // This exercises the phased plan -> upload (off-lock) -> apply path in the flush task. + let bounds = Bounds::new(Limit::bytes(1100), Limit::bytes(2000)); + let disk = + Disk::new(Arc::new(InMemory::new()), Path::from("cache"), bounds).with_remote(Arc::new(InMemory::new())); + let tiers = Tiers::spawn(disk); + + // Evict five ~1 KB groups, one pass at a time, so the disk tier exceeds budget and rolls up. + for seq in 0..5u64 { + let mut live = crate::GroupProducer::new(crate::Group { sequence: seq }, None); + live.write_frame(Bytes::from(vec![seq as u8; 1000])).unwrap(); + live.finish().unwrap(); + tiers.evict(vec![live.consume()]); + // Let the flush task process this pass (flush + compaction) before the next eviction, so + // each becomes its own segment rather than coalescing into one. + for _ in 0..50 { + tokio::task::yield_now().await; + } + } + + // Every group is still fetchable, whether it stayed on disk or rolled up to the remote tier. 
+ for seq in 0..5u64 { + let mut found = None; + for _ in 0..200 { + if let Some(group) = tiers.fetch(seq, None).await { + found = Some(group); + break; + } + tokio::task::yield_now().await; + } + let mut group = found.unwrap_or_else(|| panic!("group {seq} fetchable after rollup")); + assert_eq!(group.sequence, seq); + assert_eq!( + group.read_frame().await.unwrap().unwrap(), + Bytes::from(vec![seq as u8; 1000]) + ); + } + } } diff --git a/rs/moq-net/src/model/cache/store.rs b/rs/moq-net/src/model/cache/store.rs index cfec6f680..a0c7de815 100644 --- a/rs/moq-net/src/model/cache/store.rs +++ b/rs/moq-net/src/model/cache/store.rs @@ -8,6 +8,7 @@ use std::ops::Range; use std::sync::Arc; +use bytes::Bytes; use object_store::{ObjectStore, PutPayload, path::Path}; use super::index::{Index, SegmentId, Tier}; @@ -70,7 +71,8 @@ impl Store { self.prefix.child(dir).child(id.to_string()) } - /// Persist a flushed band as one disk segment, then compact if the disk tier is over budget. + /// Persist a flushed band as one disk segment. Does not compact: the caller drives compaction + /// (see [`Self::plan_compaction`]) so the slow remote upload can run without the store lock. pub async fn flush(&mut self, batch: Batch) -> Result<(), Error> { if batch.is_empty() { return Ok(()); @@ -84,7 +86,7 @@ impl Store { .await?; let added = self.index.add(Tier::Disk, &segment); debug_assert_eq!(added, id, "index id drifted from the written key"); - self.compact().await + Ok(()) } /// Fetch a group by sequence: locate it, ranged-read its blob, decode it. `None` if not stored. @@ -101,48 +103,100 @@ impl Store { Ok(Some(segment::group_from_blob(sequence, bytes)?)) } - /// Bring the disk tier within bounds: roll the oldest segments up into one remote object, or - /// evict them when there is no remote tier. A no-op when the disk tier is within bounds. 
- pub async fn compact(&mut self) -> Result<(), Error> { + /// Phase 1 of compaction, run **under the store lock**: if the disk tier is over bounds, snapshot + /// a rollup of the oldest segments (read their bytes and build the rolled object), reserving the + /// remote segment id. With no remote tier, drop the oldest disk segments inline instead. Returns + /// the snapshot to [upload](Rollup::upload), or `None` when within bounds or evicted inline. + /// + /// Disk reads happen here (local, fast); only the remote upload is slow, and it runs after this + /// returns so the lock can be released across it. The disk segments stay in place and indexed + /// until [`Self::apply_compaction`], so a concurrent fetch still reads them. + pub async fn plan_compaction(&mut self) -> Result, Error> { let promoted = self.index.promotion(self.bounds); if promoted.is_empty() { - return Ok(()); + return Ok(None); } - match self.remote.clone() { - Some(remote) => { - // Read the promoted disk segments whole and roll them into one. - let mut segments = Vec::with_capacity(promoted.len()); - for id in &promoted { - let bytes = self.disk.get(&self.key(Tier::Disk, *id)).await?.bytes().await?; - segments.push(bytes); - } - let rolled = segment::rollup(&segments)?; - let rolled_segment = Segment::open(rolled.clone())?; - // Upload the remote object before repointing the index, so a failed put leaves the - // index (still pointing at the disk segments) intact. - let new_id = self.index.next_id(); - remote - .put(&self.key(Tier::Remote, new_id), PutPayload::from_bytes(rolled)) - .await?; - let applied = self.index.apply_promotion(&promoted, &rolled_segment); - debug_assert_eq!(applied, new_id, "index id drifted from the uploaded key"); - // Best-effort cleanup; an index now pointing at remote makes any leftover disk - // objects orphans, not inconsistency. 
- for id in &promoted { - self.disk.delete(&self.key(Tier::Disk, *id)).await?; - } - } - None => { - // No remote tier: drop the oldest disk segments outright. - for id in &promoted { - self.disk.delete(&self.key(Tier::Disk, *id)).await?; - } - self.index.evict(&promoted); + let Some(remote) = self.remote.clone() else { + // No remote tier: drop the oldest disk segments outright (local deletes, fast). + for id in &promoted { + self.disk.delete(&self.key(Tier::Disk, *id)).await?; } + self.index.evict(&promoted); + return Ok(None); + }; + + // Read the promoted disk segments whole and roll them into one. + let mut segments = Vec::with_capacity(promoted.len()); + for id in &promoted { + let bytes = self.disk.get(&self.key(Tier::Disk, *id)).await?.bytes().await?; + segments.push(bytes); + } + let rolled = segment::rollup(&segments)?; + let segment = Segment::open(rolled.clone())?; + // Reserve the id (and thus the key) the upload writes to. Only the flush task mutates the + // index, so `next_id` is stable until `apply_compaction` consumes it. + let new_id = self.index.next_id(); + let key = self.key(Tier::Remote, new_id); + Ok(Some(Rollup { + promoted, + rolled, + segment, + new_id, + remote, + key, + })) + } + + /// Phase 3 of compaction, run **under the store lock** after [`Rollup::upload`] succeeds: repoint + /// the index at the uploaded remote object and delete the now-orphaned disk segments. + pub async fn apply_compaction(&mut self, rollup: Rollup) -> Result<(), Error> { + let applied = self.index.apply_promotion(&rollup.promoted, &rollup.segment); + debug_assert_eq!(applied, rollup.new_id, "index id drifted from the uploaded key"); + // Best-effort cleanup; an index now pointing at remote makes any leftover disk objects + // orphans, not inconsistency. + for id in &rollup.promoted { + self.disk.delete(&self.key(Tier::Disk, *id)).await?; } Ok(()) } + + /// Bring the disk tier within bounds in one call, holding the lock across the remote upload. 
+ /// Convenience for callers that don't need to release the lock (tests, single-tier setups); the + /// tiered cache path uses [`plan_compaction`](Self::plan_compaction) / + /// [`Rollup::upload`] / [`apply_compaction`](Self::apply_compaction) so a slow remote upload + /// doesn't block fetches. + pub async fn compact(&mut self) -> Result<(), Error> { + if let Some(rollup) = self.plan_compaction().await? { + rollup.upload().await?; + self.apply_compaction(rollup).await?; + } + Ok(()) + } +} + +/// A snapshot of a planned disk -> remote rollup, taken under the store lock by +/// [`Store::plan_compaction`] so the slow remote upload can run without holding it. The rolled +/// bytes are reference-counted, so the snapshot is cheap to carry across the unlocked upload. +pub struct Rollup { + promoted: Vec, + rolled: Bytes, + segment: Segment, + new_id: SegmentId, + remote: Arc, + key: Path, +} + +impl Rollup { + /// Upload the rolled object to the remote tier. Run this **without** holding the store lock; the + /// index still points at the (intact) disk segments until [`Store::apply_compaction`], so a + /// failed upload is a safe no-op to retry and a concurrent fetch is unaffected. + pub async fn upload(&self) -> Result<(), Error> { + self.remote + .put(&self.key, PutPayload::from_bytes(self.rolled.clone())) + .await?; + Ok(()) + } } #[cfg(test)] @@ -185,6 +239,7 @@ mod tests { for seq in 0..5 { store.flush(vec![group(seq, 1000)]).await.unwrap(); + store.compact().await.unwrap(); } // Every group is still readable, whether it stayed on disk or rolled up to remote. @@ -202,6 +257,7 @@ mod tests { for seq in 0..5 { store.flush(vec![group(seq, 1000)]).await.unwrap(); + store.compact().await.unwrap(); } // The newest group is retained; the oldest was evicted (no remote to promote into). 
diff --git a/rs/moq-net/src/model/track.rs b/rs/moq-net/src/model/track.rs index b6a8eacd8..994fd6280 100644 --- a/rs/moq-net/src/model/track.rs +++ b/rs/moq-net/src/model/track.rs @@ -392,15 +392,20 @@ impl TrackState { } /// Evict groups that fall outside the retention window by either gate, never evicting the - /// max_sequence group. Evicted groups are handed to the cache (when one is attached) so a later - /// fetch can read them back from disk. + /// max_sequence group. /// /// Two gates, both sized by `max_age`; a group is evicted when it trips either: /// - **wall-clock**: it was received more than `max_age` ago. The hard memory backstop, so a - /// publisher can't pin RAM by lying about media timestamps. + /// publisher can't pin RAM by lying about media timestamps. A finished group aged out this + /// way is archived to the cache (when one is attached) so a later fetch reads it back. /// - **media-time**: its last frame's media timestamp is more than `max_age` behind the live - /// media edge. Bounds a startup stampede where a burst of buffered media arrives at once - /// (all "received now", so the wall-clock gate alone would keep it all). + /// media edge. Drops a stale arrival (a group whose media is already past the window the + /// instant it lands, e.g. a startup burst or a lagging publisher) rather than spending RAM on + /// media too old to serve. A media-stale group is not archived. + /// + /// An unfinished evicted group is [aborted](GroupProducer::abort): a producer still filling it + /// (a wire receive loop downloading a group already too stale to keep) stops and releases its + /// buffers instead of finishing a group we will immediately drop. /// /// Groups arrive in wall-clock order, but a late out-of-order group can be media-expired /// anywhere in the deque, so this scans the whole (small) window rather than breaking early. 
@@ -434,12 +439,21 @@ impl TrackState { } removed.push(group.sequence); - // Only spill finished groups: draining an open one would park the flush task until it - // completes (or forever, if the writer stalled). An unfinished evicted group is dropped - // from the live tier as before, just not cached. - #[cfg(not(target_arch = "wasm32"))] - if self.cache.is_some() && group.is_finished() { - evicted.push(group.consume()); + if group.is_finished() { + // A finished group that aged out by wall-clock is archived to the cache. A finished + // group that is only media-stale (a deliberately fetched old group, or a brief live + // group) is dropped without archiving and without aborting, so a consumer still + // reading it is unaffected. + #[cfg(not(target_arch = "wasm32"))] + if wall_expired && self.cache.is_some() { + evicted.push(group.consume()); + } + } else { + // An unfinished group is dropped from RAM regardless. Abort it so a producer still + // filling it (e.g. a wire receive loop downloading a group already too stale to + // serve) stops wasting bandwidth and releases its buffers, rather than finishing a + // group we will never keep. + let _ = group.abort(Error::Old); } *slot = None; } @@ -1909,6 +1923,48 @@ mod test { )); } + #[tokio::test] + async fn media_stale_unfinished_group_is_aborted() { + tokio::time::pause(); + let scale = crate::Timescale::new(1_000_000).unwrap(); // microseconds + + fn timed_frame(group: &mut GroupProducer, scale: crate::Timescale, micros: u64, payload: &'static [u8]) { + let info = crate::Frame { + size: payload.len() as u64, + timestamp: Some(crate::Timestamp::new(micros, scale).unwrap()), + }; + let mut frame = group.create_frame(info).unwrap(); + frame.write(bytes::Bytes::from_static(payload)).unwrap(); + frame.finish().unwrap(); + } + + // Retain 10s of media (and wall-clock); no time advances, so only the media gate can fire. 
+ let mut producer = TrackProducer::new( + "test", + TrackInfo::default() + .with_timescale(scale) + .with_cache(Duration::from_secs(10)), + ); + + // A stale group still being received: one frame at media t=0, left open (producer held). + let mut stale = producer.create_group(Group { sequence: 0 }).unwrap(); + let mut reader = stale.consume(); + timed_frame(&mut stale, scale, 0, b"old"); + + // The live edge jumps media time to 20s, well past the 10s window. + let mut edge = producer.create_group(Group { sequence: 1 }).unwrap(); + timed_frame(&mut edge, scale, 20_000_000, b"new"); + edge.finish().unwrap(); + + // A further insert runs eviction with media_now = 20s; the open group 0 (t=0) is >10s stale. + producer.append_group().unwrap(); // seq 2 + + // The stale, still-open group was aborted (Error::Old), signaling its receiver to stop, and + // it is gone from the live window. + assert!(matches!(reader.read_frame().await, Err(Error::Old))); + assert!(producer.consume().get_group(0).is_none()); + } + #[tokio::test] async fn no_eviction_when_fresh() { tokio::time::pause(); From ad3de0abb28982ee0d047c1388e08e4af10f2178 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 04:06:13 +0000 Subject: [PATCH 25/25] refactor(moq-net): fetch_group is an inline-async future, no spawned task Addresses review: "make something we poll to completion" (no async tasks in model) and "way too much wasm gating". fetch_group no longer spawns a background lookup task. It now returns an owned future that does its disk/remote I/O inline when polled, driven by the caller's await (every caller already awaits it). The future: 1. resolves a live hit (or a terminal miss: aborted / past the final sequence), 2. reads the durable cache across the disk/remote tiers (native only, no lock held across the await), 3. on a miss, chains upstream by queuing for a TrackDynamic and waits via kio. Removes TrackFetch, CacheLookup, and spawn_cache_lookup. 
The wasm gating collapses to a single cfg block around the tier read (wasm has no durable cache, so it goes straight from live miss to the dynamic/NotFound path). fetch_group now returns `impl Future<Output = Result<GroupConsumer>>` instead of `Result<kio::Pending<TrackFetch>>`; callers drop the inner `?`/`.unwrap()` before the await. Tests that relied on eager (synchronous) queuing now drive the fetch concurrently with the handler. This is step 1 of the cache poll-to-completion refactor; the flush-task spawn and the store lock-across-IO are next. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01R8gBynFAeeVnxuffr4ZKUo --- rs/moq-native/tests/broadcast.rs | 4 +- rs/moq-net/src/lite/publisher.rs | 2 +- rs/moq-net/src/model/track.rs | 297 +++++++++++-------------------- rs/moq-relay/src/web.rs | 2 +- 4 files changed, 109 insertions(+), 196 deletions(-) diff --git a/rs/moq-native/tests/broadcast.rs b/rs/moq-native/tests/broadcast.rs index 35406518a..dad27b4e2 100644 --- a/rs/moq-native/tests/broadcast.rs +++ b/rs/moq-native/tests/broadcast.rs @@ -307,7 +307,7 @@ async fn lite05_fetch_roundtrip(scheme: &str) { // Fetch group 0 directly, without subscribing. No live producer holds the group // on the client, so this issues a wire FETCH upstream. let mut group_sub = tokio::time::timeout(TIMEOUT, async { - bc.track("video").unwrap().fetch_group(0, None).unwrap().await + bc.track("video").unwrap().fetch_group(0, None).await }) .await .expect("fetch timed out") @@ -451,7 +451,7 @@ async fn lite05_fetch_during_subscribe(scheme: &str) { // relay doesn't have it cached (subscription started at the latest), so this // must issue a wire FETCH concurrently with the live subscription.
let mut fetched = tokio::time::timeout(TIMEOUT, async { - bc.track("video").unwrap().fetch_group(0, None).unwrap().await + bc.track("video").unwrap().fetch_group(0, None).await }) .await .expect("fetch timed out") diff --git a/rs/moq-net/src/lite/publisher.rs b/rs/moq-net/src/lite/publisher.rs index b2660045a..f48553353 100644 --- a/rs/moq-net/src/lite/publisher.rs +++ b/rs/moq-net/src/lite/publisher.rs @@ -682,7 +682,7 @@ impl Publisher { crate::Fetch { priority: fetch.priority, }, - )? + ) .await?; // FETCH is gated to lite-05+, which always carries timestamps when the track diff --git a/rs/moq-net/src/model/track.rs b/rs/moq-net/src/model/track.rs index 994fd6280..eb60f1bbe 100644 --- a/rs/moq-net/src/model/track.rs +++ b/rs/moq-net/src/model/track.rs @@ -1079,66 +1079,78 @@ impl TrackConsumer { /// Fetch a single past group, without holding a live subscription. /// - /// Returns a [`kio::Pending`] that resolves to the [`GroupConsumer`]: - /// immediately if the group is in the live window, otherwise once it is read back from the - /// durable cache (when one is attached) or a [`TrackDynamic`] serves the request (a wire FETCH - /// for a relay). `options` accepts `None`, a [`Fetch`], or `Fetch::default()`. + /// The returned future resolves to the [`GroupConsumer`]: immediately if the group is in the + /// live window, otherwise once it is read back from the durable cache (when one is attached) or + /// a [`TrackDynamic`] serves the request (a wire FETCH for a relay). `options` accepts `None`, a + /// [`Fetch`], or `Fetch::default()`. /// - /// Fails synchronously with [`Error::NotFound`] when the group can never be served - /// (past the final sequence, or no cache and no [`TrackDynamic`] on the track), or the track's - /// abort error if it's already closed. 
- pub fn fetch_group(&self, sequence: u64, options: impl Into>) -> Result> { - let options = options.into().unwrap_or_default(); - - let mut state = self - .state - .write() - .map_err(|s| s.abort.clone().unwrap_or(Error::Dropped))?; - match state.poll_fetch(sequence) { - // Cached live: the pending resolves immediately from state, no lookup needed. - Poll::Ready(Ok(_)) => {} - // Live miss. If a durable cache is attached and the group could still exist, spawn an - // async lookup across its disk/remote tiers. The returned `TrackFetch` resolves from it - // on a hit; on a miss the lookup task chains upstream (queues for a `TrackDynamic`) and - // the fetch falls through to that live decision. - other => { - #[cfg(not(target_arch = "wasm32"))] - { - // A group past the final sequence can never exist in any tier, and an aborted - // track is terminal, so skip the cache and report those synchronously below. - let exhausted = state.abort.is_some() || state.final_sequence.is_some_and(|fin| sequence >= fin); - if !exhausted && let Some(tiers) = &state.cache { - let timescale = state.info.as_ref().and_then(|info| info.timescale); - let lookup = - spawn_cache_lookup(tiers, self.state.clone(), sequence, options.priority, timescale); - drop(state); - return Ok(kio::Pending::new(TrackFetch { - state: self.state.clone(), - sequence, - lookup: Some(lookup), - })); + /// Resolves to [`Error::NotFound`] when the group can never be served (past the final sequence, + /// or no cache and no [`TrackDynamic`] on the track), or the track's abort error if it's closed. + /// + /// The future does its own disk/remote I/O when polled (driven by the caller's `await`); it + /// spawns nothing. The store is consulted before chaining upstream, so a cached group never + /// triggers a redundant wire FETCH. 
+ pub fn fetch_group( + &self, + sequence: u64, + options: impl Into>, + ) -> impl std::future::Future> + 'static { + let priority = options.into().unwrap_or_default().priority; + let state = self.state.clone(); + // Snapshot the durable cache handle (and the track's timescale) up front; the async body + // reads the tiers without touching the track lock. Absent on wasm (no durable cache). + #[cfg(not(target_arch = "wasm32"))] + let cache = { + let state = self.state.read(); + state.cache.as_ref().map(|tiers| { + ( + tiers.store_handle(), + state.info.as_ref().and_then(|info| info.timescale), + ) + }) + }; + + async move { + // 1. Decide from the live window: a hit resolves now; a terminal miss (aborted, or past + // the final sequence) can't exist in any tier and resolves now. + { + let live = state.read(); + match live.poll_fetch(sequence) { + Poll::Ready(Ok(group)) => return Ok(group), + Poll::Ready(Err(err)) => { + if live.abort.is_some() || live.final_sequence.is_some_and(|fin| sequence >= fin) { + return Err(err); + } } - } - match other { - // Unservable (NotFound) or already aborted: report it synchronously. - Poll::Ready(Err(err)) => return Err(err), - // A handler exists but the group isn't cached yet: queue it. - Poll::Pending => state.fetches.push_back(GroupRequested { - sequence, - priority: options.priority, - }), - Poll::Ready(Ok(_)) => unreachable!("handled above"), + Poll::Pending => {} } } - } - drop(state); - Ok(kio::Pending::new(TrackFetch { - state: self.state.clone(), - sequence, + // 2. Durable cache (native only): read across the disk/remote tiers, no lock held. #[cfg(not(target_arch = "wasm32"))] - lookup: None, - })) + if let Some((store, timescale)) = &cache + && let Ok(Some(group)) = store.read().await.get(sequence).await + && let Ok(consumer) = group.produce(*timescale) + { + return Ok(consumer); + } + + // 3. Cache miss: chain upstream by queuing for a `TrackDynamic`, then wait for it to + // serve the group into the live window. 
With no handler this resolves NotFound. + { + let mut live = state.write().map_err(|s| s.abort.clone().unwrap_or(Error::Dropped))?; + match live.poll_fetch(sequence) { + Poll::Ready(res) => return res, + Poll::Pending => live.fetches.push_back(GroupRequested { sequence, priority }), + } + } + kio::wait(|waiter| match state.poll(waiter, |live| live.poll_fetch(sequence)) { + Poll::Ready(Ok(res)) => Poll::Ready(res), + Poll::Ready(Err(closed)) => Poll::Ready(Err(closed.abort.clone().unwrap_or(Error::Dropped))), + Poll::Pending => Poll::Pending, + }) + .await + } } pub fn info(&self) -> kio::Pending { @@ -1262,96 +1274,6 @@ impl GroupRequest { } } -/// A pending durable-cache lookup spawned for a [`TrackFetch`] on a live miss. The background task -/// writes [`Done`](CacheLookup::Done) once the disk/remote tiers resolve (the group, or `None` on a -/// miss). Native-only (the durable cache doesn't build on wasm). -#[cfg(not(target_arch = "wasm32"))] -enum CacheLookup { - /// The lookup task is still reading the tiers. - Pending, - /// The lookup finished: the rebuilt group, or `None` on a miss. - Done(Option), -} - -/// Spawn a background task that reads `sequence` from the cache's disk/remote tiers, rebuilds it at -/// `timescale`, and publishes the result through the returned slot. -/// -/// On a tier miss the task chains upstream: it queues the request for a [`TrackDynamic`] (a wire -/// FETCH for a relay) when one exists, so the [`TrackFetch`] then resolves once upstream serves it. -/// Queuing only after the store misses keeps the store the fast path and avoids a redundant -/// upstream fetch when the group is already cached. 
-#[cfg(not(target_arch = "wasm32"))] -fn spawn_cache_lookup( - tiers: &cache::Tiers, - state: kio::Consumer, - sequence: u64, - priority: u8, - timescale: Option, -) -> kio::Consumer { - let slot = kio::Producer::new(CacheLookup::Pending); - let consumer = slot.consume(); - let store = tiers.store_handle(); - web_async::spawn(async move { - let group = match store.read().await.get(sequence).await { - Ok(Some(group)) => group.produce(timescale).ok(), - // Miss, tier read error, or rebuild failure: report a miss so the fetch falls through. - Ok(None) | Err(_) => None, - }; - if group.is_none() - && let Ok(mut state) = state.write() - && state.dynamic > 0 - { - // Cache miss: chain upstream so a handler fetches the group into the live window. - state.fetches.push_back(GroupRequested { sequence, priority }); - } - if let Ok(mut slot) = slot.write() { - *slot = CacheLookup::Done(group); - } - }); - consumer -} - -/// The pollable state of a [`TrackConsumer::fetch_group`]. -/// -/// Awaited via the [`kio::Pending`] wrapper; resolves to the [`GroupConsumer`] once the group is -/// read back from the durable cache, lands in the live window (e.g. after a wire FETCH), or -/// [`Error::NotFound`] if it can never exist. -pub struct TrackFetch { - state: kio::Consumer, - sequence: u64, - /// A durable-cache lookup spawned on a live miss. On a hit it resolves the fetch; on a miss the - /// poll falls through to the live state. Native-only. - #[cfg(not(target_arch = "wasm32"))] - lookup: Option>, -} - -impl kio::Future for TrackFetch { - type Output = Result; - - fn poll(&self, waiter: &kio::Waiter) -> Poll { - // A durable-cache lookup, if one was spawned, resolves the fetch on a hit. On a miss (or if - // the task died without publishing) fall through to the live state below. 
- #[cfg(not(target_arch = "wasm32"))] - if let Some(lookup) = &self.lookup { - let resolved = ready!(lookup.poll(waiter, |slot| match &**slot { - CacheLookup::Pending => Poll::Pending, - CacheLookup::Done(group) => Poll::Ready(group.clone()), - })); - if let Ok(Some(group)) = resolved { - return Poll::Ready(Ok(group)); - } - } - // `poll_fetch` already yields a `Result` (group, or NotFound / - // abort); the outer error is the channel closing without one. - Poll::Ready( - match ready!(self.state.poll(waiter, |state| state.poll_fetch(self.sequence))) { - Ok(res) => res, - Err(closed) => Err(closed.abort.clone().unwrap_or(Error::Dropped)), - }, - ) - } -} - /// A live subscription to a track, used to read its groups. /// /// Created via [`TrackConsumer::subscribe`](crate::TrackConsumer::subscribe), or @@ -1844,7 +1766,7 @@ mod test { // TrackDynamic, a disk miss resolves to NotFound, so a failed fetch just means "not yet". let mut served = None; for _ in 0..500 { - if let Ok(group) = consumer.fetch_group(0, None).unwrap().await { + if let Ok(group) = consumer.fetch_group(0, None).await { served = Some(group); break; } @@ -1867,16 +1789,19 @@ mod test { let dynamic = producer.dynamic(); let consumer = producer.consume(); - let fetch = consumer.fetch_group(5, None).unwrap(); - - // The async cache miss queues the request, which the handler then receives and serves. - let request = dynamic.requested_group().await.unwrap(); - assert_eq!(request.sequence(), 5); - let mut group = request.accept(None).unwrap(); - group.write_frame(bytes::Bytes::from_static(b"upstream")).unwrap(); - group.finish().unwrap(); + // Drive the fetch and the handler concurrently: the fetch's cache miss queues the request + // (on first poll), which the handler then receives and serves. 
+ let fetch = consumer.fetch_group(5, None); + let serve = async { + let request = dynamic.requested_group().await.unwrap(); + assert_eq!(request.sequence(), 5); + let mut group = request.accept(None).unwrap(); + group.write_frame(bytes::Bytes::from_static(b"upstream")).unwrap(); + group.finish().unwrap(); + }; + let (served, ()) = tokio::join!(fetch, serve); - let mut served = fetch.await.unwrap(); + let mut served = served.unwrap(); assert_eq!(served.sequence, 5); assert_eq!( served.read_frame().await.unwrap().unwrap(), @@ -1890,10 +1815,7 @@ mod test { // A cache miss with no handler to chain to resolves NotFound, not a hang. let producer = disk_cached_producer(); let consumer = producer.consume(); - assert!(matches!( - consumer.fetch_group(5, None).unwrap().await, - Err(Error::NotFound) - )); + assert!(matches!(consumer.fetch_group(5, None).await, Err(Error::NotFound))); } #[cfg(not(target_arch = "wasm32"))] @@ -1917,10 +1839,7 @@ mod test { } // seq 0 is gone from RAM and was never spilled, and there is no dynamic: a clean NotFound, // not a hang. - assert!(matches!( - consumer.fetch_group(0, None).unwrap().await, - Err(Error::NotFound) - )); + assert!(matches!(consumer.fetch_group(0, None).await, Err(Error::NotFound))); } #[tokio::test] @@ -2684,7 +2603,7 @@ mod test { let dynamic = producer.dynamic(); let consumer = producer.consume(); assert!(consumer.get_group(0).is_some()); - let mut g = consumer.fetch_group(0, None).unwrap().await.unwrap(); + let mut g = consumer.fetch_group(0, None).await.unwrap(); assert_eq!(g.sequence, 0); assert_eq!(&g.read_frame().await.unwrap().unwrap()[..], b"hello"); @@ -2698,39 +2617,34 @@ mod test { let dynamic = producer.dynamic(); let consumer = producer.consume(); - // A cache miss isn't in `get_group`, but a dynamic handler exists, so - // `fetch_group` stays pending and queues a request. `*pending` derefs the - // wrapper to the inner `TrackFetch` (a `kio::Future`). 
+ // A cache miss isn't in `get_group`, but a dynamic handler exists, so the fetch queues a + // request (on first poll) and resolves once it's served. Drive the fetch and the handler + // concurrently. assert!(consumer.get_group(5).is_none()); - let pending = consumer.fetch_group(5, Fetch::default().with_priority(7)).unwrap(); - assert!(kio::Future::poll(&*pending, &kio::Waiter::noop()).is_pending()); - - let req = dynamic - .requested_group() - .now_or_never() - .expect("should not block") - .unwrap(); - assert_eq!(req.sequence(), 5); - assert_eq!(req.priority(), 7); - - // Serve it by accepting the request; the fetch then resolves. - let mut group = req.accept(None).unwrap(); - group.write_frame(bytes::Bytes::from_static(b"hi")).unwrap(); - group.finish().unwrap(); + let fetch = consumer.fetch_group(5, Fetch::default().with_priority(7)); + let serve = async { + let req = dynamic.requested_group().await.unwrap(); + assert_eq!(req.sequence(), 5); + assert_eq!(req.priority(), 7); + let mut group = req.accept(None).unwrap(); + group.write_frame(bytes::Bytes::from_static(b"hi")).unwrap(); + group.finish().unwrap(); + }; + let (g, ()) = tokio::join!(fetch, serve); - let mut g = pending.await.unwrap(); + let mut g = g.unwrap(); assert_eq!(g.sequence, 5); assert_eq!(&g.read_frame().await.unwrap().unwrap()[..], b"hi"); } #[tokio::test] async fn fetch_miss_no_dynamic_not_found() { - // A track with no `TrackDynamic` can't serve old content, so a cache miss - // fails fast (synchronously) instead of blocking forever. + // A track with no `TrackDynamic` can't serve old content, so a cache miss resolves NotFound + // instead of blocking forever. 
let mut producer = TrackProducer::new("test", None); producer.append_group().unwrap(); // seq 0, but we miss on seq 5 let consumer = producer.consume(); - assert!(matches!(consumer.fetch_group(5, None), Err(Error::NotFound))); + assert!(matches!(consumer.fetch_group(5, None).await, Err(Error::NotFound))); } #[tokio::test] @@ -2739,11 +2653,11 @@ mod test { producer.append_group().unwrap(); // seq 0 producer.finish().unwrap(); // final_sequence = 1 - // A group at or past the final sequence can never exist, even with a handler, - // so it fails fast synchronously. + // A group at or past the final sequence can never exist, even with a handler, so it resolves + // NotFound without consulting any tier. let dynamic = producer.dynamic(); let consumer = producer.consume(); - assert!(matches!(consumer.fetch_group(5, None), Err(Error::NotFound))); + assert!(matches!(consumer.fetch_group(5, None).await, Err(Error::NotFound))); // And it doesn't signal the dynamic handler. assert!(dynamic.poll_requested_group(&kio::Waiter::noop()).is_pending()); @@ -2755,11 +2669,10 @@ mod test { let dynamic = producer.dynamic(); let consumer = producer.consume(); - let pending = consumer.fetch_group(3, None).unwrap(); - assert!(kio::Future::poll(&*pending, &kio::Waiter::noop()).is_pending()); - + // Abort before awaiting: the fetch sees the terminal state on its first poll and errors. + let fetch = consumer.fetch_group(3, None); producer.abort(Error::Cancel).unwrap(); - assert!(pending.await.is_err()); + assert!(fetch.await.is_err()); drop(dynamic); } } diff --git a/rs/moq-relay/src/web.rs b/rs/moq-relay/src/web.rs index b19e8e95d..6fc9a2ad9 100644 --- a/rs/moq-relay/src/web.rs +++ b/rs/moq-relay/src/web.rs @@ -600,7 +600,7 @@ async fn serve_fetch( Err(err) => Err(err), }, // A one-shot fetch, no subscription required. 
- FetchGroup::Num(sequence) => async { broadcast.track(&track)?.fetch_group(sequence, None)?.await } + FetchGroup::Num(sequence) => async { broadcast.track(&track)?.fetch_group(sequence, None).await } .await .map(Some), };