From 181988200973c483bddfd746a07f28bb4348988c Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Mon, 15 Jun 2026 14:10:44 -0600 Subject: [PATCH 01/13] moq-gst: add moqsinkspike parallel element (lifecycle core) A parallel sink element (moqsinkspike, Rank::NONE) reimplementing the moqsink core with isolated, testable seams: pure timeline policy, per-pad state, a bounded data channel with cancellation independent of the data path, single finalization (catalog last), and observable connection status. The production moqsink is left untouched. Co-Authored-By: Claude Opus 4.8 --- rs/moq-gst/src/lib.rs | 2 + rs/moq-gst/src/spike/imp.rs | 275 +++++++++++++++ rs/moq-gst/src/spike/mod.rs | 14 + rs/moq-gst/src/spike/session.rs | 558 +++++++++++++++++++++++++++++++ rs/moq-gst/src/spike/timeline.rs | 132 ++++++++ 5 files changed, 981 insertions(+) create mode 100644 rs/moq-gst/src/spike/imp.rs create mode 100644 rs/moq-gst/src/spike/mod.rs create mode 100644 rs/moq-gst/src/spike/session.rs create mode 100644 rs/moq-gst/src/spike/timeline.rs diff --git a/rs/moq-gst/src/lib.rs b/rs/moq-gst/src/lib.rs index 69bff0f10..95f640fd5 100644 --- a/rs/moq-gst/src/lib.rs +++ b/rs/moq-gst/src/lib.rs @@ -2,6 +2,7 @@ use gst::glib; mod sink; mod source; +mod spike; use tracing::level_filters::LevelFilter; use tracing_subscriber::EnvFilter; @@ -9,6 +10,7 @@ use tracing_subscriber::EnvFilter; pub fn plugin_init(plugin: &gst::Plugin) -> Result<(), glib::BoolError> { sink::register(plugin)?; source::register(plugin)?; + spike::register(plugin)?; let filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) diff --git a/rs/moq-gst/src/spike/imp.rs b/rs/moq-gst/src/spike/imp.rs new file mode 100644 index 000000000..da89cce2c --- /dev/null +++ b/rs/moq-gst/src/spike/imp.rs @@ -0,0 +1,275 @@ +//! GObject shell, deliberately a parallel element (`moqsinkspike`) so production `moqsink` is untouched. + +use std::sync::{Arc, LazyLock, Mutex}; + +use anyhow::{Context, Result}; +use bytes::Bytes; +use gst::glib; +use gst::prelude::*; +use gst::subclass::prelude::*; + +use super::session::{DataMsg, ResolvedSettings, SessionHandle, Status, CAT}; + +#[derive(Debug, Clone, Default)] +struct Settings { + url: Option, + broadcast: Option, + tls_disable_verify: bool, +} + +impl TryFrom for ResolvedSettings { + type Error = anyhow::Error; + + fn try_from(value: Settings) -> Result { + Ok(Self { + url: url::Url::parse(value.url.as_ref().context("url property is required")?)?, + broadcast: value.broadcast.as_ref().context("broadcast property is required")?.clone(), + tls_disable_verify: value.tls_disable_verify, + }) + } +} + +#[derive(Default)] +pub struct MoqSinkSpike { + settings: Mutex, + session: Mutex>, + status: Arc, +} + +#[glib::object_subclass] +impl ObjectSubclass for MoqSinkSpike { + const NAME: &'static str = "MoqSinkSpike"; + type Type = super::MoqSinkSpike; + type ParentType = gst::Element; +} + +impl ObjectImpl for MoqSinkSpike { + fn properties() -> &'static [glib::ParamSpec] { + static PROPS: LazyLock> = LazyLock::new(|| { + vec![ + glib::ParamSpecString::builder("url") + .nick("Source URL") + .blurb("Connect to the given URL") + .build(), + glib::ParamSpecString::builder("broadcast") + .nick("Broadcast") + .blurb("The name of the broadcast to publish") + .build(), + glib::ParamSpecBoolean::builder("tls-disable-verify") + .nick("TLS disable verify") + .blurb("Disable TLS verification") + .default_value(false) + .build(), + // Read-only, served from the shared Status the task writes. + glib::ParamSpecBoolean::builder("connected") + .nick("Connected") + .blurb("Whether the session is currently connected") + .read_only() + .build(), + glib::ParamSpecString::builder("moq-version") + .nick("Negotiated version") + .blurb("The negotiated MoQ protocol version, null when disconnected") + .read_only() + .build(), + glib::ParamSpecUInt64::builder("estimated-send-bitrate") + .nick("Estimated send bitrate") + .blurb("Estimated send bitrate in bits per second (congestion controller), 0 when unavailable") + .read_only() + .build(), + ] + }); + PROPS.as_ref() + } + + fn set_property(&self, _id: usize, value: &glib::Value, pspec: &glib::ParamSpec) { + let mut settings = self.settings.lock().unwrap(); + match pspec.name() { + "url" => settings.url = value.get().unwrap(), + "broadcast" => settings.broadcast = value.get().unwrap(), + "tls-disable-verify" => settings.tls_disable_verify = value.get().unwrap(), + _ => unreachable!(), + } + } + + fn property(&self, _id: usize, pspec: &glib::ParamSpec) -> glib::Value { + match pspec.name() { + "connected" => self.status.connected().to_value(), + "moq-version" => self.status.version().to_value(), + "estimated-send-bitrate" => self.status.send_bitrate().to_value(), + name => { + let settings = self.settings.lock().unwrap(); + match name { + "url" => settings.url.to_value(), + "broadcast" => settings.broadcast.to_value(), + "tls-disable-verify" => settings.tls_disable_verify.to_value(), + _ => unreachable!(), + } + } + } + } + + fn constructed(&self) { + self.parent_constructed(); + self.obj().set_element_flags(gst::ElementFlags::SINK); + } +} + +impl GstObjectImpl for MoqSinkSpike {} + +impl ElementImpl for MoqSinkSpike { + fn metadata() -> Option<&'static gst::subclass::ElementMetadata> { + static METADATA: LazyLock = LazyLock::new(|| { + gst::subclass::ElementMetadata::new( + "MoQ Sink (spike)", + "Sink/Network/MoQ", + "Transmits media over MoQ (spike)", + "Ariel Molina ", + ) + }); + Some(&*METADATA) + } + + fn pad_templates() -> &'static [gst::PadTemplate] { + static PAD_TEMPLATES: LazyLock> = LazyLock::new(|| { + // Spike scope: a single H.264 pad; the point is the lifecycle, not codec breadth. + let caps = gst::Caps::builder("video/x-h264") + .field("stream-format", "byte-stream") + .field("alignment", "au") + .build(); + let templ = + gst::PadTemplate::new("sink_%u", gst::PadDirection::Sink, gst::PadPresence::Request, &caps).unwrap(); + vec![templ] + }); + PAD_TEMPLATES.as_ref() + } + + fn request_new_pad( + &self, + templ: &gst::PadTemplate, + name: Option<&str>, + _caps: Option<&gst::Caps>, + ) -> Option { + let pad_builder = gst::Pad::builder_from_template(templ) + .chain_function(|pad, parent, buffer| { + MoqSinkSpike::catch_panic_pad_function( + parent, + || Err(gst::FlowError::Error), + |sink| sink.forward_buffer(pad, buffer), + ) + }) + .event_function(|pad, parent, event| { + MoqSinkSpike::catch_panic_pad_function(parent, || false, |sink| sink.handle_event(pad, event)) + }); + + let pad = match name { + Some(name) => pad_builder.name(name).build(), + None => pad_builder.generated_name().build(), + }; + + self.obj().add_pad(&pad).ok()?; + Some(pad) + } + + fn release_pad(&self, pad: &gst::Pad) { + // Drop the session guard before blocking_send: holding it across a full-channel block deadlocks stop_session. + let sender = { + let session = self.session.lock().unwrap(); + session.as_ref().map(SessionHandle::sender) + }; + if let Some(sender) = sender { + let _ = sender.blocking_send(DataMsg::DropPad { + pad: pad.name().to_string(), + }); + } + let _ = self.obj().remove_pad(pad); + } + + fn change_state(&self, transition: gst::StateChange) -> Result { + match transition { + gst::StateChange::ReadyToPaused => { + self.start_session().map_err(|err| { + gst::error!(CAT, obj = self.obj(), "failed to start session: {err:#}"); + gst::StateChangeError + })?; + } + gst::StateChange::PausedToReady => self.stop_session(), + _ => (), + } + + self.parent_change_state(transition) + } +} + +impl MoqSinkSpike { + fn start_session(&self) -> Result<()> { + // Synchronous settings validation surfaces as a StateChangeError; async failures go to the bus. + let settings = ResolvedSettings::try_from(self.settings.lock().unwrap().clone())?; + let handle = SessionHandle::start(settings, self.status.clone(), self.obj().downgrade()); + *self.session.lock().unwrap() = Some(handle); + Ok(()) + } + + fn stop_session(&self) { + if let Some(handle) = self.session.lock().unwrap().take() { + handle.stop(); + } + } + + /// Clone the sender, release the lock, then blocking-send: never apply backpressure under the session lock. + fn forward_buffer(&self, pad: &gst::Pad, buffer: gst::Buffer) -> Result { + let sender = self + .session + .lock() + .unwrap() + .as_ref() + .map(|handle| handle.sender()) + .ok_or(gst::FlowError::Flushing)?; + + let map = buffer.map_readable().map_err(|_| gst::FlowError::Error)?; + let data = Bytes::copy_from_slice(map.as_slice()); + let pts = buffer.pts(); + + sender + .blocking_send(DataMsg::Buffer { + pad: pad.name().to_string(), + data, + pts, + }) + .map_err(|_| gst::FlowError::Flushing)?; + Ok(gst::FlowSuccess::Ok) + } + + fn handle_event(&self, pad: &gst::Pad, event: gst::Event) -> bool { + let sender = self.session.lock().unwrap().as_ref().map(|handle| handle.sender()); + + match event.view() { + gst::EventView::Caps(caps) => { + let Some(sender) = sender else { return false }; + let msg = DataMsg::Caps { + pad: pad.name().to_string(), + caps: caps.caps().to_owned(), + }; + if sender.blocking_send(msg).is_err() { + return false; + } + gst::Pad::event_default(pad, Some(&*self.obj()), event) + } + gst::EventView::Segment(segment) => { + let Some(sender) = sender else { return false }; + let msg = DataMsg::Segment { + pad: pad.name().to_string(), + segment: segment.segment().to_owned(), + }; + if sender.blocking_send(msg).is_err() { + return false; + } + gst::Pad::event_default(pad, Some(&*self.obj()), event) + } + gst::EventView::Eos(_) => { + let Some(sender) = sender else { return false }; + sender.blocking_send(DataMsg::Eos { pad: pad.name().to_string() }).is_ok() + } + _ => gst::Pad::event_default(pad, Some(&*self.obj()), event), + } + } +} diff --git a/rs/moq-gst/src/spike/mod.rs b/rs/moq-gst/src/spike/mod.rs new file mode 100644 index 000000000..be17192d0 --- /dev/null +++ b/rs/moq-gst/src/spike/mod.rs @@ -0,0 +1,14 @@ +use gst::glib; +use gst::prelude::*; + +mod imp; +mod session; +mod timeline; + +glib::wrapper! { + pub struct MoqSinkSpike(ObjectSubclass) @extends gst::Element, gst::Object; +} + +pub fn register(plugin: &gst::Plugin) -> Result<(), glib::BoolError> { + gst::Element::register(Some(plugin), "moqsinkspike", gst::Rank::NONE, MoqSinkSpike::static_type()) +} diff --git a/rs/moq-gst/src/spike/session.rs b/rs/moq-gst/src/spike/session.rs new file mode 100644 index 000000000..2c5e97552 --- /dev/null +++ b/rs/moq-gst/src/spike/session.rs @@ -0,0 +1,558 @@ +//! Async core: the session/pads/timeline seams, isolated from the GObject shell. + +use std::collections::{HashMap, HashSet}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::{Arc, LazyLock, Mutex}; + +use anyhow::{ensure, Context, Result}; +use bytes::Bytes; +use gst::glib; +use gst::prelude::*; +use tokio::sync::{mpsc, watch}; + +use hang::moq_net; + +use super::timeline::{classify_segment, frame_micros, FrameDecision, SegmentDecision, SegmentInfo}; +use super::MoqSinkSpike as Element; + +static RUNTIME: LazyLock = LazyLock::new(|| { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("spawn tokio runtime") +}); + +pub static CAT: LazyLock = LazyLock::new(|| { + gst::DebugCategory::new( + "moq-sink-spike", + gst::DebugColorFlags::empty(), + Some("MoQ Sink spike"), + ) +}); + +/// Handoff, not a buffer: a full channel must block the streaming thread, not grow. +const DATA_CHANNEL_BOUND: usize = 8; + +/// Read by the element's getters without touching the task; reset on every exit. +#[derive(Default)] +pub struct Status { + connected: AtomicBool, + version: Mutex>, + send_bitrate: AtomicU64, +} + +impl Status { + fn set_connected(&self, value: bool) { + self.connected.store(value, Ordering::Relaxed); + } + + pub fn connected(&self) -> bool { + self.connected.load(Ordering::Relaxed) + } + + fn set_version(&self, value: Option) { + *self.version.lock().unwrap() = value; + } + + pub fn version(&self) -> Option { + self.version.lock().unwrap().clone() + } + + fn set_send_bitrate(&self, bits_per_sec: u64) { + self.send_bitrate.store(bits_per_sec, Ordering::Relaxed); + } + + pub fn send_bitrate(&self) -> u64 { + self.send_bitrate.load(Ordering::Relaxed) + } +} + +/// Ordered; shutdown is deliberately elsewhere (cancellation channel) so it can cut a blocked send. +pub enum DataMsg { + Caps { pad: String, caps: gst::Caps }, + Segment { pad: String, segment: gst::Segment }, + Buffer { pad: String, data: Bytes, pts: Option }, + Eos { pad: String }, + DropPad { pad: String }, +} + +pub struct ResolvedSettings { + pub url: url::Url, + pub broadcast: String, + pub tls_disable_verify: bool, +} + +pub struct SessionHandle { + data: mpsc::Sender, + shutdown: watch::Sender, + join: tokio::task::JoinHandle<()>, +} + +impl SessionHandle { + pub fn start(settings: ResolvedSettings, status: Arc, element: glib::WeakRef) -> Self { + let (data_tx, data_rx) = mpsc::channel(DATA_CHANNEL_BOUND); + let (shutdown_tx, shutdown_rx) = watch::channel(false); + + let join = RUNTIME.spawn(async move { + // Only a remote close reaches the bus as an error; a local shutdown returns Ok and stays quiet. + if let Err(err) = run_session(settings, status, data_rx, shutdown_rx, element.clone()).await { + if let Some(obj) = element.upgrade() { + gst::element_error!(obj, gst::CoreError::Failed, ("session error"), ["{err:?}"]); + } + } + }); + + Self { + data: data_tx, + shutdown: shutdown_tx, + join, + } + } + + /// Cloned out so the element blocking-sends without holding the session lock (else a full channel deadlocks stop). + pub fn sender(&self) -> mpsc::Sender { + self.data.clone() + } + + pub fn stop(self) { + // Cancel first so a send blocked on a full channel wakes via the dropped receiver; reap off-thread. + let _ = self.shutdown.send(true); + RUNTIME.spawn(async move { + if let Err(err) = self.join.await { + gst::warning!(CAT, "session task ended with error: {err:?}"); + } + }); + } +} + +async fn run_session( + settings: ResolvedSettings, + status: Arc, + mut data: mpsc::Receiver, + mut shutdown: watch::Receiver, + element: glib::WeakRef, +) -> Result<()> { + let mut config = moq_native::ClientConfig::default(); + config.tls.disable_verify = Some(settings.tls_disable_verify); + let client = config.init()?; + + let origin = moq_net::Origin::random().produce(); + let mut broadcast = moq_net::Broadcast::new().produce(); + let broadcast_consumer = broadcast.consume(); + let catalog = moq_mux::catalog::Producer::new(&mut broadcast)?; + ensure!( + origin.publish_broadcast(&settings.broadcast, broadcast_consumer), + "failed to publish broadcast {}", + settings.broadcast + ); + let client = client.with_publish(origin.consume()); + + // Cancellation covers connect: a shutdown while connecting is a clean local close, not an error. + let session = tokio::select! { + result = client.connect(settings.url.clone()) => result?, + _ = shutdown.changed() => return Ok(()), + }; + status.set_connected(true); + status.set_version(Some(session.version().to_string())); + notify_connected(&element); + gst::info!(CAT, "session connected to {}", settings.url); + + let mut pad_set = PadSet::new(broadcast, catalog); + let result = run_loop(session, &mut data, &mut shutdown, &mut pad_set, &element, &status).await; + + // Finalize every live producer once on the way out, catalog last; runs on every exit path. + let finalized = pad_set.finalize_all(); + gst::debug!(CAT, "finalized on exit: {finalized:?}"); + // Reset the whole observable surface on exit, not just connected. + status.set_connected(false); + status.set_version(None); + status.set_send_bitrate(0); + notify_connected(&element); + result +} + +/// Only on the connect/disconnect edges, never per sample. +fn notify_connected(element: &glib::WeakRef) { + if let Some(obj) = element.upgrade() { + obj.notify("connected"); + } +} + +async fn run_loop( + session: moq_net::Session, + data: &mut mpsc::Receiver, + shutdown: &mut watch::Receiver, + pad_set: &mut PadSet, + element: &glib::WeakRef, + status: &Status, +) -> Result<()> { + // Congestion-controller send estimate; None when unavailable, then this arm parks forever. + let mut send_bandwidth = session.send_bandwidth(); + + // Resolves to Err when the transport dies; pinned so the select polls it each iteration. + let closed = session.closed(); + tokio::pin!(closed); + + loop { + tokio::select! { + // Local close: quiet Ok, no ERROR. + _ = shutdown.changed() => return Ok(()), + // Remote death: propagate the Err so the wrapper posts ERROR to the bus. + result = &mut closed => { + result?; + return Ok(()); + } + // A closed estimate stops the polling. + bitrate = async { + match send_bandwidth.as_mut() { + Some(bw) => bw.changed().await, + None => std::future::pending::>().await, + } + } => match bitrate { + Some(rate) => status.set_send_bitrate(rate), + None => send_bandwidth = None, + }, + msg = data.recv() => match msg { + Some(DataMsg::Caps { pad, caps }) => pad_set.caps(&pad, &caps)?, + Some(DataMsg::Segment { pad, segment }) => pad_set.segment(&pad, segment), + Some(DataMsg::Buffer { pad, data, pts }) => pad_set.buffer(&pad, data, pts)?, + Some(DataMsg::Eos { pad }) => { + if pad_set.eos(&pad)? { + gst::info!(CAT, "all pads ended, posting EOS"); + if let Some(obj) = element.upgrade() { + let _ = obj.post_message(gst::message::Eos::builder().src(&obj).build()); + } + return Ok(()); + } + } + Some(DataMsg::DropPad { pad }) => pad_set.drop_pad(&pad), + // The element dropped the sender (state change to READY) without a shutdown signal. + None => return Ok(()), + }, + } + } +} + +/// Running time is shared, not per-pad, so there is no anchor to drift. +struct PadSet { + broadcast: moq_net::BroadcastProducer, + catalog: Option, + pads: HashMap, + eos: HashSet, +} + +impl PadSet { + fn new(broadcast: moq_net::BroadcastProducer, catalog: moq_mux::catalog::Producer) -> Self { + Self { + broadcast, + catalog: Some(catalog), + pads: HashMap::new(), + eos: HashSet::new(), + } + } + + fn caps(&mut self, pad: &str, caps: &gst::Caps) -> Result<()> { + let broadcast = self.broadcast.clone(); + let catalog = self.catalog.clone().context("catalog already finalized")?; + self.pads + .entry(pad.to_string()) + .or_insert_with(Pad::new) + .set_caps(broadcast, catalog, caps) + } + + fn segment(&mut self, pad: &str, segment: gst::Segment) { + // SEGMENT may arrive before CAPS (independent sticky events); this only records timing. + self.pads.entry(pad.to_string()).or_insert_with(Pad::new).set_segment(segment); + } + + fn buffer(&mut self, pad: &str, data: Bytes, pts: Option) -> Result<()> { + match self.pads.get_mut(pad) { + Some(pad) => pad.push_buffer(data, pts), + None => { + gst::warning!(CAT, "buffer for unknown pad {pad}"); + Ok(()) + } + } + } + + /// Returns whether every pad has now ended, so the caller posts the element EOS once. + fn eos(&mut self, pad: &str) -> Result { + if let Some(p) = self.pads.get_mut(pad) { + p.finalize()?; + } + self.eos.insert(pad.to_string()); + Ok(!self.pads.is_empty() && self.eos.len() == self.pads.len()) + } + + fn drop_pad(&mut self, pad: &str) { + if let Some(mut p) = self.pads.remove(pad) { + if let Err(err) = p.finalize() { + gst::warning!(CAT, "finalize on drop {pad}: {err:?}"); + } + } + self.eos.remove(pad); + } + + /// Idempotent (skips already-finalized pads); the returned order proves "catalog last". + fn finalize_all(&mut self) -> Vec { + let mut order = Vec::new(); + for (name, pad) in self.pads.iter_mut() { + match pad.finalize() { + Ok(true) => order.push(name.clone()), + Ok(false) => {} + Err(err) => gst::warning!(CAT, "finalize {name}: {err:?}"), + } + } + // finish() closes both the hang and MSF tracks; a bare drop would not. + if let Some(mut catalog) = self.catalog.take() { + match catalog.finish() { + Ok(()) => order.push("catalog".to_string()), + Err(err) => gst::warning!(CAT, "finalize catalog: {err:?}"), + } + } + order + } +} + +struct Pad { + framed: Option, + caps: Option, + segment_info: Option, + // Kept only to map a buffer PTS to a running time. + segment: Option>, +} + +impl Pad { + fn new() -> Self { + Self { + framed: None, + caps: None, + segment_info: None, + segment: None, + } + } + + fn set_caps( + &mut self, + broadcast: moq_net::BroadcastProducer, + catalog: moq_mux::catalog::Producer, + caps: &gst::Caps, + ) -> Result<()> { + let structure = caps.structure(0).context("empty caps")?; + ensure!( + structure.name() == "video/x-h264", + "spike only carries H.264 byte-stream, got {}", + structure.name() + ); + // Identical caps re-sent (sticky event): keep the live producer, don't finalize and recreate. + if self.framed.is_some() && self.caps.as_ref() == Some(caps) { + return Ok(()); + } + // Renegotiation: finalize the previous producer before replacing it (closed once, not abandoned). + self.finalize()?; + let mut empty = Bytes::new(); + self.framed = Some(moq_mux::import::Framed::new( + broadcast, + catalog, + moq_mux::import::FramedFormat::Avc3, + &mut empty, + )?); + self.caps = Some(caps.clone()); + Ok(()) + } + + fn set_segment(&mut self, segment: gst::Segment) { + let info = segment_info(&segment); + match classify_segment(self.segment_info.as_ref(), &info) { + SegmentDecision::Accept => { + self.segment_info = Some(info); + self.segment = segment.downcast::().ok(); + } + SegmentDecision::Reject(reason) => gst::warning!(CAT, "ignoring segment: {reason}"), + } + } + + /// Pure of the importer, so it can be tested with real segments and no codec. + fn frame_timestamp(&self, pts: Option) -> FrameDecision { + let running_time = self + .segment + .as_ref() + .zip(pts) + .and_then(|(segment, pts)| segment.to_running_time(pts)) + .map(|time| time.nseconds()); + frame_micros(self.segment_info.is_some(), running_time) + } + + fn push_buffer(&mut self, mut data: Bytes, pts: Option) -> Result<()> { + let decision = self.frame_timestamp(pts); + let Some(framed) = self.framed.as_mut() else { + gst::warning!(CAT, "dropping buffer received before caps"); + return Ok(()); + }; + + match decision { + FrameDecision::Emit(micros) => { + let ts = hang::container::Timestamp::from_micros(micros).ok(); + framed.decode_frame(&mut data, ts).map_err(|err| anyhow::anyhow!(err)) + } + FrameDecision::Drop(reason) => { + gst::warning!(CAT, "dropping frame: {reason}"); + Ok(()) + } + } + } + + /// Consumes the producer so a second call is a no-op (`Framed::finish()` is not idempotent). + fn finalize(&mut self) -> Result { + match self.framed.take() { + Some(mut framed) => { + framed.finish()?; + Ok(true) + } + None => Ok(false), + } + } +} + +fn segment_info(segment: &gst::Segment) -> SegmentInfo { + match segment.downcast_ref::() { + Some(time) => SegmentInfo { + time_format: true, + rate: time.rate(), + start_nanos: time.start().map(|c| c.nseconds()).unwrap_or(0), + base_nanos: time.base().map(|c| c.nseconds()).unwrap_or(0), + }, + None => SegmentInfo { + time_format: false, + rate: segment.rate(), + start_nanos: 0, + base_nanos: 0, + }, + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::*; + + fn pad_set() -> PadSet { + let mut broadcast = moq_net::Broadcast::new().produce(); + let catalog = moq_mux::catalog::Producer::new(&mut broadcast).unwrap(); + PadSet::new(broadcast, catalog) + } + + fn h264_caps() -> gst::Caps { + gst::Caps::builder("video/x-h264") + .field("stream-format", "byte-stream") + .field("alignment", "au") + .build() + } + + // EOS/new-caps/drop/shutdown converge on exactly one finalize per producer; catalog last. + #[test] + fn finalize_all_finishes_pads_then_catalog_once() { + gst::init().unwrap(); + let mut set = pad_set(); + set.caps("video", &h264_caps()).unwrap(); + set.caps("audio", &h264_caps()).unwrap(); + + let order = set.finalize_all(); + assert_eq!(order.last().map(String::as_str), Some("catalog"), "catalog must finalize last"); + assert!(order.contains(&"video".to_string()) && order.contains(&"audio".to_string())); + + // A second pass finalizes nothing again. + assert!(set.finalize_all().is_empty()); + } + + #[test] + fn eos_then_shutdown_does_not_double_finalize() { + gst::init().unwrap(); + let mut set = pad_set(); + set.caps("video", &h264_caps()).unwrap(); + + assert!(set.eos("video").unwrap()); + // Only the catalog is left; the pad is not finalized twice. + assert_eq!(set.finalize_all(), vec!["catalog".to_string()]); + } + + #[test] + fn identical_caps_keep_one_live_producer() { + gst::init().unwrap(); + let mut set = pad_set(); + set.caps("video", &h264_caps()).unwrap(); + // Re-sent identical caps are a no-op; the pad still holds exactly one live producer. + set.caps("video", &h264_caps()).unwrap(); + assert_eq!(set.pads.len(), 1); + assert!(set.pads["video"].framed.is_some()); + } + + #[test] + fn buffer_for_unknown_pad_is_dropped_without_error() { + let mut set = pad_set(); + assert!(set.buffer("ghost", Bytes::from_static(b"x"), Some(gst::ClockTime::ZERO)).is_ok()); + } + + // SEGMENT before CAPS: the pad is created and the segment retained when CAPS arrives. + #[test] + fn segment_before_caps_is_retained() { + gst::init().unwrap(); + let mut set = pad_set(); + set.segment("video", time_segment()); + set.caps("video", &h264_caps()).unwrap(); + + let pad = &set.pads["video"]; + assert!(pad.segment_info.is_some(), "segment kept across a later caps"); + assert!(pad.framed.is_some()); + } + + // Firing the watch drops the receiver, waking a sender parked on the full channel with Err. + #[test] + fn shutdown_via_watch_releases_a_blocked_send() { + let runtime = tokio::runtime::Runtime::new().unwrap(); + let (data_tx, data_rx) = mpsc::channel::(DATA_CHANNEL_BOUND); + let (shutdown_tx, mut shutdown_rx) = watch::channel(false); + + for _ in 0..DATA_CHANNEL_BOUND { + data_tx.blocking_send(0).unwrap(); // fill to capacity + } + + // Hold the receiver without draining (as if busy in a branch body), then return on the watch. + let loop_task = runtime.spawn(async move { + let _held = data_rx; + shutdown_rx.changed().await.ok(); + }); + + let sender = data_tx.clone(); + let blocked = std::thread::spawn(move || sender.blocking_send(1)); + std::thread::sleep(Duration::from_millis(50)); // let the send park + + shutdown_tx.send(true).unwrap(); + runtime.block_on(loop_task).unwrap(); + + assert!( + blocked.join().unwrap().is_err(), + "a send blocked on the full channel must wake with Err when shutdown drops the receiver" + ); + } + + // Two pads, real PTS via to_running_time: the A/V offset survives because running time is shared. + #[test] + fn two_pads_keep_av_aligned_through_real_segments() { + gst::init().unwrap(); + let mut video = Pad::new(); + let mut audio = Pad::new(); + video.set_segment(time_segment()); + audio.set_segment(time_segment()); + + assert_eq!(video.frame_timestamp(Some(gst::ClockTime::from_mseconds(7))), FrameDecision::Emit(7_000)); + assert_eq!(audio.frame_timestamp(Some(gst::ClockTime::from_mseconds(5))), FrameDecision::Emit(5_000)); + } + + fn time_segment() -> gst::Segment { + let mut segment = gst::FormattedSegment::::new(); + segment.set_start(gst::ClockTime::ZERO); + segment.upcast() + } +} diff --git a/rs/moq-gst/src/spike/timeline.rs b/rs/moq-gst/src/spike/timeline.rs new file mode 100644 index 000000000..f883f1242 --- /dev/null +++ b/rs/moq-gst/src/spike/timeline.rs @@ -0,0 +1,132 @@ +//! Pure SEGMENT/running-time policy, split out so it unit-tests with plain numbers, no pipeline. + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct SegmentInfo { + /// Only TIME segments map to a media timeline (not BYTES/DEFAULT). + pub time_format: bool, + pub rate: f64, + pub start_nanos: u64, + pub base_nanos: u64, +} + +#[derive(Debug, PartialEq, Eq)] +pub enum SegmentDecision { + Accept, + Reject(&'static str), +} + +#[derive(Debug, PartialEq, Eq)] +pub enum FrameDecision { + /// Emit at this MoQ timestamp (micros). + Emit(u64), + Drop(&'static str), +} + +/// First TIME segment fixes the timeline; a discontinuity is rejected so the pad stops rather than +/// splicing two timelines. +pub fn classify_segment(prev: Option<&SegmentInfo>, next: &SegmentInfo) -> SegmentDecision { + if !next.time_format { + return SegmentDecision::Reject("segment is not in TIME format"); + } + if next.rate != 1.0 { + return SegmentDecision::Reject("segment rate is not 1.0"); + } + match prev { + None => SegmentDecision::Accept, + Some(prev) if next.start_nanos == prev.start_nanos && next.base_nanos >= prev.base_nanos => { + SegmentDecision::Accept + } + Some(_) => SegmentDecision::Reject("discontinuous segment (flush/seek)"), + } +} + +/// Stateless and shared across pads on purpose: re-anchoring per pad is what breaks A/V alignment. +/// `None` (outside the segment) drops, never clamps. +pub fn frame_micros(has_segment: bool, running_time_nanos: Option) -> FrameDecision { + if !has_segment { + return FrameDecision::Drop("buffer arrived before a SEGMENT"); + } + match running_time_nanos { + Some(running_time) => FrameDecision::Emit(running_time / 1000), + None => FrameDecision::Drop("buffer outside the segment"), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn time(rate: f64, start: u64, base: u64) -> SegmentInfo { + SegmentInfo { + time_format: true, + rate, + start_nanos: start, + base_nanos: base, + } + } + + #[test] + fn first_time_segment_is_accepted() { + assert_eq!(classify_segment(None, &time(1.0, 0, 0)), SegmentDecision::Accept); + } + + #[test] + fn non_time_segment_is_rejected() { + let bytes = SegmentInfo { + time_format: false, + rate: 1.0, + start_nanos: 0, + base_nanos: 0, + }; + assert!(matches!(classify_segment(None, &bytes), SegmentDecision::Reject(_))); + } + + #[test] + fn non_unit_rate_is_rejected() { + assert!(matches!(classify_segment(None, &time(2.0, 0, 0)), SegmentDecision::Reject(_))); + assert!(matches!(classify_segment(None, &time(-1.0, 0, 0)), SegmentDecision::Reject(_))); + } + + #[test] + fn continuous_second_segment_is_accepted() { + let first = time(1.0, 100, 0); + assert_eq!(classify_segment(Some(&first), &time(1.0, 100, 500)), SegmentDecision::Accept); + } + + #[test] + fn discontinuous_second_segment_is_rejected() { + let first = time(1.0, 100, 500); + // start moved (a seek) -> reject. + assert!(matches!( + classify_segment(Some(&first), &time(1.0, 200, 600)), + SegmentDecision::Reject(_) + )); + // base went backwards (a flush rewind) -> reject. + assert!(matches!( + classify_segment(Some(&first), &time(1.0, 100, 400)), + SegmentDecision::Reject(_) + )); + } + + #[test] + fn buffer_before_segment_is_dropped() { + assert!(matches!(frame_micros(false, Some(0)), FrameDecision::Drop(_))); + } + + #[test] + fn out_of_segment_frame_is_dropped_not_clamped() { + assert!(matches!(frame_micros(true, None), FrameDecision::Drop(_))); + } + + // Stateless conversion: two pads sharing one running-time clock keep their relative offset. + #[test] + fn shared_timeline_keeps_av_aligned() { + // Simultaneous frames on two pads (same running time) get the same timestamp. + assert_eq!(frame_micros(true, Some(20_000_000)), FrameDecision::Emit(20_000)); + assert_eq!(frame_micros(true, Some(20_000_000)), FrameDecision::Emit(20_000)); + + // A real 2ms offset between a video and an audio frame survives, regardless of call order. + assert_eq!(frame_micros(true, Some(7_000_000)), FrameDecision::Emit(7_000)); + assert_eq!(frame_micros(true, Some(5_000_000)), FrameDecision::Emit(5_000)); + } +} From 466e7cdb58028de0224bd52c8cff3adfcc7dd471 Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Mon, 15 Jun 2026 14:10:44 -0600 Subject: [PATCH 02/13] moq-gst: run moq-gst tests via a just recipe wired into ci moq-gst is outside default-members (needs GStreamer), so 'just rs test' skips it. Add 'just rs test-moq-gst' and chain it into 'just rs ci' so the spike's tests gate. The CI workflow only needs gstreamer, which the nix dev shell already provides; that step is applied at PR time. Co-Authored-By: Claude Opus 4.8 --- rs/justfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rs/justfile b/rs/justfile index 127f25040..298133375 100644 --- a/rs/justfile +++ b/rs/justfile @@ -40,6 +40,7 @@ ci FILES="": cargo check --workspace --all-features cargo deny check --show-stats just rs test --all-features + just rs test-moq-gst --all-features just build # Auto-fix clippy/format/shear/sort. @@ -52,6 +53,10 @@ fix: test *args: cargo test --all-targets {{ args }} +# moq-gst is outside default-members (needs GStreamer), so `test` above skips it. +test-moq-gst *args: + cargo test -p moq-gst --all-targets {{ args }} + build: cargo build From 4967584dff0e77fb376a50f47625a531af35e836 Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Mon, 15 Jun 2026 14:20:56 -0600 Subject: [PATCH 03/13] moq-gst: timeline correctness (running-time continuity, Invalid pad state) Judge SEGMENT continuity in running time, not media origin: classify_segment keys on the running-time base, so a moved start stays continuous while base does not rewind (the old start-equality rule wrongly rejected this). Map timestamps with to_running_time_full (signed) so a buffer before the segment is dropped, never clamped to zero. Add an explicit per-pad state (NoSegment/Active/Invalid): a discontinuity invalidates the pad and the next valid SEGMENT re-anchors it. Also applies rustfmt to the spike baseline, which was committed before a fmt pass. Co-Authored-By: Claude Opus 4.8 --- rs/moq-gst/src/spike/imp.rs | 12 +- rs/moq-gst/src/spike/mod.rs | 7 +- rs/moq-gst/src/spike/session.rs | 196 ++++++++++++++++++++++++++----- rs/moq-gst/src/spike/timeline.rs | 94 ++++++++------- 4 files changed, 233 insertions(+), 76 deletions(-) diff --git a/rs/moq-gst/src/spike/imp.rs b/rs/moq-gst/src/spike/imp.rs index da89cce2c..b805b4e97 100644 --- a/rs/moq-gst/src/spike/imp.rs +++ b/rs/moq-gst/src/spike/imp.rs @@ -23,7 +23,11 @@ impl TryFrom for ResolvedSettings { fn try_from(value: Settings) -> Result { Ok(Self { url: url::Url::parse(value.url.as_ref().context("url property is required")?)?, - broadcast: value.broadcast.as_ref().context("broadcast property is required")?.clone(), + broadcast: value + .broadcast + .as_ref() + .context("broadcast property is required")? + .clone(), tls_disable_verify: value.tls_disable_verify, }) } @@ -267,7 +271,11 @@ impl MoqSinkSpike { } gst::EventView::Eos(_) => { let Some(sender) = sender else { return false }; - sender.blocking_send(DataMsg::Eos { pad: pad.name().to_string() }).is_ok() + sender + .blocking_send(DataMsg::Eos { + pad: pad.name().to_string(), + }) + .is_ok() } _ => gst::Pad::event_default(pad, Some(&*self.obj()), event), } diff --git a/rs/moq-gst/src/spike/mod.rs b/rs/moq-gst/src/spike/mod.rs index be17192d0..ba4929020 100644 --- a/rs/moq-gst/src/spike/mod.rs +++ b/rs/moq-gst/src/spike/mod.rs @@ -10,5 +10,10 @@ glib::wrapper! { } pub fn register(plugin: &gst::Plugin) -> Result<(), glib::BoolError> { - gst::Element::register(Some(plugin), "moqsinkspike", gst::Rank::NONE, MoqSinkSpike::static_type()) + gst::Element::register( + Some(plugin), + "moqsinkspike", + gst::Rank::NONE, + MoqSinkSpike::static_type(), + ) } diff --git a/rs/moq-gst/src/spike/session.rs b/rs/moq-gst/src/spike/session.rs index 2c5e97552..aa8e95951 100644 --- a/rs/moq-gst/src/spike/session.rs +++ b/rs/moq-gst/src/spike/session.rs @@ -22,13 +22,8 @@ static RUNTIME: LazyLock = LazyLock::new(|| { .expect("spawn tokio runtime") }); -pub static CAT: LazyLock = LazyLock::new(|| { - gst::DebugCategory::new( - "moq-sink-spike", - gst::DebugColorFlags::empty(), - Some("MoQ Sink spike"), - ) -}); +pub static CAT: LazyLock = + LazyLock::new(|| gst::DebugCategory::new("moq-sink-spike", gst::DebugColorFlags::empty(), Some("MoQ Sink spike"))); /// Handoff, not a buffer: a full channel must block the streaming thread, not grow. const DATA_CHANNEL_BOUND: usize = 8; @@ -69,11 +64,25 @@ impl Status { /// Ordered; shutdown is deliberately elsewhere (cancellation channel) so it can cut a blocked send. pub enum DataMsg { - Caps { pad: String, caps: gst::Caps }, - Segment { pad: String, segment: gst::Segment }, - Buffer { pad: String, data: Bytes, pts: Option }, - Eos { pad: String }, - DropPad { pad: String }, + Caps { + pad: String, + caps: gst::Caps, + }, + Segment { + pad: String, + segment: gst::Segment, + }, + Buffer { + pad: String, + data: Bytes, + pts: Option, + }, + Eos { + pad: String, + }, + DropPad { + pad: String, + }, } pub struct ResolvedSettings { @@ -262,7 +271,10 @@ impl PadSet { fn segment(&mut self, pad: &str, segment: gst::Segment) { // SEGMENT may arrive before CAPS (independent sticky events); this only records timing. - self.pads.entry(pad.to_string()).or_insert_with(Pad::new).set_segment(segment); + self.pads + .entry(pad.to_string()) + .or_insert_with(Pad::new) + .set_segment(segment); } fn buffer(&mut self, pad: &str, data: Bytes, pts: Option) -> Result<()> { @@ -314,9 +326,22 @@ impl PadSet { } } +/// Per-pad timeline state. Buffers only map and emit while `Active`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum PadState { + /// No valid SEGMENT seen yet. + NoSegment, + /// A valid timeline is anchored. + Active, + /// A live timeline broke (discontinuity, non-TIME, or rate != 1.0); buffers drop until a valid + /// SEGMENT re-anchors the pad. + Invalid, +} + struct Pad { framed: Option, caps: Option, + state: PadState, segment_info: Option, // Kept only to map a buffer PTS to a running time. segment: Option>, @@ -327,6 +352,7 @@ impl Pad { Self { framed: None, caps: None, + state: PadState::NoSegment, segment_info: None, segment: None, } @@ -363,24 +389,46 @@ impl Pad { fn set_segment(&mut self, segment: gst::Segment) { let info = segment_info(&segment); - match classify_segment(self.segment_info.as_ref(), &info) { + // Only an Active pad enforces continuity against its previous segment; NoSegment and Invalid + // re-anchor from scratch on the next valid one. + let prev = match self.state { + PadState::Active => self.segment_info.as_ref(), + PadState::NoSegment | PadState::Invalid => None, + }; + match classify_segment(prev, &info) { SegmentDecision::Accept => { self.segment_info = Some(info); self.segment = segment.downcast::().ok(); + self.state = PadState::Active; + } + SegmentDecision::Reject(reason) => { + gst::warning!(CAT, "rejecting segment: {reason}"); + // A break only invalidates a live timeline; a bad segment before any valid one leaves + // the pad in NoSegment. + if self.state == PadState::Active { + self.state = PadState::Invalid; + } } - SegmentDecision::Reject(reason) => gst::warning!(CAT, "ignoring segment: {reason}"), } } /// Pure of the importer, so it can be tested with real segments and no codec. fn frame_timestamp(&self, pts: Option) -> FrameDecision { - let running_time = self - .segment - .as_ref() - .zip(pts) - .and_then(|(segment, pts)| segment.to_running_time(pts)) - .map(|time| time.nseconds()); - frame_micros(self.segment_info.is_some(), running_time) + match self.state { + PadState::Active => { + // to_running_time_full is signed: a buffer before the segment returns Negative, which + // frame_micros drops; to_running_time would instead clip it to None and lose the reason. + let running_time = self + .segment + .as_ref() + .zip(pts) + .and_then(|(segment, pts)| segment.to_running_time_full(pts)) + .map(signed_nanos); + frame_micros(running_time) + } + PadState::NoSegment => FrameDecision::Drop("buffer before a valid SEGMENT"), + PadState::Invalid => FrameDecision::Drop("buffer on an invalidated timeline"), + } } fn push_buffer(&mut self, mut data: Bytes, pts: Option) -> Result<()> { @@ -419,18 +467,24 @@ fn segment_info(segment: &gst::Segment) -> SegmentInfo { Some(time) => SegmentInfo { time_format: true, rate: time.rate(), - start_nanos: time.start().map(|c| c.nseconds()).unwrap_or(0), base_nanos: time.base().map(|c| c.nseconds()).unwrap_or(0), }, None => SegmentInfo { time_format: false, rate: segment.rate(), - start_nanos: 0, base_nanos: 0, }, } } +/// Flattens a signed running time to nanos, keeping the sign so the timeline can drop negatives. +fn signed_nanos(running_time: gst::Signed) -> i64 { + match running_time { + gst::Signed::Positive(time) => time.nseconds() as i64, + gst::Signed::Negative(time) => -(time.nseconds() as i64), + } +} + #[cfg(test)] mod tests { use std::time::Duration; @@ -459,7 +513,11 @@ mod tests { set.caps("audio", &h264_caps()).unwrap(); let order = set.finalize_all(); - assert_eq!(order.last().map(String::as_str), Some("catalog"), "catalog must finalize last"); + assert_eq!( + order.last().map(String::as_str), + Some("catalog"), + "catalog must finalize last" + ); assert!(order.contains(&"video".to_string()) && order.contains(&"audio".to_string())); // A second pass finalizes nothing again. @@ -491,7 +549,9 @@ mod tests { #[test] fn buffer_for_unknown_pad_is_dropped_without_error() { let mut set = pad_set(); - assert!(set.buffer("ghost", Bytes::from_static(b"x"), Some(gst::ClockTime::ZERO)).is_ok()); + assert!(set + .buffer("ghost", Bytes::from_static(b"x"), Some(gst::ClockTime::ZERO)) + .is_ok()); } // SEGMENT before CAPS: the pad is created and the segment retained when CAPS arrives. @@ -537,7 +597,7 @@ mod tests { ); } - // Two pads, real PTS via to_running_time: the A/V offset survives because running time is shared. + // Two pads, real PTS via to_running_time_full: the A/V offset survives because running time is shared. #[test] fn two_pads_keep_av_aligned_through_real_segments() { gst::init().unwrap(); @@ -546,8 +606,79 @@ mod tests { video.set_segment(time_segment()); audio.set_segment(time_segment()); - assert_eq!(video.frame_timestamp(Some(gst::ClockTime::from_mseconds(7))), FrameDecision::Emit(7_000)); - assert_eq!(audio.frame_timestamp(Some(gst::ClockTime::from_mseconds(5))), FrameDecision::Emit(5_000)); + assert_eq!( + video.frame_timestamp(Some(gst::ClockTime::from_mseconds(7))), + FrameDecision::Emit(7_000) + ); + assert_eq!( + audio.frame_timestamp(Some(gst::ClockTime::from_mseconds(5))), + FrameDecision::Emit(5_000) + ); + } + + // A pad with no SEGMENT yet drops buffers (NoSegment), distinct from an invalidated timeline. + #[test] + fn pad_without_segment_drops_buffers() { + let pad = Pad::new(); + assert_eq!(pad.state, PadState::NoSegment); + assert!(matches!( + pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(5))), + FrameDecision::Drop(_) + )); + } + + // The fix: a moved media start stays continuous as long as the running-time base advances. The + // old start-equality rule would have rejected this and stalled the pad. + #[test] + fn moved_start_with_advancing_base_stays_continuous() { + gst::init().unwrap(); + let mut pad = Pad::new(); + pad.set_segment(time_segment_at(0, 0)); + assert_eq!(pad.state, PadState::Active); + pad.set_segment(time_segment_at(30_000, 5_000)); + assert_eq!(pad.state, PadState::Active); + } + + // A buffer before the segment start yields a negative running time: drop it, never clamp to zero. + #[test] + fn frame_before_segment_start_is_dropped_not_clamped() { + gst::init().unwrap(); + let mut pad = Pad::new(); + pad.set_segment(time_segment_at(10_000, 0)); + assert!(matches!( + pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(5_000))), + FrameDecision::Drop(_) + )); + // A PTS at or after the start maps to a non-negative running time and emits. + assert_eq!( + pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(12_000))), + FrameDecision::Emit(2_000_000) + ); + } + + // A discontinuity invalidates the pad (drops), and the next valid SEGMENT re-anchors it to Active. + #[test] + fn invalid_segment_drops_then_a_valid_one_recovers() { + gst::init().unwrap(); + let mut pad = Pad::new(); + pad.set_segment(time_segment_at(0, 5_000)); + assert_eq!(pad.state, PadState::Active); + + // base rewinds -> discontinuous -> Invalid; buffers drop. + pad.set_segment(time_segment_at(0, 0)); + assert_eq!(pad.state, PadState::Invalid); + assert!(matches!( + pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(6_000))), + FrameDecision::Drop(_) + )); + + // A valid SEGMENT re-anchors -> Active; buffers emit again. + pad.set_segment(time_segment_at(0, 10_000)); + assert_eq!(pad.state, PadState::Active); + assert_eq!( + pad.frame_timestamp(Some(gst::ClockTime::ZERO)), + FrameDecision::Emit(10_000_000) + ); } fn time_segment() -> gst::Segment { @@ -555,4 +686,11 @@ mod tests { segment.set_start(gst::ClockTime::ZERO); segment.upcast() } + + fn time_segment_at(start_ms: u64, base_ms: u64) -> gst::Segment { + let mut segment = gst::FormattedSegment::::new(); + segment.set_start(gst::ClockTime::from_mseconds(start_ms)); + segment.set_base(gst::ClockTime::from_mseconds(base_ms)); + segment.upcast() + } } diff --git a/rs/moq-gst/src/spike/timeline.rs b/rs/moq-gst/src/spike/timeline.rs index f883f1242..27340f05b 100644 --- a/rs/moq-gst/src/spike/timeline.rs +++ b/rs/moq-gst/src/spike/timeline.rs @@ -5,7 +5,7 @@ pub struct SegmentInfo { /// Only TIME segments map to a media timeline (not BYTES/DEFAULT). pub time_format: bool, pub rate: f64, - pub start_nanos: u64, + /// Running-time anchor of the segment; continuity is judged on this, not on `start`. pub base_nanos: u64, } @@ -22,8 +22,10 @@ pub enum FrameDecision { Drop(&'static str), } -/// First TIME segment fixes the timeline; a discontinuity is rejected so the pad stops rather than -/// splicing two timelines. +/// The first TIME segment fixes the timeline. Continuity is judged in running time, not media +/// origin: `base` is the running-time anchor, so a moved `start` (a seek that keeps moving forward) +/// stays continuous as long as `base` does not rewind. A rewind is rejected so the pad stops rather +/// than splicing two timelines. pub fn classify_segment(prev: Option<&SegmentInfo>, next: &SegmentInfo) -> SegmentDecision { if !next.time_format { return SegmentDecision::Reject("segment is not in TIME format"); @@ -33,21 +35,19 @@ pub fn classify_segment(prev: Option<&SegmentInfo>, next: &SegmentInfo) -> Segme } match prev { None => SegmentDecision::Accept, - Some(prev) if next.start_nanos == prev.start_nanos && next.base_nanos >= prev.base_nanos => { - SegmentDecision::Accept - } - Some(_) => SegmentDecision::Reject("discontinuous segment (flush/seek)"), + Some(prev) if next.base_nanos >= prev.base_nanos => SegmentDecision::Accept, + Some(_) => SegmentDecision::Reject("discontinuous segment (running time rewound)"), } } -/// Stateless and shared across pads on purpose: re-anchoring per pad is what breaks A/V alignment. -/// `None` (outside the segment) drops, never clamps. -pub fn frame_micros(has_segment: bool, running_time_nanos: Option) -> FrameDecision { - if !has_segment { - return FrameDecision::Drop("buffer arrived before a SEGMENT"); - } +/// Maps a signed running time (nanos) to a MoQ timestamp. Stateless and shared across pads on +/// purpose: re-anchoring per pad is what breaks A/V alignment. A buffer before the segment (a +/// negative running time) is dropped, never clamped to zero, since clamping would collapse distinct +/// frames onto one timestamp. +pub fn frame_micros(running_time_nanos: Option) -> FrameDecision { match running_time_nanos { - Some(running_time) => FrameDecision::Emit(running_time / 1000), + Some(nanos) if nanos >= 0 => FrameDecision::Emit(nanos as u64 / 1000), + Some(_) => FrameDecision::Drop("buffer before the segment (negative running time)"), None => FrameDecision::Drop("buffer outside the segment"), } } @@ -56,18 +56,17 @@ pub fn frame_micros(has_segment: bool, running_time_nanos: Option) -> Frame mod tests { use super::*; - fn time(rate: f64, start: u64, base: u64) -> SegmentInfo { + fn time(rate: f64, base: u64) -> SegmentInfo { SegmentInfo { time_format: true, rate, - start_nanos: start, base_nanos: base, } } #[test] fn first_time_segment_is_accepted() { - assert_eq!(classify_segment(None, &time(1.0, 0, 0)), SegmentDecision::Accept); + assert_eq!(classify_segment(None, &time(1.0, 0)), SegmentDecision::Accept); } #[test] @@ -75,7 +74,6 @@ mod tests { let bytes = SegmentInfo { time_format: false, rate: 1.0, - start_nanos: 0, base_nanos: 0, }; assert!(matches!(classify_segment(None, &bytes), SegmentDecision::Reject(_))); @@ -83,50 +81,58 @@ mod tests { #[test] fn non_unit_rate_is_rejected() { - assert!(matches!(classify_segment(None, &time(2.0, 0, 0)), SegmentDecision::Reject(_))); - assert!(matches!(classify_segment(None, &time(-1.0, 0, 0)), SegmentDecision::Reject(_))); + assert!(matches!( + classify_segment(None, &time(2.0, 0)), + SegmentDecision::Reject(_) + )); + assert!(matches!( + classify_segment(None, &time(-1.0, 0)), + SegmentDecision::Reject(_) + )); } #[test] - fn continuous_second_segment_is_accepted() { - let first = time(1.0, 100, 0); - assert_eq!(classify_segment(Some(&first), &time(1.0, 100, 500)), SegmentDecision::Accept); + fn advancing_base_is_continuous() { + let first = time(1.0, 0); + assert_eq!(classify_segment(Some(&first), &time(1.0, 500)), SegmentDecision::Accept); } + // Equal base is still continuous: continuity is base-monotonic, not strictly increasing. #[test] - fn discontinuous_second_segment_is_rejected() { - let first = time(1.0, 100, 500); - // start moved (a seek) -> reject. - assert!(matches!( - classify_segment(Some(&first), &time(1.0, 200, 600)), - SegmentDecision::Reject(_) - )); - // base went backwards (a flush rewind) -> reject. + fn equal_base_is_continuous() { + let first = time(1.0, 500); + assert_eq!(classify_segment(Some(&first), &time(1.0, 500)), SegmentDecision::Accept); + } + + #[test] + fn rewinding_base_is_rejected() { + let first = time(1.0, 500); assert!(matches!( - classify_segment(Some(&first), &time(1.0, 100, 400)), + classify_segment(Some(&first), &time(1.0, 400)), SegmentDecision::Reject(_) )); } #[test] - fn buffer_before_segment_is_dropped() { - assert!(matches!(frame_micros(false, Some(0)), FrameDecision::Drop(_))); + fn positive_running_time_emits_micros() { + assert_eq!(frame_micros(Some(20_000_000)), FrameDecision::Emit(20_000)); } #[test] - fn out_of_segment_frame_is_dropped_not_clamped() { - assert!(matches!(frame_micros(true, None), FrameDecision::Drop(_))); + fn negative_running_time_is_dropped_not_clamped() { + assert!(matches!(frame_micros(Some(-5_000_000)), FrameDecision::Drop(_))); } - // Stateless conversion: two pads sharing one running-time clock keep their relative offset. #[test] - fn shared_timeline_keeps_av_aligned() { - // Simultaneous frames on two pads (same running time) get the same timestamp. - assert_eq!(frame_micros(true, Some(20_000_000)), FrameDecision::Emit(20_000)); - assert_eq!(frame_micros(true, Some(20_000_000)), FrameDecision::Emit(20_000)); + fn out_of_segment_frame_is_dropped() { + assert!(matches!(frame_micros(None), FrameDecision::Drop(_))); + } - // A real 2ms offset between a video and an audio frame survives, regardless of call order. - assert_eq!(frame_micros(true, Some(7_000_000)), FrameDecision::Emit(7_000)); - assert_eq!(frame_micros(true, Some(5_000_000)), FrameDecision::Emit(5_000)); + // Stateless conversion: two pads sharing one running-time clock keep their relative offset, + // regardless of call order. + #[test] + fn shared_timeline_keeps_av_aligned() { + assert_eq!(frame_micros(Some(7_000_000)), FrameDecision::Emit(7_000)); + assert_eq!(frame_micros(Some(5_000_000)), FrameDecision::Emit(5_000)); } } From 99ddadb25269aaf2614cca98d696d3ecb2287516 Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Mon, 15 Jun 2026 15:43:05 -0600 Subject: [PATCH 04/13] moq-gst: use a GStreamer-valid plugin license so the plugin loads GStreamer only registers plugins whose license is in its recognised set. The plugin_define! license was 'Apache 2.0', which gst rejects ('invalid license, not loading'), so the moq plugin (moqsink/moqsrc) never loads via the registry. The crate is MIT OR Apache-2.0, so declare the MIT side ('MIT/X11'). Co-Authored-By: Claude Opus 4.8 --- rs/moq-gst/src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rs/moq-gst/src/lib.rs b/rs/moq-gst/src/lib.rs index 95f640fd5..8d3c5e580 100644 --- a/rs/moq-gst/src/lib.rs +++ b/rs/moq-gst/src/lib.rs @@ -34,7 +34,9 @@ gst::plugin_define!( env!("CARGO_PKG_DESCRIPTION"), plugin_init, concat!(env!("CARGO_PKG_VERSION"), "-", env!("COMMIT_ID")), - "Apache 2.0", + // GStreamer only loads plugins whose license is in its recognised set; the crate is + // MIT OR Apache-2.0, so declare the MIT side ("Apache 2.0" is rejected and the plugin never loads). + "MIT/X11", env!("CARGO_PKG_NAME"), env!("CARGO_PKG_NAME"), env!("CARGO_PKG_REPOSITORY"), From 8fa624c75274c1e53014d9ec2caafa96f0e5500e Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Mon, 15 Jun 2026 15:43:05 -0600 Subject: [PATCH 05/13] moq-gst: explicit pad membership (AddPad + per-pad generation) PadSet inferred membership lazily from CAPS, so EOS aggregation (eos.len() == pads.len()) was wrong when EOS arrived before CAPS. Make membership authoritative: an 'active' set populated by AddPad (sent from request_new_pad, and seeded at session start for pads requested earlier), with EOS counted against it. Every data message carries its pad's generation, captured per incarnation, so a pad recreated with the same name discards the previous one's in-flight messages. Covers EOS-before-CAPS, duplicate EOS, late members, buffers after EOS, and a release that completes the element. Adds a hermetic element test (request/release) plus gstreamer as a dev-dependency. Session-dependent flows (multipad EOS, per-pad errors) stay in the relay-backed harness. Co-Authored-By: Claude Opus 4.8 --- rs/moq-gst/Cargo.toml | 3 + rs/moq-gst/src/spike/imp.rs | 66 +++++++-- rs/moq-gst/src/spike/session.rs | 232 +++++++++++++++++++++++++++----- rs/moq-gst/tests/element.rs | 35 +++++ 4 files changed, 295 insertions(+), 41 deletions(-) create mode 100644 rs/moq-gst/tests/element.rs diff --git a/rs/moq-gst/Cargo.toml b/rs/moq-gst/Cargo.toml index dddc361b1..8e86e8dc8 100644 --- a/rs/moq-gst/Cargo.toml +++ b/rs/moq-gst/Cargo.toml @@ -29,5 +29,8 @@ tracing = "0.1" tracing-subscriber = "0.3" url = "2" +[dev-dependencies] +gst = { package = "gstreamer", version = "0.23" } + [build-dependencies] gst-plugin-version-helper = "0.8" diff --git a/rs/moq-gst/src/spike/imp.rs b/rs/moq-gst/src/spike/imp.rs index b805b4e97..366957e98 100644 --- a/rs/moq-gst/src/spike/imp.rs +++ b/rs/moq-gst/src/spike/imp.rs @@ -1,5 +1,7 @@ //! GObject shell, deliberately a parallel element (`moqsinkspike`) so production `moqsink` is untouched. +use std::collections::HashMap; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, LazyLock, Mutex}; use anyhow::{Context, Result}; @@ -38,6 +40,10 @@ pub struct MoqSinkSpike { settings: Mutex, session: Mutex>, status: Arc, + // Monotonic per-pad generation; a pad recreated with the same name gets a fresh value so the + // worker discards the previous incarnation's in-flight messages. + next_generation: AtomicU64, + pad_generations: Mutex>, } #[glib::object_subclass] @@ -153,16 +159,23 @@ impl ElementImpl for MoqSinkSpike { name: Option<&str>, _caps: Option<&gst::Caps>, ) -> Option { + // Fixed per pad incarnation and captured here, so a buffer in flight from a released pad never + // reads a successor's generation. + let generation = self.next_generation.fetch_add(1, Ordering::Relaxed); let pad_builder = gst::Pad::builder_from_template(templ) - .chain_function(|pad, parent, buffer| { + .chain_function(move |pad, parent, buffer| { MoqSinkSpike::catch_panic_pad_function( parent, || Err(gst::FlowError::Error), - |sink| sink.forward_buffer(pad, buffer), + |sink| sink.forward_buffer(pad, generation, buffer), ) }) - .event_function(|pad, parent, event| { - MoqSinkSpike::catch_panic_pad_function(parent, || false, |sink| sink.handle_event(pad, event)) + .event_function(move |pad, parent, event| { + MoqSinkSpike::catch_panic_pad_function( + parent, + || false, + |sink| sink.handle_event(pad, generation, event), + ) }); let pad = match name { @@ -170,20 +183,29 @@ impl ElementImpl for MoqSinkSpike { None => pad_builder.generated_name().build(), }; + let name = pad.name().to_string(); + self.pad_generations.lock().unwrap().insert(name.clone(), generation); + + // Announce the pad before adding it: its own CAPS/buffers can only flow after add_pad, so they + // are never enqueued ahead of the AddPad that declares its membership. + let sender = self.session.lock().unwrap().as_ref().map(SessionHandle::sender); + if let Some(sender) = sender { + let _ = sender.blocking_send(DataMsg::AddPad { pad: name, generation }); + } self.obj().add_pad(&pad).ok()?; Some(pad) } fn release_pad(&self, pad: &gst::Pad) { + let name = pad.name().to_string(); + let generation = self.pad_generations.lock().unwrap().remove(&name); // Drop the session guard before blocking_send: holding it across a full-channel block deadlocks stop_session. let sender = { let session = self.session.lock().unwrap(); session.as_ref().map(SessionHandle::sender) }; - if let Some(sender) = sender { - let _ = sender.blocking_send(DataMsg::DropPad { - pad: pad.name().to_string(), - }); + if let (Some(sender), Some(generation)) = (sender, generation) { + let _ = sender.blocking_send(DataMsg::DropPad { pad: name, generation }); } let _ = self.obj().remove_pad(pad); } @@ -208,7 +230,20 @@ impl MoqSinkSpike { fn start_session(&self) -> Result<()> { // Synchronous settings validation surfaces as a StateChangeError; async failures go to the bus. let settings = ResolvedSettings::try_from(self.settings.lock().unwrap().clone())?; - let handle = SessionHandle::start(settings, self.status.clone(), self.obj().downgrade()); + // Seed pads requested before the session existed; the data channel is created inside start(). + let seed = { + let gens = self.pad_generations.lock().unwrap(); + self.obj() + .pads() + .iter() + .map(|pad| { + let name = pad.name().to_string(); + let generation = gens.get(&name).copied().unwrap_or(0); + (name, generation) + }) + .collect() + }; + let handle = SessionHandle::start(settings, self.status.clone(), self.obj().downgrade(), seed); *self.session.lock().unwrap() = Some(handle); Ok(()) } @@ -220,7 +255,12 @@ impl MoqSinkSpike { } /// Clone the sender, release the lock, then blocking-send: never apply backpressure under the session lock. - fn forward_buffer(&self, pad: &gst::Pad, buffer: gst::Buffer) -> Result { + fn forward_buffer( + &self, + pad: &gst::Pad, + generation: u64, + buffer: gst::Buffer, + ) -> Result { let sender = self .session .lock() @@ -236,6 +276,7 @@ impl MoqSinkSpike { sender .blocking_send(DataMsg::Buffer { pad: pad.name().to_string(), + generation, data, pts, }) @@ -243,7 +284,7 @@ impl MoqSinkSpike { Ok(gst::FlowSuccess::Ok) } - fn handle_event(&self, pad: &gst::Pad, event: gst::Event) -> bool { + fn handle_event(&self, pad: &gst::Pad, generation: u64, event: gst::Event) -> bool { let sender = self.session.lock().unwrap().as_ref().map(|handle| handle.sender()); match event.view() { @@ -251,6 +292,7 @@ impl MoqSinkSpike { let Some(sender) = sender else { return false }; let msg = DataMsg::Caps { pad: pad.name().to_string(), + generation, caps: caps.caps().to_owned(), }; if sender.blocking_send(msg).is_err() { @@ -262,6 +304,7 @@ impl MoqSinkSpike { let Some(sender) = sender else { return false }; let msg = DataMsg::Segment { pad: pad.name().to_string(), + generation, segment: segment.segment().to_owned(), }; if sender.blocking_send(msg).is_err() { @@ -274,6 +317,7 @@ impl MoqSinkSpike { sender .blocking_send(DataMsg::Eos { pad: pad.name().to_string(), + generation, }) .is_ok() } diff --git a/rs/moq-gst/src/spike/session.rs b/rs/moq-gst/src/spike/session.rs index aa8e95951..c9087659e 100644 --- a/rs/moq-gst/src/spike/session.rs +++ b/rs/moq-gst/src/spike/session.rs @@ -63,25 +63,36 @@ impl Status { } /// Ordered; shutdown is deliberately elsewhere (cancellation channel) so it can cut a blocked send. +/// Every message carries its pad's generation so a pad recreated with the same name discards the +/// previous incarnation's in-flight messages. pub enum DataMsg { + AddPad { + pad: String, + generation: u64, + }, Caps { pad: String, + generation: u64, caps: gst::Caps, }, Segment { pad: String, + generation: u64, segment: gst::Segment, }, Buffer { pad: String, + generation: u64, data: Bytes, pts: Option, }, Eos { pad: String, + generation: u64, }, DropPad { pad: String, + generation: u64, }, } @@ -98,13 +109,18 @@ pub struct SessionHandle { } impl SessionHandle { - pub fn start(settings: ResolvedSettings, status: Arc, element: glib::WeakRef) -> Self { + pub fn start( + settings: ResolvedSettings, + status: Arc, + element: glib::WeakRef, + seed: Vec<(String, u64)>, + ) -> Self { let (data_tx, data_rx) = mpsc::channel(DATA_CHANNEL_BOUND); let (shutdown_tx, shutdown_rx) = watch::channel(false); let join = RUNTIME.spawn(async move { // Only a remote close reaches the bus as an error; a local shutdown returns Ok and stays quiet. - if let Err(err) = run_session(settings, status, data_rx, shutdown_rx, element.clone()).await { + if let Err(err) = run_session(settings, status, seed, data_rx, shutdown_rx, element.clone()).await { if let Some(obj) = element.upgrade() { gst::element_error!(obj, gst::CoreError::Failed, ("session error"), ["{err:?}"]); } @@ -137,6 +153,7 @@ impl SessionHandle { async fn run_session( settings: ResolvedSettings, status: Arc, + seed: Vec<(String, u64)>, mut data: mpsc::Receiver, mut shutdown: watch::Receiver, element: glib::WeakRef, @@ -167,6 +184,10 @@ async fn run_session( gst::info!(CAT, "session connected to {}", settings.url); let mut pad_set = PadSet::new(broadcast, catalog); + // Pads requested before the session task existed are seeded into the authoritative set. + for (name, generation) in seed { + pad_set.add_pad(&name, generation); + } let result = run_loop(session, &mut data, &mut shutdown, &mut pad_set, &element, &status).await; // Finalize every live producer once on the way out, catalog last; runs on every exit path. @@ -187,6 +208,14 @@ fn notify_connected(element: &glib::WeakRef) { } } +/// Posted once when every active pad has ended. +fn post_element_eos(element: &glib::WeakRef) { + gst::info!(CAT, "all pads ended, posting EOS"); + if let Some(obj) = element.upgrade() { + let _ = obj.post_message(gst::message::Eos::builder().src(&obj).build()); + } +} + async fn run_loop( session: moq_net::Session, data: &mut mpsc::Receiver, @@ -222,19 +251,23 @@ async fn run_loop( None => send_bandwidth = None, }, msg = data.recv() => match msg { - Some(DataMsg::Caps { pad, caps }) => pad_set.caps(&pad, &caps)?, - Some(DataMsg::Segment { pad, segment }) => pad_set.segment(&pad, segment), - Some(DataMsg::Buffer { pad, data, pts }) => pad_set.buffer(&pad, data, pts)?, - Some(DataMsg::Eos { pad }) => { - if pad_set.eos(&pad)? { - gst::info!(CAT, "all pads ended, posting EOS"); - if let Some(obj) = element.upgrade() { - let _ = obj.post_message(gst::message::Eos::builder().src(&obj).build()); - } + Some(DataMsg::AddPad { pad, generation }) => pad_set.add_pad(&pad, generation), + Some(DataMsg::Caps { pad, generation, caps }) => pad_set.caps(&pad, generation, &caps)?, + Some(DataMsg::Segment { pad, generation, segment }) => pad_set.segment(&pad, generation, segment), + Some(DataMsg::Buffer { pad, generation, data, pts }) => pad_set.buffer(&pad, generation, data, pts)?, + Some(DataMsg::Eos { pad, generation }) => { + if pad_set.eos(&pad, generation)? { + post_element_eos(element); + return Ok(()); + } + } + // A release can complete the element if the remaining pads have all ended. + Some(DataMsg::DropPad { pad, generation }) => { + if pad_set.drop_pad(&pad, generation) { + post_element_eos(element); return Ok(()); } } - Some(DataMsg::DropPad { pad }) => pad_set.drop_pad(&pad), // The element dropped the sender (state change to READY) without a shutdown signal. None => return Ok(()), }, @@ -246,6 +279,10 @@ async fn run_loop( struct PadSet { broadcast: moq_net::BroadcastProducer, catalog: Option, + /// Authoritative membership: name -> current generation. EOS aggregation counts against this, not + /// the lazily-created `pads`, so a pad that ends before CAPS still counts. + active: HashMap, + /// Producer state, created lazily on the first CAPS (a member without CAPS has no entry here). pads: HashMap, eos: HashSet, } @@ -255,12 +292,38 @@ impl PadSet { Self { broadcast, catalog: Some(catalog), + active: HashMap::new(), pads: HashMap::new(), eos: HashSet::new(), } } - fn caps(&mut self, pad: &str, caps: &gst::Caps) -> Result<()> { + /// Declares (or re-declares) a pad's membership. A recreated pad (new generation) starts fresh: + /// any producer and EOS mark from a previous incarnation are dropped. + fn add_pad(&mut self, pad: &str, generation: u64) { + if let Some(mut old) = self.pads.remove(pad) { + let _ = old.finalize(); + } + self.eos.remove(pad); + self.active.insert(pad.to_string(), generation); + } + + /// A message is current only if its generation matches the pad's active generation; otherwise it + /// belongs to a previous incarnation (or an unknown pad) and is dropped. + fn is_current(&self, pad: &str, generation: u64) -> bool { + self.active.get(pad) == Some(&generation) + } + + /// Every active pad has ended (and there is at least one). + fn all_ended(&self) -> bool { + !self.active.is_empty() && self.active.keys().all(|name| self.eos.contains(name)) + } + + fn caps(&mut self, pad: &str, generation: u64, caps: &gst::Caps) -> Result<()> { + if !self.is_current(pad, generation) { + gst::warning!(CAT, "caps for stale or unknown pad {pad}, dropping"); + return Ok(()); + } let broadcast = self.broadcast.clone(); let catalog = self.catalog.clone().context("catalog already finalized")?; self.pads @@ -269,7 +332,11 @@ impl PadSet { .set_caps(broadcast, catalog, caps) } - fn segment(&mut self, pad: &str, segment: gst::Segment) { + fn segment(&mut self, pad: &str, generation: u64, segment: gst::Segment) { + if !self.is_current(pad, generation) { + gst::warning!(CAT, "segment for stale or unknown pad {pad}, dropping"); + return; + } // SEGMENT may arrive before CAPS (independent sticky events); this only records timing. self.pads .entry(pad.to_string()) @@ -277,32 +344,55 @@ impl PadSet { .set_segment(segment); } - fn buffer(&mut self, pad: &str, data: Bytes, pts: Option) -> Result<()> { + fn buffer(&mut self, pad: &str, generation: u64, data: Bytes, pts: Option) -> Result<()> { + if !self.is_current(pad, generation) { + gst::warning!(CAT, "buffer for stale or unknown pad {pad}, dropping"); + return Ok(()); + } + if self.eos.contains(pad) { + gst::warning!(CAT, "buffer after EOS on pad {pad}, dropping"); + return Ok(()); + } match self.pads.get_mut(pad) { - Some(pad) => pad.push_buffer(data, pts), + Some(p) => p.push_buffer(data, pts), None => { - gst::warning!(CAT, "buffer for unknown pad {pad}"); + gst::warning!(CAT, "buffer before caps on pad {pad}, dropping"); Ok(()) } } } - /// Returns whether every pad has now ended, so the caller posts the element EOS once. - fn eos(&mut self, pad: &str) -> Result { + /// Returns whether every active pad has now ended, so the caller posts the element EOS once. + fn eos(&mut self, pad: &str, generation: u64) -> Result { + if !self.is_current(pad, generation) { + gst::warning!(CAT, "EOS for stale or unknown pad {pad}, ignoring"); + return Ok(false); + } + // Duplicate EOS is idempotent: do not re-finalize, just re-report completion. + if self.eos.contains(pad) { + return Ok(self.all_ended()); + } if let Some(p) = self.pads.get_mut(pad) { p.finalize()?; } self.eos.insert(pad.to_string()); - Ok(!self.pads.is_empty() && self.eos.len() == self.pads.len()) + Ok(self.all_ended()) } - fn drop_pad(&mut self, pad: &str) { + /// Returns whether the remaining active pads have all ended (a release can complete the element). + fn drop_pad(&mut self, pad: &str, generation: u64) -> bool { + if self.active.get(pad) != Some(&generation) { + gst::warning!(CAT, "DropPad for stale or unknown pad {pad}, ignoring"); + return false; + } if let Some(mut p) = self.pads.remove(pad) { if let Err(err) = p.finalize() { gst::warning!(CAT, "finalize on drop {pad}: {err:?}"); } } + self.active.remove(pad); self.eos.remove(pad); + self.all_ended() } /// Idempotent (skips already-finalized pads); the returned order proves "catalog last". @@ -509,8 +599,10 @@ mod tests { fn finalize_all_finishes_pads_then_catalog_once() { gst::init().unwrap(); let mut set = pad_set(); - set.caps("video", &h264_caps()).unwrap(); - set.caps("audio", &h264_caps()).unwrap(); + set.add_pad("video", 0); + set.add_pad("audio", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + set.caps("audio", 0, &h264_caps()).unwrap(); let order = set.finalize_all(); assert_eq!( @@ -528,9 +620,10 @@ mod tests { fn eos_then_shutdown_does_not_double_finalize() { gst::init().unwrap(); let mut set = pad_set(); - set.caps("video", &h264_caps()).unwrap(); + set.add_pad("video", 0); + set.caps("video", 0, &h264_caps()).unwrap(); - assert!(set.eos("video").unwrap()); + assert!(set.eos("video", 0).unwrap()); // Only the catalog is left; the pad is not finalized twice. assert_eq!(set.finalize_all(), vec!["catalog".to_string()]); } @@ -539,9 +632,10 @@ mod tests { fn identical_caps_keep_one_live_producer() { gst::init().unwrap(); let mut set = pad_set(); - set.caps("video", &h264_caps()).unwrap(); + set.add_pad("video", 0); + set.caps("video", 0, &h264_caps()).unwrap(); // Re-sent identical caps are a no-op; the pad still holds exactly one live producer. - set.caps("video", &h264_caps()).unwrap(); + set.caps("video", 0, &h264_caps()).unwrap(); assert_eq!(set.pads.len(), 1); assert!(set.pads["video"].framed.is_some()); } @@ -550,17 +644,95 @@ mod tests { fn buffer_for_unknown_pad_is_dropped_without_error() { let mut set = pad_set(); assert!(set - .buffer("ghost", Bytes::from_static(b"x"), Some(gst::ClockTime::ZERO)) + .buffer("ghost", 0, Bytes::from_static(b"x"), Some(gst::ClockTime::ZERO)) .is_ok()); } + // AddPad declares membership independent of CAPS, and generation discriminates incarnations. + #[test] + fn add_pad_makes_a_member_independent_of_caps() { + let mut set = pad_set(); + set.add_pad("video", 0); + assert!(set.is_current("video", 0)); + assert!(!set.is_current("video", 1), "a different generation is not current"); + assert!(!set.is_current("audio", 0), "an unknown pad is not current"); + } + + // EOS before any CAPS still counts: a member with no producer has nothing to finalize but ends. + #[test] + fn eos_before_caps_counts_toward_completion() { + let mut set = pad_set(); + set.add_pad("video", 0); + assert!(set.eos("video", 0).unwrap()); + } + + // The element completes only once every member has ended; a still-open member holds it open. + #[test] + fn element_completes_only_after_all_members_eos() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("a", 0); + set.add_pad("b", 0); + set.caps("a", 0, &h264_caps()).unwrap(); + set.caps("b", 0, &h264_caps()).unwrap(); + assert!(!set.eos("a", 0).unwrap(), "one of two members ended, not complete"); + assert!(set.eos("b", 0).unwrap(), "both members ended, complete"); + } + + // Duplicate EOS is idempotent: it neither re-finalizes the producer nor changes completion. + #[test] + fn duplicate_eos_is_idempotent() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + assert!(set.eos("video", 0).unwrap()); + assert!(set.eos("video", 0).unwrap(), "duplicate EOS stays complete"); + // Finalized exactly once: only the catalog remains for the exit sweep. + assert_eq!(set.finalize_all(), vec!["catalog".to_string()]); + } + + // A buffer that arrives after the pad's EOS is dropped (the producer is already finalized). + #[test] + fn buffer_after_eos_is_dropped() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + set.eos("video", 0).unwrap(); + assert!(set + .buffer("video", 0, Bytes::from_static(b"x"), Some(gst::ClockTime::ZERO)) + .is_ok()); + } + + // A pad recreated with the same name (new generation) discards the previous incarnation's messages. + #[test] + fn stale_generation_messages_are_dropped() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + set.add_pad("video", 1); + // Old-generation messages no longer match the active generation. + assert!(set + .buffer("video", 0, Bytes::from_static(b"x"), Some(gst::ClockTime::ZERO)) + .is_ok()); + assert!( + !set.eos("video", 0).unwrap(), + "a stale EOS must not complete the element" + ); + // The current generation still completes it. + assert!(set.eos("video", 1).unwrap()); + } + // SEGMENT before CAPS: the pad is created and the segment retained when CAPS arrives. #[test] fn segment_before_caps_is_retained() { gst::init().unwrap(); let mut set = pad_set(); - set.segment("video", time_segment()); - set.caps("video", &h264_caps()).unwrap(); + set.add_pad("video", 0); + set.segment("video", 0, time_segment()); + set.caps("video", 0, &h264_caps()).unwrap(); let pad = &set.pads["video"]; assert!(pad.segment_info.is_some(), "segment kept across a later caps"); diff --git a/rs/moq-gst/tests/element.rs b/rs/moq-gst/tests/element.rs new file mode 100644 index 000000000..3921c1049 --- /dev/null +++ b/rs/moq-gst/tests/element.rs @@ -0,0 +1,35 @@ +//! Hermetic element-boundary tests: behaviour reachable without a live MoQ session. +//! +//! Session-dependent flows (multipad EOS aggregation, per-pad errors, FLUSH, remote death) require a +//! real relay, so they live in the relay-backed harness, not here. + +use std::sync::Once; + +use gst::prelude::*; + +fn init() { + static INIT: Once = Once::new(); + INIT.call_once(|| { + gst::init().unwrap(); + gstmoq::plugin_register_static().expect("register moq plugin"); + }); +} + +// Request pads appear and disappear through the real GObject boundary, with no session attached. +#[test] +fn request_and_release_sink_pads() { + init(); + let sink = gst::ElementFactory::make("moqsinkspike") + .build() + .expect("create moqsinkspike"); + + let pad0 = sink.request_pad_simple("sink_0").expect("request sink_0"); + assert_eq!(pad0.name().as_str(), "sink_0"); + let pad1 = sink.request_pad_simple("sink_1").expect("request sink_1"); + assert_eq!(sink.num_sink_pads(), 2); + + sink.release_request_pad(&pad1); + assert_eq!(sink.num_sink_pads(), 1); + sink.release_request_pad(&pad0); + assert_eq!(sink.num_sink_pads(), 0); +} From af72ba871ef327dc699b3632f30cb146b6bc596b Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Mon, 15 Jun 2026 16:02:22 -0600 Subject: [PATCH 06/13] moq-gst: per-pad error isolation (FlowError, sync NotNegotiated) A bad caps or bitstream now invalidates only the offending pad instead of tearing down the session: the worker drops that pad's producer and marks it failed in the shared Status, and the chain returns a FlowError on the pad's next buffer (one buffer later, since the worker is async) rather than silently dropping. Session-level failures (a closed catalog) still propagate to element_error! on the bus. Unsupported caps are also rejected synchronously in the event handler (NotNegotiated) before reaching the worker. Hermetic tests: worker-side pad-vs-session error classification and the caps validator; element-level invalid-settings StateChangeError and connect-failure to the bus. The per-pad FlowError observation and the live error flows need a session, so they stay in the relay-backed harness. Co-Authored-By: Claude Opus 4.8 --- rs/moq-gst/src/spike/imp.rs | 16 ++++- rs/moq-gst/src/spike/session.rs | 119 +++++++++++++++++++++++++++++--- rs/moq-gst/tests/element.rs | 34 +++++++++ 3 files changed, 158 insertions(+), 11 deletions(-) diff --git a/rs/moq-gst/src/spike/imp.rs b/rs/moq-gst/src/spike/imp.rs index 366957e98..e4cea6057 100644 --- a/rs/moq-gst/src/spike/imp.rs +++ b/rs/moq-gst/src/spike/imp.rs @@ -10,7 +10,7 @@ use gst::glib; use gst::prelude::*; use gst::subclass::prelude::*; -use super::session::{DataMsg, ResolvedSettings, SessionHandle, Status, CAT}; +use super::session::{caps_supported, DataMsg, ResolvedSettings, SessionHandle, Status, CAT}; #[derive(Debug, Clone, Default)] struct Settings { @@ -261,6 +261,12 @@ impl MoqSinkSpike { generation: u64, buffer: gst::Buffer, ) -> Result { + // The worker marks a pad failed after rejecting its data; surface that to GStreamer instead of + // silently dropping. Because the worker is async, this lands on the buffer after the bad one. + if self.status.is_failed(pad.name().as_str()) { + return Err(gst::FlowError::NotNegotiated); + } + let sender = self .session .lock() @@ -289,11 +295,17 @@ impl MoqSinkSpike { match event.view() { gst::EventView::Caps(caps) => { + let caps = caps.caps(); + // Reject unsupported caps synchronously (NotNegotiated) before handing off to the worker. + if !caps_supported(caps) { + gst::warning!(CAT, "rejecting unsupported caps on pad {}", pad.name()); + return false; + } let Some(sender) = sender else { return false }; let msg = DataMsg::Caps { pad: pad.name().to_string(), generation, - caps: caps.caps().to_owned(), + caps: caps.to_owned(), }; if sender.blocking_send(msg).is_err() { return false; diff --git a/rs/moq-gst/src/spike/session.rs b/rs/moq-gst/src/spike/session.rs index c9087659e..1080b9707 100644 --- a/rs/moq-gst/src/spike/session.rs +++ b/rs/moq-gst/src/spike/session.rs @@ -34,6 +34,9 @@ pub struct Status { connected: AtomicBool, version: Mutex>, send_bitrate: AtomicU64, + /// Pads whose data the worker rejected; the chain reads this to return a FlowError instead of + /// silently dropping. Cleared per pad on recreate, and wholesale on session exit. + failed: Mutex>, } impl Status { @@ -41,6 +44,22 @@ impl Status { self.connected.store(value, Ordering::Relaxed); } + fn mark_failed(&self, pad: &str) { + self.failed.lock().unwrap().insert(pad.to_string()); + } + + fn clear_failed(&self, pad: &str) { + self.failed.lock().unwrap().remove(pad); + } + + pub fn is_failed(&self, pad: &str) -> bool { + self.failed.lock().unwrap().contains(pad) + } + + fn reset_failed(&self) { + self.failed.lock().unwrap().clear(); + } + pub fn connected(&self) -> bool { self.connected.load(Ordering::Relaxed) } @@ -183,7 +202,7 @@ async fn run_session( notify_connected(&element); gst::info!(CAT, "session connected to {}", settings.url); - let mut pad_set = PadSet::new(broadcast, catalog); + let mut pad_set = PadSet::new(broadcast, catalog, status.clone()); // Pads requested before the session task existed are seeded into the authoritative set. for (name, generation) in seed { pad_set.add_pad(&name, generation); @@ -197,6 +216,7 @@ async fn run_session( status.set_connected(false); status.set_version(None); status.set_send_bitrate(0); + status.reset_failed(); notify_connected(&element); result } @@ -251,7 +271,11 @@ async fn run_loop( None => send_bandwidth = None, }, msg = data.recv() => match msg { - Some(DataMsg::AddPad { pad, generation }) => pad_set.add_pad(&pad, generation), + Some(DataMsg::AddPad { pad, generation }) => { + pad_set.add_pad(&pad, generation); + // A fresh incarnation is not failed even if a previous one was. + status.clear_failed(&pad); + } Some(DataMsg::Caps { pad, generation, caps }) => pad_set.caps(&pad, generation, &caps)?, Some(DataMsg::Segment { pad, generation, segment }) => pad_set.segment(&pad, generation, segment), Some(DataMsg::Buffer { pad, generation, data, pts }) => pad_set.buffer(&pad, generation, data, pts)?, @@ -279,6 +303,8 @@ async fn run_loop( struct PadSet { broadcast: moq_net::BroadcastProducer, catalog: Option, + /// Shared with the element so the chain can read which pads the worker has failed. + status: Arc, /// Authoritative membership: name -> current generation. EOS aggregation counts against this, not /// the lazily-created `pads`, so a pad that ends before CAPS still counts. active: HashMap, @@ -288,10 +314,11 @@ struct PadSet { } impl PadSet { - fn new(broadcast: moq_net::BroadcastProducer, catalog: moq_mux::catalog::Producer) -> Self { + fn new(broadcast: moq_net::BroadcastProducer, catalog: moq_mux::catalog::Producer, status: Arc) -> Self { Self { broadcast, catalog: Some(catalog), + status, active: HashMap::new(), pads: HashMap::new(), eos: HashSet::new(), @@ -319,6 +346,7 @@ impl PadSet { !self.active.is_empty() && self.active.keys().all(|name| self.eos.contains(name)) } + /// `Err` is session-fatal (the catalog is gone); a bad-caps failure invalidates only this pad. fn caps(&mut self, pad: &str, generation: u64, caps: &gst::Caps) -> Result<()> { if !self.is_current(pad, generation) { gst::warning!(CAT, "caps for stale or unknown pad {pad}, dropping"); @@ -326,10 +354,27 @@ impl PadSet { } let broadcast = self.broadcast.clone(); let catalog = self.catalog.clone().context("catalog already finalized")?; - self.pads + let result = self + .pads .entry(pad.to_string()) .or_insert_with(Pad::new) - .set_caps(broadcast, catalog, caps) + .set_caps(broadcast, catalog, caps); + if let Err(err) = result { + gst::warning!(CAT, "invalidating pad {pad}: {err:?}"); + self.fail_pad(pad); + } + Ok(()) + } + + /// Drops a pad's producer (closing its track) and marks it failed so the chain returns a FlowError + /// on its next buffer. The pad stays a member, so the session and the other pads keep going. + fn fail_pad(&mut self, pad: &str) { + if let Some(mut p) = self.pads.remove(pad) { + if let Err(err) = p.finalize() { + gst::warning!(CAT, "finalize on failed pad {pad}: {err:?}"); + } + } + self.status.mark_failed(pad); } fn segment(&mut self, pad: &str, generation: u64, segment: gst::Segment) { @@ -353,13 +398,19 @@ impl PadSet { gst::warning!(CAT, "buffer after EOS on pad {pad}, dropping"); return Ok(()); } - match self.pads.get_mut(pad) { + let result = match self.pads.get_mut(pad) { Some(p) => p.push_buffer(data, pts), None => { gst::warning!(CAT, "buffer before caps on pad {pad}, dropping"); - Ok(()) + return Ok(()); } + }; + // A bad bitstream invalidates only this pad; the session and other pads continue. + if let Err(err) = result { + gst::warning!(CAT, "invalidating pad {pad}: {err:?}"); + self.fail_pad(pad); } + Ok(()) } /// Returns whether every active pad has now ended, so the caller posts the element EOS once. @@ -456,7 +507,7 @@ impl Pad { ) -> Result<()> { let structure = caps.structure(0).context("empty caps")?; ensure!( - structure.name() == "video/x-h264", + caps_supported(caps), "spike only carries H.264 byte-stream, got {}", structure.name() ); @@ -552,6 +603,12 @@ impl Pad { } } +/// The spike only carries H.264 byte-stream. Checked synchronously at the event boundary (so an +/// unsupported caps is rejected with NotNegotiated) and again in `set_caps` as the worker's defence. +pub(super) fn caps_supported(caps: &gst::CapsRef) -> bool { + caps.structure(0).map(|s| s.name() == "video/x-h264").unwrap_or(false) +} + fn segment_info(segment: &gst::Segment) -> SegmentInfo { match segment.downcast_ref::() { Some(time) => SegmentInfo { @@ -584,7 +641,7 @@ mod tests { fn pad_set() -> PadSet { let mut broadcast = moq_net::Broadcast::new().produce(); let catalog = moq_mux::catalog::Producer::new(&mut broadcast).unwrap(); - PadSet::new(broadcast, catalog) + PadSet::new(broadcast, catalog, Arc::new(Status::default())) } fn h264_caps() -> gst::Caps { @@ -594,6 +651,10 @@ mod tests { .build() } + fn audio_caps() -> gst::Caps { + gst::Caps::builder("audio/x-raw").build() + } + // EOS/new-caps/drop/shutdown converge on exactly one finalize per producer; catalog last. #[test] fn finalize_all_finishes_pads_then_catalog_once() { @@ -725,6 +786,46 @@ mod tests { assert!(set.eos("video", 1).unwrap()); } + #[test] + fn caps_supported_accepts_only_h264() { + gst::init().unwrap(); + assert!(caps_supported(&h264_caps())); + assert!(!caps_supported(&audio_caps())); + } + + // A pad with unsupported caps is invalidated alone: marked failed and its producer dropped, while + // the session and the other pad keep going (a pad error is not a session error). + #[test] + fn invalid_caps_fails_only_that_pad() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.add_pad("data", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + + assert!( + set.caps("data", 0, &audio_caps()).is_ok(), + "a pad error must not kill the session" + ); + assert!(set.status.is_failed("data"), "the bad pad is marked failed"); + assert!(!set.status.is_failed("video"), "the good pad is untouched"); + assert!(set.pads.contains_key("video"), "the good pad keeps its producer"); + assert!(!set.pads.contains_key("data"), "the failed pad's producer is dropped"); + } + + // A finalized catalog is a session failure: caps returns Err so the worker tears the session down. + #[test] + fn caps_after_catalog_finalized_is_a_session_error() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.finalize_all(); + assert!( + set.caps("video", 0, &h264_caps()).is_err(), + "a gone catalog is session-fatal" + ); + } + // SEGMENT before CAPS: the pad is created and the segment retained when CAPS arrives. #[test] fn segment_before_caps_is_retained() { diff --git a/rs/moq-gst/tests/element.rs b/rs/moq-gst/tests/element.rs index 3921c1049..443496875 100644 --- a/rs/moq-gst/tests/element.rs +++ b/rs/moq-gst/tests/element.rs @@ -33,3 +33,37 @@ fn request_and_release_sink_pads() { sink.release_request_pad(&pad0); assert_eq!(sink.num_sink_pads(), 0); } + +// Settings are validated synchronously: a missing url fails the state change, not the bus. +#[test] +fn missing_url_fails_state_change() { + init(); + let sink = gst::ElementFactory::make("moqsinkspike") + .build() + .expect("create moqsinkspike"); + assert!( + sink.set_state(gst::State::Paused).is_err(), + "a missing url must fail the Ready->Paused state change" + ); + let _ = sink.set_state(gst::State::Null); +} + +// A connect that cannot succeed surfaces as an ERROR on the bus, not a silent log. +#[test] +fn connect_failure_posts_error_to_bus() { + init(); + let pipeline = gst::Pipeline::new(); + let sink = gst::ElementFactory::make("moqsinkspike") + .property("url", "https://nonexistent.invalid:443") + .property("broadcast", "test") + .build() + .expect("create moqsinkspike"); + pipeline.add(&sink).expect("add sink to pipeline"); + + let _ = pipeline.set_state(gst::State::Playing); + let bus = pipeline.bus().expect("pipeline bus"); + let msg = bus.timed_pop_filtered(gst::ClockTime::from_seconds(20), &[gst::MessageType::Error]); + let _ = pipeline.set_state(gst::State::Null); + + assert!(msg.is_some(), "a failed connect must post an ERROR to the bus"); +} From 4627971df4192a2d5a4a3d80948be672719cd406 Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Mon, 15 Jun 2026 17:52:00 -0600 Subject: [PATCH 07/13] moq-gst: review hardening for moqsinkspike (finalize, errors, limits) Post-review fixes to the parallel spike core, all hermetically tested: - Post the element EOS only after the catalog finalizes cleanly; a finalize failure now surfaces as element_error instead of a silent log (run_loop returns an ExitReason, finalize_all returns a Result). - Enforce per-pad frame-timestamp monotonicity: a frame that would regress below the last emitted one is dropped, not emitted out of order. - Make a failed pad terminal: its track is finalized and it stops blocking EOS aggregation, so the element completes on the good pads. - Key the per-pad failure set by generation so a pad recreated with the same name never inherits a previous incarnation's failure. - Require byte-stream/au in caps_supported (what FramedFormat::Avc3 actually consumes), not just the media type. - Reject a buffer past the MoQ frame limit (16 MiB) before copying, so hostile input cannot drive an unbounded allocation. - Test honesty: rename tests that over-promised, add the buffer/bitstream decode-error path, and assert connected=false on a failed connect. CONTEXT Rejected alternatives: - Escalating a pad failure to a session/element error: per-pad isolation is the contract, so a bad pad is finalized and counts as terminal, never fatal to the session. - A transport test seam to make session-level flows hermetic: that is architecture-for-tests; multipad EOS and the per-pad FlowError observation are validated against a real relay instead. - A loopback connect endpoint (127.0.0.1:1) for the connect-failure test: measured to ride the QUIC idle timeout (no fast localhost-UDP reject), so the fast RFC-invalid DNS host was kept. - A spike-side cap for the importer's non-VCL NAL accumulation: the spike has no "frame not emitted" signal, so that cap belongs in the moq-mux importer. Key decisions: - EOS must not claim success the broadcast did not reach: it waits for catalog.finish() to succeed, and a failure becomes element_error. - Segment-base continuity alone does not stop a frame from regressing: monotonicity is enforced at the frame level (last_emitted), not only at the segment boundary. - The frame cap is aligned to moq-net's MAX_FRAME_SIZE: a larger frame could not be consumed anyway. --- rs/moq-gst/src/spike/imp.rs | 18 +- rs/moq-gst/src/spike/session.rs | 279 ++++++++++++++++++++++++------- rs/moq-gst/src/spike/timeline.rs | 6 +- rs/moq-gst/tests/element.rs | 11 +- 4 files changed, 246 insertions(+), 68 deletions(-) diff --git a/rs/moq-gst/src/spike/imp.rs b/rs/moq-gst/src/spike/imp.rs index e4cea6057..c458b62a4 100644 --- a/rs/moq-gst/src/spike/imp.rs +++ b/rs/moq-gst/src/spike/imp.rs @@ -12,6 +12,10 @@ use gst::subclass::prelude::*; use super::session::{caps_supported, DataMsg, ResolvedSettings, SessionHandle, Status, CAT}; +/// Reject a frame past the MoQ frame limit (moq-net's MAX_FRAME_SIZE, 16 MiB): it could not be +/// consumed anyway, and copying it would let hostile input drive an unbounded allocation. +const MAX_FRAME_BYTES: usize = 16 * 1024 * 1024; + #[derive(Debug, Clone, Default)] struct Settings { url: Option, @@ -263,10 +267,22 @@ impl MoqSinkSpike { ) -> Result { // The worker marks a pad failed after rejecting its data; surface that to GStreamer instead of // silently dropping. Because the worker is async, this lands on the buffer after the bad one. - if self.status.is_failed(pad.name().as_str()) { + if self.status.is_failed(pad.name().as_str(), generation) { return Err(gst::FlowError::NotNegotiated); } + // Bound the per-frame allocation before copying: a buffer past the frame limit cannot be + // consumed and would let hostile input drive an unbounded copy. + if buffer.size() > MAX_FRAME_BYTES { + gst::warning!( + CAT, + "rejecting {}-byte buffer on pad {} (exceeds frame limit)", + buffer.size(), + pad.name() + ); + return Err(gst::FlowError::Error); + } + let sender = self .session .lock() diff --git a/rs/moq-gst/src/spike/session.rs b/rs/moq-gst/src/spike/session.rs index 1080b9707..8f51656a8 100644 --- a/rs/moq-gst/src/spike/session.rs +++ b/rs/moq-gst/src/spike/session.rs @@ -34,9 +34,9 @@ pub struct Status { connected: AtomicBool, version: Mutex>, send_bitrate: AtomicU64, - /// Pads whose data the worker rejected; the chain reads this to return a FlowError instead of - /// silently dropping. Cleared per pad on recreate, and wholesale on session exit. - failed: Mutex>, + /// Pads whose data the worker rejected, keyed by the failed generation so a pad recreated with the + /// same name does not inherit it. The chain reads this to return a FlowError instead of dropping. + failed: Mutex>, } impl Status { @@ -44,16 +44,16 @@ impl Status { self.connected.store(value, Ordering::Relaxed); } - fn mark_failed(&self, pad: &str) { - self.failed.lock().unwrap().insert(pad.to_string()); + fn mark_failed(&self, pad: &str, generation: u64) { + self.failed.lock().unwrap().insert(pad.to_string(), generation); } fn clear_failed(&self, pad: &str) { self.failed.lock().unwrap().remove(pad); } - pub fn is_failed(&self, pad: &str) -> bool { - self.failed.lock().unwrap().contains(pad) + pub fn is_failed(&self, pad: &str, generation: u64) -> bool { + self.failed.lock().unwrap().get(pad) == Some(&generation) } fn reset_failed(&self) { @@ -207,18 +207,33 @@ async fn run_session( for (name, generation) in seed { pad_set.add_pad(&name, generation); } - let result = run_loop(session, &mut data, &mut shutdown, &mut pad_set, &element, &status).await; + let reason = run_loop(session, &mut data, &mut shutdown, &mut pad_set, &status).await; // Finalize every live producer once on the way out, catalog last; runs on every exit path. let finalized = pad_set.finalize_all(); - gst::debug!(CAT, "finalized on exit: {finalized:?}"); // Reset the whole observable surface on exit, not just connected. status.set_connected(false); status.set_version(None); status.set_send_bitrate(0); status.reset_failed(); notify_connected(&element); - result + + match (reason, finalized) { + // Clean end: post the element EOS only once the catalog has finalized cleanly. + (Ok(ExitReason::Ended), Ok(order)) => { + gst::debug!(CAT, "finalized on exit: {order:?}"); + post_element_eos(&element); + Ok(()) + } + (Ok(ExitReason::Stopped), Ok(order)) => { + gst::debug!(CAT, "finalized on exit: {order:?}"); + Ok(()) + } + // A finalize failure is surfaced (becomes element_error on the bus), never silently logged. + (Ok(_), Err(err)) => Err(err), + // The session already failed; finalize was best-effort. + (Err(session_err), _) => Err(session_err), + } } /// Only on the connect/disconnect edges, never per sample. @@ -236,14 +251,20 @@ fn post_element_eos(element: &glib::WeakRef) { } } +/// Why run_loop returned: a clean end (all pads EOS) posts the element EOS after finalize; a stop +/// (local shutdown, dropped sender, or clean remote close) finalizes quietly. +enum ExitReason { + Ended, + Stopped, +} + async fn run_loop( session: moq_net::Session, data: &mut mpsc::Receiver, shutdown: &mut watch::Receiver, pad_set: &mut PadSet, - element: &glib::WeakRef, status: &Status, -) -> Result<()> { +) -> Result { // Congestion-controller send estimate; None when unavailable, then this arm parks forever. let mut send_bandwidth = session.send_bandwidth(); @@ -253,12 +274,12 @@ async fn run_loop( loop { tokio::select! { - // Local close: quiet Ok, no ERROR. - _ = shutdown.changed() => return Ok(()), + // Local close: quiet stop, no ERROR. + _ = shutdown.changed() => return Ok(ExitReason::Stopped), // Remote death: propagate the Err so the wrapper posts ERROR to the bus. result = &mut closed => { result?; - return Ok(()); + return Ok(ExitReason::Stopped); } // A closed estimate stops the polling. bitrate = async { @@ -276,24 +297,30 @@ async fn run_loop( // A fresh incarnation is not failed even if a previous one was. status.clear_failed(&pad); } - Some(DataMsg::Caps { pad, generation, caps }) => pad_set.caps(&pad, generation, &caps)?, + Some(DataMsg::Caps { pad, generation, caps }) => { + if pad_set.caps(&pad, generation, &caps)? { + return Ok(ExitReason::Ended); + } + } Some(DataMsg::Segment { pad, generation, segment }) => pad_set.segment(&pad, generation, segment), - Some(DataMsg::Buffer { pad, generation, data, pts }) => pad_set.buffer(&pad, generation, data, pts)?, + Some(DataMsg::Buffer { pad, generation, data, pts }) => { + if pad_set.buffer(&pad, generation, data, pts)? { + return Ok(ExitReason::Ended); + } + } Some(DataMsg::Eos { pad, generation }) => { if pad_set.eos(&pad, generation)? { - post_element_eos(element); - return Ok(()); + return Ok(ExitReason::Ended); } } // A release can complete the element if the remaining pads have all ended. Some(DataMsg::DropPad { pad, generation }) => { if pad_set.drop_pad(&pad, generation) { - post_element_eos(element); - return Ok(()); + return Ok(ExitReason::Ended); } } // The element dropped the sender (state change to READY) without a shutdown signal. - None => return Ok(()), + None => return Ok(ExitReason::Stopped), }, } } @@ -332,6 +359,7 @@ impl PadSet { let _ = old.finalize(); } self.eos.remove(pad); + self.status.clear_failed(pad); self.active.insert(pad.to_string(), generation); } @@ -346,11 +374,12 @@ impl PadSet { !self.active.is_empty() && self.active.keys().all(|name| self.eos.contains(name)) } - /// `Err` is session-fatal (the catalog is gone); a bad-caps failure invalidates only this pad. - fn caps(&mut self, pad: &str, generation: u64, caps: &gst::Caps) -> Result<()> { + /// `Ok(true)` means the element is now complete (a pad failure can finish it); `Err` is session-fatal + /// (the catalog is gone). A bad-caps failure invalidates only this pad. + fn caps(&mut self, pad: &str, generation: u64, caps: &gst::Caps) -> Result { if !self.is_current(pad, generation) { gst::warning!(CAT, "caps for stale or unknown pad {pad}, dropping"); - return Ok(()); + return Ok(false); } let broadcast = self.broadcast.clone(); let catalog = self.catalog.clone().context("catalog already finalized")?; @@ -361,20 +390,23 @@ impl PadSet { .set_caps(broadcast, catalog, caps); if let Err(err) = result { gst::warning!(CAT, "invalidating pad {pad}: {err:?}"); - self.fail_pad(pad); + self.fail_pad(pad, generation); + return Ok(self.all_ended()); } - Ok(()) + Ok(false) } /// Drops a pad's producer (closing its track) and marks it failed so the chain returns a FlowError /// on its next buffer. The pad stays a member, so the session and the other pads keep going. - fn fail_pad(&mut self, pad: &str) { + fn fail_pad(&mut self, pad: &str, generation: u64) { if let Some(mut p) = self.pads.remove(pad) { if let Err(err) = p.finalize() { gst::warning!(CAT, "finalize on failed pad {pad}: {err:?}"); } } - self.status.mark_failed(pad); + self.status.mark_failed(pad, generation); + // The failed pad's track is finalized, so it is terminal for EOS aggregation (counts as ended). + self.eos.insert(pad.to_string()); } fn segment(&mut self, pad: &str, generation: u64, segment: gst::Segment) { @@ -389,28 +421,30 @@ impl PadSet { .set_segment(segment); } - fn buffer(&mut self, pad: &str, generation: u64, data: Bytes, pts: Option) -> Result<()> { + /// `Ok(true)` means the element is now complete (a pad failure can finish it). + fn buffer(&mut self, pad: &str, generation: u64, data: Bytes, pts: Option) -> Result { if !self.is_current(pad, generation) { gst::warning!(CAT, "buffer for stale or unknown pad {pad}, dropping"); - return Ok(()); + return Ok(false); } if self.eos.contains(pad) { gst::warning!(CAT, "buffer after EOS on pad {pad}, dropping"); - return Ok(()); + return Ok(false); } let result = match self.pads.get_mut(pad) { Some(p) => p.push_buffer(data, pts), None => { gst::warning!(CAT, "buffer before caps on pad {pad}, dropping"); - return Ok(()); + return Ok(false); } }; // A bad bitstream invalidates only this pad; the session and other pads continue. if let Err(err) = result { gst::warning!(CAT, "invalidating pad {pad}: {err:?}"); - self.fail_pad(pad); + self.fail_pad(pad, generation); + return Ok(self.all_ended()); } - Ok(()) + Ok(false) } /// Returns whether every active pad has now ended, so the caller posts the element EOS once. @@ -446,8 +480,9 @@ impl PadSet { self.all_ended() } - /// Idempotent (skips already-finalized pads); the returned order proves "catalog last". - fn finalize_all(&mut self) -> Vec { + /// Idempotent (skips already-finalized pads); the returned order proves "catalog last". `Err` means + /// the catalog could not be closed cleanly, which the caller surfaces instead of posting EOS. + fn finalize_all(&mut self) -> Result> { let mut order = Vec::new(); for (name, pad) in self.pads.iter_mut() { match pad.finalize() { @@ -458,12 +493,10 @@ impl PadSet { } // finish() closes both the hang and MSF tracks; a bare drop would not. if let Some(mut catalog) = self.catalog.take() { - match catalog.finish() { - Ok(()) => order.push("catalog".to_string()), - Err(err) => gst::warning!(CAT, "finalize catalog: {err:?}"), - } + catalog.finish().context("finalize catalog")?; + order.push("catalog".to_string()); } - order + Ok(order) } } @@ -486,6 +519,8 @@ struct Pad { segment_info: Option, // Kept only to map a buffer PTS to a running time. segment: Option>, + /// Last emitted MoQ timestamp (micros); a frame that would regress below it is dropped. + last_emitted_micros: Option, } impl Pad { @@ -496,6 +531,7 @@ impl Pad { state: PadState::NoSegment, segment_info: None, segment: None, + last_emitted_micros: None, } } @@ -554,8 +590,8 @@ impl Pad { } /// Pure of the importer, so it can be tested with real segments and no codec. - fn frame_timestamp(&self, pts: Option) -> FrameDecision { - match self.state { + fn frame_timestamp(&mut self, pts: Option) -> FrameDecision { + let decision = match self.state { PadState::Active => { // to_running_time_full is signed: a buffer before the segment returns Negative, which // frame_micros drops; to_running_time would instead clip it to None and lose the reason. @@ -564,12 +600,21 @@ impl Pad { .as_ref() .zip(pts) .and_then(|(segment, pts)| segment.to_running_time_full(pts)) - .map(signed_nanos); + .and_then(signed_nanos); frame_micros(running_time) } PadState::NoSegment => FrameDecision::Drop("buffer before a valid SEGMENT"), PadState::Invalid => FrameDecision::Drop("buffer on an invalidated timeline"), + }; + // Frame-level monotonicity: segment continuity (base) does not by itself stop a frame from + // regressing, so drop a timestamp that would go backwards rather than emit out of order. + if let FrameDecision::Emit(micros) = decision { + if self.last_emitted_micros.is_some_and(|last| micros < last) { + return FrameDecision::Drop("non-monotonic timestamp (regressed)"); + } + self.last_emitted_micros = Some(micros); } + decision } fn push_buffer(&mut self, mut data: Bytes, pts: Option) -> Result<()> { @@ -606,7 +651,11 @@ impl Pad { /// The spike only carries H.264 byte-stream. Checked synchronously at the event boundary (so an /// unsupported caps is rejected with NotNegotiated) and again in `set_caps` as the worker's defence. pub(super) fn caps_supported(caps: &gst::CapsRef) -> bool { - caps.structure(0).map(|s| s.name() == "video/x-h264").unwrap_or(false) + let Some(s) = caps.structure(0) else { return false }; + // FramedFormat::Avc3 consumes Annex-B access units, so require byte-stream/au, not just the type. + s.name() == "video/x-h264" + && s.get::("stream-format").is_ok_and(|f| f == "byte-stream") + && s.get::("alignment").is_ok_and(|a| a == "au") } fn segment_info(segment: &gst::Segment) -> SegmentInfo { @@ -625,10 +674,11 @@ fn segment_info(segment: &gst::Segment) -> SegmentInfo { } /// Flattens a signed running time to nanos, keeping the sign so the timeline can drop negatives. -fn signed_nanos(running_time: gst::Signed) -> i64 { +/// None on overflow of u64 nanos into i64 (unreachable in practice). +fn signed_nanos(running_time: gst::Signed) -> Option { match running_time { - gst::Signed::Positive(time) => time.nseconds() as i64, - gst::Signed::Negative(time) => -(time.nseconds() as i64), + gst::Signed::Positive(time) => i64::try_from(time.nseconds()).ok(), + gst::Signed::Negative(time) => i64::try_from(time.nseconds()).ok().map(|nanos| -nanos), } } @@ -665,7 +715,7 @@ mod tests { set.caps("video", 0, &h264_caps()).unwrap(); set.caps("audio", 0, &h264_caps()).unwrap(); - let order = set.finalize_all(); + let order = set.finalize_all().unwrap(); assert_eq!( order.last().map(String::as_str), Some("catalog"), @@ -674,7 +724,7 @@ mod tests { assert!(order.contains(&"video".to_string()) && order.contains(&"audio".to_string())); // A second pass finalizes nothing again. - assert!(set.finalize_all().is_empty()); + assert!(set.finalize_all().unwrap().is_empty()); } #[test] @@ -686,7 +736,7 @@ mod tests { assert!(set.eos("video", 0).unwrap()); // Only the catalog is left; the pad is not finalized twice. - assert_eq!(set.finalize_all(), vec!["catalog".to_string()]); + assert_eq!(set.finalize_all().unwrap(), vec!["catalog".to_string()]); } #[test] @@ -750,7 +800,7 @@ mod tests { assert!(set.eos("video", 0).unwrap()); assert!(set.eos("video", 0).unwrap(), "duplicate EOS stays complete"); // Finalized exactly once: only the catalog remains for the exit sweep. - assert_eq!(set.finalize_all(), vec!["catalog".to_string()]); + assert_eq!(set.finalize_all().unwrap(), vec!["catalog".to_string()]); } // A buffer that arrives after the pad's EOS is dropped (the producer is already finalized). @@ -787,10 +837,16 @@ mod tests { } #[test] - fn caps_supported_accepts_only_h264() { + fn caps_supported_requires_h264_byte_stream_au() { gst::init().unwrap(); assert!(caps_supported(&h264_caps())); assert!(!caps_supported(&audio_caps())); + // H.264 but not byte-stream/au is rejected: FramedFormat::Avc3 consumes Annex-B AUs. + let avc = gst::Caps::builder("video/x-h264") + .field("stream-format", "avc") + .field("alignment", "au") + .build(); + assert!(!caps_supported(&avc), "length-prefixed avc must be rejected"); } // A pad with unsupported caps is invalidated alone: marked failed and its producer dropped, while @@ -807,25 +863,126 @@ mod tests { set.caps("data", 0, &audio_caps()).is_ok(), "a pad error must not kill the session" ); - assert!(set.status.is_failed("data"), "the bad pad is marked failed"); - assert!(!set.status.is_failed("video"), "the good pad is untouched"); + assert!(set.status.is_failed("data", 0), "the bad pad is marked failed"); + assert!(!set.status.is_failed("video", 0), "the good pad is untouched"); assert!(set.pads.contains_key("video"), "the good pad keeps its producer"); assert!(!set.pads.contains_key("data"), "the failed pad's producer is dropped"); } + // The buffer/bitstream failure path (decode_frame error), distinct from the caps path: a malformed + // Annex-B NAL invalidates only that pad, which is then terminal for aggregation. + #[test] + fn malformed_bitstream_fails_only_that_pad() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.add_pad("audio", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + set.caps("audio", 0, &h264_caps()).unwrap(); + set.segment("video", 0, time_segment()); + + // Annex-B start code + a NAL header with the forbidden_zero_bit set: a real importer error. + let bad = Bytes::from_static(&[0x00, 0x00, 0x00, 0x01, 0x80]); + assert!( + set.buffer("video", 0, bad, Some(gst::ClockTime::ZERO)).is_ok(), + "a bitstream error must not kill the session" + ); + assert!( + set.status.is_failed("video", 0), + "the bad-bitstream pad is marked failed" + ); + assert!(!set.status.is_failed("audio", 0), "the other pad is untouched"); + assert!(!set.pads.contains_key("video"), "the failed pad's producer is dropped"); + // The failed pad is terminal: the audio EOS completes the element. + assert!( + set.eos("audio", 0).unwrap(), + "the failed video pad no longer blocks completion" + ); + } + + // A failed pad is terminal (its track is finalized), so it stops blocking aggregation: the element + // completes once the remaining good pads end. + #[test] + fn a_failed_pad_is_terminal_for_aggregation() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.add_pad("data", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + // "data" fails on unsupported caps; the element is not complete yet (video still active). + assert!(!set.caps("data", 0, &audio_caps()).unwrap()); + // video EOS now finishes the element: data (failed, terminal) + video (EOS) are both ended. + assert!( + set.eos("video", 0).unwrap(), + "the failed pad no longer blocks completion" + ); + } + + // Failing the last pending pad completes the element directly (caps reports completion). + #[test] + fn failing_the_last_pending_pad_completes_the_element() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.add_pad("data", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + assert!(!set.eos("video", 0).unwrap(), "video ended but data is still pending"); + assert!( + set.caps("data", 0, &audio_caps()).unwrap(), + "failing the last pending pad completes the element" + ); + } + // A finalized catalog is a session failure: caps returns Err so the worker tears the session down. #[test] fn caps_after_catalog_finalized_is_a_session_error() { gst::init().unwrap(); let mut set = pad_set(); set.add_pad("video", 0); - set.finalize_all(); + set.finalize_all().unwrap(); assert!( set.caps("video", 0, &h264_caps()).is_err(), "a gone catalog is session-fatal" ); } + // Frame monotonicity: a segment accepted on base alone can still place a frame before one already + // emitted; that regression is dropped, not emitted out of order. + #[test] + fn regressing_timestamp_is_dropped() { + gst::init().unwrap(); + let mut pad = Pad::new(); + pad.set_segment(time_segment_at(0, 0)); + assert_eq!( + pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(10_000))), + FrameDecision::Emit(10_000_000) + ); + // New segment: base advances past the previous base (so it is accepted) but below the running + // time already emitted. + pad.set_segment(time_segment_at(0, 5_000)); + assert_eq!(pad.state, PadState::Active); + // Its first frame maps to 5s, which regresses below the 10s already emitted: dropped. + assert!(matches!( + pad.frame_timestamp(Some(gst::ClockTime::ZERO)), + FrameDecision::Drop(_) + )); + // A later frame that catches up past 10s emits again. + assert_eq!( + pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(6_000))), + FrameDecision::Emit(11_000_000) + ); + } + + // The failure set is scoped to the generation, so a pad recreated with the same name does not + // inherit a previous incarnation's failure even before the worker clears it. + #[test] + fn failure_is_scoped_to_the_generation() { + let status = Status::default(); + status.mark_failed("video", 0); + assert!(status.is_failed("video", 0), "the failed incarnation is failed"); + assert!(!status.is_failed("video", 1), "a newer incarnation is not failed"); + } + // SEGMENT before CAPS: the pad is created and the segment retained when CAPS arrives. #[test] fn segment_before_caps_is_retained() { @@ -840,9 +997,11 @@ mod tests { assert!(pad.framed.is_some()); } - // Firing the watch drops the receiver, waking a sender parked on the full channel with Err. + // The tokio property SessionHandle::stop relies on: dropping the receiver (which the worker does + // when run_session returns) wakes a sender parked on the full channel with Err, so a stop never + // deadlocks a chain thread applying backpressure. The element-level path needs a connected session. #[test] - fn shutdown_via_watch_releases_a_blocked_send() { + fn dropped_receiver_wakes_blocked_send() { let runtime = tokio::runtime::Runtime::new().unwrap(); let (data_tx, data_rx) = mpsc::channel::(DATA_CHANNEL_BOUND); let (shutdown_tx, mut shutdown_rx) = watch::channel(false); @@ -892,7 +1051,7 @@ mod tests { // A pad with no SEGMENT yet drops buffers (NoSegment), distinct from an invalidated timeline. #[test] fn pad_without_segment_drops_buffers() { - let pad = Pad::new(); + let mut pad = Pad::new(); assert_eq!(pad.state, PadState::NoSegment); assert!(matches!( pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(5))), diff --git a/rs/moq-gst/src/spike/timeline.rs b/rs/moq-gst/src/spike/timeline.rs index 27340f05b..693636c13 100644 --- a/rs/moq-gst/src/spike/timeline.rs +++ b/rs/moq-gst/src/spike/timeline.rs @@ -128,10 +128,10 @@ mod tests { assert!(matches!(frame_micros(None), FrameDecision::Drop(_))); } - // Stateless conversion: two pads sharing one running-time clock keep their relative offset, - // regardless of call order. + // frame_micros is a stateless conversion (no last-emitted state, no per-pad anchor). The real + // A/V-offset guarantee is exercised by two_pads_keep_av_aligned_through_real_segments. #[test] - fn shared_timeline_keeps_av_aligned() { + fn frame_micros_is_stateless() { assert_eq!(frame_micros(Some(7_000_000)), FrameDecision::Emit(7_000)); assert_eq!(frame_micros(Some(5_000_000)), FrameDecision::Emit(5_000)); } diff --git a/rs/moq-gst/tests/element.rs b/rs/moq-gst/tests/element.rs index 443496875..9d5925390 100644 --- a/rs/moq-gst/tests/element.rs +++ b/rs/moq-gst/tests/element.rs @@ -1,7 +1,7 @@ //! Hermetic element-boundary tests: behaviour reachable without a live MoQ session. //! -//! Session-dependent flows (multipad EOS aggregation, per-pad errors, FLUSH, remote death) require a -//! real relay, so they live in the relay-backed harness, not here. +//! Flows that need a connected session (multipad EOS aggregation, per-pad error propagation, remote +//! close) are validated against a real relay, separately from this hermetic suite. use std::sync::Once; @@ -48,7 +48,8 @@ fn missing_url_fails_state_change() { let _ = sink.set_state(gst::State::Null); } -// A connect that cannot succeed surfaces as an ERROR on the bus, not a silent log. +// A connect that cannot succeed surfaces as an ERROR on the bus (not a silent log) and leaves the +// element disconnected. The `.invalid` host fails fast at DNS resolution in this test environment. #[test] fn connect_failure_posts_error_to_bus() { init(); @@ -62,8 +63,10 @@ fn connect_failure_posts_error_to_bus() { let _ = pipeline.set_state(gst::State::Playing); let bus = pipeline.bus().expect("pipeline bus"); - let msg = bus.timed_pop_filtered(gst::ClockTime::from_seconds(20), &[gst::MessageType::Error]); + let msg = bus.timed_pop_filtered(gst::ClockTime::from_seconds(10), &[gst::MessageType::Error]); + let connected = sink.property::("connected"); let _ = pipeline.set_state(gst::State::Null); assert!(msg.is_some(), "a failed connect must post an ERROR to the bus"); + assert!(!connected, "a failed connect must leave connected = false"); } From 4d24a4434f05cfcd730a5f3f9d3163a68581cbb1 Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Mon, 15 Jun 2026 19:05:23 -0600 Subject: [PATCH 08/13] moq-gst: codec/caps parity for moqsinkspike (caps factory + frame-through) Close the codec gap in the parallel spike so it accepts the same media as moqsink, without regressing the rewrite's seams (shared running time, per-pad membership, per-pad error isolation, EOS-after-finalize): - Expand the pad template and replace the hardcoded Avc3 path with a caps -> Framed factory over seven media types: H.264/H.265 (byte-stream/au), AV1, VP8, VP9, AAC (mpegversion=4, raw), Opus. Every codec converges on one Framed; only the construction from caps differs. - Keep caps_supported media-type only. The factory enforces per-codec specifics (byte-stream/au, AAC mpegversion/raw plus codec_data) and derives Opus channels/rate with defaults, failing just that pad on a bad or missing detail, never the session. - finalize only finishes an initialized producer. A lazy importer (H.265/AV1/VP8/VP9) given CAPS then EOS with no frame never created its track, so finish() would error "not initialized"; it now no-ops while a real finish error on a live track still surfaces. - Tests: strong frame-through for H.264 and Opus that observe a real emitted frame on the rendition track (not just the catalog rendition); creation tests for H.265/AV1/VP8/VP9/AAC; negatives for length-prefixed H.264, byte-stream-less H.265, AAC without codec_data, and AAC with wrong mpegversion; a clean CAPS-then-EOS test for the lazy codecs; caps_supported retargeted to the seven media types. CONTEXT Rejected alternatives: - Per-codec field checks inside caps_supported: it stays thin and media-type only, so the factory is the one enforcement point and a bad detail fails the pad, not the session. - A new MediaPad/MediaProducer abstraction: Pad already holds an Option, so a factory function is enough. - Frame-through fixtures for all seven codecs: only H.264 and Opus get strong frame-through; the rest get creation tests, with real transmission left to the relay harness. - A moq-mux change to make a lazy finish() tolerant: the no-track no-op is kept spike-local in Pad::finalize via the existing Framed::track() probe, so moq-mux behavior is unchanged and real finish errors on live tracks still propagate. - Reintroducing the production sink's per-pad reference_pts anchoring: the spike keeps its shared running time. Key decisions: - caps_supported is media-type only; per-codec specifics live in the factory, which fails the pad (not the session) on a bad or missing detail. - A frame-through test must observe a real emitted frame: it subscribes to the rendition track and asserts latest().is_some(), because the catalog rendition is published from the SPS/config and would pass with no frame at all. - CAPS then EOS before the first frame must be clean: a lazy importer never created its track, so finalize no-ops instead of turning "not initialized" into a session error. --- rs/moq-gst/src/spike/imp.rs | 31 ++- rs/moq-gst/src/spike/session.rs | 337 +++++++++++++++++++++++++++++--- 2 files changed, 338 insertions(+), 30 deletions(-) diff --git a/rs/moq-gst/src/spike/imp.rs b/rs/moq-gst/src/spike/imp.rs index c458b62a4..253bc4b2a 100644 --- a/rs/moq-gst/src/spike/imp.rs +++ b/rs/moq-gst/src/spike/imp.rs @@ -145,11 +145,32 @@ impl ElementImpl for MoqSinkSpike { fn pad_templates() -> &'static [gst::PadTemplate] { static PAD_TEMPLATES: LazyLock> = LazyLock::new(|| { - // Spike scope: a single H.264 pad; the point is the lifecycle, not codec breadth. - let caps = gst::Caps::builder("video/x-h264") - .field("stream-format", "byte-stream") - .field("alignment", "au") - .build(); + // Every codec that converges on moq_mux::import::Framed; per-codec specifics are validated + // when the producer is built from caps, not here. + let mut caps = gst::Caps::new_empty(); + caps.merge( + gst::Caps::builder("video/x-h264") + .field("stream-format", "byte-stream") + .field("alignment", "au") + .build(), + ); + caps.merge( + gst::Caps::builder("video/x-h265") + .field("stream-format", "byte-stream") + .field("alignment", "au") + .build(), + ); + caps.merge(gst::Caps::builder("video/x-av1").build()); + caps.merge(gst::Caps::builder("video/x-vp8").build()); + caps.merge(gst::Caps::builder("video/x-vp9").build()); + caps.merge( + gst::Caps::builder("audio/mpeg") + .field("mpegversion", 4i32) + .field("stream-format", "raw") + .build(), + ); + caps.merge(gst::Caps::builder("audio/x-opus").build()); + let templ = gst::PadTemplate::new("sink_%u", gst::PadDirection::Sink, gst::PadPresence::Request, &caps).unwrap(); vec![templ] diff --git a/rs/moq-gst/src/spike/session.rs b/rs/moq-gst/src/spike/session.rs index 8f51656a8..3b1ce044e 100644 --- a/rs/moq-gst/src/spike/session.rs +++ b/rs/moq-gst/src/spike/session.rs @@ -11,6 +11,7 @@ use gst::prelude::*; use tokio::sync::{mpsc, watch}; use hang::moq_net; +use moq_mux::import::{Framed, FramedFormat}; use super::timeline::{classify_segment, frame_micros, FrameDecision, SegmentDecision, SegmentInfo}; use super::MoqSinkSpike as Element; @@ -542,24 +543,60 @@ impl Pad { caps: &gst::Caps, ) -> Result<()> { let structure = caps.structure(0).context("empty caps")?; - ensure!( - caps_supported(caps), - "spike only carries H.264 byte-stream, got {}", - structure.name() - ); // Identical caps re-sent (sticky event): keep the live producer, don't finalize and recreate. if self.framed.is_some() && self.caps.as_ref() == Some(caps) { return Ok(()); } // Renegotiation: finalize the previous producer before replacing it (closed once, not abandoned). self.finalize()?; - let mut empty = Bytes::new(); - self.framed = Some(moq_mux::import::Framed::new( - broadcast, - catalog, - moq_mux::import::FramedFormat::Avc3, - &mut empty, - )?); + // Every codec converges on one Framed; only the caps -> producer construction differs. A bad or + // unsupported caps is a per-pad error (the caller invalidates just this pad), not session-fatal. + let framed: Framed = match structure.name().as_str() { + "video/x-h264" => { + require_byte_stream_au(structure)?; + Framed::new(broadcast, catalog, FramedFormat::Avc3, &mut Bytes::new())? + } + "video/x-h265" => { + require_byte_stream_au(structure)?; + Framed::new(broadcast, catalog, FramedFormat::Hev1, &mut Bytes::new())? + } + "video/x-av1" => Framed::new(broadcast, catalog, FramedFormat::Av01, &mut Bytes::new())?, + "video/x-vp8" => Framed::new(broadcast, catalog, FramedFormat::Vp8, &mut Bytes::new())?, + "video/x-vp9" => Framed::new(broadcast, catalog, FramedFormat::Vp9, &mut Bytes::new())?, + "audio/mpeg" => { + // AAC: the AudioSpecificConfig rides in caps as codec_data, not in the bitstream. + ensure!( + structure.get::("mpegversion").is_ok_and(|v| v == 4), + "AAC requires mpegversion=4" + ); + ensure!( + structure.get::("stream-format").is_ok_and(|f| f == "raw"), + "AAC requires stream-format=raw" + ); + let codec_data = structure + .get::("codec_data") + .context("AAC caps missing codec_data")?; + let map = codec_data.map_readable().context("failed to map AAC codec_data")?; + let mut data = Bytes::copy_from_slice(map.as_slice()); + Framed::new(broadcast, catalog, FramedFormat::Aac, &mut data)? + } + "audio/x-opus" => { + // Opus: GStreamer gives channels/rate in caps (not an OpusHead), so build the config here. + let channels: i32 = structure.get("channels").unwrap_or(2); + let rate: i32 = structure.get("rate").unwrap_or(48_000); + let channel_count = u32::try_from(channels) + .with_context(|| format!("Opus caps has negative channel count {channels}"))?; + let sample_rate = + u32::try_from(rate).with_context(|| format!("Opus caps has negative sample rate {rate}"))?; + let config = moq_mux::codec::opus::Config { + sample_rate, + channel_count, + }; + moq_mux::codec::opus::Import::new(broadcast, catalog, config)?.into() + } + other => anyhow::bail!("unsupported caps: {other}"), + }; + self.framed = Some(framed); self.caps = Some(caps.clone()); Ok(()) } @@ -640,7 +677,12 @@ impl Pad { fn finalize(&mut self) -> Result { match self.framed.take() { Some(mut framed) => { - framed.finish()?; + // A lazy codec (H.265/AV1/VP8/VP9) given CAPS but no frame never created its track, so + // there is nothing to flush and finish() would error "not initialized". track() is Ok only + // once a track exists; a real finish error on an initialized one still surfaces. + if framed.track().is_ok() { + framed.finish()?; + } Ok(true) } None => Ok(false), @@ -648,14 +690,31 @@ impl Pad { } } -/// The spike only carries H.264 byte-stream. Checked synchronously at the event boundary (so an -/// unsupported caps is rejected with NotNegotiated) and again in `set_caps` as the worker's defence. +/// Media types the spike can build a producer for. Checked synchronously at the event boundary (an +/// unsupported caps is rejected with NotNegotiated) and again in `set_caps`. Per-codec specifics +/// (byte-stream/au, AAC codec_data) are enforced when the producer is built, so a bad detail fails +/// that pad, not the session. pub(super) fn caps_supported(caps: &gst::CapsRef) -> bool { let Some(s) = caps.structure(0) else { return false }; - // FramedFormat::Avc3 consumes Annex-B access units, so require byte-stream/au, not just the type. - s.name() == "video/x-h264" - && s.get::("stream-format").is_ok_and(|f| f == "byte-stream") - && s.get::("alignment").is_ok_and(|a| a == "au") + matches!( + s.name().as_str(), + "video/x-h264" | "video/x-h265" | "video/x-av1" | "video/x-vp8" | "video/x-vp9" | "audio/mpeg" | "audio/x-opus" + ) +} + +/// FramedFormat::Avc3/Hev1 consume Annex-B access units, so H.264/H.265 caps must be byte-stream/au. +fn require_byte_stream_au(s: &gst::StructureRef) -> Result<()> { + ensure!( + s.get::("stream-format").is_ok_and(|f| f == "byte-stream"), + "{} requires stream-format=byte-stream", + s.name() + ); + ensure!( + s.get::("alignment").is_ok_and(|a| a == "au"), + "{} requires alignment=au", + s.name() + ); + Ok(()) } fn segment_info(segment: &gst::Segment) -> SegmentInfo { @@ -694,6 +753,17 @@ mod tests { PadSet::new(broadcast, catalog, Arc::new(Status::default())) } + // A producer that actually emitted has at least one group. latest() is the cross-crate-visible + // sync read; moq-net's assert_group helper is gated to that crate's own tests. + fn emitted_a_frame(set: &PadSet, track: &str) -> bool { + set.broadcast + .consume() + .subscribe_track(&moq_net::Track::new(track)) + .expect("the rendition track is published") + .latest() + .is_some() + } + fn h264_caps() -> gst::Caps { gst::Caps::builder("video/x-h264") .field("stream-format", "byte-stream") @@ -836,17 +906,234 @@ mod tests { assert!(set.eos("video", 1).unwrap()); } + // caps_supported is media-type only; per-codec specifics are the factory's/template's job. + #[test] + fn caps_supported_accepts_known_media_types() { + gst::init().unwrap(); + for media in [ + "video/x-h264", + "video/x-h265", + "video/x-av1", + "video/x-vp8", + "video/x-vp9", + "audio/mpeg", + "audio/x-opus", + ] { + assert!( + caps_supported(&gst::Caps::builder(media).build()), + "{media} must be accepted" + ); + } + assert!(!caps_supported(&audio_caps()), "an unsupported media type is rejected"); + } + + // Frame-through (real init + emit): a keyframe AU emits a frame to the rendition track, not just + // the SPS-published rendition. #[test] - fn caps_supported_requires_h264_byte_stream_au() { + fn frame_through_h264_emits_a_frame() { gst::init().unwrap(); - assert!(caps_supported(&h264_caps())); - assert!(!caps_supported(&audio_caps())); - // H.264 but not byte-stream/au is rejected: FramedFormat::Avc3 consumes Annex-B AUs. - let avc = gst::Caps::builder("video/x-h264") + let mut set = pad_set(); + set.add_pad("video", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + set.segment("video", 0, time_segment()); + + // SPS (the importer's own proven bytes) + PPS + a type-5 IDR slice. The slice header is not + // fully parsed for IDR, but this drives the real decode_frame path and emits one frame. + let sps: &[u8] = &[ + 0x67, 0x42, 0xc0, 0x1f, 0xda, 0x01, 0x40, 0x16, 0xe9, 0xb8, 0x08, 0x08, 0x0a, 0x00, 0x00, 0x07, 0xd0, 0x00, + 0x01, 0xd4, 0xc0, 0x80, + ]; + let pps: &[u8] = &[0x68, 0xce, 0x3c, 0x80]; + let idr: &[u8] = &[0x65, 0x88, 0x84, 0x00, 0x21]; + let mut au = Vec::new(); + for nal in [sps, pps, idr] { + au.extend_from_slice(&[0, 0, 0, 1]); + au.extend_from_slice(nal); + } + set.buffer("video", 0, Bytes::from(au), Some(gst::ClockTime::ZERO)) + .unwrap(); + + let snapshot = set.catalog.as_ref().unwrap().snapshot(); + let track = snapshot.video.renditions.keys().next().expect("a video rendition"); + // The discriminant: a real frame reached the track. The rendition alone would pass on the SPS. + assert!(emitted_a_frame(&set, track), "the IDR AU emitted a frame to the track"); + assert_eq!( + snapshot.video.renditions.len(), + 1, + "the SPS published exactly one rendition" + ); + } + + // Frame-through for Opus: the rendition publishes from caps, then a packet emits a real frame. + #[test] + fn frame_through_opus_emits_a_frame() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("audio", 0); + let caps = gst::Caps::builder("audio/x-opus") + .field("channels", 2i32) + .field("rate", 48_000i32) + .build(); + set.caps("audio", 0, &caps).unwrap(); + assert_eq!( + set.catalog.as_ref().unwrap().snapshot().audio.renditions.len(), + 1, + "Opus publishes its rendition from channels/rate at construction" + ); + + set.segment("audio", 0, time_segment()); + // The Opus importer carries the packet verbatim; any non-empty payload is one frame. + set.buffer( + "audio", + 0, + Bytes::from_static(&[0xfc, 0xff, 0xfe]), + Some(gst::ClockTime::ZERO), + ) + .unwrap(); + + let track = set + .catalog + .as_ref() + .unwrap() + .snapshot() + .audio + .renditions + .keys() + .next() + .expect("an audio rendition") + .clone(); + assert!( + emitted_a_frame(&set, &track), + "the Opus packet emitted a frame to the track" + ); + } + + // Creation only (decision c): these importers build from an empty init and create the track lazily, + // so assert the producer exists, not a rendition. + #[test] + fn creation_succeeds_for_video_codecs() { + gst::init().unwrap(); + let mut set = pad_set(); + for (i, media) in ["video/x-h265", "video/x-av1", "video/x-vp8", "video/x-vp9"] + .into_iter() + .enumerate() + { + let name = format!("v{i}"); + set.add_pad(&name, 0); + // H.265 shares H.264's Annex-B requirement; AV1/VP8/VP9 carry no such specifics. + let mut builder = gst::Caps::builder(media); + if media == "video/x-h265" { + builder = builder.field("stream-format", "byte-stream").field("alignment", "au"); + } + set.caps(&name, 0, &builder.build()).unwrap(); + assert!(set.pads[&name].framed.is_some(), "{media} producer built"); + } + } + + #[test] + fn creation_succeeds_for_aac_with_codec_data() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("audio", 0); + // AudioSpecificConfig: AAC-LC, 44100 Hz, stereo. + let codec_data = gst::Buffer::from_slice([0x12u8, 0x10]); + let caps = gst::Caps::builder("audio/mpeg") + .field("mpegversion", 4i32) + .field("stream-format", "raw") + .field("codec_data", &codec_data) + .build(); + set.caps("audio", 0, &caps).unwrap(); + assert!(set.pads["audio"].framed.is_some()); + } + + // Missing codec_data is a per-pad error (the factory cannot build AAC), not a session failure. + #[test] + fn aac_without_codec_data_fails_the_pad() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("audio", 0); + let caps = gst::Caps::builder("audio/mpeg") + .field("mpegversion", 4i32) + .field("stream-format", "raw") + .build(); + assert!( + set.caps("audio", 0, &caps).is_ok(), + "a missing codec_data fails the pad, not the session" + ); + assert!(set.status.is_failed("audio", 0), "the pad is marked failed"); + assert!(!set.pads.contains_key("audio"), "the failed pad's producer is dropped"); + } + + // Per-codec specifics that caps_supported delegates: the factory fails the pad on a bad detail. + #[test] + fn h264_length_prefixed_avc_fails_the_pad() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + // FramedFormat::Avc3 needs Annex-B, so length-prefixed avc is rejected by the factory. + let caps = gst::Caps::builder("video/x-h264") .field("stream-format", "avc") .field("alignment", "au") .build(); - assert!(!caps_supported(&avc), "length-prefixed avc must be rejected"); + set.caps("video", 0, &caps).unwrap(); + assert!(set.status.is_failed("video", 0), "length-prefixed avc fails the pad"); + assert!(!set.pads.contains_key("video")); + } + + #[test] + fn h265_without_byte_stream_au_fails_the_pad() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.caps("video", 0, &gst::Caps::builder("video/x-h265").build()) + .unwrap(); + assert!( + set.status.is_failed("video", 0), + "H.265 without byte-stream/au fails the pad" + ); + assert!(!set.pads.contains_key("video")); + } + + #[test] + fn aac_wrong_mpegversion_fails_the_pad() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("audio", 0); + let codec_data = gst::Buffer::from_slice([0x12u8, 0x10]); + let caps = gst::Caps::builder("audio/mpeg") + .field("mpegversion", 1i32) + .field("stream-format", "raw") + .field("codec_data", &codec_data) + .build(); + set.caps("audio", 0, &caps).unwrap(); + assert!(set.status.is_failed("audio", 0), "mpegversion!=4 fails the pad"); + assert!(!set.pads.contains_key("audio")); + } + + // CAPS then EOS with no frame: lazy importers (H.265/AV1/VP8/VP9) never created a track, so + // finalize must be a clean no-op rather than a "not initialized" session error. + #[test] + fn caps_then_eos_before_first_frame_is_clean_for_lazy_codecs() { + gst::init().unwrap(); + for media in ["video/x-h265", "video/x-av1", "video/x-vp8", "video/x-vp9"] { + let mut set = pad_set(); + set.add_pad("video", 0); + let mut builder = gst::Caps::builder(media); + if media == "video/x-h265" { + builder = builder.field("stream-format", "byte-stream").field("alignment", "au"); + } + set.caps("video", 0, &builder.build()).unwrap(); + // EOS before any frame must be clean, not a session error. + assert!( + set.eos("video", 0).is_ok(), + "{media}: CAPS->EOS with no frame must be clean" + ); + // The pad must still count as ended (it is terminal for aggregation). + assert!( + set.eos("video", 0).unwrap(), + "{media}: the EOS'd pad completes the element" + ); + } } // A pad with unsupported caps is invalidated alone: marked failed and its producer dropped, while From aa47c2b880657e31f7f244a6538a12f1160da32f Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Tue, 16 Jun 2026 13:26:41 -0600 Subject: [PATCH 09/13] moq-gst: hardening for moqsinkspike (FLUSH, B-frames, session/pad lifecycle races) Production-hardening pass over the parallel spike, all hermetically tested (62 unit + 3 element, clippy -D warnings, fmt). FLUSH plus the fixes from an external broadcast review: - FLUSH: handle FLUSH_START/STOP as a discontinuity plus a cancellation. The worker re-anchors the pad's timeline (NoSegment) so a post-seek rewinding SEGMENT is accepted instead of rejected, and a per-pad watch cuts a chain blocked on the bounded channel without tearing the session down. run_loop is biased so the out-of-band flush re-anchors before the post-flush SEGMENT that rides the data FIFO. - B-frames: drop the per-frame PTS monotonicity filter. The moq-mux container carries frames in decode order and documents that B-frames have non-monotonic presentation timestamps, so the filter silently dropped every B-frame (30-60% of frames on High-profile content). The wire order is already decode order. - Pad lifecycle: populate the membership maps before add_pad, roll back only the entry this attempt still owns on failure, and announce AddPad only after add_pad succeeds, so a duplicate or failed add never orphans the live pad nor seeds a phantom that blocks EOS aggregation forever. - Status races on restart: a session generation token. All observable state (connected, version, send_bitrate, failed) lives under one Mutex so "check the live generation, then write" is indivisible; a stale session's late writes and exit-reset become no-ops. failed is keyed by (session generation, pad generation), so a restart reusing the same pad never inherits a stale failure. - Worker panic: run the session worker on an inner task and surface a JoinError (panic or cancel) as element_error, instead of a silent death only observed when stop() reaps the JoinHandle. - Refactor: bundle the worker's inbound channels into one Inbound struct. CONTEXT Rejected alternatives: - Enforcing frame monotonicity on the DTS instead of removing the filter: best-practice in the abstract, but it needs DTS threaded through every layer and only guards against out-of-decode-order delivery, which GStreamer does not produce. Removing the filter to match the container contract is simpler and correct. - Gating only the exit-reset by generation: a stale session could still write connected/version/bitrate. Every observable write is gated. - Generation-gating with separate atomics: a load-then-store is not an atomic check-and-write, so a stale write can still interleave between the check and the store. One Mutex over the observable state makes it indivisible. - Serializing request_new_pad under a lock held across add_pad: add_pad emits pad-added synchronously, so a reentrant handler would deadlock on that lock, a worse failure than the narrow same-name-concurrent window it would close. That window is left documented, not closed. - A moq-mux Framed::discontinuity() for the FLUSH partial-AU reset: kept this change moq-gst-only. The partial-AU edge is documented and low-risk with alignment=au (one complete access unit per buffer). - catch_unwind via the futures crate for the worker panic: futures is not a direct dependency, so an inner task plus JoinError is used instead. Key decisions: - FLUSH cancellation uses a watch per pad: no new dependency, and per-pad so flushing one input never cancels another input's blocked send. send_or_flush is biased toward the flush arm and re-checks the watch after winning a channel permit, which narrows (does not atomically close) the capacity-vs-FLUSH_START tie; the residual one-buffer leak is absorbed by the re-anchor. - The failure set carries both generations; is_failed matches the live session, so neither a recreated pad nor a recreated session inherits a stale failure. - Honest test coverage: decode-order B-frame emit, generation scoping, and blocked-send cancellation have discriminating tests. The panic-to-bus path and the load-vs-store / same-name-concurrent races are covered by construction (a lock) or documented, not by a deterministic test, since they need a real session or an injected race. --- rs/moq-gst/src/spike/imp.rs | 213 +++++++-- rs/moq-gst/src/spike/session.rs | 735 +++++++++++++++++++++++++++----- 2 files changed, 826 insertions(+), 122 deletions(-) diff --git a/rs/moq-gst/src/spike/imp.rs b/rs/moq-gst/src/spike/imp.rs index 253bc4b2a..43e131051 100644 --- a/rs/moq-gst/src/spike/imp.rs +++ b/rs/moq-gst/src/spike/imp.rs @@ -9,8 +9,11 @@ use bytes::Bytes; use gst::glib; use gst::prelude::*; use gst::subclass::prelude::*; +use tokio::sync::watch; -use super::session::{caps_supported, DataMsg, ResolvedSettings, SessionHandle, Status, CAT}; +use super::session::{ + caps_supported, send_or_flush, DataMsg, FlushSignal, ResolvedSettings, SendOutcome, SessionHandle, Status, CAT, +}; /// Reject a frame past the MoQ frame limit (moq-net's MAX_FRAME_SIZE, 16 MiB): it could not be /// consumed anyway, and copying it would let hostile input drive an unbounded allocation. @@ -48,6 +51,10 @@ pub struct MoqSinkSpike { // worker discards the previous incarnation's in-flight messages. next_generation: AtomicU64, pad_generations: Mutex>, + // Per-pad FLUSH gate: FLUSH_START flips it true to cut a blocked send (the chain holds a cloned + // receiver); FLUSH_STOP flips it false. Per-pad so flushing one input never cancels another's send. + // Keyed with the generation so a stale FLUSH from a previous incarnation cannot flip the live pad. + pad_flush: Mutex)>>, } #[glib::object_subclass] @@ -187,19 +194,26 @@ impl ElementImpl for MoqSinkSpike { // Fixed per pad incarnation and captured here, so a buffer in flight from a released pad never // reads a successor's generation. let generation = self.next_generation.fetch_add(1, Ordering::Relaxed); + // One flush watch per pad: the sender lives in `pad_flush` (toggled by FLUSH_START/STOP), each + // pad function carries its own cloned receiver to cancel a blocked send. + let (flush_tx, flush_rx) = watch::channel(false); + let chain_flush = flush_rx.clone(); + let event_flush = flush_rx; let pad_builder = gst::Pad::builder_from_template(templ) .chain_function(move |pad, parent, buffer| { + let mut flush = chain_flush.clone(); MoqSinkSpike::catch_panic_pad_function( parent, || Err(gst::FlowError::Error), - |sink| sink.forward_buffer(pad, generation, buffer), + |sink| sink.forward_buffer(pad, generation, &mut flush, buffer), ) }) .event_function(move |pad, parent, event| { + let mut flush = event_flush.clone(); MoqSinkSpike::catch_panic_pad_function( parent, || false, - |sink| sink.handle_event(pad, generation, event), + |sink| sink.handle_event(pad, generation, &mut flush, event), ) }); @@ -208,28 +222,73 @@ impl ElementImpl for MoqSinkSpike { None => pad_builder.generated_name().build(), }; + // Populate the maps BEFORE the pad is visible to GStreamer, so a concurrent start_session seed + // never sees a pad without its generation. Capture the previous holders so a failed add_pad (a + // duplicate name, or a concurrent request that lost the race) rolls back without orphaning the + // live pad, and announce AddPad only after add_pad succeeds so the worker never gets a phantom. + // + // Known limitation (documented, not closed): two concurrent requests for the SAME name racing a + // start_session seed can leave the seed reading the loser's generation for the winner's pad, so + // the winner's events are then dropped as stale. Closing it would mean holding a lock across + // add_pad, which emits pad-added synchronously and would deadlock a reentrant handler; that risk + // is worse than the bug, whose trigger (same-name concurrent request_new_pad) apps do not produce. let name = pad.name().to_string(); - self.pad_generations.lock().unwrap().insert(name.clone(), generation); + let prev_gen = self.pad_generations.lock().unwrap().insert(name.clone(), generation); + let prev_flush = self + .pad_flush + .lock() + .unwrap() + .insert(name.clone(), (generation, flush_tx)); + + if self.obj().add_pad(&pad).is_err() { + // Roll back, but only if this attempt still owns the entry. A concurrent same-name request + // that won add_pad may have overwritten it; restoring our captured `prev` (or removing) would + // then clobber the live pad it just registered. Touch the maps only while they still hold our + // own generation. + { + let mut gens = self.pad_generations.lock().unwrap(); + if gens.get(name.as_str()) == Some(&generation) { + match prev_gen { + Some(g) => gens.insert(name.clone(), g), + None => gens.remove(&name), + }; + } + } + { + let mut flushes = self.pad_flush.lock().unwrap(); + if flushes.get(name.as_str()).map(|(g, _)| *g) == Some(generation) { + match prev_flush { + Some(f) => flushes.insert(name.clone(), f), + None => flushes.remove(&name), + }; + } + } + return None; + } - // Announce the pad before adding it: its own CAPS/buffers can only flow after add_pad, so they - // are never enqueued ahead of the AddPad that declares its membership. + // A request pad is linked by the caller only after this returns, so its CAPS/buffers cannot reach + // the worker ahead of this AddPad. let sender = self.session.lock().unwrap().as_ref().map(SessionHandle::sender); if let Some(sender) = sender { let _ = sender.blocking_send(DataMsg::AddPad { pad: name, generation }); } - self.obj().add_pad(&pad).ok()?; Some(pad) } fn release_pad(&self, pad: &gst::Pad) { let name = pad.name().to_string(); let generation = self.pad_generations.lock().unwrap().remove(&name); + // Dropping the watch sender wakes any send still blocked on this pad (changed() errors -> Flushing). + self.pad_flush.lock().unwrap().remove(&name); // Drop the session guard before blocking_send: holding it across a full-channel block deadlocks stop_session. let sender = { let session = self.session.lock().unwrap(); session.as_ref().map(SessionHandle::sender) }; if let (Some(sender), Some(generation)) = (sender, generation) { + // Uncancellable: a full data channel or an in-progress connect stalls release here, and the + // per-pad flush watch (removed just above) cannot cut it. Known control-path-blocking limit, + // shared with the AddPad send; out of scope for FLUSH. let _ = sender.blocking_send(DataMsg::DropPad { pad: name, generation }); } let _ = self.obj().remove_pad(pad); @@ -284,6 +343,7 @@ impl MoqSinkSpike { &self, pad: &gst::Pad, generation: u64, + flush: &mut watch::Receiver, buffer: gst::Buffer, ) -> Result { // The worker marks a pad failed after rejecting its data; surface that to GStreamer instead of @@ -304,6 +364,13 @@ impl MoqSinkSpike { return Err(gst::FlowError::Error); } + // Skip the map + copy if the pad is already flushing; send_or_flush re-checks for a flush that + // starts mid-send, but during a flush repeated buffers would otherwise burn a copy each before + // being dropped. + if *flush.borrow() { + return Err(gst::FlowError::Flushing); + } + let sender = self .session .lock() @@ -316,18 +383,27 @@ impl MoqSinkSpike { let data = Bytes::copy_from_slice(map.as_slice()); let pts = buffer.pts(); - sender - .blocking_send(DataMsg::Buffer { - pad: pad.name().to_string(), - generation, - data, - pts, - }) - .map_err(|_| gst::FlowError::Flushing)?; - Ok(gst::FlowSuccess::Ok) + let msg = DataMsg::Buffer { + pad: pad.name().to_string(), + generation, + data, + pts, + }; + // FLUSH_START on this pad cuts a send blocked on a full channel (returns Flushing), instead of + // stalling the streaming thread until the relay drains. + match send_or_flush(&sender, msg, flush) { + SendOutcome::Sent => Ok(gst::FlowSuccess::Ok), + SendOutcome::Flushed | SendOutcome::Closed => Err(gst::FlowError::Flushing), + } } - fn handle_event(&self, pad: &gst::Pad, generation: u64, event: gst::Event) -> bool { + fn handle_event( + &self, + pad: &gst::Pad, + generation: u64, + flush: &mut watch::Receiver, + event: gst::Event, + ) -> bool { let sender = self.session.lock().unwrap().as_ref().map(|handle| handle.sender()); match event.view() { @@ -344,10 +420,10 @@ impl MoqSinkSpike { generation, caps: caps.to_owned(), }; - if sender.blocking_send(msg).is_err() { - return false; + match send_or_flush(&sender, msg, flush) { + SendOutcome::Sent => gst::Pad::event_default(pad, Some(&*self.obj()), event), + SendOutcome::Flushed | SendOutcome::Closed => false, } - gst::Pad::event_default(pad, Some(&*self.obj()), event) } gst::EventView::Segment(segment) => { let Some(sender) = sender else { return false }; @@ -356,21 +432,104 @@ impl MoqSinkSpike { generation, segment: segment.segment().to_owned(), }; - if sender.blocking_send(msg).is_err() { - return false; + match send_or_flush(&sender, msg, flush) { + SendOutcome::Sent => gst::Pad::event_default(pad, Some(&*self.obj()), event), + SendOutcome::Flushed | SendOutcome::Closed => false, } - gst::Pad::event_default(pad, Some(&*self.obj()), event) } gst::EventView::Eos(_) => { let Some(sender) = sender else { return false }; - sender - .blocking_send(DataMsg::Eos { + let msg = DataMsg::Eos { + pad: pad.name().to_string(), + generation, + }; + matches!(send_or_flush(&sender, msg, flush), SendOutcome::Sent) + } + // FLUSH_START arrives out of band on the flushing thread: flip this pad's watch to cut a send + // blocked in the chain, and tell the worker to re-anchor the timeline. Never blocks. + gst::EventView::FlushStart(_) => { + toggle_pad_flush(&self.pad_flush.lock().unwrap(), pad.name().as_str(), generation, true); + if let Some(handle) = self.session.lock().unwrap().as_ref() { + let _ = handle.flush_sender().send(FlushSignal { pad: pad.name().to_string(), generation, - }) - .is_ok() + }); + } + gst::Pad::event_default(pad, Some(&*self.obj()), event) + } + // FLUSH_STOP clears the gate so the chain resumes; the trailing SEGMENT re-anchors the worker. + gst::EventView::FlushStop(_) => { + toggle_pad_flush(&self.pad_flush.lock().unwrap(), pad.name().as_str(), generation, false); + gst::Pad::event_default(pad, Some(&*self.obj()), event) } _ => gst::Pad::event_default(pad, Some(&*self.obj()), event), } } } + +// Flip a pad's FLUSH watch only when the generation matches, so a stale FLUSH from a previous +// incarnation cannot cancel the live pad's sends. Mirrors the worker's generation discipline. +fn toggle_pad_flush(flush: &HashMap)>, name: &str, generation: u64, value: bool) { + if let Some((gen, tx)) = flush.get(name) { + if *gen == generation { + let _ = tx.send(value); + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use tokio::sync::watch; + + use super::toggle_pad_flush; + + #[test] + fn pad_flush_toggle_ignores_stale_generation() { + let mut map = HashMap::new(); + let (tx, rx) = watch::channel(false); + map.insert("video".to_string(), (1u64, tx)); + // A stale generation must not flip the live pad's watch. + toggle_pad_flush(&map, "video", 0, true); + assert!(!*rx.borrow(), "stale-generation flush is ignored"); + // The current generation flips it. + toggle_pad_flush(&map, "video", 1, true); + assert!(*rx.borrow(), "current-generation flush flips the watch"); + } + + // A failed pad add must leave membership untouched. request_new_pad adds the pad before announcing + // AddPad or mutating the maps, so a duplicate name (or a concurrent request that lost the race) that + // fails add_pad does not corrupt the live pad: its generation and flush sender survive. Announcing + // for a name already held would otherwise make the worker finalize the live pad's producer and + // overwrite its generation. Exercised via two direct request_new_pad calls (the concurrent path). + #[test] + fn failed_duplicate_pad_keeps_membership_consistent() { + use gst::prelude::*; + use gst::subclass::prelude::*; + gst::init().unwrap(); + + let obj = gst::glib::Object::new::(); + let imp = obj.imp(); + let templ = obj.pad_template("sink_%u").expect("sink template"); + + let p0 = imp.request_new_pad(&templ, Some("sink_0"), None); + let p1 = imp.request_new_pad(&templ, Some("sink_0"), None); + assert!(p0.is_some(), "first request succeeds"); + assert!(p1.is_none(), "duplicate name fails to add"); + + let gen = imp.pad_generations.lock().unwrap().get("sink_0").copied(); + let flush_gen = imp.pad_flush.lock().unwrap().get("sink_0").map(|(g, _)| *g); + // After the failed duplicate the maps must still describe the LIVE pad (generation 0). + assert_eq!( + gen, + Some(0), + "live pad's generation must survive a failed duplicate (got {gen:?})" + ); + assert_eq!( + flush_gen, + Some(0), + "live pad's flush sender must survive a failed duplicate (got {flush_gen:?})" + ); + } +} diff --git a/rs/moq-gst/src/spike/session.rs b/rs/moq-gst/src/spike/session.rs index 3b1ce044e..7267dca19 100644 --- a/rs/moq-gst/src/spike/session.rs +++ b/rs/moq-gst/src/spike/session.rs @@ -1,7 +1,6 @@ //! Async core: the session/pads/timeline seams, isolated from the GObject shell. use std::collections::{HashMap, HashSet}; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, LazyLock, Mutex}; use anyhow::{ensure, Context, Result}; @@ -29,56 +28,107 @@ pub static CAT: LazyLock = /// Handoff, not a buffer: a full channel must block the streaming thread, not grow. const DATA_CHANNEL_BOUND: usize = 8; -/// Read by the element's getters without touching the task; reset on every exit. +/// Shared observable state, read by the element's getters and the chain without touching the worker +/// task. One `Mutex` covers the session generation plus every gated field, so "check the live +/// generation, then write" is one indivisible step: a stale session (one whose task lingers after a +/// newer one started) cannot clobber the live status even by racing the check against `begin_session`. +#[derive(Default)] +struct StatusInner { + /// Bumped by `begin_session` on every new session; the latest value is the live generation. + generation: u64, + connected: bool, + version: Option, + send_bitrate: u64, + /// Pads the worker rejected, keyed by name -> (session generation, pad generation) of the failed + /// incarnation. `is_failed` matches only the live session, so neither a recreated pad nor a + /// recreated session (same pad name and pad generation across a restart) inherits a stale failure. + failed: HashMap, +} + #[derive(Default)] pub struct Status { - connected: AtomicBool, - version: Mutex>, - send_bitrate: AtomicU64, - /// Pads whose data the worker rejected, keyed by the failed generation so a pad recreated with the - /// same name does not inherit it. The chain reads this to return a FlowError instead of dropping. - failed: Mutex>, + inner: Mutex, } impl Status { - fn set_connected(&self, value: bool) { - self.connected.store(value, Ordering::Relaxed); + /// Claim a fresh generation for a starting session; the returned value is now the live generation. + fn begin_session(&self) -> u64 { + let mut s = self.inner.lock().unwrap(); + s.generation += 1; + s.generation } - fn mark_failed(&self, pad: &str, generation: u64) { - self.failed.lock().unwrap().insert(pad.to_string(), generation); + /// True only while `generation` is the live session. A point-in-time check (only the setters gate + /// atomically); use it where a stale result is harmless, e.g. skipping a notify. + fn is_live(&self, generation: u64) -> bool { + self.inner.lock().unwrap().generation == generation } - fn clear_failed(&self, pad: &str) { - self.failed.lock().unwrap().remove(pad); + fn set_connected(&self, generation: u64, value: bool) { + let mut s = self.inner.lock().unwrap(); + if s.generation == generation { + s.connected = value; + } } - pub fn is_failed(&self, pad: &str, generation: u64) -> bool { - self.failed.lock().unwrap().get(pad) == Some(&generation) + fn set_version(&self, generation: u64, value: Option) { + let mut s = self.inner.lock().unwrap(); + if s.generation == generation { + s.version = value; + } } - fn reset_failed(&self) { - self.failed.lock().unwrap().clear(); + fn set_send_bitrate(&self, generation: u64, bits_per_sec: u64) { + let mut s = self.inner.lock().unwrap(); + if s.generation == generation { + s.send_bitrate = bits_per_sec; + } } - pub fn connected(&self) -> bool { - self.connected.load(Ordering::Relaxed) + /// Reset the observable surface on a session's exit, but only if it is still the live generation. + /// A newer session may have started before this one's task unwinds, so a stale exit must not clobber + /// the live status. Returns whether it actually reset (so the caller can skip a spurious notify). + fn reset_on_exit(&self, generation: u64) -> bool { + let mut s = self.inner.lock().unwrap(); + if s.generation != generation { + return false; + } + s.connected = false; + s.version = None; + s.send_bitrate = 0; + s.failed.clear(); + true } - fn set_version(&self, value: Option) { - *self.version.lock().unwrap() = value; + fn mark_failed(&self, session_generation: u64, pad: &str, pad_generation: u64) { + let mut s = self.inner.lock().unwrap(); + if s.generation == session_generation { + s.failed.insert(pad.to_string(), (session_generation, pad_generation)); + } } - pub fn version(&self) -> Option { - self.version.lock().unwrap().clone() + fn clear_failed(&self, session_generation: u64, pad: &str) { + let mut s = self.inner.lock().unwrap(); + if s.generation == session_generation { + s.failed.remove(pad); + } } - fn set_send_bitrate(&self, bits_per_sec: u64) { - self.send_bitrate.store(bits_per_sec, Ordering::Relaxed); + pub fn is_failed(&self, pad: &str, pad_generation: u64) -> bool { + let s = self.inner.lock().unwrap(); + s.failed.get(pad) == Some(&(s.generation, pad_generation)) + } + + pub fn connected(&self) -> bool { + self.inner.lock().unwrap().connected + } + + pub fn version(&self) -> Option { + self.inner.lock().unwrap().version.clone() } pub fn send_bitrate(&self) -> u64 { - self.send_bitrate.load(Ordering::Relaxed) + self.inner.lock().unwrap().send_bitrate } } @@ -116,14 +166,32 @@ pub enum DataMsg { }, } +/// Out-of-band FLUSH notification to the worker. Carried on its own unbounded channel (not the bounded +/// `DataMsg` FIFO) so FLUSH_START re-anchors promptly and never blocks behind queued buffers. +pub struct FlushSignal { + pub pad: String, + pub generation: u64, +} + pub struct ResolvedSettings { pub url: url::Url, pub broadcast: String, pub tls_disable_verify: bool, } +/// The worker's inbound channels, bundled so `run_session` stays under the argument limit. +struct Inbound { + data: mpsc::Receiver, + flush: mpsc::UnboundedReceiver, + shutdown: watch::Receiver, +} + pub struct SessionHandle { data: mpsc::Sender, + /// Out-of-band control path for FLUSH: unbounded so FLUSH_START never blocks behind a full data + /// channel (the very condition a flush must break), and a discrete pad-targeted event a `watch` + /// would collapse. + flush: mpsc::UnboundedSender, shutdown: watch::Sender, join: tokio::task::JoinHandle<()>, } @@ -135,12 +203,34 @@ impl SessionHandle { element: glib::WeakRef, seed: Vec<(String, u64)>, ) -> Self { + // Claim the generation synchronously so a previous session's late exit-reset (running on its own + // task) sees a newer generation and becomes a no-op instead of clobbering this session's status. + let generation = status.begin_session(); let (data_tx, data_rx) = mpsc::channel(DATA_CHANNEL_BOUND); + let (flush_tx, flush_rx) = mpsc::unbounded_channel(); let (shutdown_tx, shutdown_rx) = watch::channel(false); let join = RUNTIME.spawn(async move { + // Run the worker on its own task so a panic surfaces as an element_error here, instead of a + // silent operational death only observed when stop() reaps the outer JoinHandle. + let worker = RUNTIME.spawn(run_session( + settings, + status, + generation, + seed, + Inbound { + data: data_rx, + flush: flush_rx, + shutdown: shutdown_rx, + }, + element.clone(), + )); // Only a remote close reaches the bus as an error; a local shutdown returns Ok and stays quiet. - if let Err(err) = run_session(settings, status, seed, data_rx, shutdown_rx, element.clone()).await { + // A panic (or cancellation) joins as Err and is surfaced too. + let outcome = worker + .await + .unwrap_or_else(|join_err| Err(anyhow::anyhow!("session worker panicked: {join_err}"))); + if let Err(err) = outcome { if let Some(obj) = element.upgrade() { gst::element_error!(obj, gst::CoreError::Failed, ("session error"), ["{err:?}"]); } @@ -149,6 +239,7 @@ impl SessionHandle { Self { data: data_tx, + flush: flush_tx, shutdown: shutdown_tx, join, } @@ -159,6 +250,12 @@ impl SessionHandle { self.data.clone() } + /// Out-of-band FLUSH signal to the worker; unbounded, so it is safe to call from the event thread + /// even while the data channel is full. + pub fn flush_sender(&self) -> mpsc::UnboundedSender { + self.flush.clone() + } + pub fn stop(self) { // Cancel first so a send blocked on a full channel wakes via the dropped receiver; reap off-thread. let _ = self.shutdown.send(true); @@ -170,14 +267,73 @@ impl SessionHandle { } } +/// Outcome of a flush-cancellable send. Both `Flushed` and `Closed` map to `FlowError::Flushing` for +/// the caller; they are split only so the reason stays legible. +pub(super) enum SendOutcome { + Sent, + /// FLUSH_START flipped this pad's flush watch (or the watch sender was dropped on release). + Flushed, + /// The worker's receiver is gone (the session ended). + Closed, +} + +/// Send `msg` on the bounded data channel, aborting promptly if this pad starts flushing. This is the +/// cancellable replacement for `blocking_send`: the chain runs off the runtime, so `block_on` is safe, +/// and FLUSH_START (signalled on `flush`) can cut a send blocked on a full channel without tearing the +/// session down. The loop re-arms on a FLUSH_STOP (`false`) that lands mid-block, so a quick flush+stop +/// does not drop a still-valid buffer. +pub(super) fn send_or_flush( + sender: &mpsc::Sender, + msg: DataMsg, + flush: &mut watch::Receiver, +) -> SendOutcome { + if *flush.borrow_and_update() { + return SendOutcome::Flushed; + } + RUNTIME.block_on(async move { + loop { + tokio::select! { + // Biased toward flush: FLUSH_START must win a tie with freed capacity, so a blocked chain is + // cut rather than enqueuing a pre-flush buffer. + biased; + changed = flush.changed() => { + // Sender dropped (release) or now flushing: abort. A `false` change (FLUSH_STOP) falls + // through and re-arms the wait. + if changed.is_err() || *flush.borrow() { + return SendOutcome::Flushed; + } + } + permit = sender.reserve() => match permit { + Ok(permit) => { + // Re-check after winning the permit: a flushing pad drops the buffer. This narrows (does + // not atomically close) the race: a FLUSH_START in the gap before the send below still + // enqueues one buffer, which the worker re-anchor and the next buffer's check absorb. + if *flush.borrow_and_update() { + return SendOutcome::Flushed; + } + permit.send(msg); + return SendOutcome::Sent; + } + Err(_) => return SendOutcome::Closed, + }, + } + } + }) +} + async fn run_session( settings: ResolvedSettings, status: Arc, + generation: u64, seed: Vec<(String, u64)>, - mut data: mpsc::Receiver, - mut shutdown: watch::Receiver, + inbound: Inbound, element: glib::WeakRef, ) -> Result<()> { + let Inbound { + mut data, + mut flush, + mut shutdown, + } = inbound; let mut config = moq_native::ClientConfig::default(); config.tls.disable_verify = Some(settings.tls_disable_verify); let client = config.init()?; @@ -198,26 +354,36 @@ async fn run_session( result = client.connect(settings.url.clone()) => result?, _ = shutdown.changed() => return Ok(()), }; - status.set_connected(true); - status.set_version(Some(session.version().to_string())); - notify_connected(&element); + status.set_connected(generation, true); + status.set_version(generation, Some(session.version().to_string())); + if status.is_live(generation) { + notify_connected(&element); + } gst::info!(CAT, "session connected to {}", settings.url); - let mut pad_set = PadSet::new(broadcast, catalog, status.clone()); + let mut pad_set = PadSet::new(broadcast, catalog, status.clone(), generation); // Pads requested before the session task existed are seeded into the authoritative set. for (name, generation) in seed { pad_set.add_pad(&name, generation); } - let reason = run_loop(session, &mut data, &mut shutdown, &mut pad_set, &status).await; + let reason = run_loop( + session, + generation, + &mut data, + &mut flush, + &mut shutdown, + &mut pad_set, + &status, + ) + .await; // Finalize every live producer once on the way out, catalog last; runs on every exit path. let finalized = pad_set.finalize_all(); - // Reset the whole observable surface on exit, not just connected. - status.set_connected(false); - status.set_version(None); - status.set_send_bitrate(0); - status.reset_failed(); - notify_connected(&element); + // Reset the whole observable surface on exit, but only if no newer session has taken over (else a + // stale exit would clobber the live session's status). Skip the notify when the reset was a no-op. + if status.reset_on_exit(generation) { + notify_connected(&element); + } match (reason, finalized) { // Clean end: post the element EOS only once the catalog has finalized cleanly. @@ -261,7 +427,9 @@ enum ExitReason { async fn run_loop( session: moq_net::Session, + generation: u64, data: &mut mpsc::Receiver, + flush: &mut mpsc::UnboundedReceiver, shutdown: &mut watch::Receiver, pad_set: &mut PadSet, status: &Status, @@ -275,6 +443,10 @@ async fn run_loop( loop { tokio::select! { + // Biased so a pending FLUSH re-anchors before the post-flush SEGMENT it precedes: the flush + // signal is enqueued before that SEGMENT on the streaming thread, and biased preserves that + // order across the two channels. Shutdown/death still preempt everything. + biased; // Local close: quiet stop, no ERROR. _ = shutdown.changed() => return Ok(ExitReason::Stopped), // Remote death: propagate the Err so the wrapper posts ERROR to the bus. @@ -282,6 +454,8 @@ async fn run_loop( result?; return Ok(ExitReason::Stopped); } + // FLUSH (out of band): re-anchor the pad's timeline before any post-flush data is processed. + Some(FlushSignal { pad, generation }) = flush.recv() => pad_set.flush(&pad, generation), // A closed estimate stops the polling. bitrate = async { match send_bandwidth.as_mut() { @@ -289,14 +463,13 @@ async fn run_loop( None => std::future::pending::>().await, } } => match bitrate { - Some(rate) => status.set_send_bitrate(rate), + Some(rate) => status.set_send_bitrate(generation, rate), None => send_bandwidth = None, }, msg = data.recv() => match msg { Some(DataMsg::AddPad { pad, generation }) => { + // add_pad clears any stale failure for a fresh incarnation (keyed by this session). pad_set.add_pad(&pad, generation); - // A fresh incarnation is not failed even if a previous one was. - status.clear_failed(&pad); } Some(DataMsg::Caps { pad, generation, caps }) => { if pad_set.caps(&pad, generation, &caps)? { @@ -333,6 +506,9 @@ struct PadSet { catalog: Option, /// Shared with the element so the chain can read which pads the worker has failed. status: Arc, + /// This session's generation; tags failure marks so a stale task's late mark/clear cannot affect the + /// live session's view of which pads have failed. + session_generation: u64, /// Authoritative membership: name -> current generation. EOS aggregation counts against this, not /// the lazily-created `pads`, so a pad that ends before CAPS still counts. active: HashMap, @@ -342,11 +518,17 @@ struct PadSet { } impl PadSet { - fn new(broadcast: moq_net::BroadcastProducer, catalog: moq_mux::catalog::Producer, status: Arc) -> Self { + fn new( + broadcast: moq_net::BroadcastProducer, + catalog: moq_mux::catalog::Producer, + status: Arc, + session_generation: u64, + ) -> Self { Self { broadcast, catalog: Some(catalog), status, + session_generation, active: HashMap::new(), pads: HashMap::new(), eos: HashSet::new(), @@ -360,7 +542,7 @@ impl PadSet { let _ = old.finalize(); } self.eos.remove(pad); - self.status.clear_failed(pad); + self.status.clear_failed(self.session_generation, pad); self.active.insert(pad.to_string(), generation); } @@ -405,7 +587,7 @@ impl PadSet { gst::warning!(CAT, "finalize on failed pad {pad}: {err:?}"); } } - self.status.mark_failed(pad, generation); + self.status.mark_failed(self.session_generation, pad, generation); // The failed pad's track is finalized, so it is terminal for EOS aggregation (counts as ended). self.eos.insert(pad.to_string()); } @@ -422,6 +604,19 @@ impl PadSet { .set_segment(segment); } + /// FLUSH_START re-anchor: drop this pad's timeline so the next SEGMENT is accepted fresh and + /// post-flush frames are not dropped as regressions. The producer and its track are kept (FLUSH is + /// not EOS). A flush before CAPS has no producer entry and is a no-op. + fn flush(&mut self, pad: &str, generation: u64) { + if !self.is_current(pad, generation) { + gst::warning!(CAT, "flush for stale or unknown pad {pad}, ignoring"); + return; + } + if let Some(p) = self.pads.get_mut(pad) { + p.flush(); + } + } + /// `Ok(true)` means the element is now complete (a pad failure can finish it). fn buffer(&mut self, pad: &str, generation: u64, data: Bytes, pts: Option) -> Result { if !self.is_current(pad, generation) { @@ -520,8 +715,6 @@ struct Pad { segment_info: Option, // Kept only to map a buffer PTS to a running time. segment: Option>, - /// Last emitted MoQ timestamp (micros); a frame that would regress below it is dropped. - last_emitted_micros: Option, } impl Pad { @@ -532,7 +725,6 @@ impl Pad { state: PadState::NoSegment, segment_info: None, segment: None, - last_emitted_micros: None, } } @@ -626,9 +818,23 @@ impl Pad { } } - /// Pure of the importer, so it can be tested with real segments and no codec. - fn frame_timestamp(&mut self, pts: Option) -> FrameDecision { - let decision = match self.state { + /// Re-anchor on FLUSH. A flushing seek rewinds running time, so the timeline must restart: dropping + /// the segment moves the pad to NoSegment (the next SEGMENT is accepted fresh via `prev = None`). The + /// producer is kept (FLUSH is not EOS); the codec's partial-AU reset is a documented follow-up, not + /// handled here. + fn flush(&mut self) { + self.state = PadState::NoSegment; + self.segment = None; + self.segment_info = None; + } + + /// Pure of the importer, so it can be tested with real segments and no codec. Emits the PTS-derived + /// running time without enforcing frame-level monotonicity: frames arrive in decode order and the + /// moq-mux container documents that B-frames carry non-monotonic presentation timestamps, so a PTS + /// regression is normal reordering, not an error. Timeline breaks are caught at the SEGMENT level + /// (the `Invalid` state), not per frame. + fn frame_timestamp(&self, pts: Option) -> FrameDecision { + match self.state { PadState::Active => { // to_running_time_full is signed: a buffer before the segment returns Negative, which // frame_micros drops; to_running_time would instead clip it to None and lose the reason. @@ -642,16 +848,7 @@ impl Pad { } PadState::NoSegment => FrameDecision::Drop("buffer before a valid SEGMENT"), PadState::Invalid => FrameDecision::Drop("buffer on an invalidated timeline"), - }; - // Frame-level monotonicity: segment continuity (base) does not by itself stop a frame from - // regressing, so drop a timestamp that would go backwards rather than emit out of order. - if let FrameDecision::Emit(micros) = decision { - if self.last_emitted_micros.is_some_and(|last| micros < last) { - return FrameDecision::Drop("non-monotonic timestamp (regressed)"); - } - self.last_emitted_micros = Some(micros); } - decision } fn push_buffer(&mut self, mut data: Bytes, pts: Option) -> Result<()> { @@ -750,7 +947,7 @@ mod tests { fn pad_set() -> PadSet { let mut broadcast = moq_net::Broadcast::new().produce(); let catalog = moq_mux::catalog::Producer::new(&mut broadcast).unwrap(); - PadSet::new(broadcast, catalog, Arc::new(Status::default())) + PadSet::new(broadcast, catalog, Arc::new(Status::default()), 0) } // A producer that actually emitted has at least one group. latest() is the cross-crate-visible @@ -927,18 +1124,9 @@ mod tests { assert!(!caps_supported(&audio_caps()), "an unsupported media type is rejected"); } - // Frame-through (real init + emit): a keyframe AU emits a frame to the rendition track, not just - // the SPS-published rendition. - #[test] - fn frame_through_h264_emits_a_frame() { - gst::init().unwrap(); - let mut set = pad_set(); - set.add_pad("video", 0); - set.caps("video", 0, &h264_caps()).unwrap(); - set.segment("video", 0, time_segment()); - - // SPS (the importer's own proven bytes) + PPS + a type-5 IDR slice. The slice header is not - // fully parsed for IDR, but this drives the real decode_frame path and emits one frame. + // SPS (the importer's own proven bytes) + PPS + a type-5 IDR slice: drives the real decode_frame + // path and emits one keyframe. The slice header is not fully parsed for IDR. + fn h264_keyframe_au() -> Bytes { let sps: &[u8] = &[ 0x67, 0x42, 0xc0, 0x1f, 0xda, 0x01, 0x40, 0x16, 0xe9, 0xb8, 0x08, 0x08, 0x0a, 0x00, 0x00, 0x07, 0xd0, 0x00, 0x01, 0xd4, 0xc0, 0x80, @@ -950,7 +1138,20 @@ mod tests { au.extend_from_slice(&[0, 0, 0, 1]); au.extend_from_slice(nal); } - set.buffer("video", 0, Bytes::from(au), Some(gst::ClockTime::ZERO)) + Bytes::from(au) + } + + // Frame-through (real init + emit): a keyframe AU emits a frame to the rendition track, not just + // the SPS-published rendition. + #[test] + fn frame_through_h264_emits_a_frame() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + set.segment("video", 0, time_segment()); + + set.buffer("video", 0, h264_keyframe_au(), Some(gst::ClockTime::ZERO)) .unwrap(); let snapshot = set.catalog.as_ref().unwrap().snapshot(); @@ -1233,10 +1434,11 @@ mod tests { ); } - // Frame monotonicity: a segment accepted on base alone can still place a frame before one already - // emitted; that regression is dropped, not emitted out of order. + // A PTS that regresses within an Active timeline still emits: frames arrive in decode order and + // B-frames carry non-monotonic presentation timestamps (moq-mux container contract). The timeline + // itself is guarded at the SEGMENT level (the Invalid state), not per frame. #[test] - fn regressing_timestamp_is_dropped() { + fn regressing_pts_within_an_active_timeline_still_emits() { gst::init().unwrap(); let mut pad = Pad::new(); pad.set_segment(time_segment_at(0, 0)); @@ -1244,30 +1446,33 @@ mod tests { pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(10_000))), FrameDecision::Emit(10_000_000) ); - // New segment: base advances past the previous base (so it is accepted) but below the running - // time already emitted. - pad.set_segment(time_segment_at(0, 5_000)); - assert_eq!(pad.state, PadState::Active); - // Its first frame maps to 5s, which regresses below the 10s already emitted: dropped. - assert!(matches!( - pad.frame_timestamp(Some(gst::ClockTime::ZERO)), - FrameDecision::Drop(_) - )); - // A later frame that catches up past 10s emits again. + // A later buffer whose PTS sits below the previous one (a B-frame in decode order) still emits + // at its own running time, rather than being dropped as a regression. assert_eq!( pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(6_000))), - FrameDecision::Emit(11_000_000) + FrameDecision::Emit(6_000_000) ); } - // The failure set is scoped to the generation, so a pad recreated with the same name does not - // inherit a previous incarnation's failure even before the worker clears it. + // Failures are scoped to BOTH the pad generation and the session generation, so neither a recreated + // pad nor a recreated session (same pad name + generation across a restart) inherits a stale failure. #[test] fn failure_is_scoped_to_the_generation() { let status = Status::default(); - status.mark_failed("video", 0); + let session = status.begin_session(); + status.mark_failed(session, "video", 0); assert!(status.is_failed("video", 0), "the failed incarnation is failed"); - assert!(!status.is_failed("video", 1), "a newer incarnation is not failed"); + assert!(!status.is_failed("video", 1), "a newer pad incarnation is not failed"); + + // A new session does not inherit the old one's failures, even for the same pad name/generation. + let _next = status.begin_session(); + assert!( + !status.is_failed("video", 0), + "the failure does not carry across a session restart" + ); + // And a now-stale session's late mark is dropped, not surfaced for the live session. + status.mark_failed(session, "video", 0); + assert!(!status.is_failed("video", 0), "a stale session's mark is a no-op"); } // SEGMENT before CAPS: the pad is created and the segment retained when CAPS arrives. @@ -1316,6 +1521,158 @@ mod tests { ); } + // FLUSH must wake a send blocked on a full channel AND leave the receiver alive. The second half is + // the discriminator: dropped_receiver_wakes_blocked_send proves the shutdown wake (receiver gone), so + // a flush wake that also tore the receiver down would be indistinguishable from it. + #[test] + fn flush_wakes_a_blocked_send_and_keeps_the_receiver() { + let (data_tx, mut data_rx) = mpsc::channel::(DATA_CHANNEL_BOUND); + for _ in 0..DATA_CHANNEL_BOUND { + data_tx + .try_send(DataMsg::AddPad { + pad: "x".into(), + generation: 0, + }) + .unwrap(); + } + let (flush_tx, flush_rx) = watch::channel(false); + + // Rendezvous so the flush is sent only after the worker is running and about to call + // send_or_flush (else the send could see flush=true at its initial check and never block). The + // sleep then covers parking inside reserve(), which has no test seam to observe directly. + let started = Arc::new(std::sync::Barrier::new(2)); + let sender = data_tx.clone(); + let mut rx = flush_rx; + let started_worker = started.clone(); + let blocked = std::thread::spawn(move || { + started_worker.wait(); + send_or_flush( + &sender, + DataMsg::AddPad { + pad: "x".into(), + generation: 1, + }, + &mut rx, + ) + }); + started.wait(); + std::thread::sleep(Duration::from_millis(50)); // let the send park inside block_on + + flush_tx.send(true).unwrap(); + assert!( + matches!(blocked.join().unwrap(), SendOutcome::Flushed), + "flush woke the blocked send" + ); + + // The receiver is still alive (not dropped): it delivers the queued data, and a fresh non-flushing + // send then goes through. A teardown would fail both. + assert!(data_rx.blocking_recv().is_some(), "receiver still delivers"); + let (_gate_tx, mut rx) = watch::channel(false); + assert!( + matches!( + send_or_flush( + &data_tx, + DataMsg::AddPad { + pad: "x".into(), + generation: 2 + }, + &mut rx + ), + SendOutcome::Sent + ), + "the receiver survived the flush and accepts new sends" + ); + } + + // A per-pad watch means flushing pad A never cancels pad B's blocked send. + #[test] + fn flush_only_wakes_the_flushed_pad() { + let (data_tx, mut data_rx) = mpsc::channel::(DATA_CHANNEL_BOUND); + for _ in 0..DATA_CHANNEL_BOUND { + data_tx + .try_send(DataMsg::AddPad { + pad: "fill".into(), + generation: 0, + }) + .unwrap(); + } + let (flush_a_tx, flush_a_rx) = watch::channel(false); + let (_flush_b_tx, flush_b_rx) = watch::channel(false); + + // Both workers must be running before A is flushed, so B is genuinely inside send_or_flush (not + // merely unspawned). The sleep then covers parking inside reserve(). + let started = Arc::new(std::sync::Barrier::new(3)); + let sender_a = data_tx.clone(); + let mut rx_a = flush_a_rx; + let started_a = started.clone(); + let blocked_a = std::thread::spawn(move || { + started_a.wait(); + send_or_flush( + &sender_a, + DataMsg::AddPad { + pad: "a".into(), + generation: 1, + }, + &mut rx_a, + ) + }); + let sender_b = data_tx.clone(); + let mut rx_b = flush_b_rx; + let started_b = started.clone(); + let blocked_b = std::thread::spawn(move || { + started_b.wait(); + send_or_flush( + &sender_b, + DataMsg::AddPad { + pad: "b".into(), + generation: 1, + }, + &mut rx_b, + ) + }); + started.wait(); + std::thread::sleep(Duration::from_millis(50)); + + flush_a_tx.send(true).unwrap(); + assert!( + matches!(blocked_a.join().unwrap(), SendOutcome::Flushed), + "pad A's flush woke A" + ); + + // B was untouched: freeing a slot lets it send normally (Sent, not Flushed). + assert!(data_rx.blocking_recv().is_some()); + assert!( + matches!(blocked_b.join().unwrap(), SendOutcome::Sent), + "pad B's send was not cancelled by A's flush" + ); + } + + // Already-flushing contract: a pad whose watch is set drops the buffer (returns Flushed, enqueues + // nothing) even with capacity. This exercises the initial check in send_or_flush, NOT the select's + // biased arm; the capacity+FLUSH sub-poll tie (pitfall 14) stays structural, not covered here. + #[test] + fn flush_drops_the_buffer_even_with_capacity() { + let (data_tx, mut data_rx) = mpsc::channel::(DATA_CHANNEL_BOUND); // empty: capacity free + let (flush_tx, flush_rx) = watch::channel(false); + flush_tx.send(true).unwrap(); + let mut rx = flush_rx; + assert!( + matches!( + send_or_flush( + &data_tx, + DataMsg::AddPad { + pad: "x".into(), + generation: 0 + }, + &mut rx + ), + SendOutcome::Flushed + ), + "a flushing send returns Flushed" + ); + assert!(data_rx.try_recv().is_err(), "and enqueues nothing"); + } + // Two pads, real PTS via to_running_time_full: the A/V offset survives because running time is shared. #[test] fn two_pads_keep_av_aligned_through_real_segments() { @@ -1338,7 +1695,7 @@ mod tests { // A pad with no SEGMENT yet drops buffers (NoSegment), distinct from an invalidated timeline. #[test] fn pad_without_segment_drops_buffers() { - let mut pad = Pad::new(); + let pad = Pad::new(); assert_eq!(pad.state, PadState::NoSegment); assert!(matches!( pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(5))), @@ -1400,6 +1757,194 @@ mod tests { ); } + // FLUSH re-anchor: flush drops the timeline to NoSegment, so a rewinding post-flush segment anchors + // fresh and is accepted (Active) rather than rejected as a discontinuity. Without the flush the + // rewind would go Invalid (see invalid_segment_drops_then_a_valid_one_recovers). + #[test] + fn flush_reanchors_so_a_rewinding_segment_recovers() { + gst::init().unwrap(); + let mut pad = Pad::new(); + pad.set_segment(time_segment_at(0, 5_000)); + assert_eq!(pad.state, PadState::Active); + assert!(matches!( + pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(10_000))), + FrameDecision::Emit(_) + )); + + pad.flush(); + assert_eq!(pad.state, PadState::NoSegment, "flush re-anchors to NoSegment"); + + // A base that rewinds below the old one is now accepted fresh, not rejected. + pad.set_segment(time_segment_at(0, 0)); + assert_eq!(pad.state, PadState::Active, "post-flush rewinding segment is accepted"); + // And the post-flush timeline emits from the new anchor. + assert_eq!(pad.frame_timestamp(Some(gst::ClockTime::ZERO)), FrameDecision::Emit(0)); + } + + #[test] + fn flush_for_stale_generation_is_ignored() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 1); + set.caps("video", 1, &h264_caps()).unwrap(); + set.segment("video", 1, time_segment()); + assert_eq!(set.pads["video"].state, PadState::Active); + // A flush for a previous incarnation must not touch the live pad. + set.flush("video", 0); + assert_eq!( + set.pads["video"].state, + PadState::Active, + "stale-generation flush is ignored" + ); + } + + #[test] + fn flush_before_caps_is_a_noop() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + // No CAPS yet, so no producer entry; flush must not panic or fabricate one. + set.flush("video", 0); + assert!(!set.pads.contains_key("video"), "flush before caps creates no producer"); + } + + // FLUSH is not EOS: the producer and its track (same name) survive a flush; only the timeline + // re-anchors. A keyframe AU publishes the rendition first so the track name is observable. + #[test] + fn flush_keeps_the_producer() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + set.segment("video", 0, time_segment()); + set.buffer("video", 0, h264_keyframe_au(), Some(gst::ClockTime::ZERO)) + .unwrap(); + let before = set + .catalog + .as_ref() + .unwrap() + .snapshot() + .video + .renditions + .keys() + .next() + .expect("a video rendition") + .clone(); + + set.flush("video", 0); + + assert!(set.pads["video"].framed.is_some(), "flush keeps the producer"); + let after = set + .catalog + .as_ref() + .unwrap() + .snapshot() + .video + .renditions + .keys() + .next() + .cloned(); + assert_eq!( + after.as_deref(), + Some(before.as_str()), + "the track name is unchanged across the flush (no catalog churn)" + ); + assert_eq!(set.pads["video"].state, PadState::NoSegment, "the timeline re-anchored"); + } + + // The worker's re-anchor path (PadSet::flush then PadSet::segment): a rewinding post-flush segment is + // accepted (Active), where without the flush the rewind would be rejected to Invalid. + #[test] + fn worker_flush_then_rewinding_segment_reanchors() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("video", 0); + set.caps("video", 0, &h264_caps()).unwrap(); + set.segment("video", 0, time_segment_at(0, 5_000)); + assert_eq!(set.pads["video"].state, PadState::Active); + set.flush("video", 0); + assert_eq!(set.pads["video"].state, PadState::NoSegment); + // Rewinding base: accepted fresh after the flush (would go Invalid without it). + set.segment("video", 0, time_segment_at(0, 0)); + assert_eq!( + set.pads["video"].state, + PadState::Active, + "the worker accepts the post-flush rewinding segment fresh" + ); + } + + // Decode-order frames, including B-frames, must all emit. The moq-mux container contract + // (container/mod.rs:46-48) is "frames in DECODE order; B-frames may have non-monotonic presentation + // timestamps", so a PTS regression is normal reordering, not an error: frame_timestamp must not drop + // on PTS monotonicity (which would silently lose every B-frame). + #[test] + fn bframes_in_decode_order_all_emit() { + gst::init().unwrap(); + let mut pad = Pad::new(); + // start=0, base=0 so running time == pts. Display order I B B B P @25fps -> DECODE order + // I P B B B, so the PTS the sink sees is 0, 160, 40, 80, 120 ms. + pad.set_segment(time_segment()); + let decode_order_pts_ms = [0u64, 160, 40, 80, 120]; + let emitted = decode_order_pts_ms + .into_iter() + .filter(|&ms| { + matches!( + pad.frame_timestamp(Some(gst::ClockTime::from_mseconds(ms))), + FrameDecision::Emit(_) + ) + }) + .count(); + assert_eq!( + emitted, 5, + "all five decode-order frames must emit; B-frames are not regressions (got {emitted})" + ); + } + + // Status is a shared Arc written by every session, and stop() does not await the old task, so a stale + // session's exit-reset can run after a new session connected. The generation token makes a stale + // reset_on_exit a no-op, so it cannot clobber the live status (the restart-on-failure race). + #[test] + fn stale_session_reset_must_not_clobber_live_status() { + let status = Arc::new(Status::default()); + let stale = status.begin_session(); // an old session's generation + let live = status.begin_session(); // the new session's generation, now the live one + // The live session connects and writes the shared status. + status.set_connected(live, true); + status.set_version(live, Some("moq-lite-04".to_string())); + // The old session's exit-reset runs late, but it is stale: a no-op that leaves the live status. + assert!( + !status.reset_on_exit(stale), + "a stale generation must not reset the live status" + ); + assert!(status.connected(), "the live session is still connected"); + // The live session's own exit does reset. + assert!(status.reset_on_exit(live), "the current generation resets on exit"); + assert!( + !status.connected(), + "after the live session exits, the status is disconnected" + ); + } + + // Not just the exit-reset: a stale session's connected/version/bitrate writes must also be no-ops, or + // an old task that lingers (or wins its connect late) would clobber the live session's status. + #[test] + fn stale_session_writes_are_dropped() { + let status = Arc::new(Status::default()); + let stale = status.begin_session(); + let live = status.begin_session(); // now the live generation + status.set_connected(live, true); + status.set_send_bitrate(live, 1000); + assert!(status.connected()); + + // Stale writes from the old generation are dropped, leaving the live status intact. + status.set_connected(stale, false); + status.set_version(stale, Some("ghost".to_string())); + status.set_send_bitrate(stale, 999); + assert!(status.connected(), "stale set_connected ignored"); + assert_ne!(status.version().as_deref(), Some("ghost"), "stale set_version ignored"); + assert_eq!(status.send_bitrate(), 1000, "stale set_send_bitrate ignored"); + } + fn time_segment() -> gst::Segment { let mut segment = gst::FormattedSegment::::new(); segment.set_start(gst::ClockTime::ZERO); From 3963bb2a9e96b2f2dbaf893e71ec4afa23143283 Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Tue, 16 Jun 2026 15:56:51 -0600 Subject: [PATCH 10/13] moq-gst: promote the rewritten core to the moqsink element The parallel moqsinkspike element, built across the preceding commits, replaces the original moqsink. Remove the old implementation, move the new core into src/sink/, and register it as "moqsink" with the GType name kept as "MoqSink" for registration continuity. Behavior is a compatible superset of the old element: the same request sink pads, the same seven codec caps, and the same three writable properties (url, broadcast, tls-disable-verify), plus three read-only diagnostics (connected, moq-version, estimated-send-bitrate). Co-Authored-By: Claude Opus 4.8 --- doc/bin/gstreamer.md | 8 + rs/moq-gst/src/lib.rs | 2 - rs/moq-gst/src/sink/imp.rs | 640 +++++++++++---------- rs/moq-gst/src/sink/mod.rs | 2 + rs/moq-gst/src/{spike => sink}/session.rs | 6 +- rs/moq-gst/src/{spike => sink}/timeline.rs | 0 rs/moq-gst/src/spike/imp.rs | 535 ----------------- rs/moq-gst/src/spike/mod.rs | 19 - rs/moq-gst/tests/element.rs | 12 +- 9 files changed, 340 insertions(+), 884 deletions(-) rename rs/moq-gst/src/{spike => sink}/session.rs (99%) rename rs/moq-gst/src/{spike => sink}/timeline.rs (100%) delete mode 100644 rs/moq-gst/src/spike/imp.rs delete mode 100644 rs/moq-gst/src/spike/mod.rs diff --git a/doc/bin/gstreamer.md b/doc/bin/gstreamer.md index 8f8d6d3d8..085be7dcf 100644 --- a/doc/bin/gstreamer.md +++ b/doc/bin/gstreamer.md @@ -30,6 +30,14 @@ Both elements support the following properties: For `http://` URLs, `moq-native` automatically fetches the server's certificate fingerprint from `/certificate.sha256` and verifies TLS against it. You don't need `tls-disable-verify` for local development. ::: +`moqsink` additionally exposes these read-only properties for monitoring: + +| Property | Type | Description | +| ------------------------ | ------ | ----------------------------------------------------------- | +| `connected` | bool | Whether the publish session is currently connected | +| `moq-version` | string | The negotiated MoQ protocol version; null when disconnected | +| `estimated-send-bitrate` | uint64 | Estimated send bitrate in bits per second; 0 when unavailable | + ## Prerequisites The plugin requires GStreamer development libraries. It is **not** built by default since most users don't have them installed. diff --git a/rs/moq-gst/src/lib.rs b/rs/moq-gst/src/lib.rs index 8d3c5e580..18dc5a5e0 100644 --- a/rs/moq-gst/src/lib.rs +++ b/rs/moq-gst/src/lib.rs @@ -2,7 +2,6 @@ use gst::glib; mod sink; mod source; -mod spike; use tracing::level_filters::LevelFilter; use tracing_subscriber::EnvFilter; @@ -10,7 +9,6 @@ use tracing_subscriber::EnvFilter; pub fn plugin_init(plugin: &gst::Plugin) -> Result<(), glib::BoolError> { sink::register(plugin)?; source::register(plugin)?; - spike::register(plugin)?; let filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) diff --git a/rs/moq-gst/src/sink/imp.rs b/rs/moq-gst/src/sink/imp.rs index 643ebb296..22f051d6c 100644 --- a/rs/moq-gst/src/sink/imp.rs +++ b/rs/moq-gst/src/sink/imp.rs @@ -1,37 +1,23 @@ -//! Async-friendly MoqSink that keeps the original dynamic-pad Element -//! behavior while pushing all network setup and CMAF publishing work into -//! a Tokio task. The GLib state change thread never blocks, pads still get -//! requested dynamically, and each pad simply forwards buffers to the -//! background worker via an unbounded channel. Events are handled on the -//! sink pad, with EOS aggregated locally before posting element EOS. +//! GObject shell for the moqsink element. -use std::collections::{HashMap, HashSet}; -use std::sync::{LazyLock, Mutex}; +use std::collections::HashMap; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, LazyLock, Mutex}; use anyhow::{Context, Result}; use bytes::Bytes; use gst::glib; use gst::prelude::*; use gst::subclass::prelude::*; -use tokio::sync::mpsc; -use url::Url; - -use hang::moq_net; - -static RUNTIME: LazyLock = LazyLock::new(|| { - tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build() - .expect("spawn tokio runtime") -}); - -static CAT: LazyLock = LazyLock::new(|| { - gst::DebugCategory::new( - "moq-sink", - gst::DebugColorFlags::empty(), - Some("MoQ Sink (async element)"), - ) -}); +use tokio::sync::watch; + +use super::session::{ + caps_supported, send_or_flush, DataMsg, FlushSignal, ResolvedSettings, SendOutcome, SessionHandle, Status, CAT, +}; + +/// Reject a frame past the MoQ frame limit (moq-net's MAX_FRAME_SIZE, 16 MiB): it could not be +/// consumed anyway, and copying it would let hostile input drive an unbounded allocation. +const MAX_FRAME_BYTES: usize = 16 * 1024 * 1024; #[derive(Debug, Clone, Default)] struct Settings { @@ -40,19 +26,12 @@ struct Settings { tls_disable_verify: bool, } -#[derive(Debug, Clone)] -struct ResolvedSettings { - url: Url, - broadcast: String, - tls_disable_verify: bool, -} - impl TryFrom for ResolvedSettings { type Error = anyhow::Error; fn try_from(value: Settings) -> Result { Ok(Self { - url: Url::parse(value.url.as_ref().context("url property is required")?)?, + url: url::Url::parse(value.url.as_ref().context("url property is required")?)?, broadcast: value .broadcast .as_ref() @@ -63,60 +42,19 @@ impl TryFrom for ResolvedSettings { } } -#[derive(Debug)] -struct SessionHandle { - sender: mpsc::UnboundedSender, - join: tokio::task::JoinHandle<()>, -} - -impl SessionHandle { - fn stop(self) { - let _ = self.sender.send(ControlMessage::Shutdown); - RUNTIME.spawn(async move { - if let Err(err) = self.join.await { - gst::warning!(CAT, "session task ended with error: {err:?}"); - } - }); - } -} - -struct PadState { - decoder: moq_mux::import::Framed, - reference_pts: Option, -} - -struct RuntimeState { - #[allow(dead_code)] - session: moq_net::Session, - broadcast: moq_net::BroadcastProducer, - catalog: moq_mux::catalog::Producer, - pads: HashMap, -} - -#[derive(Debug)] -enum ControlMessage { - SetCaps { - pad_name: String, - caps: gst::Caps, - }, - Buffer { - pad_name: String, - data: Bytes, - pts: Option, - }, - Eos { - pad_name: String, - }, - DropPad { - pad_name: String, - }, - Shutdown, -} - #[derive(Default)] pub struct MoqSink { settings: Mutex, session: Mutex>, + status: Arc, + // Monotonic per-pad generation; a pad recreated with the same name gets a fresh value so the + // worker discards the previous incarnation's in-flight messages. + next_generation: AtomicU64, + pad_generations: Mutex>, + // Per-pad FLUSH gate: FLUSH_START flips it true to cut a blocked send (the chain holds a cloned + // receiver); FLUSH_STOP flips it false. Per-pad so flushing one input never cancels another's send. + // Keyed with the generation so a stale FLUSH from a previous incarnation cannot flip the live pad. + pad_flush: Mutex)>>, } #[glib::object_subclass] @@ -143,6 +81,22 @@ impl ObjectImpl for MoqSink { .blurb("Disable TLS verification") .default_value(false) .build(), + // Read-only, served from the shared Status the task writes. + glib::ParamSpecBoolean::builder("connected") + .nick("Connected") + .blurb("Whether the session is currently connected") + .read_only() + .build(), + glib::ParamSpecString::builder("moq-version") + .nick("Negotiated version") + .blurb("The negotiated MoQ protocol version, null when disconnected") + .read_only() + .build(), + glib::ParamSpecUInt64::builder("estimated-send-bitrate") + .nick("Estimated send bitrate") + .blurb("Estimated send bitrate in bits per second (congestion controller), 0 when unavailable") + .read_only() + .build(), ] }); PROPS.as_ref() @@ -159,12 +113,19 @@ impl ObjectImpl for MoqSink { } fn property(&self, _id: usize, pspec: &glib::ParamSpec) -> glib::Value { - let settings = self.settings.lock().unwrap(); match pspec.name() { - "url" => settings.url.to_value(), - "broadcast" => settings.broadcast.to_value(), - "tls-disable-verify" => settings.tls_disable_verify.to_value(), - _ => unreachable!(), + "connected" => self.status.connected().to_value(), + "moq-version" => self.status.version().to_value(), + "estimated-send-bitrate" => self.status.send_bitrate().to_value(), + name => { + let settings = self.settings.lock().unwrap(); + match name { + "url" => settings.url.to_value(), + "broadcast" => settings.broadcast.to_value(), + "tls-disable-verify" => settings.tls_disable_verify.to_value(), + _ => unreachable!(), + } + } } } @@ -178,19 +139,21 @@ impl GstObjectImpl for MoqSink {} impl ElementImpl for MoqSink { fn metadata() -> Option<&'static gst::subclass::ElementMetadata> { - static ELEMENT_METADATA: LazyLock = LazyLock::new(|| { + static METADATA: LazyLock = LazyLock::new(|| { gst::subclass::ElementMetadata::new( - "MoQ Sink (async)", + "MoQ Sink", "Sink/Network/MoQ", "Transmits media over MoQ", - "Luke Curley , Steve McFarlin ", + "Ariel Molina ", ) }); - Some(&*ELEMENT_METADATA) + Some(&*METADATA) } fn pad_templates() -> &'static [gst::PadTemplate] { static PAD_TEMPLATES: LazyLock> = LazyLock::new(|| { + // Every codec that converges on moq_mux::import::Framed; per-codec specifics are validated + // when the producer is built from caps, not here. let mut caps = gst::Caps::new_empty(); caps.merge( gst::Caps::builder("video/x-h264") @@ -228,35 +191,105 @@ impl ElementImpl for MoqSink { name: Option<&str>, _caps: Option<&gst::Caps>, ) -> Option { + // Fixed per pad incarnation and captured here, so a buffer in flight from a released pad never + // reads a successor's generation. + let generation = self.next_generation.fetch_add(1, Ordering::Relaxed); + // One flush watch per pad: the sender lives in `pad_flush` (toggled by FLUSH_START/STOP), each + // pad function carries its own cloned receiver to cancel a blocked send. + let (flush_tx, flush_rx) = watch::channel(false); + let chain_flush = flush_rx.clone(); + let event_flush = flush_rx; let pad_builder = gst::Pad::builder_from_template(templ) - .chain_function(|pad, parent, buffer| { - let element = parent - .and_then(|p| p.downcast_ref::()) - .ok_or(gst::FlowError::Error)?; - element.imp().forward_buffer(pad, buffer) + .chain_function(move |pad, parent, buffer| { + let mut flush = chain_flush.clone(); + MoqSink::catch_panic_pad_function( + parent, + || Err(gst::FlowError::Error), + |sink| sink.forward_buffer(pad, generation, &mut flush, buffer), + ) }) - .event_function(|pad, parent, event| { - let Some(element) = parent.and_then(|p| p.downcast_ref::()) else { - return false; - }; - element.imp().handle_event(pad, event) + .event_function(move |pad, parent, event| { + let mut flush = event_flush.clone(); + MoqSink::catch_panic_pad_function( + parent, + || false, + |sink| sink.handle_event(pad, generation, &mut flush, event), + ) }); - let pad = if let Some(name) = name { - pad_builder.name(name).build() - } else { - pad_builder.generated_name().build() + let pad = match name { + Some(name) => pad_builder.name(name).build(), + None => pad_builder.generated_name().build(), }; - self.obj().add_pad(&pad).ok()?; + // Populate the maps BEFORE the pad is visible to GStreamer, so a concurrent start_session seed + // never sees a pad without its generation. Capture the previous holders so a failed add_pad (a + // duplicate name, or a concurrent request that lost the race) rolls back without orphaning the + // live pad, and announce AddPad only after add_pad succeeds so the worker never gets a phantom. + // + // Known limitation (documented, not closed): two concurrent requests for the SAME name racing a + // start_session seed can leave the seed reading the loser's generation for the winner's pad, so + // the winner's events are then dropped as stale. Closing it would mean holding a lock across + // add_pad, which emits pad-added synchronously and would deadlock a reentrant handler; that risk + // is worse than the bug, whose trigger (same-name concurrent request_new_pad) apps do not produce. + let name = pad.name().to_string(); + let prev_gen = self.pad_generations.lock().unwrap().insert(name.clone(), generation); + let prev_flush = self + .pad_flush + .lock() + .unwrap() + .insert(name.clone(), (generation, flush_tx)); + + if self.obj().add_pad(&pad).is_err() { + // Roll back, but only if this attempt still owns the entry. A concurrent same-name request + // that won add_pad may have overwritten it; restoring our captured `prev` (or removing) would + // then clobber the live pad it just registered. Touch the maps only while they still hold our + // own generation. + { + let mut gens = self.pad_generations.lock().unwrap(); + if gens.get(name.as_str()) == Some(&generation) { + match prev_gen { + Some(g) => gens.insert(name.clone(), g), + None => gens.remove(&name), + }; + } + } + { + let mut flushes = self.pad_flush.lock().unwrap(); + if flushes.get(name.as_str()).map(|(g, _)| *g) == Some(generation) { + match prev_flush { + Some(f) => flushes.insert(name.clone(), f), + None => flushes.remove(&name), + }; + } + } + return None; + } + + // A request pad is linked by the caller only after this returns, so its CAPS/buffers cannot reach + // the worker ahead of this AddPad. + let sender = self.session.lock().unwrap().as_ref().map(SessionHandle::sender); + if let Some(sender) = sender { + let _ = sender.blocking_send(DataMsg::AddPad { pad: name, generation }); + } Some(pad) } fn release_pad(&self, pad: &gst::Pad) { - if let Some(session) = self.session.lock().unwrap().as_ref() { - let _ = session.sender.send(ControlMessage::DropPad { - pad_name: pad.name().to_string(), - }); + let name = pad.name().to_string(); + let generation = self.pad_generations.lock().unwrap().remove(&name); + // Dropping the watch sender wakes any send still blocked on this pad (changed() errors -> Flushing). + self.pad_flush.lock().unwrap().remove(&name); + // Drop the session guard before blocking_send: holding it across a full-channel block deadlocks stop_session. + let sender = { + let session = self.session.lock().unwrap(); + session.as_ref().map(SessionHandle::sender) + }; + if let (Some(sender), Some(generation)) = (sender, generation) { + // Uncancellable: a full data channel or an in-progress connect stalls release here, and the + // per-pad flush watch (removed just above) cannot cut it. Known control-path-blocking limit, + // shared with the AddPad send; out of scope for FLUSH. + let _ = sender.blocking_send(DataMsg::DropPad { pad: name, generation }); } let _ = self.obj().remove_pad(pad); } @@ -279,21 +312,23 @@ impl ElementImpl for MoqSink { impl MoqSink { fn start_session(&self) -> Result<()> { - let settings = { - let settings = self.settings.lock().unwrap().clone(); - ResolvedSettings::try_from(settings)? + // Synchronous settings validation surfaces as a StateChangeError; async failures go to the bus. + let settings = ResolvedSettings::try_from(self.settings.lock().unwrap().clone())?; + // Seed pads requested before the session existed; the data channel is created inside start(). + let seed = { + let gens = self.pad_generations.lock().unwrap(); + self.obj() + .pads() + .iter() + .map(|pad| { + let name = pad.name().to_string(); + let generation = gens.get(&name).copied().unwrap_or(0); + (name, generation) + }) + .collect() }; - - let (tx, rx) = mpsc::unbounded_channel::(); - let element_weak = self.obj().downgrade(); - - let join = RUNTIME.spawn(async move { - if let Err(err) = run_session(settings, rx, element_weak).await { - gst::error!(CAT, "session error: {err:#}"); - } - }); - - *self.session.lock().unwrap() = Some(SessionHandle { sender: tx, join }); + let handle = SessionHandle::start(settings, self.status.clone(), self.obj().downgrade(), seed); + *self.session.lock().unwrap() = Some(handle); Ok(()) } @@ -303,227 +338,198 @@ impl MoqSink { } } - fn forward_buffer(&self, pad: &gst::Pad, buffer: gst::Buffer) -> Result { + /// Clone the sender, release the lock, then blocking-send: never apply backpressure under the session lock. + fn forward_buffer( + &self, + pad: &gst::Pad, + generation: u64, + flush: &mut watch::Receiver, + buffer: gst::Buffer, + ) -> Result { + // The worker marks a pad failed after rejecting its data; surface that to GStreamer instead of + // silently dropping. Because the worker is async, this lands on the buffer after the bad one. + if self.status.is_failed(pad.name().as_str(), generation) { + return Err(gst::FlowError::NotNegotiated); + } + + // Bound the per-frame allocation before copying: a buffer past the frame limit cannot be + // consumed and would let hostile input drive an unbounded copy. + if buffer.size() > MAX_FRAME_BYTES { + gst::warning!( + CAT, + "rejecting {}-byte buffer on pad {} (exceeds frame limit)", + buffer.size(), + pad.name() + ); + return Err(gst::FlowError::Error); + } + + // Skip the map + copy if the pad is already flushing; send_or_flush re-checks for a flush that + // starts mid-send, but during a flush repeated buffers would otherwise burn a copy each before + // being dropped. + if *flush.borrow() { + return Err(gst::FlowError::Flushing); + } + let sender = self .session .lock() .unwrap() .as_ref() - .map(|handle| handle.sender.clone()) + .map(|handle| handle.sender()) .ok_or(gst::FlowError::Flushing)?; let map = buffer.map_readable().map_err(|_| gst::FlowError::Error)?; - let pts = buffer.pts(); let data = Bytes::copy_from_slice(map.as_slice()); + let pts = buffer.pts(); - sender - .send(ControlMessage::Buffer { - pad_name: pad.name().to_string(), - data, - pts, - }) - .map_err(|_| gst::FlowError::Flushing)?; - - Ok(gst::FlowSuccess::Ok) + let msg = DataMsg::Buffer { + pad: pad.name().to_string(), + generation, + data, + pts, + }; + // FLUSH_START on this pad cuts a send blocked on a full channel (returns Flushing), instead of + // stalling the streaming thread until the relay drains. + match send_or_flush(&sender, msg, flush) { + SendOutcome::Sent => Ok(gst::FlowSuccess::Ok), + SendOutcome::Flushed | SendOutcome::Closed => Err(gst::FlowError::Flushing), + } } - fn handle_event(&self, pad: &gst::Pad, event: gst::Event) -> bool { + fn handle_event( + &self, + pad: &gst::Pad, + generation: u64, + flush: &mut watch::Receiver, + event: gst::Event, + ) -> bool { + let sender = self.session.lock().unwrap().as_ref().map(|handle| handle.sender()); + match event.view() { gst::EventView::Caps(caps) => { - let Some(sender) = self - .session - .lock() - .unwrap() - .as_ref() - .map(|handle| handle.sender.clone()) - else { + let caps = caps.caps(); + // Reject unsupported caps synchronously (NotNegotiated) before handing off to the worker. + if !caps_supported(caps) { + gst::warning!(CAT, "rejecting unsupported caps on pad {}", pad.name()); return false; + } + let Some(sender) = sender else { return false }; + let msg = DataMsg::Caps { + pad: pad.name().to_string(), + generation, + caps: caps.to_owned(), }; - - if sender - .send(ControlMessage::SetCaps { - pad_name: pad.name().to_string(), - caps: caps.caps().to_owned(), - }) - .is_err() - { - return false; + match send_or_flush(&sender, msg, flush) { + SendOutcome::Sent => gst::Pad::event_default(pad, Some(&*self.obj()), event), + SendOutcome::Flushed | SendOutcome::Closed => false, } - - gst::Pad::event_default(pad, Some(&*self.obj()), event) } - gst::EventView::Eos(_) => { - let Some(sender) = self - .session - .lock() - .unwrap() - .as_ref() - .map(|handle| handle.sender.clone()) - else { - return false; + gst::EventView::Segment(segment) => { + let Some(sender) = sender else { return false }; + let msg = DataMsg::Segment { + pad: pad.name().to_string(), + generation, + segment: segment.segment().to_owned(), }; - - if sender - .send(ControlMessage::Eos { - pad_name: pad.name().to_string(), - }) - .is_err() - { - return false; + match send_or_flush(&sender, msg, flush) { + SendOutcome::Sent => gst::Pad::event_default(pad, Some(&*self.obj()), event), + SendOutcome::Flushed | SendOutcome::Closed => false, } - - true } - _ => gst::Pad::event_default(pad, Some(&*self.obj()), event), - } - } -} - -async fn run_session( - settings: ResolvedSettings, - mut rx: mpsc::UnboundedReceiver, - element_weak: gst::glib::WeakRef, -) -> Result<()> { - let mut client_config = moq_native::ClientConfig::default(); - client_config.tls.disable_verify = Some(settings.tls_disable_verify); - - let client = client_config.init()?; - - let origin = moq_net::Origin::random().produce(); - let mut broadcast = moq_net::Broadcast::new().produce(); - let broadcast_consumer = broadcast.consume(); - - let catalog = moq_mux::catalog::Producer::new(&mut broadcast)?; - - anyhow::ensure!( - origin.publish_broadcast(&settings.broadcast, broadcast_consumer), - "failed to publish broadcast {}", - settings.broadcast - ); - - let client = client.with_publish(origin.consume()); - let session = client.connect(settings.url.clone()).await?; - - let mut runtime = RuntimeState { - session, - broadcast, - catalog, - pads: HashMap::new(), - }; - let mut eos_pads = HashSet::new(); - - while let Some(msg) = rx.recv().await { - match msg { - ControlMessage::SetCaps { pad_name, caps } => { - if let Err(err) = handle_caps(&mut runtime, pad_name, caps) { - gst::error!(CAT, "failed to configure pad: {err:#}"); - } + gst::EventView::Eos(_) => { + let Some(sender) = sender else { return false }; + let msg = DataMsg::Eos { + pad: pad.name().to_string(), + generation, + }; + matches!(send_or_flush(&sender, msg, flush), SendOutcome::Sent) } - ControlMessage::Buffer { pad_name, data, pts } => { - if let Err(err) = handle_buffer(&mut runtime, pad_name, data, pts) { - gst::error!(CAT, "failed to publish buffer: {err:#}"); + // FLUSH_START arrives out of band on the flushing thread: flip this pad's watch to cut a send + // blocked in the chain, and tell the worker to re-anchor the timeline. Never blocks. + gst::EventView::FlushStart(_) => { + toggle_pad_flush(&self.pad_flush.lock().unwrap(), pad.name().as_str(), generation, true); + if let Some(handle) = self.session.lock().unwrap().as_ref() { + let _ = handle.flush_sender().send(FlushSignal { + pad: pad.name().to_string(), + generation, + }); } + gst::Pad::event_default(pad, Some(&*self.obj()), event) } - ControlMessage::DropPad { pad_name } => { - runtime.pads.remove(&pad_name); - eos_pads.remove(&pad_name); - } - ControlMessage::Eos { pad_name } => { - eos_pads.insert(pad_name); - - if !runtime.pads.is_empty() && eos_pads.len() == runtime.pads.len() { - if let Some(element) = element_weak.upgrade() { - let eos_message = gst::message::Eos::builder().src(&element).build(); - let _ = element.post_message(eos_message); - } - } + // FLUSH_STOP clears the gate so the chain resumes; the trailing SEGMENT re-anchors the worker. + gst::EventView::FlushStop(_) => { + toggle_pad_flush(&self.pad_flush.lock().unwrap(), pad.name().as_str(), generation, false); + gst::Pad::event_default(pad, Some(&*self.obj()), event) } - ControlMessage::Shutdown => break, + _ => gst::Pad::event_default(pad, Some(&*self.obj()), event), } } - - Ok(()) } -fn handle_caps(runtime: &mut RuntimeState, pad_name: String, caps: gst::Caps) -> Result<()> { - let structure = caps.structure(0).context("empty caps")?; - let decoder: moq_mux::import::Framed = match structure.name().as_str() { - "video/x-h264" => { - let mut bytes = Bytes::new(); - new_decoder(runtime, moq_mux::import::FramedFormat::Avc3, &mut bytes)? - } - "video/x-h265" => { - let mut bytes = Bytes::new(); - new_decoder(runtime, moq_mux::import::FramedFormat::Hev1, &mut bytes)? - } - "video/x-av1" => { - let mut bytes = Bytes::new(); - new_decoder(runtime, moq_mux::import::FramedFormat::Av01, &mut bytes)? - } - "video/x-vp8" => { - let mut bytes = Bytes::new(); - new_decoder(runtime, moq_mux::import::FramedFormat::Vp8, &mut bytes)? - } - "video/x-vp9" => { - let mut bytes = Bytes::new(); - new_decoder(runtime, moq_mux::import::FramedFormat::Vp9, &mut bytes)? - } - "audio/mpeg" => { - let codec_data = structure - .get::("codec_data") - .context("AAC caps missing codec_data")?; - let map = codec_data.map_readable().context("failed to map codec_data")?; - let mut data = Bytes::copy_from_slice(map.as_slice()); - new_decoder(runtime, moq_mux::import::FramedFormat::Aac, &mut data)? +// Flip a pad's FLUSH watch only when the generation matches, so a stale FLUSH from a previous +// incarnation cannot cancel the live pad's sends. Mirrors the worker's generation discipline. +fn toggle_pad_flush(flush: &HashMap)>, name: &str, generation: u64, value: bool) { + if let Some((gen, tx)) = flush.get(name) { + if *gen == generation { + let _ = tx.send(value); } - "audio/x-opus" => { - let channels: i32 = structure.get("channels").unwrap_or(2); - let rate: i32 = structure.get("rate").unwrap_or(48_000); - let channel_count = - u32::try_from(channels).with_context(|| format!("Opus caps has negative channel count {channels}"))?; - let sample_rate = - u32::try_from(rate).with_context(|| format!("Opus caps has negative sample rate {rate}"))?; - let config = moq_mux::codec::opus::Config { - sample_rate, - channel_count, - }; - moq_mux::codec::opus::Import::new(runtime.broadcast.clone(), runtime.catalog.clone(), config)?.into() - } - other => anyhow::bail!("unsupported caps: {}", other), - }; - - runtime.pads.insert( - pad_name, - PadState { - decoder, - reference_pts: None, - }, - ); - Ok(()) + } } -fn new_decoder( - runtime: &mut RuntimeState, - format: moq_mux::import::FramedFormat, - buf: &mut Bytes, -) -> Result { - let decoder = moq_mux::import::Framed::new(runtime.broadcast.clone(), runtime.catalog.clone(), format, buf)?; - Ok(decoder) -} +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use tokio::sync::watch; + + use super::toggle_pad_flush; + + #[test] + fn pad_flush_toggle_ignores_stale_generation() { + let mut map = HashMap::new(); + let (tx, rx) = watch::channel(false); + map.insert("video".to_string(), (1u64, tx)); + // A stale generation must not flip the live pad's watch. + toggle_pad_flush(&map, "video", 0, true); + assert!(!*rx.borrow(), "stale-generation flush is ignored"); + // The current generation flips it. + toggle_pad_flush(&map, "video", 1, true); + assert!(*rx.borrow(), "current-generation flush flips the watch"); + } -fn handle_buffer( - runtime: &mut RuntimeState, - pad_name: String, - mut data: Bytes, - pts: Option, -) -> Result<()> { - let pad = runtime.pads.get_mut(&pad_name).context("pad not configured")?; - - let ts = pts.and_then(|pts| { - let reference = *pad.reference_pts.get_or_insert(pts); - let relative = pts.checked_sub(reference)?; - hang::container::Timestamp::from_micros(relative.nseconds() / 1000).ok() - }); - - pad.decoder.decode_frame(&mut data, ts).map_err(|e| anyhow::anyhow!(e)) + // A failed pad add must leave membership untouched. request_new_pad adds the pad before announcing + // AddPad or mutating the maps, so a duplicate name (or a concurrent request that lost the race) that + // fails add_pad does not corrupt the live pad: its generation and flush sender survive. Announcing + // for a name already held would otherwise make the worker finalize the live pad's producer and + // overwrite its generation. Exercised via two direct request_new_pad calls (the concurrent path). + #[test] + fn failed_duplicate_pad_keeps_membership_consistent() { + use gst::prelude::*; + use gst::subclass::prelude::*; + gst::init().unwrap(); + + let obj = gst::glib::Object::new::(); + let imp = obj.imp(); + let templ = obj.pad_template("sink_%u").expect("sink template"); + + let p0 = imp.request_new_pad(&templ, Some("sink_0"), None); + let p1 = imp.request_new_pad(&templ, Some("sink_0"), None); + assert!(p0.is_some(), "first request succeeds"); + assert!(p1.is_none(), "duplicate name fails to add"); + + let gen = imp.pad_generations.lock().unwrap().get("sink_0").copied(); + let flush_gen = imp.pad_flush.lock().unwrap().get("sink_0").map(|(g, _)| *g); + // After the failed duplicate the maps must still describe the LIVE pad (generation 0). + assert_eq!( + gen, + Some(0), + "live pad's generation must survive a failed duplicate (got {gen:?})" + ); + assert_eq!( + flush_gen, + Some(0), + "live pad's flush sender must survive a failed duplicate (got {flush_gen:?})" + ); + } } diff --git a/rs/moq-gst/src/sink/mod.rs b/rs/moq-gst/src/sink/mod.rs index 80d890a1a..565efb3d9 100644 --- a/rs/moq-gst/src/sink/mod.rs +++ b/rs/moq-gst/src/sink/mod.rs @@ -2,6 +2,8 @@ use gst::glib; use gst::prelude::*; mod imp; +mod session; +mod timeline; glib::wrapper! { pub struct MoqSink(ObjectSubclass) @extends gst::Element, gst::Object; diff --git a/rs/moq-gst/src/spike/session.rs b/rs/moq-gst/src/sink/session.rs similarity index 99% rename from rs/moq-gst/src/spike/session.rs rename to rs/moq-gst/src/sink/session.rs index 7267dca19..4b6e4bc5c 100644 --- a/rs/moq-gst/src/spike/session.rs +++ b/rs/moq-gst/src/sink/session.rs @@ -13,7 +13,7 @@ use hang::moq_net; use moq_mux::import::{Framed, FramedFormat}; use super::timeline::{classify_segment, frame_micros, FrameDecision, SegmentDecision, SegmentInfo}; -use super::MoqSinkSpike as Element; +use super::MoqSink as Element; static RUNTIME: LazyLock = LazyLock::new(|| { tokio::runtime::Builder::new_multi_thread() @@ -23,7 +23,7 @@ static RUNTIME: LazyLock = LazyLock::new(|| { }); pub static CAT: LazyLock = - LazyLock::new(|| gst::DebugCategory::new("moq-sink-spike", gst::DebugColorFlags::empty(), Some("MoQ Sink spike"))); + LazyLock::new(|| gst::DebugCategory::new("moq-sink", gst::DebugColorFlags::empty(), Some("MoQ Sink Element"))); /// Handoff, not a buffer: a full channel must block the streaming thread, not grow. const DATA_CHANNEL_BOUND: usize = 8; @@ -887,7 +887,7 @@ impl Pad { } } -/// Media types the spike can build a producer for. Checked synchronously at the event boundary (an +/// Media types moqsink can build a producer for. Checked synchronously at the event boundary (an /// unsupported caps is rejected with NotNegotiated) and again in `set_caps`. Per-codec specifics /// (byte-stream/au, AAC codec_data) are enforced when the producer is built, so a bad detail fails /// that pad, not the session. diff --git a/rs/moq-gst/src/spike/timeline.rs b/rs/moq-gst/src/sink/timeline.rs similarity index 100% rename from rs/moq-gst/src/spike/timeline.rs rename to rs/moq-gst/src/sink/timeline.rs diff --git a/rs/moq-gst/src/spike/imp.rs b/rs/moq-gst/src/spike/imp.rs deleted file mode 100644 index 43e131051..000000000 --- a/rs/moq-gst/src/spike/imp.rs +++ /dev/null @@ -1,535 +0,0 @@ -//! GObject shell, deliberately a parallel element (`moqsinkspike`) so production `moqsink` is untouched. - -use std::collections::HashMap; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::{Arc, LazyLock, Mutex}; - -use anyhow::{Context, Result}; -use bytes::Bytes; -use gst::glib; -use gst::prelude::*; -use gst::subclass::prelude::*; -use tokio::sync::watch; - -use super::session::{ - caps_supported, send_or_flush, DataMsg, FlushSignal, ResolvedSettings, SendOutcome, SessionHandle, Status, CAT, -}; - -/// Reject a frame past the MoQ frame limit (moq-net's MAX_FRAME_SIZE, 16 MiB): it could not be -/// consumed anyway, and copying it would let hostile input drive an unbounded allocation. -const MAX_FRAME_BYTES: usize = 16 * 1024 * 1024; - -#[derive(Debug, Clone, Default)] -struct Settings { - url: Option, - broadcast: Option, - tls_disable_verify: bool, -} - -impl TryFrom for ResolvedSettings { - type Error = anyhow::Error; - - fn try_from(value: Settings) -> Result { - Ok(Self { - url: url::Url::parse(value.url.as_ref().context("url property is required")?)?, - broadcast: value - .broadcast - .as_ref() - .context("broadcast property is required")? - .clone(), - tls_disable_verify: value.tls_disable_verify, - }) - } -} - -#[derive(Default)] -pub struct MoqSinkSpike { - settings: Mutex, - session: Mutex>, - status: Arc, - // Monotonic per-pad generation; a pad recreated with the same name gets a fresh value so the - // worker discards the previous incarnation's in-flight messages. - next_generation: AtomicU64, - pad_generations: Mutex>, - // Per-pad FLUSH gate: FLUSH_START flips it true to cut a blocked send (the chain holds a cloned - // receiver); FLUSH_STOP flips it false. Per-pad so flushing one input never cancels another's send. - // Keyed with the generation so a stale FLUSH from a previous incarnation cannot flip the live pad. - pad_flush: Mutex)>>, -} - -#[glib::object_subclass] -impl ObjectSubclass for MoqSinkSpike { - const NAME: &'static str = "MoqSinkSpike"; - type Type = super::MoqSinkSpike; - type ParentType = gst::Element; -} - -impl ObjectImpl for MoqSinkSpike { - fn properties() -> &'static [glib::ParamSpec] { - static PROPS: LazyLock> = LazyLock::new(|| { - vec![ - glib::ParamSpecString::builder("url") - .nick("Source URL") - .blurb("Connect to the given URL") - .build(), - glib::ParamSpecString::builder("broadcast") - .nick("Broadcast") - .blurb("The name of the broadcast to publish") - .build(), - glib::ParamSpecBoolean::builder("tls-disable-verify") - .nick("TLS disable verify") - .blurb("Disable TLS verification") - .default_value(false) - .build(), - // Read-only, served from the shared Status the task writes. - glib::ParamSpecBoolean::builder("connected") - .nick("Connected") - .blurb("Whether the session is currently connected") - .read_only() - .build(), - glib::ParamSpecString::builder("moq-version") - .nick("Negotiated version") - .blurb("The negotiated MoQ protocol version, null when disconnected") - .read_only() - .build(), - glib::ParamSpecUInt64::builder("estimated-send-bitrate") - .nick("Estimated send bitrate") - .blurb("Estimated send bitrate in bits per second (congestion controller), 0 when unavailable") - .read_only() - .build(), - ] - }); - PROPS.as_ref() - } - - fn set_property(&self, _id: usize, value: &glib::Value, pspec: &glib::ParamSpec) { - let mut settings = self.settings.lock().unwrap(); - match pspec.name() { - "url" => settings.url = value.get().unwrap(), - "broadcast" => settings.broadcast = value.get().unwrap(), - "tls-disable-verify" => settings.tls_disable_verify = value.get().unwrap(), - _ => unreachable!(), - } - } - - fn property(&self, _id: usize, pspec: &glib::ParamSpec) -> glib::Value { - match pspec.name() { - "connected" => self.status.connected().to_value(), - "moq-version" => self.status.version().to_value(), - "estimated-send-bitrate" => self.status.send_bitrate().to_value(), - name => { - let settings = self.settings.lock().unwrap(); - match name { - "url" => settings.url.to_value(), - "broadcast" => settings.broadcast.to_value(), - "tls-disable-verify" => settings.tls_disable_verify.to_value(), - _ => unreachable!(), - } - } - } - } - - fn constructed(&self) { - self.parent_constructed(); - self.obj().set_element_flags(gst::ElementFlags::SINK); - } -} - -impl GstObjectImpl for MoqSinkSpike {} - -impl ElementImpl for MoqSinkSpike { - fn metadata() -> Option<&'static gst::subclass::ElementMetadata> { - static METADATA: LazyLock = LazyLock::new(|| { - gst::subclass::ElementMetadata::new( - "MoQ Sink (spike)", - "Sink/Network/MoQ", - "Transmits media over MoQ (spike)", - "Ariel Molina ", - ) - }); - Some(&*METADATA) - } - - fn pad_templates() -> &'static [gst::PadTemplate] { - static PAD_TEMPLATES: LazyLock> = LazyLock::new(|| { - // Every codec that converges on moq_mux::import::Framed; per-codec specifics are validated - // when the producer is built from caps, not here. - let mut caps = gst::Caps::new_empty(); - caps.merge( - gst::Caps::builder("video/x-h264") - .field("stream-format", "byte-stream") - .field("alignment", "au") - .build(), - ); - caps.merge( - gst::Caps::builder("video/x-h265") - .field("stream-format", "byte-stream") - .field("alignment", "au") - .build(), - ); - caps.merge(gst::Caps::builder("video/x-av1").build()); - caps.merge(gst::Caps::builder("video/x-vp8").build()); - caps.merge(gst::Caps::builder("video/x-vp9").build()); - caps.merge( - gst::Caps::builder("audio/mpeg") - .field("mpegversion", 4i32) - .field("stream-format", "raw") - .build(), - ); - caps.merge(gst::Caps::builder("audio/x-opus").build()); - - let templ = - gst::PadTemplate::new("sink_%u", gst::PadDirection::Sink, gst::PadPresence::Request, &caps).unwrap(); - vec![templ] - }); - PAD_TEMPLATES.as_ref() - } - - fn request_new_pad( - &self, - templ: &gst::PadTemplate, - name: Option<&str>, - _caps: Option<&gst::Caps>, - ) -> Option { - // Fixed per pad incarnation and captured here, so a buffer in flight from a released pad never - // reads a successor's generation. - let generation = self.next_generation.fetch_add(1, Ordering::Relaxed); - // One flush watch per pad: the sender lives in `pad_flush` (toggled by FLUSH_START/STOP), each - // pad function carries its own cloned receiver to cancel a blocked send. - let (flush_tx, flush_rx) = watch::channel(false); - let chain_flush = flush_rx.clone(); - let event_flush = flush_rx; - let pad_builder = gst::Pad::builder_from_template(templ) - .chain_function(move |pad, parent, buffer| { - let mut flush = chain_flush.clone(); - MoqSinkSpike::catch_panic_pad_function( - parent, - || Err(gst::FlowError::Error), - |sink| sink.forward_buffer(pad, generation, &mut flush, buffer), - ) - }) - .event_function(move |pad, parent, event| { - let mut flush = event_flush.clone(); - MoqSinkSpike::catch_panic_pad_function( - parent, - || false, - |sink| sink.handle_event(pad, generation, &mut flush, event), - ) - }); - - let pad = match name { - Some(name) => pad_builder.name(name).build(), - None => pad_builder.generated_name().build(), - }; - - // Populate the maps BEFORE the pad is visible to GStreamer, so a concurrent start_session seed - // never sees a pad without its generation. Capture the previous holders so a failed add_pad (a - // duplicate name, or a concurrent request that lost the race) rolls back without orphaning the - // live pad, and announce AddPad only after add_pad succeeds so the worker never gets a phantom. - // - // Known limitation (documented, not closed): two concurrent requests for the SAME name racing a - // start_session seed can leave the seed reading the loser's generation for the winner's pad, so - // the winner's events are then dropped as stale. Closing it would mean holding a lock across - // add_pad, which emits pad-added synchronously and would deadlock a reentrant handler; that risk - // is worse than the bug, whose trigger (same-name concurrent request_new_pad) apps do not produce. - let name = pad.name().to_string(); - let prev_gen = self.pad_generations.lock().unwrap().insert(name.clone(), generation); - let prev_flush = self - .pad_flush - .lock() - .unwrap() - .insert(name.clone(), (generation, flush_tx)); - - if self.obj().add_pad(&pad).is_err() { - // Roll back, but only if this attempt still owns the entry. A concurrent same-name request - // that won add_pad may have overwritten it; restoring our captured `prev` (or removing) would - // then clobber the live pad it just registered. Touch the maps only while they still hold our - // own generation. - { - let mut gens = self.pad_generations.lock().unwrap(); - if gens.get(name.as_str()) == Some(&generation) { - match prev_gen { - Some(g) => gens.insert(name.clone(), g), - None => gens.remove(&name), - }; - } - } - { - let mut flushes = self.pad_flush.lock().unwrap(); - if flushes.get(name.as_str()).map(|(g, _)| *g) == Some(generation) { - match prev_flush { - Some(f) => flushes.insert(name.clone(), f), - None => flushes.remove(&name), - }; - } - } - return None; - } - - // A request pad is linked by the caller only after this returns, so its CAPS/buffers cannot reach - // the worker ahead of this AddPad. - let sender = self.session.lock().unwrap().as_ref().map(SessionHandle::sender); - if let Some(sender) = sender { - let _ = sender.blocking_send(DataMsg::AddPad { pad: name, generation }); - } - Some(pad) - } - - fn release_pad(&self, pad: &gst::Pad) { - let name = pad.name().to_string(); - let generation = self.pad_generations.lock().unwrap().remove(&name); - // Dropping the watch sender wakes any send still blocked on this pad (changed() errors -> Flushing). - self.pad_flush.lock().unwrap().remove(&name); - // Drop the session guard before blocking_send: holding it across a full-channel block deadlocks stop_session. - let sender = { - let session = self.session.lock().unwrap(); - session.as_ref().map(SessionHandle::sender) - }; - if let (Some(sender), Some(generation)) = (sender, generation) { - // Uncancellable: a full data channel or an in-progress connect stalls release here, and the - // per-pad flush watch (removed just above) cannot cut it. Known control-path-blocking limit, - // shared with the AddPad send; out of scope for FLUSH. - let _ = sender.blocking_send(DataMsg::DropPad { pad: name, generation }); - } - let _ = self.obj().remove_pad(pad); - } - - fn change_state(&self, transition: gst::StateChange) -> Result { - match transition { - gst::StateChange::ReadyToPaused => { - self.start_session().map_err(|err| { - gst::error!(CAT, obj = self.obj(), "failed to start session: {err:#}"); - gst::StateChangeError - })?; - } - gst::StateChange::PausedToReady => self.stop_session(), - _ => (), - } - - self.parent_change_state(transition) - } -} - -impl MoqSinkSpike { - fn start_session(&self) -> Result<()> { - // Synchronous settings validation surfaces as a StateChangeError; async failures go to the bus. - let settings = ResolvedSettings::try_from(self.settings.lock().unwrap().clone())?; - // Seed pads requested before the session existed; the data channel is created inside start(). - let seed = { - let gens = self.pad_generations.lock().unwrap(); - self.obj() - .pads() - .iter() - .map(|pad| { - let name = pad.name().to_string(); - let generation = gens.get(&name).copied().unwrap_or(0); - (name, generation) - }) - .collect() - }; - let handle = SessionHandle::start(settings, self.status.clone(), self.obj().downgrade(), seed); - *self.session.lock().unwrap() = Some(handle); - Ok(()) - } - - fn stop_session(&self) { - if let Some(handle) = self.session.lock().unwrap().take() { - handle.stop(); - } - } - - /// Clone the sender, release the lock, then blocking-send: never apply backpressure under the session lock. - fn forward_buffer( - &self, - pad: &gst::Pad, - generation: u64, - flush: &mut watch::Receiver, - buffer: gst::Buffer, - ) -> Result { - // The worker marks a pad failed after rejecting its data; surface that to GStreamer instead of - // silently dropping. Because the worker is async, this lands on the buffer after the bad one. - if self.status.is_failed(pad.name().as_str(), generation) { - return Err(gst::FlowError::NotNegotiated); - } - - // Bound the per-frame allocation before copying: a buffer past the frame limit cannot be - // consumed and would let hostile input drive an unbounded copy. - if buffer.size() > MAX_FRAME_BYTES { - gst::warning!( - CAT, - "rejecting {}-byte buffer on pad {} (exceeds frame limit)", - buffer.size(), - pad.name() - ); - return Err(gst::FlowError::Error); - } - - // Skip the map + copy if the pad is already flushing; send_or_flush re-checks for a flush that - // starts mid-send, but during a flush repeated buffers would otherwise burn a copy each before - // being dropped. - if *flush.borrow() { - return Err(gst::FlowError::Flushing); - } - - let sender = self - .session - .lock() - .unwrap() - .as_ref() - .map(|handle| handle.sender()) - .ok_or(gst::FlowError::Flushing)?; - - let map = buffer.map_readable().map_err(|_| gst::FlowError::Error)?; - let data = Bytes::copy_from_slice(map.as_slice()); - let pts = buffer.pts(); - - let msg = DataMsg::Buffer { - pad: pad.name().to_string(), - generation, - data, - pts, - }; - // FLUSH_START on this pad cuts a send blocked on a full channel (returns Flushing), instead of - // stalling the streaming thread until the relay drains. - match send_or_flush(&sender, msg, flush) { - SendOutcome::Sent => Ok(gst::FlowSuccess::Ok), - SendOutcome::Flushed | SendOutcome::Closed => Err(gst::FlowError::Flushing), - } - } - - fn handle_event( - &self, - pad: &gst::Pad, - generation: u64, - flush: &mut watch::Receiver, - event: gst::Event, - ) -> bool { - let sender = self.session.lock().unwrap().as_ref().map(|handle| handle.sender()); - - match event.view() { - gst::EventView::Caps(caps) => { - let caps = caps.caps(); - // Reject unsupported caps synchronously (NotNegotiated) before handing off to the worker. - if !caps_supported(caps) { - gst::warning!(CAT, "rejecting unsupported caps on pad {}", pad.name()); - return false; - } - let Some(sender) = sender else { return false }; - let msg = DataMsg::Caps { - pad: pad.name().to_string(), - generation, - caps: caps.to_owned(), - }; - match send_or_flush(&sender, msg, flush) { - SendOutcome::Sent => gst::Pad::event_default(pad, Some(&*self.obj()), event), - SendOutcome::Flushed | SendOutcome::Closed => false, - } - } - gst::EventView::Segment(segment) => { - let Some(sender) = sender else { return false }; - let msg = DataMsg::Segment { - pad: pad.name().to_string(), - generation, - segment: segment.segment().to_owned(), - }; - match send_or_flush(&sender, msg, flush) { - SendOutcome::Sent => gst::Pad::event_default(pad, Some(&*self.obj()), event), - SendOutcome::Flushed | SendOutcome::Closed => false, - } - } - gst::EventView::Eos(_) => { - let Some(sender) = sender else { return false }; - let msg = DataMsg::Eos { - pad: pad.name().to_string(), - generation, - }; - matches!(send_or_flush(&sender, msg, flush), SendOutcome::Sent) - } - // FLUSH_START arrives out of band on the flushing thread: flip this pad's watch to cut a send - // blocked in the chain, and tell the worker to re-anchor the timeline. Never blocks. - gst::EventView::FlushStart(_) => { - toggle_pad_flush(&self.pad_flush.lock().unwrap(), pad.name().as_str(), generation, true); - if let Some(handle) = self.session.lock().unwrap().as_ref() { - let _ = handle.flush_sender().send(FlushSignal { - pad: pad.name().to_string(), - generation, - }); - } - gst::Pad::event_default(pad, Some(&*self.obj()), event) - } - // FLUSH_STOP clears the gate so the chain resumes; the trailing SEGMENT re-anchors the worker. - gst::EventView::FlushStop(_) => { - toggle_pad_flush(&self.pad_flush.lock().unwrap(), pad.name().as_str(), generation, false); - gst::Pad::event_default(pad, Some(&*self.obj()), event) - } - _ => gst::Pad::event_default(pad, Some(&*self.obj()), event), - } - } -} - -// Flip a pad's FLUSH watch only when the generation matches, so a stale FLUSH from a previous -// incarnation cannot cancel the live pad's sends. Mirrors the worker's generation discipline. -fn toggle_pad_flush(flush: &HashMap)>, name: &str, generation: u64, value: bool) { - if let Some((gen, tx)) = flush.get(name) { - if *gen == generation { - let _ = tx.send(value); - } - } -} - -#[cfg(test)] -mod tests { - use std::collections::HashMap; - - use tokio::sync::watch; - - use super::toggle_pad_flush; - - #[test] - fn pad_flush_toggle_ignores_stale_generation() { - let mut map = HashMap::new(); - let (tx, rx) = watch::channel(false); - map.insert("video".to_string(), (1u64, tx)); - // A stale generation must not flip the live pad's watch. - toggle_pad_flush(&map, "video", 0, true); - assert!(!*rx.borrow(), "stale-generation flush is ignored"); - // The current generation flips it. - toggle_pad_flush(&map, "video", 1, true); - assert!(*rx.borrow(), "current-generation flush flips the watch"); - } - - // A failed pad add must leave membership untouched. request_new_pad adds the pad before announcing - // AddPad or mutating the maps, so a duplicate name (or a concurrent request that lost the race) that - // fails add_pad does not corrupt the live pad: its generation and flush sender survive. Announcing - // for a name already held would otherwise make the worker finalize the live pad's producer and - // overwrite its generation. Exercised via two direct request_new_pad calls (the concurrent path). - #[test] - fn failed_duplicate_pad_keeps_membership_consistent() { - use gst::prelude::*; - use gst::subclass::prelude::*; - gst::init().unwrap(); - - let obj = gst::glib::Object::new::(); - let imp = obj.imp(); - let templ = obj.pad_template("sink_%u").expect("sink template"); - - let p0 = imp.request_new_pad(&templ, Some("sink_0"), None); - let p1 = imp.request_new_pad(&templ, Some("sink_0"), None); - assert!(p0.is_some(), "first request succeeds"); - assert!(p1.is_none(), "duplicate name fails to add"); - - let gen = imp.pad_generations.lock().unwrap().get("sink_0").copied(); - let flush_gen = imp.pad_flush.lock().unwrap().get("sink_0").map(|(g, _)| *g); - // After the failed duplicate the maps must still describe the LIVE pad (generation 0). - assert_eq!( - gen, - Some(0), - "live pad's generation must survive a failed duplicate (got {gen:?})" - ); - assert_eq!( - flush_gen, - Some(0), - "live pad's flush sender must survive a failed duplicate (got {flush_gen:?})" - ); - } -} diff --git a/rs/moq-gst/src/spike/mod.rs b/rs/moq-gst/src/spike/mod.rs deleted file mode 100644 index ba4929020..000000000 --- a/rs/moq-gst/src/spike/mod.rs +++ /dev/null @@ -1,19 +0,0 @@ -use gst::glib; -use gst::prelude::*; - -mod imp; -mod session; -mod timeline; - -glib::wrapper! { - pub struct MoqSinkSpike(ObjectSubclass) @extends gst::Element, gst::Object; -} - -pub fn register(plugin: &gst::Plugin) -> Result<(), glib::BoolError> { - gst::Element::register( - Some(plugin), - "moqsinkspike", - gst::Rank::NONE, - MoqSinkSpike::static_type(), - ) -} diff --git a/rs/moq-gst/tests/element.rs b/rs/moq-gst/tests/element.rs index 9d5925390..cb5b32446 100644 --- a/rs/moq-gst/tests/element.rs +++ b/rs/moq-gst/tests/element.rs @@ -19,9 +19,7 @@ fn init() { #[test] fn request_and_release_sink_pads() { init(); - let sink = gst::ElementFactory::make("moqsinkspike") - .build() - .expect("create moqsinkspike"); + let sink = gst::ElementFactory::make("moqsink").build().expect("create moqsink"); let pad0 = sink.request_pad_simple("sink_0").expect("request sink_0"); assert_eq!(pad0.name().as_str(), "sink_0"); @@ -38,9 +36,7 @@ fn request_and_release_sink_pads() { #[test] fn missing_url_fails_state_change() { init(); - let sink = gst::ElementFactory::make("moqsinkspike") - .build() - .expect("create moqsinkspike"); + let sink = gst::ElementFactory::make("moqsink").build().expect("create moqsink"); assert!( sink.set_state(gst::State::Paused).is_err(), "a missing url must fail the Ready->Paused state change" @@ -54,11 +50,11 @@ fn missing_url_fails_state_change() { fn connect_failure_posts_error_to_bus() { init(); let pipeline = gst::Pipeline::new(); - let sink = gst::ElementFactory::make("moqsinkspike") + let sink = gst::ElementFactory::make("moqsink") .property("url", "https://nonexistent.invalid:443") .property("broadcast", "test") .build() - .expect("create moqsinkspike"); + .expect("create moqsink"); pipeline.add(&sink).expect("add sink to pipeline"); let _ = pipeline.set_state(gst::State::Playing); From a8583089a59226321560b47bfd9986d816fe19b3 Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Wed, 17 Jun 2026 10:41:03 -0600 Subject: [PATCH 11/13] moq-gst: harden the moqsink core (FLUSH restart, release/EOS, error isolation) Post-review fixes to the rewritten moqsink, all internal to rs/moq-gst: - The FLUSH watch is keyed to the pad's lifetime, not the session's. A FLUSH_START whose FLUSH_STOP never arrived before a PAUSED->READY->PAUSED restart left that pad muted across the new session. Clear every watch at session start (reset_pad_flushes), before the new session is visible. - A pad release is reconfiguration, not end-of-stream: DropPad no longer posts element EOS. The genuine "all pads ended" completion still fires from the EOS handler, so drop_pad now returns (). - A failed finish() at EOS is isolated to that pad, like a per-pad data error, instead of escalating to a session error that kills the whole broadcast. This splits fail_pad into mark_pad_failed_terminal (mark failed + terminal, no producer touch) so the EOS path never double-finalizes, and Pad::finalize takes the producer up front (let-else) to make its attempt-once contract explicit. - handle_data_msg returns Result instead of Result>; the only variant it could produce was Ended, so run_loop now constructs every ExitReason in one place. - Fix a comment on the worker's biased select: FLUSH_START arrives out of band (a different thread), so the bias only wins a readiness tie, not a happens-before against the bounded data channel. CONTEXT Rejected alternatives: - FLUSH restart: a "check session first" guard in the FlushStart handler. It narrows a window that reset-on-start already closes fully and reinforces a wrong race-condition reading; dropped for the single reset. - EOS finalize error: calling fail_pad directly. It would re-finalize the producer; the mark_pad_failed_terminal split avoids the double-finalize. Key decisions: - Isolating finalize errors per pad matches how mid-stream caps/buffer errors are already handled; genuinely fatal failures still surface via the session, catalog, and transport paths. - The send_or_flush post-permit window (one pre-flush buffer can still enqueue) is accepted and documented in place: the worker re-anchor absorbs it. Verified: 65 unit + 3 element tests, cargo clippy -D warnings, cargo fmt. Memory-leak soak (moqsink does not leak): a 35 min x264->moqsink run held RSS flat with 0 leaked gst objects; a ~12 h msdk->moqsink vs msdk->fakesink bisection showed identical flat RSS, so moqsink adds no leak over the encoder. Co-Authored-By: Claude Opus 4.8 --- rs/moq-gst/src/sink/imp.rs | 42 ++++++++- rs/moq-gst/src/sink/session.rs | 163 ++++++++++++++++++++++++--------- 2 files changed, 160 insertions(+), 45 deletions(-) diff --git a/rs/moq-gst/src/sink/imp.rs b/rs/moq-gst/src/sink/imp.rs index 22f051d6c..f7cb0972b 100644 --- a/rs/moq-gst/src/sink/imp.rs +++ b/rs/moq-gst/src/sink/imp.rs @@ -328,6 +328,8 @@ impl MoqSink { .collect() }; let handle = SessionHandle::start(settings, self.status.clone(), self.obj().downgrade(), seed); + // Clear any watch left armed by a previous incarnation before the new session is visible. + reset_pad_flushes(&self.pad_flush.lock().unwrap()); *self.session.lock().unwrap() = Some(handle); Ok(()) } @@ -477,13 +479,51 @@ fn toggle_pad_flush(flush: &HashMap)>, name: & } } +// Clear every pad's FLUSH watch at session start. The watch is keyed to the pad's lifetime, not the +// session's, so a FLUSH_START whose FLUSH_STOP never arrived before a restart would otherwise keep that +// pad muted across the new session. `false` is always safe here; a later live FLUSH_START re-arms it. +fn reset_pad_flushes(flush: &HashMap)>) { + for (_, tx) in flush.values() { + let _ = tx.send(false); + } +} + #[cfg(test)] mod tests { use std::collections::HashMap; use tokio::sync::watch; - use super::toggle_pad_flush; + use super::{reset_pad_flushes, toggle_pad_flush}; + + // A FLUSH_START left unpaired by a previous incarnation (stopped before its FLUSH_STOP) leaves the + // pad's watch armed; the watch outlives the session, so a new session must clear it or the pad stays + // muted across the restart. Exercised over a real pad registered by request_new_pad. + #[test] + fn session_start_clears_a_pads_stuck_flush_watch() { + use gst::prelude::*; + use gst::subclass::prelude::*; + gst::init().unwrap(); + + let obj = gst::glib::Object::new::(); + let imp = obj.imp(); + let templ = obj.pad_template("sink_%u").expect("sink template"); + imp.request_new_pad(&templ, Some("sink_0"), None) + .expect("request sink_0"); + + // Arm the watch as an unpaired FLUSH_START would, then take an independent receiver to observe it. + let rx = { + let map = imp.pad_flush.lock().unwrap(); + let (_, tx) = map.get("sink_0").expect("pad is registered"); + tx.send(true).unwrap(); + tx.subscribe() + }; + assert!(*rx.borrow(), "watch is armed before session start"); + + // The reset start_session runs must clear it. + reset_pad_flushes(&imp.pad_flush.lock().unwrap()); + assert!(!*rx.borrow(), "session start cleared the stuck watch"); + } #[test] fn pad_flush_toggle_ignores_stale_generation() { diff --git a/rs/moq-gst/src/sink/session.rs b/rs/moq-gst/src/sink/session.rs index 4b6e4bc5c..23f03bad2 100644 --- a/rs/moq-gst/src/sink/session.rs +++ b/rs/moq-gst/src/sink/session.rs @@ -443,9 +443,10 @@ async fn run_loop( loop { tokio::select! { - // Biased so a pending FLUSH re-anchors before the post-flush SEGMENT it precedes: the flush - // signal is enqueued before that SEGMENT on the streaming thread, and biased preserves that - // order across the two channels. Shutdown/death still preempt everything. + // Biased so a ready FLUSH wins a readiness tie and re-anchors the pad before its queued data is + // processed. FLUSH_START is out of band (a different thread), so this is not a happens-before + // against the bounded data channel; the pad goes NoSegment and drops until its next valid + // SEGMENT, which absorbs any residual reorder. Shutdown/death still preempt everything. biased; // Local close: quiet stop, no ERROR. _ = shutdown.changed() => return Ok(ExitReason::Stopped), @@ -467,29 +468,8 @@ async fn run_loop( None => send_bandwidth = None, }, msg = data.recv() => match msg { - Some(DataMsg::AddPad { pad, generation }) => { - // add_pad clears any stale failure for a fresh incarnation (keyed by this session). - pad_set.add_pad(&pad, generation); - } - Some(DataMsg::Caps { pad, generation, caps }) => { - if pad_set.caps(&pad, generation, &caps)? { - return Ok(ExitReason::Ended); - } - } - Some(DataMsg::Segment { pad, generation, segment }) => pad_set.segment(&pad, generation, segment), - Some(DataMsg::Buffer { pad, generation, data, pts }) => { - if pad_set.buffer(&pad, generation, data, pts)? { - return Ok(ExitReason::Ended); - } - } - Some(DataMsg::Eos { pad, generation }) => { - if pad_set.eos(&pad, generation)? { - return Ok(ExitReason::Ended); - } - } - // A release can complete the element if the remaining pads have all ended. - Some(DataMsg::DropPad { pad, generation }) => { - if pad_set.drop_pad(&pad, generation) { + Some(msg) => { + if handle_data_msg(pad_set, msg)? { return Ok(ExitReason::Ended); } } @@ -500,6 +480,41 @@ async fn run_loop( } } +/// Apply one data-channel message to the pad set, returning `true` when the element is now complete. +/// Completion is driven only by EOS aggregation (and a pad failure that ends the last open pad); a pad +/// release is reconfiguration and never completes the element. +fn handle_data_msg(pad_set: &mut PadSet, msg: DataMsg) -> Result { + let complete = match msg { + // add_pad clears any stale failure for a fresh incarnation (keyed by this session). + DataMsg::AddPad { pad, generation } => { + pad_set.add_pad(&pad, generation); + false + } + DataMsg::Caps { pad, generation, caps } => pad_set.caps(&pad, generation, &caps)?, + DataMsg::Segment { + pad, + generation, + segment, + } => { + pad_set.segment(&pad, generation, segment); + false + } + DataMsg::Buffer { + pad, + generation, + data, + pts, + } => pad_set.buffer(&pad, generation, data, pts)?, + DataMsg::Eos { pad, generation } => pad_set.eos(&pad, generation)?, + // A release is reconfiguration, not EOS: drop the pad without completing the element. + DataMsg::DropPad { pad, generation } => { + pad_set.drop_pad(&pad, generation); + false + } + }; + Ok(complete) +} + /// Running time is shared, not per-pad, so there is no anchor to drift. struct PadSet { broadcast: moq_net::BroadcastProducer, @@ -587,6 +602,12 @@ impl PadSet { gst::warning!(CAT, "finalize on failed pad {pad}: {err:?}"); } } + self.mark_pad_failed_terminal(pad, generation); + } + + /// Marks a pad failed in the shared Status and terminal for EOS aggregation, without touching its + /// producer; the caller has already finalized or dropped it. + fn mark_pad_failed_terminal(&mut self, pad: &str, generation: u64) { self.status.mark_failed(self.session_generation, pad, generation); // The failed pad's track is finalized, so it is terminal for EOS aggregation (counts as ended). self.eos.insert(pad.to_string()); @@ -653,18 +674,27 @@ impl PadSet { if self.eos.contains(pad) { return Ok(self.all_ended()); } - if let Some(p) = self.pads.get_mut(pad) { - p.finalize()?; + // A failed finish() at EOS is isolated to this pad, like a per-pad data error: mark it + // failed-terminal instead of escalating to a session error, so the other pads still complete. + match self.pads.get_mut(pad).map(Pad::finalize) { + Some(Ok(_)) | None => { + self.eos.insert(pad.to_string()); + } + Some(Err(err)) => { + gst::warning!(CAT, "finalize on EOS {pad}: {err:?}"); + self.pads.remove(pad); + self.mark_pad_failed_terminal(pad, generation); + } } - self.eos.insert(pad.to_string()); Ok(self.all_ended()) } - /// Returns whether the remaining active pads have all ended (a release can complete the element). - fn drop_pad(&mut self, pad: &str, generation: u64) -> bool { + /// Removes a released pad from membership. A release is reconfiguration, not end-of-stream, so it + /// never completes the element; element EOS comes only from every active pad reaching EOS. + fn drop_pad(&mut self, pad: &str, generation: u64) { if self.active.get(pad) != Some(&generation) { gst::warning!(CAT, "DropPad for stale or unknown pad {pad}, ignoring"); - return false; + return; } if let Some(mut p) = self.pads.remove(pad) { if let Err(err) = p.finalize() { @@ -673,7 +703,6 @@ impl PadSet { } self.active.remove(pad); self.eos.remove(pad); - self.all_ended() } /// Idempotent (skips already-finalized pads); the returned order proves "catalog last". `Err` means @@ -872,18 +901,18 @@ impl Pad { /// Consumes the producer so a second call is a no-op (`Framed::finish()` is not idempotent). fn finalize(&mut self) -> Result { - match self.framed.take() { - Some(mut framed) => { - // A lazy codec (H.265/AV1/VP8/VP9) given CAPS but no frame never created its track, so - // there is nothing to flush and finish() would error "not initialized". track() is Ok only - // once a track exists; a real finish error on an initialized one still surfaces. - if framed.track().is_ok() { - framed.finish()?; - } - Ok(true) - } - None => Ok(false), + // take() up front makes this attempt-once: after a failed finish() the producer is already gone, + // so a later finalize (fail_pad, finalize_all) is a no-op and never double-finishes. + let Some(mut framed) = self.framed.take() else { + return Ok(false); + }; + // A lazy codec (H.265/AV1/VP8/VP9) given CAPS but no frame never created its track, so there is + // nothing to flush and finish() would error "not initialized". track() is Ok only once a track + // exists; a real finish error on an initialized one still surfaces. + if framed.track().is_ok() { + framed.finish()?; } + Ok(true) } } @@ -1044,6 +1073,52 @@ mod tests { assert!(set.eos("video", 0).unwrap()); } + // A pad release is reconfiguration, not EOS: releasing the still-active pad must not complete the + // element, even though the remaining pad has already ended (its remainder reads all-ended). Guards + // against a release posting a spurious element EOS. + #[test] + fn releasing_active_pad_does_not_complete_the_element() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("a", 0); + set.add_pad("b", 0); + let done = handle_data_msg( + &mut set, + DataMsg::Eos { + pad: "a".into(), + generation: 0, + }, + ) + .unwrap(); + assert!(!done, "one of two members ended, not complete"); + let done = handle_data_msg( + &mut set, + DataMsg::DropPad { + pad: "b".into(), + generation: 0, + }, + ) + .unwrap(); + assert!(!done, "releasing the active pad must not complete the element"); + } + + // The normal completion path still fires through the dispatch: the last member's EOS completes it. + #[test] + fn last_eos_completes_the_element_through_dispatch() { + gst::init().unwrap(); + let mut set = pad_set(); + set.add_pad("a", 0); + let done = handle_data_msg( + &mut set, + DataMsg::Eos { + pad: "a".into(), + generation: 0, + }, + ) + .unwrap(); + assert!(done, "the last member's EOS completes the element"); + } + // The element completes only once every member has ended; a still-open member holds it open. #[test] fn element_completes_only_after_all_members_eos() { From 50c6f5df29108a0a5c73575438c352b5a66ac6af Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Wed, 17 Jun 2026 12:12:17 -0600 Subject: [PATCH 12/13] moq-gst: order Cargo.toml dep tables for cargo sort The new [dev-dependencies] section landed before [build-dependencies], but cargo sort expects dependencies -> build-dependencies -> dev-dependencies, failing `just ci`. Swap the two so the lint passes. Co-Authored-By: Claude Opus 4.8 --- rs/moq-gst/Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rs/moq-gst/Cargo.toml b/rs/moq-gst/Cargo.toml index dbaf5dbfb..90c21df40 100644 --- a/rs/moq-gst/Cargo.toml +++ b/rs/moq-gst/Cargo.toml @@ -29,8 +29,8 @@ tracing = "0.1" tracing-subscriber = "0.3" url = "2" -[dev-dependencies] -gst = { package = "gstreamer", version = "0.23" } - [build-dependencies] gst-plugin-version-helper = "0.8" + +[dev-dependencies] +gst = { package = "gstreamer", version = "0.23" } From f570b2f1f25d2236bed621fc2a7937773e078a77 Mon Sep 17 00:00:00 2001 From: Ariel Molina Date: Mon, 22 Jun 2026 13:12:12 -0600 Subject: [PATCH 13/13] moq-gst: make moqsink handoff non-blocking The session handoff used a bounded mpsc channel for backpressure, so a send blocked on a full channel could not be cut by an out-of-band FLUSH_START and could wedge a seek/teardown. But moq-net's producer is a synchronous RAM cache (it appends and evicts, never blocking on the network), so the sink never needs to apply backpressure at the GStreamer boundary. Make the handoff unbounded: chain accepts each buffer toward the worker and returns without blocking. With no blocked send the flush-cancellation apparatus is unnecessary and removed: the bounded channel, send_or_flush/SendOutcome, the per-pad flush watches that cut a blocked send, and the AddPad/DropPad blocking sends. FlushSignal and the per-pad generations stay; FlushSignal now carries only the timeline re-anchor (FLUSH re-anchors a pad's timeline, kept intact). Net -369 lines. Caveat: the handoff is unbounded, so buffers accumulate transiently in it before connect completes and under worker lag, an unbounded buffer ahead of the cache. Building the producers up front and writing directly would remove it; left as a follow-up. Co-Authored-By: Claude Opus 4.8 --- rs/moq-gst/src/sink/imp.rs | 183 +++------------------ rs/moq-gst/src/sink/session.rs | 287 +++------------------------------ 2 files changed, 51 insertions(+), 419 deletions(-) diff --git a/rs/moq-gst/src/sink/imp.rs b/rs/moq-gst/src/sink/imp.rs index f7cb0972b..3d650c183 100644 --- a/rs/moq-gst/src/sink/imp.rs +++ b/rs/moq-gst/src/sink/imp.rs @@ -9,11 +9,8 @@ use bytes::Bytes; use gst::glib; use gst::prelude::*; use gst::subclass::prelude::*; -use tokio::sync::watch; -use super::session::{ - caps_supported, send_or_flush, DataMsg, FlushSignal, ResolvedSettings, SendOutcome, SessionHandle, Status, CAT, -}; +use super::session::{caps_supported, DataMsg, FlushSignal, ResolvedSettings, SessionHandle, Status, CAT}; /// Reject a frame past the MoQ frame limit (moq-net's MAX_FRAME_SIZE, 16 MiB): it could not be /// consumed anyway, and copying it would let hostile input drive an unbounded allocation. @@ -51,10 +48,6 @@ pub struct MoqSink { // worker discards the previous incarnation's in-flight messages. next_generation: AtomicU64, pad_generations: Mutex>, - // Per-pad FLUSH gate: FLUSH_START flips it true to cut a blocked send (the chain holds a cloned - // receiver); FLUSH_STOP flips it false. Per-pad so flushing one input never cancels another's send. - // Keyed with the generation so a stale FLUSH from a previous incarnation cannot flip the live pad. - pad_flush: Mutex)>>, } #[glib::object_subclass] @@ -194,27 +187,16 @@ impl ElementImpl for MoqSink { // Fixed per pad incarnation and captured here, so a buffer in flight from a released pad never // reads a successor's generation. let generation = self.next_generation.fetch_add(1, Ordering::Relaxed); - // One flush watch per pad: the sender lives in `pad_flush` (toggled by FLUSH_START/STOP), each - // pad function carries its own cloned receiver to cancel a blocked send. - let (flush_tx, flush_rx) = watch::channel(false); - let chain_flush = flush_rx.clone(); - let event_flush = flush_rx; let pad_builder = gst::Pad::builder_from_template(templ) .chain_function(move |pad, parent, buffer| { - let mut flush = chain_flush.clone(); MoqSink::catch_panic_pad_function( parent, || Err(gst::FlowError::Error), - |sink| sink.forward_buffer(pad, generation, &mut flush, buffer), + |sink| sink.forward_buffer(pad, generation, buffer), ) }) .event_function(move |pad, parent, event| { - let mut flush = event_flush.clone(); - MoqSink::catch_panic_pad_function( - parent, - || false, - |sink| sink.handle_event(pad, generation, &mut flush, event), - ) + MoqSink::catch_panic_pad_function(parent, || false, |sink| sink.handle_event(pad, generation, event)) }); let pad = match name { @@ -234,11 +216,6 @@ impl ElementImpl for MoqSink { // is worse than the bug, whose trigger (same-name concurrent request_new_pad) apps do not produce. let name = pad.name().to_string(); let prev_gen = self.pad_generations.lock().unwrap().insert(name.clone(), generation); - let prev_flush = self - .pad_flush - .lock() - .unwrap() - .insert(name.clone(), (generation, flush_tx)); if self.obj().add_pad(&pad).is_err() { // Roll back, but only if this attempt still owns the entry. A concurrent same-name request @@ -254,15 +231,6 @@ impl ElementImpl for MoqSink { }; } } - { - let mut flushes = self.pad_flush.lock().unwrap(); - if flushes.get(name.as_str()).map(|(g, _)| *g) == Some(generation) { - match prev_flush { - Some(f) => flushes.insert(name.clone(), f), - None => flushes.remove(&name), - }; - } - } return None; } @@ -270,7 +238,7 @@ impl ElementImpl for MoqSink { // the worker ahead of this AddPad. let sender = self.session.lock().unwrap().as_ref().map(SessionHandle::sender); if let Some(sender) = sender { - let _ = sender.blocking_send(DataMsg::AddPad { pad: name, generation }); + let _ = sender.send(DataMsg::AddPad { pad: name, generation }); } Some(pad) } @@ -278,18 +246,13 @@ impl ElementImpl for MoqSink { fn release_pad(&self, pad: &gst::Pad) { let name = pad.name().to_string(); let generation = self.pad_generations.lock().unwrap().remove(&name); - // Dropping the watch sender wakes any send still blocked on this pad (changed() errors -> Flushing). - self.pad_flush.lock().unwrap().remove(&name); - // Drop the session guard before blocking_send: holding it across a full-channel block deadlocks stop_session. + // Drop the session guard before the send so the unbounded send never runs under the session lock. let sender = { let session = self.session.lock().unwrap(); session.as_ref().map(SessionHandle::sender) }; if let (Some(sender), Some(generation)) = (sender, generation) { - // Uncancellable: a full data channel or an in-progress connect stalls release here, and the - // per-pad flush watch (removed just above) cannot cut it. Known control-path-blocking limit, - // shared with the AddPad send; out of scope for FLUSH. - let _ = sender.blocking_send(DataMsg::DropPad { pad: name, generation }); + let _ = sender.send(DataMsg::DropPad { pad: name, generation }); } let _ = self.obj().remove_pad(pad); } @@ -328,8 +291,6 @@ impl MoqSink { .collect() }; let handle = SessionHandle::start(settings, self.status.clone(), self.obj().downgrade(), seed); - // Clear any watch left armed by a previous incarnation before the new session is visible. - reset_pad_flushes(&self.pad_flush.lock().unwrap()); *self.session.lock().unwrap() = Some(handle); Ok(()) } @@ -340,12 +301,12 @@ impl MoqSink { } } - /// Clone the sender, release the lock, then blocking-send: never apply backpressure under the session lock. + /// Clone the sender, release the lock, then send: never hold the session lock across the (non-blocking, + /// unbounded) send. fn forward_buffer( &self, pad: &gst::Pad, generation: u64, - flush: &mut watch::Receiver, buffer: gst::Buffer, ) -> Result { // The worker marks a pad failed after rejecting its data; surface that to GStreamer instead of @@ -366,13 +327,6 @@ impl MoqSink { return Err(gst::FlowError::Error); } - // Skip the map + copy if the pad is already flushing; send_or_flush re-checks for a flush that - // starts mid-send, but during a flush repeated buffers would otherwise burn a copy each before - // being dropped. - if *flush.borrow() { - return Err(gst::FlowError::Flushing); - } - let sender = self .session .lock() @@ -391,21 +345,14 @@ impl MoqSink { data, pts, }; - // FLUSH_START on this pad cuts a send blocked on a full channel (returns Flushing), instead of - // stalling the streaming thread until the relay drains. - match send_or_flush(&sender, msg, flush) { - SendOutcome::Sent => Ok(gst::FlowSuccess::Ok), - SendOutcome::Flushed | SendOutcome::Closed => Err(gst::FlowError::Flushing), - } + // The channel is unbounded, so the write never blocks: it hands the buffer to the worker, which + // drains it into the moq producers (the cache). An Err means the worker's receiver is gone + // (session ended) -> Flushing. + sender.send(msg).map_err(|_| gst::FlowError::Flushing)?; + Ok(gst::FlowSuccess::Ok) } - fn handle_event( - &self, - pad: &gst::Pad, - generation: u64, - flush: &mut watch::Receiver, - event: gst::Event, - ) -> bool { + fn handle_event(&self, pad: &gst::Pad, generation: u64, event: gst::Event) -> bool { let sender = self.session.lock().unwrap().as_ref().map(|handle| handle.sender()); match event.view() { @@ -422,9 +369,9 @@ impl MoqSink { generation, caps: caps.to_owned(), }; - match send_or_flush(&sender, msg, flush) { - SendOutcome::Sent => gst::Pad::event_default(pad, Some(&*self.obj()), event), - SendOutcome::Flushed | SendOutcome::Closed => false, + match sender.send(msg) { + Ok(()) => gst::Pad::event_default(pad, Some(&*self.obj()), event), + Err(_) => false, } } gst::EventView::Segment(segment) => { @@ -434,9 +381,9 @@ impl MoqSink { generation, segment: segment.segment().to_owned(), }; - match send_or_flush(&sender, msg, flush) { - SendOutcome::Sent => gst::Pad::event_default(pad, Some(&*self.obj()), event), - SendOutcome::Flushed | SendOutcome::Closed => false, + match sender.send(msg) { + Ok(()) => gst::Pad::event_default(pad, Some(&*self.obj()), event), + Err(_) => false, } } gst::EventView::Eos(_) => { @@ -445,12 +392,12 @@ impl MoqSink { pad: pad.name().to_string(), generation, }; - matches!(send_or_flush(&sender, msg, flush), SendOutcome::Sent) + sender.send(msg).is_ok() } - // FLUSH_START arrives out of band on the flushing thread: flip this pad's watch to cut a send - // blocked in the chain, and tell the worker to re-anchor the timeline. Never blocks. + // FLUSH_START arrives out of band on the flushing thread: tell the worker to re-anchor the + // pad's timeline. The data channel is unbounded, so there is no blocked send to cut, and + // FLUSH_STOP needs no special handling (it propagates via the default arm below). gst::EventView::FlushStart(_) => { - toggle_pad_flush(&self.pad_flush.lock().unwrap(), pad.name().as_str(), generation, true); if let Some(handle) = self.session.lock().unwrap().as_ref() { let _ = handle.flush_sender().send(FlushSignal { pad: pad.name().to_string(), @@ -459,88 +406,16 @@ impl MoqSink { } gst::Pad::event_default(pad, Some(&*self.obj()), event) } - // FLUSH_STOP clears the gate so the chain resumes; the trailing SEGMENT re-anchors the worker. - gst::EventView::FlushStop(_) => { - toggle_pad_flush(&self.pad_flush.lock().unwrap(), pad.name().as_str(), generation, false); - gst::Pad::event_default(pad, Some(&*self.obj()), event) - } _ => gst::Pad::event_default(pad, Some(&*self.obj()), event), } } } -// Flip a pad's FLUSH watch only when the generation matches, so a stale FLUSH from a previous -// incarnation cannot cancel the live pad's sends. Mirrors the worker's generation discipline. -fn toggle_pad_flush(flush: &HashMap)>, name: &str, generation: u64, value: bool) { - if let Some((gen, tx)) = flush.get(name) { - if *gen == generation { - let _ = tx.send(value); - } - } -} - -// Clear every pad's FLUSH watch at session start. The watch is keyed to the pad's lifetime, not the -// session's, so a FLUSH_START whose FLUSH_STOP never arrived before a restart would otherwise keep that -// pad muted across the new session. `false` is always safe here; a later live FLUSH_START re-arms it. -fn reset_pad_flushes(flush: &HashMap)>) { - for (_, tx) in flush.values() { - let _ = tx.send(false); - } -} - #[cfg(test)] mod tests { - use std::collections::HashMap; - - use tokio::sync::watch; - - use super::{reset_pad_flushes, toggle_pad_flush}; - - // A FLUSH_START left unpaired by a previous incarnation (stopped before its FLUSH_STOP) leaves the - // pad's watch armed; the watch outlives the session, so a new session must clear it or the pad stays - // muted across the restart. Exercised over a real pad registered by request_new_pad. - #[test] - fn session_start_clears_a_pads_stuck_flush_watch() { - use gst::prelude::*; - use gst::subclass::prelude::*; - gst::init().unwrap(); - - let obj = gst::glib::Object::new::(); - let imp = obj.imp(); - let templ = obj.pad_template("sink_%u").expect("sink template"); - imp.request_new_pad(&templ, Some("sink_0"), None) - .expect("request sink_0"); - - // Arm the watch as an unpaired FLUSH_START would, then take an independent receiver to observe it. - let rx = { - let map = imp.pad_flush.lock().unwrap(); - let (_, tx) = map.get("sink_0").expect("pad is registered"); - tx.send(true).unwrap(); - tx.subscribe() - }; - assert!(*rx.borrow(), "watch is armed before session start"); - - // The reset start_session runs must clear it. - reset_pad_flushes(&imp.pad_flush.lock().unwrap()); - assert!(!*rx.borrow(), "session start cleared the stuck watch"); - } - - #[test] - fn pad_flush_toggle_ignores_stale_generation() { - let mut map = HashMap::new(); - let (tx, rx) = watch::channel(false); - map.insert("video".to_string(), (1u64, tx)); - // A stale generation must not flip the live pad's watch. - toggle_pad_flush(&map, "video", 0, true); - assert!(!*rx.borrow(), "stale-generation flush is ignored"); - // The current generation flips it. - toggle_pad_flush(&map, "video", 1, true); - assert!(*rx.borrow(), "current-generation flush flips the watch"); - } - // A failed pad add must leave membership untouched. request_new_pad adds the pad before announcing // AddPad or mutating the maps, so a duplicate name (or a concurrent request that lost the race) that - // fails add_pad does not corrupt the live pad: its generation and flush sender survive. Announcing + // fails add_pad does not corrupt the live pad: its generation survives. Announcing // for a name already held would otherwise make the worker finalize the live pad's producer and // overwrite its generation. Exercised via two direct request_new_pad calls (the concurrent path). #[test] @@ -559,17 +434,11 @@ mod tests { assert!(p1.is_none(), "duplicate name fails to add"); let gen = imp.pad_generations.lock().unwrap().get("sink_0").copied(); - let flush_gen = imp.pad_flush.lock().unwrap().get("sink_0").map(|(g, _)| *g); - // After the failed duplicate the maps must still describe the LIVE pad (generation 0). + // After the failed duplicate the map must still describe the LIVE pad (generation 0). assert_eq!( gen, Some(0), "live pad's generation must survive a failed duplicate (got {gen:?})" ); - assert_eq!( - flush_gen, - Some(0), - "live pad's flush sender must survive a failed duplicate (got {flush_gen:?})" - ); } } diff --git a/rs/moq-gst/src/sink/session.rs b/rs/moq-gst/src/sink/session.rs index 23f03bad2..2249243e1 100644 --- a/rs/moq-gst/src/sink/session.rs +++ b/rs/moq-gst/src/sink/session.rs @@ -25,9 +25,6 @@ static RUNTIME: LazyLock = LazyLock::new(|| { pub static CAT: LazyLock = LazyLock::new(|| gst::DebugCategory::new("moq-sink", gst::DebugColorFlags::empty(), Some("MoQ Sink Element"))); -/// Handoff, not a buffer: a full channel must block the streaming thread, not grow. -const DATA_CHANNEL_BOUND: usize = 8; - /// Shared observable state, read by the element's getters and the chain without touching the worker /// task. One `Mutex` covers the session generation plus every gated field, so "check the live /// generation, then write" is one indivisible step: a stale session (one whose task lingers after a @@ -132,9 +129,9 @@ impl Status { } } -/// Ordered; shutdown is deliberately elsewhere (cancellation channel) so it can cut a blocked send. -/// Every message carries its pad's generation so a pad recreated with the same name discards the -/// previous incarnation's in-flight messages. +/// Ordered; shutdown rides a separate watch channel so it preempts queued data instead of waiting +/// behind it. Every message carries its pad's generation so a pad recreated with the same name +/// discards the previous incarnation's in-flight messages. pub enum DataMsg { AddPad { pad: String, @@ -166,8 +163,9 @@ pub enum DataMsg { }, } -/// Out-of-band FLUSH notification to the worker. Carried on its own unbounded channel (not the bounded -/// `DataMsg` FIFO) so FLUSH_START re-anchors promptly and never blocks behind queued buffers. +/// Out-of-band FLUSH notification to the worker. Carried on its own channel (separate from the `DataMsg` +/// stream) so FLUSH_START re-anchors the pad's timeline promptly, and before queued data when both +/// channels are ready (no cross-channel happens-before; see `run_loop`'s `biased` note). pub struct FlushSignal { pub pad: String, pub generation: u64, @@ -181,16 +179,15 @@ pub struct ResolvedSettings { /// The worker's inbound channels, bundled so `run_session` stays under the argument limit. struct Inbound { - data: mpsc::Receiver, + data: mpsc::UnboundedReceiver, flush: mpsc::UnboundedReceiver, shutdown: watch::Receiver, } pub struct SessionHandle { - data: mpsc::Sender, - /// Out-of-band control path for FLUSH: unbounded so FLUSH_START never blocks behind a full data - /// channel (the very condition a flush must break), and a discrete pad-targeted event a `watch` - /// would collapse. + data: mpsc::UnboundedSender, + /// Out-of-band control path for FLUSH: a separate channel so FLUSH_START re-anchors promptly and wins + /// readiness ties against queued `data`, and a discrete pad-targeted event a `watch` would collapse. flush: mpsc::UnboundedSender, shutdown: watch::Sender, join: tokio::task::JoinHandle<()>, @@ -206,7 +203,12 @@ impl SessionHandle { // Claim the generation synchronously so a previous session's late exit-reset (running on its own // task) sees a newer generation and becomes a no-op instead of clobbering this session's status. let generation = status.begin_session(); - let (data_tx, data_rx) = mpsc::channel(DATA_CHANNEL_BOUND); + // Unbounded so the chain never blocks (cache-ingress, not a backpressure boundary): the worker + // drains this into the moq producers, whose cache owns retention/eviction. Caveat: until connect + // completes the worker is not yet draining (run_session connects before run_loop), and under worker + // lag buffers transiently accumulate here, so this handoff is a residual unbounded buffer ahead of + // the cache. A follow-up that builds the producers up front and writes to them directly removes it. + let (data_tx, data_rx) = mpsc::unbounded_channel(); let (flush_tx, flush_rx) = mpsc::unbounded_channel(); let (shutdown_tx, shutdown_rx) = watch::channel(false); @@ -245,19 +247,20 @@ impl SessionHandle { } } - /// Cloned out so the element blocking-sends without holding the session lock (else a full channel deadlocks stop). - pub fn sender(&self) -> mpsc::Sender { + /// Cloned out so the element sends without holding the session lock. The channel is unbounded, so the + /// send never blocks: moqsink is a cache-ingress, not a backpressure boundary. + pub fn sender(&self) -> mpsc::UnboundedSender { self.data.clone() } - /// Out-of-band FLUSH signal to the worker; unbounded, so it is safe to call from the event thread - /// even while the data channel is full. + /// Out-of-band FLUSH signal to the worker; on its own channel, so it is safe to call from the event + /// thread because it never waits on the data stream. pub fn flush_sender(&self) -> mpsc::UnboundedSender { self.flush.clone() } pub fn stop(self) { - // Cancel first so a send blocked on a full channel wakes via the dropped receiver; reap off-thread. + // Signal shutdown; the worker leaves its loop on the watch change. Reap off-thread. let _ = self.shutdown.send(true); RUNTIME.spawn(async move { if let Err(err) = self.join.await { @@ -267,60 +270,6 @@ impl SessionHandle { } } -/// Outcome of a flush-cancellable send. Both `Flushed` and `Closed` map to `FlowError::Flushing` for -/// the caller; they are split only so the reason stays legible. -pub(super) enum SendOutcome { - Sent, - /// FLUSH_START flipped this pad's flush watch (or the watch sender was dropped on release). - Flushed, - /// The worker's receiver is gone (the session ended). - Closed, -} - -/// Send `msg` on the bounded data channel, aborting promptly if this pad starts flushing. This is the -/// cancellable replacement for `blocking_send`: the chain runs off the runtime, so `block_on` is safe, -/// and FLUSH_START (signalled on `flush`) can cut a send blocked on a full channel without tearing the -/// session down. The loop re-arms on a FLUSH_STOP (`false`) that lands mid-block, so a quick flush+stop -/// does not drop a still-valid buffer. -pub(super) fn send_or_flush( - sender: &mpsc::Sender, - msg: DataMsg, - flush: &mut watch::Receiver, -) -> SendOutcome { - if *flush.borrow_and_update() { - return SendOutcome::Flushed; - } - RUNTIME.block_on(async move { - loop { - tokio::select! { - // Biased toward flush: FLUSH_START must win a tie with freed capacity, so a blocked chain is - // cut rather than enqueuing a pre-flush buffer. - biased; - changed = flush.changed() => { - // Sender dropped (release) or now flushing: abort. A `false` change (FLUSH_STOP) falls - // through and re-arms the wait. - if changed.is_err() || *flush.borrow() { - return SendOutcome::Flushed; - } - } - permit = sender.reserve() => match permit { - Ok(permit) => { - // Re-check after winning the permit: a flushing pad drops the buffer. This narrows (does - // not atomically close) the race: a FLUSH_START in the gap before the send below still - // enqueues one buffer, which the worker re-anchor and the next buffer's check absorb. - if *flush.borrow_and_update() { - return SendOutcome::Flushed; - } - permit.send(msg); - return SendOutcome::Sent; - } - Err(_) => return SendOutcome::Closed, - }, - } - } - }) -} - async fn run_session( settings: ResolvedSettings, status: Arc, @@ -428,7 +377,7 @@ enum ExitReason { async fn run_loop( session: moq_net::Session, generation: u64, - data: &mut mpsc::Receiver, + data: &mut mpsc::UnboundedReceiver, flush: &mut mpsc::UnboundedReceiver, shutdown: &mut watch::Receiver, pad_set: &mut PadSet, @@ -445,8 +394,8 @@ async fn run_loop( tokio::select! { // Biased so a ready FLUSH wins a readiness tie and re-anchors the pad before its queued data is // processed. FLUSH_START is out of band (a different thread), so this is not a happens-before - // against the bounded data channel; the pad goes NoSegment and drops until its next valid - // SEGMENT, which absorbs any residual reorder. Shutdown/death still preempt everything. + // against the data channel; the pad goes NoSegment and drops until its next valid SEGMENT, + // which absorbs any residual reorder. Shutdown/death still preempt everything. biased; // Local close: quiet stop, no ERROR. _ = shutdown.changed() => return Ok(ExitReason::Stopped), @@ -969,8 +918,6 @@ fn signed_nanos(running_time: gst::Signed) -> Option { #[cfg(test)] mod tests { - use std::time::Duration; - use super::*; fn pad_set() -> PadSet { @@ -1564,190 +1511,6 @@ mod tests { assert!(pad.framed.is_some()); } - // The tokio property SessionHandle::stop relies on: dropping the receiver (which the worker does - // when run_session returns) wakes a sender parked on the full channel with Err, so a stop never - // deadlocks a chain thread applying backpressure. The element-level path needs a connected session. - #[test] - fn dropped_receiver_wakes_blocked_send() { - let runtime = tokio::runtime::Runtime::new().unwrap(); - let (data_tx, data_rx) = mpsc::channel::(DATA_CHANNEL_BOUND); - let (shutdown_tx, mut shutdown_rx) = watch::channel(false); - - for _ in 0..DATA_CHANNEL_BOUND { - data_tx.blocking_send(0).unwrap(); // fill to capacity - } - - // Hold the receiver without draining (as if busy in a branch body), then return on the watch. - let loop_task = runtime.spawn(async move { - let _held = data_rx; - shutdown_rx.changed().await.ok(); - }); - - let sender = data_tx.clone(); - let blocked = std::thread::spawn(move || sender.blocking_send(1)); - std::thread::sleep(Duration::from_millis(50)); // let the send park - - shutdown_tx.send(true).unwrap(); - runtime.block_on(loop_task).unwrap(); - - assert!( - blocked.join().unwrap().is_err(), - "a send blocked on the full channel must wake with Err when shutdown drops the receiver" - ); - } - - // FLUSH must wake a send blocked on a full channel AND leave the receiver alive. The second half is - // the discriminator: dropped_receiver_wakes_blocked_send proves the shutdown wake (receiver gone), so - // a flush wake that also tore the receiver down would be indistinguishable from it. - #[test] - fn flush_wakes_a_blocked_send_and_keeps_the_receiver() { - let (data_tx, mut data_rx) = mpsc::channel::(DATA_CHANNEL_BOUND); - for _ in 0..DATA_CHANNEL_BOUND { - data_tx - .try_send(DataMsg::AddPad { - pad: "x".into(), - generation: 0, - }) - .unwrap(); - } - let (flush_tx, flush_rx) = watch::channel(false); - - // Rendezvous so the flush is sent only after the worker is running and about to call - // send_or_flush (else the send could see flush=true at its initial check and never block). The - // sleep then covers parking inside reserve(), which has no test seam to observe directly. - let started = Arc::new(std::sync::Barrier::new(2)); - let sender = data_tx.clone(); - let mut rx = flush_rx; - let started_worker = started.clone(); - let blocked = std::thread::spawn(move || { - started_worker.wait(); - send_or_flush( - &sender, - DataMsg::AddPad { - pad: "x".into(), - generation: 1, - }, - &mut rx, - ) - }); - started.wait(); - std::thread::sleep(Duration::from_millis(50)); // let the send park inside block_on - - flush_tx.send(true).unwrap(); - assert!( - matches!(blocked.join().unwrap(), SendOutcome::Flushed), - "flush woke the blocked send" - ); - - // The receiver is still alive (not dropped): it delivers the queued data, and a fresh non-flushing - // send then goes through. A teardown would fail both. - assert!(data_rx.blocking_recv().is_some(), "receiver still delivers"); - let (_gate_tx, mut rx) = watch::channel(false); - assert!( - matches!( - send_or_flush( - &data_tx, - DataMsg::AddPad { - pad: "x".into(), - generation: 2 - }, - &mut rx - ), - SendOutcome::Sent - ), - "the receiver survived the flush and accepts new sends" - ); - } - - // A per-pad watch means flushing pad A never cancels pad B's blocked send. - #[test] - fn flush_only_wakes_the_flushed_pad() { - let (data_tx, mut data_rx) = mpsc::channel::(DATA_CHANNEL_BOUND); - for _ in 0..DATA_CHANNEL_BOUND { - data_tx - .try_send(DataMsg::AddPad { - pad: "fill".into(), - generation: 0, - }) - .unwrap(); - } - let (flush_a_tx, flush_a_rx) = watch::channel(false); - let (_flush_b_tx, flush_b_rx) = watch::channel(false); - - // Both workers must be running before A is flushed, so B is genuinely inside send_or_flush (not - // merely unspawned). The sleep then covers parking inside reserve(). - let started = Arc::new(std::sync::Barrier::new(3)); - let sender_a = data_tx.clone(); - let mut rx_a = flush_a_rx; - let started_a = started.clone(); - let blocked_a = std::thread::spawn(move || { - started_a.wait(); - send_or_flush( - &sender_a, - DataMsg::AddPad { - pad: "a".into(), - generation: 1, - }, - &mut rx_a, - ) - }); - let sender_b = data_tx.clone(); - let mut rx_b = flush_b_rx; - let started_b = started.clone(); - let blocked_b = std::thread::spawn(move || { - started_b.wait(); - send_or_flush( - &sender_b, - DataMsg::AddPad { - pad: "b".into(), - generation: 1, - }, - &mut rx_b, - ) - }); - started.wait(); - std::thread::sleep(Duration::from_millis(50)); - - flush_a_tx.send(true).unwrap(); - assert!( - matches!(blocked_a.join().unwrap(), SendOutcome::Flushed), - "pad A's flush woke A" - ); - - // B was untouched: freeing a slot lets it send normally (Sent, not Flushed). - assert!(data_rx.blocking_recv().is_some()); - assert!( - matches!(blocked_b.join().unwrap(), SendOutcome::Sent), - "pad B's send was not cancelled by A's flush" - ); - } - - // Already-flushing contract: a pad whose watch is set drops the buffer (returns Flushed, enqueues - // nothing) even with capacity. This exercises the initial check in send_or_flush, NOT the select's - // biased arm; the capacity+FLUSH sub-poll tie (pitfall 14) stays structural, not covered here. - #[test] - fn flush_drops_the_buffer_even_with_capacity() { - let (data_tx, mut data_rx) = mpsc::channel::(DATA_CHANNEL_BOUND); // empty: capacity free - let (flush_tx, flush_rx) = watch::channel(false); - flush_tx.send(true).unwrap(); - let mut rx = flush_rx; - assert!( - matches!( - send_or_flush( - &data_tx, - DataMsg::AddPad { - pad: "x".into(), - generation: 0 - }, - &mut rx - ), - SendOutcome::Flushed - ), - "a flushing send returns Flushed" - ); - assert!(data_rx.try_recv().is_err(), "and enqueues nothing"); - } - // Two pads, real PTS via to_running_time_full: the A/V offset survives because running time is shared. #[test] fn two_pads_keep_av_aligned_through_real_segments() {