From e2b69369a58d7cdab20063178f77ec7474b029fe Mon Sep 17 00:00:00 2001 From: alex newman Date: Wed, 3 Jun 2026 10:30:44 -0400 Subject: [PATCH 1/2] feat(cf): cross-env reconcile plan, dry-run (POST /admin/cf/reconcile) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Computes — read-only — what a reconcile WOULD do across the whole CF map, env-labelled, in three buckets: - adopt: live (healthy) CF agent tunnels in the serving env the CP store is missing → fill-only rebuild from CF. - prune: a serving-env agent tunnel that's unclaimed AND not healthy, an unexpected serving-env CNAME, every resource of an env with no live control plane (closed PR), and the whole (unattributed) leak bucket. - refill: hostnames the serving CP expects but CF has no CNAME for. A live foreign env (another CP's, store not held here) is left untouched with a note. A degraded map yields an empty plan + refusal note. Adds tunnel `status`/`created_at` to CfTunnel (populated in both the per-env snapshot and the map) so adopt-vs-prune can tell a live agent from a dead/leaked tunnel; exposes `build_cp_state` for reuse. The endpoint is dry-run ONLY — `?apply=true` is acknowledged but performs no mutations (the guarded, operator-gated apply lands next). Same auth as the other /admin/cf/* surfaces. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cf_map.rs | 5 ++ src/cf_reconcile.rs | 183 ++++++++++++++++++++++++++++++++++++++++++++ src/cf_snapshot.rs | 15 +++- src/cp.rs | 36 +++++++++ src/lib.rs | 1 + 5 files changed, 239 insertions(+), 1 deletion(-) create mode 100644 src/cf_reconcile.rs diff --git a/src/cf_map.rs b/src/cf_map.rs index d1eaadc..772e183 100644 --- a/src/cf_map.rs +++ b/src/cf_map.rs @@ -129,6 +129,11 @@ pub async fn build_map( .get("deleted_at") .and_then(|v| v.as_str()) .map(String::from), + status: t.get("status").and_then(|v| v.as_str()).map(String::from), + created_at: t + .get("created_at") + .and_then(|v| v.as_str()) + .map(String::from), }; tunnel_env.insert(cft.id.clone(), env.clone()); buckets.entry(env).or_default().tunnels.push(cft); diff --git a/src/cf_reconcile.rs b/src/cf_reconcile.rs new file mode 100644 index 0000000..0423e10 --- /dev/null +++ b/src/cf_reconcile.rs @@ -0,0 +1,183 @@ +//! Cross-environment Cloudflare reconcile plan (read-only / dry-run). +//! +//! Given the unified [`crate::cf_map::CfMap`] and the serving CP's view of +//! its own env, compute what a reconcile WOULD do — env-labelled — in +//! three buckets: +//! - **adopt**: live CF agent tunnels in the serving env that the CP +//! store is missing (fill-only recovery — rebuild the store from CF). +//! - **prune**: leaked resources — a dead/unclaimed orphan in the serving +//! env, OR everything belonging to an env with no live control plane +//! (e.g. a closed PR), OR the `(unattributed)` bucket. +//! - **refill**: hostnames the serving CP expects but CF has no CNAME for. +//! +//! This module only *plans*. Execution — with the in-flight-deploy + TTL + +//! zero-connection guards — is a separate, operator-gated step (next PR). 
+ +use std::collections::HashSet; +use std::time::SystemTime; + +use serde::Serialize; + +use crate::cf_map::CfMap; +use crate::cf_snapshot::CpState; + +#[derive(Debug, Clone, Serialize)] +pub struct ReconcilePlan { + pub computed_at: String, + pub serving_env: String, + /// The CF map was partial (a list call failed); the plan is empty and + /// must not be applied. + pub degraded: bool, + pub adopt: Vec<PlanItem>, + pub prune: Vec<PlanItem>, + pub refill: Vec<PlanItem>, + /// Human-readable notes (e.g. live foreign envs intentionally skipped). + pub notes: Vec<String>, +} + +#[derive(Debug, Clone, Serialize)] +pub struct PlanItem { + pub env: String, + pub kind: String, // "tunnel" | "dns" + pub id: String, + pub name: String, + pub reason: String, +} + +fn item(env: &str, kind: &str, id: &str, name: &str, reason: &str) -> PlanItem { + PlanItem { + env: env.into(), + kind: kind.into(), + id: id.into(), + name: name.into(), + reason: reason.into(), + } +} + +/// Compute the dry-run plan. Pure over the map + the serving CP state. +pub fn plan(map: &CfMap, cp: &CpState) -> ReconcilePlan { + let mut out = ReconcilePlan { + computed_at: chrono::DateTime::::from(SystemTime::now()).to_rfc3339(), + serving_env: cp.env_label.clone(), + degraded: map.degraded, + adopt: vec![], + prune: vec![], + refill: vec![], + notes: vec![], + }; + if map.degraded { + out.notes.push( + "CF map is degraded (a list call failed); plan is empty and reconcile must not run" + .into(), + ); + return out; + } + + // Serving CP's ground truth: the tunnel ids it knows + the hostnames it + // expects to exist (agent hostnames + their per-workload labels). 
+ let known_tunnel_ids: HashSet<&str> = cp + .agents + .iter() + .filter(|a| !a.tunnel_id.is_empty()) + .map(|a| a.tunnel_id.as_str()) + .collect(); + let mut expected_hostnames: HashSet<String> = HashSet::new(); + for a in &cp.agents { + expected_hostnames.insert(a.hostname.clone()); + for (label, _) in &a.extras { + expected_hostnames.insert(crate::cf::label_hostname(&a.hostname, label)); + } + } + + for inst in &map.installations { + let serving = inst.env == cp.env_label; + let unattributed = inst.kind == "unattributed"; + + if serving { + // Agent tunnels the CP store doesn't know: adopt if live + // (healthy), prune if dead. Never touch the CP's own `-cp-` + // tunnel or already soft-deleted tunnels. + for t in &inst.tunnels { + if t.name.contains("-cp-") || t.deleted_at.is_some() { + continue; + } + if known_tunnel_ids.contains(t.id.as_str()) { + continue; + } + if t.status.as_deref() == Some("healthy") { + out.adopt.push(item( + &inst.env, + "tunnel", + &t.id, + &t.name, + "live CF agent tunnel not in the CP store — adopt (fill-only)", + )); + } else { + out.prune.push(item( + &inst.env, + "tunnel", + &t.id, + &t.name, + &format!( + "agent tunnel unclaimed by any CP agent and not healthy (status={}) — prune", + t.status.as_deref().unwrap_or("unknown") + ), + )); + } + } + // DNS: unexpected CNAME → prune; expected-but-absent → refill. 
+ let cf_dns_names: HashSet<&str> = inst.dns.iter().map(|d| d.name.as_str()).collect(); + for d in &inst.dns { + if d.name != cp.control_plane_hostname && !expected_hostnames.contains(&d.name) { + out.prune.push(item( + &inst.env, + "dns", + &d.id, + &d.name, + "CNAME not claimed by any CP agent — prune", + )); + } + } + for h in &expected_hostnames { + if h != &cp.control_plane_hostname && !cf_dns_names.contains(h.as_str()) { + out.refill.push(item( + &inst.env, + "dns", + "", + h, + "CP expects this hostname but no CF CNAME exists — refill", + )); + } + } + } else if unattributed || !inst.has_live_cp { + // A whole env with no live control plane (e.g. a closed PR), or + // the unattributed leak bucket → every live resource is prunable. + let why = if unattributed { + "resource has no parseable env / its target tunnel is gone — prune" + } else { + "env has no live control plane (torn-down install) — prune" + }; + for t in &inst.tunnels { + if t.deleted_at.is_some() { + continue; + } + out.prune + .push(item(&inst.env, "tunnel", &t.id, &t.name, why)); + } + for d in &inst.dns { + out.prune.push(item(&inst.env, "dns", &d.id, &d.name, why)); + } + } else { + // Another live env whose CP store this CP doesn't hold — only + // its own CP can safely judge its agent set. + out.notes.push(format!( + "{}: live foreign env ({} tunnels, {} dns) left untouched — reconcile from its own CP", + inst.env, + inst.tunnels.len(), + inst.dns.len() + )); + } + } + + out +} diff --git a/src/cf_snapshot.rs b/src/cf_snapshot.rs index 75789e7..90b0d59 100644 --- a/src/cf_snapshot.rs +++ b/src/cf_snapshot.rs @@ -68,6 +68,12 @@ pub struct CfTunnel { pub id: String, pub name: String, pub deleted_at: Option<String>, + /// Cloudflare tunnel status: `healthy` (≥1 live connection), else + /// `degraded` / `down` / `inactive`. Used to tell a live agent (adopt) + /// from a dead/leaked tunnel (prune). + pub status: Option<String>, + /// RFC3339 creation time, for the prune age (TTL) guard. 
+ pub created_at: Option<String>, } #[derive(Debug, Clone, Serialize)] @@ -161,7 +167,7 @@ pub async fn snapshot( } } -async fn build_cp_state( +pub(crate) async fn build_cp_state( env_label: &str, cp_hostname: &str, store: &Arc>>, @@ -243,6 +249,11 @@ async fn build_cf_state( .get("deleted_at") .and_then(|v| v.as_str()) .map(String::from), + status: t.get("status").and_then(|v| v.as_str()).map(String::from), + created_at: t + .get("created_at") + .and_then(|v| v.as_str()) + .map(String::from), }) }) .collect(); @@ -568,6 +579,8 @@ mod tests { id: id.into(), name: name.into(), deleted_at: None, + status: Some("healthy".into()), + created_at: None, } } diff --git a/src/cp.rs b/src/cp.rs index db354ca..1a070f5 100644 --- a/src/cp.rs +++ b/src/cp.rs @@ -350,6 +350,7 @@ pub async fn run() -> Result<()> { .route("/api/fleet", get(fleet_fragment)) .route("/admin/cf/snapshot", get(cf_snapshot_handler)) .route("/admin/cf/map", get(cf_map_handler)) + .route("/admin/cf/reconcile", post(cf_reconcile_handler)) .route("/api/v1/admin/export", get(export_state)) .route("/admin/enroll", get(enroll_page)) .with_state(state); @@ -1332,6 +1333,41 @@ async fn cf_map_handler( Ok(Json(map)) } +#[derive(Debug, Deserialize)] +struct ReconcileParams { + #[serde(default)] + apply: bool, +} + +/// POST /admin/cf/reconcile — cross-env reconcile. Returns the dry-run +/// plan (adopt / prune / refill, each env-labelled) computed over the +/// unified CF map + the serving CP's store. This build is **dry-run only**: +/// `?apply=true` is acknowledged but performs NO mutations — the guarded, +/// operator-gated apply path lands in a follow-up. Same auth as the other +/// `/admin/cf/*` surfaces. 
+async fn cf_reconcile_handler( + State(s): State, + axum::extract::ConnectInfo(peer): axum::extract::ConnectInfo, + axum::extract::Query(params): axum::extract::Query<ReconcileParams>, + headers: axum::http::HeaderMap, +) -> Result<Json<serde_json::Value>> { + if !agents_auth_ok(&s, peer, &headers).await { + return Err(Error::Unauthorized); + } + let http = cf::http_client(); + let map = crate::cf_map::build_map(&http, &s.cfg.cf, s.cfg.common.env.label(), &s.store).await; + let cp = + crate::cf_snapshot::build_cp_state(s.cfg.common.env.label(), &s.cfg.hostname, &s.store) + .await; + let plan = crate::cf_reconcile::plan(&map, &cp); + Ok(Json(serde_json::json!({ + "dry_run": true, + "applied": false, + "apply_requested": params.apply, + "plan": plan, + }))) +} + /// Accept the request if the caller is on the loopback interface /// (same-VM trust — any CP-VM workload / dd-agent-proxy) or presents a valid /// bearer that verifies as either a GitHub Actions OIDC token for diff --git a/src/lib.rs b/src/lib.rs index f8f032e..e622107 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ pub mod agent; pub mod auth; pub mod cf; pub mod cf_map; +pub mod cf_reconcile; pub mod cf_snapshot; pub mod collector; pub mod config; From 1bd7370e3cb04d15e5db2994045c14711547e138 Mon Sep 17 00:00:00 2001 From: alex newman Date: Wed, 3 Jun 2026 10:49:59 -0400 Subject: [PATCH 2/2] fix(cf): prune DNS by target tunnel, never by hostname guess MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dry-run plan flagged 3 live pr-N agent CNAMEs (its -api-/oracle/shell vanity hosts) for prune: 'expected hostnames' was derived from the CP store's extras, but the CP creates more CNAMEs than it records there (and agent-api uses a different name format), so live records looked orphaned — a delete-a-healthy-agent bug at apply time. 
Rewrite as two passes: decide tunnel actions first (recording pruned tunnel ids), then prune a CNAME only if its target tunnel is gone (unattributed) or is itself being pruned. A CNAME pointing at a live/kept tunnel is always kept, regardless of whether we can re-derive its name. Refill is limited to the reliably-known primary agent hostname. --- src/cf_reconcile.rs | 143 +++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 63 deletions(-) diff --git a/src/cf_reconcile.rs b/src/cf_reconcile.rs index 0423e10..ce72110 100644 --- a/src/cf_reconcile.rs +++ b/src/cf_reconcile.rs @@ -73,36 +73,29 @@ pub fn plan(map: &CfMap, cp: &CpState) -> ReconcilePlan { return out; } - // Serving CP's ground truth: the tunnel ids it knows + the hostnames it - // expects to exist (agent hostnames + their per-workload labels). + // The tunnel ids the serving CP knows it owns. let known_tunnel_ids: HashSet<&str> = cp .agents .iter() .filter(|a| !a.tunnel_id.is_empty()) .map(|a| a.tunnel_id.as_str()) .collect(); - let mut expected_hostnames: HashSet<String> = HashSet::new(); - for a in &cp.agents { - expected_hostnames.insert(a.hostname.clone()); - for (label, _) in &a.extras { - expected_hostnames.insert(crate::cf::label_hostname(&a.hostname, label)); - } - } + // Pass 1 — tunnels. Decide adopt/prune/keep and record which tunnels + // are being pruned, so DNS can follow its tunnel (pass 2). + let mut prune_tunnel_ids: HashSet<String> = HashSet::new(); for inst in &map.installations { let serving = inst.env == cp.env_label; let unattributed = inst.kind == "unattributed"; + let leaked_env = unattributed || !inst.has_live_cp; - if serving { - // Agent tunnels the CP store doesn't know: adopt if live - // (healthy), prune if dead. Never touch the CP's own `-cp-` - // tunnel or already soft-deleted tunnels. 
- for t in &inst.tunnels { - if t.name.contains("-cp-") || t.deleted_at.is_some() { - continue; - } - if known_tunnel_ids.contains(t.id.as_str()) { - continue; + for t in &inst.tunnels { + if t.deleted_at.is_some() { + continue; // already soft-deleted + } + if serving { + if t.name.contains("-cp-") || known_tunnel_ids.contains(t.id.as_str()) { + continue; // the CP's own tunnel, or a claimed agent → keep } if t.status.as_deref() == Some("healthy") { out.adopt.push(item( @@ -113,63 +106,31 @@ pub fn plan(map: &CfMap, cp: &CpState) -> ReconcilePlan { "live CF agent tunnel not in the CP store — adopt (fill-only)", )); } else { + prune_tunnel_ids.insert(t.id.clone()); out.prune.push(item( &inst.env, "tunnel", &t.id, &t.name, &format!( - "agent tunnel unclaimed by any CP agent and not healthy (status={}) — prune", + "serving-env agent tunnel unclaimed by any CP agent and not healthy (status={}) — prune", t.status.as_deref().unwrap_or("unknown") ), )); } - } - // DNS: unexpected CNAME → prune; expected-but-absent → refill. - let cf_dns_names: HashSet<&str> = inst.dns.iter().map(|d| d.name.as_str()).collect(); - for d in &inst.dns { - if d.name != cp.control_plane_hostname && !expected_hostnames.contains(&d.name) { - out.prune.push(item( - &inst.env, - "dns", - &d.id, - &d.name, - "CNAME not claimed by any CP agent — prune", - )); - } - } - for h in &expected_hostnames { - if h != &cp.control_plane_hostname && !cf_dns_names.contains(h.as_str()) { - out.refill.push(item( - &inst.env, - "dns", - "", - h, - "CP expects this hostname but no CF CNAME exists — refill", - )); - } - } - } else if unattributed || !inst.has_live_cp { - // A whole env with no live control plane (e.g. a closed PR), or - // the unattributed leak bucket → every live resource is prunable. 
- let why = if unattributed { - "resource has no parseable env / its target tunnel is gone — prune" - } else { - "env has no live control plane (torn-down install) — prune" - }; - for t in &inst.tunnels { - if t.deleted_at.is_some() { - continue; - } + } else if leaked_env { + prune_tunnel_ids.insert(t.id.clone()); + let why = if unattributed { + "tunnel name has no parseable env — prune" + } else { + "tunnel for an env with no live control plane (torn-down install) — prune" + }; out.prune .push(item(&inst.env, "tunnel", &t.id, &t.name, why)); } - for d in &inst.dns { - out.prune.push(item(&inst.env, "dns", &d.id, &d.name, why)); - } - } else { - // Another live env whose CP store this CP doesn't hold — only - // its own CP can safely judge its agent set. + // live foreign env → leave its tunnels alone (noted below) + } + if !serving && !leaked_env { out.notes.push(format!( "{}: live foreign env ({} tunnels, {} dns) left untouched — reconcile from its own CP", inst.env, @@ -179,5 +140,61 @@ pub fn plan(map: &CfMap, cp: &CpState) -> ReconcilePlan { } } + // Pass 2 — DNS, keyed purely on the tunnel it targets. We never guess + // by hostname: the CP creates more CNAMEs (agent-api, oracle, shell) + // than it records in its store, so a name-based "orphan" check would + // falsely prune live records. A CNAME is prunable only if its target + // tunnel is gone (unattributed bucket) or is itself being pruned. 
+ for inst in &map.installations { + let unattributed = inst.kind == "unattributed"; + for d in &inst.dns { + let targets_pruned_tunnel = d + .tunnel_id_ref + .as_deref() + .map(|t| prune_tunnel_ids.contains(t)) + .unwrap_or(false); + if unattributed { + out.prune.push(item( + &inst.env, + "dns", + &d.id, + &d.name, + "CNAME targets a tunnel that no longer exists — prune", + )); + } else if targets_pruned_tunnel { + out.prune.push(item( + &inst.env, + "dns", + &d.id, + &d.name, + "CNAME targets a tunnel being pruned — prune", + )); + } + } + } + + // Refill — only the reliably-known primary agent hostname. Extras + // (agent-api / oracle / shell) aren't tracked in the store, so we never + // synthesize them; a missing primary CNAME means the agent is + // unreachable and is safe to flag. + if let Some(serving_inst) = map.installations.iter().find(|i| i.env == cp.env_label) { + let cf_dns_names: HashSet<&str> = + serving_inst.dns.iter().map(|d| d.name.as_str()).collect(); + for a in &cp.agents { + if !a.tunnel_id.is_empty() + && a.hostname != cp.control_plane_hostname + && !cf_dns_names.contains(a.hostname.as_str()) + { + out.refill.push(item( + &cp.env_label, + "dns", + "", + &a.hostname, + "CP knows this agent but its primary CNAME is missing in CF — refill", + )); + } + } + } + out }