diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 5f7599c..881201d 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -6,7 +6,8 @@ use bytes::Bytes; use std::sync::Arc; use crate::auth::{caller_authorized_to_push, AuthenticatedDid}; -use chrono::Utc; +use crate::db::RepoRecord; +use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -227,8 +228,8 @@ pub async fn list_repos( } let repos = state.db.list_all_repos_with_stars().await?; - let filtered: Vec<_> = repos - .iter() + let filtered: Vec<(RepoRecord, i64)> = repos + .into_iter() .filter(|(r, _)| { if let Some(owner) = &query.owner { crate::api::did_matches(owner.as_str(), &r.owner_did) @@ -237,10 +238,11 @@ pub async fn list_repos( } }) .collect(); - let total = filtered.len() as i64; - let resp: Vec<_> = filtered + let deduped = dedupe_canonical_repos(filtered); + let total = deduped.len() as i64; + let resp: Vec<_> = deduped .into_iter() - .map(|(r, stars)| to_response(r, &state, *stars)) + .map(|(r, stars)| to_response(&r, &state, stars)) .collect(); let mut response = Json(resp).into_response(); response.headers_mut().insert( @@ -1121,7 +1123,7 @@ pub async fn list_refs( pub async fn list_federated_repos( State(state): State, ) -> Result> { - let local_repos = state.db.list_all_repos_with_stars().await?; + let local_repos = dedupe_canonical_repos(state.db.list_all_repos_with_stars().await?); let local_node_url = state .config .public_url @@ -1379,11 +1381,100 @@ fn to_response(record: &crate::db::RepoRecord, state: &AppState, star_count: i64 } } +/// Collapse short-owner mirror rows and canonical `did:key:` rows that point at the +/// same logical repo into a single entry, so profile/list surfaces don't render the +/// same repo twice (issue #6). +/// +/// Rows are grouped by `(normalized owner, name)`, where the normalized owner is the +/// key segment after the last `:` (so `did:key:z6Mk…` and the bare `z6Mk…` mirror row +/// collapse together). Within a group the canonical row wins: a non-mirror row is +/// preferred over a mirror, ties broken by earliest `created_at` then `id`. A mirror +/// row is identified structurally by its slash-form `id` (`{owner_short}/{name}`, +/// written only by `Db::upsert_mirror_repo`), not by its user-settable description. +/// The survivor inherits the group's most recent `updated_at` so a gossip push that +/// only touches the mirror row still floats the repo to the top. +/// +/// This mirrors the SQL dedup applied on the paged/unfiltered paths via +/// `Db::DEDUP_CTE`; the marker and the `id` tiebreak must stay in sync with it. +fn dedupe_canonical_repos(rows: Vec<(RepoRecord, i64)>) -> Vec<(RepoRecord, i64)> { + use std::collections::HashMap; + + // Mirror rows carry a slash-form id, written only by Db::upsert_mirror_repo; + // canonical rows use a UUID id (no slash). Structural, not user-settable. + fn is_mirror(r: &RepoRecord) -> bool { + r.id.contains('/') + } + + // Strictly more canonical: non-mirror beats mirror; on equal mirror-status the + // earlier created_at wins, and a full tie falls back to id ASC so the survivor + // matches SQL's DISTINCT ON (… created_at ASC, id ASC). + fn outranks(candidate: &RepoRecord, current: &RepoRecord) -> bool { + match (is_mirror(candidate), is_mirror(current)) { + (false, true) => true, + (true, false) => false, + _ => (candidate.created_at, &candidate.id) < (current.created_at, ¤t.id), + } + } + + // Preserve first-seen group order so output ordering stays deterministic. + let mut order: Vec<(String, String)> = Vec::new(); + let mut winners: HashMap<(String, String), (RepoRecord, i64)> = HashMap::new(); + let mut latest: HashMap<(String, String), DateTime> = HashMap::new(); + + for (rec, stars) in rows { + // did:key-aware owner key: strip a `did:key:` prefix so the bare mirror id + // and its `did:key:` canonical collapse, but leave any other DID method + // whole so `did:key:X` and `did:gitlawb:X` never merge. The `!contains(':')` + // guard mirrors did_matches' `key_id` check: a stripped value that still + // holds a `:` is a non-key full DID (e.g. malformed `did:key:did:gitlawb:X`) + // and must keep its full form, not collapse onto the bare method DID. Stays + // byte-equivalent to the SQL CASE in Db::DEDUP_CTE / count_repos_deduped. + let owner_key = rec + .owner_did + .strip_prefix("did:key:") + .filter(|rest| !rest.contains(':')) + .unwrap_or(&rec.owner_did) + .to_string(); + let key = (owner_key, rec.name.clone()); + + latest + .entry(key.clone()) + .and_modify(|u| { + if rec.updated_at > *u { + *u = rec.updated_at; + } + }) + .or_insert(rec.updated_at); + + match winners.get(&key) { + None => { + order.push(key.clone()); + winners.insert(key, (rec, stars)); + } + Some((current, _)) if outranks(&rec, current) => { + winners.insert(key, (rec, stars)); + } + Some(_) => {} + } + } + + order + .into_iter() + .filter_map(|key| { + let max_updated = latest.get(&key).copied(); + winners.remove(&key).map(|(mut rec, stars)| { + if let Some(u) = max_updated { + rec.updated_at = u; + } + (rec, stars) + }) + }) + .collect() +} + #[cfg(test)] mod tests { - use super::owner_push_rejection; - use crate::auth::caller_authorized_to_push; - use crate::error::AppError; + use super::*; const OWNER_DID: &str = "did:key:z6MkpTHR8VNsBxYAAWHut2Geadd9jSwuBV8xRoAnwWsdvktH"; const OWNER_SHORT: &str = "z6MkpTHR8VNsBxYAAWHut2Geadd9jSwuBV8xRoAnwWsdvktH"; @@ -1457,4 +1548,320 @@ mod tests { assert!(caller_authorized_to_push(&repo, OWNER_SHORT)); assert!(!caller_authorized_to_push(&repo, STRANGER_DID)); } + + fn ts(s: &str) -> DateTime { + DateTime::parse_from_rfc3339(s).unwrap().with_timezone(&Utc) + } + + fn record(id: &str, owner_did: &str, name: &str, desc: &str, updated: &str) -> RepoRecord { + RepoRecord { + id: id.to_string(), + name: name.to_string(), + owner_did: owner_did.to_string(), + description: Some(desc.to_string()), + is_public: true, + default_branch: "main".to_string(), + created_at: ts("2026-01-01T00:00:00Z"), + updated_at: ts(updated), + disk_path: format!("/srv/{id}"), + forked_from: None, + machine_id: None, + } + } + + #[test] + fn canonical_row_wins_over_short_owner_mirror() { + // Order deliberately puts the mirror row first to prove ranking, not input order, decides. + let mirror = record( + "z6Mkwbud/nipmod", + "z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-02-01T00:00:00Z", + ); + let canonical = record( + "9d92186a", + "did:key:z6Mkwbud", + "nipmod", + "Decentralized npm for agents on Gitlawb", + "2026-01-15T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(mirror, 3), (canonical, 7)]); + + assert_eq!(out.len(), 1, "the two rows collapse into one logical repo"); + let (rec, stars) = &out[0]; + assert_eq!( + rec.owner_did, "did:key:z6Mkwbud", + "canonical did:key row wins" + ); + assert_eq!( + rec.description.as_deref(), + Some("Decentralized npm for agents on Gitlawb"), + "canonical description and metadata survive, not the mirror placeholder", + ); + assert_eq!(*stars, 7, "star count follows the canonical row"); + // Survivor inherits the group's most recent updated_at (here the mirror's). + assert_eq!(rec.updated_at, ts("2026-02-01T00:00:00Z")); + } + + #[test] + fn distinct_repos_are_preserved_in_order() { + let a = record( + "id-a", + "did:key:z6Aaa", + "alpha", + "first", + "2026-03-01T00:00:00Z", + ); + let b = record( + "id-b", + "did:key:z6Bbb", + "beta", + "second", + "2026-03-02T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(a, 1), (b, 2)]); + + assert_eq!(out.len(), 2); + assert_eq!(out[0].0.name, "alpha"); + assert_eq!(out[1].0.name, "beta"); + } + + #[test] + fn same_short_owner_different_repo_does_not_collapse() { + // `one` is a real mirror row: slash-form id is the structural marker. + let one = record( + "z6Mkwbud/nipmod", + "z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-01-01T00:00:00Z", + ); + let two = record( + "id-2", + "did:key:z6Mkwbud", + "other", + "real", + "2026-01-01T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(one, 0), (two, 0)]); + + assert_eq!( + out.len(), + 2, + "different repo names stay separate under one owner" + ); + } + + #[test] + fn distinct_did_methods_sharing_a_base58_id_do_not_collapse() { + // `did:key` and `did:gitlawb` share the base58 id space, so a trailing + // segment key would treat these as one repo. The did:key-aware key keeps + // them apart, matching crate::api::did_matches. + let keyed = record( + "id-keyed", + "did:key:z6Mkwbud", + "nipmod", + "owned via did:key", + "2026-01-01T00:00:00Z", + ); + let gitlawb = record( + "id-gitlawb", + "did:gitlawb:z6Mkwbud", + "nipmod", + "owned via did:gitlawb", + "2026-01-01T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(keyed, 1), (gitlawb, 2)]); + + assert_eq!( + out.len(), + 2, + "same name and base58 id under different DID methods are distinct repos" + ); + } + + #[test] + fn bare_id_and_did_key_form_of_same_owner_collapse() { + // A bare mirror id and its did:key canonical are the same owner and must + // collapse, the mirror-vs-canonical case stated in owner-key terms. + let mirror = record( + "z6Mkwbud/nipmod", + "z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-02-01T00:00:00Z", + ); + let canonical = record( + "canon-id", + "did:key:z6Mkwbud", + "nipmod", + "real", + "2026-01-15T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(mirror, 0), (canonical, 5)]); + + assert_eq!(out.len(), 1, "bare id and its did:key form are one owner"); + assert_eq!(out[0].0.owner_did, "did:key:z6Mkwbud", "canonical row wins"); + } + + #[test] + fn did_key_wrapping_a_full_did_does_not_collapse_onto_the_bare_method_did() { + // Residual-colon guard, mirroring did_matches' `!key_id().contains(':')`: + // a malformed `did:key:did:gitlawb:X` strips to `did:gitlawb:X`, which still + // holds a `:`, so it must keep its full form and NOT collapse with a real + // `did:gitlawb:X` repo of the same name. + let wrapped = record( + "id-wrapped", + "did:key:did:gitlawb:z6Mkwbud", + "nipmod", + "malformed nested DID", + "2026-01-01T00:00:00Z", + ); + let method = record( + "id-method", + "did:gitlawb:z6Mkwbud", + "nipmod", + "real method DID", + "2026-01-02T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(wrapped, 1), (method, 2)]); + + assert_eq!( + out.len(), + 2, + "a did:key-wrapped full DID stays distinct from the bare method DID" + ); + // Assert identity, not just count: each owner survives unmerged, so a + // regression that kept two rows but mis-keyed the survivor is also caught. + let mut owners: Vec<&str> = out.iter().map(|(r, _)| r.owner_did.as_str()).collect(); + owners.sort_unstable(); + assert_eq!( + owners, + vec!["did:gitlawb:z6Mkwbud", "did:key:did:gitlawb:z6Mkwbud"], + "both owner DIDs survive in their full form" + ); + } + + #[test] + fn empty_did_key_residual_keys_to_empty_string_consistently() { + // Degenerate boundary the reviewers flagged: `did:key:` with no id strips to + // an empty residual (no colon), so the key is "". A bare empty owner also + // keys to "", so the two collapse — proving the Rust strip path maps the + // empty residual exactly like the SQL `substr(owner_did, 9)` / `position` + // path (mirrored in the db-level test). A real did:key id keys separately. + let empty_did_key = record( + "id-empty-didkey", + "did:key:", + "nipmod", + "empty residual", + "2026-01-01T00:00:00Z", + ); + let empty_bare = record( + "id-empty-bare", + "", + "nipmod", + "empty owner", + "2026-01-02T00:00:00Z", + ); + let real = record( + "id-real", + "did:key:z6Mkwbud", + "nipmod", + "real id", + "2026-01-03T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(empty_did_key, 0), (empty_bare, 0), (real, 0)]); + + assert_eq!( + out.len(), + 2, + "`did:key:` and the empty owner share the empty key and collapse; the real id stays separate" + ); + } + + #[test] + fn two_mirror_rows_break_tie_by_earliest_created_at() { + // Both are mirror rows (slash-form ids); earliest created_at wins. + let mut older = record( + "z6X/r", + "z6X", + "r", + "mirrored from peer", + "2026-02-01T00:00:00Z", + ); + older.created_at = ts("2026-01-01T00:00:00Z"); + let mut newer = record( + "z6X/r-dup", + "z6X", + "r", + "mirrored from peer", + "2026-03-01T00:00:00Z", + ); + newer.created_at = ts("2026-01-10T00:00:00Z"); + + let out = dedupe_canonical_repos(vec![(newer, 0), (older, 0)]); + + assert_eq!(out.len(), 1); + assert_eq!(out[0].0.id, "z6X/r", "earliest created_at wins the tie"); + } + + #[test] + fn canonical_with_mirror_description_is_treated_as_canonical() { + // Marker robustness: the canonical row carries the literal mirror + // description (user-settable) but a UUID id; the true mirror has the + // slash id and was created earlier. The canonical must still win — dedup + // keys on the structural id, not the description. + let canonical = record( + "9d92186a-uuid", + "did:key:z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-02-01T00:00:00Z", + ); + let mirror = record( + "z6Mkwbud/nipmod", + "z6Mkwbud", + "nipmod", + "a normal description", + "2026-01-01T00:00:00Z", + ); + + let out = dedupe_canonical_repos(vec![(canonical, 5), (mirror, 1)]); + + assert_eq!(out.len(), 1); + assert_eq!( + out[0].0.id, "9d92186a-uuid", + "canonical wins by structural id marker despite the mirror description" + ); + } + + #[test] + fn full_tie_resolves_by_id_asc() { + // Two canonical rows in one group, identical created_at; only id differs. + // Survivor is id ASC, matching SQL's DISTINCT ON (… created_at ASC, id ASC). + let bbb = record( + "bbb", + "did:key:z6Same", + "repo", + "real", + "2026-01-01T00:00:00Z", + ); + let aaa = record("aaa", "z6Same", "repo", "real", "2026-01-01T00:00:00Z"); + + let out = dedupe_canonical_repos(vec![(bbb, 0), (aaa, 0)]); + + assert_eq!(out.len(), 1, "same group collapses"); + assert_eq!( + out[0].0.id, "aaa", + "id ASC breaks a full tie deterministically" + ); + } } diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index cfb289e..36e44ff 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -767,6 +767,20 @@ const MIGRATIONS: &[Migration] = &[ "ALTER TABLE agents ADD COLUMN IF NOT EXISTS deactivated_at TEXT", ], }, + Migration { + version: 7, + name: "repo_owner_dedup_key_didkey_aware", + stmts: &[ + // The dedup grouping key moved from the last `:` segment to a + // did:key-aware key (strip `did:key:`, leave any other DID method + // whole) so `did:key:X` and `did:gitlawb:X` no longer collapse. Swap + // the index that backs it: drop the last-segment one from v1 and build + // the matching expression index. The CASE must stay byte-for-byte in + // sync with DEDUP_CTE / count_repos_deduped or Postgres won't use it. + "DROP INDEX IF EXISTS idx_repos_owner_short_name", + "CREATE INDEX IF NOT EXISTS idx_repos_owner_key_name ON repos ((CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END), name)", + ], + }, ]; // ── Repos ───────────────────────────────────────────────────────────────────── @@ -855,6 +869,11 @@ impl Db { Ok(rows.into_iter().map(row_to_repo).collect()) } + /// Raw list of every repo row — NOT deduped (a mirror row and its canonical + /// row both appear) and without stars. For enumeration callers that must see + /// every physical row (e.g. the IPFS object scan in `api::ipfs`), not for + /// listing surfaces. Listing surfaces dedupe via `list_all_repos_deduped` or + /// `list_all_repos_with_stars` + `dedupe_canonical_repos`. pub async fn list_all_repos(&self) -> Result> { let rows = sqlx::query( "SELECT id, name, owner_did, description, is_public, default_branch, @@ -889,30 +908,50 @@ impl Db { .collect()) } - pub async fn list_all_repos_paged( - &self, - owner_filter: Option<&str>, - limit: i64, - offset: i64, - ) -> Result<(Vec<(RepoRecord, i64)>, i64)> { - let rows = sqlx::query( - "WITH deduped AS ( - SELECT DISTINCT ON (split_part(owner_did, ':', -1), name) + /// Shared dedup CTE: collapses the mirror row and the canonical row of one + /// logical repo into a single survivor. `$1` is an optional owner filter + /// (NULL = all rows). Grouping collapses on a did:key-aware owner key: strip a + /// `did:key:` prefix (8 chars, so `substr(owner_did, 9)`) only when the + /// remainder is a bare id with no `:`, otherwise keep the full DID. That is the + /// exact normalization in `crate::api::did_matches`, so `did:key:X` and a bare + /// `X` collapse while distinct DID methods (`did:gitlawb:X`) never merge. The + /// CASE is repeated verbatim in `count_repos_deduped` and the v7 index and must + /// stay byte-identical or Postgres stops using the index. + /// The canonical row wins (mirror rows carry a slash-form `id` written only by + /// `upsert_mirror_repo`), ties broken by earliest `created_at` then `id` so a + /// full tie still picks a deterministic survivor. `list_all_repos_paged`, + /// `list_all_repos_deduped`, and the marker logic in + /// `crate::api::repos::dedupe_canonical_repos` must stay in sync. + const DEDUP_CTE: &'static str = "WITH deduped AS ( + SELECT DISTINCT ON (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name) id, name, owner_did, description, is_public, default_branch, created_at, -- group MAX, not the canonical row's own value: pushes that -- arrive via gossip touch only the mirror row, so the -- canonical updated_at goes stale MAX(updated_at) OVER ( - PARTITION BY split_part(owner_did, ':', -1), name + PARTITION BY CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name ) AS updated_at, disk_path, forked_from, machine_id FROM repos WHERE ($1::text IS NULL OR owner_did = $1 OR owner_did LIKE '%:' || $1) - ORDER BY split_part(owner_did, ':', -1), name, - CASE WHEN description = 'mirrored from peer' THEN 1 ELSE 0 END, - created_at ASC - ) + ORDER BY CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name, + -- mirror rows carry a slash-form id (\"{owner_short}/{name}\"), + -- written only by upsert_mirror_repo; canonical ids are UUIDs. + -- Rank canonical (no slash) ahead of the mirror in each group, + -- keyed on the structural id, not the user-settable description. + CASE WHEN position('/' in id) > 0 THEN 1 ELSE 0 END, + created_at ASC, id ASC + )"; + + pub async fn list_all_repos_paged( + &self, + owner_filter: Option<&str>, + limit: i64, + offset: i64, + ) -> Result<(Vec<(RepoRecord, i64)>, i64)> { + let sql = format!( + "{} SELECT d.id, d.name, d.owner_did, d.description, d.is_public, d.default_branch, d.created_at, d.updated_at, d.disk_path, @@ -925,12 +964,14 @@ impl Db { ) s ON s.repo_id = d.id ORDER BY d.updated_at DESC LIMIT $2 OFFSET $3", - ) - .bind(owner_filter) - .bind(limit) - .bind(offset) - .fetch_all(&self.pool) - .await?; + Self::DEDUP_CTE + ); + let rows = sqlx::query(&sql) + .bind(owner_filter) + .bind(limit) + .bind(offset) + .fetch_all(&self.pool) + .await?; let total = rows .first() @@ -946,7 +987,7 @@ impl Db { let total = if out.is_empty() { let row = sqlx::query( - "SELECT COUNT(DISTINCT (split_part(owner_did, ':', -1), name)) AS cnt + "SELECT COUNT(DISTINCT (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name)) AS cnt FROM repos WHERE ($1::text IS NULL OR owner_did = $1 OR owner_did LIKE '%:' || $1)", ) @@ -961,6 +1002,41 @@ impl Db { Ok((out, total)) } + /// Deduped repo list (no stars, no paging) for the unfiltered read surfaces + /// (GraphQL `repos`). One logical repo per mirror+canonical group, ordered by + /// the group's most recent activity. Shares `DEDUP_CTE` with the paged path so + /// the dedup rule cannot drift; binds a NULL owner filter (all rows). + pub async fn list_all_repos_deduped(&self) -> Result> { + let sql = format!( + "{} + SELECT d.id, d.name, d.owner_did, d.description, d.is_public, + d.default_branch, d.created_at, d.updated_at, d.disk_path, + d.forked_from, d.machine_id + FROM deduped d + ORDER BY d.updated_at DESC", + Self::DEDUP_CTE + ); + let rows = sqlx::query(&sql) + .bind(None::<&str>) + .fetch_all(&self.pool) + .await?; + + Ok(rows.into_iter().map(row_to_repo).collect()) + } + + /// Count of distinct logical repos (mirror + canonical collapsed), for + /// `/api/v1/stats`. Uses the same did:key-aware owner-key grouping as + /// `DEDUP_CTE` (the CASE must stay byte-identical); the marker/tiebreak only + /// decide which row would survive, not the group count, so they are not needed here. + pub async fn count_repos_deduped(&self) -> Result { + let row = sqlx::query( + "SELECT COUNT(DISTINCT (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name)) AS cnt FROM repos", + ) + .fetch_one(&self.pool) + .await?; + Ok(row.get::("cnt")) + } + pub async fn touch_repo(&self, id: &str) -> Result<()> { sqlx::query("UPDATE repos SET updated_at = $1 WHERE id = $2") .bind(Utc::now().to_rfc3339()) @@ -3001,3 +3077,436 @@ mod agent_discovery_tests { assert!(filter_discoverable(vec![], Some("reputation:score")).is_empty()); } } + +#[cfg(test)] +mod dedup_db_tests { + use super::{Db, RepoRecord}; + use chrono::{DateTime, Utc}; + use sqlx::PgPool; + + async fn db(pool: PgPool) -> Db { + let db = Db::for_testing(pool); + db.run_migrations().await.unwrap(); + db + } + + fn ts(s: &str) -> DateTime { + DateTime::parse_from_rfc3339(s).unwrap().with_timezone(&Utc) + } + + /// Build a repo row with explicit timestamps. A slash in `id` marks a mirror + /// row (the format `upsert_mirror_repo` writes); a UUID-shaped `id` is canonical. + fn rec( + id: &str, + owner_did: &str, + name: &str, + desc: &str, + created: &str, + updated: &str, + ) -> RepoRecord { + RepoRecord { + id: id.to_string(), + name: name.to_string(), + owner_did: owner_did.to_string(), + description: Some(desc.to_string()), + is_public: true, + default_branch: "main".to_string(), + created_at: ts(created), + updated_at: ts(updated), + disk_path: format!("/srv/{id}"), + forked_from: None, + machine_id: None, + } + } + + /// The canonical `did:key:` row and the short-owner mirror row of one logical + /// repo collapse to a single deduped entry: the canonical row wins and inherits + /// the group's most recent `updated_at`. + #[sqlx::test] + async fn deduped_collapses_mirror_and_canonical(pool: PgPool) { + let db = db(pool).await; + let canonical = rec( + "9d92186a-canonical", + "did:key:z6Mkwbud", + "nipmod", + "Decentralized npm for agents", + "2026-01-15T00:00:00Z", + "2026-01-15T00:00:00Z", + ); + // Mirror row in the shape upsert_mirror_repo writes: slash id, bare owner. + let mirror = rec( + "z6Mkwbud/nipmod", + "z6Mkwbud", + "nipmod", + "mirrored from peer", + "2026-02-01T00:00:00Z", + "2026-03-01T00:00:00Z", + ); + db.create_repo(&canonical).await.unwrap(); + db.create_repo(&mirror).await.unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 1, "the pair collapses to one logical repo"); + assert_eq!(out[0].owner_did, "did:key:z6Mkwbud", "canonical row wins"); + assert_eq!( + out[0].updated_at, + ts("2026-03-01T00:00:00Z"), + "survivor inherits the group's MAX(updated_at)" + ); + } + + /// upsert_mirror_repo's own rows dedupe against a canonical twin (proves the + /// real mirror writer's row shape is classified correctly). + #[sqlx::test] + async fn deduped_collapses_real_upsert_mirror_row(pool: PgPool) { + let db = db(pool).await; + let canonical = rec( + "uuid-canonical", + "did:key:z6Mkwbud", + "nipmod", + "real", + "2026-01-15T00:00:00Z", + "2026-01-15T00:00:00Z", + ); + db.create_repo(&canonical).await.unwrap(); + db.upsert_mirror_repo("z6Mkwbud", "nipmod", "/srv/mirror", None) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!( + out.len(), + 1, + "real mirror row collapses with its canonical twin" + ); + assert_eq!(out[0].owner_did, "did:key:z6Mkwbud", "canonical row wins"); + } + + /// Same name and base58 id but different DID methods (`did:key` vs + /// `did:gitlawb`) must NOT collapse: the grouping key strips only `did:key:` + /// and leaves other methods whole, matching crate::api::did_matches. Both the + /// list (DEDUP_CTE) and count (count_repos_deduped) paths must agree. + #[sqlx::test] + async fn deduped_keeps_distinct_did_methods_apart(pool: PgPool) { + let db = db(pool).await; + db.create_repo(&rec( + "id-keyed", + "did:key:z6Mkwbud", + "nipmod", + "via did:key", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-gitlawb", + "did:gitlawb:z6Mkwbud", + "nipmod", + "via did:gitlawb", + "2026-01-02T00:00:00Z", + "2026-01-02T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 2, "distinct DID methods are distinct owners"); + assert_eq!( + db.count_repos_deduped().await.unwrap(), + 2, + "count path agrees with the list path", + ); + } + + /// SQL residual-colon guard: a malformed `did:key:did:gitlawb:X` strips to a + /// value that still holds a `:`, so the CASE keeps it whole and it does NOT + /// collapse with a real `did:gitlawb:X`. Proves the SQL key matches the Rust + /// `strip_prefix(...).filter(|r| !r.contains(':'))` and did_matches. + #[sqlx::test] + async fn deduped_did_key_wrapping_a_full_did_stays_distinct(pool: PgPool) { + let db = db(pool).await; + db.create_repo(&rec( + "id-wrapped", + "did:key:did:gitlawb:z6Mkwbud", + "nipmod", + "malformed nested DID", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-method", + "did:gitlawb:z6Mkwbud", + "nipmod", + "real method DID", + "2026-01-02T00:00:00Z", + "2026-01-02T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!( + out.len(), + 2, + "wrapped full DID stays distinct from the method DID" + ); + assert_eq!( + db.count_repos_deduped().await.unwrap(), + 2, + "count path agrees with the list path", + ); + } + + /// Empty-residual boundary: `did:key:` matches `LIKE 'did:key:%'`, + /// `substr(owner_did, 9)` is '', and `position(':' in '')` is 0, so the CASE + /// keys it to '' just like a bare empty owner, while a real `did:key:z…` keys + /// separately. Pins that the SQL empty-residual handling matches the Rust + /// `strip_prefix(...).filter(...)` path (mirrored in the api-level test). + #[sqlx::test] + async fn deduped_empty_did_key_residual_keys_to_empty_string(pool: PgPool) { + let db = db(pool).await; + db.create_repo(&rec( + "id-empty-didkey", + "did:key:", + "nipmod", + "empty residual", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-empty-bare", + "", + "nipmod", + "empty owner", + "2026-01-02T00:00:00Z", + "2026-01-02T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-real", + "did:key:z6Mkwbud", + "nipmod", + "real id", + "2026-01-03T00:00:00Z", + "2026-01-03T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!( + out.len(), + 2, + "`did:key:` and the empty owner collapse on the empty key; the real id is separate" + ); + assert_eq!( + db.count_repos_deduped().await.unwrap(), + 2, + "count path agrees with the list path", + ); + } + + /// Distinct repos are preserved and ordered by most recent activity. + #[sqlx::test] + async fn deduped_preserves_distinct_repos_ordered_by_updated(pool: PgPool) { + let db = db(pool).await; + db.create_repo(&rec( + "id-a", + "did:key:z6Aaa", + "alpha", + "first", + "2026-03-01T00:00:00Z", + "2026-03-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "id-b", + "did:key:z6Bbb", + "beta", + "second", + "2026-03-02T00:00:00Z", + "2026-03-02T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 2); + assert_eq!(out[0].name, "beta", "most recently updated first"); + assert_eq!(out[1].name, "alpha"); + } + + /// count_repos_deduped counts logical repos, not raw rows. + #[sqlx::test] + async fn count_repos_deduped_counts_logical_repos(pool: PgPool) { + let db = db(pool).await; + // One logical repo (canonical + mirror) plus one standalone. + db.create_repo(&rec( + "uuid-c", + "did:key:z6Mkwbud", + "nipmod", + "real", + "2026-01-15T00:00:00Z", + "2026-01-15T00:00:00Z", + )) + .await + .unwrap(); + db.upsert_mirror_repo("z6Mkwbud", "nipmod", "/srv/m", None) + .await + .unwrap(); + db.create_repo(&rec( + "uuid-d", + "did:key:z6Other", + "solo", + "real", + "2026-01-16T00:00:00Z", + "2026-01-16T00:00:00Z", + )) + .await + .unwrap(); + + assert_eq!(db.count_repos_deduped().await.unwrap(), 2); + } + + /// Full tie (same mirror-status and created_at within a group) resolves to a + /// deterministic survivor by `id ASC`, matching the Rust helper's tiebreak. + #[sqlx::test] + async fn deduped_full_tie_resolves_by_id_asc(pool: PgPool) { + let db = db(pool).await; + // Two canonical rows in the same (normalized owner, name) group, identical + // created_at; only the id differs. Different owner_did strings avoid any + // (owner, name) collision while still normalizing to the same group key. + db.create_repo(&rec( + "bbb", + "did:key:z6Same", + "repo", + "real", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.create_repo(&rec( + "aaa", + "z6Same", + "repo", + "real", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 1, "same group collapses"); + assert_eq!( + out[0].id, "aaa", + "id ASC breaks a full tie deterministically" + ); + } + + /// Marker robustness: a canonical row whose `description` is literally + /// "mirrored from peer" but whose `id` is a UUID is still ranked canonical and + /// wins over a true slash-id mirror in its group — even though the mirror was + /// created earlier. Proves dedup keys on the structural id, not the description. + #[sqlx::test] + async fn deduped_marker_uses_id_not_description(pool: PgPool) { + let db = db(pool).await; + let canonical = rec( + "uuid-canonical", + "did:key:z6Mkwbud", + "nipmod", + "mirrored from peer", // user-settable description = the old marker string + "2026-01-15T00:00:00Z", + "2026-01-15T00:00:00Z", + ); + let mirror = rec( + "z6Mkwbud/nipmod", // slash id = the real structural marker + "z6Mkwbud", + "nipmod", + "a normal description, not the marker", + "2026-01-01T00:00:00Z", // earlier: would win on created_at if marker ignored + "2026-01-01T00:00:00Z", + ); + db.create_repo(&canonical).await.unwrap(); + db.create_repo(&mirror).await.unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!(out.len(), 1); + assert_eq!( + out[0].id, "uuid-canonical", + "canonical wins by structural id marker despite carrying the mirror description" + ); + } + + /// A mirror row with no canonical twin survives dedup as the sole entry for its + /// group (it is not dropped just because it is the mirror). + #[sqlx::test] + async fn deduped_mirror_only_group_survives(pool: PgPool) { + let db = db(pool).await; + db.upsert_mirror_repo("z6Lonely", "orphan", "/srv/m", None) + .await + .unwrap(); + + let out = db.list_all_repos_deduped().await.unwrap(); + assert_eq!( + out.len(), + 1, + "a mirror-only group still yields one logical repo" + ); + assert_eq!(out[0].id, "z6Lonely/orphan"); + assert_eq!(db.count_repos_deduped().await.unwrap(), 1); + } + + /// Degenerate empty table: deduped list is empty and the count is 0, no error. + #[sqlx::test] + async fn deduped_empty_table(pool: PgPool) { + let db = db(pool).await; + assert!(db.list_all_repos_deduped().await.unwrap().is_empty()); + assert_eq!(db.count_repos_deduped().await.unwrap(), 0); + } + + /// count_repos_deduped and list_all_repos_deduped must agree: the count is the + /// number of logical repos the list returns. Guards the two independent SQL + /// queries against drifting on the grouping key. + #[sqlx::test] + async fn deduped_count_matches_list_len(pool: PgPool) { + let db = db(pool).await; + // Two logical repos: one canonical+mirror pair, one standalone canonical. + db.create_repo(&rec( + "uuid-1", + "did:key:z6Pair", + "shared", + "real", + "2026-01-01T00:00:00Z", + "2026-01-01T00:00:00Z", + )) + .await + .unwrap(); + db.upsert_mirror_repo("z6Pair", "shared", "/srv/m", None) + .await + .unwrap(); + db.create_repo(&rec( + "uuid-2", + "did:key:z6Solo", + "solo", + "real", + "2026-01-02T00:00:00Z", + "2026-01-02T00:00:00Z", + )) + .await + .unwrap(); + + let list_len = db.list_all_repos_deduped().await.unwrap().len() as i64; + let count = db.count_repos_deduped().await.unwrap(); + assert_eq!(list_len, 2); + assert_eq!(count, list_len, "count must equal the deduped list length"); + } +} diff --git a/crates/gitlawb-node/src/graphql/query.rs b/crates/gitlawb-node/src/graphql/query.rs index 5f2bfb1..c7673f6 100644 --- a/crates/gitlawb-node/src/graphql/query.rs +++ b/crates/gitlawb-node/src/graphql/query.rs @@ -12,7 +12,7 @@ impl QueryRoot { async fn repos(&self, ctx: &Context<'_>) -> Result> { let db = ctx.data_unchecked::>(); let repos = db - .list_all_repos() + .list_all_repos_deduped() .await .map_err(|e| async_graphql::Error::new(e.to_string()))?; Ok(repos diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index c9df49d..122b71f 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -452,12 +452,7 @@ async fn node_info(State(state): State) -> Json { } async fn stats(State(state): State) -> Json { - let repos = state - .db - .list_all_repos() - .await - .map(|r| r.len() as i64) - .unwrap_or(0); + let repos = state.db.count_repos_deduped().await.unwrap_or(0); let agents = state.db.count_agents().await.unwrap_or(0); let pushes = state.db.count_pushes().await.unwrap_or(0); Json(json!({ diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index 540c6ce..154db14 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -510,4 +510,101 @@ mod tests { "register must reject a DID other than the signer" ); } + + /// Issue #6 / jatmn finding 1: the GraphQL `repos` query renders one logical + /// repo per mirror+canonical pair. Seeds a canonical `did:key:` repo plus its + /// short-owner mirror row and a distinct standalone repo, then asserts the + /// query returns two entries (not three) and the shared repo appears once as + /// the canonical owner. + #[sqlx::test] + async fn graphql_repos_is_deduped(pool: PgPool) { + let short = "zGRAPHQLDEDUPAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(&format!("did:key:{short}"), "shared")) + .await + .expect("seed canonical"); + state + .db + .upsert_mirror_repo(short, "shared", "/tmp/mirror", None) + .await + .expect("seed mirror"); + state + .db + .create_repo(&seed_repo( + "did:key:zGQLOTHERBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", + "solo", + )) + .await + .expect("seed standalone"); + + let resp = state + .graphql_schema + .execute(async_graphql::Request::new("{ repos { name ownerDid } }")) + .await; + assert!(resp.errors.is_empty(), "graphql errors: {:?}", resp.errors); + let data = resp.data.into_json().expect("graphql data to json"); + let repos = data["repos"].as_array().expect("repos array"); + assert_eq!( + repos.len(), + 2, + "mirror+canonical collapse to one logical repo, plus the standalone" + ); + let shared: Vec<_> = repos.iter().filter(|r| r["name"] == "shared").collect(); + assert_eq!(shared.len(), 1, "the shared repo must not be double-listed"); + assert_eq!( + shared[0]["ownerDid"], + serde_json::json!(format!("did:key:{short}")), + "the canonical did:key row is the survivor" + ); + } + + /// Issue #6 / jatmn finding 2: `/api/v1/stats` counts logical repos, not raw + /// rows. With a mirror+canonical pair and a standalone repo present, the + /// `repos` count is 2. + #[sqlx::test] + async fn stats_repo_count_is_deduped(pool: PgPool) { + let short = "zSTATSDEDUPAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(&format!("did:key:{short}"), "shared")) + .await + .expect("seed canonical"); + state + .db + .upsert_mirror_repo(short, "shared", "/tmp/mirror", None) + .await + .expect("seed mirror"); + state + .db + .create_repo(&seed_repo( + "did:key:zSTATSOTHERBBBBBBBBBBBBBBBBBBBBBBBBBB", + "solo", + )) + .await + .expect("seed standalone"); + + let router = crate::server::build_router(state); + let resp = router + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/api/v1/stats") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let bytes = axum::body::to_bytes(resp.into_body(), usize::MAX) + .await + .unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + assert_eq!( + json["repos"], 2, + "stats must count logical repos (mirror+canonical collapsed)" + ); + } }