From 8698f592f6af55f1c1b82fcafdda50eb2c7f3757 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Wed, 10 Jun 2026 22:27:44 +0200 Subject: [PATCH 1/2] Publish kegg116 KEGG artefacts as gzip, version-prefixed assets (v0.1.0) First downloadable KEGG artefact set, wired into the runtime resolvers: - All artefacts are gzip and version-prefixed (kegg116_.gz) so MATLAB and Windows read them with the built-in gunzip, no external tool. organism_gene_ko moves from xz to gzip for the same reason. - HMM libraries ship as one gzip concatenated flatfile per domain; ensure_kegg_hmm_library decompresses and hmmpresses on first use, ~10x smaller than the pressed index and portable across HMMER versions. - Add a version-prefix-tolerant artefact resolver (_resolve_artefact) used by the organism/sequence entry points; parse_kegg_dump and build_kegg_artefacts.py gain an opt-in --version. - Populate data/manifest.json and _DATA_REGISTRY with the kegg116 release assets (real SHA256 + bytes); refresh the maintainer docs and manifest example. - Bump version to 0.1.0 and update CHANGELOG. --- CHANGELOG.md | 19 +++- IMPROVEMENTS.md | 2 +- data/manifest.example.json | 18 +-- data/manifest.json | 48 +++++++- docs/maintenance/kegg_data_format.md | 49 ++++---- docs/maintenance/maintaining_kegg_data.md | 49 +++++--- docs/reference/matlab_raven_backports.md | 10 +- docs/studies/kegg_hmm_cutoff_calibration.md | 2 +- pyproject.toml | 2 +- scripts/analyze_hmm_cutoffs.py | 2 +- scripts/build_kegg_artefacts.py | 43 ++++--- src/raven_python/data.py | 103 +++++++++++++---- .../reconstruction/kegg/organism.py | 10 +- src/raven_python/reconstruction/kegg/parse.py | 76 ++++++++++--- src/raven_python/reconstruction/kegg/query.py | 8 +- tests/test_data.py | 105 ++++++++++++++---- tests/test_reconstruction_kegg_parse.py | 16 ++- 17 files changed, 402 insertions(+), 160 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bd72aa..96a47e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,20 @@ Milestones in the raven-python port. For function-level status see [docs/raven_migration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/migration.md); for open work see [docs/todo.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/todo.md). -## Unreleased - -Post-release review pass — cobra-aligned hardening (no behaviour change on -well-formed inputs). Highlights: - +## 0.1.0 — 2026-06-10 + +First release with **published, downloadable KEGG artefacts**, plus a cobra-aligned +hardening pass (no behaviour change on well-formed inputs). Highlights: + +* **KEGG artefacts published (`kegg116`):** `ensure_kegg_data` / + `ensure_kegg_hmm_library` fetch version-pinned, SHA256-verified files from the + GitHub release. Every artefact is **gzip + version-prefixed** + (`kegg116_.gz`) so MATLAB and Windows read them with the built-in `gunzip` + (no external tool) — `organism_gene_ko` moved from xz to gzip for this. The **HMM + libraries ship as one gzip concatenated flatfile per domain** + (`kegg116_.hmm.gz`); the client decompresses and `hmmpress`-es once on + first use, cutting the download ~10× versus the pressed index and letting the + same artefact serve MATLAB RAVEN. * **Packaging:** `raven_python.__version__` now derives from the installed package metadata (`importlib.metadata`) instead of a hard-coded literal that had drifted to `0.0.1`; the docs site reported the wrong version. Pinned `ruff==0.15.15` in diff --git a/IMPROVEMENTS.md b/IMPROVEMENTS.md index 9fa9096..01dd3e5 100644 --- a/IMPROVEMENTS.md +++ b/IMPROVEMENTS.md @@ -98,7 +98,7 @@ and `taxonomy.py` (3b.3). Maintainer-side, build-time tooling (PLAN.md §2.3b). | K11 | ERGONOMICS | raven-python 🔨 | 🔨 | **`ensure_data`** (`data.py`): version-pinned registry that fetches/verifies/caches the published KEGG artefacts under `~/.cache/raven-python/data/`, mirroring `ensure_binary`. End users get a draft model with no KEGG access and no manual data handling — the `…_from_artefacts` entry points auto-fetch when no local dir is supplied. | | K12 | EFFICIENCY | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Fast MAFFT (FFT-NS-2) for HMM training** instead of RAVEN's `--auto`, which selects slow iterative refinement (`dvtditr`) on medium/large KOs — observed ~2.5 min/KO (days for a domain) on real KEGG 118. FFT-NS-2 (`--retree 2 --maxiterate 0`) is seconds/KO and ample for profile-HMM building. **PartTree cutover is residue-based and memory-auto-tuned**: MAFFT memory tracks residues (count × length), not sequence count, so a count threshold let long-protein KOs (K00901: 2,788 seqs, 2.55 M residues) OOM under FFT-NS-2 — measured ~5 GB MAFFT RSS with FFT-NS-2 vs **0.69 GB with PartTree** for the same alignment. The cutover is **length-aware and memory-auto-tuned**: FFT-NS-2 memory is driven by the progressive-alignment **DP cost ≈ n_seqs × mean_len²** (= residues²/n_seqs), *not* residue count — a few hundred long proteins cost far more than the same residues in many short ones. (First tried a residue-only model `RSS≈1.32R²+1.84R`; it then OOM'd on K12047 — 452 seqs but mean length 2082, 0.94 M residues — because long proteins blow the per-residue cost.) Calibrated `RSS_GB ≈ 4.2e-9 × (n_seqs × mean_len²)` across real KEGG KOs (250k/266→0.67 GB … 1.5M/1624→5.73 GB; K12047 cost 1.96e9 = the largest, hence its OOM). `_auto_cost_budget` switches to PartTree when the DP cost exceeds `0.65 × (total − 2.5 GB overhead) / 4.2e-9` (≈7.9e8 on a 7.6 GB box), **warns on low-memory hosts**, and `parttree_residues` overrides with a manual residue cutoff. Back-portable to RAVEN. | | K13 | EFFICIENCY | raven-python 🗑️ | 🗑️ | ~~Per-KO sequence cap (`max_sequences`)~~ — **removed.** Briefly added as a count-based cap, but the residue-based PartTree cutover (K12) bounds MAFFT memory without dropping any sequences, so the cap was redundant complexity. All deduplicated sequences are kept. | -| K14 | EFFICIENCY (size) | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Sort `organism_gene_ko` by `(organism, gene)` and store it xz-compressed** (`organism_gene_ko.tsv.xz`), cutting the dominant artefact **≈78 → 27 MB (2.9×)**. Gene IDs within an organism share long prefixes (locus tags, numeric runs), so sorting makes them adjacent and far more compressible (sort alone: 78→48 MB; xz vs gzip captures the cross-row redundancy gzip's 32 KB window misses: →27 MB). The sort is an **external merge sort** bounded to `chunk_rows` rows in memory (sorted runs spooled to gzipped temp files, merged with `heapq.merge`), so it keeps K9's flat memory profile. Both `lzma` and `gzip` are Python stdlib (native on Windows/macOS/Linux, no extra binary); small tables stay gzipped TSV (MATLAB-native), only the big one is xz (MATLAB needs an external `unxz`). Sorted order also matches the by-organism query in `get_kegg_model_for_organism`, enabling a future `searchsorted` slice instead of loading all 9M rows. Back-portable to RAVEN. | +| K14 | EFFICIENCY (size) | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Sort `organism_gene_ko` by `(organism, gene)` and store it gzipped** (`organism_gene_ko.tsv.gz`), cutting the dominant artefact **≈78 → 48 MB** by sorting alone. Gene IDs within an organism share long prefixes (locus tags, numeric runs), so sorting makes them adjacent and far more compressible. The sort is an **external merge sort** bounded to `chunk_rows` rows in memory (sorted runs spooled to gzipped temp files, merged with `heapq.merge`), so it keeps K9's flat memory profile. We first xz-compressed this file (≈27 MB, 2.9×) but switched to **gzip** (≈74 MB) so MATLAB reads it with built-in `gunzip` and no external `unxz`: the artefacts are shared with MATLAB RAVEN, and the once-per-release size cut wasn't worth a MATLAB toolchain dependency. All tables are now gzipped TSV, native on Windows/macOS/Linux. Sorted order also matches the by-organism query in `get_kegg_model_for_organism`, enabling a future `searchsorted` slice instead of loading all 9M rows. Back-portable to RAVEN. | | K15 | ERGONOMICS (correctness) | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Recalibrate the HMM-query KO-assignment defaults** (`assign_kos`): cut-off `1e-50 → 1e-30`, `min_score_ratio_g 0.8 → 0.9`; `min_score_ratio_ko` left at 0.3 but **documented as empirically inert**. Cross-validated the full 3b.5 pipeline against the true KEGG gene→KO annotation of four organisms across both libraries and the well-/lesser-studied axis — *S. cerevisiae*, *Cyanidioschyzon merolae* (red alga), *E. coli* K-12, *Mycoplasma genitalium* (minimal genome). Real annotations score overwhelmingly (median E ≈ 1e-100…1e-155; even the weakest 1% ≈ 1e-15…1e-36) while spurious hits cluster at ≈1e-8 — a ~20-order-of-magnitude gap. RAVEN's `1e-50` therefore sits **inside the true-positive tail** and silently drops real-but-divergent hits for no noise-rejection gain: gene→KO recall on *M. genitalium* was only 0.84 (reaction recall 0.87). At `1e-30` + `ratio_g=0.9`: *M. genitalium* recall **0.84→0.94** (rxn 0.87→0.97), *E. coli* 0.95→0.97 with **fewer** unannotated reactions (198→173, the tighter gene-ratio prunes spurious multi-KO genes), *S. cerevisiae*/*C. merolae* held or improved. The three sweep tables showed `min_score_ratio_ko` produced identical output at 0.0/0.3/0.5 across all four organisms — a magic-number knob that does nothing; `min_score_ratio_g` is the real precision lever. Full numbers in [docs/kegg_hmm_cutoff_calibration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/studies/kegg_hmm_cutoff_calibration.md) (reproduce with `scripts/analyze_hmm_cutoffs.py`). Back-portable to RAVEN. | ## FSEOF (Phase 5 — implemented, redesigned) diff --git a/data/manifest.example.json b/data/manifest.example.json index 2bc5e81..036ff5e 100644 --- a/data/manifest.example.json +++ b/data/manifest.example.json @@ -1,20 +1,20 @@ { "manifest_version": 1, - "generated": "2026-05-30", + "generated": "2026-06-10", "data": { "kegg": { "version": "kegg116", "description": "KEGG reference model, KO/reaction tables, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.", "license": "Derived from the KEGG database; redistributed with permission from KEGG.", - "doi": "10.5281/zenodo.0000000", - "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/kegg-kegg116", + "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0", "files": { - "reference_model.yml.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/reference_model.yml.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "ko_reaction.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/ko_reaction.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "ko_names.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/ko_names.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "organism_gene_ko.tsv.xz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/organism_gene_ko.tsv.xz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "rxn_flags.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/rxn_flags.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "prokaryotes.hmm": { "url": "https://zenodo.org/records/0000000/files/prokaryotes.hmm", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 } + "kegg116_reference_model.yml.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "kegg116_ko_reaction.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "kegg116_ko_names.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "kegg116_organism_gene_ko.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "kegg116_rxn_flags.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "kegg116_prokaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "kegg116_eukaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 } } } }, diff --git a/data/manifest.json b/data/manifest.json index baf5ee9..84d291f 100644 --- a/data/manifest.json +++ b/data/manifest.json @@ -1,6 +1,50 @@ { "manifest_version": 1, - "generated": "2026-05-30", - "data": {}, + "generated": "2026-06-10", + "data": { + "kegg": { + "version": "kegg116", + "description": "KEGG reference model, KO/reaction tables, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.", + "license": "Derived from the KEGG database; redistributed with permission from KEGG.", + "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0", + "files": { + "kegg116_eukaryotes.hmm.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz", + "sha256": "2d48bc9935575d0f9ba4178bf2df19279bff866b49c1bf83a8e15787b11d6708", + "bytes": 134002309 + }, + "kegg116_ko_names.tsv.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz", + "sha256": "84f9c7150172d948f794d91a6608d55f7140f31e53249c705057ae49b11c93b3", + "bytes": 14585 + }, + "kegg116_ko_reaction.tsv.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz", + "sha256": "e1a4ac22875bd3030d03b78368b0153b6d99000acb2ee0f474340a03c180323c", + "bytes": 49196 + }, + "kegg116_organism_gene_ko.tsv.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", + "sha256": "27bf7dd58eb1acd5904990dc2be187aae4d8d9b9f7421375618e7c8d6ff7253d", + "bytes": 47935249 + }, + "kegg116_prokaryotes.hmm.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz", + "sha256": "d80cb2a22dec9fd8336b3998e3b96ee121672f63f4041cddaf09624fe739f1af", + "bytes": 153173750 + }, + "kegg116_reference_model.yml.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz", + "sha256": "73ff313fe2aa2830ec511f4e522226c98c5714c2d5c4632844544e5a409c7f0c", + "bytes": 1090563 + }, + "kegg116_rxn_flags.tsv.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", + "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b", + "bytes": 33289 + } + } + } + }, "binaries": {} } diff --git a/docs/maintenance/kegg_data_format.md b/docs/maintenance/kegg_data_format.md index efb6d13..23b327b 100644 --- a/docs/maintenance/kegg_data_format.md +++ b/docs/maintenance/kegg_data_format.md @@ -16,33 +16,32 @@ under `~/.cache/raven-python/data/kegg-/` by `ensure_data` (see ## Decision (current) -- **Small tables** (`ko_reaction`, `ko_names`, `rxn_flags`): **gzipped TSV - (`.tsv.gz`)**. Each is well under 1 MB, so compression choice is irrelevant; - gzip keeps them MATLAB-native and dependency-free. -- **The large `organism_gene_ko` table**: **xz-compressed TSV - (`organism_gene_ko.tsv.xz`), with rows sorted by `(organism, gene)`**. +- **All tables** (`ko_reaction`, `ko_names`, `rxn_flags`, and the large + `organism_gene_ko`): **gzipped TSV (`.tsv.gz`)**. Published assets are + version-prefixed, e.g. `kegg116_organism_gene_ko.tsv.gz`. +- The large `organism_gene_ko` table keeps its rows **sorted by `(organism, gene)`**. -Why the large table differs. It carries KEGG's ~9M gene↔KO associations and -dominates the artefact set (≈78 MB as unsorted gzipped TSV). Two cheap, -stdlib-only changes cut that to ≈27 MB (2.9×): +Why everything is gzip — even the big table. `organism_gene_ko` carries KEGG's +~9M gene↔KO associations and dominates the artefact set. Sorting by +`(organism, gene)` before writing makes gene IDs from one organism adjacent +(shared locus-tag/numeric prefixes), which both helps the compressor and matches +the by-organism query pattern in `get_kegg_model_for_organism`; the sort is an +external merge sort bounded to `chunk_rows` in memory (see +`stream_organism_gene_ko`), so it stays scalable. On the real dump this lands +around ~74 MB. -1. **Sort by `(organism, gene)`** before writing. Gene IDs from one organism - share long common prefixes (locus tags, numeric runs); sorting makes them - adjacent so the compressor can fold them. This alone takes 78 → 48 MB and - happens to match the by-organism query pattern in - `get_kegg_model_for_organism`. The sort is an external merge sort bounded to - `chunk_rows` in memory (see `stream_organism_gene_ko`), so it stays scalable. -2. **xz instead of gzip** (Python stdlib `lzma`). Its larger dictionary captures - cross-row redundancy gzip's 32 KB window misses: sorted + xz reaches ≈27 MB. +We previously xz-compressed this one file (≈27 MB, ~2.9× smaller). We switched it +to **gzip** so the *same* artefact is readable by MATLAB's built-in `gunzip` with +no external tool — the artefacts are shared with MATLAB RAVEN, and `.xz` would +force an external `xz`/`unxz` dependency. The size cost (~74 vs ~27 MB on a +once-per-release download) buys a dependency-free, cross-tool, cross-platform +read; xz's larger dictionary is not worth a MATLAB toolchain requirement. -- **pandas reads/writes both with zero extra dependencies** — compression is - inferred from the `.gz`/`.xz` suffix; `lzma` and `gzip` are both stdlib, so - this works natively on Windows, macOS, and Linux with no external binary. -- **MATLAB caveat:** `readtable` reads gzipped TSV after a `gunzip`, but MATLAB - has no built-in xz decompressor. The small tables stay MATLAB-native; the - large table needs an external `unxz` (or Java/`7-Zip`) before `readtable` on - the MATLAB side. The xz file is raven-python's (Python) primary artefact; this - trades a little MATLAB convenience on the one big file for a ~3× size cut. +- **pandas reads/writes gzip with zero extra dependencies** — compression is + inferred from the `.gz` suffix; `gzip` is stdlib, so this works natively on + Windows, macOS, and Linux with no external binary. +- **MATLAB:** `readtable` reads every table after a built-in `gunzip`, with no + external binary on any file. ## Options considered @@ -57,7 +56,7 @@ stdlib-only changes cut that to ≈27 MB (2.9×): Reconsider Parquet (or SQLite) if any of these become true: - The `organism_gene_ko` table grows large enough that load *time* (not just - size — the sort+xz change above already addresses on-disk size) becomes a real + size — the sort above already keeps on-disk size in check) becomes a real bottleneck. The remaining inefficiency is that building one species' model still loads all ~9M rows; sorted order makes a `searchsorted`/row-group by-organism read the natural next step before reaching for Parquet. diff --git a/docs/maintenance/maintaining_kegg_data.md b/docs/maintenance/maintaining_kegg_data.md index 356b574..da4bd96 100644 --- a/docs/maintenance/maintaining_kegg_data.md +++ b/docs/maintenance/maintaining_kegg_data.md @@ -76,11 +76,13 @@ KEGG release). ```python from raven_python.reconstruction.kegg import parse_kegg_dump -parse_kegg_dump("keggdb", "artefacts") +parse_kegg_dump("keggdb", "artefacts", version="kegg116") ``` This writes the gene-free reference model (`reference_model.yml.gz`, gzipped -RAVEN/cobra YAML) and the relational tables as gzipped TSV. See +RAVEN/cobra YAML) and the relational tables as gzipped TSV. With `version=` set, +every output filename is version-prefixed (e.g. `kegg116_organism_gene_ko.tsv.gz`), +matching the published release assets. See [kegg_data_format.md](kegg_data_format.md) for what those tables contain and the format rationale. @@ -98,7 +100,7 @@ searches. This needs **HMMER** (`hmmbuild`, `hmmpress`), **MAFFT**, and ```python from raven_python.reconstruction.kegg import build_hmm_library, read_kegg_table -organism_gene_ko = read_kegg_table("artefacts/organism_gene_ko.tsv.xz") +organism_gene_ko = read_kegg_table("artefacts/kegg116_organism_gene_ko.tsv.gz") for domain in ("prokaryotes", "eukaryotes"): build_hmm_library( organism_gene_ko, @@ -111,32 +113,42 @@ for domain in ("prokaryotes", "eukaryotes"): For each KO in the domain it gathers the member sequences, dereplicates with CD-HIT (~90 % identity), aligns with MAFFT, trains a profile with `hmmbuild`, and -finally concatenates and `hmmpress`-es them into a single `library.hmm` for fast -`hmmscan` querying. This is the slowest step (hours, once per KEGG release); it -skips KOs whose `.hmm` already exists, so it is resumable. The resulting -libraries are published as version-pinned artefacts alongside the reference model -and tables. +finally concatenates them into a single `library.hmm`. This is the slowest step +(hours, once per KEGG release); it skips KOs whose `.hmm` already exists, so it is +resumable. The concatenated library is published **gzipped** as +`_.hmm.gz` (e.g. `kegg116_prokaryotes.hmm.gz`); end users +decompress it and run `hmmpress` once on first use (see `ensure_kegg_hmm_library`), +which keeps the download ~10× smaller than the binary index, stays portable across +HMMER versions, and lets the same artefact serve MATLAB RAVEN. ## Building and publishing in one go [`scripts/build_kegg_artefacts.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md) runs 3b.2 (+ 3b.3 with -`--hmms`) and lays the output out as publishable assets (`.hmm` named for -`ensure_kegg_hmm_library`): +`--hmms`) and lays the output out as publishable, version-prefixed assets +(`_.hmm.gz` named for `ensure_kegg_hmm_library`): ```bash -python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms --threads 8 +python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts \ + --version kegg116 --hmms --threads 8 ``` -Upload the contents of `artefacts/` to a release, then emit the registry entry for -`raven_python.data._DATA_REGISTRY` with [`scripts/make_registry_snippet.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md): +Upload the contents of `artefacts/` to the release, then record the artefacts in +both the shared `data/manifest.json` and `raven_python.data._DATA_REGISTRY` with +[`scripts/make_registry_snippet.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md) +(it computes each file's SHA256 + size): ```bash +# shared source of truth (read by raven-python and MATLAB RAVEN): +python scripts/make_registry_snippet.py manifest --manifest data/manifest.json \ + --target data --dataset kegg --version kegg116 --dir artefacts \ + --base-url https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0 +# in-code registry, so end users auto-fetch with no env var (paste into _DATA_REGISTRY): python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \ - --dir artefacts --base-url https://github.com/ORG/raven-python/releases/download/kegg-data-kegg116 + --dir artefacts --base-url https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0 ``` -Paste the printed block into `_DATA_REGISTRY`; from then on `ensure_data` fetches -and verifies the artefacts for end users automatically. +From then on `ensure_data` fetches and verifies the artefacts for end users +automatically. ## End-user paths (3b.4 / 3b.5) @@ -152,6 +164,7 @@ model from the artefacts: cross-platform. `organism_id="prokaryotes"`/`"eukaryotes"` builds a whole-domain model (pass `taxonomy=`). - **3b.5 — organism not in KEGG** (`get_kegg_model_from_sequences`): `hmmscan`-es a - proteome FASTA against the pressed `library.hmm`, so it needs **HMMER** - (`hmmscan`) — Linux/macOS or WSL2 (see the OS matrix). Tune assignment with + proteome FASTA against the domain library (decompressed and `hmmpress`-ed from + the published `.hmm.gz` on first use), so it needs **HMMER** (`hmmpress`, + `hmmscan`) — Linux/macOS or WSL2 (see the OS matrix). Tune assignment with `cutoff`, `min_score_ratio_ko`, `min_score_ratio_g`. diff --git a/docs/reference/matlab_raven_backports.md b/docs/reference/matlab_raven_backports.md index 5187f9e..8b6c44a 100644 --- a/docs/reference/matlab_raven_backports.md +++ b/docs/reference/matlab_raven_backports.md @@ -70,11 +70,11 @@ Implemented across [`reconstruction.kegg`](https://github.com/SysBioChalmers/rav switches when DP-cost > 0.65 × (RAM − 2.5 GB) / 4.2e-9). Same protein set, no sequences dropped. * **K14** ⚡ — Sort `organism_gene_ko` by `(organism, gene)` and store it - xz-compressed (`organism_gene_ko.tsv.xz`). Cuts the dominant artefact from - ~78 MB to ~27 MB (2.9×): gene IDs within an organism share long prefixes - so adjacency makes them far more compressible, and xz's larger window - catches cross-row redundancy gzip's 32 KB window misses. MATLAB needs an - external `unxz` to read the file; the small tables remain plain gzipped TSV. + gzipped (`organism_gene_ko.tsv.gz`). Sorting makes gene IDs within an organism + adjacent (shared locus-tag/numeric prefixes), so they compress far better + (~78 → 48 MB) and the order matches the by-organism query. Stored as gzip (not + xz) so MATLAB reads it with built-in `gunzip`, since the same artefact is shared + with raven-python; xz would be ~3× smaller but needs an external `unxz`. * **K15** 🐛 — Recalibrate the HMM-query KO-assignment defaults: change `cutoff` from `1e-50` to `1e-30` and `min_score_ratio_g` from `0.8` to `0.9`. `min_score_ratio_ko` can stay at `0.3` but is empirically inert diff --git a/docs/studies/kegg_hmm_cutoff_calibration.md b/docs/studies/kegg_hmm_cutoff_calibration.md index c2fd106..9c9b306 100644 --- a/docs/studies/kegg_hmm_cutoff_calibration.md +++ b/docs/studies/kegg_hmm_cutoff_calibration.md @@ -199,5 +199,5 @@ Full reconstruction of *S. cerevisiae* two ways, at the old defaults: Reaction recall 96.3 % (1305/1355 shared, Jaccard 0.86); gene recall 96.6 % (807/835 shared, Jaccard 0.87). The annotation path also exercises the new -`organism_gene_ko.tsv.xz` artefact (K14). `hmmscan` throughput ≈ 0.1 s/query +`organism_gene_ko.tsv.gz` artefact (K14). `hmmscan` throughput ≈ 0.1 s/query against either library on 12 threads (yeast: 6021 queries in 633 s). diff --git a/pyproject.toml b/pyproject.toml index c5d847c..c847ed3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "raven-python" -version = "0.1.0a1" +version = "0.1.0" description = "Reconstruction, Analysis and Visualization of Metabolic Networks in Python, a port of the RAVEN Toolbox built on cobrapy" readme = "README.md" requires-python = ">=3.11" diff --git a/scripts/analyze_hmm_cutoffs.py b/scripts/analyze_hmm_cutoffs.py index 654fc02..6026604 100644 --- a/scripts/analyze_hmm_cutoffs.py +++ b/scripts/analyze_hmm_cutoffs.py @@ -50,7 +50,7 @@ def load_ko2rxn(artefacts: Path) -> dict[str, set[str]]: def ground_truth(artefacts: Path, org: str, ko2rxn) -> tuple[set, set]: - ogk = read_kegg_table(artefacts / "organism_gene_ko.tsv.xz") + ogk = read_kegg_table(artefacts / "organism_gene_ko.tsv.gz") rows = ogk[ogk["organism"].str.lower() == org] pairs = set(zip(rows["gene"], rows["ko"], strict=True)) rxns = {r for _, ko in pairs for r in ko2rxn.get(ko, ())} diff --git a/scripts/build_kegg_artefacts.py b/scripts/build_kegg_artefacts.py index 13fd00e..2539dd3 100644 --- a/scripts/build_kegg_artefacts.py +++ b/scripts/build_kegg_artefacts.py @@ -5,26 +5,29 @@ ``download_kegg_dump`` / ``fetch_keggdb``): * 3b.2 — ``parse_kegg_dump`` → ``reference_model.yml.gz`` + the gzipped-TSV tables; -* 3b.3 — ``build_hmm_library`` per domain → a pressed ``.hmm`` (+ hmmpress - sidecars), named so :func:`raven_python.data.ensure_kegg_hmm_library` can fetch them. +* 3b.3 — ``build_hmm_library`` per domain → a gzipped concatenated flatfile + ``_.hmm.gz`` (the client decompresses + ``hmmpress``-es it on + first use), named so :func:`raven_python.data.ensure_kegg_hmm_library` can fetch it. -Everything lands in ``--out`` ready to upload as release assets; feed that +Pass ``--version`` (e.g. ``kegg116``) to version-prefix every output filename, matching +the published release assets. Everything lands in ``--out`` ready to upload; feed that directory to ``scripts/make_registry_snippet.py data`` to emit the registry entry. Examples -------- Tables + reference model only (fast, no binaries):: - python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts + python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --version kegg116 Full build incl. both HMM libraries (slow; needs HMMER/MAFFT/CD-HIT):: python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts \\ - --hmms --threads 8 + --version kegg116 --hmms --threads 8 """ from __future__ import annotations import argparse +import gzip import shutil from pathlib import Path @@ -34,21 +37,20 @@ read_kegg_table, ) -# hmmpress sidecar extensions, alongside the .hmm. -_HMM_SIDECARS = (".h3f", ".h3i", ".h3m", ".h3p") +def _publish_library(work: dict, out_dir: Path, domain: str, prefix: str = "") -> Path: + """Gzip a built ``library.hmm`` to ``out_dir/.hmm.gz``. -def _publish_library(work: dict, out_dir: Path, domain: str) -> Path: - """Copy a built ``library.hmm`` (+ sidecars) to ``out_dir/.hmm``.""" + Only the concatenated flatfile is published (gzip, ~10x smaller than the + hmmpress index and portable across HMMER versions); the client decompresses and + re-presses on first use, so the same artefact also serves MATLAB RAVEN. + """ library = work["library"] if library is None: raise SystemExit(f"No HMMs built for {domain!r}; nothing to publish.") - target = out_dir / f"{domain}.hmm" - shutil.copyfile(library, target) - for suffix in _HMM_SIDECARS: - sidecar = library.with_name(library.name + suffix) - if sidecar.exists(): - shutil.copyfile(sidecar, target.with_name(target.name + suffix)) + target = out_dir / f"{prefix}{domain}.hmm.gz" + with open(library, "rb") as src, gzip.open(target, "wb") as out: + shutil.copyfileobj(src, out) return target @@ -56,6 +58,10 @@ def main(argv: list[str] | None = None) -> None: parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("--keggdb", required=True, type=Path, help="arranged KEGG dump directory") parser.add_argument("--out", required=True, type=Path, help="artefact output directory") + parser.add_argument( + "--version", default=None, + help="KEGG version (e.g. kegg116); version-prefixes every output filename", + ) parser.add_argument("--hmms", action="store_true", help="also build the HMM libraries") parser.add_argument( "--domains", nargs="+", default=["prokaryotes", "eukaryotes"], help="HMM domains to build" @@ -70,8 +76,9 @@ def main(argv: list[str] | None = None) -> None: args = parser.parse_args(argv) args.out.mkdir(parents=True, exist_ok=True) + prefix = f"{args.version}_" if args.version else "" print(">>> Parsing KEGG dump (3b.2)...") - paths = parse_kegg_dump(args.keggdb, args.out) + paths = parse_kegg_dump(args.keggdb, args.out, version=args.version) for name, path in paths.items(): print(f" {name}: {path}") @@ -86,12 +93,12 @@ def main(argv: list[str] | None = None) -> None: domain=domain, seq_identity=args.seq_identity, parttree_residues=args.parttree_residues, threads=args.threads, ) - published = _publish_library(work, args.out, domain) + published = _publish_library(work, args.out, domain, prefix) print(f" {domain}: {published} ({len(work['hmms'])} profiles)") print(f"\n>>> Done. Upload the contents of {args.out} as release assets, then run:") print(" python scripts/make_registry_snippet.py data --dataset kegg " - f"--version --dir {args.out} --base-url ") + f"--version {args.version or ''} --dir {args.out} --base-url ") if __name__ == "__main__": diff --git a/src/raven_python/data.py b/src/raven_python/data.py index a02749e..fdba221 100644 --- a/src/raven_python/data.py +++ b/src/raven_python/data.py @@ -19,23 +19,62 @@ """ from __future__ import annotations +import gzip import os import shutil +import subprocess from pathlib import Path from urllib.request import urlopen -from raven_python.binaries import _sha256 +from raven_python.binaries import _sha256, resolve_binary # dataset -> {"version": str, "files": {filename: {"url": str, "sha256": str}}} -# Populated when raven_python publishes the KEGG artefacts as release assets. -_DATA_REGISTRY: dict = {} - -# The core KEGG artefacts needed to build a model (no HMM libraries). +# Mirrors data/manifest.json (the cross-language source of truth); regenerate the +# block with scripts/make_registry_snippet.py when publishing a new KEGG release. +_DATA_REGISTRY: dict = { + "kegg": { + "version": "kegg116", + "files": { + "kegg116_eukaryotes.hmm.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz", + "sha256": "2d48bc9935575d0f9ba4178bf2df19279bff866b49c1bf83a8e15787b11d6708", + }, + "kegg116_ko_names.tsv.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz", + "sha256": "84f9c7150172d948f794d91a6608d55f7140f31e53249c705057ae49b11c93b3", + }, + "kegg116_ko_reaction.tsv.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz", + "sha256": "e1a4ac22875bd3030d03b78368b0153b6d99000acb2ee0f474340a03c180323c", + }, + "kegg116_organism_gene_ko.tsv.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", + "sha256": "27bf7dd58eb1acd5904990dc2be187aae4d8d9b9f7421375618e7c8d6ff7253d", + }, + "kegg116_prokaryotes.hmm.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz", + "sha256": "d80cb2a22dec9fd8336b3998e3b96ee121672f63f4041cddaf09624fe739f1af", + }, + "kegg116_reference_model.yml.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz", + "sha256": "73ff313fe2aa2830ec511f4e522226c98c5714c2d5c4632844544e5a409c7f0c", + }, + "kegg116_rxn_flags.tsv.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", + "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b", + }, + }, + }, +} + +# The core KEGG artefacts needed to build a model (no HMM libraries). These are +# the *base* names; published assets are version-prefixed (``_``), +# which is what the resolvers below construct and what the registry keys hold. CORE_KEGG_FILES = ( "reference_model.yml.gz", "ko_reaction.tsv.gz", "ko_names.tsv.gz", - "organism_gene_ko.tsv.xz", + "organism_gene_ko.tsv.gz", "rxn_flags.tsv.gz", ) @@ -128,30 +167,52 @@ def ensure_kegg_data( ) -> Path: """Ensure the core KEGG artefacts are cached; return their directory. - Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`) for the ``kegg`` - dataset and returns the cache directory holding them — ready to pass as the - ``artefact_dir`` of :func:`get_kegg_model_for_organism_from_artefacts`. + Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`, given as *base* + names) for the ``kegg`` dataset and returns the cache directory holding them — + ready to pass as the ``artefact_dir`` of + :func:`get_kegg_model_for_organism_from_artefacts`. Each file is fetched under + its version-prefixed published name (``_``). """ registry = _DATA_REGISTRY if registry is None else registry ver = version or _bundle("kegg", registry)["version"] - for filename in files: - ensure_data_file("kegg", filename, version=ver, registry=registry) + for base in files: + ensure_data_file("kegg", f"{ver}_{base}", version=ver, registry=registry) return _data_cache_dir() / f"kegg-{ver}" def ensure_kegg_hmm_library( - domain: str, *, version: str | None = None, registry: dict | None = None + domain: str, + *, + version: str | None = None, + registry: dict | None = None, + hmmpress: str | os.PathLike | None = None, ) -> Path: - """Ensure a domain HMM library (and its hmmpress index) is cached; return its path. - - ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches ``.hmm`` - plus the ``hmmpress`` sidecar files (``.h3f/.h3i/.h3m/.h3p``) and returns the - path to the ``.hmm`` (the argument for :func:`run_hmmscan`). + """Ensure a domain HMM library is cached and pressed; return the ``.hmm`` path. + + ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches the gzipped + concatenated library ``_.hmm.gz``, decompresses it once, and + runs ``hmmpress`` to build the ``.h3f/.h3i/.h3m/.h3p`` index ``hmmscan`` needs + (HMMER is already a requirement of the de-novo query path). Both steps are + cached, so they run only on first use. Returns the path to the decompressed + ``.hmm`` (the argument for :func:`run_hmmscan`). + + Shipping the gzip flatfile and pressing on the client keeps the download ~10x + smaller than the binary index, stays portable across HMMER versions/platforms, + and lets the same artefact serve MATLAB RAVEN. """ registry = _DATA_REGISTRY if registry is None else registry ver = version or _bundle("kegg", registry)["version"] - base = f"{domain}.hmm" - library = ensure_data_file("kegg", base, version=ver, registry=registry) - for suffix in (".h3f", ".h3i", ".h3m", ".h3p"): - ensure_data_file("kegg", base + suffix, version=ver, registry=registry) + archive = ensure_data_file("kegg", f"{ver}_{domain}.hmm.gz", version=ver, registry=registry) + library = archive.with_suffix("") # strip ".gz" -> _.hmm + if not library.exists(): + tmp = library.with_name(library.name + ".part") + with gzip.open(archive, "rb") as src, open(tmp, "wb") as out: + shutil.copyfileobj(src, out) + tmp.replace(library) + sidecars = (".h3f", ".h3i", ".h3m", ".h3p") + if not all(library.with_name(library.name + s).exists() for s in sidecars): + exe = resolve_binary("hmmpress", binary=hmmpress) + proc = subprocess.run([exe, "-f", str(library)], capture_output=True, text=True) + if proc.returncode != 0: + raise RuntimeError(f"hmmpress failed:\n{(proc.stderr or '').strip()}") return library diff --git a/src/raven_python/reconstruction/kegg/organism.py b/src/raven_python/reconstruction/kegg/organism.py index 9f30575..91f554b 100644 --- a/src/raven_python/reconstruction/kegg/organism.py +++ b/src/raven_python/reconstruction/kegg/organism.py @@ -28,7 +28,7 @@ from raven_python.io.yaml import read_yaml_model from raven_python.reconstruction.kegg.assemble import _DOMAINS, assemble_model_from_ko_genes -from raven_python.reconstruction.kegg.parse import read_kegg_table +from raven_python.reconstruction.kegg.parse import _resolve_artefact, read_kegg_table from raven_python.reconstruction.kegg.taxonomy import organisms_in_domain _NOTE = "Included by get_kegg_model_for_organism (no HMMs)" @@ -139,10 +139,10 @@ def get_kegg_model_for_organism_from_artefacts( artefact_dir = ensure_kegg_data(version=version) artefact_dir = Path(artefact_dir) - reference_model = read_yaml_model(artefact_dir / "reference_model.yml.gz") - ko_reaction = read_kegg_table(artefact_dir / "ko_reaction.tsv.gz") - organism_gene_ko = read_kegg_table(artefact_dir / "organism_gene_ko.tsv.xz") - rxn_flags = read_kegg_table(artefact_dir / "rxn_flags.tsv.gz") + reference_model = read_yaml_model(_resolve_artefact(artefact_dir, "reference_model.yml.gz")) + ko_reaction = read_kegg_table(_resolve_artefact(artefact_dir, "ko_reaction.tsv.gz")) + organism_gene_ko = read_kegg_table(_resolve_artefact(artefact_dir, "organism_gene_ko.tsv.gz")) + rxn_flags = read_kegg_table(_resolve_artefact(artefact_dir, "rxn_flags.tsv.gz")) return get_kegg_model_for_organism( organism_id, reference_model, diff --git a/src/raven_python/reconstruction/kegg/parse.py b/src/raven_python/reconstruction/kegg/parse.py index 3ecd6f4..23c6f0e 100644 --- a/src/raven_python/reconstruction/kegg/parse.py +++ b/src/raven_python/reconstruction/kegg/parse.py @@ -28,7 +28,6 @@ import gzip import heapq -import lzma import re import tempfile from collections.abc import Iterator @@ -444,17 +443,21 @@ def build_kegg_tables( } -def write_kegg_tables(tables: dict[str, pd.DataFrame], out_dir: str | Path) -> list[Path]: - """Write each table as a gzipped TSV (``.tsv.gz``) into ``out_dir``. +def write_kegg_tables( + tables: dict[str, pd.DataFrame], out_dir: str | Path, *, prefix: str = "" +) -> list[Path]: + """Write each table as a gzipped TSV (``.tsv.gz``) into ``out_dir``. Gzipped TSV is the dependency-free cross-language format shared with MATLAB - RAVEN (see docs/kegg_data_format.md). Returns the written paths. + RAVEN (see docs/kegg_data_format.md) — readable by MATLAB's built-in ``gunzip`` + with no external tool. ``prefix`` version-tags the filenames (e.g. + ``kegg116_``). Returns the written paths. """ out_dir = Path(out_dir) out_dir.mkdir(parents=True, exist_ok=True) written = [] for name, frame in tables.items(): - path = out_dir / f"{name}.tsv.gz" + path = out_dir / f"{prefix}{name}.tsv.gz" with gzip.open(path, "wt", encoding="utf-8", newline="") as handle: frame.to_csv(handle, sep="\t", index=False) written.append(path) @@ -465,13 +468,35 @@ def read_kegg_table(path: str | Path) -> pd.DataFrame: """Read a KEGG table written by :func:`write_kegg_tables` or :func:`stream_organism_gene_ko`. - Compression is inferred from the suffix, so both the gzipped small tables - (``.tsv.gz``) and the xz-compressed ``organism_gene_ko.tsv.xz`` are read - transparently. + Compression is inferred from the suffix; all published tables are gzipped TSV + (``.tsv.gz``), and a version-prefixed name (``kegg116_.tsv.gz``) reads + just the same. """ return pd.read_csv(path, sep="\t", dtype=str, keep_default_na=False) +def _resolve_artefact(directory: str | Path, base: str) -> Path: + """Locate an artefact file by its base name, tolerating a version prefix. + + Returns ``directory/base`` if it exists, else the single version-prefixed + match ``directory/_`` (the published asset name, e.g. + ``kegg116_ko_reaction.tsv.gz``). This lets one reader consume both a user's own + unprefixed build directory and the version-pinned download cache. + """ + directory = Path(directory) + exact = directory / base + if exact.exists(): + return exact + matches = sorted(directory.glob(f"*_{base}")) + if len(matches) == 1: + return matches[0] + if not matches: + raise FileNotFoundError( + f"{base!r} (or a _ prefixed form) not found in {directory}" + ) + raise ValueError(f"Ambiguous {base!r} in {directory}: {sorted(p.name for p in matches)}") + + def _flush_sorted_run(rows: list[str], tmp_dir: Path, run_no: int) -> Path: """Sort a buffer of ``organism\\tgene\\tko\\n`` lines and write one gzipped run.""" rows.sort(key=_ogk_sort_key) @@ -490,14 +515,15 @@ def _ogk_sort_key(line: str) -> tuple[str, str]: def stream_organism_gene_ko( kegg_dir: str | Path, keep: set[str], ogk_path: str | Path, *, chunk_rows: int = 1_000_000 ) -> pd.DataFrame: - """Stream the ``ko`` file to a sorted, xz-compressed ``organism_gene_ko.tsv.xz``. + """Stream the ``ko`` file to a sorted, gzipped ``organism_gene_ko.tsv.gz``. Real KEGG has ~9M gene↔KO associations — far too many to hold in memory as a DataFrame. Rows are sorted by ``(organism, gene)`` before writing: gene IDs from one organism share long common prefixes (locus tags, numeric runs), so - sorting makes them adjacent and lets the compressor shrink the table ~2.9x - versus the unsorted gzip form. The order also matches the by-organism query - pattern in :func:`get_kegg_model_for_organism`. + sorting makes them adjacent and helps the compressor; the order also matches + the by-organism query pattern in :func:`get_kegg_model_for_organism`. Gzip + (not xz) keeps the table readable by MATLAB's built-in ``gunzip`` with no + external tool, at a modestly larger size. The sort is an **external merge sort** bounded to ``chunk_rows`` rows in memory at a time (sorted runs spooled to gzipped temp files, then merged with @@ -526,7 +552,7 @@ def stream_organism_gene_ko( handles = [gzip.open(r, "rt", encoding="utf-8") for r in runs] try: - with lzma.open(ogk_path, "wt", encoding="utf-8", newline="") as out: + with gzip.open(ogk_path, "wt", encoding="utf-8", newline="") as out: out.write("organism\tgene\tko\n") out.writelines(heapq.merge(*handles, key=_ogk_sort_key)) finally: @@ -535,7 +561,9 @@ def stream_organism_gene_ko( return pd.DataFrame(names, columns=["ko", "name"]) -def parse_kegg_dump(kegg_dir: str | Path, out_dir: str | Path) -> dict[str, Path]: +def parse_kegg_dump( + kegg_dir: str | Path, out_dir: str | Path, *, version: str | None = None +) -> dict[str, Path]: """Parse a full KEGG dump into the reference model + tables and write them out. Writes ``reference_model.yml.gz`` (gzipped RAVEN/cobra YAML) plus the @@ -544,9 +572,14 @@ def parse_kegg_dump(kegg_dir: str | Path, out_dir: str | Path) -> dict[str, Path ``organism_gene_ko`` table is streamed to disk (see :func:`stream_organism_gene_ko`) rather than built in memory, so this scales to the full KEGG database; the small derived tables are built in memory. + + When ``version`` is given (e.g. ``"kegg116"``) the output filenames are + version-prefixed (``_``), matching the published release + assets; the returned dict keys stay the logical table names. """ out_dir = Path(out_dir) out_dir.mkdir(parents=True, exist_ok=True) + prefix = f"{version}_" if version else "" reactions = parse_kegg_reactions(kegg_dir) compounds = parse_kegg_compounds(kegg_dir) @@ -563,16 +596,23 @@ def parse_kegg_dump(kegg_dir: str | Path, out_dir: str | Path) -> dict[str, Path columns=["reaction", "spontaneous", "undefined_stoich", "incomplete", "general"], ), } - paths = {name: p for name, p in zip(small, write_kegg_tables(small, out_dir), strict=True)} + paths = { + name: p + for name, p in zip(small, write_kegg_tables(small, out_dir, prefix=prefix), strict=True) + } - ogk_path = out_dir / "organism_gene_ko.tsv.xz" + ogk_path = out_dir / f"{prefix}organism_gene_ko.tsv.gz" ko_names = stream_organism_gene_ko(kegg_dir, linked_kos, ogk_path) paths["organism_gene_ko"] = ogk_path paths.update( - zip(["ko_names"], write_kegg_tables({"ko_names": ko_names}, out_dir), strict=True) + zip( + ["ko_names"], + write_kegg_tables({"ko_names": ko_names}, out_dir, prefix=prefix), + strict=True, + ) ) - ref_path = out_dir / "reference_model.yml.gz" + ref_path = out_dir / f"{prefix}reference_model.yml.gz" write_yaml_model(model, ref_path) paths["reference_model"] = ref_path return paths diff --git a/src/raven_python/reconstruction/kegg/query.py b/src/raven_python/reconstruction/kegg/query.py index 2df3f78..00a48f3 100644 --- a/src/raven_python/reconstruction/kegg/query.py +++ b/src/raven_python/reconstruction/kegg/query.py @@ -27,7 +27,7 @@ from raven_python.binaries import resolve_binary from raven_python.io.yaml import read_yaml_model from raven_python.reconstruction.kegg.assemble import assemble_model_from_ko_genes -from raven_python.reconstruction.kegg.parse import read_kegg_table +from raven_python.reconstruction.kegg.parse import _resolve_artefact, read_kegg_table _NOTE = "Included by get_kegg_model_from_sequences (using HMMs)" _MIN_EVALUE = 1e-250 # floor for a reported E-value of 0, to keep logs finite @@ -223,9 +223,9 @@ def get_kegg_model_from_sequences_with_artefacts( if library is None: library = ensure_kegg_hmm_library(domain, version=version) artefact_dir = Path(artefact_dir) - reference_model = read_yaml_model(artefact_dir / "reference_model.yml.gz") - ko_reaction = read_kegg_table(artefact_dir / "ko_reaction.tsv.gz") - rxn_flags = read_kegg_table(artefact_dir / "rxn_flags.tsv.gz") + reference_model = read_yaml_model(_resolve_artefact(artefact_dir, "reference_model.yml.gz")) + ko_reaction = read_kegg_table(_resolve_artefact(artefact_dir, "ko_reaction.tsv.gz")) + rxn_flags = read_kegg_table(_resolve_artefact(artefact_dir, "rxn_flags.tsv.gz")) return get_kegg_model_from_sequences( fasta, reference_model, ko_reaction, library, rxn_flags=rxn_flags, **kwargs ) diff --git a/tests/test_data.py b/tests/test_data.py index 714c3a9..d8db588 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,12 +1,17 @@ """Tests for ensure_data (data.py). Uses file:// URLs to avoid the network.""" +import gzip import hashlib +import subprocess +from pathlib import Path import pytest +from raven_python import data from raven_python.data import ( CORE_KEGG_FILES, ensure_data_file, ensure_kegg_data, + ensure_kegg_hmm_library, ) @@ -16,22 +21,30 @@ def _sha256(data: bytes) -> str: @pytest.fixture def served(tmp_path, monkeypatch): - """A fake registry served from local files, with the cache pointed at tmp.""" + """A fake registry served from local files, with the cache pointed at tmp. + + Published assets are version-prefixed (``v1_``), matching the real + registry shape that ``ensure_kegg_data`` / ``ensure_kegg_hmm_library`` build. + ``payloads`` is keyed by the published (prefixed) name. + """ src = tmp_path / "src" src.mkdir() - payloads = { + ver = "v1" + bases = { "reference_model.yml.gz": b"!!omap model bytes", "ko_reaction.tsv.gz": b"ko\treaction\n", "ko_names.tsv.gz": b"ko\tname\n", - "organism_gene_ko.tsv.xz": b"organism\tgene\tko\n", + "organism_gene_ko.tsv.gz": b"organism\tgene\tko\n", "rxn_flags.tsv.gz": b"reaction\tspontaneous\n", } files = {} - for name, data in payloads.items(): - path = src / name - path.write_bytes(data) - files[name] = {"url": path.as_uri(), "sha256": _sha256(data)} - registry = {"kegg": {"version": "v1", "files": files}} + payloads = {} + for base, data_bytes in bases.items(): + name = f"{ver}_{base}" + (src / name).write_bytes(data_bytes) + files[name] = {"url": (src / name).as_uri(), "sha256": _sha256(data_bytes)} + payloads[name] = data_bytes + registry = {"kegg": {"version": ver, "files": files}} cache = tmp_path / "cache" monkeypatch.setenv("XDG_CACHE_HOME", str(cache)) @@ -40,27 +53,27 @@ def served(tmp_path, monkeypatch): def test_ensure_data_file_downloads_and_caches(served): registry, cache, payloads = served - path = ensure_data_file("kegg", "ko_reaction.tsv.gz", registry=registry) - assert path == cache / "raven_python" / "data" / "kegg-v1" / "ko_reaction.tsv.gz" - assert path.read_bytes() == payloads["ko_reaction.tsv.gz"] + path = ensure_data_file("kegg", "v1_ko_reaction.tsv.gz", registry=registry) + assert path == cache / "raven_python" / "data" / "kegg-v1" / "v1_ko_reaction.tsv.gz" + assert path.read_bytes() == payloads["v1_ko_reaction.tsv.gz"] -def test_ensure_data_file_reuses_cache(served, monkeypatch): +def test_ensure_data_file_reuses_cache(served): registry, _, _ = served - first = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry) + first = ensure_data_file("kegg", "v1_ko_names.tsv.gz", registry=registry) # Break the URL: a second call must hit the cache, not re-download. - registry["kegg"]["files"]["ko_names.tsv.gz"]["url"] = "file:///nonexistent" - second = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry) + registry["kegg"]["files"]["v1_ko_names.tsv.gz"]["url"] = "file:///nonexistent" + second = ensure_data_file("kegg", "v1_ko_names.tsv.gz", registry=registry) assert first == second and second.exists() def test_sha256_mismatch_rejected(served): registry, cache, _ = served - registry["kegg"]["files"]["rxn_flags.tsv.gz"]["sha256"] = "0" * 64 + registry["kegg"]["files"]["v1_rxn_flags.tsv.gz"]["sha256"] = "0" * 64 with pytest.raises(ValueError, match="SHA256 mismatch"): - ensure_data_file("kegg", "rxn_flags.tsv.gz", registry=registry) + ensure_data_file("kegg", "v1_rxn_flags.tsv.gz", registry=registry) # The corrupt partial download must not be left behind. - assert not (cache / "raven_python" / "data" / "kegg-v1" / "rxn_flags.tsv.gz").exists() + assert not (cache / "raven_python" / "data" / "kegg-v1" / "v1_rxn_flags.tsv.gz").exists() def test_unknown_dataset_actionable_error(served): @@ -79,11 +92,57 @@ def test_ensure_kegg_data_fetches_core_set(served): registry, cache, _ = served out = ensure_kegg_data(registry=registry) assert out == cache / "raven_python" / "data" / "kegg-v1" - for name in CORE_KEGG_FILES: - assert (out / name).is_file() + # CORE_KEGG_FILES are base names; the cached files are version-prefixed. + for base in CORE_KEGG_FILES: + assert (out / f"v1_{base}").is_file() + + +def test_ensure_kegg_hmm_library_decompresses_and_presses(served, tmp_path, monkeypatch): + registry, cache, _ = served + raw = b"HMMER3/f [3.4]\nNAME K00001\n//\n" + blob = gzip.compress(raw, mtime=0) + gz = tmp_path / "src" / "v1_prokaryotes.hmm.gz" + gz.write_bytes(blob) + registry["kegg"]["files"]["v1_prokaryotes.hmm.gz"] = { + "url": gz.as_uri(), + "sha256": _sha256(blob), + } + + presses: list[Path] = [] + def fake_run(cmd, capture_output, text): + library = Path(cmd[-1]) + presses.append(library) + for suffix in (".h3f", ".h3i", ".h3m", ".h3p"): + library.with_name(library.name + suffix).write_bytes(b"") + return subprocess.CompletedProcess(cmd, 0, "", "") -def test_empty_registry_raises(): - # The shipped registry is empty until artefacts are published. + monkeypatch.setattr(data, "resolve_binary", lambda name, binary=None: "hmmpress") + monkeypatch.setattr(data.subprocess, "run", fake_run) + + library = ensure_kegg_hmm_library("prokaryotes", registry=registry) + assert library.name == "v1_prokaryotes.hmm" + assert library.read_bytes() == raw # decompressed in place + assert library.with_name(library.name + ".h3m").exists() + assert len(presses) == 1 + + # Second call: library + sidecars already cached, so no re-decompress/-press. + again = ensure_kegg_hmm_library("prokaryotes", registry=registry) + assert again == library + assert len(presses) == 1 + + +def test_unregistered_dataset_raises(): + # An unpublished dataset still raises an actionable error against the shipped registry. with pytest.raises(FileNotFoundError, match="No data artefacts registered"): - ensure_data_file("kegg", "ko_reaction.tsv.gz") + ensure_data_file("metacyc", "x") + + +def test_shipped_registry_matches_resolver_names(): + # The published registry keys must equal what ensure_kegg_data / + # ensure_kegg_hmm_library construct as f"{version}_{base}", or fetches 404. + kegg = data._DATA_REGISTRY["kegg"] + ver = kegg["version"] + names = set(kegg["files"]) + assert {f"{ver}_{base}" for base in CORE_KEGG_FILES} <= names + assert {f"{ver}_{d}.hmm.gz" for d in ("prokaryotes", "eukaryotes")} <= names diff --git a/tests/test_reconstruction_kegg_parse.py b/tests/test_reconstruction_kegg_parse.py index 23d8f71..533642e 100644 --- a/tests/test_reconstruction_kegg_parse.py +++ b/tests/test_reconstruction_kegg_parse.py @@ -195,8 +195,8 @@ def test_parse_kegg_dump_writes_artefacts(tmp_path): "ko_reaction", "ko_names", "organism_gene_ko", "rxn_flags", "reference_model" } assert (tmp_path / "reference_model.yml.gz").is_file() - # organism_gene_ko is streamed to a sorted, xz-compressed TSV. - assert paths["organism_gene_ko"].name == "organism_gene_ko.tsv.xz" + # organism_gene_ko is streamed to a sorted, gzipped TSV. + assert paths["organism_gene_ko"].name == "organism_gene_ko.tsv.gz" ogk = read_kegg_table(paths["organism_gene_ko"]) assert set(ogk.columns) == {"organism", "gene", "ko"} assert ("eco", "b0001", "K00002") in set(map(tuple, ogk.to_numpy())) @@ -205,11 +205,21 @@ def test_parse_kegg_dump_writes_artefacts(tmp_path): assert keys == sorted(keys) +def test_parse_kegg_dump_version_prefixes_filenames(tmp_path): + paths = parse_kegg_dump(DUMP, tmp_path, version="kegg116") + # Dict keys stay logical; the files on disk are version-prefixed. + assert set(paths) >= {"ko_reaction", "organism_gene_ko", "reference_model"} + assert paths["organism_gene_ko"].name == "kegg116_organism_gene_ko.tsv.gz" + assert paths["reference_model"].name == "kegg116_reference_model.yml.gz" + assert (tmp_path / "kegg116_ko_reaction.tsv.gz").is_file() + assert read_kegg_table(paths["organism_gene_ko"]).columns.tolist() == ["organism", "gene", "ko"] + + def test_stream_organism_gene_ko_external_merge(tmp_path): """A tiny chunk_rows forces multiple sorted runs to be merged; output stays sorted.""" from raven_python.reconstruction.kegg.parse import stream_organism_gene_ko - out = tmp_path / "organism_gene_ko.tsv.xz" + out = tmp_path / "organism_gene_ko.tsv.gz" keep = {ko.id for ko in parse_kegg_kos(DUMP)} names = stream_organism_gene_ko(DUMP, keep, out, chunk_rows=1) assert out.is_file() and not list(tmp_path.glob("ogk_sort_*")) # temp dir cleaned up From e586e9f44305739759bdd7fb99f7422ba96b54ab Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Wed, 10 Jun 2026 23:49:21 +0200 Subject: [PATCH 2/2] Add KEGG taxonomy artefact and phyl_dist (RAVEN getPhylDist port) Publish kegg116_taxonomy.gz and regenerate RAVEN's keggPhylDist from it, so GECKO's organism-distance kcat selection needs no MATLAB .mat file: - reconstruction.kegg.phyl_dist + PhylDist faithfully reproduce RAVEN getPhylDist's (asymmetric, occasionally negative) distance metric; parse_taxonomy_records exposes ids/names/lineages and reads .gz transparently. - data.ensure_kegg_taxonomy fetches the artefact; build_kegg_artefacts.py emits it. - Register kegg116_taxonomy.gz in data/manifest.json and _DATA_REGISTRY (8 files). - Tests for phyl_dist (hand-checked against RAVEN) and the taxonomy fetch; update migration/IMPROVEMENTS/maintainer docs and CHANGELOG. --- CHANGELOG.md | 5 + IMPROVEMENTS.md | 2 +- data/manifest.example.json | 1 + data/manifest.json | 7 +- docs/maintenance/maintaining_kegg_data.md | 5 +- docs/reference/migration.md | 4 +- scripts/build_kegg_artefacts.py | 9 ++ src/raven_python/data.py | 18 +++ .../reconstruction/kegg/__init__.py | 6 + .../reconstruction/kegg/taxonomy.py | 145 +++++++++++++++--- tests/test_data.py | 10 ++ tests/test_reconstruction_kegg_phyl_dist.py | 70 +++++++++ 12 files changed, 257 insertions(+), 25 deletions(-) create mode 100644 tests/test_reconstruction_kegg_phyl_dist.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 96a47e5..a13c6d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,11 @@ hardening pass (no behaviour change on well-formed inputs). Highlights: (`kegg116_.hmm.gz`); the client decompresses and `hmmpress`-es once on first use, cutting the download ~10× versus the pressed index and letting the same artefact serve MATLAB RAVEN. +* **Taxonomy + phylogenetic distance:** publish `kegg116_taxonomy.gz` and add + `reconstruction.kegg.phyl_dist` (with `PhylDist`), a faithful port of RAVEN's + `getPhylDist` that regenerates the `keggPhylDist` distance matrix from the + taxonomy file — so GECKO's organism-distance kcat selection needs no MATLAB + `.mat`. `ensure_kegg_taxonomy` fetches the artefact. * **Packaging:** `raven_python.__version__` now derives from the installed package metadata (`importlib.metadata`) instead of a hard-coded literal that had drifted to `0.0.1`; the docs site reported the wrong version. Pinned `ruff==0.15.15` in diff --git a/IMPROVEMENTS.md b/IMPROVEMENTS.md index 01dd3e5..5e6bd18 100644 --- a/IMPROVEMENTS.md +++ b/IMPROVEMENTS.md @@ -92,7 +92,7 @@ and `taxonomy.py` (3b.3). Maintainer-side, build-time tooling (PLAN.md §2.3b). | K5 | EFFICIENCY (portability) | raven-python 🔨 | 🔨 | **KEGG download in pure Python stdlib** (`urllib`/`tarfile`/`gzip`/`netrc`), porting `fetch_keggdb.sh`. Drops the script's `wget`/`tar`/`gunzip` (and Cygwin-on-Windows) requirement, so it runs unchanged on Linux/macOS/Windows; tar extraction uses the `data` filter (no path traversal); same `~/.netrc` credential hygiene. The arrange step is split out (`extract_kegg_dump`) so it's network-free and unit-tested. | | K6 | EFFICIENCY | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Per-KO multi-FASTA via a stdlib offset index** (`_index_fasta` → seek), replacing `constructMultiFasta`'s Java-`Hashtable` byte scan with 5M-element preallocation. One streaming pass, only wanted ids retained; no MATLAB/Java heap tuning. | | K7 | EFFICIENCY | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Concatenate per-KO HMMs and `hmmpress` into one pressed library**, so the query path (3b.5) runs a single `hmmscan` against the database instead of RAVEN's thousands of per-KO `hmmsearch` invocations. | -| K8 | EFFICIENCY (scope) | raven-python 🔨 | 🔨 | **Drop the `getPhylDist` distance matrix.** Its only uses in RAVEN were per-organism HMM-sequence subsampling (`maxPhylDist`/`nSequences`) and the kingdom filter. Our fixed prok90/euk90 libraries (3b.3) remove the subsampling rationale, and domain mode (3b.4) uses the taxonomy domain classification directly — so the O(n²) matrix is never built. Simpler, faster, less code. | +| K8 | EFFICIENCY (scope) | raven-python 🔨 | 🔨 | **Drop the `getPhylDist` distance matrix.** Its only uses in RAVEN were per-organism HMM-sequence subsampling (`maxPhylDist`/`nSequences`) and the kingdom filter. Our fixed prok90/euk90 libraries (3b.3) remove the subsampling rationale, and domain mode (3b.4) uses the taxonomy domain classification directly — so the *reconstruction* path never builds the O(n²) matrix. The matrix itself remains available on demand via `taxonomy.phyl_dist`, which regenerates RAVEN's `keggPhylDist` from the published `taxonomy` artefact for GECKO's organism-distance kcat selection (no `.mat` needed). | | K9 | EFFICIENCY (memory) | raven-python 🔨 | 🔨 | **Stream `organism_gene_ko` to disk** in `parse_kegg_dump` instead of building it in memory. Real KEGG has **9.05M** gene↔KO associations; the in-memory DataFrame build OOMs in a few GB. Streaming (now via the external merge sort of K14) runs the full parse with flat, bounded peak memory. (Found by validating against a real KEGG FTP dump.) | | K10 | EFFICIENCY (size) | raven-python 🔨 | 🔨 | **Reference model as gzipped RAVEN/cobra YAML** (`reference_model.yml.gz`) rather than SBML: RAVEN-native, MATLAB-readable, and ~1.1 MB vs ~30 MB SBML for the real 12k-reaction model. Made `io/yaml.py` gzip-aware on a `.gz` suffix (general-purpose). | | K11 | ERGONOMICS | raven-python 🔨 | 🔨 | **`ensure_data`** (`data.py`): version-pinned registry that fetches/verifies/caches the published KEGG artefacts under `~/.cache/raven-python/data/`, mirroring `ensure_binary`. End users get a draft model with no KEGG access and no manual data handling — the `…_from_artefacts` entry points auto-fetch when no local dir is supplied. | diff --git a/data/manifest.example.json b/data/manifest.example.json index 036ff5e..8053972 100644 --- a/data/manifest.example.json +++ b/data/manifest.example.json @@ -13,6 +13,7 @@ "kegg116_ko_names.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, "kegg116_organism_gene_ko.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, "kegg116_rxn_flags.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "kegg116_taxonomy.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, "kegg116_prokaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, "kegg116_eukaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 } } diff --git a/data/manifest.json b/data/manifest.json index 84d291f..bf67bde 100644 --- a/data/manifest.json +++ b/data/manifest.json @@ -4,7 +4,7 @@ "data": { "kegg": { "version": "kegg116", - "description": "KEGG reference model, KO/reaction tables, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.", + "description": "KEGG reference model, KO/reaction tables, taxonomy, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.", "license": "Derived from the KEGG database; redistributed with permission from KEGG.", "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0", "files": { @@ -42,6 +42,11 @@ "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b", "bytes": 33289 + }, + "kegg116_taxonomy.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz", + "sha256": "1edc56da94d71433e5f08c133600292c311baaf33279a959518ab08389b0e538", + "bytes": 234693 } } } diff --git a/docs/maintenance/maintaining_kegg_data.md b/docs/maintenance/maintaining_kegg_data.md index da4bd96..28cee43 100644 --- a/docs/maintenance/maintaining_kegg_data.md +++ b/docs/maintenance/maintaining_kegg_data.md @@ -125,7 +125,10 @@ HMMER versions, and lets the same artefact serve MATLAB RAVEN. [`scripts/build_kegg_artefacts.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md) runs 3b.2 (+ 3b.3 with `--hmms`) and lays the output out as publishable, version-prefixed assets -(`_.hmm.gz` named for `ensure_kegg_hmm_library`): +(`_.hmm.gz` named for `ensure_kegg_hmm_library`). It also publishes +`_taxonomy.gz` — the domain split plus the source for +[`phyl_dist`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/taxonomy.py), +which regenerates RAVEN's `keggPhylDist` (used by GECKO) with no `.mat` file: ```bash python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts \ diff --git a/docs/reference/migration.md b/docs/reference/migration.md index 185b4b8..b6d51cf 100644 --- a/docs/reference/migration.md +++ b/docs/reference/migration.md @@ -65,7 +65,7 @@ standard plus the geckopy enzyme-constrained extension, so ecModels round-trip. |---|---|---| | `getModelFromHomology` + `getBlast`/`getDiamond` | ✅ [`reconstruction.homology.get_model_from_homology`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/homology/homology.py), `run_blast`, `run_diamond`, `blast_from_table` | Core homology reconstruction with structured improvements (bidirectional / best-hits-only, AST GPR rewrite, complex policy, bitscore best-hits, DataFrame ortholog map). | | KEGG download → species model (5 steps) | ✅ [`reconstruction.kegg.*`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/) | All five steps: `download.fetch_keggdb`, `parse.read_kegg_table` + reference model, `hmm.build_libraries`, `organism.build_kegg_model_for_organism` (no-FASTA), `query.assign_kos` + `run_hmmscan`. | -| `getPhylDist` | ⛔ not ported | Fixed prok90/euk90 libraries make the distance matrix moot. | +| `getPhylDist` | ✅ [`reconstruction.kegg.phyl_dist`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/taxonomy.py) (+ `parse_taxonomy`) | Lineage parsing **and** the distance matrix (RAVEN's `keggPhylDist`), regenerated from the published `taxonomy` artefact so GECKO's organism-distance kcat selection needs no `.mat`. The per-organism HMM-subsampling use is moot here (fixed prok90/euk90 libraries). | | `getMetaCycModelForOrganism` | ⛔ not ported (and **flagged for removal from MATLAB RAVEN**) | BLAST-to-single-representatives is low-precision at every cutoff. See [IMPROVEMENTS.md](improvements.md) under `R-MetaCyc`. | ## Tasks, gap-filling, INIT, ftINIT @@ -106,7 +106,7 @@ standard plus the geckopy enzyme-constrained extension, so ecModels round-trip. * **`checkModelStruct` struct/type checks** — moot in cobra. * **`runDynamicFBA`** — see Omics/analysis row. * **`getMetaCycModelForOrganism`** — see Reconstruction row; flagged for upstream removal. -* **`getPhylDist`** — fixed prok90/euk90 libraries make it moot. +* **`getPhylDist` per-organism HMM subsampling** — fixed prok90/euk90 libraries make it moot (the distance matrix itself **is** ported, as `reconstruction.kegg.phyl_dist`, for GECKO). * **`mapCompartments`** — overlaps with `compare_models`. * **`editMiriam`, `extractMiriam`, `getRxnsInComp`, `getMetsInComp`, `constructEquations`, `getIndexes`** (most), **`setExchangeBounds`**, **most `setParam` modes**, **`getBlastFromExcel` Excel branch** — cobra one-liners; recorded above. diff --git a/scripts/build_kegg_artefacts.py b/scripts/build_kegg_artefacts.py index 2539dd3..52e002f 100644 --- a/scripts/build_kegg_artefacts.py +++ b/scripts/build_kegg_artefacts.py @@ -82,6 +82,15 @@ def main(argv: list[str] | None = None) -> None: for name, path in paths.items(): print(f" {name}: {path}") + # Publish the taxonomy file too: domain split, plus the source for phyl_dist + # (RAVEN's keggPhylDist, used by GECKO). It is a raw dump file, so gzip it as-is. + tax_src = args.keggdb / "taxonomy" + if tax_src.is_file(): + tax_out = args.out / f"{prefix}taxonomy.gz" + with open(tax_src, "rb") as src, gzip.open(tax_out, "wb") as out: + shutil.copyfileobj(src, out) + print(f" taxonomy: {tax_out}") + if args.hmms: ogk = read_kegg_table(paths["organism_gene_ko"]) genes_pep = args.keggdb / "genes.pep" diff --git a/src/raven_python/data.py b/src/raven_python/data.py index fdba221..662437f 100644 --- a/src/raven_python/data.py +++ b/src/raven_python/data.py @@ -63,6 +63,10 @@ "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b", }, + "kegg116_taxonomy.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz", + "sha256": "1edc56da94d71433e5f08c133600292c311baaf33279a959518ab08389b0e538", + }, }, }, } @@ -216,3 +220,17 @@ def ensure_kegg_hmm_library( if proc.returncode != 0: raise RuntimeError(f"hmmpress failed:\n{(proc.stderr or '').strip()}") return library + + +def ensure_kegg_taxonomy(*, version: str | None = None, registry: dict | None = None) -> Path: + """Ensure the KEGG ``taxonomy`` artefact is cached; return its (gzipped) path. + + The gzipped KEGG ``taxonomy`` file is the source for domain classification and for + regenerating the phylogenetic distance matrix — RAVEN's ``keggPhylDist``, which GECKO + uses to pick the closest organism for kcat assignment — via + :func:`raven_python.reconstruction.kegg.phyl_dist` (which reads ``.gz`` directly). So + that capability needs only this published artefact, no MATLAB ``.mat`` file. + """ + registry = _DATA_REGISTRY if registry is None else registry + ver = version or _bundle("kegg", registry)["version"] + return ensure_data_file("kegg", f"{ver}_taxonomy.gz", version=ver, registry=registry) diff --git a/src/raven_python/reconstruction/kegg/__init__.py b/src/raven_python/reconstruction/kegg/__init__.py index 5d27602..140c0d6 100644 --- a/src/raven_python/reconstruction/kegg/__init__.py +++ b/src/raven_python/reconstruction/kegg/__init__.py @@ -40,15 +40,19 @@ run_hmmscan, ) from raven_python.reconstruction.kegg.taxonomy import ( + PhylDist, organism_domains, organisms_in_domain, parse_taxonomy, + parse_taxonomy_records, + phyl_dist, ) __all__ = [ "KeggCompound", "KeggKO", "KeggReaction", + "PhylDist", "assign_kos", "build_hmm_library", "build_kegg_tables", @@ -70,6 +74,8 @@ "parse_kegg_kos", "parse_kegg_reactions", "parse_taxonomy", + "parse_taxonomy_records", + "phyl_dist", "read_kegg_table", "run_hmmscan", "stream_organism_gene_ko", diff --git a/src/raven_python/reconstruction/kegg/taxonomy.py b/src/raven_python/reconstruction/kegg/taxonomy.py index 463fcce..4b8dbb2 100644 --- a/src/raven_python/reconstruction/kegg/taxonomy.py +++ b/src/raven_python/reconstruction/kegg/taxonomy.py @@ -1,27 +1,47 @@ -"""Parse the KEGG ``taxonomy`` file into per-organism category lineages. +"""Parse the KEGG ``taxonomy`` file into lineages, domains, and phylogenetic distances. -Ports the file-reading half of RAVEN ``getPhylDist`` (the distance-matrix half is -step 3b.5). The ``taxonomy`` file is an indented tree: ``#``-prefixed lines name a -category, the number of leading ``#`` giving its depth; organism lines are -tab-separated ``T-numberorg_idname...``. Each organism inherits the -stack of categories above it, the first of which is its domain (``Prokaryotes`` / -``Eukaryotes``). +Ports RAVEN ``getPhylDist``: the file-reading half (per-organism category lineages and +the domain split) **and** the distance-matrix half (:func:`phyl_dist`), which earlier +ports deferred. The ``taxonomy`` file is an indented tree: ``#``-prefixed lines name a +category, the number of leading ``#`` giving its depth; organism lines are tab-separated +``T-numberorg_idT-numbername``. Each organism inherits the stack of +categories above it, the first of which is its domain (``Prokaryotes`` / ``Eukaryotes``). -Used by 3b.3 to split genes into the prok/euk HMM libraries, and (later) by 3b.5 -for phylogenetic distances. +Used by 3b.3 to split genes into the prok/euk HMM libraries, by 3b.4 domain mode, and by +:func:`phyl_dist` to regenerate the KEGG phylogenetic distance matrix (RAVEN's +``keggPhylDist``) that GECKO uses to pick the closest organism for kcat assignment — so +that capability needs no MATLAB ``.mat`` file, only the published ``taxonomy`` artefact. """ from __future__ import annotations +import gzip import warnings +from dataclasses import dataclass, field from pathlib import Path +import numpy as np -def parse_taxonomy(path: str | Path) -> dict[str, list[str]]: - """Return ``{organism_id: [category, ...]}`` from outermost to innermost.""" - org_categories: dict[str, list[str]] = {} + +def _open_text(path: str | Path): + """Open ``path`` as UTF-8 text, transparently decompressing a ``.gz`` artefact.""" + path = Path(path) + if path.suffix == ".gz": + return gzip.open(path, "rt", encoding="utf-8") + return open(path, encoding="utf-8") + + +def parse_taxonomy_records(path: str | Path) -> list[tuple[str, str, list[str]]]: + """Return ``[(organism_id, name, [category, ...]), ...]`` in file order. + + ``name`` is the organism's scientific name with any trailing KEGG parenthetical + kept (e.g. ``"Homo sapiens (human)"``), matching RAVEN ``getPhylDist``; the category + list is the lineage from outermost (domain) to innermost. Order is preserved so it + aligns with the rows/columns of :func:`phyl_dist`. + """ + records: list[tuple[str, str, list[str]]] = [] stack: list[str] = [] skipped_level_warned = False - with open(path, encoding="utf-8") as handle: + with _open_text(path) as handle: for line_no, raw in enumerate(handle, start=1): line = raw.rstrip("\n") if not line.strip(): @@ -30,9 +50,8 @@ def parse_taxonomy(path: str | Path) -> dict[str, list[str]]: depth = len(line) - len(line.lstrip("#")) name = line[depth:].strip() if depth - 1 > len(stack): - # Depth-skip (e.g. ## then ####): the original `stack[:depth-1]` - # silently produced a too-short lineage. Pad with explicit - # blanks so downstream slices stay aligned; warn once. + # Depth-skip (e.g. ## then ####): pad the missing levels with blanks + # so downstream slices stay aligned; warn once. if not skipped_level_warned: warnings.warn( f"{path}: taxonomy depth skips a level near line {line_no} " @@ -46,11 +65,25 @@ def parse_taxonomy(path: str | Path) -> dict[str, list[str]]: stack = stack[: depth - 1] stack.append(name) else: - fields = line.split("\t") if "\t" in line else line.split() - if len(fields) < 2: + # Organism line. RAVEN reads the id between the 1st/2nd whitespace and the + # name after the 3rd; with the tab-delimited file that is fields[1]/fields[3]. + if "\t" in line: + fields = line.split("\t") + org_id = fields[1].strip() if len(fields) > 1 else "" + org_name = fields[3].strip() if len(fields) > 3 else "" + else: + fields = line.split() + org_id = fields[1] if len(fields) > 1 else "" + org_name = " ".join(fields[3:]) if len(fields) > 3 else "" + if not org_id: continue - org_categories[fields[1].strip()] = list(stack) - return org_categories + records.append((org_id, org_name, list(stack))) + return records + + +def parse_taxonomy(path: str | Path) -> dict[str, list[str]]: + """Return ``{organism_id: [category, ...]}`` from outermost to innermost.""" + return {org_id: lineage for org_id, _name, lineage in parse_taxonomy_records(path)} def organism_domains(path: str | Path) -> dict[str, str]: @@ -69,3 +102,75 @@ def organisms_in_domain(path: str | Path, domain: str) -> set[str]: for org, dom in organism_domains(path).items() if dom.lower().startswith(needle) or needle.startswith(dom.lower()) } + + +@dataclass +class PhylDist: + """KEGG phylogenetic distance structure — RAVEN ``getPhylDist`` / ``keggPhylDist``. + + ``ids``/``names`` are the KEGG organism ids and scientific names (names keep RAVEN's + trailing parenthetical; consumers that need them cleaned, e.g. geckopy, strip it). + ``dist_matrix[i, j]`` is RAVEN's distance from ``ids[i]`` to ``ids[j]``. It is a + faithful reproduction of RAVEN's metric, which is **asymmetric and may be negative**; + GECKO consumes it only by taking the closest (minimum-distance) organism, so the exact + values matter for parity with MATLAB but the sign/asymmetry are harmless there. + """ + + ids: list[str] = field(default_factory=list) + names: list[str] = field(default_factory=list) + dist_matrix: np.ndarray = field(default_factory=lambda: np.empty((0, 0), dtype=np.int64)) + + +def phyl_dist(path: str | Path, *, only_in_kingdom: bool = False) -> PhylDist: + """Compute the KEGG phylogenetic distance matrix from the ``taxonomy`` file. + + Faithful port of RAVEN ``getPhylDist``. For organisms ``i`` and ``j`` with category + lineages of lengths ``Li, Lj``, the distance is ``(Li - Lj) + min(Li, Lj) - k`` where + ``k`` is the deepest level (within the shorter lineage, comparing position by position + from the root) at which the two categories coincide — ``1`` if none coincide. This + reproduces RAVEN's exact values (including their asymmetry and occasional negatives) + so the output matches MATLAB ``keggPhylDist`` that GECKO was built against. + + ``only_in_kingdom`` sets the distance to ``+inf`` between organisms in different + domains (RAVEN's ``onlyInKingdom`` variant); the matrix is then returned as ``float``. + + Cost is ``O(N²)`` in time and memory (``N`` = number of KEGG organisms, ~10⁴), so this + is a maintainer/GECKO-side generation step; persist the result rather than rebuilding. + """ + records = parse_taxonomy_records(path) + ids = [r[0] for r in records] + names = [r[1] for r in records] + lineages = [r[2] for r in records] + n = len(records) + if n == 0: + return PhylDist(ids, names, np.empty((0, 0), dtype=np.int64)) + + lengths = np.array([len(lin) for lin in lineages], dtype=np.int32) + max_depth = int(lengths.max()) + + # deepest[i, j] = deepest level d (1-based) at which lineage_i[d-1] == lineage_j[d-1], + # counting only levels present in both (d <= min(Li, Lj)); 0 if none coincide. + deepest = np.zeros((n, n), dtype=np.int32) + for d in range(1, max_depth + 1): + lookup: dict[str, int] = {} + lab = np.full(n, -1, dtype=np.int64) + for i, lin in enumerate(lineages): + if len(lin) >= d: + lab[i] = lookup.setdefault(lin[d - 1], len(lookup)) + eq = (lab[:, None] == lab[None, :]) & (lab[:, None] != -1) + # Ascending d, so the last (deepest) coinciding level wins -> max coinciding depth. + deepest[eq] = d + k = np.where(deepest == 0, 1, deepest).astype(np.int32) + + # Values are tiny (|dist| <= max lineage depth); int32 throughout keeps the N x N + # matrix (and its intermediates) half the size of the int64 default. + li = lengths[:, None] + lj = lengths[None, :] + dist = (li - lj) + np.minimum(li, lj) - k + + if only_in_kingdom: + domains = np.array([lin[0] if lin else "" for lin in lineages]) + dist = dist.astype(float) + dist[domains[:, None] != domains[None, :]] = np.inf + + return PhylDist(ids=ids, names=names, dist_matrix=dist) diff --git a/tests/test_data.py b/tests/test_data.py index d8db588..b72ccb1 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -12,6 +12,7 @@ ensure_data_file, ensure_kegg_data, ensure_kegg_hmm_library, + ensure_kegg_taxonomy, ) @@ -36,6 +37,7 @@ def served(tmp_path, monkeypatch): "ko_names.tsv.gz": b"ko\tname\n", "organism_gene_ko.tsv.gz": b"organism\tgene\tko\n", "rxn_flags.tsv.gz": b"reaction\tspontaneous\n", + "taxonomy.gz": b"# Prokaryotes\n", } files = {} payloads = {} @@ -132,6 +134,13 @@ def fake_run(cmd, capture_output, text): assert len(presses) == 1 +def test_ensure_kegg_taxonomy(served): + registry, cache, _ = served + path = ensure_kegg_taxonomy(registry=registry) + assert path == cache / "raven_python" / "data" / "kegg-v1" / "v1_taxonomy.gz" + assert path.is_file() + + def test_unregistered_dataset_raises(): # An unpublished dataset still raises an actionable error against the shipped registry. with pytest.raises(FileNotFoundError, match="No data artefacts registered"): @@ -146,3 +155,4 @@ def test_shipped_registry_matches_resolver_names(): names = set(kegg["files"]) assert {f"{ver}_{base}" for base in CORE_KEGG_FILES} <= names assert {f"{ver}_{d}.hmm.gz" for d in ("prokaryotes", "eukaryotes")} <= names + assert f"{ver}_taxonomy.gz" in names diff --git a/tests/test_reconstruction_kegg_phyl_dist.py b/tests/test_reconstruction_kegg_phyl_dist.py new file mode 100644 index 0000000..43f3e83 --- /dev/null +++ b/tests/test_reconstruction_kegg_phyl_dist.py @@ -0,0 +1,70 @@ +"""Tests for the KEGG phylogenetic distance generator (RAVEN getPhylDist port).""" +import gzip + +import numpy as np + +from raven_python.reconstruction.kegg.taxonomy import parse_taxonomy_records, phyl_dist + +# Tiny taxonomy: two prokaryotes sharing a lineage, a 3-deep mammal, a 2-deep fungus. +TAXONOMY = ( + "# Prokaryotes\n" + "## Bacteria\n" + "T1\tbsu\tT1\tBacillus subtilis\n" + "T2\teco\tT2\tEscherichia coli\n" + "# Eukaryotes\n" + "## Animals\n" + "### Mammals\n" + "T3\thsa\tT3\tHomo sapiens (human)\n" + "## Fungi\n" + "T4\tsce\tT4\tSaccharomyces cerevisiae\n" +) + + +def _write(tmp_path): + p = tmp_path / "taxonomy" + p.write_text(TAXONOMY, encoding="utf-8") + return p + + +def test_parse_records_ids_names_lineages(tmp_path): + recs = parse_taxonomy_records(_write(tmp_path)) + assert [r[0] for r in recs] == ["bsu", "eco", "hsa", "sce"] + # Names keep RAVEN's trailing parenthetical. + assert [r[1] for r in recs] == [ + "Bacillus subtilis", + "Escherichia coli", + "Homo sapiens (human)", + "Saccharomyces cerevisiae", + ] + lineage = {r[0]: r[2] for r in recs} + assert lineage["hsa"] == ["Eukaryotes", "Animals", "Mammals"] + assert lineage["sce"] == ["Eukaryotes", "Fungi"] + + +def test_phyl_dist_matches_raven_formula(tmp_path): + pd = phyl_dist(_write(tmp_path)) + assert pd.ids == ["bsu", "eco", "hsa", "sce"] + # Hand-computed from RAVEN's distMat = (Li-Lj) + min(Li,Lj) - k. + expected = np.array( + [ + [0, 0, 0, 1], + [0, 0, 0, 1], + [2, 2, 0, 2], + [1, 1, 0, 0], + ] + ) + np.testing.assert_array_equal(pd.dist_matrix, expected) + assert np.all(np.diag(pd.dist_matrix) == 0) # self-distance is 0 + assert pd.dist_matrix[2, 0] != pd.dist_matrix[0, 2] # RAVEN metric is asymmetric + + +def test_phyl_dist_only_in_kingdom_blocks_cross_domain(tmp_path): + pd = phyl_dist(_write(tmp_path), only_in_kingdom=True) + assert np.isinf(pd.dist_matrix[0, 2]) and np.isinf(pd.dist_matrix[0, 3]) # prok vs euk + assert np.isfinite(pd.dist_matrix[2, 3]) # both eukaryotes + + +def test_phyl_dist_reads_gzip(tmp_path): + p = tmp_path / "taxonomy.gz" + p.write_bytes(gzip.compress(TAXONOMY.encode("utf-8"))) + assert phyl_dist(p).ids == ["bsu", "eco", "hsa", "sce"]