From 8698f592f6af55f1c1b82fcafdda50eb2c7f3757 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Wed, 10 Jun 2026 22:27:44 +0200
Subject: [PATCH 1/2] Publish kegg116 KEGG artefacts as gzip, version-prefixed
 assets (v0.1.0)

First downloadable KEGG artefact set, wired into the runtime resolvers:

- All artefacts are gzip and version-prefixed (kegg116_<name>.gz) so MATLAB and
  Windows read them with the built-in gunzip, no external tool. organism_gene_ko
  moves from xz to gzip for the same reason.
- HMM libraries ship as one gzip concatenated flatfile per domain;
  ensure_kegg_hmm_library decompresses and hmmpresses on first use, ~10x smaller
  than the pressed index and portable across HMMER versions.
- Add a version-prefix-tolerant artefact resolver (_resolve_artefact) used by the
  organism/sequence entry points; parse_kegg_dump and build_kegg_artefacts.py gain
  an opt-in --version.
- Populate data/manifest.json and _DATA_REGISTRY with the kegg116 release assets
  (real SHA256 + bytes); refresh the maintainer docs and manifest example.
- Bump version to 0.1.0 and update CHANGELOG.
---
 CHANGELOG.md                                  |  19 +++-
 IMPROVEMENTS.md                               |   2 +-
 data/manifest.example.json                    |  18 +--
 data/manifest.json                            |  48 +++++++-
 docs/maintenance/kegg_data_format.md          |  49 ++++----
 docs/maintenance/maintaining_kegg_data.md     |  49 +++++---
 docs/reference/matlab_raven_backports.md      |  10 +-
 docs/studies/kegg_hmm_cutoff_calibration.md   |   2 +-
 pyproject.toml                                |   2 +-
 scripts/analyze_hmm_cutoffs.py                |   2 +-
 scripts/build_kegg_artefacts.py               |  43 ++++---
 src/raven_python/data.py                      | 103 +++++++++++++----
 .../reconstruction/kegg/organism.py           |  10 +-
 src/raven_python/reconstruction/kegg/parse.py |  76 ++++++++++---
 src/raven_python/reconstruction/kegg/query.py |   8 +-
 tests/test_data.py                            | 105 ++++++++++++++----
 tests/test_reconstruction_kegg_parse.py       |  16 ++-
 17 files changed, 402 insertions(+), 160 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2bd72aa..96a47e5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,11 +4,20 @@ Milestones in the raven-python port. For function-level status see
 [docs/raven_migration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/migration.md); for open work see
 [docs/todo.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/todo.md).
 
-## Unreleased
-
-Post-release review pass — cobra-aligned hardening (no behaviour change on
-well-formed inputs). Highlights:
-
+## 0.1.0 — 2026-06-10
+
+First release with **published, downloadable KEGG artefacts**, plus a cobra-aligned
+hardening pass (no behaviour change on well-formed inputs). Highlights:
+
+* **KEGG artefacts published (`kegg116`):** `ensure_kegg_data` /
+  `ensure_kegg_hmm_library` fetch version-pinned, SHA256-verified files from the
+  GitHub release. Every artefact is **gzip + version-prefixed**
+  (`kegg116_<name>.gz`) so MATLAB and Windows read them with the built-in `gunzip`
+  (no external tool) — `organism_gene_ko` moved from xz to gzip for this. The **HMM
+  libraries ship as one gzip concatenated flatfile per domain**
+  (`kegg116_<domain>.hmm.gz`); the client decompresses and `hmmpress`-es once on
+  first use, cutting the download ~10× versus the pressed index and letting the
+  same artefact serve MATLAB RAVEN.
 * **Packaging:** `raven_python.__version__` now derives from the installed package
   metadata (`importlib.metadata`) instead of a hard-coded literal that had drifted
   to `0.0.1`; the docs site reported the wrong version. Pinned `ruff==0.15.15` in
diff --git a/IMPROVEMENTS.md b/IMPROVEMENTS.md
index 9fa9096..01dd3e5 100644
--- a/IMPROVEMENTS.md
+++ b/IMPROVEMENTS.md
@@ -98,7 +98,7 @@ and `taxonomy.py` (3b.3). Maintainer-side, build-time tooling (PLAN.md §2.3b).
 | K11 | ERGONOMICS | raven-python 🔨 | 🔨 | **`ensure_data`** (`data.py`): version-pinned registry that fetches/verifies/caches the published KEGG artefacts under `~/.cache/raven-python/data/`, mirroring `ensure_binary`. End users get a draft model with no KEGG access and no manual data handling — the `…_from_artefacts` entry points auto-fetch when no local dir is supplied. |
 | K12 | EFFICIENCY | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Fast MAFFT (FFT-NS-2) for HMM training** instead of RAVEN's `--auto`, which selects slow iterative refinement (`dvtditr`) on medium/large KOs — observed ~2.5 min/KO (days for a domain) on real KEGG 118. FFT-NS-2 (`--retree 2 --maxiterate 0`) is seconds/KO and ample for profile-HMM building. **PartTree cutover is residue-based and memory-auto-tuned**: MAFFT memory tracks residues (count × length), not sequence count, so a count threshold let long-protein KOs (K00901: 2,788 seqs, 2.55 M residues) OOM under FFT-NS-2 — measured ~5 GB MAFFT RSS with FFT-NS-2 vs **0.69 GB with PartTree** for the same alignment. The cutover is **length-aware and memory-auto-tuned**: FFT-NS-2 memory is driven by the progressive-alignment **DP cost ≈ n_seqs × mean_len²** (= residues²/n_seqs), *not* residue count — a few hundred long proteins cost far more than the same residues in many short ones. (First tried a residue-only model `RSS≈1.32R²+1.84R`; it then OOM'd on K12047 — 452 seqs but mean length 2082, 0.94 M residues — because long proteins blow the per-residue cost.) Calibrated `RSS_GB ≈ 4.2e-9 × (n_seqs × mean_len²)` across real KEGG KOs (250k/266→0.67 GB … 1.5M/1624→5.73 GB; K12047 cost 1.96e9 = the largest, hence its OOM). `_auto_cost_budget` switches to PartTree when the DP cost exceeds `0.65 × (total − 2.5 GB overhead) / 4.2e-9` (≈7.9e8 on a 7.6 GB box), **warns on low-memory hosts**, and `parttree_residues` overrides with a manual residue cutoff. Back-portable to RAVEN. |
 | K13 | EFFICIENCY | raven-python 🗑️ | 🗑️ | ~~Per-KO sequence cap (`max_sequences`)~~ — **removed.** Briefly added as a count-based cap, but the residue-based PartTree cutover (K12) bounds MAFFT memory without dropping any sequences, so the cap was redundant complexity. All deduplicated sequences are kept. |
-| K14 | EFFICIENCY (size) | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Sort `organism_gene_ko` by `(organism, gene)` and store it xz-compressed** (`organism_gene_ko.tsv.xz`), cutting the dominant artefact **≈78 → 27 MB (2.9×)**. Gene IDs within an organism share long prefixes (locus tags, numeric runs), so sorting makes them adjacent and far more compressible (sort alone: 78→48 MB; xz vs gzip captures the cross-row redundancy gzip's 32 KB window misses: →27 MB). The sort is an **external merge sort** bounded to `chunk_rows` rows in memory (sorted runs spooled to gzipped temp files, merged with `heapq.merge`), so it keeps K9's flat memory profile. Both `lzma` and `gzip` are Python stdlib (native on Windows/macOS/Linux, no extra binary); small tables stay gzipped TSV (MATLAB-native), only the big one is xz (MATLAB needs an external `unxz`). Sorted order also matches the by-organism query in `get_kegg_model_for_organism`, enabling a future `searchsorted` slice instead of loading all 9M rows. Back-portable to RAVEN. |
+| K14 | EFFICIENCY (size) | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Sort `organism_gene_ko` by `(organism, gene)` and store it gzipped** (`organism_gene_ko.tsv.gz`), cutting the dominant artefact **≈78 → 48 MB** by sorting alone. Gene IDs within an organism share long prefixes (locus tags, numeric runs), so sorting makes them adjacent and far more compressible. The sort is an **external merge sort** bounded to `chunk_rows` rows in memory (sorted runs spooled to gzipped temp files, merged with `heapq.merge`), so it keeps K9's flat memory profile. We first xz-compressed this file (≈27 MB, 2.9×) but switched to **gzip** (≈74 MB) so MATLAB reads it with built-in `gunzip` and no external `unxz`: the artefacts are shared with MATLAB RAVEN, and the once-per-release size cut wasn't worth a MATLAB toolchain dependency. All tables are now gzipped TSV, native on Windows/macOS/Linux. Sorted order also matches the by-organism query in `get_kegg_model_for_organism`, enabling a future `searchsorted` slice instead of loading all 9M rows. Back-portable to RAVEN. |
 | K15 | ERGONOMICS (correctness) | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Recalibrate the HMM-query KO-assignment defaults** (`assign_kos`): cut-off `1e-50 → 1e-30`, `min_score_ratio_g 0.8 → 0.9`; `min_score_ratio_ko` left at 0.3 but **documented as empirically inert**. Cross-validated the full 3b.5 pipeline against the true KEGG gene→KO annotation of four organisms across both libraries and the well-/lesser-studied axis — *S. cerevisiae*, *Cyanidioschyzon merolae* (red alga), *E. coli* K-12, *Mycoplasma genitalium* (minimal genome). Real annotations score overwhelmingly (median E ≈ 1e-100…1e-155; even the weakest 1% ≈ 1e-15…1e-36) while spurious hits cluster at ≈1e-8 — a ~20-order-of-magnitude gap. RAVEN's `1e-50` therefore sits **inside the true-positive tail** and silently drops real-but-divergent hits for no noise-rejection gain: gene→KO recall on *M. genitalium* was only 0.84 (reaction recall 0.87). At `1e-30` + `ratio_g=0.9`: *M. genitalium* recall **0.84→0.94** (rxn 0.87→0.97), *E. coli* 0.95→0.97 with **fewer** unannotated reactions (198→173, the tighter gene-ratio prunes spurious multi-KO genes), *S. cerevisiae*/*C. merolae* held or improved. The three sweep tables showed `min_score_ratio_ko` produced identical output at 0.0/0.3/0.5 across all four organisms — a magic-number knob that does nothing; `min_score_ratio_g` is the real precision lever. Full numbers in [docs/kegg_hmm_cutoff_calibration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/studies/kegg_hmm_cutoff_calibration.md) (reproduce with `scripts/analyze_hmm_cutoffs.py`). Back-portable to RAVEN. |
 
 ## FSEOF (Phase 5 — implemented, redesigned)
diff --git a/data/manifest.example.json b/data/manifest.example.json
index 2bc5e81..036ff5e 100644
--- a/data/manifest.example.json
+++ b/data/manifest.example.json
@@ -1,20 +1,20 @@
 {
   "manifest_version": 1,
-  "generated": "2026-05-30",
+  "generated": "2026-06-10",
   "data": {
     "kegg": {
       "version": "kegg116",
       "description": "KEGG reference model, KO/reaction tables, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.",
       "license": "Derived from the KEGG database; redistributed with permission from KEGG.",
-      "doi": "10.5281/zenodo.0000000",
-      "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/kegg-kegg116",
+      "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0",
       "files": {
-        "reference_model.yml.gz":   { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/reference_model.yml.gz",   "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
-        "ko_reaction.tsv.gz":       { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/ko_reaction.tsv.gz",       "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
-        "ko_names.tsv.gz":          { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/ko_names.tsv.gz",          "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
-        "organism_gene_ko.tsv.xz":  { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/organism_gene_ko.tsv.xz",  "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
-        "rxn_flags.tsv.gz":         { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/kegg-kegg116/rxn_flags.tsv.gz",         "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
-        "prokaryotes.hmm":          { "url": "https://zenodo.org/records/0000000/files/prokaryotes.hmm",                                            "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }
+        "kegg116_reference_model.yml.gz":  { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz",  "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
+        "kegg116_ko_reaction.tsv.gz":      { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz",      "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
+        "kegg116_ko_names.tsv.gz":         { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz",         "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
+        "kegg116_organism_gene_ko.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
+        "kegg116_rxn_flags.tsv.gz":        { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz",        "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
+        "kegg116_prokaryotes.hmm.gz":      { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz",      "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
+        "kegg116_eukaryotes.hmm.gz":       { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz",       "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }
       }
     }
   },
diff --git a/data/manifest.json b/data/manifest.json
index baf5ee9..84d291f 100644
--- a/data/manifest.json
+++ b/data/manifest.json
@@ -1,6 +1,50 @@
 {
   "manifest_version": 1,
-  "generated": "2026-05-30",
-  "data": {},
+  "generated": "2026-06-10",
+  "data": {
+    "kegg": {
+      "version": "kegg116",
+      "description": "KEGG reference model, KO/reaction tables, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.",
+      "license": "Derived from the KEGG database; redistributed with permission from KEGG.",
+      "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0",
+      "files": {
+        "kegg116_eukaryotes.hmm.gz": {
+          "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz",
+          "sha256": "2d48bc9935575d0f9ba4178bf2df19279bff866b49c1bf83a8e15787b11d6708",
+          "bytes": 134002309
+        },
+        "kegg116_ko_names.tsv.gz": {
+          "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz",
+          "sha256": "84f9c7150172d948f794d91a6608d55f7140f31e53249c705057ae49b11c93b3",
+          "bytes": 14585
+        },
+        "kegg116_ko_reaction.tsv.gz": {
+          "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz",
+          "sha256": "e1a4ac22875bd3030d03b78368b0153b6d99000acb2ee0f474340a03c180323c",
+          "bytes": 49196
+        },
+        "kegg116_organism_gene_ko.tsv.gz": {
+          "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz",
+          "sha256": "27bf7dd58eb1acd5904990dc2be187aae4d8d9b9f7421375618e7c8d6ff7253d",
+          "bytes": 47935249
+        },
+        "kegg116_prokaryotes.hmm.gz": {
+          "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz",
+          "sha256": "d80cb2a22dec9fd8336b3998e3b96ee121672f63f4041cddaf09624fe739f1af",
+          "bytes": 153173750
+        },
+        "kegg116_reference_model.yml.gz": {
+          "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz",
+          "sha256": "73ff313fe2aa2830ec511f4e522226c98c5714c2d5c4632844544e5a409c7f0c",
+          "bytes": 1090563
+        },
+        "kegg116_rxn_flags.tsv.gz": {
+          "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz",
+          "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b",
+          "bytes": 33289
+        }
+      }
+    }
+  },
   "binaries": {}
 }
diff --git a/docs/maintenance/kegg_data_format.md b/docs/maintenance/kegg_data_format.md
index efb6d13..23b327b 100644
--- a/docs/maintenance/kegg_data_format.md
+++ b/docs/maintenance/kegg_data_format.md
@@ -16,33 +16,32 @@ under `~/.cache/raven-python/data/kegg-<version>/` by `ensure_data` (see
 
 ## Decision (current)
 
-- **Small tables** (`ko_reaction`, `ko_names`, `rxn_flags`): **gzipped TSV
-  (`.tsv.gz`)**. Each is well under 1 MB, so compression choice is irrelevant;
-  gzip keeps them MATLAB-native and dependency-free.
-- **The large `organism_gene_ko` table**: **xz-compressed TSV
-  (`organism_gene_ko.tsv.xz`), with rows sorted by `(organism, gene)`**.
+- **All tables** (`ko_reaction`, `ko_names`, `rxn_flags`, and the large
+  `organism_gene_ko`): **gzipped TSV (`.tsv.gz`)**. Published assets are
+  version-prefixed, e.g. `kegg116_organism_gene_ko.tsv.gz`.
+- The large `organism_gene_ko` table keeps its rows **sorted by `(organism, gene)`**.
 
-Why the large table differs. It carries KEGG's ~9M gene↔KO associations and
-dominates the artefact set (≈78 MB as unsorted gzipped TSV). Two cheap,
-stdlib-only changes cut that to ≈27 MB (2.9×):
+Why everything is gzip — even the big table. `organism_gene_ko` carries KEGG's
+~9M gene↔KO associations and dominates the artefact set. Sorting by
+`(organism, gene)` before writing makes gene IDs from one organism adjacent
+(shared locus-tag/numeric prefixes), which both helps the compressor and matches
+the by-organism query pattern in `get_kegg_model_for_organism`; the sort is an
+external merge sort bounded to `chunk_rows` in memory (see
+`stream_organism_gene_ko`), so it stays scalable. On the real dump this lands
+around ~74 MB.
 
-1. **Sort by `(organism, gene)`** before writing. Gene IDs from one organism
-   share long common prefixes (locus tags, numeric runs); sorting makes them
-   adjacent so the compressor can fold them. This alone takes 78 → 48 MB and
-   happens to match the by-organism query pattern in
-   `get_kegg_model_for_organism`. The sort is an external merge sort bounded to
-   `chunk_rows` in memory (see `stream_organism_gene_ko`), so it stays scalable.
-2. **xz instead of gzip** (Python stdlib `lzma`). Its larger dictionary captures
-   cross-row redundancy gzip's 32 KB window misses: sorted + xz reaches ≈27 MB.
+We previously xz-compressed this one file (≈27 MB, ~2.9× smaller). We switched it
+to **gzip** so the *same* artefact is readable by MATLAB's built-in `gunzip` with
+no external tool — the artefacts are shared with MATLAB RAVEN, and `.xz` would
+force an external `xz`/`unxz` dependency. The size cost (~74 vs ~27 MB on a
+once-per-release download) buys a dependency-free, cross-tool, cross-platform
+read; xz's larger dictionary is not worth a MATLAB toolchain requirement.
 
-- **pandas reads/writes both with zero extra dependencies** — compression is
-  inferred from the `.gz`/`.xz` suffix; `lzma` and `gzip` are both stdlib, so
-  this works natively on Windows, macOS, and Linux with no external binary.
-- **MATLAB caveat:** `readtable` reads gzipped TSV after a `gunzip`, but MATLAB
-  has no built-in xz decompressor. The small tables stay MATLAB-native; the
-  large table needs an external `unxz` (or Java/`7-Zip`) before `readtable` on
-  the MATLAB side. The xz file is raven-python's (Python) primary artefact; this
-  trades a little MATLAB convenience on the one big file for a ~3× size cut.
+- **pandas reads/writes gzip with zero extra dependencies** — compression is
+  inferred from the `.gz` suffix; `gzip` is stdlib, so this works natively on
+  Windows, macOS, and Linux with no external binary.
+- **MATLAB:** `readtable` reads every table after a built-in `gunzip`, with no
+  external binary on any file.
 
 ## Options considered
 
@@ -57,7 +56,7 @@ stdlib-only changes cut that to ≈27 MB (2.9×):
 Reconsider Parquet (or SQLite) if any of these become true:
 
 - The `organism_gene_ko` table grows large enough that load *time* (not just
-  size — the sort+xz change above already addresses on-disk size) becomes a real
+  size — the sort above already keeps on-disk size in check) becomes a real
   bottleneck. The remaining inefficiency is that building one species' model
   still loads all ~9M rows; sorted order makes a `searchsorted`/row-group
   by-organism read the natural next step before reaching for Parquet.
diff --git a/docs/maintenance/maintaining_kegg_data.md b/docs/maintenance/maintaining_kegg_data.md
index 356b574..da4bd96 100644
--- a/docs/maintenance/maintaining_kegg_data.md
+++ b/docs/maintenance/maintaining_kegg_data.md
@@ -76,11 +76,13 @@ KEGG release).
 ```python
 from raven_python.reconstruction.kegg import parse_kegg_dump
 
-parse_kegg_dump("keggdb", "artefacts")
+parse_kegg_dump("keggdb", "artefacts", version="kegg116")
 ```
 
 This writes the gene-free reference model (`reference_model.yml.gz`, gzipped
-RAVEN/cobra YAML) and the relational tables as gzipped TSV. See
+RAVEN/cobra YAML) and the relational tables as gzipped TSV. With `version=` set,
+every output filename is version-prefixed (e.g. `kegg116_organism_gene_ko.tsv.gz`),
+matching the published release assets. See
 [kegg_data_format.md](kegg_data_format.md) for what those tables contain and the
 format rationale.
 
@@ -98,7 +100,7 @@ searches. This needs **HMMER** (`hmmbuild`, `hmmpress`), **MAFFT**, and
 ```python
 from raven_python.reconstruction.kegg import build_hmm_library, read_kegg_table
 
-organism_gene_ko = read_kegg_table("artefacts/organism_gene_ko.tsv.xz")
+organism_gene_ko = read_kegg_table("artefacts/kegg116_organism_gene_ko.tsv.gz")
 for domain in ("prokaryotes", "eukaryotes"):
     build_hmm_library(
         organism_gene_ko,
@@ -111,32 +113,42 @@ for domain in ("prokaryotes", "eukaryotes"):
 
 For each KO in the domain it gathers the member sequences, dereplicates with
 CD-HIT (~90 % identity), aligns with MAFFT, trains a profile with `hmmbuild`, and
-finally concatenates and `hmmpress`-es them into a single `library.hmm` for fast
-`hmmscan` querying. This is the slowest step (hours, once per KEGG release); it
-skips KOs whose `.hmm` already exists, so it is resumable. The resulting
-libraries are published as version-pinned artefacts alongside the reference model
-and tables.
+finally concatenates them into a single `library.hmm`. This is the slowest step
+(hours, once per KEGG release); it skips KOs whose `.hmm` already exists, so it is
+resumable. The concatenated library is published **gzipped** as
+`<version>_<domain>.hmm.gz` (e.g. `kegg116_prokaryotes.hmm.gz`); end users
+decompress it and run `hmmpress` once on first use (see `ensure_kegg_hmm_library`),
+which keeps the download ~10× smaller than the binary index, stays portable across
+HMMER versions, and lets the same artefact serve MATLAB RAVEN.
 
 ## Building and publishing in one go
 
 [`scripts/build_kegg_artefacts.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md) runs 3b.2 (+ 3b.3 with
-`--hmms`) and lays the output out as publishable assets (`<domain>.hmm` named for
-`ensure_kegg_hmm_library`):
+`--hmms`) and lays the output out as publishable, version-prefixed assets
+(`<version>_<domain>.hmm.gz` named for `ensure_kegg_hmm_library`):
 
 ```bash
-python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms --threads 8
+python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts \
+    --version kegg116 --hmms --threads 8
 ```
 
-Upload the contents of `artefacts/` to a release, then emit the registry entry for
-`raven_python.data._DATA_REGISTRY` with [`scripts/make_registry_snippet.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md):
+Upload the contents of `artefacts/` to the release, then record the artefacts in
+both the shared `data/manifest.json` and `raven_python.data._DATA_REGISTRY` with
+[`scripts/make_registry_snippet.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md)
+(it computes each file's SHA256 + size):
 
 ```bash
+# shared source of truth (read by raven-python and MATLAB RAVEN):
+python scripts/make_registry_snippet.py manifest --manifest data/manifest.json \
+    --target data --dataset kegg --version kegg116 --dir artefacts \
+    --base-url https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0
+# in-code registry, so end users auto-fetch with no env var (paste into _DATA_REGISTRY):
 python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \
-    --dir artefacts --base-url https://github.com/ORG/raven-python/releases/download/kegg-data-kegg116
+    --dir artefacts --base-url https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0
 ```
 
-Paste the printed block into `_DATA_REGISTRY`; from then on `ensure_data` fetches
-and verifies the artefacts for end users automatically.
+From then on `ensure_data` fetches and verifies the artefacts for end users
+automatically.
 
 ## End-user paths (3b.4 / 3b.5)
 
@@ -152,6 +164,7 @@ model from the artefacts:
   cross-platform. `organism_id="prokaryotes"`/`"eukaryotes"` builds a whole-domain
   model (pass `taxonomy=`).
 - **3b.5 — organism not in KEGG** (`get_kegg_model_from_sequences`): `hmmscan`-es a
-  proteome FASTA against the pressed `library.hmm`, so it needs **HMMER**
-  (`hmmscan`) — Linux/macOS or WSL2 (see the OS matrix). Tune assignment with
+  proteome FASTA against the domain library (decompressed and `hmmpress`-ed from
+  the published `.hmm.gz` on first use), so it needs **HMMER** (`hmmpress`,
+  `hmmscan`) — Linux/macOS or WSL2 (see the OS matrix). Tune assignment with
   `cutoff`, `min_score_ratio_ko`, `min_score_ratio_g`.
diff --git a/docs/reference/matlab_raven_backports.md b/docs/reference/matlab_raven_backports.md
index 5187f9e..8b6c44a 100644
--- a/docs/reference/matlab_raven_backports.md
+++ b/docs/reference/matlab_raven_backports.md
@@ -70,11 +70,11 @@ Implemented across [`reconstruction.kegg`](https://github.com/SysBioChalmers/rav
   switches when DP-cost > 0.65 × (RAM − 2.5 GB) / 4.2e-9). Same protein
   set, no sequences dropped.
 * **K14** ⚡ — Sort `organism_gene_ko` by `(organism, gene)` and store it
-  xz-compressed (`organism_gene_ko.tsv.xz`). Cuts the dominant artefact from
-  ~78 MB to ~27 MB (2.9×): gene IDs within an organism share long prefixes
-  so adjacency makes them far more compressible, and xz's larger window
-  catches cross-row redundancy gzip's 32 KB window misses. MATLAB needs an
-  external `unxz` to read the file; the small tables remain plain gzipped TSV.
+  gzipped (`organism_gene_ko.tsv.gz`). Sorting makes gene IDs within an organism
+  adjacent (shared locus-tag/numeric prefixes), so they compress far better
+  (~78 → 48 MB) and the order matches the by-organism query. Stored as gzip (not
+  xz) so MATLAB reads it with built-in `gunzip`, since the same artefact is shared
+  with raven-python; xz would be ~3× smaller but needs an external `unxz`.
 * **K15** 🐛 — Recalibrate the HMM-query KO-assignment defaults: change
   `cutoff` from `1e-50` to `1e-30` and `min_score_ratio_g` from `0.8` to
   `0.9`. `min_score_ratio_ko` can stay at `0.3` but is empirically inert
diff --git a/docs/studies/kegg_hmm_cutoff_calibration.md b/docs/studies/kegg_hmm_cutoff_calibration.md
index c2fd106..9c9b306 100644
--- a/docs/studies/kegg_hmm_cutoff_calibration.md
+++ b/docs/studies/kegg_hmm_cutoff_calibration.md
@@ -199,5 +199,5 @@ Full reconstruction of *S. cerevisiae* two ways, at the old defaults:
 
 Reaction recall 96.3 % (1305/1355 shared, Jaccard 0.86); gene recall 96.6 %
 (807/835 shared, Jaccard 0.87). The annotation path also exercises the new
-`organism_gene_ko.tsv.xz` artefact (K14). `hmmscan` throughput ≈ 0.1 s/query
+`organism_gene_ko.tsv.gz` artefact (K14). `hmmscan` throughput ≈ 0.1 s/query
 against either library on 12 threads (yeast: 6021 queries in 633 s).
diff --git a/pyproject.toml b/pyproject.toml
index c5d847c..c847ed3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "raven-python"
-version = "0.1.0a1"
+version = "0.1.0"
 description = "Reconstruction, Analysis and Visualization of Metabolic Networks in Python, a port of the RAVEN Toolbox built on cobrapy"
 readme = "README.md"
 requires-python = ">=3.11"
diff --git a/scripts/analyze_hmm_cutoffs.py b/scripts/analyze_hmm_cutoffs.py
index 654fc02..6026604 100644
--- a/scripts/analyze_hmm_cutoffs.py
+++ b/scripts/analyze_hmm_cutoffs.py
@@ -50,7 +50,7 @@ def load_ko2rxn(artefacts: Path) -> dict[str, set[str]]:
 
 
 def ground_truth(artefacts: Path, org: str, ko2rxn) -> tuple[set, set]:
-    ogk = read_kegg_table(artefacts / "organism_gene_ko.tsv.xz")
+    ogk = read_kegg_table(artefacts / "organism_gene_ko.tsv.gz")
     rows = ogk[ogk["organism"].str.lower() == org]
     pairs = set(zip(rows["gene"], rows["ko"], strict=True))
     rxns = {r for _, ko in pairs for r in ko2rxn.get(ko, ())}
diff --git a/scripts/build_kegg_artefacts.py b/scripts/build_kegg_artefacts.py
index 13fd00e..2539dd3 100644
--- a/scripts/build_kegg_artefacts.py
+++ b/scripts/build_kegg_artefacts.py
@@ -5,26 +5,29 @@
 ``download_kegg_dump`` / ``fetch_keggdb``):
 
 * 3b.2 — ``parse_kegg_dump`` → ``reference_model.yml.gz`` + the gzipped-TSV tables;
-* 3b.3 — ``build_hmm_library`` per domain → a pressed ``<domain>.hmm`` (+ hmmpress
-  sidecars), named so :func:`raven_python.data.ensure_kegg_hmm_library` can fetch them.
+* 3b.3 — ``build_hmm_library`` per domain → a gzipped concatenated flatfile
+  ``<version>_<domain>.hmm.gz`` (the client decompresses + ``hmmpress``-es it on
+  first use), named so :func:`raven_python.data.ensure_kegg_hmm_library` can fetch it.
 
-Everything lands in ``--out`` ready to upload as release assets; feed that
+Pass ``--version`` (e.g. ``kegg116``) to version-prefix every output filename, matching
+the published release assets. Everything lands in ``--out`` ready to upload; feed that
 directory to ``scripts/make_registry_snippet.py data`` to emit the registry entry.
 
 Examples
 --------
 Tables + reference model only (fast, no binaries)::
 
-    python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts
+    python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --version kegg116
 
 Full build incl. both HMM libraries (slow; needs HMMER/MAFFT/CD-HIT)::
 
     python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts \\
-        --hmms --threads 8
+        --version kegg116 --hmms --threads 8
 """
 from __future__ import annotations
 
 import argparse
+import gzip
 import shutil
 from pathlib import Path
 
@@ -34,21 +37,20 @@
     read_kegg_table,
 )
 
-# hmmpress sidecar extensions, alongside the .hmm.
-_HMM_SIDECARS = (".h3f", ".h3i", ".h3m", ".h3p")
 
+def _publish_library(work: dict, out_dir: Path, domain: str, prefix: str = "") -> Path:
+    """Gzip a built ``library.hmm`` to ``out_dir/<prefix><domain>.hmm.gz``.
 
-def _publish_library(work: dict, out_dir: Path, domain: str) -> Path:
-    """Copy a built ``library.hmm`` (+ sidecars) to ``out_dir/<domain>.hmm``."""
+    Only the concatenated flatfile is published (gzip, ~10x smaller than the
+    hmmpress index and portable across HMMER versions); the client decompresses and
+    re-presses on first use, so the same artefact also serves MATLAB RAVEN.
+    """
     library = work["library"]
     if library is None:
         raise SystemExit(f"No HMMs built for {domain!r}; nothing to publish.")
-    target = out_dir / f"{domain}.hmm"
-    shutil.copyfile(library, target)
-    for suffix in _HMM_SIDECARS:
-        sidecar = library.with_name(library.name + suffix)
-        if sidecar.exists():
-            shutil.copyfile(sidecar, target.with_name(target.name + suffix))
+    target = out_dir / f"{prefix}{domain}.hmm.gz"
+    with open(library, "rb") as src, gzip.open(target, "wb") as out:
+        shutil.copyfileobj(src, out)
     return target
 
 
@@ -56,6 +58,10 @@ def main(argv: list[str] | None = None) -> None:
     parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument("--keggdb", required=True, type=Path, help="arranged KEGG dump directory")
     parser.add_argument("--out", required=True, type=Path, help="artefact output directory")
+    parser.add_argument(
+        "--version", default=None,
+        help="KEGG version (e.g. kegg116); version-prefixes every output filename",
+    )
     parser.add_argument("--hmms", action="store_true", help="also build the HMM libraries")
     parser.add_argument(
         "--domains", nargs="+", default=["prokaryotes", "eukaryotes"], help="HMM domains to build"
@@ -70,8 +76,9 @@ def main(argv: list[str] | None = None) -> None:
     args = parser.parse_args(argv)
 
     args.out.mkdir(parents=True, exist_ok=True)
+    prefix = f"{args.version}_" if args.version else ""
     print(">>> Parsing KEGG dump (3b.2)...")
-    paths = parse_kegg_dump(args.keggdb, args.out)
+    paths = parse_kegg_dump(args.keggdb, args.out, version=args.version)
     for name, path in paths.items():
         print(f"    {name}: {path}")
 
@@ -86,12 +93,12 @@ def main(argv: list[str] | None = None) -> None:
                 domain=domain, seq_identity=args.seq_identity,
                 parttree_residues=args.parttree_residues, threads=args.threads,
             )
-            published = _publish_library(work, args.out, domain)
+            published = _publish_library(work, args.out, domain, prefix)
             print(f"    {domain}: {published} ({len(work['hmms'])} profiles)")
 
     print(f"\n>>> Done. Upload the contents of {args.out} as release assets, then run:")
     print("    python scripts/make_registry_snippet.py data --dataset kegg "
-          f"--version <VER> --dir {args.out} --base-url <RELEASE_URL>")
+          f"--version {args.version or '<VER>'} --dir {args.out} --base-url <RELEASE_URL>")
 
 
 if __name__ == "__main__":
diff --git a/src/raven_python/data.py b/src/raven_python/data.py
index a02749e..fdba221 100644
--- a/src/raven_python/data.py
+++ b/src/raven_python/data.py
@@ -19,23 +19,62 @@
 """
 from __future__ import annotations
 
+import gzip
 import os
 import shutil
+import subprocess
 from pathlib import Path
 from urllib.request import urlopen
 
-from raven_python.binaries import _sha256
+from raven_python.binaries import _sha256, resolve_binary
 
 # dataset -> {"version": str, "files": {filename: {"url": str, "sha256": str}}}
-# Populated when raven_python publishes the KEGG artefacts as release assets.
-_DATA_REGISTRY: dict = {}
-
-# The core KEGG artefacts needed to build a model (no HMM libraries).
+# Mirrors data/manifest.json (the cross-language source of truth); regenerate the
+# block with scripts/make_registry_snippet.py when publishing a new KEGG release.
+_DATA_REGISTRY: dict = {
+    "kegg": {
+        "version": "kegg116",
+        "files": {
+            "kegg116_eukaryotes.hmm.gz": {
+                "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz",
+                "sha256": "2d48bc9935575d0f9ba4178bf2df19279bff866b49c1bf83a8e15787b11d6708",
+            },
+            "kegg116_ko_names.tsv.gz": {
+                "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz",
+                "sha256": "84f9c7150172d948f794d91a6608d55f7140f31e53249c705057ae49b11c93b3",
+            },
+            "kegg116_ko_reaction.tsv.gz": {
+                "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz",
+                "sha256": "e1a4ac22875bd3030d03b78368b0153b6d99000acb2ee0f474340a03c180323c",
+            },
+            "kegg116_organism_gene_ko.tsv.gz": {
+                "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz",
+                "sha256": "27bf7dd58eb1acd5904990dc2be187aae4d8d9b9f7421375618e7c8d6ff7253d",
+            },
+            "kegg116_prokaryotes.hmm.gz": {
+                "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz",
+                "sha256": "d80cb2a22dec9fd8336b3998e3b96ee121672f63f4041cddaf09624fe739f1af",
+            },
+            "kegg116_reference_model.yml.gz": {
+                "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz",
+                "sha256": "73ff313fe2aa2830ec511f4e522226c98c5714c2d5c4632844544e5a409c7f0c",
+            },
+            "kegg116_rxn_flags.tsv.gz": {
+                "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz",
+                "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b",
+            },
+        },
+    },
+}
+
+# The core KEGG artefacts needed to build a model (no HMM libraries). These are
+# the *base* names; published assets are version-prefixed (``<version>_<base>``),
+# which is what the resolvers below construct and what the registry keys hold.
 CORE_KEGG_FILES = (
     "reference_model.yml.gz",
     "ko_reaction.tsv.gz",
     "ko_names.tsv.gz",
-    "organism_gene_ko.tsv.xz",
+    "organism_gene_ko.tsv.gz",
     "rxn_flags.tsv.gz",
 )
 
@@ -128,30 +167,52 @@ def ensure_kegg_data(
 ) -> Path:
     """Ensure the core KEGG artefacts are cached; return their directory.
 
-    Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`) for the ``kegg``
-    dataset and returns the cache directory holding them — ready to pass as the
-    ``artefact_dir`` of :func:`get_kegg_model_for_organism_from_artefacts`.
+    Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`, given as *base*
+    names) for the ``kegg`` dataset and returns the cache directory holding them —
+    ready to pass as the ``artefact_dir`` of
+    :func:`get_kegg_model_for_organism_from_artefacts`. Each file is fetched under
+    its version-prefixed published name (``<version>_<base>``).
     """
     registry = _DATA_REGISTRY if registry is None else registry
     ver = version or _bundle("kegg", registry)["version"]
-    for filename in files:
-        ensure_data_file("kegg", filename, version=ver, registry=registry)
+    for base in files:
+        ensure_data_file("kegg", f"{ver}_{base}", version=ver, registry=registry)
     return _data_cache_dir() / f"kegg-{ver}"
 
 
 def ensure_kegg_hmm_library(
-    domain: str, *, version: str | None = None, registry: dict | None = None
+    domain: str,
+    *,
+    version: str | None = None,
+    registry: dict | None = None,
+    hmmpress: str | os.PathLike | None = None,
 ) -> Path:
-    """Ensure a domain HMM library (and its hmmpress index) is cached; return its path.
-
-    ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches ``<domain>.hmm``
-    plus the ``hmmpress`` sidecar files (``.h3f/.h3i/.h3m/.h3p``) and returns the
-    path to the ``.hmm`` (the argument for :func:`run_hmmscan`).
+    """Ensure a domain HMM library is cached and pressed; return the ``.hmm`` path.
+
+    ``domain`` is ``"prokaryotes"`` or ``"eukaryotes"``. Fetches the gzipped
+    concatenated library ``<version>_<domain>.hmm.gz``, decompresses it once, and
+    runs ``hmmpress`` to build the ``.h3f/.h3i/.h3m/.h3p`` index ``hmmscan`` needs
+    (HMMER is already a requirement of the de-novo query path). Both steps are
+    cached, so they run only on first use. Returns the path to the decompressed
+    ``.hmm`` (the argument for :func:`run_hmmscan`).
+
+    Shipping the gzip flatfile and pressing on the client keeps the download ~10x
+    smaller than the binary index, stays portable across HMMER versions/platforms,
+    and lets the same artefact serve MATLAB RAVEN.
     """
     registry = _DATA_REGISTRY if registry is None else registry
     ver = version or _bundle("kegg", registry)["version"]
-    base = f"{domain}.hmm"
-    library = ensure_data_file("kegg", base, version=ver, registry=registry)
-    for suffix in (".h3f", ".h3i", ".h3m", ".h3p"):
-        ensure_data_file("kegg", base + suffix, version=ver, registry=registry)
+    archive = ensure_data_file("kegg", f"{ver}_{domain}.hmm.gz", version=ver, registry=registry)
+    library = archive.with_suffix("")  # strip ".gz" -> <version>_<domain>.hmm
+    if not library.exists():
+        tmp = library.with_name(library.name + ".part")
+        with gzip.open(archive, "rb") as src, open(tmp, "wb") as out:
+            shutil.copyfileobj(src, out)
+        tmp.replace(library)
+    sidecars = (".h3f", ".h3i", ".h3m", ".h3p")
+    if not all(library.with_name(library.name + s).exists() for s in sidecars):
+        exe = resolve_binary("hmmpress", binary=hmmpress)
+        proc = subprocess.run([exe, "-f", str(library)], capture_output=True, text=True)
+        if proc.returncode != 0:
+            raise RuntimeError(f"hmmpress failed:\n{(proc.stderr or '').strip()}")
     return library
diff --git a/src/raven_python/reconstruction/kegg/organism.py b/src/raven_python/reconstruction/kegg/organism.py
index 9f30575..91f554b 100644
--- a/src/raven_python/reconstruction/kegg/organism.py
+++ b/src/raven_python/reconstruction/kegg/organism.py
@@ -28,7 +28,7 @@
 
 from raven_python.io.yaml import read_yaml_model
 from raven_python.reconstruction.kegg.assemble import _DOMAINS, assemble_model_from_ko_genes
-from raven_python.reconstruction.kegg.parse import read_kegg_table
+from raven_python.reconstruction.kegg.parse import _resolve_artefact, read_kegg_table
 from raven_python.reconstruction.kegg.taxonomy import organisms_in_domain
 
 _NOTE = "Included by get_kegg_model_for_organism (no HMMs)"
@@ -139,10 +139,10 @@ def get_kegg_model_for_organism_from_artefacts(
 
         artefact_dir = ensure_kegg_data(version=version)
     artefact_dir = Path(artefact_dir)
-    reference_model = read_yaml_model(artefact_dir / "reference_model.yml.gz")
-    ko_reaction = read_kegg_table(artefact_dir / "ko_reaction.tsv.gz")
-    organism_gene_ko = read_kegg_table(artefact_dir / "organism_gene_ko.tsv.xz")
-    rxn_flags = read_kegg_table(artefact_dir / "rxn_flags.tsv.gz")
+    reference_model = read_yaml_model(_resolve_artefact(artefact_dir, "reference_model.yml.gz"))
+    ko_reaction = read_kegg_table(_resolve_artefact(artefact_dir, "ko_reaction.tsv.gz"))
+    organism_gene_ko = read_kegg_table(_resolve_artefact(artefact_dir, "organism_gene_ko.tsv.gz"))
+    rxn_flags = read_kegg_table(_resolve_artefact(artefact_dir, "rxn_flags.tsv.gz"))
     return get_kegg_model_for_organism(
         organism_id,
         reference_model,
diff --git a/src/raven_python/reconstruction/kegg/parse.py b/src/raven_python/reconstruction/kegg/parse.py
index 3ecd6f4..23c6f0e 100644
--- a/src/raven_python/reconstruction/kegg/parse.py
+++ b/src/raven_python/reconstruction/kegg/parse.py
@@ -28,7 +28,6 @@
 
 import gzip
 import heapq
-import lzma
 import re
 import tempfile
 from collections.abc import Iterator
@@ -444,17 +443,21 @@ def build_kegg_tables(
     }
 
 
-def write_kegg_tables(tables: dict[str, pd.DataFrame], out_dir: str | Path) -> list[Path]:
-    """Write each table as a gzipped TSV (``<name>.tsv.gz``) into ``out_dir``.
+def write_kegg_tables(
+    tables: dict[str, pd.DataFrame], out_dir: str | Path, *, prefix: str = ""
+) -> list[Path]:
+    """Write each table as a gzipped TSV (``<prefix><name>.tsv.gz``) into ``out_dir``.
 
     Gzipped TSV is the dependency-free cross-language format shared with MATLAB
-    RAVEN (see docs/kegg_data_format.md). Returns the written paths.
+    RAVEN (see docs/kegg_data_format.md) — readable by MATLAB's built-in ``gunzip``
+    with no external tool. ``prefix`` version-tags the filenames (e.g.
+    ``kegg116_``). Returns the written paths.
     """
     out_dir = Path(out_dir)
     out_dir.mkdir(parents=True, exist_ok=True)
     written = []
     for name, frame in tables.items():
-        path = out_dir / f"{name}.tsv.gz"
+        path = out_dir / f"{prefix}{name}.tsv.gz"
         with gzip.open(path, "wt", encoding="utf-8", newline="") as handle:
             frame.to_csv(handle, sep="\t", index=False)
         written.append(path)
@@ -465,13 +468,35 @@ def read_kegg_table(path: str | Path) -> pd.DataFrame:
     """Read a KEGG table written by :func:`write_kegg_tables` or
     :func:`stream_organism_gene_ko`.
 
-    Compression is inferred from the suffix, so both the gzipped small tables
-    (``.tsv.gz``) and the xz-compressed ``organism_gene_ko.tsv.xz`` are read
-    transparently.
+    Compression is inferred from the suffix; all published tables are gzipped TSV
+    (``.tsv.gz``), and a version-prefixed name (``kegg116_<name>.tsv.gz``) reads
+    just the same.
     """
     return pd.read_csv(path, sep="\t", dtype=str, keep_default_na=False)
 
 
+def _resolve_artefact(directory: str | Path, base: str) -> Path:
+    """Locate an artefact file by its base name, tolerating a version prefix.
+
+    Returns ``directory/base`` if it exists, else the single version-prefixed
+    match ``directory/<version>_<base>`` (the published asset name, e.g.
+    ``kegg116_ko_reaction.tsv.gz``). This lets one reader consume both a user's own
+    unprefixed build directory and the version-pinned download cache.
+    """
+    directory = Path(directory)
+    exact = directory / base
+    if exact.exists():
+        return exact
+    matches = sorted(directory.glob(f"*_{base}"))
+    if len(matches) == 1:
+        return matches[0]
+    if not matches:
+        raise FileNotFoundError(
+            f"{base!r} (or a <version>_ prefixed form) not found in {directory}"
+        )
+    raise ValueError(f"Ambiguous {base!r} in {directory}: {sorted(p.name for p in matches)}")
+
+
 def _flush_sorted_run(rows: list[str], tmp_dir: Path, run_no: int) -> Path:
     """Sort a buffer of ``organism\\tgene\\tko\\n`` lines and write one gzipped run."""
     rows.sort(key=_ogk_sort_key)
@@ -490,14 +515,15 @@ def _ogk_sort_key(line: str) -> tuple[str, str]:
 def stream_organism_gene_ko(
     kegg_dir: str | Path, keep: set[str], ogk_path: str | Path, *, chunk_rows: int = 1_000_000
 ) -> pd.DataFrame:
-    """Stream the ``ko`` file to a sorted, xz-compressed ``organism_gene_ko.tsv.xz``.
+    """Stream the ``ko`` file to a sorted, gzipped ``organism_gene_ko.tsv.gz``.
 
     Real KEGG has ~9M gene↔KO associations — far too many to hold in memory as a
     DataFrame. Rows are sorted by ``(organism, gene)`` before writing: gene IDs
     from one organism share long common prefixes (locus tags, numeric runs), so
-    sorting makes them adjacent and lets the compressor shrink the table ~2.9x
-    versus the unsorted gzip form. The order also matches the by-organism query
-    pattern in :func:`get_kegg_model_for_organism`.
+    sorting makes them adjacent and helps the compressor; the order also matches
+    the by-organism query pattern in :func:`get_kegg_model_for_organism`. Gzip
+    (not xz) keeps the table readable by MATLAB's built-in ``gunzip`` with no
+    external tool, at a modestly larger size.
 
     The sort is an **external merge sort** bounded to ``chunk_rows`` rows in
     memory at a time (sorted runs spooled to gzipped temp files, then merged with
@@ -526,7 +552,7 @@ def stream_organism_gene_ko(
 
         handles = [gzip.open(r, "rt", encoding="utf-8") for r in runs]
         try:
-            with lzma.open(ogk_path, "wt", encoding="utf-8", newline="") as out:
+            with gzip.open(ogk_path, "wt", encoding="utf-8", newline="") as out:
                 out.write("organism\tgene\tko\n")
                 out.writelines(heapq.merge(*handles, key=_ogk_sort_key))
         finally:
@@ -535,7 +561,9 @@ def stream_organism_gene_ko(
     return pd.DataFrame(names, columns=["ko", "name"])
 
 
-def parse_kegg_dump(kegg_dir: str | Path, out_dir: str | Path) -> dict[str, Path]:
+def parse_kegg_dump(
+    kegg_dir: str | Path, out_dir: str | Path, *, version: str | None = None
+) -> dict[str, Path]:
     """Parse a full KEGG dump into the reference model + tables and write them out.
 
     Writes ``reference_model.yml.gz`` (gzipped RAVEN/cobra YAML) plus the
@@ -544,9 +572,14 @@ def parse_kegg_dump(kegg_dir: str | Path, out_dir: str | Path) -> dict[str, Path
     ``organism_gene_ko`` table is streamed to disk (see
     :func:`stream_organism_gene_ko`) rather than built in memory, so this scales
     to the full KEGG database; the small derived tables are built in memory.
+
+    When ``version`` is given (e.g. ``"kegg116"``) the output filenames are
+    version-prefixed (``<version>_<name>``), matching the published release
+    assets; the returned dict keys stay the logical table names.
     """
     out_dir = Path(out_dir)
     out_dir.mkdir(parents=True, exist_ok=True)
+    prefix = f"{version}_" if version else ""
 
     reactions = parse_kegg_reactions(kegg_dir)
     compounds = parse_kegg_compounds(kegg_dir)
@@ -563,16 +596,23 @@ def parse_kegg_dump(kegg_dir: str | Path, out_dir: str | Path) -> dict[str, Path
             columns=["reaction", "spontaneous", "undefined_stoich", "incomplete", "general"],
         ),
     }
-    paths = {name: p for name, p in zip(small, write_kegg_tables(small, out_dir), strict=True)}
+    paths = {
+        name: p
+        for name, p in zip(small, write_kegg_tables(small, out_dir, prefix=prefix), strict=True)
+    }
 
-    ogk_path = out_dir / "organism_gene_ko.tsv.xz"
+    ogk_path = out_dir / f"{prefix}organism_gene_ko.tsv.gz"
     ko_names = stream_organism_gene_ko(kegg_dir, linked_kos, ogk_path)
     paths["organism_gene_ko"] = ogk_path
     paths.update(
-        zip(["ko_names"], write_kegg_tables({"ko_names": ko_names}, out_dir), strict=True)
+        zip(
+            ["ko_names"],
+            write_kegg_tables({"ko_names": ko_names}, out_dir, prefix=prefix),
+            strict=True,
+        )
     )
 
-    ref_path = out_dir / "reference_model.yml.gz"
+    ref_path = out_dir / f"{prefix}reference_model.yml.gz"
     write_yaml_model(model, ref_path)
     paths["reference_model"] = ref_path
     return paths
diff --git a/src/raven_python/reconstruction/kegg/query.py b/src/raven_python/reconstruction/kegg/query.py
index 2df3f78..00a48f3 100644
--- a/src/raven_python/reconstruction/kegg/query.py
+++ b/src/raven_python/reconstruction/kegg/query.py
@@ -27,7 +27,7 @@
 from raven_python.binaries import resolve_binary
 from raven_python.io.yaml import read_yaml_model
 from raven_python.reconstruction.kegg.assemble import assemble_model_from_ko_genes
-from raven_python.reconstruction.kegg.parse import read_kegg_table
+from raven_python.reconstruction.kegg.parse import _resolve_artefact, read_kegg_table
 
 _NOTE = "Included by get_kegg_model_from_sequences (using HMMs)"
 _MIN_EVALUE = 1e-250  # floor for a reported E-value of 0, to keep logs finite
@@ -223,9 +223,9 @@ def get_kegg_model_from_sequences_with_artefacts(
         if library is None:
             library = ensure_kegg_hmm_library(domain, version=version)
     artefact_dir = Path(artefact_dir)
-    reference_model = read_yaml_model(artefact_dir / "reference_model.yml.gz")
-    ko_reaction = read_kegg_table(artefact_dir / "ko_reaction.tsv.gz")
-    rxn_flags = read_kegg_table(artefact_dir / "rxn_flags.tsv.gz")
+    reference_model = read_yaml_model(_resolve_artefact(artefact_dir, "reference_model.yml.gz"))
+    ko_reaction = read_kegg_table(_resolve_artefact(artefact_dir, "ko_reaction.tsv.gz"))
+    rxn_flags = read_kegg_table(_resolve_artefact(artefact_dir, "rxn_flags.tsv.gz"))
     return get_kegg_model_from_sequences(
         fasta, reference_model, ko_reaction, library, rxn_flags=rxn_flags, **kwargs
     )
diff --git a/tests/test_data.py b/tests/test_data.py
index 714c3a9..d8db588 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -1,12 +1,17 @@
 """Tests for ensure_data (data.py). Uses file:// URLs to avoid the network."""
+import gzip
 import hashlib
+import subprocess
+from pathlib import Path
 
 import pytest
 
+from raven_python import data
 from raven_python.data import (
     CORE_KEGG_FILES,
     ensure_data_file,
     ensure_kegg_data,
+    ensure_kegg_hmm_library,
 )
 
 
@@ -16,22 +21,30 @@ def _sha256(data: bytes) -> str:
 
 @pytest.fixture
 def served(tmp_path, monkeypatch):
-    """A fake registry served from local files, with the cache pointed at tmp."""
+    """A fake registry served from local files, with the cache pointed at tmp.
+
+    Published assets are version-prefixed (``v1_<base>``), matching the real
+    registry shape that ``ensure_kegg_data`` / ``ensure_kegg_hmm_library`` build.
+    ``payloads`` is keyed by the published (prefixed) name.
+    """
     src = tmp_path / "src"
     src.mkdir()
-    payloads = {
+    ver = "v1"
+    bases = {
         "reference_model.yml.gz": b"!!omap model bytes",
         "ko_reaction.tsv.gz": b"ko\treaction\n",
         "ko_names.tsv.gz": b"ko\tname\n",
-        "organism_gene_ko.tsv.xz": b"organism\tgene\tko\n",
+        "organism_gene_ko.tsv.gz": b"organism\tgene\tko\n",
         "rxn_flags.tsv.gz": b"reaction\tspontaneous\n",
     }
     files = {}
-    for name, data in payloads.items():
-        path = src / name
-        path.write_bytes(data)
-        files[name] = {"url": path.as_uri(), "sha256": _sha256(data)}
-    registry = {"kegg": {"version": "v1", "files": files}}
+    payloads = {}
+    for base, data_bytes in bases.items():
+        name = f"{ver}_{base}"
+        (src / name).write_bytes(data_bytes)
+        files[name] = {"url": (src / name).as_uri(), "sha256": _sha256(data_bytes)}
+        payloads[name] = data_bytes
+    registry = {"kegg": {"version": ver, "files": files}}
 
     cache = tmp_path / "cache"
     monkeypatch.setenv("XDG_CACHE_HOME", str(cache))
@@ -40,27 +53,27 @@ def served(tmp_path, monkeypatch):
 
 def test_ensure_data_file_downloads_and_caches(served):
     registry, cache, payloads = served
-    path = ensure_data_file("kegg", "ko_reaction.tsv.gz", registry=registry)
-    assert path == cache / "raven_python" / "data" / "kegg-v1" / "ko_reaction.tsv.gz"
-    assert path.read_bytes() == payloads["ko_reaction.tsv.gz"]
+    path = ensure_data_file("kegg", "v1_ko_reaction.tsv.gz", registry=registry)
+    assert path == cache / "raven_python" / "data" / "kegg-v1" / "v1_ko_reaction.tsv.gz"
+    assert path.read_bytes() == payloads["v1_ko_reaction.tsv.gz"]
 
 
-def test_ensure_data_file_reuses_cache(served, monkeypatch):
+def test_ensure_data_file_reuses_cache(served):
     registry, _, _ = served
-    first = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+    first = ensure_data_file("kegg", "v1_ko_names.tsv.gz", registry=registry)
     # Break the URL: a second call must hit the cache, not re-download.
-    registry["kegg"]["files"]["ko_names.tsv.gz"]["url"] = "file:///nonexistent"
-    second = ensure_data_file("kegg", "ko_names.tsv.gz", registry=registry)
+    registry["kegg"]["files"]["v1_ko_names.tsv.gz"]["url"] = "file:///nonexistent"
+    second = ensure_data_file("kegg", "v1_ko_names.tsv.gz", registry=registry)
     assert first == second and second.exists()
 
 
 def test_sha256_mismatch_rejected(served):
     registry, cache, _ = served
-    registry["kegg"]["files"]["rxn_flags.tsv.gz"]["sha256"] = "0" * 64
+    registry["kegg"]["files"]["v1_rxn_flags.tsv.gz"]["sha256"] = "0" * 64
     with pytest.raises(ValueError, match="SHA256 mismatch"):
-        ensure_data_file("kegg", "rxn_flags.tsv.gz", registry=registry)
+        ensure_data_file("kegg", "v1_rxn_flags.tsv.gz", registry=registry)
     # The corrupt partial download must not be left behind.
-    assert not (cache / "raven_python" / "data" / "kegg-v1" / "rxn_flags.tsv.gz").exists()
+    assert not (cache / "raven_python" / "data" / "kegg-v1" / "v1_rxn_flags.tsv.gz").exists()
 
 
 def test_unknown_dataset_actionable_error(served):
@@ -79,11 +92,57 @@ def test_ensure_kegg_data_fetches_core_set(served):
     registry, cache, _ = served
     out = ensure_kegg_data(registry=registry)
     assert out == cache / "raven_python" / "data" / "kegg-v1"
-    for name in CORE_KEGG_FILES:
-        assert (out / name).is_file()
+    # CORE_KEGG_FILES are base names; the cached files are version-prefixed.
+    for base in CORE_KEGG_FILES:
+        assert (out / f"v1_{base}").is_file()
+
+
+def test_ensure_kegg_hmm_library_decompresses_and_presses(served, tmp_path, monkeypatch):
+    registry, cache, _ = served
+    raw = b"HMMER3/f [3.4]\nNAME  K00001\n//\n"
+    blob = gzip.compress(raw, mtime=0)
+    gz = tmp_path / "src" / "v1_prokaryotes.hmm.gz"
+    gz.write_bytes(blob)
+    registry["kegg"]["files"]["v1_prokaryotes.hmm.gz"] = {
+        "url": gz.as_uri(),
+        "sha256": _sha256(blob),
+    }
+
+    presses: list[Path] = []
 
+    def fake_run(cmd, capture_output, text):
+        library = Path(cmd[-1])
+        presses.append(library)
+        for suffix in (".h3f", ".h3i", ".h3m", ".h3p"):
+            library.with_name(library.name + suffix).write_bytes(b"")
+        return subprocess.CompletedProcess(cmd, 0, "", "")
 
-def test_empty_registry_raises():
-    # The shipped registry is empty until artefacts are published.
+    monkeypatch.setattr(data, "resolve_binary", lambda name, binary=None: "hmmpress")
+    monkeypatch.setattr(data.subprocess, "run", fake_run)
+
+    library = ensure_kegg_hmm_library("prokaryotes", registry=registry)
+    assert library.name == "v1_prokaryotes.hmm"
+    assert library.read_bytes() == raw  # decompressed in place
+    assert library.with_name(library.name + ".h3m").exists()
+    assert len(presses) == 1
+
+    # Second call: library + sidecars already cached, so no re-decompress/-press.
+    again = ensure_kegg_hmm_library("prokaryotes", registry=registry)
+    assert again == library
+    assert len(presses) == 1
+
+
+def test_unregistered_dataset_raises():
+    # An unpublished dataset still raises an actionable error against the shipped registry.
     with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
-        ensure_data_file("kegg", "ko_reaction.tsv.gz")
+        ensure_data_file("metacyc", "x")
+
+
+def test_shipped_registry_matches_resolver_names():
+    # The published registry keys must equal what ensure_kegg_data /
+    # ensure_kegg_hmm_library construct as f"{version}_{base}", or fetches 404.
+    kegg = data._DATA_REGISTRY["kegg"]
+    ver = kegg["version"]
+    names = set(kegg["files"])
+    assert {f"{ver}_{base}" for base in CORE_KEGG_FILES} <= names
+    assert {f"{ver}_{d}.hmm.gz" for d in ("prokaryotes", "eukaryotes")} <= names
diff --git a/tests/test_reconstruction_kegg_parse.py b/tests/test_reconstruction_kegg_parse.py
index 23d8f71..533642e 100644
--- a/tests/test_reconstruction_kegg_parse.py
+++ b/tests/test_reconstruction_kegg_parse.py
@@ -195,8 +195,8 @@ def test_parse_kegg_dump_writes_artefacts(tmp_path):
         "ko_reaction", "ko_names", "organism_gene_ko", "rxn_flags", "reference_model"
     }
     assert (tmp_path / "reference_model.yml.gz").is_file()
-    # organism_gene_ko is streamed to a sorted, xz-compressed TSV.
-    assert paths["organism_gene_ko"].name == "organism_gene_ko.tsv.xz"
+    # organism_gene_ko is streamed to a sorted, gzipped TSV.
+    assert paths["organism_gene_ko"].name == "organism_gene_ko.tsv.gz"
     ogk = read_kegg_table(paths["organism_gene_ko"])
     assert set(ogk.columns) == {"organism", "gene", "ko"}
     assert ("eco", "b0001", "K00002") in set(map(tuple, ogk.to_numpy()))
@@ -205,11 +205,21 @@ def test_parse_kegg_dump_writes_artefacts(tmp_path):
     assert keys == sorted(keys)
 
 
+def test_parse_kegg_dump_version_prefixes_filenames(tmp_path):
+    paths = parse_kegg_dump(DUMP, tmp_path, version="kegg116")
+    # Dict keys stay logical; the files on disk are version-prefixed.
+    assert set(paths) >= {"ko_reaction", "organism_gene_ko", "reference_model"}
+    assert paths["organism_gene_ko"].name == "kegg116_organism_gene_ko.tsv.gz"
+    assert paths["reference_model"].name == "kegg116_reference_model.yml.gz"
+    assert (tmp_path / "kegg116_ko_reaction.tsv.gz").is_file()
+    assert read_kegg_table(paths["organism_gene_ko"]).columns.tolist() == ["organism", "gene", "ko"]
+
+
 def test_stream_organism_gene_ko_external_merge(tmp_path):
     """A tiny chunk_rows forces multiple sorted runs to be merged; output stays sorted."""
     from raven_python.reconstruction.kegg.parse import stream_organism_gene_ko
 
-    out = tmp_path / "organism_gene_ko.tsv.xz"
+    out = tmp_path / "organism_gene_ko.tsv.gz"
     keep = {ko.id for ko in parse_kegg_kos(DUMP)}
     names = stream_organism_gene_ko(DUMP, keep, out, chunk_rows=1)
     assert out.is_file() and not list(tmp_path.glob("ogk_sort_*"))  # temp dir cleaned up

From e586e9f44305739759bdd7fb99f7422ba96b54ab Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Wed, 10 Jun 2026 23:49:21 +0200
Subject: [PATCH 2/2] Add KEGG taxonomy artefact and phyl_dist (RAVEN
 getPhylDist port)

Publish kegg116_taxonomy.gz and regenerate RAVEN's keggPhylDist from it, so GECKO's
organism-distance kcat selection needs no MATLAB .mat file:

- reconstruction.kegg.phyl_dist + PhylDist faithfully reproduce RAVEN getPhylDist's
  (asymmetric, occasionally negative) distance metric; parse_taxonomy_records exposes
  ids/names/lineages and reads .gz transparently.
- data.ensure_kegg_taxonomy fetches the artefact; build_kegg_artefacts.py emits it.
- Register kegg116_taxonomy.gz in data/manifest.json and _DATA_REGISTRY (8 files).
- Tests for phyl_dist (hand-checked against RAVEN) and the taxonomy fetch; update
  migration/IMPROVEMENTS/maintainer docs and CHANGELOG.
---
 CHANGELOG.md                                  |   5 +
 IMPROVEMENTS.md                               |   2 +-
 data/manifest.example.json                    |   1 +
 data/manifest.json                            |   7 +-
 docs/maintenance/maintaining_kegg_data.md     |   5 +-
 docs/reference/migration.md                   |   4 +-
 scripts/build_kegg_artefacts.py               |   9 ++
 src/raven_python/data.py                      |  18 +++
 .../reconstruction/kegg/__init__.py           |   6 +
 .../reconstruction/kegg/taxonomy.py           | 145 +++++++++++++++---
 tests/test_data.py                            |  10 ++
 tests/test_reconstruction_kegg_phyl_dist.py   |  70 +++++++++
 12 files changed, 257 insertions(+), 25 deletions(-)
 create mode 100644 tests/test_reconstruction_kegg_phyl_dist.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 96a47e5..a13c6d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,11 @@ hardening pass (no behaviour change on well-formed inputs). Highlights:
   (`kegg116_<domain>.hmm.gz`); the client decompresses and `hmmpress`-es once on
   first use, cutting the download ~10× versus the pressed index and letting the
   same artefact serve MATLAB RAVEN.
+* **Taxonomy + phylogenetic distance:** publish `kegg116_taxonomy.gz` and add
+  `reconstruction.kegg.phyl_dist` (with `PhylDist`), a faithful port of RAVEN's
+  `getPhylDist` that regenerates the `keggPhylDist` distance matrix from the
+  taxonomy file — so GECKO's organism-distance kcat selection needs no MATLAB
+  `.mat`. `ensure_kegg_taxonomy` fetches the artefact.
 * **Packaging:** `raven_python.__version__` now derives from the installed package
   metadata (`importlib.metadata`) instead of a hard-coded literal that had drifted
   to `0.0.1`; the docs site reported the wrong version. Pinned `ruff==0.15.15` in
diff --git a/IMPROVEMENTS.md b/IMPROVEMENTS.md
index 01dd3e5..5e6bd18 100644
--- a/IMPROVEMENTS.md
+++ b/IMPROVEMENTS.md
@@ -92,7 +92,7 @@ and `taxonomy.py` (3b.3). Maintainer-side, build-time tooling (PLAN.md §2.3b).
 | K5 | EFFICIENCY (portability) | raven-python 🔨 | 🔨 | **KEGG download in pure Python stdlib** (`urllib`/`tarfile`/`gzip`/`netrc`), porting `fetch_keggdb.sh`. Drops the script's `wget`/`tar`/`gunzip` (and Cygwin-on-Windows) requirement, so it runs unchanged on Linux/macOS/Windows; tar extraction uses the `data` filter (no path traversal); same `~/.netrc` credential hygiene. The arrange step is split out (`extract_kegg_dump`) so it's network-free and unit-tested. |
 | K6 | EFFICIENCY | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Per-KO multi-FASTA via a stdlib offset index** (`_index_fasta` → seek), replacing `constructMultiFasta`'s Java-`Hashtable` byte scan with 5M-element preallocation. One streaming pass, only wanted ids retained; no MATLAB/Java heap tuning. |
 | K7 | EFFICIENCY | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Concatenate per-KO HMMs and `hmmpress` into one pressed library**, so the query path (3b.5) runs a single `hmmscan` against the database instead of RAVEN's thousands of per-KO `hmmsearch` invocations. |
-| K8 | EFFICIENCY (scope) | raven-python 🔨 | 🔨 | **Drop the `getPhylDist` distance matrix.** Its only uses in RAVEN were per-organism HMM-sequence subsampling (`maxPhylDist`/`nSequences`) and the kingdom filter. Our fixed prok90/euk90 libraries (3b.3) remove the subsampling rationale, and domain mode (3b.4) uses the taxonomy domain classification directly — so the O(n²) matrix is never built. Simpler, faster, less code. |
+| K8 | EFFICIENCY (scope) | raven-python 🔨 | 🔨 | **Drop the `getPhylDist` distance matrix.** Its only uses in RAVEN were per-organism HMM-sequence subsampling (`maxPhylDist`/`nSequences`) and the kingdom filter. Our fixed prok90/euk90 libraries (3b.3) remove the subsampling rationale, and domain mode (3b.4) uses the taxonomy domain classification directly — so the *reconstruction* path never builds the O(n²) matrix. The matrix itself remains available on demand via `taxonomy.phyl_dist`, which regenerates RAVEN's `keggPhylDist` from the published `taxonomy` artefact for GECKO's organism-distance kcat selection (no `.mat` needed). |
 | K9 | EFFICIENCY (memory) | raven-python 🔨 | 🔨 | **Stream `organism_gene_ko` to disk** in `parse_kegg_dump` instead of building it in memory. Real KEGG has **9.05M** gene↔KO associations; the in-memory DataFrame build OOMs in a few GB. Streaming (now via the external merge sort of K14) runs the full parse with flat, bounded peak memory. (Found by validating against a real KEGG FTP dump.) |
 | K10 | EFFICIENCY (size) | raven-python 🔨 | 🔨 | **Reference model as gzipped RAVEN/cobra YAML** (`reference_model.yml.gz`) rather than SBML: RAVEN-native, MATLAB-readable, and ~1.1 MB vs ~30 MB SBML for the real 12k-reaction model. Made `io/yaml.py` gzip-aware on a `.gz` suffix (general-purpose). |
 | K11 | ERGONOMICS | raven-python 🔨 | 🔨 | **`ensure_data`** (`data.py`): version-pinned registry that fetches/verifies/caches the published KEGG artefacts under `~/.cache/raven-python/data/`, mirroring `ensure_binary`. End users get a draft model with no KEGG access and no manual data handling — the `…_from_artefacts` entry points auto-fetch when no local dir is supplied. |
diff --git a/data/manifest.example.json b/data/manifest.example.json
index 036ff5e..8053972 100644
--- a/data/manifest.example.json
+++ b/data/manifest.example.json
@@ -13,6 +13,7 @@
         "kegg116_ko_names.tsv.gz":         { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz",         "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
         "kegg116_organism_gene_ko.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
         "kegg116_rxn_flags.tsv.gz":        { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz",        "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
+        "kegg116_taxonomy.gz":             { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz",             "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
         "kegg116_prokaryotes.hmm.gz":      { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz",      "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
         "kegg116_eukaryotes.hmm.gz":       { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz",       "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }
       }
diff --git a/data/manifest.json b/data/manifest.json
index 84d291f..bf67bde 100644
--- a/data/manifest.json
+++ b/data/manifest.json
@@ -4,7 +4,7 @@
   "data": {
     "kegg": {
       "version": "kegg116",
-      "description": "KEGG reference model, KO/reaction tables, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.",
+      "description": "KEGG reference model, KO/reaction tables, taxonomy, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.",
       "license": "Derived from the KEGG database; redistributed with permission from KEGG.",
       "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0",
       "files": {
@@ -42,6 +42,11 @@
           "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz",
           "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b",
           "bytes": 33289
+        },
+        "kegg116_taxonomy.gz": {
+          "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz",
+          "sha256": "1edc56da94d71433e5f08c133600292c311baaf33279a959518ab08389b0e538",
+          "bytes": 234693
         }
       }
     }
diff --git a/docs/maintenance/maintaining_kegg_data.md b/docs/maintenance/maintaining_kegg_data.md
index da4bd96..28cee43 100644
--- a/docs/maintenance/maintaining_kegg_data.md
+++ b/docs/maintenance/maintaining_kegg_data.md
@@ -125,7 +125,10 @@ HMMER versions, and lets the same artefact serve MATLAB RAVEN.
 
 [`scripts/build_kegg_artefacts.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md) runs 3b.2 (+ 3b.3 with
 `--hmms`) and lays the output out as publishable, version-prefixed assets
-(`<version>_<domain>.hmm.gz` named for `ensure_kegg_hmm_library`):
+(`<version>_<domain>.hmm.gz` named for `ensure_kegg_hmm_library`). It also publishes
+`<version>_taxonomy.gz` — the domain split plus the source for
+[`phyl_dist`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/taxonomy.py),
+which regenerates RAVEN's `keggPhylDist` (used by GECKO) with no `.mat` file:
 
 ```bash
 python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts \
diff --git a/docs/reference/migration.md b/docs/reference/migration.md
index 185b4b8..b6d51cf 100644
--- a/docs/reference/migration.md
+++ b/docs/reference/migration.md
@@ -65,7 +65,7 @@ standard plus the geckopy enzyme-constrained extension, so ecModels round-trip.
 |---|---|---|
 | `getModelFromHomology` + `getBlast`/`getDiamond` | ✅ [`reconstruction.homology.get_model_from_homology`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/homology/homology.py), `run_blast`, `run_diamond`, `blast_from_table` | Core homology reconstruction with structured improvements (bidirectional / best-hits-only, AST GPR rewrite, complex policy, bitscore best-hits, DataFrame ortholog map). |
 | KEGG download → species model (5 steps) | ✅ [`reconstruction.kegg.*`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/) | All five steps: `download.fetch_keggdb`, `parse.read_kegg_table` + reference model, `hmm.build_libraries`, `organism.build_kegg_model_for_organism` (no-FASTA), `query.assign_kos` + `run_hmmscan`. |
-| `getPhylDist` | ⛔ not ported | Fixed prok90/euk90 libraries make the distance matrix moot. |
+| `getPhylDist` | ✅ [`reconstruction.kegg.phyl_dist`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/taxonomy.py) (+ `parse_taxonomy`) | Lineage parsing **and** the distance matrix (RAVEN's `keggPhylDist`), regenerated from the published `taxonomy` artefact so GECKO's organism-distance kcat selection needs no `.mat`. The per-organism HMM-subsampling use is moot here (fixed prok90/euk90 libraries). |
 | `getMetaCycModelForOrganism` | ⛔ not ported (and **flagged for removal from MATLAB RAVEN**) | BLAST-to-single-representatives is low-precision at every cutoff. See [IMPROVEMENTS.md](improvements.md) under `R-MetaCyc`. |
 
 ## Tasks, gap-filling, INIT, ftINIT
@@ -106,7 +106,7 @@ standard plus the geckopy enzyme-constrained extension, so ecModels round-trip.
 * **`checkModelStruct` struct/type checks** — moot in cobra.
 * **`runDynamicFBA`** — see Omics/analysis row.
 * **`getMetaCycModelForOrganism`** — see Reconstruction row; flagged for upstream removal.
-* **`getPhylDist`** — fixed prok90/euk90 libraries make it moot.
+* **`getPhylDist` per-organism HMM subsampling** — fixed prok90/euk90 libraries make it moot (the distance matrix itself **is** ported, as `reconstruction.kegg.phyl_dist`, for GECKO).
 * **`mapCompartments`** — overlaps with `compare_models`.
 * **`editMiriam`, `extractMiriam`, `getRxnsInComp`, `getMetsInComp`, `constructEquations`, `getIndexes`** (most), **`setExchangeBounds`**, **most `setParam` modes**, **`getBlastFromExcel` Excel branch** — cobra one-liners; recorded above.
 
diff --git a/scripts/build_kegg_artefacts.py b/scripts/build_kegg_artefacts.py
index 2539dd3..52e002f 100644
--- a/scripts/build_kegg_artefacts.py
+++ b/scripts/build_kegg_artefacts.py
@@ -82,6 +82,15 @@ def main(argv: list[str] | None = None) -> None:
     for name, path in paths.items():
         print(f"    {name}: {path}")
 
+    # Publish the taxonomy file too: domain split, plus the source for phyl_dist
+    # (RAVEN's keggPhylDist, used by GECKO). It is a raw dump file, so gzip it as-is.
+    tax_src = args.keggdb / "taxonomy"
+    if tax_src.is_file():
+        tax_out = args.out / f"{prefix}taxonomy.gz"
+        with open(tax_src, "rb") as src, gzip.open(tax_out, "wb") as out:
+            shutil.copyfileobj(src, out)
+        print(f"    taxonomy: {tax_out}")
+
     if args.hmms:
         ogk = read_kegg_table(paths["organism_gene_ko"])
         genes_pep = args.keggdb / "genes.pep"
diff --git a/src/raven_python/data.py b/src/raven_python/data.py
index fdba221..662437f 100644
--- a/src/raven_python/data.py
+++ b/src/raven_python/data.py
@@ -63,6 +63,10 @@
                 "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz",
                 "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b",
             },
+            "kegg116_taxonomy.gz": {
+                "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz",
+                "sha256": "1edc56da94d71433e5f08c133600292c311baaf33279a959518ab08389b0e538",
+            },
         },
     },
 }
@@ -216,3 +220,17 @@ def ensure_kegg_hmm_library(
         if proc.returncode != 0:
             raise RuntimeError(f"hmmpress failed:\n{(proc.stderr or '').strip()}")
     return library
+
+
+def ensure_kegg_taxonomy(*, version: str | None = None, registry: dict | None = None) -> Path:
+    """Ensure the KEGG ``taxonomy`` artefact is cached; return its (gzipped) path.
+
+    The gzipped KEGG ``taxonomy`` file is the source for domain classification and for
+    regenerating the phylogenetic distance matrix — RAVEN's ``keggPhylDist``, which GECKO
+    uses to pick the closest organism for kcat assignment — via
+    :func:`raven_python.reconstruction.kegg.phyl_dist` (which reads ``.gz`` directly). So
+    that capability needs only this published artefact, no MATLAB ``.mat`` file.
+    """
+    registry = _DATA_REGISTRY if registry is None else registry
+    ver = version or _bundle("kegg", registry)["version"]
+    return ensure_data_file("kegg", f"{ver}_taxonomy.gz", version=ver, registry=registry)
diff --git a/src/raven_python/reconstruction/kegg/__init__.py b/src/raven_python/reconstruction/kegg/__init__.py
index 5d27602..140c0d6 100644
--- a/src/raven_python/reconstruction/kegg/__init__.py
+++ b/src/raven_python/reconstruction/kegg/__init__.py
@@ -40,15 +40,19 @@
     run_hmmscan,
 )
 from raven_python.reconstruction.kegg.taxonomy import (
+    PhylDist,
     organism_domains,
     organisms_in_domain,
     parse_taxonomy,
+    parse_taxonomy_records,
+    phyl_dist,
 )
 
 __all__ = [
     "KeggCompound",
     "KeggKO",
     "KeggReaction",
+    "PhylDist",
     "assign_kos",
     "build_hmm_library",
     "build_kegg_tables",
@@ -70,6 +74,8 @@
     "parse_kegg_kos",
     "parse_kegg_reactions",
     "parse_taxonomy",
+    "parse_taxonomy_records",
+    "phyl_dist",
     "read_kegg_table",
     "run_hmmscan",
     "stream_organism_gene_ko",
diff --git a/src/raven_python/reconstruction/kegg/taxonomy.py b/src/raven_python/reconstruction/kegg/taxonomy.py
index 463fcce..4b8dbb2 100644
--- a/src/raven_python/reconstruction/kegg/taxonomy.py
+++ b/src/raven_python/reconstruction/kegg/taxonomy.py
@@ -1,27 +1,47 @@
-"""Parse the KEGG ``taxonomy`` file into per-organism category lineages.
+"""Parse the KEGG ``taxonomy`` file into lineages, domains, and phylogenetic distances.
 
-Ports the file-reading half of RAVEN ``getPhylDist`` (the distance-matrix half is
-step 3b.5). The ``taxonomy`` file is an indented tree: ``#``-prefixed lines name a
-category, the number of leading ``#`` giving its depth; organism lines are
-tab-separated ``T-number<tab>org_id<tab>name<tab>...``. Each organism inherits the
-stack of categories above it, the first of which is its domain (``Prokaryotes`` /
-``Eukaryotes``).
+Ports RAVEN ``getPhylDist``: the file-reading half (per-organism category lineages and
+the domain split) **and** the distance-matrix half (:func:`phyl_dist`), which earlier
+ports deferred. The ``taxonomy`` file is an indented tree: ``#``-prefixed lines name a
+category, the number of leading ``#`` giving its depth; organism lines are tab-separated
+``T-number<tab>org_id<tab>T-number<tab>name``. Each organism inherits the stack of
+categories above it, the first of which is its domain (``Prokaryotes`` / ``Eukaryotes``).
 
-Used by 3b.3 to split genes into the prok/euk HMM libraries, and (later) by 3b.5
-for phylogenetic distances.
+Used by 3b.3 to split genes into the prok/euk HMM libraries, by 3b.4 domain mode, and by
+:func:`phyl_dist` to regenerate the KEGG phylogenetic distance matrix (RAVEN's
+``keggPhylDist``) that GECKO uses to pick the closest organism for kcat assignment — so
+that capability needs no MATLAB ``.mat`` file, only the published ``taxonomy`` artefact.
 """
 from __future__ import annotations
 
+import gzip
 import warnings
+from dataclasses import dataclass, field
 from pathlib import Path
 
+import numpy as np
 
-def parse_taxonomy(path: str | Path) -> dict[str, list[str]]:
-    """Return ``{organism_id: [category, ...]}`` from outermost to innermost."""
-    org_categories: dict[str, list[str]] = {}
+
+def _open_text(path: str | Path):
+    """Open ``path`` as UTF-8 text, transparently decompressing a ``.gz`` artefact."""
+    path = Path(path)
+    if path.suffix == ".gz":
+        return gzip.open(path, "rt", encoding="utf-8")
+    return open(path, encoding="utf-8")
+
+
+def parse_taxonomy_records(path: str | Path) -> list[tuple[str, str, list[str]]]:
+    """Return ``[(organism_id, name, [category, ...]), ...]`` in file order.
+
+    ``name`` is the organism's scientific name with any trailing KEGG parenthetical
+    kept (e.g. ``"Homo sapiens (human)"``), matching RAVEN ``getPhylDist``; the category
+    list is the lineage from outermost (domain) to innermost. Order is preserved so it
+    aligns with the rows/columns of :func:`phyl_dist`.
+    """
+    records: list[tuple[str, str, list[str]]] = []
     stack: list[str] = []
     skipped_level_warned = False
-    with open(path, encoding="utf-8") as handle:
+    with _open_text(path) as handle:
         for line_no, raw in enumerate(handle, start=1):
             line = raw.rstrip("\n")
             if not line.strip():
@@ -30,9 +50,8 @@ def parse_taxonomy(path: str | Path) -> dict[str, list[str]]:
                 depth = len(line) - len(line.lstrip("#"))
                 name = line[depth:].strip()
                 if depth - 1 > len(stack):
-                    # Depth-skip (e.g. ## then ####): the original `stack[:depth-1]`
-                    # silently produced a too-short lineage. Pad with explicit
-                    # blanks so downstream slices stay aligned; warn once.
+                    # Depth-skip (e.g. ## then ####): pad the missing levels with blanks
+                    # so downstream slices stay aligned; warn once.
                     if not skipped_level_warned:
                         warnings.warn(
                             f"{path}: taxonomy depth skips a level near line {line_no} "
@@ -46,11 +65,25 @@ def parse_taxonomy(path: str | Path) -> dict[str, list[str]]:
                     stack = stack[: depth - 1]
                 stack.append(name)
             else:
-                fields = line.split("\t") if "\t" in line else line.split()
-                if len(fields) < 2:
+                # Organism line. RAVEN reads the id between the 1st/2nd whitespace and the
+                # name after the 3rd; with the tab-delimited file that is fields[1]/fields[3].
+                if "\t" in line:
+                    fields = line.split("\t")
+                    org_id = fields[1].strip() if len(fields) > 1 else ""
+                    org_name = fields[3].strip() if len(fields) > 3 else ""
+                else:
+                    fields = line.split()
+                    org_id = fields[1] if len(fields) > 1 else ""
+                    org_name = " ".join(fields[3:]) if len(fields) > 3 else ""
+                if not org_id:
                     continue
-                org_categories[fields[1].strip()] = list(stack)
-    return org_categories
+                records.append((org_id, org_name, list(stack)))
+    return records
+
+
+def parse_taxonomy(path: str | Path) -> dict[str, list[str]]:
+    """Return ``{organism_id: [category, ...]}`` from outermost to innermost."""
+    return {org_id: lineage for org_id, _name, lineage in parse_taxonomy_records(path)}
 
 
 def organism_domains(path: str | Path) -> dict[str, str]:
@@ -69,3 +102,75 @@ def organisms_in_domain(path: str | Path, domain: str) -> set[str]:
         for org, dom in organism_domains(path).items()
         if dom.lower().startswith(needle) or needle.startswith(dom.lower())
     }
+
+
+@dataclass
+class PhylDist:
+    """KEGG phylogenetic distance structure — RAVEN ``getPhylDist`` / ``keggPhylDist``.
+
+    ``ids``/``names`` are the KEGG organism ids and scientific names (names keep RAVEN's
+    trailing parenthetical; consumers that need them cleaned, e.g. geckopy, strip it).
+    ``dist_matrix[i, j]`` is RAVEN's distance from ``ids[i]`` to ``ids[j]``. It is a
+    faithful reproduction of RAVEN's metric, which is **asymmetric and may be negative**;
+    GECKO consumes it only by taking the closest (minimum-distance) organism, so the exact
+    values matter for parity with MATLAB but the sign/asymmetry are harmless there.
+    """
+
+    ids: list[str] = field(default_factory=list)
+    names: list[str] = field(default_factory=list)
+    dist_matrix: np.ndarray = field(default_factory=lambda: np.empty((0, 0), dtype=np.int64))
+
+
+def phyl_dist(path: str | Path, *, only_in_kingdom: bool = False) -> PhylDist:
+    """Compute the KEGG phylogenetic distance matrix from the ``taxonomy`` file.
+
+    Faithful port of RAVEN ``getPhylDist``. For organisms ``i`` and ``j`` with category
+    lineages of lengths ``Li, Lj``, the distance is ``(Li - Lj) + min(Li, Lj) - k`` where
+    ``k`` is the deepest level (within the shorter lineage, comparing position by position
+    from the root) at which the two categories coincide — ``1`` if none coincide. This
+    reproduces RAVEN's exact values (including their asymmetry and occasional negatives)
+    so the output matches MATLAB ``keggPhylDist`` that GECKO was built against.
+
+    ``only_in_kingdom`` sets the distance to ``+inf`` between organisms in different
+    domains (RAVEN's ``onlyInKingdom`` variant); the matrix is then returned as ``float``.
+
+    Cost is ``O(N²)`` in time and memory (``N`` = number of KEGG organisms, ~10⁴), so this
+    is a maintainer/GECKO-side generation step; persist the result rather than rebuilding.
+    """
+    records = parse_taxonomy_records(path)
+    ids = [r[0] for r in records]
+    names = [r[1] for r in records]
+    lineages = [r[2] for r in records]
+    n = len(records)
+    if n == 0:
+        return PhylDist(ids, names, np.empty((0, 0), dtype=np.int64))
+
+    lengths = np.array([len(lin) for lin in lineages], dtype=np.int32)
+    max_depth = int(lengths.max())
+
+    # deepest[i, j] = deepest level d (1-based) at which lineage_i[d-1] == lineage_j[d-1],
+    # counting only levels present in both (d <= min(Li, Lj)); 0 if none coincide.
+    deepest = np.zeros((n, n), dtype=np.int32)
+    for d in range(1, max_depth + 1):
+        lookup: dict[str, int] = {}
+        lab = np.full(n, -1, dtype=np.int64)
+        for i, lin in enumerate(lineages):
+            if len(lin) >= d:
+                lab[i] = lookup.setdefault(lin[d - 1], len(lookup))
+        eq = (lab[:, None] == lab[None, :]) & (lab[:, None] != -1)
+        # Ascending d, so the last (deepest) coinciding level wins -> max coinciding depth.
+        deepest[eq] = d
+    k = np.where(deepest == 0, 1, deepest).astype(np.int32)
+
+    # Values are tiny (|dist| <= max lineage depth); int32 throughout keeps the N x N
+    # matrix (and its intermediates) half the size of the int64 default.
+    li = lengths[:, None]
+    lj = lengths[None, :]
+    dist = (li - lj) + np.minimum(li, lj) - k
+
+    if only_in_kingdom:
+        domains = np.array([lin[0] if lin else "" for lin in lineages])
+        dist = dist.astype(float)
+        dist[domains[:, None] != domains[None, :]] = np.inf
+
+    return PhylDist(ids=ids, names=names, dist_matrix=dist)
diff --git a/tests/test_data.py b/tests/test_data.py
index d8db588..b72ccb1 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -12,6 +12,7 @@
     ensure_data_file,
     ensure_kegg_data,
     ensure_kegg_hmm_library,
+    ensure_kegg_taxonomy,
 )
 
 
@@ -36,6 +37,7 @@ def served(tmp_path, monkeypatch):
         "ko_names.tsv.gz": b"ko\tname\n",
         "organism_gene_ko.tsv.gz": b"organism\tgene\tko\n",
         "rxn_flags.tsv.gz": b"reaction\tspontaneous\n",
+        "taxonomy.gz": b"# Prokaryotes\n",
     }
     files = {}
     payloads = {}
@@ -132,6 +134,13 @@ def fake_run(cmd, capture_output, text):
     assert len(presses) == 1
 
 
+def test_ensure_kegg_taxonomy(served):
+    registry, cache, _ = served
+    path = ensure_kegg_taxonomy(registry=registry)
+    assert path == cache / "raven_python" / "data" / "kegg-v1" / "v1_taxonomy.gz"
+    assert path.is_file()
+
+
 def test_unregistered_dataset_raises():
     # An unpublished dataset still raises an actionable error against the shipped registry.
     with pytest.raises(FileNotFoundError, match="No data artefacts registered"):
@@ -146,3 +155,4 @@ def test_shipped_registry_matches_resolver_names():
     names = set(kegg["files"])
     assert {f"{ver}_{base}" for base in CORE_KEGG_FILES} <= names
     assert {f"{ver}_{d}.hmm.gz" for d in ("prokaryotes", "eukaryotes")} <= names
+    assert f"{ver}_taxonomy.gz" in names
diff --git a/tests/test_reconstruction_kegg_phyl_dist.py b/tests/test_reconstruction_kegg_phyl_dist.py
new file mode 100644
index 0000000..43f3e83
--- /dev/null
+++ b/tests/test_reconstruction_kegg_phyl_dist.py
@@ -0,0 +1,70 @@
+"""Tests for the KEGG phylogenetic distance generator (RAVEN getPhylDist port)."""
+import gzip
+
+import numpy as np
+
+from raven_python.reconstruction.kegg.taxonomy import parse_taxonomy_records, phyl_dist
+
+# Tiny taxonomy: two prokaryotes sharing a lineage, a 3-deep mammal, a 2-deep fungus.
+TAXONOMY = (
+    "# Prokaryotes\n"
+    "## Bacteria\n"
+    "T1\tbsu\tT1\tBacillus subtilis\n"
+    "T2\teco\tT2\tEscherichia coli\n"
+    "# Eukaryotes\n"
+    "## Animals\n"
+    "### Mammals\n"
+    "T3\thsa\tT3\tHomo sapiens (human)\n"
+    "## Fungi\n"
+    "T4\tsce\tT4\tSaccharomyces cerevisiae\n"
+)
+
+
+def _write(tmp_path):
+    p = tmp_path / "taxonomy"
+    p.write_text(TAXONOMY, encoding="utf-8")
+    return p
+
+
+def test_parse_records_ids_names_lineages(tmp_path):
+    recs = parse_taxonomy_records(_write(tmp_path))
+    assert [r[0] for r in recs] == ["bsu", "eco", "hsa", "sce"]
+    # Names keep RAVEN's trailing parenthetical.
+    assert [r[1] for r in recs] == [
+        "Bacillus subtilis",
+        "Escherichia coli",
+        "Homo sapiens (human)",
+        "Saccharomyces cerevisiae",
+    ]
+    lineage = {r[0]: r[2] for r in recs}
+    assert lineage["hsa"] == ["Eukaryotes", "Animals", "Mammals"]
+    assert lineage["sce"] == ["Eukaryotes", "Fungi"]
+
+
+def test_phyl_dist_matches_raven_formula(tmp_path):
+    pd = phyl_dist(_write(tmp_path))
+    assert pd.ids == ["bsu", "eco", "hsa", "sce"]
+    # Hand-computed from RAVEN's distMat = (Li-Lj) + min(Li,Lj) - k.
+    expected = np.array(
+        [
+            [0, 0, 0, 1],
+            [0, 0, 0, 1],
+            [2, 2, 0, 2],
+            [1, 1, 0, 0],
+        ]
+    )
+    np.testing.assert_array_equal(pd.dist_matrix, expected)
+    assert np.all(np.diag(pd.dist_matrix) == 0)  # self-distance is 0
+    assert pd.dist_matrix[2, 0] != pd.dist_matrix[0, 2]  # RAVEN metric is asymmetric
+
+
+def test_phyl_dist_only_in_kingdom_blocks_cross_domain(tmp_path):
+    pd = phyl_dist(_write(tmp_path), only_in_kingdom=True)
+    assert np.isinf(pd.dist_matrix[0, 2]) and np.isinf(pd.dist_matrix[0, 3])  # prok vs euk
+    assert np.isfinite(pd.dist_matrix[2, 3])  # both eukaryotes
+
+
+def test_phyl_dist_reads_gzip(tmp_path):
+    p = tmp_path / "taxonomy.gz"
+    p.write_bytes(gzip.compress(TAXONOMY.encode("utf-8")))
+    assert phyl_dist(p).ids == ["bsu", "eco", "hsa", "sce"]