From 3143a2577702f4f268078f71fe31a337442043b6 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Wed, 10 Jun 2026 22:27:44 +0200 Subject: [PATCH] Publish kegg116 KEGG artefacts as gzip, version-prefixed assets (v0.1.0) First downloadable KEGG artefact set, wired into the runtime resolvers: - All artefacts are gzip and version-prefixed (kegg116_<name>.gz) so MATLAB and Windows read them with the built-in gunzip, no external tool. organism_gene_ko moves from xz to gzip for the same reason. - HMM libraries ship as one gzip concatenated flatfile per domain; ensure_kegg_hmm_library decompresses and hmmpresses on first use, ~10x smaller than the pressed index and portable across HMMER versions. - Add a version-prefix-tolerant artefact resolver (_resolve_artefact) used by the organism/sequence entry points; parse_kegg_dump and build_kegg_artefacts.py gain an opt-in --version. - Populate data/manifest.json and _DATA_REGISTRY with the kegg116 release assets (real SHA256 + bytes); refresh the maintainer docs and manifest example. - Bump version to 0.1.0 and update CHANGELOG. Add KEGG taxonomy artefact and phyl_dist (RAVEN getPhylDist port) Publish kegg116_taxonomy.gz and regenerate RAVEN's keggPhylDist from it, so GECKO's organism-distance kcat selection needs no MATLAB .mat file: - reconstruction.kegg.phyl_dist + PhylDist faithfully reproduce RAVEN getPhylDist's (asymmetric, occasionally negative) distance metric; parse_taxonomy_records exposes ids/names/lineages and reads .gz transparently. - data.ensure_kegg_taxonomy fetches the artefact; build_kegg_artefacts.py emits it. - Register kegg116_taxonomy.gz in data/manifest.json and _DATA_REGISTRY (8 files). - Tests for phyl_dist (hand-checked against RAVEN) and the taxonomy fetch; update migration/IMPROVEMENTS/maintainer docs and CHANGELOG. 
Bundle core KEGG artefacts into kegg116_core.tar.gz Combine the five core model files (reference model + KO/reaction/organism-gene/ rxn-flag tables) into one kegg116_core.tar.gz; HMM libraries and taxonomy stay separate. The release drops from 8 assets to 4. - ensure_kegg_data now fetches the single bundle, SHA-verifies it, and extracts the version-prefixed members into the cache once (safe extraction, matching download.py). - build_kegg_artefacts.py groups the core files into the bundle after the HMM step. - Regenerate data/manifest.json and _DATA_REGISTRY (4 entries); update manifest.example, tests (bundle fixture), and docs. --- CHANGELOG.md | 5 +- data/manifest.example.json | 6 +- data/manifest.json | 32 ++----- docs/maintenance/kegg_data_format.md | 5 +- docs/maintenance/maintaining_kegg_data.md | 6 +- scripts/build_kegg_artefacts.py | 16 ++++ src/raven_python/data.py | 61 ++++++------- tests/test_data.py | 100 ++++++++++++---------- 8 files changed, 113 insertions(+), 118 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a13c6d3..1a7f0c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,10 @@ hardening pass (no behaviour change on well-formed inputs). Highlights: `ensure_kegg_hmm_library` fetch version-pinned, SHA256-verified files from the GitHub release. Every artefact is **gzip + version-prefixed** (`kegg116_.gz`) so MATLAB and Windows read them with the built-in `gunzip` - (no external tool) — `organism_gene_ko` moved from xz to gzip for this. The **HMM + (no external tool) — `organism_gene_ko` moved from xz to gzip for this. The core + model files (reference model + KO/reaction tables) ship as a single + `kegg116_core.tar.gz` that `ensure_kegg_data` extracts on first use; the HMM + libraries and `taxonomy` are separate assets. 
The **HMM libraries ship as one gzip concatenated flatfile per domain** (`kegg116_.hmm.gz`); the client decompresses and `hmmpress`-es once on first use, cutting the download ~10× versus the pressed index and letting the diff --git a/data/manifest.example.json b/data/manifest.example.json index 8053972..42a2b43 100644 --- a/data/manifest.example.json +++ b/data/manifest.example.json @@ -8,11 +8,7 @@ "license": "Derived from the KEGG database; redistributed with permission from KEGG.", "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0", "files": { - "kegg116_reference_model.yml.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "kegg116_ko_reaction.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "kegg116_ko_names.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "kegg116_organism_gene_ko.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, - "kegg116_rxn_flags.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, + "kegg116_core.tar.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_core.tar.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, 
"kegg116_taxonomy.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, "kegg116_prokaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }, "kegg116_eukaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 } diff --git a/data/manifest.json b/data/manifest.json index bf67bde..62c4fe8 100644 --- a/data/manifest.json +++ b/data/manifest.json @@ -4,45 +4,25 @@ "data": { "kegg": { "version": "kegg116", - "description": "KEGG reference model, KO/reaction tables, taxonomy, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.", + "description": "KEGG reference model + KO/reaction tables (core bundle), taxonomy, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.", "license": "Derived from the KEGG database; redistributed with permission from KEGG.", "source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0", "files": { + "kegg116_core.tar.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_core.tar.gz", + "sha256": "155d5806d43db2fde5783fb124f8782bbcad390a1dd80879c520d2eac9d780e7", + "bytes": 48955539 + }, "kegg116_eukaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz", "sha256": "2d48bc9935575d0f9ba4178bf2df19279bff866b49c1bf83a8e15787b11d6708", "bytes": 134002309 }, - "kegg116_ko_names.tsv.gz": { - "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz", - "sha256": 
"84f9c7150172d948f794d91a6608d55f7140f31e53249c705057ae49b11c93b3", - "bytes": 14585 - }, - "kegg116_ko_reaction.tsv.gz": { - "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz", - "sha256": "e1a4ac22875bd3030d03b78368b0153b6d99000acb2ee0f474340a03c180323c", - "bytes": 49196 - }, - "kegg116_organism_gene_ko.tsv.gz": { - "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", - "sha256": "27bf7dd58eb1acd5904990dc2be187aae4d8d9b9f7421375618e7c8d6ff7253d", - "bytes": 47935249 - }, "kegg116_prokaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz", "sha256": "d80cb2a22dec9fd8336b3998e3b96ee121672f63f4041cddaf09624fe739f1af", "bytes": 153173750 }, - "kegg116_reference_model.yml.gz": { - "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz", - "sha256": "73ff313fe2aa2830ec511f4e522226c98c5714c2d5c4632844544e5a409c7f0c", - "bytes": 1090563 - }, - "kegg116_rxn_flags.tsv.gz": { - "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", - "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b", - "bytes": 33289 - }, "kegg116_taxonomy.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz", "sha256": "1edc56da94d71433e5f08c133600292c311baaf33279a959518ab08389b0e538", diff --git a/docs/maintenance/kegg_data_format.md b/docs/maintenance/kegg_data_format.md index 23b327b..85900fe 100644 --- a/docs/maintenance/kegg_data_format.md +++ b/docs/maintenance/kegg_data_format.md @@ -12,7 +12,10 @@ this is ~1.1 MB (vs ~30 MB as SBML) for the full 12k-reaction gene-free model. 
End users do not build any of this: the published artefacts are fetched and cached under `~/.cache/raven-python/data/kegg-<version>/` by `ensure_data` (see -`raven_python.data`), mirroring how binaries are provisioned. +`raven_python.data`), mirroring how binaries are provisioned. The core tables and the +reference model are distributed together as a single `<version>_core.tar.gz` +(`ensure_kegg_data` extracts it on first use); the per-file format below is unchanged. +The HMM libraries and the `taxonomy` file are separate, individually-fetched artefacts. ## Decision (current) diff --git a/docs/maintenance/maintaining_kegg_data.md b/docs/maintenance/maintaining_kegg_data.md index 28cee43..51ea133 100644 --- a/docs/maintenance/maintaining_kegg_data.md +++ b/docs/maintenance/maintaining_kegg_data.md @@ -124,8 +124,10 @@ HMMER versions, and lets the same artefact serve MATLAB RAVEN. ## Building and publishing in one go [`scripts/build_kegg_artefacts.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md) runs 3b.2 (+ 3b.3 with -`--hmms`) and lays the output out as publishable, version-prefixed assets -(`<version>_<domain>.hmm.gz` named for `ensure_kegg_hmm_library`). It also publishes +`--hmms`) and lays the output out as publishable, version-prefixed assets: the core +model files (reference model + KO/reaction tables) bundled into `<version>_core.tar.gz` +(which `ensure_kegg_data` extracts), and `<version>_<domain>.hmm.gz` per domain (named +for `ensure_kegg_hmm_library`). 
It also publishes `_taxonomy.gz` — the domain split plus the source for [`phyl_dist`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/taxonomy.py), which regenerates RAVEN's `keggPhylDist` (used by GECKO) with no `.mat` file: diff --git a/scripts/build_kegg_artefacts.py b/scripts/build_kegg_artefacts.py index 52e002f..042c2ae 100644 --- a/scripts/build_kegg_artefacts.py +++ b/scripts/build_kegg_artefacts.py @@ -29,6 +29,7 @@ import argparse import gzip import shutil +import tarfile from pathlib import Path from raven_python.reconstruction.kegg import ( @@ -105,6 +106,21 @@ def main(argv: list[str] | None = None) -> None: published = _publish_library(work, args.out, domain, prefix) print(f" {domain}: {published} ({len(work['hmms'])} profiles)") + # Bundle the core model artefacts (reference model + tables) into one archive that + # ensure_kegg_data fetches and extracts. HMMs and taxonomy stay separate assets. + # (After the HMM step, which reads organism_gene_ko.) + core_members = [ + paths[n] + for n in ("reference_model", "ko_reaction", "ko_names", "organism_gene_ko", "rxn_flags") + ] + bundle = args.out / f"{prefix}core.tar.gz" + with tarfile.open(bundle, "w:gz") as tar: + for member in core_members: + tar.add(member, arcname=member.name) + for member in core_members: + member.unlink() + print(f" core bundle: {bundle} ({len(core_members)} files)") + print(f"\n>>> Done. 
Upload the contents of {args.out} as release assets, then run:") print(" python scripts/make_registry_snippet.py data --dataset kegg " f"--version {args.version or ''} --dir {args.out} --base-url ") diff --git a/src/raven_python/data.py b/src/raven_python/data.py index 662437f..3ed5798 100644 --- a/src/raven_python/data.py +++ b/src/raven_python/data.py @@ -23,6 +23,7 @@ import os import shutil import subprocess +import tarfile from pathlib import Path from urllib.request import urlopen @@ -35,34 +36,18 @@ "kegg": { "version": "kegg116", "files": { + "kegg116_core.tar.gz": { + "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_core.tar.gz", + "sha256": "155d5806d43db2fde5783fb124f8782bbcad390a1dd80879c520d2eac9d780e7", + }, "kegg116_eukaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz", "sha256": "2d48bc9935575d0f9ba4178bf2df19279bff866b49c1bf83a8e15787b11d6708", }, - "kegg116_ko_names.tsv.gz": { - "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz", - "sha256": "84f9c7150172d948f794d91a6608d55f7140f31e53249c705057ae49b11c93b3", - }, - "kegg116_ko_reaction.tsv.gz": { - "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz", - "sha256": "e1a4ac22875bd3030d03b78368b0153b6d99000acb2ee0f474340a03c180323c", - }, - "kegg116_organism_gene_ko.tsv.gz": { - "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", - "sha256": "27bf7dd58eb1acd5904990dc2be187aae4d8d9b9f7421375618e7c8d6ff7253d", - }, "kegg116_prokaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz", "sha256": "d80cb2a22dec9fd8336b3998e3b96ee121672f63f4041cddaf09624fe739f1af", }, - "kegg116_reference_model.yml.gz": { - "url": 
"https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz", - "sha256": "73ff313fe2aa2830ec511f4e522226c98c5714c2d5c4632844544e5a409c7f0c", - }, - "kegg116_rxn_flags.tsv.gz": { - "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", - "sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b", - }, "kegg116_taxonomy.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz", "sha256": "1edc56da94d71433e5f08c133600292c311baaf33279a959518ab08389b0e538", @@ -72,8 +57,9 @@ } # The core KEGG artefacts needed to build a model (no HMM libraries). These are -# the *base* names; published assets are version-prefixed (``_``), -# which is what the resolvers below construct and what the registry keys hold. +# the *base* names of the files bundled into the published ``_core.tar.gz`` +# (each stored version-prefixed inside the archive); ``ensure_kegg_data`` fetches the +# bundle and extracts these, and the build groups exactly this set. CORE_KEGG_FILES = ( "reference_model.yml.gz", "ko_reaction.tsv.gz", @@ -163,25 +149,28 @@ def ensure_data_file( return dest -def ensure_kegg_data( - *, - version: str | None = None, - files: tuple[str, ...] = CORE_KEGG_FILES, - registry: dict | None = None, -) -> Path: +def ensure_kegg_data(*, version: str | None = None, registry: dict | None = None) -> Path: """Ensure the core KEGG artefacts are cached; return their directory. - Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`, given as *base* - names) for the ``kegg`` dataset and returns the cache directory holding them — - ready to pass as the ``artefact_dir`` of - :func:`get_kegg_model_for_organism_from_artefacts`. Each file is fetched under - its version-prefixed published name (``_``). 
+ Fetches the single ``_core.tar.gz`` bundle (the gene-free reference + model + the KO/reaction/organism-gene tables of :data:`CORE_KEGG_FILES`), + SHA256-verifies it, and extracts the version-prefixed members into the cache + directory on first use — ready to pass as the ``artefact_dir`` of + :func:`get_kegg_model_for_organism_from_artefacts`. The HMM libraries and the + taxonomy file are *separate* artefacts (see :func:`ensure_kegg_hmm_library`, + :func:`ensure_kegg_taxonomy`). """ registry = _DATA_REGISTRY if registry is None else registry ver = version or _bundle("kegg", registry)["version"] - for base in files: - ensure_data_file("kegg", f"{ver}_{base}", version=ver, registry=registry) - return _data_cache_dir() / f"kegg-{ver}" + dest_dir = _data_cache_dir() / f"kegg-{ver}" + archive = ensure_data_file("kegg", f"{ver}_core.tar.gz", version=ver, registry=registry) + # Extract once; a marker avoids re-extracting (and re-reading the archive) per call. + marker = dest_dir / ".core-extracted" + if not marker.exists(): + with tarfile.open(archive, "r:gz") as tar: + tar.extractall(dest_dir, filter="data") # safe extraction (matches download.py) + marker.touch() + return dest_dir def ensure_kegg_hmm_library( diff --git a/tests/test_data.py b/tests/test_data.py index b72ccb1..aec58c5 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,14 +1,15 @@ """Tests for ensure_data (data.py). 
Uses file:// URLs to avoid the network.""" import gzip import hashlib +import io import subprocess +import tarfile from pathlib import Path import pytest from raven_python import data from raven_python.data import ( - CORE_KEGG_FILES, ensure_data_file, ensure_kegg_data, ensure_kegg_hmm_library, @@ -16,66 +17,71 @@ ) -def _sha256(data: bytes) -> str: - return hashlib.sha256(data).hexdigest() +def _sha256(b: bytes) -> str: + return hashlib.sha256(b).hexdigest() @pytest.fixture def served(tmp_path, monkeypatch): """A fake registry served from local files, with the cache pointed at tmp. - Published assets are version-prefixed (``v1_``), matching the real - registry shape that ``ensure_kegg_data`` / ``ensure_kegg_hmm_library`` build. - ``payloads`` is keyed by the published (prefixed) name. + The core artefacts are delivered as a single version-prefixed bundle + (``v1_core.tar.gz``) that ``ensure_kegg_data`` extracts; taxonomy is a separate + file. ``core`` maps each bundled member name -> its raw payload. 
""" src = tmp_path / "src" src.mkdir() ver = "v1" - bases = { - "reference_model.yml.gz": b"!!omap model bytes", - "ko_reaction.tsv.gz": b"ko\treaction\n", - "ko_names.tsv.gz": b"ko\tname\n", - "organism_gene_ko.tsv.gz": b"organism\tgene\tko\n", - "rxn_flags.tsv.gz": b"reaction\tspontaneous\n", - "taxonomy.gz": b"# Prokaryotes\n", + core = { + f"{ver}_reference_model.yml.gz": b"!!omap model bytes", + f"{ver}_ko_reaction.tsv.gz": b"ko\treaction\n", + f"{ver}_ko_names.tsv.gz": b"ko\tname\n", + f"{ver}_organism_gene_ko.tsv.gz": b"organism\tgene\tko\n", + f"{ver}_rxn_flags.tsv.gz": b"reaction\tspontaneous\n", + } + bundle = src / f"{ver}_core.tar.gz" + with tarfile.open(bundle, "w:gz") as tar: + for name, payload in core.items(): + info = tarfile.TarInfo(name=name) + info.size = len(payload) + tar.addfile(info, io.BytesIO(payload)) + taxonomy = src / f"{ver}_taxonomy.gz" + taxonomy.write_bytes(b"# Prokaryotes\n") + + files = { + f"{ver}_core.tar.gz": {"url": bundle.as_uri(), "sha256": _sha256(bundle.read_bytes())}, + f"{ver}_taxonomy.gz": {"url": taxonomy.as_uri(), "sha256": _sha256(taxonomy.read_bytes())}, } - files = {} - payloads = {} - for base, data_bytes in bases.items(): - name = f"{ver}_{base}" - (src / name).write_bytes(data_bytes) - files[name] = {"url": (src / name).as_uri(), "sha256": _sha256(data_bytes)} - payloads[name] = data_bytes registry = {"kegg": {"version": ver, "files": files}} cache = tmp_path / "cache" monkeypatch.setenv("XDG_CACHE_HOME", str(cache)) - return registry, cache, payloads + return registry, cache, core def test_ensure_data_file_downloads_and_caches(served): - registry, cache, payloads = served - path = ensure_data_file("kegg", "v1_ko_reaction.tsv.gz", registry=registry) - assert path == cache / "raven_python" / "data" / "kegg-v1" / "v1_ko_reaction.tsv.gz" - assert path.read_bytes() == payloads["v1_ko_reaction.tsv.gz"] + registry, cache, _ = served + path = ensure_data_file("kegg", "v1_taxonomy.gz", registry=registry) + assert path 
== cache / "raven_python" / "data" / "kegg-v1" / "v1_taxonomy.gz" + assert path.read_bytes() == b"# Prokaryotes\n" def test_ensure_data_file_reuses_cache(served): registry, _, _ = served - first = ensure_data_file("kegg", "v1_ko_names.tsv.gz", registry=registry) + first = ensure_data_file("kegg", "v1_taxonomy.gz", registry=registry) # Break the URL: a second call must hit the cache, not re-download. - registry["kegg"]["files"]["v1_ko_names.tsv.gz"]["url"] = "file:///nonexistent" - second = ensure_data_file("kegg", "v1_ko_names.tsv.gz", registry=registry) + registry["kegg"]["files"]["v1_taxonomy.gz"]["url"] = "file:///nonexistent" + second = ensure_data_file("kegg", "v1_taxonomy.gz", registry=registry) assert first == second and second.exists() def test_sha256_mismatch_rejected(served): registry, cache, _ = served - registry["kegg"]["files"]["v1_rxn_flags.tsv.gz"]["sha256"] = "0" * 64 + registry["kegg"]["files"]["v1_taxonomy.gz"]["sha256"] = "0" * 64 with pytest.raises(ValueError, match="SHA256 mismatch"): - ensure_data_file("kegg", "v1_rxn_flags.tsv.gz", registry=registry) + ensure_data_file("kegg", "v1_taxonomy.gz", registry=registry) # The corrupt partial download must not be left behind. - assert not (cache / "raven_python" / "data" / "kegg-v1" / "v1_rxn_flags.tsv.gz").exists() + assert not (cache / "raven_python" / "data" / "kegg-v1" / "v1_taxonomy.gz").exists() def test_unknown_dataset_actionable_error(served): @@ -90,13 +96,21 @@ def test_unknown_file_lists_available(served): ensure_data_file("kegg", "missing.tsv.gz", registry=registry) -def test_ensure_kegg_data_fetches_core_set(served): - registry, cache, _ = served +def test_ensure_kegg_data_extracts_core_bundle(served): + registry, cache, core = served out = ensure_kegg_data(registry=registry) assert out == cache / "raven_python" / "data" / "kegg-v1" - # CORE_KEGG_FILES are base names; the cached files are version-prefixed. 
- for base in CORE_KEGG_FILES: - assert (out / f"v1_{base}").is_file() + # The single bundle is fetched and its members extracted into the cache dir. + for name, payload in core.items(): + member = out / name + assert member.is_file() and member.read_bytes() == payload + + +def test_ensure_kegg_taxonomy(served): + registry, cache, _ = served + path = ensure_kegg_taxonomy(registry=registry) + assert path == cache / "raven_python" / "data" / "kegg-v1" / "v1_taxonomy.gz" + assert path.is_file() def test_ensure_kegg_hmm_library_decompresses_and_presses(served, tmp_path, monkeypatch): @@ -134,25 +148,17 @@ def fake_run(cmd, capture_output, text): assert len(presses) == 1 -def test_ensure_kegg_taxonomy(served): - registry, cache, _ = served - path = ensure_kegg_taxonomy(registry=registry) - assert path == cache / "raven_python" / "data" / "kegg-v1" / "v1_taxonomy.gz" - assert path.is_file() - - def test_unregistered_dataset_raises(): # An unpublished dataset still raises an actionable error against the shipped registry. with pytest.raises(FileNotFoundError, match="No data artefacts registered"): ensure_data_file("metacyc", "x") -def test_shipped_registry_matches_resolver_names(): - # The published registry keys must equal what ensure_kegg_data / - # ensure_kegg_hmm_library construct as f"{version}_{base}", or fetches 404. +def test_shipped_registry_has_expected_assets(): + # The published registry holds the core bundle, both HMM libraries, and taxonomy. kegg = data._DATA_REGISTRY["kegg"] ver = kegg["version"] names = set(kegg["files"]) - assert {f"{ver}_{base}" for base in CORE_KEGG_FILES} <= names + assert f"{ver}_core.tar.gz" in names assert {f"{ver}_{d}.hmm.gz" for d in ("prokaryotes", "eukaryotes")} <= names assert f"{ver}_taxonomy.gz" in names