Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ hardening pass (no behaviour change on well-formed inputs). Highlights:
`ensure_kegg_hmm_library` fetch version-pinned, SHA256-verified files from the
GitHub release. Every artefact is **gzip + version-prefixed**
(`kegg116_<name>.gz`) so MATLAB and Windows read them with the built-in `gunzip`
(no external tool) — `organism_gene_ko` moved from xz to gzip for this. The **HMM
(no external tool) — `organism_gene_ko` moved from xz to gzip for this. The core
model files (reference model + KO/reaction tables) ship as a single
`kegg116_core.tar.gz` that `ensure_kegg_data` extracts on first use; the HMM
libraries and `taxonomy` are separate assets. The **HMM
libraries ship as one gzip concatenated flatfile per domain**
(`kegg116_<domain>.hmm.gz`); the client decompresses and `hmmpress`-es once on
first use, cutting the download ~10× versus the pressed index and letting the
Expand Down
6 changes: 1 addition & 5 deletions data/manifest.example.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,7 @@
"license": "Derived from the KEGG database; redistributed with permission from KEGG.",
"source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0",
"files": {
"kegg116_reference_model.yml.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
"kegg116_ko_reaction.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
"kegg116_ko_names.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
"kegg116_organism_gene_ko.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
"kegg116_rxn_flags.tsv.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
"kegg116_core.tar.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_core.tar.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
"kegg116_taxonomy.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
"kegg116_prokaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 },
"kegg116_eukaryotes.hmm.gz": { "url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz", "sha256": "0000000000000000000000000000000000000000000000000000000000000000", "bytes": 0 }
Expand Down
32 changes: 6 additions & 26 deletions data/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,45 +4,25 @@
"data": {
"kegg": {
"version": "kegg116",
"description": "KEGG reference model, KO/reaction tables, taxonomy, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.",
"description": "KEGG reference model + KO/reaction tables (core bundle), taxonomy, and prokaryote/eukaryote HMM libraries for getKEGGModelForOrganism.",
"license": "Derived from the KEGG database; redistributed with permission from KEGG.",
"source": "https://github.com/SysBioChalmers/raven-python/releases/tag/v0.1.0",
"files": {
"kegg116_core.tar.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_core.tar.gz",
"sha256": "155d5806d43db2fde5783fb124f8782bbcad390a1dd80879c520d2eac9d780e7",
"bytes": 48955539
},
"kegg116_eukaryotes.hmm.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz",
"sha256": "2d48bc9935575d0f9ba4178bf2df19279bff866b49c1bf83a8e15787b11d6708",
"bytes": 134002309
},
"kegg116_ko_names.tsv.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz",
"sha256": "84f9c7150172d948f794d91a6608d55f7140f31e53249c705057ae49b11c93b3",
"bytes": 14585
},
"kegg116_ko_reaction.tsv.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz",
"sha256": "e1a4ac22875bd3030d03b78368b0153b6d99000acb2ee0f474340a03c180323c",
"bytes": 49196
},
"kegg116_organism_gene_ko.tsv.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz",
"sha256": "27bf7dd58eb1acd5904990dc2be187aae4d8d9b9f7421375618e7c8d6ff7253d",
"bytes": 47935249
},
"kegg116_prokaryotes.hmm.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz",
"sha256": "d80cb2a22dec9fd8336b3998e3b96ee121672f63f4041cddaf09624fe739f1af",
"bytes": 153173750
},
"kegg116_reference_model.yml.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz",
"sha256": "73ff313fe2aa2830ec511f4e522226c98c5714c2d5c4632844544e5a409c7f0c",
"bytes": 1090563
},
"kegg116_rxn_flags.tsv.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz",
"sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b",
"bytes": 33289
},
"kegg116_taxonomy.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz",
"sha256": "1edc56da94d71433e5f08c133600292c311baaf33279a959518ab08389b0e538",
Expand Down
5 changes: 4 additions & 1 deletion docs/maintenance/kegg_data_format.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ this is ~1.1 MB (vs ~30 MB as SBML) for the full 12k-reaction gene-free model.

End users do not build any of this: the published artefacts are fetched and cached
under `~/.cache/raven-python/data/kegg-<version>/` by `ensure_data` (see
`raven_python.data`), mirroring how binaries are provisioned.
`raven_python.data`), mirroring how binaries are provisioned. The core tables and the
reference model are distributed together as a single `<version>_core.tar.gz`
(`ensure_kegg_data` extracts it on first use); the per-file format below is unchanged.
The HMM libraries and the `taxonomy` file are separate, individually-fetched artefacts.

## Decision (current)

Expand Down
6 changes: 4 additions & 2 deletions docs/maintenance/maintaining_kegg_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,10 @@ HMMER versions, and lets the same artefact serve MATLAB RAVEN.
## Building and publishing in one go

[`scripts/build_kegg_artefacts.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md) runs 3b.2 (+ 3b.3 with
`--hmms`) and lays the output out as publishable, version-prefixed assets
(`<version>_<domain>.hmm.gz` named for `ensure_kegg_hmm_library`). It also publishes
`--hmms`) and lays the output out as publishable, version-prefixed assets: the core
model files (reference model + KO/reaction tables) bundled into `<version>_core.tar.gz`
(which `ensure_kegg_data` extracts), and `<version>_<domain>.hmm.gz` per domain (named
for `ensure_kegg_hmm_library`). It also publishes
`<version>_taxonomy.gz` — the domain split plus the source for
[`phyl_dist`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/taxonomy.py),
which regenerates RAVEN's `keggPhylDist` (used by GECKO) with no `.mat` file:
Expand Down
16 changes: 16 additions & 0 deletions scripts/build_kegg_artefacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import argparse
import gzip
import shutil
import tarfile
from pathlib import Path

from raven_python.reconstruction.kegg import (
Expand Down Expand Up @@ -105,6 +106,21 @@ def main(argv: list[str] | None = None) -> None:
published = _publish_library(work, args.out, domain, prefix)
print(f" {domain}: {published} ({len(work['hmms'])} profiles)")

# Bundle the core model artefacts (reference model + tables) into one archive that
# ensure_kegg_data fetches and extracts. HMMs and taxonomy stay separate assets.
# This must come after the HMM step, which reads organism_gene_ko: the bundled
# members are deleted once they are in the archive.
core_members = [
paths[n]
for n in ("reference_model", "ko_reaction", "ko_names", "organism_gene_ko", "rxn_flags")
]
bundle = args.out / f"{prefix}core.tar.gz"
with tarfile.open(bundle, "w:gz") as tar:
for member in core_members:
tar.add(member, arcname=member.name)
for member in core_members:
member.unlink()
print(f" core bundle: {bundle} ({len(core_members)} files)")

print(f"\n>>> Done. Upload the contents of {args.out} as release assets, then run:")
print(" python scripts/make_registry_snippet.py data --dataset kegg "
f"--version {args.version or '<VER>'} --dir {args.out} --base-url <RELEASE_URL>")
Expand Down
61 changes: 25 additions & 36 deletions src/raven_python/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import os
import shutil
import subprocess
import tarfile
from pathlib import Path
from urllib.request import urlopen

Expand All @@ -35,34 +36,18 @@
"kegg": {
"version": "kegg116",
"files": {
"kegg116_core.tar.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_core.tar.gz",
"sha256": "155d5806d43db2fde5783fb124f8782bbcad390a1dd80879c520d2eac9d780e7",
},
"kegg116_eukaryotes.hmm.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_eukaryotes.hmm.gz",
"sha256": "2d48bc9935575d0f9ba4178bf2df19279bff866b49c1bf83a8e15787b11d6708",
},
"kegg116_ko_names.tsv.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_names.tsv.gz",
"sha256": "84f9c7150172d948f794d91a6608d55f7140f31e53249c705057ae49b11c93b3",
},
"kegg116_ko_reaction.tsv.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_ko_reaction.tsv.gz",
"sha256": "e1a4ac22875bd3030d03b78368b0153b6d99000acb2ee0f474340a03c180323c",
},
"kegg116_organism_gene_ko.tsv.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_organism_gene_ko.tsv.gz",
"sha256": "27bf7dd58eb1acd5904990dc2be187aae4d8d9b9f7421375618e7c8d6ff7253d",
},
"kegg116_prokaryotes.hmm.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_prokaryotes.hmm.gz",
"sha256": "d80cb2a22dec9fd8336b3998e3b96ee121672f63f4041cddaf09624fe739f1af",
},
"kegg116_reference_model.yml.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_reference_model.yml.gz",
"sha256": "73ff313fe2aa2830ec511f4e522226c98c5714c2d5c4632844544e5a409c7f0c",
},
"kegg116_rxn_flags.tsv.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_rxn_flags.tsv.gz",
"sha256": "c4c134effc9edeeb74b925ae8616320af162edbaad3a9b44dcc29d2c4d12db9b",
},
"kegg116_taxonomy.gz": {
"url": "https://github.com/SysBioChalmers/raven-python/releases/download/v0.1.0/kegg116_taxonomy.gz",
"sha256": "1edc56da94d71433e5f08c133600292c311baaf33279a959518ab08389b0e538",
Expand All @@ -72,8 +57,9 @@
}

# The core KEGG artefacts needed to build a model (no HMM libraries). These are
# the *base* names; published assets are version-prefixed (``<version>_<base>``),
# which is what the resolvers below construct and what the registry keys hold.
# the *base* names of the files bundled into the published ``<version>_core.tar.gz``
# (each stored version-prefixed inside the archive); ``ensure_kegg_data`` fetches the
# bundle and extracts these, and scripts/build_kegg_artefacts.py bundles exactly this set.
CORE_KEGG_FILES = (
"reference_model.yml.gz",
"ko_reaction.tsv.gz",
Expand Down Expand Up @@ -163,25 +149,28 @@ def ensure_data_file(
return dest


def ensure_kegg_data(
*,
version: str | None = None,
files: tuple[str, ...] = CORE_KEGG_FILES,
registry: dict | None = None,
) -> Path:
def ensure_kegg_data(*, version: str | None = None, registry: dict | None = None) -> Path:
"""Ensure the core KEGG artefacts are cached; return their directory.

Fetches each of ``files`` (default :data:`CORE_KEGG_FILES`, given as *base*
names) for the ``kegg`` dataset and returns the cache directory holding them —
ready to pass as the ``artefact_dir`` of
:func:`get_kegg_model_for_organism_from_artefacts`. Each file is fetched under
its version-prefixed published name (``<version>_<base>``).
Fetches the single ``<version>_core.tar.gz`` bundle (the gene-free reference
model + the KO/reaction/organism-gene tables of :data:`CORE_KEGG_FILES`),
SHA256-verifies it, and extracts the version-prefixed members into the cache
directory on first use — ready to pass as the ``artefact_dir`` of
:func:`get_kegg_model_for_organism_from_artefacts`. The HMM libraries and the
taxonomy file are *separate* artefacts (see :func:`ensure_kegg_hmm_library`,
:func:`ensure_kegg_taxonomy`).
"""
registry = _DATA_REGISTRY if registry is None else registry
ver = version or _bundle("kegg", registry)["version"]
for base in files:
ensure_data_file("kegg", f"{ver}_{base}", version=ver, registry=registry)
return _data_cache_dir() / f"kegg-{ver}"
dest_dir = _data_cache_dir() / f"kegg-{ver}"
archive = ensure_data_file("kegg", f"{ver}_core.tar.gz", version=ver, registry=registry)
# Extract once; a marker avoids re-extracting (and re-reading the archive) per call.
marker = dest_dir / ".core-extracted"
if not marker.exists():
with tarfile.open(archive, "r:gz") as tar:
tar.extractall(dest_dir, filter="data") # safe extraction (matches download.py)
marker.touch()
return dest_dir


def ensure_kegg_hmm_library(
Expand Down
Loading
Loading