From d1be3ca0ec5bb5e0c8a19970f7070a5833a81b8f Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sat, 30 May 2026 22:12:17 +0200 Subject: [PATCH] docs: set up Sphinx + ReadTheDocs and reorganize the documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reorganize docs/ into a Diátaxis-style tree (guide / reference / studies / maintenance / archive) and add a Sphinx + MyST documentation site wired for ReadTheDocs. Site / toolchain: * .readthedocs.yaml + docs/conf.py (furo, MyST-Parser, autodoc + napoleon for the NumPy-style docstrings, intersphinx to cobra/numpy/pandas/scipy). * docs/requirements.txt; docs/index.md landing page + master toctree. * Full API reference auto-generated from docstrings — one automodule page per subpackage, honoring each __all__. * 7 task-oriented user-guide pages + quickstart; section index pages. * CHANGELOG.md / IMPROVEMENTS.md are {include}-d into the site. Link hygiene (the move + include changed relative depth): * All intra-docs cross-links updated to the new section paths. * Repo-file links (src/scripts/tests) and the included root files use absolute GitHub URLs so they resolve on both GitHub and the RTD site. * README/CHANGELOG/IMPROVEMENTS doc links remapped to the new paths. Quality / CI: * Tiny docstring formatting fixes (blank line before Parameters / bullet lists, literal block in binaries.py) so autodoc renders cleanly. Build is warning-clean under `sphinx-build -W`. * New `docs` CI job builds the site with `-W --keep-going`. 
--- .github/workflows/ci.yml | 17 +++ .gitignore | 3 + .readthedocs.yaml | 29 ++++ CHANGELOG.md | 60 ++++----- IMPROVEMENTS.md | 28 ++-- README.md | 13 +- docs/README.md | 76 ++++------- docs/_static/.gitkeep | 0 docs/conf.py | 118 ++++++++++++++++ docs/guide/analysis_and_comparison.md | 22 +++ docs/guide/context_specific.md | 41 ++++++ docs/guide/index.md | 21 +++ docs/guide/io_and_manipulation.md | 39 ++++++ docs/guide/localization.md | 20 +++ docs/guide/omics.md | 18 +++ docs/guide/quickstart.md | 67 +++++++++ docs/guide/reconstruction.md | 41 ++++++ docs/guide/tasks_and_gapfilling.md | 26 ++++ docs/index.md | 127 ++++++++++++++++++ docs/installation.md | 41 ++++++ docs/maintenance/index.md | 18 +++ docs/{ => maintenance}/kegg_data_format.md | 0 .../{ => maintenance}/maintaining_binaries.md | 2 +- .../maintaining_kegg_data.md | 4 +- docs/raven_migration.md | 120 ----------------- docs/reference/api/analysis.md | 10 ++ docs/reference/api/comparison.md | 11 ++ docs/reference/api/gapfilling.md | 11 ++ docs/reference/api/index.md | 30 +++++ docs/reference/api/init.md | 10 ++ docs/reference/api/io.md | 10 ++ docs/reference/api/localization.md | 12 ++ docs/reference/api/manipulation.md | 18 +++ docs/reference/api/omics.md | 10 ++ docs/reference/api/reconstruction.md | 24 ++++ docs/reference/api/resolvers.md | 22 +++ docs/reference/api/tasks.md | 11 ++ docs/reference/api/utils.md | 16 +++ docs/reference/changelog.md | 2 + docs/reference/improvements.md | 2 + docs/reference/index.md | 22 +++ docs/{ => reference}/known_issues.md | 2 +- .../{ => reference}/matlab_raven_backports.md | 30 ++--- docs/reference/migration.md | 120 +++++++++++++++++ docs/{ => reference}/todo.md | 14 +- docs/requirements.txt | 7 + docs/{ => studies}/humangem_validation.md | 0 docs/studies/index.md | 26 ++++ docs/{ => studies}/init_param_calibration.md | 2 +- docs/{ => studies}/init_solver_benchmark.md | 0 .../kegg_hmm_cutoff_calibration.md | 2 +- .../yeast_localization_benchmark.md | 6 +- 
src/raven_python/binaries.py | 6 +- src/raven_python/init/__init__.py | 2 + src/raven_python/manipulation/add.py | 1 + src/raven_python/manipulation/change.py | 1 + src/raven_python/manipulation/merge.py | 1 + src/raven_python/manipulation/transfer.py | 1 + src/raven_python/manipulation/transport.py | 1 + src/raven_python/utils/balance.py | 1 + 60 files changed, 1142 insertions(+), 253 deletions(-) create mode 100644 .readthedocs.yaml create mode 100644 docs/_static/.gitkeep create mode 100644 docs/conf.py create mode 100644 docs/guide/analysis_and_comparison.md create mode 100644 docs/guide/context_specific.md create mode 100644 docs/guide/index.md create mode 100644 docs/guide/io_and_manipulation.md create mode 100644 docs/guide/localization.md create mode 100644 docs/guide/omics.md create mode 100644 docs/guide/quickstart.md create mode 100644 docs/guide/reconstruction.md create mode 100644 docs/guide/tasks_and_gapfilling.md create mode 100644 docs/index.md create mode 100644 docs/installation.md create mode 100644 docs/maintenance/index.md rename docs/{ => maintenance}/kegg_data_format.md (100%) rename docs/{ => maintenance}/maintaining_binaries.md (98%) rename docs/{ => maintenance}/maintaining_kegg_data.md (96%) delete mode 100644 docs/raven_migration.md create mode 100644 docs/reference/api/analysis.md create mode 100644 docs/reference/api/comparison.md create mode 100644 docs/reference/api/gapfilling.md create mode 100644 docs/reference/api/index.md create mode 100644 docs/reference/api/init.md create mode 100644 docs/reference/api/io.md create mode 100644 docs/reference/api/localization.md create mode 100644 docs/reference/api/manipulation.md create mode 100644 docs/reference/api/omics.md create mode 100644 docs/reference/api/reconstruction.md create mode 100644 docs/reference/api/resolvers.md create mode 100644 docs/reference/api/tasks.md create mode 100644 docs/reference/api/utils.md create mode 100644 docs/reference/changelog.md create mode 100644 
docs/reference/improvements.md create mode 100644 docs/reference/index.md rename docs/{ => reference}/known_issues.md (99%) rename docs/{ => reference}/matlab_raven_backports.md (85%) create mode 100644 docs/reference/migration.md rename docs/{ => reference}/todo.md (74%) create mode 100644 docs/requirements.txt rename docs/{ => studies}/humangem_validation.md (100%) create mode 100644 docs/studies/index.md rename docs/{ => studies}/init_param_calibration.md (99%) rename docs/{ => studies}/init_solver_benchmark.md (100%) rename docs/{ => studies}/kegg_hmm_cutoff_calibration.md (98%) rename docs/{ => studies}/yeast_localization_benchmark.md (94%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3f6d19b..b5b7f12 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,3 +46,20 @@ jobs: # skip themselves when ``optlang.gurobi_interface`` cannot import. - run: pip install -e ".[dev,plotting,excel]" - run: pytest -q --maxfail=5 --durations=20 + + docs: + name: docs + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + cache-dependency-path: pyproject.toml + - run: pip install --upgrade pip + # The package (with autodoc's import-time extras) plus the docs toolchain. + - run: pip install -e ".[excel,plotting]" -r docs/requirements.txt + # ``-W --keep-going`` fails this job on any Sphinx warning; stricter than the + # ReadTheDocs build (same docs/conf.py, but ``fail_on_warning: false`` there).
+ - run: sphinx-build -b html -W --keep-going docs docs/_build/html diff --git a/.gitignore b/.gitignore index d47dad7..747f7e3 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,9 @@ env/ .DS_Store Thumbs.db +# Docs build output +docs/_build/ + # Tooling caches .claude/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..996ad6b --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,29 @@ +# ReadTheDocs build configuration +# https://docs.readthedocs.io/en/stable/config-file/v2.html +version: 2 + +build: + os: ubuntu-24.04 + tools: + python: "3.12" + +sphinx: + configuration: docs/conf.py + # Failing the build on warnings would be too strict while the API docstrings are + # still being filled in; keep warnings visible but non-fatal. + fail_on_warning: false + +python: + install: + # Install the package itself (with the optional extras autodoc needs to + # import every module) plus the docs-only toolchain. + - method: pip + path: . + extra_requirements: + - excel + - plotting + - requirements: docs/requirements.txt + +formats: + - pdf + - htmlzip diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d42fa9..273a300 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,12 @@ # Changelog Milestones in the raven-python port. For function-level status see -[docs/raven_migration.md](docs/raven_migration.md); for open work see -[docs/todo.md](docs/todo.md). +[docs/raven_migration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/migration.md); for open work see +[docs/todo.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/todo.md). ## Infrastructure -* **GitHub Actions CI** ([.github/workflows/ci.yml](.github/workflows/ci.yml)) — +* **GitHub Actions CI** ([.github/workflows/ci.yml](https://github.com/SysBioChalmers/raven-python/blob/develop/.github/workflows/ci.yml)) — ruff + pytest matrix over Python 3.11/3.12/3.13.
Tests that require Gurobi auto-skip (no Gurobi on free runners); the known HiGHS upstream blocker (`hybrid_interface.Configuration` rejects `lp_method='primal'`) is marked @@ -33,7 +33,7 @@ fixes with matching MATLAB back-port proposals in IMPROVEMENTS.md (FS4, B2). bug to `check_model`; the actual code is in `balance.py`. Two new regression tests (F3 in `test_analysis_fseof.py`, F5 in -`test_utils_balance.py`). [docs/known_issues.md](docs/known_issues.md) now +`test_utils_balance.py`). [docs/known_issues.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/known_issues.md) now fully closed (all sections A–F). ## Quality sweep — known-issues sections C / D / E @@ -68,7 +68,7 @@ download (defensive, needs urlopen mocking). ## Quality sweep — known-issues section B -Closed all four "silent misbehaviour" items from [docs/known_issues.md](docs/known_issues.md): +Closed all four "silent misbehaviour" items from [docs/known_issues.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/known_issues.md): * `merge_models` warns on `formula` / `charge` conflicts when two source models share a name[comp] but disagree (used to silently keep the first-seen). * `add_reactions_from_equations` warns when creating a metabolite in an @@ -82,7 +82,7 @@ Four new regression tests cover them. ## Quality sweep — known-issues section A -Closed all six "latent edge-case bug" items from [docs/known_issues.md](docs/known_issues.md): +Closed all six "latent edge-case bug" items from [docs/known_issues.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/known_issues.md): * `add_reactions_from_equations` no longer misparses `"2 oxoglutarate"` (or any leading-number metabolite name) — the resolver tries the full token before splitting off a coefficient. @@ -101,45 +101,45 @@ Six new regression tests cover the user-reachable cases. 
## Phase 7 — Localization -* **Sub-cellular localisation by MILP.** [`localization.predict_localization`](src/raven_python/localization/predict.py) - + [`apply_localization`](src/raven_python/localization/predict.py). Deterministic (not simulated +* **Sub-cellular localisation by MILP.** [`localization.predict_localization`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/localization/predict.py) + + [`apply_localization`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/localization/predict.py). Deterministic (not simulated annealing); caller-passed `reactions_to_relocate` set with everything else pinned; incomplete-model tolerant (no silent reaction removal); `apply=False` returns a diff preview; multi-compartment by default with primary-free, extras-penalised scoring. -* **Predictor loaders.** [`load_wolfpsort`, `load_deeploc`](src/raven_python/localization/scores.py), +* **Predictor loaders.** [`load_wolfpsort`, `load_deeploc`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/localization/scores.py), with the `gene × compartment` DataFrame contract open for any predictor. -* **Compartment helpers** ([`manipulation/compartments.py`](src/raven_python/manipulation/compartments.py)): +* **Compartment helpers** ([`manipulation/compartments.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/compartments.py)): `merge_compartments`, `copy_to_compartment` — useful standalone for model curation. 
-* **Real-data validation on yeast-GEM** ([docs/yeast_localization_benchmark.md](docs/yeast_localization_benchmark.md)) +* **Real-data validation on yeast-GEM** ([docs/yeast_localization_benchmark.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/studies/yeast_localization_benchmark.md)) — accuracy 0.72 → 0.39 on 298 GPR'd reactions as confident predictor mis-scoring rises from 0 % to 50 %; perfect on compartments with disjoint gene sets (c/g/lp/p/v/vm), and surfaces a `transport_cost` calibration insight for soft-probability score tables. ## Phase 5 — Data integration & analysis -* **Reporter metabolites, FSEOF, random sampling** ([`analysis/`](src/raven_python/analysis/)). -* **HPA omics ingestion** ([`omics.parse_hpa`, `parse_hpa_rna`, `hpa_gene_scores`, `rna_gene_scores`](src/raven_python/omics/hpa.py)) +* **Reporter metabolites, FSEOF, random sampling** ([`analysis/`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/analysis/)). +* **HPA omics ingestion** ([`omics.parse_hpa`, `parse_hpa_rna`, `hpa_gene_scores`, `rna_gene_scores`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/omics/hpa.py)) — pandas-tidy DataFrames replace RAVEN's sparse-matrix layout; scoring adapters reuse the existing GPR walk. -* **N-model comparison** ([`comparison.compare_models`](src/raven_python/comparison/compare.py)). +* **N-model comparison** ([`comparison.compare_models`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/comparison/compare.py)). * **Dynamic FBA** is **not ported** — established Python packages cover it (`dfba`, `reframed`, `mewpy`). ## Phase 4d — ftINIT -* **ftINIT pipeline** ([`init.ftinit`](src/raven_python/init/ftinit.py)) — staged MILP, linear merge, +* **ftINIT pipeline** ([`init.ftinit`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/ftinit.py)) — staged MILP, linear merge, task-aware gap-filling, gene pruning. 
* **Validated against MATLAB RAVEN on Human-GEM.** 5 Hart2015 cell-line models; Jaccard 0.973–0.977 (no-task) and 0.978–0.980 (task-constrained). See - [docs/humangem_validation.md](docs/humangem_validation.md). -* **Parameter calibration & input-robustness study** ([docs/init_param_calibration.md](docs/init_param_calibration.md)) + [docs/humangem_validation.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/studies/humangem_validation.md). +* **Parameter calibration & input-robustness study** ([docs/init_param_calibration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/studies/init_param_calibration.md)) — `mip_gap=0.01` is the genome-scale full-pipeline sweet spot (~37% faster than 0.001 at Jaccard 0.995); pipeline is robust to expression noise (Jaccard 0.92–0.95) but sensitive to sparsity (50–70% dropout → Jaccard 0.59–0.71); the task + gap-fill layer keeps the essential-task pass-rate at 67–69/69 across the gradient, whereas tINIT-without-it passes only 35/69 even on clean data. -* **Cross-solver portability** ([docs/init_solver_benchmark.md](docs/init_solver_benchmark.md)) - + [`tests/test_init_solvers.py`](tests/test_init_solvers.py): Gurobi and GLPK pass at toy +* **Cross-solver portability** ([docs/init_solver_benchmark.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/studies/init_solver_benchmark.md)) + + [`tests/test_init_solvers.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/tests/test_init_solvers.py): Gurobi and GLPK pass at toy scale; only Gurobi is viable at genome scale today (HiGHS hits an upstream optlang `clone()` bug; GLPK ignores `configuration.timeout` on MIP). * **Engineering wins surfaced by the genome-scale work:** `check_tasks` and @@ -149,25 +149,25 @@ Six new regression tests cover the user-reachable cases. 
## Phase 4c — tINIT -* **INIT MILP and the tINIT pipeline** ([`init.run_init`](src/raven_python/init/init.py), - [`init.get_init_model`](src/raven_python/init/build.py)). Clean optlang reformulation; +* **INIT MILP and the tINIT pipeline** ([`init.run_init`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/init.py), + [`init.get_init_model`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/build.py)). Clean optlang reformulation; RNA-seq scoring via `5·ln(level/ref)`-clamped. ## Phase 4b — Gap-filling -* **Connectivity gap-filling** ([`gapfilling.connect_blocked_reactions`](src/raven_python/gapfilling/fill.py)) +* **Connectivity gap-filling** ([`gapfilling.connect_blocked_reactions`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/gapfilling/fill.py)) — MILP. Targeted (toward objective) mode delegates to `cobra.gapfill`. ## Phase 4a — Metabolic tasks -* **Task list parsing + `check_tasks`** ([`tasks/`](src/raven_python/tasks/)). +* **Task list parsing + `check_tasks`** ([`tasks/`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/tasks/)). ## Phase 3 — Reconstruction * **Homology-based draft** from a template GEM + BLAST/DIAMOND wrappers - ([`reconstruction/homology/`](src/raven_python/reconstruction/homology/)) — with structured + ([`reconstruction/homology/`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/homology/)) — with structured improvements over RAVEN's `getModelFromHomology` (see IMPROVEMENTS H1–H6). -* **KEGG five-step pipeline** ([`reconstruction/kegg/`](src/raven_python/reconstruction/kegg/)): +* **KEGG five-step pipeline** ([`reconstruction/kegg/`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/)): dump → parser → HMM library builder → species model → HMM-query draft. 
* **MetaCyc reconstruction** **not ported** (and flagged for removal from MATLAB RAVEN — see IMPROVEMENTS R-MetaCyc). @@ -176,17 +176,17 @@ Six new regression tests cover the user-reachable cases. * **YAML** aligned to cobra's `!!omap` writer + RAVEN-only fields preserved into `.notes`, plus geckopy `ec-*` for enzyme-constrained models - ([`io/yaml.py`](src/raven_python/io/yaml.py)). + ([`io/yaml.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/io/yaml.py)). * **SIF**, **Excel export**, and **Standard-GEM `model//…` git layout** - ([`io/`](src/raven_python/io/)). Excel import intentionally excluded. + ([`io/`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/io/)). Excel import intentionally excluded. ## Phase 1 — Foundation -* **GPR / balance / validation / parsing helpers** ([`utils/`](src/raven_python/utils/)) — +* **GPR / balance / validation / parsing helpers** ([`utils/`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/)) — cobra-absent bits only; the rest are cheatsheeted. -* **Manipulation ergonomic layer** ([`manipulation/`](src/raven_python/manipulation/)) — +* **Manipulation ergonomic layer** ([`manipulation/`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/)) — add/change/remove/transport/transfer/merge/simplify/variance + adopted transforms. -* **External-binary resolver** ([`binaries.py`](src/raven_python/binaries.py)) — version-pinned +* **External-binary resolver** ([`binaries.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/binaries.py)) — version-pinned release-ZIP registry, SHA256-verified cache. ## Phase 0 — Scaffold diff --git a/IMPROVEMENTS.md b/IMPROVEMENTS.md index a6a9877..825d551 100644 --- a/IMPROVEMENTS.md +++ b/IMPROVEMENTS.md @@ -62,7 +62,7 @@ it is only incidentally in the MetaCyc folder.) 
## getModelFromHomology (Phase 3a — implemented) -Design + rationale in [docs/plan_get_model_from_homology.md](docs/plan_get_model_from_homology.md); +Design + rationale in [docs/plan_get_model_from_homology.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/archive/plan_get_model_from_homology.md); implemented in `reconstruction/homology/homology.py`. *Logic* improvements over RAVEN's algorithm (RAVEN's own comments flag several of these spots as uncertain). @@ -99,7 +99,7 @@ and `taxonomy.py` (3b.3). Maintainer-side, build-time tooling (PLAN.md §2.3b). | K12 | EFFICIENCY | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Fast MAFFT (FFT-NS-2) for HMM training** instead of RAVEN's `--auto`, which selects slow iterative refinement (`dvtditr`) on medium/large KOs — observed ~2.5 min/KO (days for a domain) on real KEGG 118. FFT-NS-2 (`--retree 2 --maxiterate 0`) is seconds/KO and ample for profile-HMM building. **PartTree cutover is residue-based and memory-auto-tuned**: MAFFT memory tracks residues (count × length), not sequence count, so a count threshold let long-protein KOs (K00901: 2,788 seqs, 2.55 M residues) OOM under FFT-NS-2 — measured ~5 GB MAFFT RSS with FFT-NS-2 vs **0.69 GB with PartTree** for the same alignment. The cutover is **length-aware and memory-auto-tuned**: FFT-NS-2 memory is driven by the progressive-alignment **DP cost ≈ n_seqs × mean_len²** (= residues²/n_seqs), *not* residue count — a few hundred long proteins cost far more than the same residues in many short ones. (First tried a residue-only model `RSS≈1.32R²+1.84R`; it then OOM'd on K12047 — 452 seqs but mean length 2082, 0.94 M residues — because long proteins blow the per-residue cost.) Calibrated `RSS_GB ≈ 4.2e-9 × (n_seqs × mean_len²)` across real KEGG KOs (250k/266→0.67 GB … 1.5M/1624→5.73 GB; K12047 cost 1.96e9 = the largest, hence its OOM). 
`_auto_cost_budget` switches to PartTree when the DP cost exceeds `0.65 × (total − 2.5 GB overhead) / 4.2e-9` (≈7.9e8 on a 7.6 GB box), **warns on low-memory hosts**, and `parttree_residues` overrides with a manual residue cutoff. Back-portable to RAVEN. | | K13 | EFFICIENCY | raven-python 🗑️ | 🗑️ | ~~Per-KO sequence cap (`max_sequences`)~~ — **removed.** Briefly added as a count-based cap, but the residue-based PartTree cutover (K12) bounds MAFFT memory without dropping any sequences, so the cap was redundant complexity. All deduplicated sequences are kept. | | K14 | EFFICIENCY (size) | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Sort `organism_gene_ko` by `(organism, gene)` and store it xz-compressed** (`organism_gene_ko.tsv.xz`), cutting the dominant artefact **≈78 → 27 MB (2.9×)**. Gene IDs within an organism share long prefixes (locus tags, numeric runs), so sorting makes them adjacent and far more compressible (sort alone: 78→48 MB; xz vs gzip captures the cross-row redundancy gzip's 32 KB window misses: →27 MB). The sort is an **external merge sort** bounded to `chunk_rows` rows in memory (sorted runs spooled to gzipped temp files, merged with `heapq.merge`), so it keeps K9's flat memory profile. Both `lzma` and `gzip` are Python stdlib (native on Windows/macOS/Linux, no extra binary); small tables stay gzipped TSV (MATLAB-native), only the big one is xz (MATLAB needs an external `unxz`). Sorted order also matches the by-organism query in `get_kegg_model_for_organism`, enabling a future `searchsorted` slice instead of loading all 9M rows. Back-portable to RAVEN. | -| K15 | ERGONOMICS (correctness) | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Recalibrate the HMM-query KO-assignment defaults** (`assign_kos`): cut-off `1e-50 → 1e-30`, `min_score_ratio_g 0.8 → 0.9`; `min_score_ratio_ko` left at 0.3 but **documented as empirically inert**. 
Cross-validated the full 3b.5 pipeline against the true KEGG gene→KO annotation of four organisms across both libraries and the well-/lesser-studied axis — *S. cerevisiae*, *Cyanidioschyzon merolae* (red alga), *E. coli* K-12, *Mycoplasma genitalium* (minimal genome). Real annotations score overwhelmingly (median E ≈ 1e-100…1e-155; even the weakest 1% ≈ 1e-15…1e-36) while spurious hits cluster at ≈1e-8 — a ~20-order-of-magnitude gap. RAVEN's `1e-50` therefore sits **inside the true-positive tail** and silently drops real-but-divergent hits for no noise-rejection gain: gene→KO recall on *M. genitalium* was only 0.84 (reaction recall 0.87). At `1e-30` + `ratio_g=0.9`: *M. genitalium* recall **0.84→0.94** (rxn 0.87→0.97), *E. coli* 0.95→0.97 with **fewer** unannotated reactions (198→173, the tighter gene-ratio prunes spurious multi-KO genes), *S. cerevisiae*/*C. merolae* held or improved. The three sweep tables showed `min_score_ratio_ko` produced identical output at 0.0/0.3/0.5 across all four organisms — a magic-number knob that does nothing; `min_score_ratio_g` is the real precision lever. Full numbers in [docs/kegg_hmm_cutoff_calibration.md](docs/kegg_hmm_cutoff_calibration.md) (reproduce with `scripts/analyze_hmm_cutoffs.py`). Back-portable to RAVEN. | +| K15 | ERGONOMICS (correctness) | raven-python 🔨 + MATLAB RAVEN 💡 | 🔨 | **Recalibrate the HMM-query KO-assignment defaults** (`assign_kos`): cut-off `1e-50 → 1e-30`, `min_score_ratio_g 0.8 → 0.9`; `min_score_ratio_ko` left at 0.3 but **documented as empirically inert**. Cross-validated the full 3b.5 pipeline against the true KEGG gene→KO annotation of four organisms across both libraries and the well-/lesser-studied axis — *S. cerevisiae*, *Cyanidioschyzon merolae* (red alga), *E. coli* K-12, *Mycoplasma genitalium* (minimal genome). Real annotations score overwhelmingly (median E ≈ 1e-100…1e-155; even the weakest 1% ≈ 1e-15…1e-36) while spurious hits cluster at ≈1e-8 — a ~20-order-of-magnitude gap. 
RAVEN's `1e-50` therefore sits **inside the true-positive tail** and silently drops real-but-divergent hits for no noise-rejection gain: gene→KO recall on *M. genitalium* was only 0.84 (reaction recall 0.87). At `1e-30` + `ratio_g=0.9`: *M. genitalium* recall **0.84→0.94** (rxn 0.87→0.97), *E. coli* 0.95→0.97 with **fewer** unannotated reactions (198→173, the tighter gene-ratio prunes spurious multi-KO genes), *S. cerevisiae*/*C. merolae* held or improved. The three sweep tables showed `min_score_ratio_ko` produced identical output at 0.0/0.3/0.5 across all four organisms — a magic-number knob that does nothing; `min_score_ratio_g` is the real precision lever. Full numbers in [docs/kegg_hmm_cutoff_calibration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/studies/kegg_hmm_cutoff_calibration.md) (reproduce with `scripts/analyze_hmm_cutoffs.py`). Back-portable to RAVEN. | ## FSEOF (Phase 5 — implemented, redesigned) @@ -181,7 +181,7 @@ RAVEN `core/parseTaskList.m` + `core/checkTasks.m` → `tasks/tasklist.py` + `ta ## fillGaps (Phase 4b — implemented) RAVEN `core/fillGaps.m`. Only the **connectivity** mode is ported, as -`connect_blocked_reactions` ([gapfilling/fill.py](src/raven_python/gapfilling/fill.py)) — +`connect_blocked_reactions` ([gapfilling/fill.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/gapfilling/fill.py)) — MILP via cobra/optlang (GLPK). RAVEN's other mode (fill to make the objective feasible) is `cobra.flux_analysis.gapfill` and is **cheatsheeted, not re-wrapped** (PLAN §1). @@ -194,7 +194,7 @@ is `cobra.flux_analysis.gapfill` and is **cheatsheeted, not re-wrapped** (PLAN RAVEN `core/addRxns.m` — add reactions from equation strings (or mets+coeffs), auto-creating metabolites/genes. Ported as `add_reactions_from_equations` -([manipulation/add.py](src/raven_python/manipulation/add.py)). 
+([manipulation/add.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/add.py)). | # | Cat | Target | Status | Improvement | |---|---|---|---|---| @@ -205,7 +205,7 @@ metabolites/genes. Ported as `add_reactions_from_equations` ## changeGrRules -Ported as `change_gene_reaction_rules` ([manipulation/change.py](src/raven_python/manipulation/change.py)). +Ported as `change_gene_reaction_rules` ([manipulation/change.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/change.py)). | # | Cat | Target | Status | Improvement | |---|---|---|---|---| @@ -213,7 +213,7 @@ Ported as `change_gene_reaction_rules` ([manipulation/change.py](src/raven_pytho ## simplifyModel -Gap modes ported in [manipulation/simplify.py](src/raven_python/manipulation/simplify.py). +Gap modes ported in [manipulation/simplify.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/simplify.py). | # | Cat | Target | Status | Improvement | |---|---|---|---|---| @@ -221,7 +221,7 @@ Gap modes ported in [manipulation/simplify.py](src/raven_python/manipulation/sim ## mergeModels -Ported as `merge_models` ([manipulation/merge.py](src/raven_python/manipulation/merge.py)). +Ported as `merge_models` ([manipulation/merge.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/merge.py)). | # | Cat | Target | Status | Improvement | |---|---|---|---|---| @@ -230,7 +230,7 @@ Ported as `merge_models` ([manipulation/merge.py](src/raven_python/manipulation/ ## checkModelStruct -Ported (curation subset) as `check_model` ([utils/validate.py](src/raven_python/utils/validate.py)). +Ported (curation subset) as `check_model` ([utils/validate.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/validate.py)). 
| # | Cat | Target | Status | Improvement | |---|---|---|---|---| @@ -239,8 +239,8 @@ Ported (curation subset) as `check_model` ([utils/validate.py](src/raven_python/ ## setParam / getElementalBalance -Ported as `set_parameters` ([manipulation/parameters.py](src/raven_python/manipulation/parameters.py)) -and `get_elemental_balance` ([utils/balance.py](src/raven_python/utils/balance.py)). +Ported as `set_parameters` ([manipulation/parameters.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/parameters.py)) +and `get_elemental_balance` ([utils/balance.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/balance.py)). | # | Cat | Target | Status | Improvement | |---|---|---|---|---| @@ -256,7 +256,7 @@ if a downstream consumer needs the `include_partial` (fully-contained vs touchin several places — and ask before re-adding (see process note: argue pros/cons for marginal WRAPs). Ported as `remove_metabolites` / `remove_genes` -([manipulation/remove.py](src/raven_python/manipulation/remove.py)). `removeReactions` was **not** +([manipulation/remove.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/remove.py)). `removeReactions` was **not** ported: with orphan cleanup kept coupled (decision: don't separate metabolites from genes), it is identical to `cobra.Model.remove_reactions(remove_orphans=...)`. @@ -270,7 +270,7 @@ identical to `cobra.Model.remove_reactions(remove_orphans=...)`. ## readYAMLmodel / writeYAMLmodel RAVEN `io/readYAMLmodel.m` + `writeYAMLmodel.m` (+ private legacy parser). Ported as -`read_yaml_model`/`write_yaml_model` ([io/yaml.py](src/raven_python/io/yaml.py)). +`read_yaml_model`/`write_yaml_model` ([io/yaml.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/io/yaml.py)). 
**Lens correction (no separate legacy parser).** RAVEN ships a 462-line `parseYAMLLegacy.m` for the `!!omap` dialect, and geckopy refuses it ("re-save from MATLAB"). But `!!omap` is **cobra's own YAML @@ -289,7 +289,7 @@ value is preserving `metaData` identity and RAVEN-only per-entry fields, which i ## changeRxns RAVEN `core/changeRxns.m` — change reaction equations. Ported as -`change_reaction_equations` ([manipulation/change.py](src/raven_python/manipulation/change.py)). +`change_reaction_equations` ([manipulation/change.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/change.py)). | # | Cat | Target | Status | Improvement | |---|---|---|---|---| @@ -331,7 +331,7 @@ OR-of-AND-complex (DNF) form (`findPotentialErrors`). **Decision (raven-python): port the lint half only.** cobra auto-normalizes a GPR on assignment (`"(G1 AND G2) OR G3"` is stored as `"(G1 and G2) or G3"`), so the normalization half is redundant. The non-DNF lint has no cobra equivalent and was ported as `find_non_dnf_grrules`/`is_dnf` -([utils/gpr.py](src/raven_python/utils/gpr.py)). +([utils/gpr.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/gpr.py)). | # | Cat | Target | Status | Improvement | |---|---|---|---|---| diff --git a/README.md b/README.md index 69111be..14febac 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ functionality that's unique to RAVEN: The status of every RAVEN function (ported, cheatsheet-mapped to cobra, or explicitly not ported) is documented function-by-function in -**[docs/raven_migration.md](docs/raven_migration.md)**. +**[docs/raven_migration.md](docs/reference/migration.md)**. ## Design principle @@ -38,7 +38,7 @@ COBRA ecosystem. ## Status raven-python has been validated against MATLAB RAVEN on **Human-GEM** (5 Hart2015 cell-line -models, Jaccard 0.975–0.980 — see [docs/humangem_validation.md](docs/humangem_validation.md)). 
+models, Jaccard 0.975–0.980 — see [docs/studies/humangem_validation.md](docs/studies/humangem_validation.md)). The functional scope of the original RAVEN toolbox is covered with two principled omissions: @@ -48,7 +48,7 @@ omissions: it ([`dfba`](https://pypi.org/project/dfba/), [`reframed`](https://pypi.org/project/reframed/), [`mewpy`](https://pypi.org/project/mewpy/)). -What's still open is catalogued in **[docs/todo.md](docs/todo.md)** (visualisation / Phase +What's still open is catalogued in **[docs/reference/todo.md](docs/reference/todo.md)** (visualisation / Phase 6 is the main item). ## Installation (development) @@ -60,12 +60,15 @@ pip install -e ".[dev]" ``` raven-python requires Python ≥ 3.11. Genome-scale (f)tINIT MILPs currently require **Gurobi** -([details on solver portability](docs/init_solver_benchmark.md)); toy and unit-test work +([details on solver portability](docs/studies/init_solver_benchmark.md)); toy and unit-test work runs on the open-source GLPK. ## Documentation -See **[docs/README.md](docs/README.md)** for the documentation index. +Full documentation is built with Sphinx and hosted on **ReadTheDocs**: +**<https://raven-python.readthedocs.io>**. The source lives in +[docs/](docs/) — see [docs/README.md](docs/README.md) for the layout and local-build +instructions. ## Relationship to MATLAB RAVEN diff --git a/docs/README.md b/docs/README.md index d39ae69..2ccac8e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,52 +1,28 @@ # Documentation -Start with the [top-level README](../README.md). The docs are organised as: - -## RAVEN ↔ raven-python reference - -* **[raven_migration.md](raven_migration.md)** — function-by-function map from MATLAB RAVEN - to raven-python (and cobrapy where appropriate). Read this if you're coming from RAVEN. -* **[../IMPROVEMENTS.md](../IMPROVEMENTS.md)** — improvements raven-python makes over RAVEN that - are also candidates to back-port into the MATLAB toolbox. - -## Open work - -* **[todo.md](todo.md)** — what's still on the books. 
-* **[known_issues.md](known_issues.md)** — low-priority backlog from the full-codebase - review. None affects correctness on well-formed inputs. - -## Empirical studies & calibrations - -* **[humangem_validation.md](humangem_validation.md)** — raven-python ftINIT vs MATLAB RAVEN on - 5 Hart2015 cell lines (Jaccard 0.975–0.980). -* **[init_param_calibration.md](init_param_calibration.md)** — clean-data calibration + - input-robustness study for (f)tINIT (sweeps of `mip_gap` / `big_m` / `force_on` / `eps` / - `prod_weight` / scaling; dropout / noise / downsample robustness). -* **[init_solver_benchmark.md](init_solver_benchmark.md)** — Gurobi vs HiGHS vs GLPK on - genome-scale ftINIT. -* **[yeast_localization_benchmark.md](yeast_localization_benchmark.md)** — - `predict_localization` against curated yeast-GEM, with a predictor-noise sweep - (accuracy 0.72 → 0.39 as confident mis-scoring rises from 0 % to 50 %). -* **[kegg_hmm_cutoff_calibration.md](kegg_hmm_cutoff_calibration.md)** — HMM E-value / - score-ratio sensitivity for the KEGG HMM-query reconstruction path. - -## Data formats & maintenance - -* **[kegg_data_format.md](kegg_data_format.md)** — layout of the KEGG artefact bundle. -* **[maintaining_kegg_data.md](maintaining_kegg_data.md)** — building and publishing the - KEGG artefact releases. -* **[maintaining_binaries.md](maintaining_binaries.md)** — building and publishing the - external-binary (BLAST/DIAMOND/HMMER) ZIP releases. - -## Archive - -Historical design notes (preserved for reference but no longer part of the user-facing -documentation): - -* **[archive/ftinit_review_and_plan.md](archive/ftinit_review_and_plan.md)** — the - pre-implementation critical review of `ftINIT` that drove the Phase 4d port plan. -* **[archive/localization_design.md](archive/localization_design.md)** — the - pre-implementation critical review of `predictLocalization` that drove the Phase 7 - redesign (caller-passed relocate set, MILP instead of SA, partial-update mode). 
-* **[archive/plan_get_model_from_homology.md](archive/plan_get_model_from_homology.md)** — - early planning notes for the homology-based reconstruction. +The rendered documentation is built with **Sphinx** (MyST Markdown) and hosted on +**ReadTheDocs**. This folder is the source. + +Build it locally: + +```bash +pip install -e ".[excel,plotting]" +pip install -r docs/requirements.txt +sphinx-build -b html docs docs/_build/html +# open docs/_build/html/index.html +``` + +## Layout + +| Path | Contents | +| --- | --- | +| [`index.md`](index.md) | Landing page + master table of contents | +| [`installation.md`](installation.md) | Install, extras, solvers, external binaries | +| [`guide/`](guide/) | Task-oriented user guide (one page per capability) | +| [`reference/`](reference/) | Migration map, back-port proposals, improvements, and the autogenerated [API reference](reference/api/index.md) | +| [`studies/`](studies/) | Empirical validation & parameter-calibration studies | +| [`maintenance/`](maintenance/) | Data-bundle formats and release/maintenance procedures | +| [`archive/`](archive/) | Historical design notes (pre-implementation reviews) | + +The top-level [`CHANGELOG.md`](../CHANGELOG.md) and [`IMPROVEMENTS.md`](../IMPROVEMENTS.md) are +included into the site under *Project* and *Reference* respectively. diff --git a/docs/_static/.gitkeep b/docs/_static/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..68d81c8 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,118 @@ +"""Sphinx configuration for the raven-python documentation. + +Markdown sources are rendered by MyST-Parser, so the docs stay authored in the +same Markdown used on GitHub. The API reference is generated from the NumPy-style +docstrings via autodoc + napoleon, using one explicit ``automodule`` page per subpackage. 
+""" + +from __future__ import annotations + +import sys +from datetime import date +from pathlib import Path + +# -- Make the package importable for autodoc -------------------------------- +# On ReadTheDocs the package is pip-installed (see .readthedocs.yaml); this also +# lets a local `make html` work straight from a source checkout. +_REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(_REPO_ROOT / "src")) + + +def _get_version() -> str: + try: + from raven_python import __version__ + + return __version__ + except Exception: # pragma: no cover - docs must still build + return "0.0.1" + + +# -- Project information ----------------------------------------------------- +project = "raven-python" +author = "Eduard Kerkhoven" +copyright = f"{date.today().year}, SysBioChalmers" +release = _get_version() +version = release + +# -- General configuration --------------------------------------------------- +extensions = [ + "myst_parser", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", + "sphinx.ext.todo", + "sphinx_copybutton", + "sphinx_design", +] + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "README.md"] + +# Markdown + reStructuredText both allowed; .md goes through MyST. +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +# -- MyST configuration ------------------------------------------------------ +myst_enable_extensions = [ + "colon_fence", # ::: fenced directives + "deflist", + "fieldlist", + "tasklist", + "smartquotes", + "substitution", +] +myst_heading_anchors = 3 # auto-generate anchors for ##, ###, #### headings + +# -- autodoc / autosummary --------------------------------------------------- +# The API reference is built from explicit per-subpackage ``automodule`` pages +# (docs/reference/api/*), so no autosummary stub generation is needed. 
+autosummary_generate = False +autodoc_default_options = { + "members": True, + "undoc-members": True, + "show-inheritance": True, + "member-order": "bysource", +} +autodoc_typehints = "description" +autoclass_content = "both" +# Optional / heavy imports that need not be present for the docs to build. +autodoc_mock_imports = [] + +# -- napoleon (NumPy-style docstrings) -------------------------------------- +napoleon_google_docstring = False +napoleon_numpy_docstring = True +napoleon_use_rtype = False +napoleon_use_param = True +# Render docstring "Attributes:" sections as :ivar: fields rather than separate +# attribute directives — avoids duplicate-description warnings on the dataclasses +# (whose fields autodoc also documents). +napoleon_use_ivar = True + +# -- intersphinx ------------------------------------------------------------- +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "cobra": ("https://cobrapy.readthedocs.io/en/latest/", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/", None), +} + +# -- todo extension ---------------------------------------------------------- +todo_include_todos = True + +# -- HTML output ------------------------------------------------------------- +html_theme = "furo" +html_title = f"raven-python {release}" +html_static_path = ["_static"] +html_theme_options = { + "source_repository": "https://github.com/SysBioChalmers/raven-python/", + "source_branch": "develop", + "source_directory": "docs/", +} + +# Suppress noisy-but-harmless warnings (e.g. cross-references to GitHub source +# files that intentionally live outside the doc tree). 
+suppress_warnings = ["myst.xref_missing"] diff --git a/docs/guide/analysis_and_comparison.md b/docs/guide/analysis_and_comparison.md new file mode 100644 index 0000000..8cd68cd --- /dev/null +++ b/docs/guide/analysis_and_comparison.md @@ -0,0 +1,22 @@ +# Analysis & comparison + +## Analyses — {mod}`raven_python.analysis` + +RAVEN analyses that are not in cobrapy's core: + +- {func}`raven_python.analysis.reporter_metabolites` — Reporter Metabolites, an + around-metabolite gene-score test. raven-python uses an exact closed-form background in + place of RAVEN's Monte-Carlo. +- {func}`raven_python.analysis.fseof` — Flux Scanning based on Enforced Objective Flux: + regression slope + correlation, with amplify / knockdown / knockout classes and gene + aggregation. +- {func}`raven_python.analysis.random_sampling` — random-objective flux sampling (wraps + cobra's samplers); {func}`raven_python.analysis.find_good_reactions` is the companion + screen. + +## Comparison — {mod}`raven_python.comparison` + +{func}`raven_python.comparison.compare_models` compares any number of models and returns tidy +DataFrames (reaction / metabolite / gene / subsystem presence, pairwise Jaccard, and an +optional `check_tasks` pass/fail matrix). Plotting and tSNE/MDS are deliberately left out — +they are one-liners in seaborn / scikit-learn on the returned frames. diff --git a/docs/guide/context_specific.md b/docs/guide/context_specific.md new file mode 100644 index 0000000..f55e48b --- /dev/null +++ b/docs/guide/context_specific.md @@ -0,0 +1,41 @@ +# Context-specific modeling (tINIT / ftINIT) + +Extract a tissue- or condition-specific model from a reference GEM plus gene scores derived +from omics data. Two algorithms are provided in {mod}`raven_python.init`. + +## Scoring + +Gene scores drive both algorithms. 
Build them from expression with +{func}`raven_python.init.gene_scores_from_expression` and turn them into reaction scores via +{func}`raven_python.init.score_reactions_from_genes` (a GPR walk shared with the omics +adapters — see the [omics guide](omics.md)). + +## tINIT + +- {func}`raven_python.init.run_init` — the classic INIT MILP (rewritten in optlang). +- {func}`raven_python.init.get_init_model` — the full tINIT pipeline (dead-end removal → + `run_init`). + +## ftINIT (faster, staged) + +- {func}`raven_python.init.run_ftinit` — the single-step ftINIT MILP (continuous indicators + for positive-score reactions; binaries only on negatives — the speedup over `run_init`). +- {func}`raven_python.init.ftinit` — the full pipeline: + {func}`raven_python.init.prep_init_model` → staged `run_ftinit` → + {func}`raven_python.init.fill_tasks` → {func}`raven_python.init.remove_low_score_genes`. + +## Tasks and defaults + +ftINIT's task layer keeps essential metabolic tasks feasible; define and check tasks with the +[tasks guide](tasks_and_gapfilling.md). The parameter defaults (`mip_gap`, `big_m`, +`force_on`, `eps`, `prod_weight`, scaling) and their robustness to noisy input are calibrated +in the [parameter-calibration study](../studies/init_param_calibration.md), and the +equivalence to MATLAB RAVEN is established in the +[Human-GEM validation](../studies/humangem_validation.md). + +:::{important} +Genome-scale (f)tINIT MILPs currently require **Gurobi** for tractable solve times; toy and +unit-test problems run on GLPK. See the +[solver benchmark](../studies/init_solver_benchmark.md). Metabolomics-based scoring is the one +piece not yet implemented (raises `NotImplementedError`). +::: diff --git a/docs/guide/index.md b/docs/guide/index.md new file mode 100644 index 0000000..46572ce --- /dev/null +++ b/docs/guide/index.md @@ -0,0 +1,21 @@ +# User guide + +Task-oriented guides for each raven-python capability. 
Every guide links to the relevant +[API reference](../reference/api/index.md) entries and, where one exists, the +[validation study](../studies/index.md) that backs the defaults. + +If you are coming from MATLAB RAVEN, read the +[migration map](../reference/migration.md) first — it tells you, function by function, what +moved to raven-python, what is now a cobra one-liner, and what was intentionally dropped. + +```{toctree} +:maxdepth: 1 + +io_and_manipulation +reconstruction +context_specific +omics +tasks_and_gapfilling +localization +analysis_and_comparison +``` diff --git a/docs/guide/io_and_manipulation.md b/docs/guide/io_and_manipulation.md new file mode 100644 index 0000000..73c4876 --- /dev/null +++ b/docs/guide/io_and_manipulation.md @@ -0,0 +1,39 @@ +# I/O & model manipulation + +## I/O — {mod}`raven_python.io` + +raven-python keeps everything as a {class}`cobra.Model`, so cobra's SBML I/O works +unchanged. On top of that it adds the RAVEN-specific formats: + +- {func}`raven_python.io.read_yaml_model` / {func}`raven_python.io.write_yaml_model` — + cobra-standard YAML (the `!!omap` layout), transparently handling `.yml.gz`. RAVEN-only and + GECKO `ec-*` side-fields are preserved on each entry's `notes` so a read→write round-trip is + lossless. +- {func}`raven_python.io.export_model_to_sif` — Cytoscape SIF (`rc` / `rr` / `cc` graphs). +- {func}`raven_python.io.export_to_excel` — the RAVEN 5-sheet workbook (RXNS / METS / COMPS / + GENES / MODEL). Requires the `excel` extra. Excel **import** is intentionally not provided. +- {func}`raven_python.io.export_for_git` — the Standard-GEM repository layout + (`model//…`), for version-controlled model repos. 
+ +## Manipulation — {mod}`raven_python.manipulation` + +Structural transforms that cobra does not cover cleanly: + +- **Build / copy reactions:** {func}`raven_python.manipulation.add_reactions_from_equations` + (equation-string → reactions, matching metabolites by id / name / `name[comp]`), + {func}`raven_python.manipulation.add_transport_reactions`, + {func}`raven_python.manipulation.add_reactions_from_model`. +- **Merge:** {func}`raven_python.manipulation.merge_models` (N-model merge, unify metabolites + by `name[comp]`). +- **GPR / bounds:** {func}`raven_python.manipulation.change_gene_reaction_rules`, + {func}`raven_python.manipulation.change_reaction_equations`, + {func}`raven_python.manipulation.set_variance_bounds`. +- **Simplify:** {func}`raven_python.manipulation.remove_dead_end_reactions`, + `remove_duplicate_reactions`, `constrain_reversible_reactions`, `group_linear_reactions`. +- **Topology:** {func}`raven_python.manipulation.convert_to_irreversible`, + {func}`raven_python.manipulation.expand_model` (split isozyme OR-GPRs), + and the compartment helpers `merge_compartments` / `copy_to_compartment`. + +See the [migration map](../reference/migration.md) for which RAVEN functions these +correspond to (and which became cobra one-liners), and the full +[`io`](../reference/api/io.md) / [`manipulation`](../reference/api/manipulation.md) API. diff --git a/docs/guide/localization.md b/docs/guide/localization.md new file mode 100644 index 0000000..ccf4a9f --- /dev/null +++ b/docs/guide/localization.md @@ -0,0 +1,20 @@ +# Sub-cellular localisation + +{mod}`raven_python.localization` assigns reactions to compartments by MILP — deterministic +(not simulated annealing), predictor-agnostic, and partial-update friendly. + +1. 
**Load predictor scores** into the `gene × compartment` + {class}`raven_python.localization.LocalizationScores` frame: + {func}`raven_python.localization.load_wolfpsort` (WoLF PSORT summary output) or + {func}`raven_python.localization.load_deeploc` (DeepLoc 2 per-protein CSV). raven-python + does **not** shell out to the predictor — run it separately and feed in its output. +2. **Predict / apply:** {func}`raven_python.localization.predict_localization` is the MILP + entry point. Pass the set of reactions to relocate (everything else is pinned); extra + compartments beyond a reaction's primary one pay a `multi_compartment_penalty`. With + `apply=False` you get a {class}`raven_python.localization.LocalizationProposal` diff to + inspect before committing; {func}`raven_python.localization.apply_localization` applies a + result. + +The defaults and accuracy (including a predictor-noise sweep) are validated against curated +yeast-GEM in the +[yeast localization benchmark](../studies/yeast_localization_benchmark.md). diff --git a/docs/guide/omics.md b/docs/guide/omics.md new file mode 100644 index 0000000..aabf0ef --- /dev/null +++ b/docs/guide/omics.md @@ -0,0 +1,18 @@ +# Omics integration + +{mod}`raven_python.omics` ingests **Human Protein Atlas** data and turns it into the gene +scores that drive context-specific extraction. + +- **Proteomics:** {func}`raven_python.omics.parse_hpa` → + {func}`raven_python.omics.hpa_gene_scores`. +- **RNA-seq:** {func}`raven_python.omics.parse_hpa_rna` → + {func}`raven_python.omics.rna_gene_scores`. + +Both return tidy pandas DataFrames, and the scoring adapters reuse +{func}`raven_python.init.score_reactions_from_genes` (a single source of truth for the GPR +walk), so omics-derived scores plug straight into +{func}`raven_python.init.ftinit` / {func}`raven_python.init.get_init_model` — see the +[context-specific modeling guide](context_specific.md). 
+ +`HPA_LEVEL_SCORES` exposes the categorical-level → score mapping used for the proteomics +expression levels. diff --git a/docs/guide/quickstart.md b/docs/guide/quickstart.md new file mode 100644 index 0000000..dd6fc82 --- /dev/null +++ b/docs/guide/quickstart.md @@ -0,0 +1,67 @@ +# Quickstart + +This page gets you from a fresh install to loading a model and running the two headline +workflows: a de-novo draft reconstruction and a context-specific extraction. + +See [Installation](../installation.md) for the full dependency / solver / binary matrix. + +```bash +pip install -e ".[dev]" +``` + +## Load and save a model + +The canonical in-memory object is a {class}`cobra.Model`, so everything cobra can do is +available unchanged. raven-python adds RAVEN's YAML reader/writer (cobra-standard layout +plus RAVEN/GECKO side-fields preserved on `notes`): + +```python +from raven_python.io import read_yaml_model, write_yaml_model + +model = read_yaml_model("model.yml") # transparently handles .yml.gz +print(model.summary()) + +write_yaml_model(model, "out.yml", sort_ids=True) +``` + +Other I/O and structural edits live in {mod}`raven_python.io` and +{mod}`raven_python.manipulation` — see the +[I/O & manipulation guide](io_and_manipulation.md). + +## De-novo reconstruction (homology) + +Build a draft for a new organism from a curated template model and a BLAST/DIAMOND +ortholog search: + +```python +from raven_python.reconstruction.homology import get_model_from_homology + +draft = get_model_from_homology(template_model, ortholog_hits) +``` + +The KEGG route ({func}`raven_python.reconstruction.kegg.get_kegg_model_for_organism`) and +the BLAST/DIAMOND helpers are covered in the +[reconstruction guide](reconstruction.md). 
+ +## Context-specific model (ftINIT) + +Extract a tissue/condition-specific model from a reference GEM plus omics-derived gene +scores: + +```python +from raven_python.omics import parse_hpa_rna, rna_gene_scores +from raven_python.init import ftinit + +rna = parse_hpa_rna("rna_tissue.tsv") +scores = rna_gene_scores(reference_model, rna, tissue="liver") +context_model = ftinit(reference_model, scores) +``` + +ftINIT, tINIT, the scoring adapters and task-aware gap-filling are detailed in the +[context-specific modeling guide](context_specific.md). Genome-scale (f)tINIT currently +needs **Gurobi** — see the [solver benchmark](../studies/init_solver_benchmark.md). + +:::{note} +The snippets above show the entry points; consult each capability guide and the +[API reference](../reference/api/index.md) for the full keyword arguments. +::: diff --git a/docs/guide/reconstruction.md b/docs/guide/reconstruction.md new file mode 100644 index 0000000..fea01ed --- /dev/null +++ b/docs/guide/reconstruction.md @@ -0,0 +1,41 @@ +# De-novo reconstruction + +Two independent routes build a draft model for an organism that has no curated GEM yet. + +## Homology — {mod}`raven_python.reconstruction.homology` + +Transfer reactions from a curated **template** model to the target organism via an ortholog +search. + +1. Run the search (or bring your own table): + {func}`raven_python.reconstruction.homology.run_blast` / + {func}`raven_python.reconstruction.homology.run_diamond`, then + {func}`raven_python.reconstruction.homology.make_ortholog_hits` to get the canonical + `gene × gene` hits DataFrame (bidirectional / best-hits-only policies supported). +2. Draft the model: + {func}`raven_python.reconstruction.homology.get_model_from_homology` — AST-based GPR + rewrite, configurable complex policy, returns a {class}`cobra.Model` (plus a + `HomologyResult`). + +The external aligners are resolved from the pinned binary registry +({mod}`raven_python.binaries`). 
+ +## KEGG — {mod}`raven_python.reconstruction.kegg` + +Draft directly from KEGG orthology, either for a KEGG-listed species or from your own protein +FASTA via HMM search. + +- **KEGG species (no FASTA):** + {func}`raven_python.reconstruction.kegg.get_kegg_model_for_organism`. +- **Your sequences (HMM search):** {func}`raven_python.reconstruction.kegg.assign_kos` → + {func}`raven_python.reconstruction.kegg.get_kegg_model_from_sequences`. The HMM cut-off + defaults are calibrated in the + [KEGG HMM cut-off study](../studies/kegg_hmm_cutoff_calibration.md). + +The KEGG artefact bundle (KO tables, reference model, HMM libraries) is fetched by +{mod}`raven_python.data`; building and publishing it is a maintainer task — see +[Maintaining KEGG data](../maintenance/maintaining_kegg_data.md) and the +[KEGG data format](../maintenance/kegg_data_format.md). + +After drafting, fill connectivity gaps with the +[gap-filling guide](tasks_and_gapfilling.md). diff --git a/docs/guide/tasks_and_gapfilling.md b/docs/guide/tasks_and_gapfilling.md new file mode 100644 index 0000000..ae77806 --- /dev/null +++ b/docs/guide/tasks_and_gapfilling.md @@ -0,0 +1,26 @@ +# Metabolic tasks & gap-filling + +## Tasks — {mod}`raven_python.tasks` + +A metabolic task asserts that a model can (or cannot) produce/consume a set of metabolites +under given constraints. + +- {class}`raven_python.tasks.Task` + {func}`raven_python.tasks.parse_task_list` — the + task-list file format. +- {func}`raven_python.tasks.check_tasks` + {class}`raven_python.tasks.TaskResult` — run tasks + against a model. `check_tasks` reuses one model across the whole list (no per-task model + copy) — at genome scale ~12× faster than a copy-per-task implementation. +- {func}`raven_python.tasks.find_task_essential_reactions` — the reactions a model must use to + satisfy a task list. This is the input to (f)tINIT's task layer (see the + [context-specific modeling guide](context_specific.md)). 
+ +## Connectivity gap-filling — {mod}`raven_python.gapfilling` + +{func}`raven_python.gapfilling.connect_blocked_reactions` adds the fewest (lowest-penalty) +template reactions so reactions that are blocked in a draft can carry flux — the connectivity +flavour of RAVEN's `fillGaps`. + +:::{tip} +For the other flavour — add reactions until a specific objective becomes *feasible* — use +cobra's {func}`cobra.flux_analysis.gapfill` directly. +::: diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..4567567 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,127 @@ +# raven-python + +**Reconstruction, Analysis and Visualisation of Metabolic Networks — in Python.** + +`raven-python` is the Python counterpart of the +[RAVEN Toolbox 2](https://github.com/SysBioChalmers/RAVEN) (MATLAB). It builds on +[**cobrapy**](https://cobrapy.readthedocs.io) for everything cobrapy already does well +(simulation, standard analyses, SBML I/O, model manipulation) and adds the functionality +that is unique to RAVEN: + +- **De novo reconstruction** from KEGG and protein homology (BLAST / DIAMOND). +- **Context-specific models** from omics data via **tINIT / ftINIT**, with task-aware + gap-filling and the linear-merge MILP reduction. +- **Metabolic-task** validation (`check_tasks`, `find_task_essential_reactions`). +- **Connectivity gap-filling** against template models. +- **Omics integration** — Human Protein Atlas (proteomics + RNA-seq) ingestion. +- **Sub-cellular localisation** prediction by MILP, with partial-update mode and pluggable + predictors (WoLF PSORT, DeepLoc, …). +- **N-model comparison**; **reporter metabolites**; **FSEOF**; **flux sampling**. +- **YAML I/O** following the cobra standard, plus geckopy's `ec-*` enzyme-constrained + fields; **SIF** export; **RAVEN-style Excel** export. + +:::{admonition} Design principle +:class: tip + +The canonical in-memory object is always a {class}`cobra.Model`. 
There is no parallel +RAVEN struct and no `ravenCobraWrapper`-style adapter — RAVEN-specific fields that cobra +doesn't model natively live in cobra's `annotation` / `notes` dictionaries. This keeps +raven-python interoperable with the wider COBRA ecosystem. +::: + +## Where to start + +::::{grid} 1 1 2 2 +:gutter: 3 + +:::{grid-item-card} 🚀 Quickstart +:link: guide/quickstart +:link-type: doc + +Install, load a model, and run your first reconstruction / ftINIT extraction. +::: + +:::{grid-item-card} 🧭 Coming from MATLAB RAVEN? +:link: reference/migration +:link-type: doc + +The function-by-function map from RAVEN to raven-python (and cobrapy). +::: + +:::{grid-item-card} 📚 User guide +:link: guide/index +:link-type: doc + +Task-oriented how-tos for each capability — reconstruction, tINIT, tasks, omics, … +::: + +:::{grid-item-card} 🔍 API reference +:link: reference/api/index +:link-type: doc + +Every public function and class, generated from the docstrings. +::: +:::: + +## Status + +raven-python has been validated against MATLAB RAVEN on **Human-GEM** (5 Hart2015 cell-line +models, Jaccard 0.975–0.980 — see [the Human-GEM validation study](studies/humangem_validation.md)). +The functional scope of the original toolbox is covered, with two principled omissions: +**MetaCyc-based reconstruction** (flagged for removal from MATLAB RAVEN too) and **dynamic +FBA** (well covered by other maintained Python packages). What's still open is catalogued in +[the to-do list](reference/todo.md). 
+ +```{toctree} +:hidden: +:caption: Getting started + +guide/quickstart +installation +``` + +```{toctree} +:hidden: +:caption: User guide + +guide/index +``` + +```{toctree} +:hidden: +:caption: Reference + +reference/index +``` + +```{toctree} +:hidden: +:caption: Studies & validation + +studies/index +``` + +```{toctree} +:hidden: +:caption: Maintenance + +maintenance/index +``` + +```{toctree} +:hidden: +:caption: Project + +reference/changelog +reference/todo +reference/known_issues +``` + +```{toctree} +:hidden: +:caption: Design archive + +archive/ftinit_review_and_plan +archive/localization_design +archive/plan_get_model_from_homology +``` diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000..6be617a --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,41 @@ +# Installation + +raven-python requires **Python ≥ 3.11**. + +## From a source checkout (development) + +```bash +git clone https://github.com/SysBioChalmers/raven-python +cd raven-python +pip install -e ".[dev]" +``` + +## Optional extras + +The core install pulls in cobra, numpy, pandas, scipy, ruamel.yaml, requests and tqdm. +Some features need extra packages, exposed as optional extras: + +| Extra | Pulls in | Needed for | +| --- | --- | --- | +| `excel` | `openpyxl` | {func}`raven_python.io.export_to_excel` | +| `plotting` | `matplotlib` | visualisation (Phase 6, in progress) | +| `dev` | `pytest`, `pytest-cov`, `ruff` | running the test-suite / linting | + +```bash +pip install -e ".[excel,plotting,dev]" +``` + +## Solvers + +Linear and most MILP work runs on the open-source **GLPK** that ships with cobra. +Genome-scale **(f)tINIT** MILPs currently require **Gurobi** for tractable solve times — +see the [solver benchmark](studies/init_solver_benchmark.md) for the Gurobi vs HiGHS vs GLPK +comparison and portability notes. + +## External binaries + +Homology and KEGG-HMM reconstruction shell out to **BLAST+ / DIAMOND / HMMER**. 
These are +not Python packages; raven-python resolves them from a version-pinned, SHA256-verified +release registry (see {mod}`raven_python.binaries`). The +[binary maintenance guide](maintenance/maintaining_binaries.md) covers building and +publishing those release ZIPs. diff --git a/docs/maintenance/index.md b/docs/maintenance/index.md new file mode 100644 index 0000000..a382f42 --- /dev/null +++ b/docs/maintenance/index.md @@ -0,0 +1,18 @@ +# Maintenance + +Maintainer-facing documentation: the layout of the published data bundles and how to +rebuild and release them. + +- **[KEGG data format](kegg_data_format.md)** — layout of the KEGG artefact bundle. +- **[Maintaining KEGG data](maintaining_kegg_data.md)** — building and publishing the KEGG + artefact releases. +- **[Maintaining binaries](maintaining_binaries.md)** — building and publishing the + external-binary (BLAST / DIAMOND / HMMER) ZIP releases. + +```{toctree} +:hidden: + +kegg_data_format +maintaining_kegg_data +maintaining_binaries +``` diff --git a/docs/kegg_data_format.md b/docs/maintenance/kegg_data_format.md similarity index 100% rename from docs/kegg_data_format.md rename to docs/maintenance/kegg_data_format.md diff --git a/docs/maintaining_binaries.md b/docs/maintenance/maintaining_binaries.md similarity index 98% rename from docs/maintaining_binaries.md rename to docs/maintenance/maintaining_binaries.md index df5b315..85689c0 100644 --- a/docs/maintaining_binaries.md +++ b/docs/maintenance/maintaining_binaries.md @@ -211,7 +211,7 @@ Implications: After building the per-platform ZIPs (named `---.zip`) and uploading them to the release, generate the `_REGISTRY` entry — checksums and -URLs — with [`scripts/make_registry_snippet.py`](../scripts/README.md): +URLs — with [`scripts/make_registry_snippet.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md): ```bash python scripts/make_registry_snippet.py binary --bundle blast --version 2.16.0 \ diff --git 
a/docs/maintaining_kegg_data.md b/docs/maintenance/maintaining_kegg_data.md similarity index 96% rename from docs/maintaining_kegg_data.md rename to docs/maintenance/maintaining_kegg_data.md index f53d0da..356b574 100644 --- a/docs/maintaining_kegg_data.md +++ b/docs/maintenance/maintaining_kegg_data.md @@ -119,7 +119,7 @@ and tables. ## Building and publishing in one go -[`scripts/build_kegg_artefacts.py`](../scripts/README.md) runs 3b.2 (+ 3b.3 with +[`scripts/build_kegg_artefacts.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md) runs 3b.2 (+ 3b.3 with `--hmms`) and lays the output out as publishable assets (`.hmm` named for `ensure_kegg_hmm_library`): @@ -128,7 +128,7 @@ python scripts/build_kegg_artefacts.py --keggdb keggdb --out artefacts --hmms -- ``` Upload the contents of `artefacts/` to a release, then emit the registry entry for -`raven_python.data._DATA_REGISTRY` with [`scripts/make_registry_snippet.py`](../scripts/README.md): +`raven_python.data._DATA_REGISTRY` with [`scripts/make_registry_snippet.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/README.md): ```bash python scripts/make_registry_snippet.py data --dataset kegg --version kegg116 \ diff --git a/docs/raven_migration.md b/docs/raven_migration.md deleted file mode 100644 index 6de2067..0000000 --- a/docs/raven_migration.md +++ /dev/null @@ -1,120 +0,0 @@ -# RAVEN → raven-python migration reference - -A function-by-function map from the MATLAB RAVEN Toolbox to raven-python (and cobrapy where -appropriate). For each RAVEN function we record one of four outcomes: - -* ✅ **ported** — there's a direct raven-python replacement; see the link. -* 🗒️ **cheatsheet** — cobrapy already covers it; a one-liner or short idiom does the job - (recorded under each row). -* ⛔ **not ported** — explicit decision not to bring it across, with rationale. -* 🆕 **new in raven-python** — functionality raven-python adds that has no RAVEN counterpart. 
- -For raven-python's deliberate improvements over RAVEN (and which of them are candidates to -upstream into MATLAB RAVEN), see [IMPROVEMENTS.md](../IMPROVEMENTS.md). - -## Design principle - -The in-memory object is always a [`cobra.Model`](https://cobrapy.readthedocs.io). There is -no parallel RAVEN struct, no `ravenCobraWrapper`-style adapter. RAVEN fields that cobra -doesn't model natively (`rxnMiriams`, `metDeltaG`, `rxnConfidenceScores`, …) live in -cobra's `annotation` / `notes` dictionaries. raven-python's YAML I/O follows the cobra YAML -standard plus the geckopy enzyme-constrained extension, so ecModels round-trip. - ---- - -## Foundation: utilities, manipulation, I/O - -| RAVEN | raven-python | Notes | -|---|---|---| -| `addRxns` (equations) | ✅ [`manipulation.add_reactions_from_equations`](../src/raven_python/manipulation/add.py) | The keystone: equation-string → reactions, matching mets by id, name, or `name[comp]`; strict / auto policies for new mets and genes. | -| `addTransport` | ✅ [`manipulation.add_transport_reactions`](../src/raven_python/manipulation/transport.py) | Transport across compartments, matching mets by name, sequential `tr_NNNN` ids. cobra has no transport primitive. | -| `addRxnsGenesMets` | ✅ [`manipulation.add_reactions_from_model`](../src/raven_python/manipulation/transfer.py) | Copy reactions from a source model, matching mets by `name[comp]` (vs cobra's strict-by-id merge). | -| `mergeModels` | ✅ [`manipulation.merge_models`](../src/raven_python/manipulation/merge.py) | N-model merge, unify mets by `name[comp]` (or id), keep all reactions (id collisions renamed). cobra's `merge` is pairwise / strict-by-id. | -| `changeRxns`, `changeGrRules` | ✅ [`manipulation.change_reaction_equations`](../src/raven_python/manipulation/change.py), `change_gene_reaction_rules` | Stoichiometry change in place; batch GPR set/append (`(old) or (new)`). 
| -| `setParam('var', …)` | ✅ [`manipulation.set_variance_bounds`](../src/raven_python/manipulation/parameters.py) | ±% band around measured values. **Other modes (lb/ub/eq/obj/unc)**: 🗒️ cobra one-liners (`reaction.bounds`, `model.objective`, `Configuration().bounds`). | -| `setExchangeBounds` | ⛔ not ported | cobra's `model.medium = {ex_id: uptake}` covers it. | -| `simplifyModel` (gap modes) | ✅ [`manipulation.remove_dead_end_reactions`](../src/raven_python/manipulation/simplify.py), `remove_duplicate_reactions`, `constrain_reversible_reactions`, `group_linear_reactions` | Cobra-covered modes (no-flux→`find_blocked_reactions`, zero-interval, unconstrained) are 🗒️ cheatsheeted. `group_linear` is lossy (drops genes), per RAVEN. | -| `removeMets`, `removeGenes` | ✅ [`manipulation.remove_metabolites`](../src/raven_python/manipulation/remove.py), `remove_genes` | Delegate to cobra; add `by_name` cross-compartment deletion (mets) and a `blocked_reactions` policy (genes). `removeReactions` itself ⛔ — cobra's `remove_reactions` covers it. | -| `convertToIrrev` | ✅ [`manipulation.convert_to_irreversible`](../src/raven_python/manipulation/irreversible.py) | Splits reversible non-exchange reactions into forward + `_REV`. Adopted from geckopy. | -| `expandModel` | ✅ [`manipulation.expand_model`](../src/raven_python/manipulation/expand.py) | Splits OR-GPR (isozyme) reactions into one per AND-clause. Adopted from geckopy. | -| `mergeCompartments` | ✅ [`manipulation.merge_compartments`](../src/raven_python/manipulation/compartments.py) | Collapse a multi-compartment model into one; deduplicate identical reactions; optionally drop one-met collapses (`drop_single_metabolite_reactions`). | -| `copyToComps` | ✅ [`manipulation.copy_to_compartment`](../src/raven_python/manipulation/compartments.py) | Duplicate reactions into a target compartment (idempotent; `delete_original=True` makes it a move). 
| -| `mapCompartments` | ⛔ not ported | Overlaps with `comparison.compare_models` on the reaction-id intersection. | -| `getElementalBalance` | ✅ [`utils.get_elemental_balance`](../src/raven_python/utils/balance.py) | Graded `balanced` / `unbalanced` / `unknown` — `unknown` catches a missing formula that cobra's `check_mass_balance` silently miscounts. | -| `checkModelStruct` (curation subset) | ✅ [`utils.check_model`](../src/raven_python/utils/validate.py) | Structured curation report. RAVEN's struct/type checks are moot in cobra. | -| `is_dnf` / GPR check (from `standardizeGrRules`) | ✅ [`utils.is_dnf`](../src/raven_python/utils/gpr.py), `find_non_dnf_grrules` | Lint-only half; cobra auto-normalises GPRs on assignment, so the rewriting half isn't ported. | -| `getIndexes` (`metcomps` sliver) | ✅ [`utils.parse_name_comp`](../src/raven_python/utils/parse.py) | The only `getIndexes` bit cobra doesn't already cover. | -| `editMiriam`, `extractMiriam` | ⛔ not ported | cobra's `.annotation` is already a `{namespace: id(s)}` dict — read/write it directly. | -| `getRxnsInComp`, `getMetsInComp` | ⛔ not ported | One-liners over cobra's `reaction.compartments` / `metabolite.compartment`. | -| `constructEquations` | ⛔ not ported | `reaction.build_reaction_string(use_metabolite_names=...)` already does both id and name equations. | -| `sortIdentifiers` | ✅ [`utils.sort_identifiers`](../src/raven_python/utils/sort.py) | Model-wide alphabetical sort; also via `sort_ids=` on `write_yaml_model`. | - -### I/O - -| RAVEN | raven-python | Notes | -|---|---|---| -| `readYAMLmodel`, `writeYAMLmodel` | ✅ [`io.read_yaml_model`, `write_yaml_model`](../src/raven_python/io/yaml.py) | Aligned to cobra's `!!omap` writer (RAVEN `fa281a1`). Adds the RAVEN-only top-level per-entry keys (inchis/deltaG/metFrom/notes, confidence_score/references/rxnFrom/deltaG, protein) into `.notes`, plus `version`/`metaData`/GECKO `ec-*`. cobra-readable output verified. 
| -| `exportModelToSIF` | ✅ [`io.export_model_to_sif`](../src/raven_python/io/sif.py) | Cytoscape SIF (`rc`/`rr`/`cc` graphs). | -| `exportToExcelFormat` (export only) | ✅ [`io.export_to_excel`](../src/raven_python/io/excel.py) | RAVEN 5-sheet xlsx (RXNS / METS / COMPS / GENES / MODEL). Excel **import** is intentionally excluded. | -| `exportForGit` | ✅ [`io.export_for_git`](../src/raven_python/io/git.py) | Standard-GEM repo layout (`model//…`). | -| `importYAML/SBML/Mat/Excel` | 🗒️ cobra's standard readers | `cobra.io.read_sbml_model` / `load_json_model` / etc.; Excel import not ported. | - -## Reconstruction - -| RAVEN | raven-python | Notes | -|---|---|---| -| `getModelFromHomology` + `getBlast`/`getDiamond` | ✅ [`reconstruction.homology.get_model_from_homology`](../src/raven_python/reconstruction/homology/homology.py), `run_blast`, `run_diamond`, `blast_from_table` | Core homology reconstruction with structured improvements (bidirectional / best-hits-only, AST GPR rewrite, complex policy, bitscore best-hits, DataFrame ortholog map). | -| KEGG download → species model (5 steps) | ✅ [`reconstruction.kegg.*`](../src/raven_python/reconstruction/kegg/) | All five steps: `download.fetch_keggdb`, `parse.read_kegg_table` + reference model, `hmm.build_libraries`, `organism.build_kegg_model_for_organism` (no-FASTA), `query.assign_kos` + `run_hmmscan`. | -| `getPhylDist` | ⛔ not ported | Fixed prok90/euk90 libraries make the distance matrix moot. | -| `getMetaCycModelForOrganism` | ⛔ not ported (and **flagged for removal from MATLAB RAVEN**) | BLAST-to-single-representatives is low-precision at every cutoff. See [IMPROVEMENTS.md](../IMPROVEMENTS.md) under `R-MetaCyc`. 
| - -## Tasks, gap-filling, INIT, ftINIT - -| RAVEN | raven-python | Notes | -|---|---|---| -| `parseTaskList`, `checkTasks` | ✅ [`tasks.parse_task_list`](../src/raven_python/tasks/tasklist.py), `check_tasks` ([tasks/check.py](../src/raven_python/tasks/check.py)) | `check_tasks` reuses one model across the task list (no per-task model copy) — at genome scale ~12× faster than the copy-per-task implementation it replaced. | -| `fillGaps` (connectivity mode) | ✅ [`gapfilling.connect_blocked_reactions`](../src/raven_python/gapfilling/fill.py) | MILP (min penalty-weighted template reactions s.t. blocked reactions carry flux). Targeted mode → 🗒️ `cobra.gapfill`. | -| `runINIT`, `scoreComplexModel`, `getINITModel` | ✅ [`init.run_init`](../src/raven_python/init/init.py), [`init.score_reactions_from_genes`, `gene_scores_from_expression`](../src/raven_python/init/score.py), [`init.get_init_model`](../src/raven_python/init/build.py) | INIT MILP rewritten in optlang (no sparse-matrix construction); RNA-seq scoring is `5·ln(level/ref)`-clamped. | -| `ftINIT`, `prepINITModel`, `ftINITInternalAlg`, `getINITSteps` | ✅ [`init.ftinit`](../src/raven_python/init/ftinit.py), [`init.prep_init_model`](../src/raven_python/init/prep.py), [`init.run_ftinit`](../src/raven_python/init/ftinit.py), [`init.get_init_steps`](../src/raven_python/init/steps.py) | Staged MILP + linear merge + scaling (`rescaleModelForINIT`). Validated against RAVEN on Human-GEM (Jaccard 0.975–0.980; see [humangem_validation.md](humangem_validation.md)). Metabolomics-based scoring is the one piece not yet implemented (raises `NotImplementedError`). | -| `ftINITFillGaps`, `ftINITFillGapsMILP`, `ftINITFillGapsForAllTasks` | ✅ [`init.fill_tasks`](../src/raven_python/init/taskfill.py) | Task-aware gap-filling within ftINIT; in-place `_feasible` check + bounded fill MILP (`mip_gap`, `time_limit`). 
| -| `mergeLinear` | ✅ [`init.merge_linear`](../src/raven_python/init/merge.py) | Linear merge of unit-stoichiometry chains; bookkeeping (`group_ids`, `reversed_rxns`) to map back to the reference model. | -| `removeLowScoreGenes` | ✅ [`init.remove_low_score_genes`](../src/raven_python/init/genes.py) | Final gene-prune step of ftINIT. | -| `fitTasks` | ⛔ not ported | Niche; tasks are normally consumed via `checkTasks` / ftINIT's task layer. | - -## Omics, analysis, comparison - -| RAVEN | raven-python | Notes | -|---|---|---| -| `parseHPA`, `parseHPArna`, `scoreModel` | ✅ [`omics.parse_hpa`, `parse_hpa_rna`, `hpa_gene_scores`, `rna_gene_scores`](../src/raven_python/omics/hpa.py) | Pandas-tidy DataFrames; scoring adapters reuse `score_reactions_from_genes` (single source of truth for the GPR walk). | -| `reporterMetabolites` | ✅ [`analysis.reporter_metabolites`](../src/raven_python/analysis/reporter.py) | Exact closed-form background replaces RAVEN's Monte-Carlo (RM1 in IMPROVEMENTS). | -| `FSEOF` | ✅ [`analysis.fseof`](../src/raven_python/analysis/fseof.py) | Regression slope + correlation, amplify/knockdown/knockout classes, gene aggregation (FS1–FS4 in IMPROVEMENTS). | -| `randomSampling` | ✅ [`analysis.sample`](../src/raven_python/analysis/sampling.py) | Wraps cobra's flux sampling. | -| `compareMultipleModels` | ✅ [`comparison.compare_models`](../src/raven_python/comparison/compare.py) | Tidy DataFrames (reactions / mets / genes / subsystems presence + pairwise Jaccard + optional `check_tasks` pass/fail). Plotting and tSNE/MDS are 🗒️ one-liners in seaborn/scikit-learn; intentionally not in the function. | -| `runDynamicFBA` | ⛔ not ported | Established Python implementations exist: [`dfba`](https://pypi.org/project/dfba/) (Pinheiro et al.; CVODES-backed), [`reframed`](https://pypi.org/project/reframed/) (Machado lab), [`mewpy`](https://pypi.org/project/mewpy/) (Cunha lab). Cobrapy itself has none, but re-porting would duplicate maintained prior art. 
| - -## Localisation (Phase 7) - -| RAVEN | raven-python | Notes | -|---|---|---| -| `predictLocalization` | ✅ [`localization.predict_localization`](../src/raven_python/localization/predict.py) | Deterministic MILP (not simulated annealing). Caller-passed `reactions_to_relocate` set (everything else pinned). Multi-compartment by default: primary "free", extras pay `multi_compartment_penalty`. Tolerates incomplete models (no silent reaction removal). `apply=False` returns a `LocalizationProposal` diff. Real-data validation against curated yeast-GEM in [yeast_localization_benchmark.md](yeast_localization_benchmark.md). | -| `getWoLFScores`, `parseScores('wolf')` | ✅ [`localization.load_wolfpsort`](../src/raven_python/localization/scores.py) | Parses WoLF PSORT summary output (RAVEN-compatible); row-normalised. Does not shell out to the WoLF PSORT binary — run that separately and feed in the output. | -| `parseScores('deeploc')` | ✅ [`localization.load_deeploc`](../src/raven_python/localization/scores.py) | DeepLoc 2 per-protein CSV (Protein_ID / Localizations / Signals + one column per compartment). | - -## Things deliberately not ported - -* **`ravenCobraWrapper` / RAVEN struct adapter** — cobra is the canonical object; no parallel struct. -* **`checkModelStruct` struct/type checks** — moot in cobra. -* **`runDynamicFBA`** — see Omics/analysis row. -* **`getMetaCycModelForOrganism`** — see Reconstruction row; flagged for upstream removal. -* **`getPhylDist`** — fixed prok90/euk90 libraries make it moot. -* **`mapCompartments`** — overlaps with `compare_models`. -* **`editMiriam`, `extractMiriam`, `getRxnsInComp`, `getMetsInComp`, `constructEquations`, `getIndexes`** (most), **`setExchangeBounds`**, **most `setParam` modes**, **`getBlastFromExcel` Excel branch** — cobra one-liners; recorded above. 
- -## "New in raven-python" entry points - -These are not direct RAVEN ports: - -* [`comparison.compare_models`](../src/raven_python/comparison/compare.py) — already in RAVEN, but the raven-python version returns tidy DataFrames suitable for downstream analysis (RAVEN's version embeds heatmap/tSNE plotting). -* [`reconstruction.homology.run_blast` / `run_diamond` / `blast_from_table`](../src/raven_python/reconstruction/homology/blast.py) — generic subprocess wrappers; a DataFrame is the canonical "hits" object (no `blastStructure` struct). -* [`binaries.resolve_binary`, `ensure_binary`](../src/raven_python/binaries.py) — version-pinned release-ZIP registry for external tools (BLAST/DIAMOND/HMMER), SHA256-verified. -* [`localization.LocalizationProposal`](../src/raven_python/localization/predict.py) — the diff-preview mode (`apply=False`). diff --git a/docs/reference/api/analysis.md b/docs/reference/api/analysis.md new file mode 100644 index 0000000..187f874 --- /dev/null +++ b/docs/reference/api/analysis.md @@ -0,0 +1,10 @@ +# `raven_python.analysis` + +Analyses not in cobrapy's core: Reporter Metabolites (an around-metabolite gene-score test), +FSEOF (Flux Scanning based on Enforced Objective Flux), and random-objective flux sampling. + +```{eval-rst} +.. automodule:: raven_python.analysis + :members: + :imported-members: +``` diff --git a/docs/reference/api/comparison.md b/docs/reference/api/comparison.md new file mode 100644 index 0000000..36c0616 --- /dev/null +++ b/docs/reference/api/comparison.md @@ -0,0 +1,11 @@ +# `raven_python.comparison` + +Structural and functional comparison across multiple models — tidy DataFrames of +reaction / metabolite / gene / subsystem presence, pairwise Jaccard, and an optional +`check_tasks` pass/fail matrix. + +```{eval-rst} +.. 
automodule:: raven_python.comparison + :members: + :imported-members: +``` diff --git a/docs/reference/api/gapfilling.md b/docs/reference/api/gapfilling.md new file mode 100644 index 0000000..a853b1e --- /dev/null +++ b/docs/reference/api/gapfilling.md @@ -0,0 +1,11 @@ +# `raven_python.gapfilling` + +Connectivity gap-filling against template models: add the fewest (lowest-penalty) template +reactions so reactions blocked in a draft can carry flux. (For fill-until-feasible gap-fill, +use {func}`cobra.flux_analysis.gapfill`.) + +```{eval-rst} +.. automodule:: raven_python.gapfilling + :members: + :imported-members: +``` diff --git a/docs/reference/api/index.md b/docs/reference/api/index.md new file mode 100644 index 0000000..b3945b0 --- /dev/null +++ b/docs/reference/api/index.md @@ -0,0 +1,30 @@ +# API reference + +Generated from the package's NumPy-style docstrings. There is one page per subpackage; each +documents that subpackage's public API (its `__all__`) — the full signature and parameter +docs for every public function and class. + +The canonical object throughout is {class}`cobra.Model`; raven-python adds the +RAVEN-specific operations on top of it rather than introducing a parallel model type. + +```{toctree} +:maxdepth: 1 + +io +manipulation +utils +reconstruction +init +tasks +gapfilling +omics +localization +analysis +comparison +resolvers +``` + +:::{note} +The `plotting` subpackage is a stub — pathway-map and omics-overlay visualisation +(Phase 6) is not yet implemented. +::: diff --git a/docs/reference/api/init.md b/docs/reference/api/init.md new file mode 100644 index 0000000..cd70f8b --- /dev/null +++ b/docs/reference/api/init.md @@ -0,0 +1,10 @@ +# `raven_python.init` + +Context-specific model extraction (tINIT / ftINIT): the classic INIT MILP, the staged +ftINIT MILP, gene→reaction scoring, task-aware gap-filling, and the linear-merge reduction. + +```{eval-rst} +.. 
automodule:: raven_python.init + :members: + :imported-members: +``` diff --git a/docs/reference/api/io.md b/docs/reference/api/io.md new file mode 100644 index 0000000..cbe7f7a --- /dev/null +++ b/docs/reference/api/io.md @@ -0,0 +1,10 @@ +# `raven_python.io` + +RAVEN-specific I/O: YAML (cobra + Metabolic Atlas / Human-GEM extensions), SIF, Excel +export, and the Standard-GEM `model//…` git layout. + +```{eval-rst} +.. automodule:: raven_python.io + :members: + :imported-members: +``` diff --git a/docs/reference/api/localization.md b/docs/reference/api/localization.md new file mode 100644 index 0000000..1eaddcb --- /dev/null +++ b/docs/reference/api/localization.md @@ -0,0 +1,12 @@ +# `raven_python.localization` + +Sub-cellular localisation — predictor-agnostic and partial-update friendly. +{func}`raven_python.localization.predict_localization` is the MILP entry point; the loaders +parse predictor outputs (WoLF PSORT, DeepLoc 2) into the `gene × compartment` score frame the +algorithm consumes. + +```{eval-rst} +.. automodule:: raven_python.localization + :members: + :imported-members: +``` diff --git a/docs/reference/api/manipulation.md b/docs/reference/api/manipulation.md new file mode 100644 index 0000000..f7ba63f --- /dev/null +++ b/docs/reference/api/manipulation.md @@ -0,0 +1,18 @@ +# `raven_python.manipulation` + +Generic {class}`cobra.Model` structural transforms that cobrapy does not cover cleanly: +reaction building from equations, batch GPR / bound changes, irreversibility splitting, +isozyme expansion, compartment merge / copy, and model merging by name. + +```{eval-rst} +.. automodule:: raven_python.manipulation + :members: + :imported-members: +``` + +## Compartment helpers + +```{eval-rst} +.. 
automodule:: raven_python.manipulation.compartments + :members: +``` diff --git a/docs/reference/api/omics.md b/docs/reference/api/omics.md new file mode 100644 index 0000000..8ea784f --- /dev/null +++ b/docs/reference/api/omics.md @@ -0,0 +1,10 @@ +# `raven_python.omics` + +Omics integration — Human Protein Atlas (HPA) proteomics + RNA-seq parsing and gene-scoring +adapters. The entry point for tissue-specific (f)tINIT runs. + +```{eval-rst} +.. automodule:: raven_python.omics + :members: + :imported-members: +``` diff --git a/docs/reference/api/reconstruction.md b/docs/reference/api/reconstruction.md new file mode 100644 index 0000000..17ac825 --- /dev/null +++ b/docs/reference/api/reconstruction.md @@ -0,0 +1,24 @@ +# `raven_python.reconstruction` + +De novo reconstruction from KEGG and protein homology (BLAST / DIAMOND). + +## Homology + +Homology-based reconstruction from template models (`getModelFromHomology`, BLAST / DIAMOND). + +```{eval-rst} +.. automodule:: raven_python.reconstruction.homology + :members: + :imported-members: +``` + +## KEGG + +KEGG-based draft reconstruction (`getKEGGModelForOrganism` and friends): download → dump +parsing → HMM libraries (maintainer build steps), then the runtime model for a KEGG species. + +```{eval-rst} +.. automodule:: raven_python.reconstruction.kegg + :members: + :imported-members: +``` diff --git a/docs/reference/api/resolvers.md b/docs/reference/api/resolvers.md new file mode 100644 index 0000000..3266fcd --- /dev/null +++ b/docs/reference/api/resolvers.md @@ -0,0 +1,22 @@ +# Resource resolvers + +Version-pinned, SHA256-verified registries that fetch the external resources raven-python +depends on but does not vendor. + +## `raven_python.binaries` + +External-tool resolver for BLAST+ / DIAMOND / HMMER (release-ZIP registry). + +```{eval-rst} +.. automodule:: raven_python.binaries + :members: +``` + +## `raven_python.data` + +Data-bundle resolver (KEGG artefacts and template-model data). + +```{eval-rst} +.. 
automodule:: raven_python.data + :members: +``` diff --git a/docs/reference/api/tasks.md b/docs/reference/api/tasks.md new file mode 100644 index 0000000..168fa9e --- /dev/null +++ b/docs/reference/api/tasks.md @@ -0,0 +1,11 @@ +# `raven_python.tasks` + +Metabolic task definition, parsing, and checking — the task-list file format, running tasks +against a model, and finding the reactions a model must use to satisfy a task list (the input +for (f)tINIT's task layer). + +```{eval-rst} +.. automodule:: raven_python.tasks + :members: + :imported-members: +``` diff --git a/docs/reference/api/utils.md b/docs/reference/api/utils.md new file mode 100644 index 0000000..fb68816 --- /dev/null +++ b/docs/reference/api/utils.md @@ -0,0 +1,16 @@ +# `raven_python.utils` + +Shared helpers — GPR linting, elemental balance, model curation checks, id sorting. + +```{eval-rst} +.. automodule:: raven_python.utils + :members: + :imported-members: +``` + +## Name / compartment parsing + +```{eval-rst} +.. automodule:: raven_python.utils.parse + :members: +``` diff --git a/docs/reference/changelog.md b/docs/reference/changelog.md new file mode 100644 index 0000000..3139cd4 --- /dev/null +++ b/docs/reference/changelog.md @@ -0,0 +1,2 @@ +```{include} ../../CHANGELOG.md +``` diff --git a/docs/reference/improvements.md b/docs/reference/improvements.md new file mode 100644 index 0000000..a50e4ce --- /dev/null +++ b/docs/reference/improvements.md @@ -0,0 +1,2 @@ +```{include} ../../IMPROVEMENTS.md +``` diff --git a/docs/reference/index.md b/docs/reference/index.md new file mode 100644 index 0000000..01b96a6 --- /dev/null +++ b/docs/reference/index.md @@ -0,0 +1,22 @@ +# Reference + +Conceptual and API reference for raven-python. + +- **[RAVEN ↔ raven-python migration map](migration.md)** — the function-by-function map + from MATLAB RAVEN to raven-python (and cobrapy where appropriate). Start here if you're + porting RAVEN code. 
+- **[MATLAB RAVEN back-port proposals](matlab_raven_backports.md)** — improvements + raven-python makes that are candidates to back-port into the MATLAB toolbox. +- **[Improvements over RAVEN](improvements.md)** — the full catalogue of correctness / + ergonomics improvements (the `IMPROVEMENTS.md` master list). +- **[API reference](api/index.md)** — every public function and class, generated from the + docstrings. + +```{toctree} +:hidden: + +migration +matlab_raven_backports +improvements +api/index +``` diff --git a/docs/known_issues.md b/docs/reference/known_issues.md similarity index 99% rename from docs/known_issues.md rename to docs/reference/known_issues.md index 1ed394f..afc31fb 100644 --- a/docs/known_issues.md +++ b/docs/reference/known_issues.md @@ -111,7 +111,7 @@ tests live alongside each fixed function. All five items addressed in the quality-sweep pass (see CHANGELOG). The three docstring/comment items got documentation fixes; the two correctness items (F3, F5) got code fixes plus matching RAVEN-back-port proposals in -[IMPROVEMENTS.md](../IMPROVEMENTS.md) (FS4, B2). +[IMPROVEMENTS.md](improvements.md) (FS4, B2). - ✅ **`init/init.py` — `run_init`:** docstring now spells out the score-0 semantics divergence between classic INIT (score-0 = removable) and ftINIT diff --git a/docs/matlab_raven_backports.md b/docs/reference/matlab_raven_backports.md similarity index 85% rename from docs/matlab_raven_backports.md rename to docs/reference/matlab_raven_backports.md index b281896..0c96f00 100644 --- a/docs/matlab_raven_backports.md +++ b/docs/reference/matlab_raven_backports.md @@ -3,7 +3,7 @@ This is a consolidated list of fixes and improvements that the Python port surfaced and that are worth carrying upstream into MATLAB RAVEN. Each item names a RAVEN file/function, briefly diagnoses the current behaviour, and proposes the minimal -MATLAB-side patch. Items are sourced from [IMPROVEMENTS.md](../IMPROVEMENTS.md) +MATLAB-side patch. 
Items are sourced from [IMPROVEMENTS.md](improvements.md) (the `MATLAB RAVEN 💡` rows) plus two new items from the section-F quality sweep in [docs/known_issues.md](known_issues.md). @@ -17,7 +17,7 @@ fix is identical in spirit; the patch shape just differs. ## `core/getModelFromHomology.m` -Implemented in raventoolbox as [`reconstruction.homology.get_model_from_homology`](../src/raventoolbox/reconstruction/homology/homology.py). +Implemented in raven-python as [`reconstruction.homology.get_model_from_homology`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/homology/homology.py). * **H1** 🧹 — Split the overloaded `strictness` 1/2/3 into two orthogonal options: `bidirectional` (reciprocal hits) and `bestHitsOnly`. Reciprocal @@ -36,7 +36,7 @@ Implemented in raventoolbox as [`reconstruction.homology.get_model_from_homology ## `core/getKEGGModelForOrganism.m` and the KEGG pipeline -Implemented across [`reconstruction.kegg`](../src/raventoolbox/reconstruction/kegg/). +Implemented across [`reconstruction.kegg`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/). * **K1** 🐛 — In the KEGG flat-file parser, read each reaction's equation from its **own `EQUATION` field**, not by matching line *i* of `reaction.lst` @@ -85,11 +85,11 @@ Implemented across [`reconstruction.kegg`](../src/raventoolbox/reconstruction/ke (median true E ≈ 1e-100…1e-155; spurious ≈ 1e-8), silently dropping divergent real hits. At the proposed values, *M. genitalium* gene→KO recall rose from 0.84 → 0.94 (reaction recall 0.87 → 0.97) with no precision loss. - Full numbers in [docs/kegg_hmm_cutoff_calibration.md](kegg_hmm_cutoff_calibration.md). + Full numbers in [docs/kegg_hmm_cutoff_calibration.md](../studies/kegg_hmm_cutoff_calibration.md). ## `core/FSEOF.m` -Implemented in raventoolbox as [`analysis.fseof`](../src/raventoolbox/analysis/fseof.py).
+Implemented in raven-python as [`analysis.fseof`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/analysis/fseof.py). * **FS1** 🐛 — Replace the strict step-by-step monotonicity gate (a target is discarded if any single step's flux fails to exceed the previous) with a @@ -123,7 +123,7 @@ Implemented in raventoolbox as [`analysis.fseof`](../src/raventoolbox/analysis/f ## `core/randomSampling.m` -Implemented in raventoolbox as [`analysis.random_sampling`](../src/raventoolbox/analysis/sampling.py). +Implemented in raven-python as [`analysis.random_sampling`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/analysis/sampling.py). * **SAMP1** ⚡ — Compute `goodRxns` (loop-free, flux-carrying objective candidates) via a **single FVA pass**, not the current per-reaction `parfor` @@ -142,7 +142,7 @@ Implemented in raventoolbox as [`analysis.random_sampling`](../src/raventoolbox/ ## `core/reporterMetabolites.m` -Implemented in raventoolbox as [`analysis.reporter_metabolites`](../src/raventoolbox/analysis/reporter.py). +Implemented in raven-python as [`analysis.reporter_metabolites`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/analysis/reporter.py). * **RM1** ⚡🐛 — Replace the per-neighbour-count Monte Carlo background correction (currently 100 000 random sets drawn *for each distinct @@ -154,7 +154,7 @@ Implemented in raventoolbox as [`analysis.reporter_metabolites`](../src/raventoo ## `core/runINIT.m` -Implemented in raventoolbox as [`init.run_init`](../src/raventoolbox/init/init.py). +Implemented in raven-python as [`init.run_init`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/init.py). * **I4** 🐛 — Drop the hard-coded big-M (1000) in the MILP and use **each reaction's own upper bound** instead: `v ≤ ub · x`.
Expose `eps` and @@ -164,12 +164,12 @@ Implemented in raventoolbox as [`init.run_init`](../src/raventoolbox/init/init.p ## `core/ftINIT.m` and the ftINIT pipeline -Implemented across [`init.ftinit`](../src/raventoolbox/init/ftinit.py), [`init.taskfill`](../src/raventoolbox/init/taskfill.py), [`init.merge`](../src/raventoolbox/init/merge.py), [`init.prep`](../src/raventoolbox/init/prep.py). +Implemented across [`init.ftinit`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/ftinit.py), [`init.taskfill`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/taskfill.py), [`init.merge`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/merge.py), [`init.prep`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/prep.py). * **FT3** 🐛 — Same big-M issue as `runINIT` — use each reaction's own bound as big-M instead of the fixed 100/1000. Expose `force_on` and `force_on_ess` as parameters (the calibrated values for genome-scale Human-GEM are - documented in [docs/init_param_calibration.md](init_param_calibration.md)). + documented in [docs/init_param_calibration.md](../studies/init_param_calibration.md)). * **FT9** 🐛 — Per-reaction essential forcing must be **clamped to the reaction's bound**. The staged pipeline fixes previous-step reactions as essential at `min(0.99 · |prev flux|, 0.1)`; if a low-capacity reaction @@ -186,14 +186,14 @@ Implemented across [`init.ftinit`](../src/raventoolbox/init/ftinit.py), [`init.t ## `core/addRxns.m` -Implemented in raventoolbox as [`manipulation.add_reactions_from_equations`](../src/raventoolbox/manipulation/add.py). +Implemented in raven-python as [`manipulation.add_reactions_from_equations`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/add.py). * **A1** 🧹 — Accept a string keyword (`metsBy = 'id'` / `'name'`) instead of the opaque `eqnType = 1 / 2 / 3` integer.
Call-sites become self-documenting. ## `core/checkModelStruct.m` (or its curation subset) -Implemented in raventoolbox as [`utils.check_model`](../src/raventoolbox/utils/validate.py). +Implemented in raven-python as [`utils.check_model`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/validate.py). * **V2** 🧹 — Return a struct array of issues (one per finding, with `category` / `target` / `message` fields) instead of printing warnings or @@ -201,7 +201,7 @@ Implemented in raventoolbox as [`utils.check_model`](../src/raventoolbox/utils/v ## `core/getElementalBalance.m` -Implemented in raventoolbox as [`utils.get_elemental_balance`](../src/raventoolbox/utils/balance.py). +Implemented in raven-python as [`utils.get_elemental_balance`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/balance.py). * **B2** 🐛 — A reaction with no metabolites (an empty `S(:,j)`) currently falls through to `balanceStatus = 1` ("balanced") because both pre-loops @@ -219,7 +219,7 @@ Implemented in raventoolbox as [`utils.get_elemental_balance`](../src/raventoolb ## `core/findPotentialErrors.m` (GPR linting) -Implemented in raventoolbox as [`utils.find_non_dnf_grrules`](../src/raventoolbox/utils/gpr.py). +Implemented in raven-python as [`utils.find_non_dnf_grrules`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/gpr.py). * **S1** 🧹 — Return the `indexes2check` array (which the function already computes) plus per-reaction reason strings as a struct array, instead of @@ -252,4 +252,4 @@ Already flagged in `IMPROVEMENTS.md` as a removal target on both sides — the MetaCyc reconstruction path is dropped in raventoolbox and proposed for removal from MATLAB RAVEN (`external/metacyc/*`). Pasting here for visibility because the actual removal still needs to land upstream. See -[IMPROVEMENTS.md § R-MetaCyc](../IMPROVEMENTS.md) for the full rationale.
+[IMPROVEMENTS.md § R-MetaCyc](improvements.md) for the full rationale. diff --git a/docs/reference/migration.md b/docs/reference/migration.md new file mode 100644 index 0000000..185b4b8 --- /dev/null +++ b/docs/reference/migration.md @@ -0,0 +1,120 @@ +# RAVEN → raven-python migration reference + +A function-by-function map from the MATLAB RAVEN Toolbox to raven-python (and cobrapy where +appropriate). For each RAVEN function we record one of four outcomes: + +* ✅ **ported** — there's a direct raven-python replacement; see the link. +* 🗒️ **cheatsheet** — cobrapy already covers it; a one-liner or short idiom does the job + (recorded under each row). +* ⛔ **not ported** — explicit decision not to bring it across, with rationale. +* 🆕 **new in raven-python** — functionality raven-python adds that has no RAVEN counterpart. + +For raven-python's deliberate improvements over RAVEN (and which of them are candidates to +upstream into MATLAB RAVEN), see [IMPROVEMENTS.md](improvements.md). + +## Design principle + +The in-memory object is always a [`cobra.Model`](https://cobrapy.readthedocs.io). There is +no parallel RAVEN struct, no `ravenCobraWrapper`-style adapter. RAVEN fields that cobra +doesn't model natively (`rxnMiriams`, `metDeltaG`, `rxnConfidenceScores`, …) live in +cobra's `annotation` / `notes` dictionaries. raven-python's YAML I/O follows the cobra YAML +standard plus the geckopy enzyme-constrained extension, so ecModels round-trip. + +--- + +## Foundation: utilities, manipulation, I/O + +| RAVEN | raven-python | Notes | +|---|---|---| +| `addRxns` (equations) | ✅ [`manipulation.add_reactions_from_equations`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/add.py) | The keystone: equation-string → reactions, matching mets by id, name, or `name[comp]`; strict / auto policies for new mets and genes. 
| +| `addTransport` | ✅ [`manipulation.add_transport_reactions`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/transport.py) | Transport across compartments, matching mets by name, sequential `tr_NNNN` ids. cobra has no transport primitive. | +| `addRxnsGenesMets` | ✅ [`manipulation.add_reactions_from_model`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/transfer.py) | Copy reactions from a source model, matching mets by `name[comp]` (vs cobra's strict-by-id merge). | +| `mergeModels` | ✅ [`manipulation.merge_models`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/merge.py) | N-model merge, unify mets by `name[comp]` (or id), keep all reactions (id collisions renamed). cobra's `merge` is pairwise / strict-by-id. | +| `changeRxns`, `changeGrRules` | ✅ [`manipulation.change_reaction_equations`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/change.py), `change_gene_reaction_rules` | Stoichiometry change in place; batch GPR set/append (`(old) or (new)`). | +| `setParam('var', …)` | ✅ [`manipulation.set_variance_bounds`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/parameters.py) | ±% band around measured values. **Other modes (lb/ub/eq/obj/unc)**: 🗒️ cobra one-liners (`reaction.bounds`, `model.objective`, `Configuration().bounds`). | +| `setExchangeBounds` | ⛔ not ported | cobra's `model.medium = {ex_id: uptake}` covers it. | +| `simplifyModel` (gap modes) | ✅ [`manipulation.remove_dead_end_reactions`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/simplify.py), `remove_duplicate_reactions`, `constrain_reversible_reactions`, `group_linear_reactions` | Cobra-covered modes (no-flux→`find_blocked_reactions`, zero-interval, unconstrained) are 🗒️ cheatsheeted. `group_linear` is lossy (drops genes), per RAVEN. 
| +| `removeMets`, `removeGenes` | ✅ [`manipulation.remove_metabolites`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/remove.py), `remove_genes` | Delegate to cobra; add `by_name` cross-compartment deletion (mets) and a `blocked_reactions` policy (genes). `removeReactions` itself ⛔ — cobra's `remove_reactions` covers it. | +| `convertToIrrev` | ✅ [`manipulation.convert_to_irreversible`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/irreversible.py) | Splits reversible non-exchange reactions into forward + `_REV`. Adopted from geckopy. | +| `expandModel` | ✅ [`manipulation.expand_model`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/expand.py) | Splits OR-GPR (isozyme) reactions into one per AND-clause. Adopted from geckopy. | +| `mergeCompartments` | ✅ [`manipulation.merge_compartments`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/compartments.py) | Collapse a multi-compartment model into one; deduplicate identical reactions; optionally drop one-met collapses (`drop_single_metabolite_reactions`). | +| `copyToComps` | ✅ [`manipulation.copy_to_compartment`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/compartments.py) | Duplicate reactions into a target compartment (idempotent; `delete_original=True` makes it a move). | +| `mapCompartments` | ⛔ not ported | Overlaps with `comparison.compare_models` on the reaction-id intersection. | +| `getElementalBalance` | ✅ [`utils.get_elemental_balance`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/balance.py) | Graded `balanced` / `unbalanced` / `unknown` — `unknown` catches a missing formula that cobra's `check_mass_balance` silently miscounts. 
| +| `checkModelStruct` (curation subset) | ✅ [`utils.check_model`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/validate.py) | Structured curation report. RAVEN's struct/type checks are moot in cobra. | +| `is_dnf` / GPR check (from `standardizeGrRules`) | ✅ [`utils.is_dnf`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/gpr.py), `find_non_dnf_grrules` | Lint-only half; cobra auto-normalises GPRs on assignment, so the rewriting half isn't ported. | +| `getIndexes` (`metcomps` sliver) | ✅ [`utils.parse_name_comp`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/parse.py) | The only `getIndexes` bit cobra doesn't already cover. | +| `editMiriam`, `extractMiriam` | ⛔ not ported | cobra's `.annotation` is already a `{namespace: id(s)}` dict — read/write it directly. | +| `getRxnsInComp`, `getMetsInComp` | ⛔ not ported | One-liners over cobra's `reaction.compartments` / `metabolite.compartment`. | +| `constructEquations` | ⛔ not ported | `reaction.build_reaction_string(use_metabolite_names=...)` already does both id and name equations. | +| `sortIdentifiers` | ✅ [`utils.sort_identifiers`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/utils/sort.py) | Model-wide alphabetical sort; also via `sort_ids=` on `write_yaml_model`. | + +### I/O + +| RAVEN | raven-python | Notes | +|---|---|---| +| `readYAMLmodel`, `writeYAMLmodel` | ✅ [`io.read_yaml_model`, `write_yaml_model`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/io/yaml.py) | Aligned to cobra's `!!omap` writer (RAVEN `fa281a1`). Adds the RAVEN-only top-level per-entry keys (inchis/deltaG/metFrom/notes, confidence_score/references/rxnFrom/deltaG, protein) into `.notes`, plus `version`/`metaData`/GECKO `ec-*`. cobra-readable output verified. 
| +| `exportModelToSIF` | ✅ [`io.export_model_to_sif`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/io/sif.py) | Cytoscape SIF (`rc`/`rr`/`cc` graphs). | +| `exportToExcelFormat` (export only) | ✅ [`io.export_to_excel`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/io/excel.py) | RAVEN 5-sheet xlsx (RXNS / METS / COMPS / GENES / MODEL). Excel **import** is intentionally excluded. | +| `exportForGit` | ✅ [`io.export_for_git`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/io/git.py) | Standard-GEM repo layout (`model//…`). | +| `importYAML/SBML/Mat/Excel` | 🗒️ cobra's standard readers | `cobra.io.read_sbml_model` / `load_json_model` / etc.; Excel import not ported. | + +## Reconstruction + +| RAVEN | raven-python | Notes | +|---|---|---| +| `getModelFromHomology` + `getBlast`/`getDiamond` | ✅ [`reconstruction.homology.get_model_from_homology`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/homology/homology.py), `run_blast`, `run_diamond`, `blast_from_table` | Core homology reconstruction with structured improvements (bidirectional / best-hits-only, AST GPR rewrite, complex policy, bitscore best-hits, DataFrame ortholog map). | +| KEGG download → species model (5 steps) | ✅ [`reconstruction.kegg.*`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/kegg/) | All five steps: `download.fetch_keggdb`, `parse.read_kegg_table` + reference model, `hmm.build_libraries`, `organism.build_kegg_model_for_organism` (no-FASTA), `query.assign_kos` + `run_hmmscan`. | +| `getPhylDist` | ⛔ not ported | Fixed prok90/euk90 libraries make the distance matrix moot. | +| `getMetaCycModelForOrganism` | ⛔ not ported (and **flagged for removal from MATLAB RAVEN**) | BLAST-to-single-representatives is low-precision at every cutoff. See [IMPROVEMENTS.md](improvements.md) under `R-MetaCyc`. 
| + +## Tasks, gap-filling, INIT, ftINIT + +| RAVEN | raven-python | Notes | +|---|---|---| +| `parseTaskList`, `checkTasks` | ✅ [`tasks.parse_task_list`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/tasks/tasklist.py), `check_tasks` ([tasks/check.py](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/tasks/check.py)) | `check_tasks` reuses one model across the task list (no per-task model copy) — at genome scale ~12× faster than the copy-per-task implementation it replaced. | +| `fillGaps` (connectivity mode) | ✅ [`gapfilling.connect_blocked_reactions`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/gapfilling/fill.py) | MILP (min penalty-weighted template reactions s.t. blocked reactions carry flux). Targeted mode → 🗒️ `cobra.gapfill`. | +| `runINIT`, `scoreComplexModel`, `getINITModel` | ✅ [`init.run_init`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/init.py), [`init.score_reactions_from_genes`, `gene_scores_from_expression`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/score.py), [`init.get_init_model`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/build.py) | INIT MILP rewritten in optlang (no sparse-matrix construction); RNA-seq scoring is `5·ln(level/ref)`-clamped. | +| `ftINIT`, `prepINITModel`, `ftINITInternalAlg`, `getINITSteps` | ✅ [`init.ftinit`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/ftinit.py), [`init.prep_init_model`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/prep.py), [`init.run_ftinit`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/ftinit.py), [`init.get_init_steps`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/steps.py) | Staged MILP + linear merge + scaling (`rescaleModelForINIT`). 
Validated against RAVEN on Human-GEM (Jaccard 0.975–0.980; see [humangem_validation.md](../studies/humangem_validation.md)). Metabolomics-based scoring is the one piece not yet implemented (raises `NotImplementedError`). | +| `ftINITFillGaps`, `ftINITFillGapsMILP`, `ftINITFillGapsForAllTasks` | ✅ [`init.fill_tasks`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/taskfill.py) | Task-aware gap-filling within ftINIT; in-place `_feasible` check + bounded fill MILP (`mip_gap`, `time_limit`). | +| `mergeLinear` | ✅ [`init.merge_linear`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/merge.py) | Linear merge of unit-stoichiometry chains; bookkeeping (`group_ids`, `reversed_rxns`) to map back to the reference model. | +| `removeLowScoreGenes` | ✅ [`init.remove_low_score_genes`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/genes.py) | Final gene-prune step of ftINIT. | +| `fitTasks` | ⛔ not ported | Niche; tasks are normally consumed via `checkTasks` / ftINIT's task layer. | + +## Omics, analysis, comparison + +| RAVEN | raven-python | Notes | +|---|---|---| +| `parseHPA`, `parseHPArna`, `scoreModel` | ✅ [`omics.parse_hpa`, `parse_hpa_rna`, `hpa_gene_scores`, `rna_gene_scores`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/omics/hpa.py) | Pandas-tidy DataFrames; scoring adapters reuse `score_reactions_from_genes` (single source of truth for the GPR walk). | +| `reporterMetabolites` | ✅ [`analysis.reporter_metabolites`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/analysis/reporter.py) | Exact closed-form background replaces RAVEN's Monte-Carlo (RM1 in IMPROVEMENTS). | +| `FSEOF` | ✅ [`analysis.fseof`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/analysis/fseof.py) | Regression slope + correlation, amplify/knockdown/knockout classes, gene aggregation (FS1–FS4 in IMPROVEMENTS). 
| +| `randomSampling` | ✅ [`analysis.sample`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/analysis/sampling.py) | Wraps cobra's flux sampling. | +| `compareMultipleModels` | ✅ [`comparison.compare_models`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/comparison/compare.py) | Tidy DataFrames (reactions / mets / genes / subsystems presence + pairwise Jaccard + optional `check_tasks` pass/fail). Plotting and tSNE/MDS are 🗒️ one-liners in seaborn/scikit-learn; intentionally not in the function. | +| `runDynamicFBA` | ⛔ not ported | Established Python implementations exist: [`dfba`](https://pypi.org/project/dfba/) (Pinheiro et al.; CVODES-backed), [`reframed`](https://pypi.org/project/reframed/) (Machado lab), [`mewpy`](https://pypi.org/project/mewpy/) (Cunha lab). Cobrapy itself has none, but re-porting would duplicate maintained prior art. | + +## Localisation (Phase 7) + +| RAVEN | raven-python | Notes | +|---|---|---| +| `predictLocalization` | ✅ [`localization.predict_localization`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/localization/predict.py) | Deterministic MILP (not simulated annealing). Caller-passed `reactions_to_relocate` set (everything else pinned). Multi-compartment by default: primary "free", extras pay `multi_compartment_penalty`. Tolerates incomplete models (no silent reaction removal). `apply=False` returns a `LocalizationProposal` diff. Real-data validation against curated yeast-GEM in [yeast_localization_benchmark.md](../studies/yeast_localization_benchmark.md). | +| `getWoLFScores`, `parseScores('wolf')` | ✅ [`localization.load_wolfpsort`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/localization/scores.py) | Parses WoLF PSORT summary output (RAVEN-compatible); row-normalised. Does not shell out to the WoLF PSORT binary — run that separately and feed in the output. 
| +| `parseScores('deeploc')` | ✅ [`localization.load_deeploc`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/localization/scores.py) | DeepLoc 2 per-protein CSV (Protein_ID / Localizations / Signals + one column per compartment). | + +## Things deliberately not ported + +* **`ravenCobraWrapper` / RAVEN struct adapter** — cobra is the canonical object; no parallel struct. +* **`checkModelStruct` struct/type checks** — moot in cobra. +* **`runDynamicFBA`** — see Omics/analysis row. +* **`getMetaCycModelForOrganism`** — see Reconstruction row; flagged for upstream removal. +* **`getPhylDist`** — fixed prok90/euk90 libraries make it moot. +* **`mapCompartments`** — overlaps with `compare_models`. +* **`editMiriam`, `extractMiriam`, `getRxnsInComp`, `getMetsInComp`, `constructEquations`, `getIndexes`** (most), **`setExchangeBounds`**, **most `setParam` modes**, **`getBlastFromExcel` Excel branch** — cobra one-liners; recorded above. + +## "New in raven-python" entry points + +These are not direct RAVEN ports: + +* [`comparison.compare_models`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/comparison/compare.py) — already in RAVEN, but the raven-python version returns tidy DataFrames suitable for downstream analysis (RAVEN's version embeds heatmap/tSNE plotting). +* [`reconstruction.homology.run_blast` / `run_diamond` / `blast_from_table`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/reconstruction/homology/blast.py) — generic subprocess wrappers; a DataFrame is the canonical "hits" object (no `blastStructure` struct). +* [`binaries.resolve_binary`, `ensure_binary`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/binaries.py) — version-pinned release-ZIP registry for external tools (BLAST/DIAMOND/HMMER), SHA256-verified. 
+* [`localization.LocalizationProposal`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/localization/predict.py) — the diff-preview mode (`apply=False`). diff --git a/docs/todo.md b/docs/reference/todo.md similarity index 74% rename from docs/todo.md rename to docs/reference/todo.md index 29f8571..8c24129 100644 --- a/docs/todo.md +++ b/docs/reference/todo.md @@ -1,7 +1,7 @@ # Open work -What's still on the books. See [raven_migration.md](raven_migration.md) for the function-by- -function port status (most of it is done); see [IMPROVEMENTS.md](../IMPROVEMENTS.md) for the +What's still on the books. See [migration.md](migration.md) for the function-by- +function port status (most of it is done); see [IMPROVEMENTS.md](improvements.md) for the catalogue of raven-python improvements that should also be back-ported into MATLAB RAVEN. ## Major @@ -19,7 +19,7 @@ cobrapy + Escher already covers a lot here — likely a thin integration layer r ### Metabolomics-based scoring in ftINIT -The metabolomics-detected metabolite production-reward block in [`init.ftinit`](../src/raven_python/init/ftinit.py) +The metabolomics-detected metabolite production-reward block in [`init.ftinit`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/init/ftinit.py) currently raises `NotImplementedError` if a non-empty `metabolomics` argument is passed. The linear merge eliminates degree-2 detected metabolites, so it needs producer-group-mapping + negative-producer force-flux constraints — the most intricate MILP piece, for the least-used @@ -28,15 +28,15 @@ input. Worth doing only when a real user request lands. ## Infrastructure * **Binary ZIP releases** for BLAST/DIAMOND (Phase 3a). 
The runtime resolver in - [`binaries.py`](../src/raven_python/binaries.py) is ready; the registry is empty until ZIPs are + [`binaries.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/binaries.py) is ready; the registry is empty until ZIPs are published as GitHub release assets. -* **KEGG data artefact releases.** See [maintaining_kegg_data.md](maintaining_kegg_data.md). +* **KEGG data artefact releases.** See [maintaining_kegg_data.md](../maintenance/maintaining_kegg_data.md). ## Smaller items * [known_issues.md](known_issues.md) — backlog of low-priority edge cases / robustness gaps / dead code from the full-codebase review. None affects correctness on well-formed inputs. -* [IMPROVEMENTS.md](../IMPROVEMENTS.md) — items marked 💡 *proposed* are candidates to +* [IMPROVEMENTS.md](improvements.md) — items marked 💡 *proposed* are candidates to implement (and back-port). ## Upstream blockers (not raven-python work, but worth tracking) @@ -44,5 +44,5 @@ input. Worth doing only when a real user request lands. * `optlang.hybrid_interface.Configuration.clone()` bug — blocks HiGHS at any scale (CI catches it in `tests/test_init_solvers.py`). * GLPK's MIP solve ignores `configuration.timeout` at genome scale — blocks GLPK on large MILPs. -* Both documented in [init_solver_benchmark.md](init_solver_benchmark.md) with concrete fix +* Both documented in [init_solver_benchmark.md](../studies/init_solver_benchmark.md) with concrete fix suggestions. diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..98219c2 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,7 @@ +# Documentation build toolchain (used by ReadTheDocs and `make -C docs html`). +# The raven-python package itself is installed separately (see .readthedocs.yaml). 
+sphinx>=7.2 +furo>=2024.1.29 +myst-parser>=2.0 +sphinx-copybutton>=0.5 +sphinx-design>=0.5 diff --git a/docs/humangem_validation.md b/docs/studies/humangem_validation.md similarity index 100% rename from docs/humangem_validation.md rename to docs/studies/humangem_validation.md diff --git a/docs/studies/index.md b/docs/studies/index.md new file mode 100644 index 0000000..1db5f8e --- /dev/null +++ b/docs/studies/index.md @@ -0,0 +1,26 @@ +# Studies & validation + +Empirical validation runs and parameter calibrations that back raven-python's defaults and +its equivalence claims against MATLAB RAVEN. + +- **[Human-GEM validation](humangem_validation.md)** — raven-python ftINIT vs MATLAB RAVEN + on 5 Hart2015 cell lines (Jaccard 0.975–0.980). +- **[(f)tINIT parameter calibration](init_param_calibration.md)** — clean-data calibration + plus input-robustness study (`mip_gap` / `big_m` / `force_on` / `eps` / `prod_weight` / + scaling sweeps; dropout / noise / downsample robustness). +- **[(f)tINIT solver benchmark](init_solver_benchmark.md)** — Gurobi vs HiGHS vs GLPK on + genome-scale ftINIT. +- **[Yeast localization benchmark](yeast_localization_benchmark.md)** — + `predict_localization` against curated yeast-GEM, with a predictor-noise sweep. +- **[KEGG HMM cut-off calibration](kegg_hmm_cutoff_calibration.md)** — HMM E-value / + score-ratio sensitivity for the KEGG HMM-query reconstruction path. + +```{toctree} +:hidden: + +humangem_validation +init_param_calibration +init_solver_benchmark +yeast_localization_benchmark +kegg_hmm_cutoff_calibration +``` diff --git a/docs/init_param_calibration.md b/docs/studies/init_param_calibration.md similarity index 99% rename from docs/init_param_calibration.md rename to docs/studies/init_param_calibration.md index cc69314..899641a 100644 --- a/docs/init_param_calibration.md +++ b/docs/studies/init_param_calibration.md @@ -322,7 +322,7 @@ choice, is what bridges the gap. ## 3. 
Cross-solver portability See [init_solver_benchmark.md](init_solver_benchmark.md) for the genome-scale -solver comparison (Gurobi/HiGHS/GLPK) and [tests/test_init_solvers.py](../tests/test_init_solvers.py) +solver comparison (Gurobi/HiGHS/GLPK) and [tests/test_init_solvers.py](https://github.com/SysBioChalmers/raven-python/blob/develop/tests/test_init_solvers.py) for CI parameterised over installed MILP backends. Headline: at genome scale only Gurobi is viable today; HiGHS fails on an upstream optlang `hybrid_interface.clone()` bug; GLPK ignores `configuration.timeout` on MIP and ran 1 h+ without converging. Toy-scale diff --git a/docs/init_solver_benchmark.md b/docs/studies/init_solver_benchmark.md similarity index 100% rename from docs/init_solver_benchmark.md rename to docs/studies/init_solver_benchmark.md diff --git a/docs/kegg_hmm_cutoff_calibration.md b/docs/studies/kegg_hmm_cutoff_calibration.md similarity index 98% rename from docs/kegg_hmm_cutoff_calibration.md rename to docs/studies/kegg_hmm_cutoff_calibration.md index 43e3b3e..c2fd106 100644 --- a/docs/kegg_hmm_cutoff_calibration.md +++ b/docs/studies/kegg_hmm_cutoff_calibration.md @@ -29,7 +29,7 @@ in three steps: recall = |pred ∩ truth| / |truth|, F1. Reaction-level: `rxn_rec` = fraction of the organism's true reactions recovered (KO→reaction via `ko_reaction`); `rxn_novel` = predicted reactions **not** in the annotation set. -- Reproduce with [`scripts/analyze_hmm_cutoffs.py`](../scripts/analyze_hmm_cutoffs.py). +- Reproduce with [`scripts/analyze_hmm_cutoffs.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/analyze_hmm_cutoffs.py). 
### Important caveat diff --git a/docs/yeast_localization_benchmark.md b/docs/studies/yeast_localization_benchmark.md similarity index 94% rename from docs/yeast_localization_benchmark.md rename to docs/studies/yeast_localization_benchmark.md index de75566..09aa49e 100644 --- a/docs/yeast_localization_benchmark.md +++ b/docs/studies/yeast_localization_benchmark.md @@ -1,10 +1,10 @@ # yeast-GEM localisation benchmark -Real-data validation of [`localization.predict_localization`](../src/raven_python/localization/predict.py) +Real-data validation of [`localization.predict_localization`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/localization/predict.py) on the curated yeast-GEM. The benchmark is end-to-end — model, scoring, MILP — and sweeps predictor noise so the failure modes are visible, not just the headline accuracy. -* Driver: [`scripts/benchmark_localization_yeast.py`](../scripts/benchmark_localization_yeast.py) +* Driver: [`scripts/benchmark_localization_yeast.py`](https://github.com/SysBioChalmers/raven-python/blob/develop/scripts/benchmark_localization_yeast.py) * Yeast-GEM source: `pcSecYeastSpecies/Model/yeastGEM.xml` (3991 reactions, 1147 genes, 14 compartments). * Run command: @@ -26,7 +26,7 @@ sweeps predictor noise so the failure modes are visible, not just the headline a compartments collapse to 12 placement targets in the truth set (extracellular and the lipid particle / vacuolar membrane variants stay distinct). 2. **Flattening**: the model is squashed into one compartment with - [`manipulation.merge_compartments`](../src/raven_python/manipulation/compartments.py) + [`manipulation.merge_compartments`](https://github.com/SysBioChalmers/raven-python/blob/develop/src/raven_python/manipulation/compartments.py) so the predictor cannot lean on metabolite-topology evidence. Without this step every GPR'd reaction's "predicted" compartment is just its current one — vacuous. 3. 
**Reference scores**: each gene gets `1.0` in every compartment that hosts one of diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py index 2d4b5a2..78ee0ac 100644 --- a/src/raven_python/binaries.py +++ b/src/raven_python/binaries.py @@ -1,6 +1,6 @@ """Locate and provision external command-line binaries (BLAST+, DIAMOND, …). -Shared across tools (not homology-specific). Resolution order for any executable: +Shared across tools (not homology-specific). Resolution order for any executable:: explicit path arg → env var (RAVEN_PYTHON_) → shutil.which (PATH) → ensure_binary (download the version-pinned ZIP from a raven_python release, @@ -8,8 +8,8 @@ → FileNotFoundError with install guidance So a pre-installed/conda binary always wins; the bundled ZIP is the zero-setup -fallback. See docs/maintaining_binaries.md for how the release ZIPs and the -registry are produced and updated. +fallback. See docs/maintenance/maintaining_binaries.md for how the release ZIPs and +the registry are produced and updated. """ from __future__ import annotations diff --git a/src/raven_python/init/__init__.py b/src/raven_python/init/__init__.py index 040f299..8ef24ad 100644 --- a/src/raven_python/init/__init__.py +++ b/src/raven_python/init/__init__.py @@ -1,12 +1,14 @@ """Context-specific model extraction (tINIT / ftINIT). tINIT: + * :func:`run_init` — the classic INIT MILP. * :func:`score_reactions_from_genes` / :func:`gene_scores_from_expression` — gene → reaction scoring (RNA-seq is the common upstream). * :func:`get_init_model` — the tINIT pipeline (dead-end removal + ``run_init``). ftINIT (faster, staged): + * :func:`run_ftinit` — the single-step ftINIT MILP (continuous indicators for positive-score reactions; binaries only on negatives — the speedup over ``run_init``). 
* :func:`ftinit` — the full pipeline (``prep_init_model`` → staged ``run_ftinit`` → diff --git a/src/raven_python/manipulation/add.py b/src/raven_python/manipulation/add.py index 3842297..f5e6495 100644 --- a/src/raven_python/manipulation/add.py +++ b/src/raven_python/manipulation/add.py @@ -253,6 +253,7 @@ def add_reactions_from_equations( new_met_prefix: str = "m", ) -> list[Reaction]: """Add reactions defined by equation strings, matching mets by ID or name. + Parameters ---------- model diff --git a/src/raven_python/manipulation/change.py b/src/raven_python/manipulation/change.py index 78612ba..073fdfd 100644 --- a/src/raven_python/manipulation/change.py +++ b/src/raven_python/manipulation/change.py @@ -33,6 +33,7 @@ def change_reaction_equations( new_met_prefix: str = "m", ) -> list[Reaction]: """Replace the stoichiometry of existing reactions. + Parameters ---------- model diff --git a/src/raven_python/manipulation/merge.py b/src/raven_python/manipulation/merge.py index bfa1f24..805bd80 100644 --- a/src/raven_python/manipulation/merge.py +++ b/src/raven_python/manipulation/merge.py @@ -40,6 +40,7 @@ def merge_models( track_origin: bool = True, ) -> cobra.Model: """Merge models into a single new model. + Parameters ---------- models diff --git a/src/raven_python/manipulation/transfer.py b/src/raven_python/manipulation/transfer.py index b867f02..6534e38 100644 --- a/src/raven_python/manipulation/transfer.py +++ b/src/raven_python/manipulation/transfer.py @@ -34,6 +34,7 @@ def add_reactions_from_model( confidence: int | None = None, ) -> list[Reaction]: """Copy reactions from ``source_model`` into ``model``. 
+ Parameters ---------- model diff --git a/src/raven_python/manipulation/transport.py b/src/raven_python/manipulation/transport.py index d0c1bf1..d222377 100644 --- a/src/raven_python/manipulation/transport.py +++ b/src/raven_python/manipulation/transport.py @@ -66,6 +66,7 @@ def add_transport_reactions( id_prefix: str = "tr_", ) -> list[Reaction]: """Add transport reactions from one compartment to one or more others. + Parameters ---------- from_compartment diff --git a/src/raven_python/utils/balance.py b/src/raven_python/utils/balance.py index ee64ab4..117cb1d 100644 --- a/src/raven_python/utils/balance.py +++ b/src/raven_python/utils/balance.py @@ -42,6 +42,7 @@ def get_elemental_balance( model: cobra.Model, reactions=None ) -> list[ElementalBalance]: """Check whether reactions are elementally balanced. + Parameters ---------- reactions