From 5520991db675b359b8ade1831181817d0fb2b145 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sat, 30 May 2026 01:59:44 +0200 Subject: [PATCH 1/8] Add diff_models, annotation, and conditions modules for yeast-GEM port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lands the upstream-shareable pieces that yeast-GEM has been implementing locally during its Python port (see yeast-GEM/code/python/PORTING_PLAN.md and UPSTREAM_CANDIDATES.md). These are organism-agnostic; yeast-GEM will consume them via a Python dependency on raven-python. New modules ----------- raven_python.comparison.diff diff_models(a, b, ...) -> DiffReport — strict two-model semantic- equality diff. Complements the existing compare_models (N-model presence-matrix overview). Used as a CI gate to verify that two toolchains (e.g. MATLAB RAVEN vs raven_python, pre/post refactor of one toolchain) produce equivalent models. Includes a python -m raven_python.comparison.diff CLI. raven_python.annotation.sbo add_sbo_terms — SBO term assignment with "fill" semantic. Default parameter set reproduces yeast-GEM's behaviour; biomass metabolite names, biomass/NGAM reaction names, and pseudoreaction substrings are overridable. Transport detection is pluggable (default: same- met-name in two compartments). Includes an `only_last_reaction_ for_pseudo` legacy bug-compat flag for yeast-GEM's lock-step migration; off by default for any new caller. raven_python.annotation.delta_g load_delta_g_csv / save_delta_g_csv — generic side-car CSV persistence for scalar notes (ΔG by default, but the note key, column names, and id/value mapping are all configurable). raven_python.conditions.apply apply_condition(model, yaml_or_dict) — generic "apply this YAML condition" loader. Schema: prelude (reset_exchanges), cofactor_pseudoreaction (remove_mets + charge_balance_met), biomass_stoichiometry_delta, per-rxn bounds, expected_uptake_count. Project-specific extensions (e.g. yeast-GEM's amino_acid_ratio) are handled by the caller before/after this function — kept upstream-narrow on purpose. Also exposes set_reaction_bounds helper that bypasses cobra's lb<=ub validator for the (legitimate) cases where a condition lands on an infeasible bound state. Tests ----- 46 new tests across the three modules; full pre-existing raven-python suite still passes (519 passed; 1 unrelated pre-existing openpyxl ImportError in tests/test_io_git.py; 2 skipped). ruff clean. Not in this commit ------------------ The biomass / GAM / chemostat / fit_gam modules are still tracked as upstream candidates in yeast-GEM/code/python/UPSTREAM_CANDIDATES.md and remain local in yeast-GEM until phase 4 of the port (which would ideally land them directly here). --- src/raven_python/annotation/__init__.py | 24 ++ src/raven_python/annotation/delta_g.py | 115 ++++++++++ src/raven_python/annotation/sbo.py | 180 +++++++++++++++ src/raven_python/comparison/__init__.py | 22 +- src/raven_python/comparison/diff.py | 285 ++++++++++++++++++++++++ src/raven_python/conditions/__init__.py | 25 +++ src/raven_python/conditions/apply.py | 186 ++++++++++++++++ tests/test_annotation.py | 204 +++++++++++++++++ tests/test_comparison_diff.py | 146 ++++++++++++ tests/test_conditions.py | 199 +++++++++++++++++ 10 files changed, 1383 insertions(+), 3 deletions(-) create mode 100644 src/raven_python/annotation/__init__.py create mode 100644 src/raven_python/annotation/delta_g.py create mode 100644 src/raven_python/annotation/sbo.py create mode 100644 src/raven_python/comparison/diff.py create mode 100644 src/raven_python/conditions/__init__.py create mode 100644 src/raven_python/conditions/apply.py create mode 100644 tests/test_annotation.py create mode 100644 tests/test_comparison_diff.py create mode 100644 tests/test_conditions.py diff --git a/src/raven_python/annotation/__init__.py b/src/raven_python/annotation/__init__.py new file mode 100644 index 0000000..c2b4c17 --- /dev/null +++ b/src/raven_python/annotation/__init__.py @@ -0,0 +1,24 @@ +"""Annotation helpers — SBO term assignment, ΔG CSV persistence. + +These are the pieces of yeast-GEM's ``missingFields`` module that are +organism-agnostic enough to live upstream. Default parameter values +match the RAVEN/yeast convention so the functions are immediately +useful on the standard layout; consumers with different naming pass +overrides. +""" +from raven_python.annotation.delta_g import load_delta_g_csv, save_delta_g_csv +from raven_python.annotation.sbo import ( + DEFAULT_BIOMASS_MET_NAMES, + DEFAULT_BIOMASS_RXN_NAME, + DEFAULT_NGAM_RXN_NAME, + add_sbo_terms, +) + +__all__ = [ + "DEFAULT_BIOMASS_MET_NAMES", + "DEFAULT_BIOMASS_RXN_NAME", + "DEFAULT_NGAM_RXN_NAME", + "add_sbo_terms", + "load_delta_g_csv", + "save_delta_g_csv", +] diff --git a/src/raven_python/annotation/delta_g.py b/src/raven_python/annotation/delta_g.py new file mode 100644 index 0000000..e0a90e0 --- /dev/null +++ b/src/raven_python/annotation/delta_g.py @@ -0,0 +1,115 @@ +"""Persist scalar annotations (Gibbs free energy, etc.) as a side-car CSV. + +The committed SBML standard has no slot for thermodynamic ΔG values, so +projects like yeast-GEM keep them in a paired CSV instead. This module +provides the generic load / save round-trip — the file format is a two +column table ``, `` — and exposes the per-call schema so +existing projects can configure their conventions. + +The functions operate on cobra ``.notes`` (string round-tripped through +SBML), keeping any other notes untouched. +""" +from __future__ import annotations + +import math +from collections.abc import Iterable +from pathlib import Path + +import cobra +import pandas as pd + + +def load_delta_g_csv( + entities: Iterable, + path: str | Path, + *, + id_column: str = "Var1", + value_column: str = "Var2", + note_key: str = "deltaG", + verbose: bool = False, +) -> int: + """Stamp ``note_key`` on each entity from a CSV of ``id → value``. + + Parameters + ---------- + entities + Cobra entities (``model.metabolites`` or ``model.reactions``). + path + Path to the CSV. + id_column, value_column + Column labels in the CSV. Defaults match the yeast-GEM + convention (``Var1`` / ``Var2`` from MATLAB's ``array2table``). + note_key + Key under which the value is stored on ``entity.notes``. + Default ``"deltaG"``. + verbose + Print a summary of unmatched entity ids. + + Returns + ------- + The number of entities that were stamped (i.e. matched the CSV). + """ + df = pd.read_csv(path) + if id_column not in df.columns or value_column not in df.columns: + raise ValueError( + f"{path} columns {list(df.columns)} do not contain " + f"both {id_column!r} and {value_column!r}" + ) + lookup = dict(zip(df[id_column], df[value_column], strict=True)) + + stamped = 0 + missing: list[str] = [] + for entity in entities: + value = lookup.get(entity.id) + if value is None or (isinstance(value, float) and math.isnan(value)): + missing.append(entity.id) + continue + entity.notes[note_key] = str(value) + stamped += 1 + + if verbose and missing: + print( + f"{len(missing)} entity ids were not in {path} " + f"(e.g. {missing[:3]}); left untouched." + ) + return stamped + + +def save_delta_g_csv( + entities: Iterable, + path: str | Path, + *, + id_column: str = "Var1", + value_column: str = "Var2", + note_key: str = "deltaG", +) -> int: + """Dump ``entity.notes[note_key]`` for each entity to a CSV. + + Entities without ``note_key`` set get ``NaN`` written, preserving + one-row-per-entity ordering (mirrors MATLAB's ``array2table([ids, + values])`` behaviour). + + Returns + ------- + The number of rows written (= number of entities). + """ + rows: list[tuple[str, float]] = [] + for entity in entities: + raw = entity.notes.get(note_key) + if raw is None: + value: float = math.nan + else: + try: + value = float(raw) + except ValueError: + value = math.nan + rows.append((entity.id, value)) + pd.DataFrame(rows, columns=[id_column, value_column]).to_csv(path, index=False) + return len(rows) + + +# Re-export the cobra Model type for type-checker friendliness; helps +# IDEs surface the right hints to callers that hand us model.metabolites +# / model.reactions directly. +__all__ = ["load_delta_g_csv", "save_delta_g_csv"] +_ = cobra # silence "imported but unused" — used for typing context above diff --git a/src/raven_python/annotation/sbo.py b/src/raven_python/annotation/sbo.py new file mode 100644 index 0000000..d0306cb --- /dev/null +++ b/src/raven_python/annotation/sbo.py @@ -0,0 +1,180 @@ +"""SBO term assignment. + +Port of the generic core of yeast-GEM's ``code/missingFields/addSBOterms.m``. +Defaults reproduce the yeast-GEM behaviour; overrides parameterise the +pseudo-metabolite / pseudo-reaction name conventions, the transport +detector, and a yeast-specific bug-compat flag. +""" +from __future__ import annotations + +from collections.abc import Callable, Iterable + +import cobra + +DEFAULT_BIOMASS_MET_NAMES: frozenset[str] = frozenset( + {"biomass", "DNA", "RNA", "protein", "carbohydrate", + "lipid", "cofactor", "ion"} +) +DEFAULT_BIOMASS_RXN_NAME: str = "biomass pseudoreaction" +DEFAULT_NGAM_RXN_NAME: str = "non-growth associated maintenance reaction" + + +def add_sbo_terms( + model: cobra.Model, + *, + biomass_met_names: Iterable[str] = DEFAULT_BIOMASS_MET_NAMES, + biomass_met_suffixes: Iterable[str] = (" backbone", " chain"), + biomass_rxn_name: str = DEFAULT_BIOMASS_RXN_NAME, + ngam_rxn_name: str = DEFAULT_NGAM_RXN_NAME, + pseudoreaction_name_substrings: Iterable[str] = ("pseudoreaction", "SLIME rxn"), + transport_detector: Callable[[cobra.Model], set[str]] | None = None, + only_last_reaction_for_pseudo: bool = False, +) -> cobra.Model: + """Assign SBO terms to metabolites and reactions in-place. + + Metabolite SBO assignment + - SBO:0000649 (Biomass) for metabolites whose ``name`` is in + ``biomass_met_names`` or ends with any of + ``biomass_met_suffixes`` (default: ``" backbone"`` / ``" chain"``). + - SBO:0000247 (Simple chemical) otherwise. + + Reaction SBO assignment + - SBO:0000176 (Metabolic reaction) default. + - Single-reactant reactions (exchange / sink / demand): + * extracellular → SBO:0000627 (exchange) + * coefficient < 0 → SBO:0000632 (sink) + * else → SBO:0000628 (demand) + - Transport reactions → SBO:0000655 (default detector: same + metabolite name appearing in ≥ 2 compartments). + - The reaction whose ``name`` matches ``biomass_rxn_name`` → + SBO:0000629. + - The reaction whose ``name`` matches ``ngam_rxn_name`` → + SBO:0000630. + - Other reactions whose ``name`` contains any + ``pseudoreaction_name_substrings`` → SBO:0000395. + + "fill" semantic: SBO is written to ``annotation['sbo']`` only when + that key is missing or empty — mirrors RAVEN's + ``editMiriam(..., 'fill')`` mode. + + Parameters + ---------- + transport_detector + Optional callable that takes the model and returns the set of + reaction ids classified as transport. When ``None``, the default + same-met-name-in-two-compartments heuristic is used. Pass a + custom detector to use a different convention (e.g. RAVEN's + ``getTransportRxns`` via the MATLAB engine). + only_last_reaction_for_pseudo + **Bug-compatibility flag**, off by default. The legacy yeast-GEM + MATLAB ``addSBOterms.m`` contains a typo (``for i = numel(...)`` + rather than ``for i = 1:numel(...)``) that causes the pseudo- + reaction SBO assignments to run only on the very last reaction + in the model. When ``True``, this flag replicates that bug so + callers can migrate without altering the committed model. Leave + ``False`` for new uses. + """ + biomass_met_names = frozenset(biomass_met_names) + biomass_met_suffixes = tuple(biomass_met_suffixes) + pseudoreaction_name_substrings = tuple(pseudoreaction_name_substrings) + if transport_detector is None: + transport_detector = _default_transport_detector + + _assign_metabolite_sbo(model, biomass_met_names, biomass_met_suffixes) + _assign_reaction_sbo( + model, + biomass_rxn_name=biomass_rxn_name, + ngam_rxn_name=ngam_rxn_name, + pseudo_substrings=pseudoreaction_name_substrings, + transport_detector=transport_detector, + only_last_reaction_for_pseudo=only_last_reaction_for_pseudo, + ) + return model + + +# --- internals --------------------------------------------------------- + +def _assign_metabolite_sbo( + model: cobra.Model, + biomass_met_names: frozenset[str], + biomass_met_suffixes: tuple[str, ...], +) -> None: + for met in model.metabolites: + if met.name in biomass_met_names or met.name.endswith(biomass_met_suffixes): + sbo = "SBO:0000649" + else: + sbo = "SBO:0000247" + _fill_sbo(met, sbo) + + +def _assign_reaction_sbo( + model: cobra.Model, + *, + biomass_rxn_name: str, + ngam_rxn_name: str, + pseudo_substrings: tuple[str, ...], + transport_detector: Callable[[cobra.Model], set[str]], + only_last_reaction_for_pseudo: bool, +) -> None: + """Compute the SBO for each reaction via successive overrides, then + fill into the model in a single pass — mirrors the MATLAB + ``rxnSBO``-array + ``editMiriam(..., 'fill')`` two-step. + """ + transport_set = transport_detector(model) + rxns = list(model.reactions) + + sbo_for_rxn: dict[str, str] = {} + for rxn in rxns: + sbo = "SBO:0000176" + if len(rxn.metabolites) == 1: + (met,) = rxn.metabolites + coef = rxn.metabolites[met] + if met.compartment == "e" or model.compartments.get( + met.compartment + ) == "extracellular": + sbo = "SBO:0000627" + elif coef < 0: + sbo = "SBO:0000632" + else: + sbo = "SBO:0000628" + sbo_for_rxn[rxn.id] = sbo + + # Transport override + for rxn_id in transport_set: + sbo_for_rxn[rxn_id] = "SBO:0000655" + + # Pseudoreaction override. The legacy bug-compat flag scopes this + # loop to the last reaction only; the default walks the whole list. + pseudo_targets = [rxns[-1]] if only_last_reaction_for_pseudo else rxns + for rxn in pseudo_targets: + if rxn.name == biomass_rxn_name: + sbo_for_rxn[rxn.id] = "SBO:0000629" + elif rxn.name == ngam_rxn_name: + sbo_for_rxn[rxn.id] = "SBO:0000630" + elif any(s in rxn.name for s in pseudo_substrings): + sbo_for_rxn[rxn.id] = "SBO:0000395" + + for rxn in rxns: + _fill_sbo(rxn, sbo_for_rxn[rxn.id]) + + +def _fill_sbo(entity, sbo: str) -> None: + """Set ``annotation['sbo']`` only if it is missing or empty.""" + if not entity.annotation.get("sbo"): + entity.annotation["sbo"] = sbo + + +def _default_transport_detector(model: cobra.Model) -> set[str]: + """Same-met-name-in-two-compartments heuristic for transport detection. + + Cheap analogue of RAVEN's ``getTransportRxns``. Pass a custom + callable to :func:`add_sbo_terms` to override. + """ + out: set[str] = set() + for rxn in model.reactions: + by_name: dict[str, set[str | None]] = {} + for met in rxn.metabolites: + by_name.setdefault(met.name, set()).add(met.compartment) + if any(len(comps) >= 2 for comps in by_name.values()): + out.add(rxn.id) + return out diff --git a/src/raven_python/comparison/__init__.py b/src/raven_python/comparison/__init__.py index e4b4c19..968b7f0 100644 --- a/src/raven_python/comparison/__init__.py +++ b/src/raven_python/comparison/__init__.py @@ -1,7 +1,23 @@ -"""Structural and functional comparison across multiple models. +"""Model comparison utilities. -See :func:`raven_python.comparison.compare.compare_models`. +Two flavours: + +* :func:`compare_models` — N-model presence-matrix overview (RAVEN's + ``compareMultipleModels`` analogue). "How do these models relate?" +* :func:`diff_models` — strict two-model semantic-equality diff for CI + gates. "Are these two models the same?" """ from raven_python.comparison.compare import ModelComparison, compare_models +from raven_python.comparison.diff import ( + DEFAULT_ANNOTATION_KEYS, + DiffReport, + diff_models, +) -__all__ = ["ModelComparison", "compare_models"] +__all__ = [ + "DEFAULT_ANNOTATION_KEYS", + "DiffReport", + "ModelComparison", + "compare_models", + "diff_models", +] diff --git a/src/raven_python/comparison/diff.py b/src/raven_python/comparison/diff.py new file mode 100644 index 0000000..2a399eb --- /dev/null +++ b/src/raven_python/comparison/diff.py @@ -0,0 +1,285 @@ +"""Strict two-model semantic-equality diff. + +Complements :func:`raven_python.comparison.compare.compare_models`, which +takes ≥2 models and produces a presence-matrix overview. Where +``compare_models`` answers *"how do these models relate?"*, +:func:`diff_models` answers *"are these two models the same?"* — used as a +CI gate to verify that two toolchains (e.g. MATLAB RAVEN vs raven_python, +or pre/post refactor of one toolchain) produce equivalent models. + +Diff scope: reaction / metabolite / gene id sets, stoichiometry (within +tolerance), bounds, objective coefficients, GPR rules, metabolite +formula/charge/compartment, and a configurable set of annotation keys. +Formatting differences (key ordering, whitespace, float repr) are +explicitly **not** failures. +""" +from __future__ import annotations + +from collections.abc import Iterable +from dataclasses import dataclass, field + +import cobra + +# Annotation keys checked by default. Add via ``extra_annotations`` or +# remove via ``ignore_annotations`` in :func:`diff_models`. +DEFAULT_ANNOTATION_KEYS: tuple[str, ...] = ( + "sbo", + "ec-code", + "kegg.reaction", + "kegg.compound", + "metanetx.reaction", + "metanetx.chemical", + "chebi", + "bigg.reaction", + "bigg.metabolite", +) + + +@dataclass +class DiffReport: + """Result of :func:`diff_models`. + + Truthy if the models are semantically equal. Iterate ``differences`` + for human-readable messages. + """ + + equal: bool + differences: list[str] = field(default_factory=list) + + def __bool__(self) -> bool: + return self.equal + + def __str__(self) -> str: + if self.equal: + return "Models are semantically equal." + head = f"Models differ ({len(self.differences)} differences):" + body = "\n".join(f" - {d}" for d in self.differences) + return f"{head}\n{body}" + + +def diff_models( + a: cobra.Model, + b: cobra.Model, + *, + stoichiometry_tol: float = 1e-9, + ignore_annotations: Iterable[str] = (), + extra_annotations: Iterable[str] = (), + max_per_category: int = 50, +) -> DiffReport: + """Compare two cobra models for semantic equality. + + Checks: + + * reaction, metabolite and gene id sets (exact) + * stoichiometry per shared reaction (within ``stoichiometry_tol``) + * lower/upper bounds and objective coefficients (exact) + * GPR rules (whitespace- and case-insensitive string comparison) + * metabolite formula, charge, compartment (exact) + * a default set of annotation keys (see ``DEFAULT_ANNOTATION_KEYS``) + plus any ``extra_annotations``, minus any ``ignore_annotations`` + + ``max_per_category`` caps the per-category diff list so a wholesale + mismatch produces a digestible report rather than 10,000 lines. + """ + diffs: list[str] = [] + keys = (set(DEFAULT_ANNOTATION_KEYS) | set(extra_annotations)) - set(ignore_annotations) + + _diff_id_sets(a, b, diffs) + + common_rxns = {r.id for r in a.reactions} & {r.id for r in b.reactions} + _diff_reactions(a, b, sorted(common_rxns), stoichiometry_tol, keys, diffs, max_per_category) + + common_mets = {m.id for m in a.metabolites} & {m.id for m in b.metabolites} + _diff_metabolites(a, b, sorted(common_mets), keys, diffs, max_per_category) + + return DiffReport(equal=not diffs, differences=diffs) + + +# --- internals --------------------------------------------------------- + +def _diff_id_sets(a: cobra.Model, b: cobra.Model, diffs: list[str]) -> None: + for label, get in ( + ("reactions", lambda m: m.reactions), + ("metabolites", lambda m: m.metabolites), + ("genes", lambda m: m.genes), + ): + a_ids = {x.id for x in get(a)} + b_ids = {x.id for x in get(b)} + only_a = sorted(a_ids - b_ids) + only_b = sorted(b_ids - a_ids) + if only_a: + diffs.append(f"{label} only in A ({len(only_a)}): {only_a[:10]}{_more(only_a, 10)}") + if only_b: + diffs.append(f"{label} only in B ({len(only_b)}): {only_b[:10]}{_more(only_b, 10)}") + + +def _diff_reactions( + a: cobra.Model, + b: cobra.Model, + common: list[str], + tol: float, + anno_keys: set[str], + diffs: list[str], + cap: int, +) -> None: + counters = {"stoich": 0, "bounds": 0, "objective": 0, "gpr": 0, "anno": 0} + for rxn_id in common: + ra = a.reactions.get_by_id(rxn_id) + rb = b.reactions.get_by_id(rxn_id) + + sa = {m.id: c for m, c in ra.metabolites.items()} + sb = {m.id: c for m, c in rb.metabolites.items()} + if set(sa) != set(sb): + _push(diffs, counters, "stoich", cap, f"{rxn_id}: stoichiometry has different mets") + else: + for mid in sa: + if abs(sa[mid] - sb[mid]) > tol: + _push( + diffs, counters, "stoich", cap, + f"{rxn_id}: coef[{mid}] A={sa[mid]:.6g} B={sb[mid]:.6g}", + ) + + if ra.lower_bound != rb.lower_bound or ra.upper_bound != rb.upper_bound: + _push( + diffs, counters, "bounds", cap, + f"{rxn_id}: bounds A=({ra.lower_bound}, {ra.upper_bound}) " + f"B=({rb.lower_bound}, {rb.upper_bound})", + ) + + if ra.objective_coefficient != rb.objective_coefficient: + _push( + diffs, counters, "objective", cap, + f"{rxn_id}: objective A={ra.objective_coefficient} B={rb.objective_coefficient}", + ) + + ga = _normalise_gpr(ra.gene_reaction_rule) + gb = _normalise_gpr(rb.gene_reaction_rule) + if ga != gb: + _push(diffs, counters, "gpr", cap, f"{rxn_id}: GPR A={ga!r} B={gb!r}") + + _diff_annotations( + f"rxn {rxn_id}", ra.annotation, rb.annotation, + anno_keys, diffs, counters, cap, + ) + + +def _diff_metabolites( + a: cobra.Model, + b: cobra.Model, + common: list[str], + anno_keys: set[str], + diffs: list[str], + cap: int, +) -> None: + counters = {"formula": 0, "charge": 0, "compartment": 0, "anno": 0} + for met_id in common: + ma = a.metabolites.get_by_id(met_id) + mb = b.metabolites.get_by_id(met_id) + + if (ma.formula or None) != (mb.formula or None): + _push( + diffs, counters, "formula", cap, + f"met {met_id}: formula A={ma.formula} B={mb.formula}", + ) + + ca = ma.charge if ma.charge is not None else 0 + cb = mb.charge if mb.charge is not None else 0 + if ca != cb: + _push( + diffs, counters, "charge", cap, + f"met {met_id}: charge A={ma.charge} B={mb.charge}", + ) + + if ma.compartment != mb.compartment: + _push( + diffs, counters, "compartment", cap, + f"met {met_id}: compartment A={ma.compartment} B={mb.compartment}", + ) + + _diff_annotations( + f"met {met_id}", ma.annotation, mb.annotation, + anno_keys, diffs, counters, cap, + ) + + +def _diff_annotations( + label: str, + aa: dict, + ba: dict, + keys: set[str], + diffs: list[str], + counters: dict, + cap: int, +) -> None: + for k in keys: + va = _normalise_annotation_value(aa.get(k)) + vb = _normalise_annotation_value(ba.get(k)) + if va != vb: + _push(diffs, counters, "anno", cap, f"{label}.annotation[{k!r}]: A={va} B={vb}") + + +def _normalise_gpr(rule: str) -> str: + """Lowercase + collapse whitespace. + + A more robust comparator would parse to a GPR AST and compare + structures; this is the cheap heuristic that catches the formatting + drift we see between different SBML writers. + """ + if not rule: + return "" + return " ".join(rule.lower().split()) + + +def _normalise_annotation_value(v): + if v is None: + return None + if isinstance(v, list): + return tuple(sorted(str(x) for x in v)) + if isinstance(v, tuple): + return tuple(sorted(str(x) for x in v)) + return str(v) + + +def _push(diffs: list[str], counters: dict, key: str, cap: int, msg: str) -> None: + counters[key] = counters.get(key, 0) + 1 + if counters[key] <= cap: + diffs.append(msg) + elif counters[key] == cap + 1: + diffs.append(f"... ({key} category truncated at {cap} entries)") + + +def _more(seq: list, shown: int) -> str: + return "" if len(seq) <= shown else f" ... (+{len(seq) - shown} more)" + + +# --- CLI --------------------------------------------------------------- + +def _main() -> int: + import argparse + + from cobra.io import read_sbml_model + + parser = argparse.ArgumentParser( + description="Diff two SBML models for semantic equality." + ) + parser.add_argument("a", help="path to first model (SBML)") + parser.add_argument("b", help="path to second model (SBML)") + parser.add_argument( + "--stoichiometry-tol", + type=float, + default=1e-9, + help="absolute tolerance on stoichiometric coefficients (default 1e-9)", + ) + args = parser.parse_args() + + report = diff_models( + read_sbml_model(args.a), + read_sbml_model(args.b), + stoichiometry_tol=args.stoichiometry_tol, + ) + print(report) + return 0 if report.equal else 1 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(_main()) diff --git a/src/raven_python/conditions/__init__.py b/src/raven_python/conditions/__init__.py new file mode 100644 index 0000000..8465081 --- /dev/null +++ b/src/raven_python/conditions/__init__.py @@ -0,0 +1,25 @@ +"""Apply named "condition" presets — bound diffs, biomass edits, etc. + +A *condition* is a YAML file (or pre-parsed dict) describing a set of +deterministic modifications to apply to a cobra model: a prelude that +resets exchange bounds, edits to a cofactor pseudoreaction +(metabolite removals + automatic charge rebalancing), a per-reaction +bounds diff, and an optional biomass-stoichiometry delta. + +Yeast-GEM was the first consumer; the schema is documented in +:func:`apply_condition` and is meant to be reusable for any GEM that +wants to keep its condition presets as data rather than code. +""" +from raven_python.conditions.apply import ( + DEFAULT_RESET_EXCHANGES_UPPER_BOUND, + apply_condition, + load_condition, + set_reaction_bounds, +) + +__all__ = [ + "DEFAULT_RESET_EXCHANGES_UPPER_BOUND", + "apply_condition", + "load_condition", + "set_reaction_bounds", +] diff --git a/src/raven_python/conditions/apply.py b/src/raven_python/conditions/apply.py new file mode 100644 index 0000000..2dc1a2b --- /dev/null +++ b/src/raven_python/conditions/apply.py @@ -0,0 +1,186 @@ +"""Generic "apply this YAML condition" loader. + +Schema (all keys optional): + +.. code-block:: yaml + + name: my_condition + description: free-form text. + + # Set every exchange reaction to (0, ub) before any per-rxn override. + prelude: + reset_exchanges: out # str | bool truthy + + # Remove metabolites from a pseudoreaction and rebalance H+ + # (or any other "balance" met) so total charge stays zero. + cofactor_pseudoreaction: + rxn_id: r_4598 + remove_mets: + - { met: s_3714 } # comment field is informational + charge_balance_met: s_0794 # optional + + # Add (combine=True) to a reaction's stoichiometry. + biomass_stoichiometry_delta: + rxn_id: r_4041 + add: + - { met: s_0689, coef: 0.08 } + - { met: s_0687, coef: -0.08 } + - { met: s_0794, coef: -0.16 } + + # Per-reaction bounds. lb / ub omitted means "leave unchanged". + bounds: + - { rxn: r_1654, lb: -1000 } + - { rxn: r_1992, lb: 0 } + - { rxn: r_1663, lb: 0, ub: 0 } + + # Sanity-check counter. Emits a warning if the actual count of + # ``lb == -1000`` bound entries doesn't match. + expected_uptake_count: 15 + +The schema is intentionally narrow: deterministic, idempotent, easy to +diff in code review. Project-specific extensions (e.g. yeast-GEM's +``amino_acid_ratio``) are handled by the *caller* before / after this +function — keep the upstream generic. +""" +from __future__ import annotations + +import warnings +from pathlib import Path +from typing import Any + +import cobra +import yaml + +#: ``prelude.reset_exchanges`` puts every exchange reaction at this +#: upper bound (and lower bound = 0). +DEFAULT_RESET_EXCHANGES_UPPER_BOUND: float = 1000.0 + + +def load_condition(path: str | Path) -> dict[str, Any]: + """Parse a condition YAML file into a plain dict.""" + path = Path(path) + if not path.exists(): + raise FileNotFoundError(f"Condition file not found: {path}") + with open(path) as f: + return yaml.safe_load(f) + + +def apply_condition( + model: cobra.Model, + condition: dict[str, Any] | str | Path, +) -> cobra.Model: + """Apply a parsed condition (or a YAML file path) to ``model`` in place. + + Returns the same model object for chaining. + """ + cfg = condition if isinstance(condition, dict) else load_condition(condition) + + _apply_prelude(model, cfg) + _apply_cofactor_pseudoreaction(model, cfg) + _apply_biomass_stoichiometry_delta(model, cfg) + n_uptake = _apply_bounds(model, cfg) + _check_uptake_count(cfg, n_uptake) + + return model + + +def set_reaction_bounds(rxn: cobra.Reaction, lb: float, ub: float) -> None: + """Set both bounds, bypassing cobra's ``lb <= ub`` validator. + + Some conditions intentionally land on an infeasible ``lb > ub`` state + (e.g. forcing flux via a sentinel bound). cobra's setter rejects + that. This helper writes the underlying private attributes when + needed so the model can match the caller's exact intent. + """ + if lb > ub: + rxn._lower_bound = lb # noqa: SLF001 — intentional bypass + rxn._upper_bound = ub # noqa: SLF001 — intentional bypass + else: + rxn.bounds = (lb, ub) + + +# --- step implementations --------------------------------------------- + +def _apply_prelude(model: cobra.Model, cfg: dict[str, Any]) -> None: + prelude = cfg.get("prelude") or {} + if prelude.get("reset_exchanges"): + # cobrapy doesn't distinguish RAVEN's in / out directions; we + # reset every exchange reaction. + for rxn in model.exchanges: + rxn.lower_bound = 0 + rxn.upper_bound = DEFAULT_RESET_EXCHANGES_UPPER_BOUND + + +def _apply_cofactor_pseudoreaction(model: cobra.Model, cfg: dict[str, Any]) -> None: + cp = cfg.get("cofactor_pseudoreaction") + if not cp: + return + rxn = model.reactions.get_by_id(cp["rxn_id"]) + for entry in cp.get("remove_mets", []): + met = model.metabolites.get_by_id(entry["met"]) + _set_coefficient(rxn, met, 0.0) + balance_met_id = cp.get("charge_balance_met") + if balance_met_id: + balance_met = model.metabolites.get_by_id(balance_met_id) + _set_coefficient(rxn, balance_met, 0.0) + total_charge = sum( + (m.charge or 0) * coef + for m, coef in rxn.metabolites.items() + ) + _set_coefficient(rxn, balance_met, -total_charge) + + +def _apply_biomass_stoichiometry_delta(model: cobra.Model, cfg: dict[str, Any]) -> None: + delta = cfg.get("biomass_stoichiometry_delta") + if not delta: + return + rxn = model.reactions.get_by_id(delta["rxn_id"]) + for entry in delta.get("add", []): + met = model.metabolites.get_by_id(entry["met"]) + rxn.add_metabolites({met: float(entry["coef"])}, combine=True) + + +def _apply_bounds(model: cobra.Model, cfg: dict[str, Any]) -> int: + n_uptake = 0 + for entry in cfg.get("bounds", []): + try: + rxn = model.reactions.get_by_id(entry["rxn"]) + except KeyError: + warnings.warn( + f"Reaction {entry['rxn']!r} not found in model; skipping.", + stacklevel=3, + ) + continue + new_lb = float(entry["lb"]) if "lb" in entry else rxn.lower_bound + new_ub = float(entry["ub"]) if "ub" in entry else rxn.upper_bound + set_reaction_bounds(rxn, new_lb, new_ub) + if entry.get("lb") == -1000: + n_uptake += 1 + return n_uptake + + +def _check_uptake_count(cfg: dict[str, Any], n_uptake: int) -> None: + expected = cfg.get("expected_uptake_count") + if expected is None: + return + if n_uptake != expected: + warnings.warn( + f"Expected {expected} uptake reactions, applied {n_uptake}. " + "Some referenced reactions may be missing from the model.", + stacklevel=3, + ) + + +# --- helpers ---------------------------------------------------------- + +def _set_coefficient(rxn: cobra.Reaction, met: cobra.Metabolite, value: float) -> None: + """Set the stoichiometric coefficient of ``met`` in ``rxn`` to ``value``. + + cobra's only public mutation point is ``add_metabolites``; we go via + ``combine=True`` with ``(value - current)`` to land on the desired + value without depending on cobra's removal-on-zero behaviour. + """ + current = rxn.metabolites.get(met, 0.0) + delta = float(value) - current + if delta != 0: + rxn.add_metabolites({met: delta}, combine=True) diff --git a/tests/test_annotation.py b/tests/test_annotation.py new file mode 100644 index 0000000..fbab70b --- /dev/null +++ b/tests/test_annotation.py @@ -0,0 +1,204 @@ +"""Tests for raven_python.annotation (SBO terms + ΔG CSV persistence).""" +from __future__ import annotations + +import math + +import cobra +import pandas as pd + +from raven_python.annotation import ( + DEFAULT_BIOMASS_MET_NAMES, + add_sbo_terms, + load_delta_g_csv, + save_delta_g_csv, +) +from raven_python.annotation.sbo import _default_transport_detector + +# --- shared tiny model ------------------------------------------------- + +def _toy_model() -> cobra.Model: + """Toy yeast-flavoured model with: extracellular exchange, transport, + metabolic, biomass, NGAM, generic pseudo, lipid-backbone biomass met.""" + m = cobra.Model("toy") + m.compartments = {"c": "cytoplasm", "e": "extracellular", "x": "peroxisome"} + + mets = { + "atp_c": cobra.Metabolite("atp_c", name="ATP", compartment="c", charge=-4, formula="C10H12N5O13P3"), + "atp_x": cobra.Metabolite("atp_x", name="ATP", compartment="x", charge=-4, formula="C10H12N5O13P3"), + "glc_e": cobra.Metabolite("glc_e", name="glucose", compartment="e", charge=0, formula="C6H12O6"), + "biomass": cobra.Metabolite("biomass_c", name="biomass", compartment="c"), + "lipid_bb": cobra.Metabolite("lbb_c", name="lipid backbone", compartment="c"), + } + m.add_metabolites(list(mets.values())) + + # Exchange (single met in extracellular) + ex = cobra.Reaction("EX_glc", lower_bound=-10, upper_bound=1000) + ex.add_metabolites({mets["glc_e"]: -1}) + # Transport (ATP_c ↔ ATP_x — same met name, two compartments) + tr = cobra.Reaction("ATP_tx", lower_bound=-1000, upper_bound=1000) + tr.add_metabolites({mets["atp_c"]: -1, mets["atp_x"]: 1}) + # Normal metabolic reaction + met = cobra.Reaction("metab1", lower_bound=0, upper_bound=1000) + met.add_metabolites({mets["glc_e"]: -1, mets["atp_c"]: 1}) + # Biomass pseudoreaction (must be last for the bug-compat test) + bio = cobra.Reaction("biomass_rxn", lower_bound=0, upper_bound=1000) + bio.name = "biomass pseudoreaction" + bio.add_metabolites({mets["atp_c"]: -1, mets["biomass"]: 1}) + m.add_reactions([ex, tr, met, bio]) + return m + + +# --- add_sbo_terms ----------------------------------------------------- + +def test_default_biomass_names_set_is_reasonable(): + assert {"biomass", "DNA", "RNA", "protein", "lipid"} <= DEFAULT_BIOMASS_MET_NAMES + + +def test_simple_chemical_gets_simple_sbo(): + m = _toy_model() + add_sbo_terms(m) + assert m.metabolites.get_by_id("glc_e").annotation["sbo"] == "SBO:0000247" + + +def test_biomass_pseudo_metabolite_gets_biomass_sbo(): + m = _toy_model() + add_sbo_terms(m) + assert m.metabolites.get_by_id("biomass_c").annotation["sbo"] == "SBO:0000649" + + +def test_lipid_backbone_suffix_match(): + m = _toy_model() + add_sbo_terms(m) + assert m.metabolites.get_by_id("lbb_c").annotation["sbo"] == "SBO:0000649" + + +def test_exchange_gets_exchange_sbo(): + m = _toy_model() + add_sbo_terms(m) + assert m.reactions.get_by_id("EX_glc").annotation["sbo"] == "SBO:0000627" + + +def test_transport_gets_transport_sbo(): + m = _toy_model() + add_sbo_terms(m) + assert m.reactions.get_by_id("ATP_tx").annotation["sbo"] == "SBO:0000655" + + +def test_biomass_pseudoreaction_gets_pseudo_sbo_by_default(): + m = _toy_model() + add_sbo_terms(m) + assert m.reactions.get_by_id("biomass_rxn").annotation["sbo"] == "SBO:0000629" + + +def test_legacy_bug_flag_scopes_pseudo_to_last_reaction(): + """The yeast-GEM compat flag must restrict the pseudo override to + the last reaction; if biomass_rxn is last, it still wins.""" + m = _toy_model() + add_sbo_terms(m, only_last_reaction_for_pseudo=True) + # biomass_rxn was added last → still gets the pseudo SBO. + assert m.reactions.get_by_id("biomass_rxn").annotation["sbo"] == "SBO:0000629" + + +def test_legacy_bug_flag_skips_pseudo_when_not_last(): + """Reorder: a non-pseudo reaction last; the bug flag should leave + the pseudoreaction with whatever non-pseudo SBO it already had.""" + m = _toy_model() + # Swap last two reactions so the pseudo is NOT last. + rxns = list(m.reactions) + new_order = rxns[:-2] + rxns[-1:] + rxns[-2:-1] + m.reactions._generate_index() # noqa: SLF001 (rebuild before reorder) + # Easier: copy bounds, set order via remove+re-add + m.remove_reactions([rxns[-2], rxns[-1]]) + m.add_reactions(new_order[-2:]) + add_sbo_terms(m, only_last_reaction_for_pseudo=True) + bio = m.reactions.get_by_id("biomass_rxn") + # The biomass reaction has a single bounded reactant set (atp_c -1, + # biomass +1) → 2 mets → falls to default SBO:0000176. + assert bio.annotation["sbo"] == "SBO:0000176" + + +def test_fill_semantic_preserves_existing(): + m = _toy_model() + rxn = m.reactions.get_by_id("metab1") + rxn.annotation["sbo"] = "SBO:0009999" + add_sbo_terms(m) + assert rxn.annotation["sbo"] == "SBO:0009999" + + +def test_custom_transport_detector_overrides_default(): + m = _toy_model() + add_sbo_terms(m, transport_detector=lambda _m: set()) + # ATP_tx is no longer flagged transport → falls through to default + # (it has 2 metabolites so it is not exchange/sink/demand). + assert m.reactions.get_by_id("ATP_tx").annotation["sbo"] == "SBO:0000176" + + +def test_default_transport_detector_finds_transport(): + m = _toy_model() + transports = _default_transport_detector(m) + assert "ATP_tx" in transports + assert "EX_glc" not in transports + + +# --- ΔG CSV round-trip ------------------------------------------------ + +def test_save_then_load_round_trip(tmp_path): + m = _toy_model() + m.metabolites.get_by_id("atp_c").notes["deltaG"] = "-12.34" + m.metabolites.get_by_id("glc_e").notes["deltaG"] = "0.5" + + csv = tmp_path / "met_dg.csv" + written = save_delta_g_csv(m.metabolites, csv) + assert written == len(m.metabolites) + df = pd.read_csv(csv) + assert list(df.columns) == ["Var1", "Var2"] + + # Reload on a fresh model + fresh = _toy_model() + stamped = load_delta_g_csv(fresh.metabolites, csv) + assert stamped == 2 + assert fresh.metabolites.get_by_id("atp_c").notes["deltaG"] == "-12.34" + assert fresh.metabolites.get_by_id("glc_e").notes["deltaG"] == "0.5" + + +def test_save_writes_nan_for_missing_notes(tmp_path): + m = _toy_model() # no notes set + csv = tmp_path / "met_dg.csv" + save_delta_g_csv(m.metabolites, csv) + df = pd.read_csv(csv) + assert len(df) == len(m.metabolites) + assert df["Var2"].apply(lambda v: math.isnan(v)).all() + + +def test_load_skips_nan_rows(tmp_path): + """A NaN in the CSV must NOT clobber the entity's existing note.""" + m = _toy_model() + m.metabolites.get_by_id("atp_c").notes["deltaG"] = "preserved" + + csv = tmp_path / "met_dg.csv" + pd.DataFrame( + {"Var1": ["atp_c", "glc_e"], "Var2": [math.nan, 1.0]} + ).to_csv(csv, index=False) + + load_delta_g_csv(m.metabolites, csv) + assert m.metabolites.get_by_id("atp_c").notes["deltaG"] == "preserved" + assert m.metabolites.get_by_id("glc_e").notes["deltaG"] == "1.0" + + +def test_custom_columns_and_note_key(tmp_path): + m = _toy_model() + m.metabolites.get_by_id("atp_c").notes["dG_kJ"] = "-30.5" + csv = tmp_path / "out.csv" + save_delta_g_csv( + m.metabolites, csv, + id_column="id", value_column="dG", note_key="dG_kJ", + ) + df = pd.read_csv(csv) + assert list(df.columns) == ["id", "dG"] + + fresh = _toy_model() + load_delta_g_csv( + fresh.metabolites, csv, + id_column="id", value_column="dG", note_key="dG_kJ", + ) + assert fresh.metabolites.get_by_id("atp_c").notes["dG_kJ"] == "-30.5" diff --git a/tests/test_comparison_diff.py b/tests/test_comparison_diff.py new file mode 100644 index 0000000..c8556c1 --- /dev/null +++ b/tests/test_comparison_diff.py @@ -0,0 +1,146 @@ +"""Tests for raven_python.comparison.diff — strict 2-model equality.""" +from __future__ import annotations + +import cobra +import pytest + +from raven_python.comparison import DiffReport, diff_models +from raven_python.comparison.diff import _normalise_gpr + + +def _mini_model(model_id: str = "m") -> cobra.Model: + """Tiny but realistic model: one reaction, one extracellular met.""" + m = cobra.Model(model_id) + a = cobra.Metabolite("a_c", name="A", compartment="c", charge=0, formula="C1") + b = cobra.Metabolite("b_e", name="B", compartment="e", charge=0, formula="C2") + m.add_metabolites([a, b]) + r = cobra.Reaction("r1", lower_bound=-1000, upper_bound=1000) + r.add_metabolites({a: -1, b: 1}) + r.gene_reaction_rule = "g1 AND g2" + r.annotation["sbo"] = "SBO:0000176" + m.add_reactions([r]) + return m + + +def test_model_equal_to_itself(): + m = _mini_model() + report = diff_models(m, m) + assert isinstance(report, DiffReport) + assert report.equal + + +def test_independent_copies_are_equal(): + a = _mini_model() + b = _mini_model() + assert diff_models(a, b).equal + + +def test_dropped_reaction_is_detected(): + a = _mini_model() + b = _mini_model() + b.remove_reactions([b.reactions[0]]) + report = diff_models(a, b) + assert not report.equal + assert any("reactions only in A" in d for d in report.differences) + + +def test_bound_diff_is_detected(): + a = _mini_model() + b = _mini_model() + b.reactions[0].lower_bound = -42 + report = diff_models(a, b) + assert not report.equal + assert any("bounds" in d for d in report.differences) + + +def test_stoich_within_tolerance_is_equal(): + a = _mini_model() + b = _mini_model() + met = next(iter(b.reactions[0].metabolites)) + b.reactions[0].add_metabolites({met: 1e-12}, combine=True) + assert diff_models(a, b, stoichiometry_tol=1e-9).equal + + +def test_stoich_outside_tolerance_is_not_equal(): + a = _mini_model() + b = _mini_model() + met = next(iter(b.reactions[0].metabolites)) + b.reactions[0].add_metabolites({met: 5.0}, combine=False) + report = diff_models(a, b, stoichiometry_tol=1e-9) + assert not report.equal + assert any("coef[" in d for d in report.differences) + + +def test_annotation_diff_detected(): + a = _mini_model() + b = _mini_model() + b.reactions[0].annotation["sbo"] = "SBO:0009999" + report = diff_models(a, b) + assert not report.equal + assert any("annotation['sbo']" in d for d in report.differences) + + +def test_ignore_annotations_suppresses_diff(): + a = _mini_model() + b = _mini_model() + b.reactions[0].annotation["sbo"] = "SBO:0009999" + assert diff_models(a, b, ignore_annotations={"sbo"}).equal + + +def test_extra_annotations_picked_up(): + a = _mini_model() + b = _mini_model() + a.reactions[0].annotation["custom-key"] = "v1" + b.reactions[0].annotation["custom-key"] = "v2" + # Default key set ignores custom-key → equal. + assert diff_models(a, b).equal + # Pull it in → diff. + assert not diff_models(a, b, extra_annotations={"custom-key"}).equal + + +@pytest.mark.parametrize( + "ga, gb", + [ + ("A and B", "a AND b"), + ("A and B", "A and B"), + ("(A or B) and C", "(a OR b) AND c"), + ], +) +def test_gpr_normalisation(ga, gb): + assert _normalise_gpr(ga) == _normalise_gpr(gb) + + +def test_max_per_category_truncates(): + """The per-category cap kicks in on per-reaction (and per-met) diffs, + not on the id-set summary line. Build a scenario with many bound + diffs across shared reactions.""" + a = cobra.Model("a") + b = cobra.Model("b") + mets_a = [cobra.Metabolite(f"m{i}_c", compartment="c") for i in range(60)] + mets_b = [cobra.Metabolite(f"m{i}_c", compartment="c") for i in range(60)] + a.add_metabolites(mets_a) + b.add_metabolites(mets_b) + for i in range(60): + ra = cobra.Reaction(f"r{i}", lower_bound=0, upper_bound=1000) + ra.add_metabolites({mets_a[i]: -1}) + a.add_reactions([ra]) + rb = cobra.Reaction(f"r{i}", lower_bound=-7, upper_bound=1000) # differs + rb.add_metabolites({mets_b[i]: -1}) + b.add_reactions([rb]) + report = diff_models(a, b, max_per_category=5) + assert not report.equal + assert any("truncated at 5" in d for d in report.differences) + + +def test_report_str_lists_differences(): + a = _mini_model() + b = _mini_model() + b.remove_reactions([b.reactions[0]]) + text = str(diff_models(a, b)) + assert "Models differ" in text + assert "reactions only in A" in text + + +def test_report_str_when_equal(): + m = _mini_model() + assert str(diff_models(m, m)) == "Models are semantically equal." diff --git a/tests/test_conditions.py b/tests/test_conditions.py new file mode 100644 index 0000000..e5e381f --- /dev/null +++ b/tests/test_conditions.py @@ -0,0 +1,199 @@ +"""Tests for raven_python.conditions.apply (generic condition mechanism).""" +from __future__ import annotations + +import cobra +import pytest +import yaml + +from raven_python.conditions import ( + apply_condition, + load_condition, + set_reaction_bounds, +) + + +def _toy_model() -> cobra.Model: + m = cobra.Model("toy") + m.compartments = {"c": "cytoplasm", "e": "extracellular"} + + mets = { + "atp_c": cobra.Metabolite("atp_c", name="ATP", compartment="c", charge=-4), + "glc_e": cobra.Metabolite("glc_e", name="glucose", compartment="e", charge=0), + "h_c": cobra.Metabolite("h_c", name="H+", compartment="c", charge=1), + "fad": cobra.Metabolite("fad", name="FAD", compartment="c", charge=-2), + "fadh2": cobra.Metabolite("fadh2", name="FADH2", compartment="c", charge=0), + "heme": cobra.Metabolite("heme", name="heme a", compartment="c", charge=-2), + } + m.add_metabolites(list(mets.values())) + + ex = cobra.Reaction("EX_glc", lower_bound=-10, upper_bound=1000) + ex.add_metabolites({mets["glc_e"]: -1}) + cofactor = cobra.Reaction("cofac", lower_bound=0, upper_bound=1000) + cofactor.add_metabolites({mets["heme"]: -1, mets["fad"]: -1, mets["h_c"]: 3}) + biomass = cobra.Reaction("bio", lower_bound=0, upper_bound=1000) + biomass.add_metabolites({mets["atp_c"]: -1}) + blocked = cobra.Reaction("blocked", lower_bound=-1000, upper_bound=1000) + blocked.add_metabolites({mets["atp_c"]: -1, mets["glc_e"]: 1}) + m.add_reactions([ex, cofactor, biomass, blocked]) + return m + + +# --- prelude ---------------------------------------------------------- + +def test_prelude_reset_exchanges_zeroes_lb_and_caps_ub(): + m = _toy_model() + apply_condition(m, {"prelude": {"reset_exchanges": "out"}}) + ex = m.reactions.get_by_id("EX_glc") + assert ex.lower_bound == 0 + assert ex.upper_bound == 1000 + + +# --- bounds ----------------------------------------------------------- + +def test_bounds_apply_lb_only(): + m = _toy_model() + apply_condition(m, {"bounds": [{"rxn": "EX_glc", "lb": -1000}]}) + assert m.reactions.get_by_id("EX_glc").lower_bound == -1000 + # ub preserved + assert m.reactions.get_by_id("EX_glc").upper_bound == 1000 + + +def test_bounds_apply_both(): + m = _toy_model() + apply_condition(m, {"bounds": [{"rxn": "blocked", "lb": 0, "ub": 0}]}) + assert m.reactions.get_by_id("blocked").bounds == (0, 0) + + +def test_bounds_lb_gt_ub_bypasses_cobra_validator(): + m = _toy_model() + apply_condition(m, {"bounds": [{"rxn": "blocked", "lb": 1000, "ub": 0}]}) + rxn = m.reactions.get_by_id("blocked") + assert rxn.lower_bound == 1000 + assert rxn.upper_bound == 0 + + +def test_bounds_missing_rxn_warns_and_continues(): + m = _toy_model() + with pytest.warns(UserWarning, match="not_a_real_rxn"): + apply_condition(m, {"bounds": [{"rxn": "not_a_real_rxn", "lb": 0}]}) + + +# --- cofactor pseudoreaction ----------------------------------------- + +def test_remove_met_zeroes_coefficient(): + m = _toy_model() + apply_condition( + m, + { + "cofactor_pseudoreaction": { + "rxn_id": "cofac", + "remove_mets": [{"met": "heme"}], + } + }, + ) + rxn = m.reactions.get_by_id("cofac") + heme = m.metabolites.get_by_id("heme") + assert rxn.metabolites.get(heme, 0) == 0 + + +def test_charge_balance_recomputed_after_removal(): + m = _toy_model() + apply_condition( + m, + { + "cofactor_pseudoreaction": { + "rxn_id": "cofac", + "remove_mets": [{"met": "heme"}], + "charge_balance_met": "h_c", + } + }, + ) + rxn = m.reactions.get_by_id("cofac") + # heme (-1·-2) and fad (-1·-2) contributed +4 charge. + # After heme removal: only fad (-1·-2) contributes +2. + # H+ (charge +1) coef should be -2 to zero the net. + h_c = m.metabolites.get_by_id("h_c") + assert rxn.metabolites[h_c] == -2 + + +# --- biomass stoichiometry delta ------------------------------------- + +def test_biomass_stoichiometry_delta_combines_with_existing(): + m = _toy_model() + bio = m.reactions.get_by_id("bio") + atp = m.metabolites.get_by_id("atp_c") + before = bio.metabolites[atp] + apply_condition( + m, + { + "biomass_stoichiometry_delta": { + "rxn_id": "bio", + "add": [{"met": "atp_c", "coef": 0.5}], + } + }, + ) + assert bio.metabolites[atp] == pytest.approx(before + 0.5) + + +# --- expected_uptake_count ------------------------------------------- + +def test_expected_uptake_count_mismatch_warns(): + m = _toy_model() + cfg = { + "bounds": [{"rxn": "EX_glc", "lb": -1000}], + "expected_uptake_count": 5, + } + with pytest.warns(UserWarning, match="Expected 5 uptake reactions, applied 1"): + apply_condition(m, cfg) + + +def test_expected_uptake_count_match_silent(recwarn): + m = _toy_model() + apply_condition( + m, + { + "bounds": [{"rxn": "EX_glc", "lb": -1000}], + "expected_uptake_count": 1, + }, + ) + assert not any("Expected" in str(w.message) for w in recwarn) + + +# --- yaml path entrypoint -------------------------------------------- + +def test_apply_condition_accepts_yaml_path(tmp_path): + cfg = {"bounds": [{"rxn": "EX_glc", "lb": -42}]} + path = tmp_path / "cond.yml" + path.write_text(yaml.safe_dump(cfg)) + m = _toy_model() + apply_condition(m, path) + assert m.reactions.get_by_id("EX_glc").lower_bound == -42 + + +def test_load_condition_round_trip(tmp_path): + cfg = {"name": "x", "bounds": [{"rxn": "r1", "lb": 0}]} + path = tmp_path / "cond.yml" + path.write_text(yaml.safe_dump(cfg)) + assert load_condition(path) == cfg + + +def test_load_condition_missing_path_raises(tmp_path): + with pytest.raises(FileNotFoundError): + load_condition(tmp_path / "nope.yml") + + +# --- set_reaction_bounds public helper ------------------------------- + +def test_set_reaction_bounds_normal_case(): + m = _toy_model() + rxn = m.reactions.get_by_id("EX_glc") + set_reaction_bounds(rxn, -5, 5) + assert rxn.bounds == (-5, 5) + + +def test_set_reaction_bounds_infeasible_case(): + m = _toy_model() + rxn = m.reactions.get_by_id("EX_glc") + set_reaction_bounds(rxn, 100, 0) + assert rxn.lower_bound == 100 + assert rxn.upper_bound == 0 From 3bf0de2d16ecd3f1f2a1a77091fbba33bcf2ef96 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sat, 30 May 2026 18:38:08 +0200 Subject: [PATCH 2/8] =?UTF-8?q?Add=20raven=5Fpython.biomass=20=E2=80=94=20?= =?UTF-8?q?sum=20/=20scale=20/=20rescale=20/=20set=5Fgam?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generic biomass-equation manipulation, extracted from yeast-GEM's sumBioMass / scaleBioMass / rescalePseudoReaction / changeGAM as yeast-GEM moves to depend on raven-python (yeast-GEM phase 4 of the porting plan). Module layout ------------- raven_python.biomass.config BiomassConfig — biomass_rxn id + proton_met id + ordered tuple of BiomassComponent (per-component pseudoreaction name + mass- computation strategy). raven_python.biomass.scale sum_biomass(model, config) → {component: g/gDW, ..., "total": X} rescale_pseudoreaction(model, config, name, factor) — multiply the pseudoreaction's substrate coefs by factor and rebalance H+ to keep ionic charge at zero. scale_biomass(model, config, name, new_value, balance_out=None) — rescale a component to a target fraction, optionally balancing a second component so the total stays at 1 g/gDW. raven_python.biomass.gam set_gam(model, value, *, biomass_rxn, cofactor_met_names, ngam_rxn=None, ngam_value=None) — scales every metabolite in the biomass pseudoreaction whose `name` is in the supplied cofactor set, preserving its sign; optionally fixes the NGAM rxn bounds. Mass strategies (per BiomassComponent.mass_strategy): "mw" plain MW from chemical formula (carbohydrate / ion / cofactor) "mw_minus_2h" MW − 2.016 g/mol per substrate (protein — charged tRNAs release two protons) "mw_minus_water" MW − 18.015 g/mol per substrate (RNA / DNA — polymerisation releases one water) "grams" stoichiometry already in g/gDW (lipid backbone) Tests: 19 new tests over a synthetic toy model that exercises every mass strategy, the H+ charge rebalance, scale_biomass with and without balance_out, set_gam on cofactor mets (and the NGAM bound path). --- src/raven_python/biomass/__init__.py | 50 ++++++ src/raven_python/biomass/config.py | 95 +++++++++++ src/raven_python/biomass/gam.py | 67 ++++++++ src/raven_python/biomass/scale.py | 207 ++++++++++++++++++++++ tests/test_biomass.py | 246 +++++++++++++++++++++++++++ 5 files changed, 665 insertions(+) create mode 100644 src/raven_python/biomass/__init__.py create mode 100644 src/raven_python/biomass/config.py create mode 100644 src/raven_python/biomass/gam.py create mode 100644 src/raven_python/biomass/scale.py create mode 100644 tests/test_biomass.py diff --git a/src/raven_python/biomass/__init__.py b/src/raven_python/biomass/__init__.py new file mode 100644 index 0000000..6b5ac49 --- /dev/null +++ b/src/raven_python/biomass/__init__.py @@ -0,0 +1,50 @@ +"""Biomass equation manipulation — growth-associated maintenance, amino- +acid ratios, component scaling, and biomass-fraction reporting. + +The yeast-GEM port (see yeast-GEM/code/python/PORTING_PLAN.md) was the +first consumer; the API is parameterised by a :class:`BiomassConfig` +so other GEMs can describe their own component layout. + +A typical caller assembles ``BiomassConfig`` once (often from a +project-level YAML) and passes it to every operation: + +.. code-block:: python + + cfg = BiomassConfig( + biomass_rxn="r_4041", + proton_met="s_0794", + components=[ + BiomassComponent("protein", "protein pseudoreaction", + mass_strategy="mw_minus_2h"), + BiomassComponent("carbohydrate", "carbohydrate pseudoreaction"), + BiomassComponent("lipid_backbone", "lipid backbone pseudoreaction", + mass_strategy="grams"), + BiomassComponent("RNA", "RNA pseudoreaction", + mass_strategy="mw_minus_water"), + BiomassComponent("DNA", "DNA pseudoreaction", + mass_strategy="mw_minus_water"), + BiomassComponent("ion", "ion pseudoreaction"), + BiomassComponent("cofactor", "cofactor pseudoreaction"), + ], + ) + fractions = sum_biomass(model, cfg) # {'protein': 0.46, ...} + scale_biomass(model, cfg, "protein", 0.50, balance_out="carbohydrate") + set_gam(model, 80, biomass_rxn=cfg.biomass_rxn, + cofactor_met_names=("ATP", "ADP", "H2O", "H+", "phosphate")) +""" +from raven_python.biomass.config import BiomassComponent, BiomassConfig +from raven_python.biomass.gam import set_gam +from raven_python.biomass.scale import ( + rescale_pseudoreaction, + scale_biomass, + sum_biomass, +) + +__all__ = [ + "BiomassComponent", + "BiomassConfig", + "rescale_pseudoreaction", + "scale_biomass", + "set_gam", + "sum_biomass", +] diff --git a/src/raven_python/biomass/config.py b/src/raven_python/biomass/config.py new file mode 100644 index 0000000..c0ee453 --- /dev/null +++ b/src/raven_python/biomass/config.py @@ -0,0 +1,95 @@ +"""Configuration types for the biomass module.""" +from __future__ import annotations + +from collections.abc import Iterable +from dataclasses import dataclass, field +from typing import Literal + +#: Strategies for computing a component's mass contribution from the +#: stoichiometry of its pseudoreaction. See :func:`sum_biomass`. +#: +#: ``"mw"`` multiply each substrate coefficient by the +#: molecular weight derived from its +#: ``formula`` (g/mol), divide by 1000 to get +#: g/gDW. Suits carbohydrate / ion / cofactor. +#: ``"mw_minus_2h"`` as ``"mw"`` but with each MW reduced by +#: 2.016 g/mol (two protons released per +#: charged tRNA on protein-pseudoreaction +#: substrates). +#: ``"mw_minus_water"`` as ``"mw"`` but with each MW reduced by +#: 18.015 g/mol (one water released per +#: polymerisation step; RNA / DNA). +#: ``"grams"`` substrate coefficients are already in +#: g/gDW (e.g. the lipid-backbone +#: pseudoreaction). No MW lookup needed. +MassStrategy = Literal["mw", "mw_minus_2h", "mw_minus_water", "grams"] + + +@dataclass(frozen=True) +class BiomassComponent: + """One component of the biomass equation (protein, carbohydrate, …). + + Attributes + ---------- + name + Canonical short name — also the ``model.metabolites`` name of + the metabolite that the pseudoreaction *produces*. Used by + :func:`rescale_pseudoreaction` to identify the product side. + pseudoreaction_name + ``model.reactions[*].name`` of the pseudoreaction. + cobrapy reactions are looked up by ``name`` (not ``id``) + because that is how the yeast-GEM convention names them. + mass_strategy + How to convert the pseudoreaction's substrates into a mass + fraction. See :data:`MassStrategy` for the choices and their + offsets. + """ + + name: str + pseudoreaction_name: str + mass_strategy: MassStrategy = "mw" + + +@dataclass(frozen=True) +class BiomassConfig: + """Container for the per-organism biomass layout. + + Attributes + ---------- + biomass_rxn + ``model.reactions[*].id`` of the top-level biomass + pseudoreaction (the one whose flux is the growth rate). + proton_met + ``model.metabolites[*].id`` of the cytosolic H+ metabolite — + used by :func:`rescale_pseudoreaction` to keep each + pseudoreaction charge-balanced after rescaling. + components + Ordered tuple of :class:`BiomassComponent` describing every + non-top-level pseudoreaction that contributes to mass. + """ + + biomass_rxn: str + proton_met: str + components: tuple[BiomassComponent, ...] = field(default_factory=tuple) + + def __post_init__(self) -> None: + # Freeze the components into a tuple even if a list was passed. + if not isinstance(self.components, tuple): + object.__setattr__(self, "components", tuple(self.components)) + + @classmethod + def from_components( + cls, + biomass_rxn: str, + proton_met: str, + components: Iterable[BiomassComponent], + ) -> BiomassConfig: + return cls(biomass_rxn=biomass_rxn, proton_met=proton_met, + components=tuple(components)) + + def get(self, component_name: str) -> BiomassComponent: + """Look up a component by name (raises ``KeyError`` if missing).""" + for c in self.components: + if c.name == component_name: + return c + raise KeyError(f"BiomassConfig has no component named {component_name!r}") diff --git a/src/raven_python/biomass/gam.py b/src/raven_python/biomass/gam.py new file mode 100644 index 0000000..15a28be --- /dev/null +++ b/src/raven_python/biomass/gam.py @@ -0,0 +1,67 @@ +"""Growth-associated maintenance (GAM) and non-growth maintenance (NGAM). + +Ports the generic part of yeast-GEM's ``changeGAM.m``. +""" +from __future__ import annotations + +from collections.abc import Iterable + +import cobra + + +def set_gam( + model: cobra.Model, + value: float, + *, + biomass_rxn: str, + cofactor_met_names: Iterable[str], + ngam_rxn: str | None = None, + ngam_value: float | None = None, +) -> cobra.Model: + """Set GAM (and optionally NGAM) on a model in place. + + Scales every metabolite participating in the biomass pseudoreaction + whose ``name`` is in ``cofactor_met_names`` (e.g. ATP, ADP, H2O, + H+, phosphate) to ``±value``, preserving the sign of its current + coefficient. + + If both ``ngam_rxn`` and ``ngam_value`` are given, the NGAM + reaction's bounds are fixed at ``(ngam_value, ngam_value)``. + + Parameters + ---------- + model + cobra model to mutate. + value + New GAM value (mmol ATP / gDW per growth unit). + biomass_rxn + Reaction id of the biomass pseudoreaction whose stoichiometry + carries the GAM coefficients. + cofactor_met_names + Iterable of metabolite *names* (not ids) — every metabolite in + the biomass pseudoreaction whose name is in this set will be + rescaled. yeast-GEM uses ``{"ATP", "ADP", "H2O", "H+", "phosphate"}``. + ngam_rxn, ngam_value + Optional NGAM update. If both are provided, the reaction's + bounds are set to ``(ngam_value, ngam_value)`` (equality + constraint). + + Returns the (mutated) model for chaining. + """ + rxn = model.reactions.get_by_id(biomass_rxn) + cofactor_set = set(cofactor_met_names) + + deltas: dict[cobra.Metabolite, float] = {} + for met, coef in rxn.metabolites.items(): + if met.name in cofactor_set: + target = (1 if coef > 0 else -1) * float(value) + # combine=True adds delta → new total = target + deltas[met] = target - coef + if deltas: + rxn.add_metabolites(deltas, combine=True) + + if ngam_rxn is not None and ngam_value is not None: + ngam = model.reactions.get_by_id(ngam_rxn) + ngam.bounds = (float(ngam_value), float(ngam_value)) + + return model diff --git a/src/raven_python/biomass/scale.py b/src/raven_python/biomass/scale.py new file mode 100644 index 0000000..2dc8930 --- /dev/null +++ b/src/raven_python/biomass/scale.py @@ -0,0 +1,207 @@ +"""Biomass component summing and rescaling. + +Ports the generic core of yeast-GEM's ``sumBioMass.m`` / +``scaleBioMass.m`` / ``rescalePseudoReaction.m`` into a single +parameterised module that other GEMs can configure via +:class:`BiomassConfig`. +""" +from __future__ import annotations + +import re + +import cobra + +from raven_python.biomass.config import BiomassComponent, BiomassConfig + +# Elemental atomic weights used to translate formulas into g/mol. +# Mirrors the table baked into yeast-GEM's ``sumBioMass.m``; pseudo- +# element "R" is treated as zero mass (placeholder for residues). +_ELEMENT_MASS_G_PER_MOL: dict[str, float] = { + "C": 12.01, "H": 1.008, "N": 14.007, "O": 15.999, + "P": 30.974, "S": 32.06, "R": 0.0, + "Fe": 55.845, "K": 39.098, "Na": 22.99, "Cl": 35.45, + "Mn": 54.938, "Zn": 65.38, "Ca": 40.078, "Mg": 24.305, "Cu": 63.546, +} + +_FORMULA_TOKEN_RE = re.compile(r"([A-Z][a-z]*)(\d*)") + + +def sum_biomass(model: cobra.Model, config: BiomassConfig) -> dict[str, float]: + """Mass-fraction (g/gDW) per biomass component plus the total. + + Mirrors yeast-GEM's ``sumBioMass.m``. Returns a dict keyed by each + :class:`BiomassComponent` name plus ``"total"``. Components whose + pseudoreaction is missing from the model contribute 0 (logged as a + warning). + + Substrate detection: in each component's pseudoreaction, the + substrate side is the metabolites with negative coefficient — the + same convention yeast-GEM uses. + """ + fractions: dict[str, float] = {} + total = 0.0 + for comp in config.components: + f = _component_fraction(model, comp) + fractions[comp.name] = f + total += f + fractions["total"] = total + return fractions + + +def rescale_pseudoreaction( + model: cobra.Model, + config: BiomassConfig, + component_name: str, + factor: float, +) -> None: + """Multiply the substrate coefficients of one component pseudoreaction + by ``factor`` and rebalance H+ to preserve charge neutrality. + + Ports ``rescalePseudoReaction.m``. "Substrate" here means any + metabolite whose ``name`` is not the component's product name; the + component's own metabolite (e.g. ``protein`` for the protein + pseudoreaction) is left untouched. After the rescaling the + coefficient of ``config.proton_met`` is recomputed so the + reaction's total ionic charge sums to zero. + """ + comp = config.get(component_name) + rxn = _find_pseudoreaction(model, comp) + proton_met = model.metabolites.get_by_id(config.proton_met) + + # Step 1: scale substrate coefficients (everything but the component + # product). Use a fresh dict to avoid mutating during iteration. + deltas: dict[cobra.Metabolite, float] = {} + for met, coef in rxn.metabolites.items(): + if met.name == comp.name: + continue # the product — leave alone + deltas[met] = (factor - 1.0) * coef # combine=True adds (factor-1)*coef + if deltas: + rxn.add_metabolites(deltas, combine=True) + + # Step 2: H+ rebalance. The legacy code first zeroes H+, then sets + # it to negate the rxn's current ionic charge. With combine=True we + # express the same with two `_set_coefficient` calls. + _set_coefficient(rxn, proton_met, 0.0) + total_charge = sum( + (m.charge or 0) * coef for m, coef in rxn.metabolites.items() + ) + _set_coefficient(rxn, proton_met, -total_charge) + + +def scale_biomass( + model: cobra.Model, + config: BiomassConfig, + component_name: str, + new_value: float, + *, + balance_out: str | None = None, +) -> None: + """Rescale a component to a new mass fraction, optionally balancing + out a second component to keep the total at 1 g/gDW. + + Ports ``scaleBioMass.m``. The rescaling factor is derived from + :func:`sum_biomass` on the *current* model state, so call this with + an in-place ``model`` mutation, not on a stale snapshot. + """ + fractions = sum_biomass(model, config) + current = fractions[component_name] + if current == 0: + raise ValueError( + f"Cannot scale {component_name!r} to {new_value}: current " + "fraction is 0 (pseudoreaction missing or empty)." + ) + factor = new_value / current + rescale_pseudoreaction(model, config, component_name, factor) + + if balance_out is not None: + # Recompute X after the first rescaling. + fractions = sum_biomass(model, config) + total = fractions["total"] + balance_current = fractions[balance_out] + if balance_current == 0: + return # nothing we can do + balance_factor = (balance_current + (1.0 - total)) / balance_current + rescale_pseudoreaction(model, config, balance_out, balance_factor) + + +# --- internals --------------------------------------------------------- + +def _component_fraction(model: cobra.Model, comp: BiomassComponent) -> float: + """Compute g/gDW of a single component from its pseudoreaction.""" + try: + rxn = _find_pseudoreaction(model, comp) + except KeyError: + return 0.0 # missing pseudoreaction → contributes 0 + substrates = [(m, c) for m, c in rxn.metabolites.items() if c < 0] + if not substrates: + return 0.0 + + if comp.mass_strategy == "grams": + # Stoichiometry already in g/gDW; just sum the (negative) coefs + # and flip sign. + return -sum(c for _m, c in substrates) + + offset = _mw_offset(comp.mass_strategy) + mass_g = 0.0 + for met, coef in substrates: + mw = _formula_mw(met.formula or "") + if mw == 0: + raise ValueError( + f"Metabolite {met.id} ({met.name!r}) has an empty " + "formula; cannot compute mass for biomass summing." + ) + mw_corrected = mw + offset + mass_g += -coef * mw_corrected + return mass_g / 1000.0 # g/mmol → g/gDW (matching yeast-GEM units) + + +def _mw_offset(strategy: str) -> float: + if strategy == "mw": + return 0.0 + if strategy == "mw_minus_2h": + # 2 × 1.008 = 2.016 g/mol per substrate (charged tRNAs release + # two protons on amino acylation). + return -2.016 + if strategy == "mw_minus_water": + # H2O = 18.015 g/mol per polymerisation step (RNA / DNA). + return -18.015 + raise ValueError(f"Unknown mass_strategy: {strategy!r}") + + +def _formula_mw(formula: str) -> float: + """Molecular weight in g/mol from a Hill-style chemical formula.""" + if not formula: + return 0.0 + mw = 0.0 + for element, count_str in _FORMULA_TOKEN_RE.findall(formula): + if not element: + continue + try: + atomic = _ELEMENT_MASS_G_PER_MOL[element] + except KeyError as exc: + raise ValueError( + f"Unknown element {element!r} in formula {formula!r}; " + "extend raven_python.biomass.scale._ELEMENT_MASS_G_PER_MOL " + "if the element is real." + ) from exc + count = int(count_str) if count_str else 1 + mw += atomic * count + return mw + + +def _find_pseudoreaction(model: cobra.Model, comp: BiomassComponent) -> cobra.Reaction: + for rxn in model.reactions: + if rxn.name == comp.pseudoreaction_name: + return rxn + raise KeyError( + f"Component {comp.name!r}: no reaction named " + f"{comp.pseudoreaction_name!r} in model." + ) + + +def _set_coefficient(rxn: cobra.Reaction, met: cobra.Metabolite, value: float) -> None: + """Land on ``S[met, rxn] = value`` via cobra's combine=True API.""" + current = rxn.metabolites.get(met, 0.0) + delta = float(value) - current + if delta != 0: + rxn.add_metabolites({met: delta}, combine=True) diff --git a/tests/test_biomass.py b/tests/test_biomass.py new file mode 100644 index 0000000..5e96ffd --- /dev/null +++ b/tests/test_biomass.py @@ -0,0 +1,246 @@ +"""Tests for raven_python.biomass.""" +from __future__ import annotations + +import cobra +import pytest + +from raven_python.biomass import ( + BiomassComponent, + BiomassConfig, + rescale_pseudoreaction, + scale_biomass, + set_gam, + sum_biomass, +) +from raven_python.biomass.scale import _formula_mw + +# --- formula MW helper ---------------------------------------------- + +@pytest.mark.parametrize( + "formula, expected", + [ + ("H2O", 18.015), # 2*1.008 + 15.999 + ("ATP", 0.0), # invalid: no element 'ATP'... we raise + ("CO2", 44.008), # 12.01 + 2*15.999 + ("C6H12O6", 180.156), # 6*12.01 + 12*1.008 + 6*15.999 + ], +) +def test_formula_mw(formula, expected): + if formula == "ATP": + with pytest.raises(ValueError, match="Unknown element"): + _formula_mw(formula) + else: + assert _formula_mw(formula) == pytest.approx(expected, rel=1e-4) + + +# --- toy biomass model ---------------------------------------------- + +def _toy_biomass_model() -> tuple[cobra.Model, BiomassConfig]: + """Tiny but realistic biomass layout: + + - protein pseudoreaction: 2 amino-acid tRNAs → protein (mass_strategy mw_minus_2h) + - carbohydrate pseudoreaction: glucose → carbohydrate (mass_strategy mw) + - biomass pseudoreaction: protein + carbohydrate + GAM cofactors → biomass + """ + m = cobra.Model("toy") + m.compartments = {"c": "cytoplasm"} + + aa1 = cobra.Metabolite("aa1_c", name="alanine-tRNA", compartment="c", + charge=0, formula="C3H6N1O2") # MW ≈ 88 + aa2 = cobra.Metabolite("aa2_c", name="glycine-tRNA", compartment="c", + charge=0, formula="C2H4N1O2") # MW ≈ 74 + glc = cobra.Metabolite("glc_c", name="glucose", compartment="c", + charge=0, formula="C6H12O6") # MW = 180.156 + protein = cobra.Metabolite("protein_c", name="protein", compartment="c") + carb = cobra.Metabolite("carb_c", name="carbohydrate", compartment="c") + biomass = cobra.Metabolite("biomass_c", name="biomass", compartment="c") + h = cobra.Metabolite("h_c", name="H+", compartment="c", charge=1) + atp = cobra.Metabolite("atp_c", name="ATP", compartment="c", charge=-4) + adp = cobra.Metabolite("adp_c", name="ADP", compartment="c", charge=-3) + h2o = cobra.Metabolite("h2o_c", name="H2O", compartment="c", charge=0) + pi = cobra.Metabolite("pi_c", name="phosphate", compartment="c", charge=-2) + m.add_metabolites([aa1, aa2, glc, protein, carb, biomass, h, atp, adp, h2o, pi]) + + prot_rxn = cobra.Reaction("PROT_pseudo") + prot_rxn.name = "protein pseudoreaction" + prot_rxn.add_metabolites({aa1: -0.5, aa2: -0.5, protein: 1, h: 1}) + carb_rxn = cobra.Reaction("CARB_pseudo") + carb_rxn.name = "carbohydrate pseudoreaction" + carb_rxn.add_metabolites({glc: -0.001, carb: 1}) # 1 mmol → 180 mg + bio_rxn = cobra.Reaction("BIO_pseudo") + bio_rxn.name = "biomass pseudoreaction" + bio_rxn.add_metabolites({ + protein: -1, carb: -1, biomass: 1, + atp: -50, h2o: -50, adp: 50, h: 50, pi: 50, # GAM = 50 + }) + m.add_reactions([prot_rxn, carb_rxn, bio_rxn]) + + cfg = BiomassConfig( + biomass_rxn="BIO_pseudo", + proton_met="h_c", + components=( + BiomassComponent("protein", "protein pseudoreaction", + mass_strategy="mw_minus_2h"), + BiomassComponent("carbohydrate", "carbohydrate pseudoreaction", + mass_strategy="mw"), + ), + ) + return m, cfg + + +# --- sum_biomass ----------------------------------------------------- + +def test_sum_biomass_keys_match_components_plus_total(): + m, cfg = _toy_biomass_model() + out = sum_biomass(m, cfg) + assert set(out) == {"protein", "carbohydrate", "total"} + + +def test_sum_biomass_protein_uses_minus_2h_offset(): + m, cfg = _toy_biomass_model() + out = sum_biomass(m, cfg) + # aa1 (alanine-tRNA, C3H6N1O2, MW ≈ 88.09): MW - 2.016 + # aa2 (glycine-tRNA, C2H4N1O2, MW ≈ 74.07): MW - 2.016 + # contribs: 0.5*(88.09 - 2.016) + 0.5*(74.07 - 2.016) ≈ 79.066, /1000 + assert out["protein"] == pytest.approx(0.079066, rel=1e-3) + + +def test_sum_biomass_carbohydrate_uses_plain_mw(): + m, cfg = _toy_biomass_model() + out = sum_biomass(m, cfg) + # 0.001 mmol glucose * 180.156 g/mol / 1000 = 0.000180156 + assert out["carbohydrate"] == pytest.approx(0.000180156, rel=1e-3) + + +def test_sum_biomass_total_is_components_sum(): + m, cfg = _toy_biomass_model() + out = sum_biomass(m, cfg) + assert out["total"] == pytest.approx(out["protein"] + out["carbohydrate"]) + + +def test_sum_biomass_grams_strategy(): + m, cfg = _toy_biomass_model() + # Add a lipid component whose stoichiometry is already in grams. + lipid = cobra.Metabolite("lipid_c", name="lipid backbone", compartment="c") + m.add_metabolites([lipid]) + rxn = cobra.Reaction("LIPID_pseudo") + rxn.name = "lipid backbone pseudoreaction" + rxn.add_metabolites({lipid: 1, m.metabolites.get_by_id("glc_c"): -0.05}) + m.add_reactions([rxn]) + cfg2 = BiomassConfig.from_components( + cfg.biomass_rxn, cfg.proton_met, + [*cfg.components, + BiomassComponent("lipid_backbone", "lipid backbone pseudoreaction", + mass_strategy="grams")], + ) + out = sum_biomass(m, cfg2) + assert out["lipid_backbone"] == pytest.approx(0.05) + + +def test_sum_biomass_missing_pseudoreaction_contributes_zero(): + m, cfg = _toy_biomass_model() + cfg2 = BiomassConfig.from_components( + cfg.biomass_rxn, cfg.proton_met, + [*cfg.components, BiomassComponent("DNA", "DNA pseudoreaction")], + ) + out = sum_biomass(m, cfg2) + assert out["DNA"] == 0 + + +# --- rescale_pseudoreaction ----------------------------------------- + +def test_rescale_pseudoreaction_doubles_substrate_coefs(): + m, cfg = _toy_biomass_model() + rxn = m.reactions.get_by_id("PROT_pseudo") + aa1 = m.metabolites.get_by_id("aa1_c") + before = rxn.metabolites[aa1] + rescale_pseudoreaction(m, cfg, "protein", 2.0) + assert rxn.metabolites[aa1] == pytest.approx(2 * before) + + +def test_rescale_pseudoreaction_leaves_product_alone(): + m, cfg = _toy_biomass_model() + rxn = m.reactions.get_by_id("PROT_pseudo") + protein = m.metabolites.get_by_id("protein_c") + before = rxn.metabolites[protein] + rescale_pseudoreaction(m, cfg, "protein", 3.0) + assert rxn.metabolites[protein] == before + + +def test_rescale_pseudoreaction_rebalances_h_for_charge(): + m, cfg = _toy_biomass_model() + # Add a non-zero-charge substrate so H+ balance is meaningful. + rxn = m.reactions.get_by_id("PROT_pseudo") + aa1 = m.metabolites.get_by_id("aa1_c") + aa1.charge = -1 # negatively charged + rescale_pseudoreaction(m, cfg, "protein", 2.0) + total = sum((m_.charge or 0) * c for m_, c in rxn.metabolites.items()) + assert total == pytest.approx(0) + + +# --- scale_biomass -------------------------------------------------- + +def test_scale_biomass_lands_on_new_value(): + m, cfg = _toy_biomass_model() + target = 0.40 + scale_biomass(m, cfg, "protein", target) + out = sum_biomass(m, cfg) + assert out["protein"] == pytest.approx(target, rel=1e-6) + + +def test_scale_biomass_with_balance_keeps_total_at_one(): + m, cfg = _toy_biomass_model() + # Force a known starting total ≠ 1 by tweaking the toy model. + scale_biomass(m, cfg, "protein", 0.6, balance_out="carbohydrate") + out = sum_biomass(m, cfg) + assert out["total"] == pytest.approx(1.0, rel=1e-6) + + +def test_scale_biomass_zero_current_raises(): + m, cfg = _toy_biomass_model() + # Build a config that references a missing component → 0 current + cfg2 = BiomassConfig.from_components( + cfg.biomass_rxn, cfg.proton_met, + [*cfg.components, BiomassComponent("DNA", "DNA pseudoreaction")], + ) + with pytest.raises(ValueError, match="current"): + scale_biomass(m, cfg2, "DNA", 0.1) + + +# --- set_gam -------------------------------------------------------- + +def test_set_gam_scales_cofactor_coefficients(): + m, cfg = _toy_biomass_model() + set_gam(m, 80, biomass_rxn=cfg.biomass_rxn, + cofactor_met_names=("ATP", "ADP", "H2O", "H+", "phosphate")) + bio = m.reactions.get_by_id(cfg.biomass_rxn) + atp = m.metabolites.get_by_id("atp_c") + adp = m.metabolites.get_by_id("adp_c") + h2o = m.metabolites.get_by_id("h2o_c") + pi = m.metabolites.get_by_id("pi_c") + assert bio.metabolites[atp] == -80 + assert bio.metabolites[adp] == 80 + assert bio.metabolites[h2o] == -80 + assert bio.metabolites[pi] == 80 + + +def test_set_gam_with_ngam_sets_equality_bounds(): + m, _ = _toy_biomass_model() + ngam = cobra.Reaction("NGAM", lower_bound=0, upper_bound=1000) + atp = m.metabolites.get_by_id("atp_c") + ngam.add_metabolites({atp: -1}) + m.add_reactions([ngam]) + set_gam(m, 80, biomass_rxn="BIO_pseudo", + cofactor_met_names=("ATP",), + ngam_rxn="NGAM", ngam_value=1.5) + assert m.reactions.get_by_id("NGAM").bounds == (1.5, 1.5) + + +def test_set_gam_ignores_non_cofactor_mets(): + m, cfg = _toy_biomass_model() + bio = m.reactions.get_by_id(cfg.biomass_rxn) + protein = m.metabolites.get_by_id("protein_c") + before = bio.metabolites[protein] + set_gam(m, 80, biomass_rxn=cfg.biomass_rxn, + cofactor_met_names=("ATP",)) + assert bio.metabolites[protein] == before From b4828593dd98187d0f1b459b221d979af528a0e5 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sat, 30 May 2026 22:19:54 +0200 Subject: [PATCH 3/8] Add raven_python.manipulation.find_duplicate_reactions (detection variant) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Detection-only counterpart to remove_duplicate_reactions. Returns duplicate groups instead of mutating the model. Ignores bounds / GPR / objective — only stoichiometry is compared, mirroring the typical curation use case ("find reactions that could be merged"). A new ``ignore_direction=True`` default (yeast-GEM convention) treats A→B and B→A as duplicates. Set False to require identical orientation. Used by yeast-GEM's modelTests port (Tier 3 / phase 5) to flag duplicate reactions during curation review. --- src/raven_python/manipulation/__init__.py | 2 + src/raven_python/manipulation/simplify.py | 48 +++++++++++++ tests/test_manipulation_find_duplicate.py | 85 +++++++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100644 tests/test_manipulation_find_duplicate.py diff --git a/src/raven_python/manipulation/__init__.py b/src/raven_python/manipulation/__init__.py index 074c36f..a462949 100644 --- a/src/raven_python/manipulation/__init__.py +++ b/src/raven_python/manipulation/__init__.py @@ -10,6 +10,7 @@ from .remove import remove_genes, remove_metabolites from .simplify import ( constrain_reversible_reactions, + find_duplicate_reactions, group_linear_reactions, remove_dead_end_reactions, remove_duplicate_reactions, @@ -26,6 +27,7 @@ "constrain_reversible_reactions", "convert_to_irreversible", "expand_model", + "find_duplicate_reactions", "group_linear_reactions", "merge_models", "remove_dead_end_reactions", diff --git a/src/raven_python/manipulation/simplify.py b/src/raven_python/manipulation/simplify.py index 2deaccd..d05c9e6 100644 --- a/src/raven_python/manipulation/simplify.py +++ b/src/raven_python/manipulation/simplify.py @@ -82,6 +82,54 @@ def _signature(rxn): return (mets, rxn.lower_bound, rxn.upper_bound, rxn.objective_coefficient) +def _stoich_signature(rxn, *, ignore_direction: bool) -> frozenset: + """Signature considering stoichiometry only (used by find_duplicate_reactions). + + When ``ignore_direction`` is True, ``A → B`` and ``B → A`` (the same + reaction with all coefficients negated) share a signature. Both + orientations are accumulated and the lexicographically smaller one + wins so the dict-key lookup is direction-symmetric. + """ + forward = frozenset((m.id, c) for m, c in rxn.metabolites.items()) + if not ignore_direction: + return forward + backward = frozenset((m.id, -c) for m, c in rxn.metabolites.items()) + return min(forward, backward, key=lambda s: sorted(s)) + + +def find_duplicate_reactions( + model: cobra.Model, + *, + ignore_direction: bool = True, +) -> list[list[cobra.Reaction]]: + """Return groups of reactions that share identical stoichiometry. + + Detection-only counterpart to :func:`remove_duplicate_reactions`. + Bounds, objective coefficients, GPRs and annotations are ignored — + only stoichiometry is compared, mirroring the legacy yeast-GEM + ``findDuplicatedRxns`` and matching the typical curation use case + (find reactions that *could* be merged). + + Parameters + ---------- + ignore_direction + When True (default), ``A → B`` and ``B → A`` are treated as + duplicates (yeast-GEM's convention). Set ``False`` to require + identical orientation. + + Returns + ------- + A list of duplicate groups. Each group is itself a list with + ≥ 2 reactions sharing the same stoichiometry. Reactions that have + no duplicate are omitted. + """ + groups: dict[frozenset, list[cobra.Reaction]] = {} + for rxn in model.reactions: + sig = _stoich_signature(rxn, ignore_direction=ignore_direction) + groups.setdefault(sig, []).append(rxn) + return [g for g in groups.values() if len(g) >= 2] + + def remove_duplicate_reactions( model: cobra.Model, *, reserved: Iterable[str] | None = None ) -> list[str]: diff --git a/tests/test_manipulation_find_duplicate.py b/tests/test_manipulation_find_duplicate.py new file mode 100644 index 0000000..692556e --- /dev/null +++ b/tests/test_manipulation_find_duplicate.py @@ -0,0 +1,85 @@ +"""Tests for raven_python.manipulation.find_duplicate_reactions.""" +from __future__ import annotations + +import cobra + +from raven_python.manipulation import find_duplicate_reactions + + +def _mk_model() -> cobra.Model: + m = cobra.Model("m") + a = cobra.Metabolite("a_c", compartment="c") + b = cobra.Metabolite("b_c", compartment="c") + c = cobra.Metabolite("c_c", compartment="c") + m.add_metabolites([a, b, c]) + return m + + +def test_no_duplicates_returns_empty(): + m = _mk_model() + r1 = cobra.Reaction("r1") + r1.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1}) + r2 = cobra.Reaction("r2") + r2.add_metabolites({m.metabolites.b_c: -1, m.metabolites.c_c: 1}) + m.add_reactions([r1, r2]) + assert find_duplicate_reactions(m) == [] + + +def test_same_stoichiometry_grouped(): + m = _mk_model() + r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r2 = cobra.Reaction("r2", lower_bound=-100, upper_bound=100) + for r in (r1, r2): + r.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1}) + m.add_reactions([r1, r2]) + groups = find_duplicate_reactions(m) + assert len(groups) == 1 + assert {r.id for r in groups[0]} == {"r1", "r2"} + + +def test_ignore_direction_default_groups_reverse_pair(): + """yeast-GEM's findDuplicatedRxns matches A→B with B→A. That's the default.""" + m = _mk_model() + r1 = cobra.Reaction("r1") + r1.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1}) + r2 = cobra.Reaction("r2") + r2.add_metabolites({m.metabolites.a_c: 1, m.metabolites.b_c: -1}) # reversed + m.add_reactions([r1, r2]) + groups = find_duplicate_reactions(m) + assert len(groups) == 1 + + +def test_ignore_direction_false_keeps_them_separate(): + m = _mk_model() + r1 = cobra.Reaction("r1") + r1.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1}) + r2 = cobra.Reaction("r2") + r2.add_metabolites({m.metabolites.a_c: 1, m.metabolites.b_c: -1}) + m.add_reactions([r1, r2]) + assert find_duplicate_reactions(m, ignore_direction=False) == [] + + +def test_three_duplicates_in_one_group(): + m = _mk_model() + rxns = [] + for i in range(3): + r = cobra.Reaction(f"r{i}") + r.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1}) + rxns.append(r) + m.add_reactions(rxns) + groups = find_duplicate_reactions(m) + assert len(groups) == 1 + assert len(groups[0]) == 3 + + +def test_ignores_bounds_and_gpr_differences(): + """Bounds and GPRs are intentionally ignored — only stoichiometry.""" + m = _mk_model() + r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000) + r2 = cobra.Reaction("r2", lower_bound=-50, upper_bound=50) + r1.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1}) + r2.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1}) + r1.gene_reaction_rule = "g1" + r2.gene_reaction_rule = "g2" + m.add_reactions([r1, r2]) + assert len(find_duplicate_reactions(m)) == 1 From 4d81625c9d7a0c641fc77fb5cad9554b090de3fc Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sat, 30 May 2026 23:06:03 +0200 Subject: [PATCH 4/8] =?UTF-8?q?Add=20raven=5Fpython.curation=20=E2=80=94?= =?UTF-8?q?=20batch=5Fcurate=20/=20batch=5Fcurate=5Ffrom=5Ftsv?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generic batch curation engine extracted from yeast-GEM's MATLAB curateMetsRxnsGenes (yeast-GEM phase 6). Adds or updates metabolites, reactions and genes from pandas DataFrames; a batch_curate_from_tsv convenience wrapper reads the equivalent TSVs. Schema (matches yeast-GEM's data/modelCuration/template/ layout): mets_df metNames, comps, formula, charge, inchi, metNotes + any number of MIRIAM-namespace columns genes_df genes, geneShortNames + MIRIAM columns rxns_df rxnNames, grRules, lb, ub, rev, subSystems, eccodes, rxnNotes, rxnReferences, rxnConfidenceScores + MIRIAM columns rxns_coeffs_df rxnNames, metNames, comps, coefficient (one row per (reaction, metabolite) pair) Match keys: metabolites — (name, compartment) tuple genes — gene id reactions — stoichiometric signature Existing entities get their annotations overwritten (warning emitted); new entities are added with fresh ids generated from the supplied ``met_id_prefix`` / ``rxn_id_prefix`` (defaults M_ / R_ per the BiGG convention; yeast-GEM passes s_ / r_). Width of the existing zero-padded suffix is preserved so s_0001 → s_0002, not s_2. "Everything after the core columns is MIRIAM" — the header of any extra column becomes the annotation namespace key. Matches MATLAB behaviour exactly so yeast-GEM's existing TSVs work unchanged, and projects with different MIRIAM column sets need no code change. CurationResult dataclass records what was added vs updated so callers can verify in tests / CI. Tests: 13 new (add/update mets, add/update genes, add/update rxns by stoichiometry, miriam auto-detect, id-width preservation, combined mets+rxns in one call, missing-metabolite error, batch_curate_from_tsv round trip). --- src/raven_python/curation/__init__.py | 54 +++ src/raven_python/curation/batch.py | 458 ++++++++++++++++++++++++++ tests/test_curation.py | 247 ++++++++++++++ 3 files changed, 759 insertions(+) create mode 100644 src/raven_python/curation/__init__.py create mode 100644 src/raven_python/curation/batch.py create mode 100644 tests/test_curation.py diff --git a/src/raven_python/curation/__init__.py b/src/raven_python/curation/__init__.py new file mode 100644 index 0000000..de8b8cc --- /dev/null +++ b/src/raven_python/curation/__init__.py @@ -0,0 +1,54 @@ +"""Batch curation of metabolites / reactions / genes from data tables. + +Port of yeast-GEM's MATLAB ``curateMetsRxnsGenes`` into a generic +DataFrame-driven engine. Other GEM projects (Human-GEM, custom +reconstructions, …) can use the same machinery with their own TSV +layouts; the only required pieces are the data tables and (optionally) +project-specific id prefixes for fresh metabolites and reactions. + +Public API: + +* :func:`batch_curate` — entrypoint taking pandas DataFrames. +* :func:`batch_curate_from_tsv` — file-path convenience wrapper. +* :class:`CurationResult` — record of what was added / updated. + +Schema (mirrors yeast-GEM's ``data/modelCuration/template/`` layout): + +- **mets_df**: ``metNames, comps, formula, charge, inchi, metNotes`` + + any number of MIRIAM-annotation columns. Match key is + ``(name, comp)``. +- **genes_df**: ``genes, geneShortNames`` + MIRIAM columns. Match key + is ``genes``. +- **rxns_df**: ``rxnNames, grRules, lb, ub, rev, subSystems, eccodes, + rxnNotes, rxnReferences, rxnConfidenceScores`` + MIRIAM columns. + Match key is the reaction's *stoichiometry* — same metabolites and + coefficients ⇒ same reaction. +- **rxns_coeffs_df**: ``rxnNames, metNames, comps, coefficient``. One + row per ``(reaction, metabolite)`` pair. The ``rxnNames`` column + links each coefficient back to a row in ``rxns_df``. An optional + ``index`` first column from the legacy yeast-GEM schema is silently + ignored. + +Everything after the core columns in any of the four tables is +interpreted as a MIRIAM annotation: the column header becomes the +namespace key (``met.annotation[
] = ``). +""" +from raven_python.curation.batch import ( + DEFAULT_CORE_GENE_COLUMNS, + DEFAULT_CORE_MET_COLUMNS, + DEFAULT_CORE_RXN_COEFFS_COLUMNS, + DEFAULT_CORE_RXN_COLUMNS, + CurationResult, + batch_curate, + batch_curate_from_tsv, +) + +__all__ = [ + "DEFAULT_CORE_GENE_COLUMNS", + "DEFAULT_CORE_MET_COLUMNS", + "DEFAULT_CORE_RXN_COEFFS_COLUMNS", + "DEFAULT_CORE_RXN_COLUMNS", + "CurationResult", + "batch_curate", + "batch_curate_from_tsv", +] diff --git a/src/raven_python/curation/batch.py b/src/raven_python/curation/batch.py new file mode 100644 index 0000000..1bdeb23 --- /dev/null +++ b/src/raven_python/curation/batch.py @@ -0,0 +1,458 @@ +"""Generic batch-curation engine driven by tabular data.""" +from __future__ import annotations + +import warnings +from dataclasses import dataclass, field +from pathlib import Path + +import cobra +import pandas as pd + +#: Core columns recognised in ``mets_df``. Anything else is treated as a +#: MIRIAM annotation column (the header becomes the namespace key). +DEFAULT_CORE_MET_COLUMNS: tuple[str, ...] = ( + "metNames", "comps", "formula", "charge", "inchi", "metNotes", +) +#: Core columns recognised in ``genes_df``. +DEFAULT_CORE_GENE_COLUMNS: tuple[str, ...] = ("genes", "geneShortNames") +#: Core columns recognised in ``rxns_df``. +DEFAULT_CORE_RXN_COLUMNS: tuple[str, ...] = ( + "rxnNames", "grRules", "lb", "ub", "rev", + "subSystems", "eccodes", "rxnNotes", "rxnReferences", "rxnConfidenceScores", +) +#: Required columns in ``rxns_coeffs_df``. (Linkage column ``rxnNames`` +#: + one row per ``(reaction, metabolite)`` pair.) A leading ``index`` +#: column from the legacy yeast-GEM schema is silently ignored. +DEFAULT_CORE_RXN_COEFFS_COLUMNS: tuple[str, ...] = ( + "rxnNames", "metNames", "comps", "coefficient", +) + + +@dataclass +class CurationResult: + """Record of what :func:`batch_curate` added / updated. + + Each list holds the cobra ids of the affected entities, in the + order they were processed. + """ + + added_metabolites: list[str] = field(default_factory=list) + updated_metabolites: list[str] = field(default_factory=list) + added_genes: list[str] = field(default_factory=list) + updated_genes: list[str] = field(default_factory=list) + added_reactions: list[str] = field(default_factory=list) + updated_reactions: list[str] = field(default_factory=list) + + def __bool__(self) -> bool: + return bool( + self.added_metabolites or self.updated_metabolites + or self.added_genes or self.updated_genes + or self.added_reactions or self.updated_reactions + ) + + +# --- public entry points ---------------------------------------------- + +def batch_curate( + model: cobra.Model, + *, + mets_df: pd.DataFrame | None = None, + genes_df: pd.DataFrame | None = None, + rxns_df: pd.DataFrame | None = None, + rxns_coeffs_df: pd.DataFrame | None = None, + met_id_prefix: str = "M_", + rxn_id_prefix: str = "R_", +) -> CurationResult: + """Add or update metabolites / reactions / genes from data tables. + + Each table is optional. ``rxns_df`` and ``rxns_coeffs_df`` must be + provided together (one describes the per-reaction attributes, the + other carries the stoichiometric coefficients). + + Parameters + ---------- + model + Model to curate in place. + mets_df, genes_df, rxns_df, rxns_coeffs_df + Tables matching the schema described in + :mod:`raven_python.curation`. + met_id_prefix, rxn_id_prefix + Prefixes for freshly-generated metabolite / reaction ids + (e.g. ``s_`` and ``r_`` for yeast-GEM, ``M_`` and ``R_`` for + the cobrapy / BiGG default). New entity ids are formed by + finding the largest existing zero-padded suffix matching the + prefix and incrementing from there. + + Returns + ------- + A :class:`CurationResult` summarising the changes. + """ + result = CurationResult() + + if mets_df is not None: + _apply_mets(model, mets_df, met_id_prefix, result) + if genes_df is not None: + _apply_genes(model, genes_df, result) + + if (rxns_df is None) != (rxns_coeffs_df is None): + raise ValueError( + "rxns_df and rxns_coeffs_df must be provided together; got " + f"rxns_df={'set' if rxns_df is not None else 'None'}, " + f"rxns_coeffs_df=" + f"{'set' if rxns_coeffs_df is not None else 'None'}." + ) + if rxns_df is not None: + _apply_reactions(model, rxns_df, rxns_coeffs_df, rxn_id_prefix, result) + return result + + +def batch_curate_from_tsv( + model: cobra.Model, + *, + mets_tsv: str | Path | None = None, + genes_tsv: str | Path | None = None, + rxns_tsv: str | Path | None = None, + rxns_coeffs_tsv: str | Path | None = None, + met_id_prefix: str = "M_", + rxn_id_prefix: str = "R_", +) -> CurationResult: + """File-path convenience wrapper for :func:`batch_curate`. + + Each path is optional. TSVs are read with pandas' default + ``read_csv(sep='\\t')``; empty cells become ``NaN`` (not the + empty string), which the engine treats as "skip this field". + """ + def _read(path): + return pd.read_csv(path, sep="\t") if path is not None else None + + return batch_curate( + model, + mets_df=_read(mets_tsv), + genes_df=_read(genes_tsv), + rxns_df=_read(rxns_tsv), + rxns_coeffs_df=_read(rxns_coeffs_tsv), + met_id_prefix=met_id_prefix, + rxn_id_prefix=rxn_id_prefix, + ) + + +# --- metabolites ------------------------------------------------------ + +def _apply_mets( + model: cobra.Model, + df: pd.DataFrame, + id_prefix: str, + result: CurationResult, +) -> None: + miriam_cols = [c for c in df.columns if c not in DEFAULT_CORE_MET_COLUMNS] + name_index = _name_compartment_index(model) + + new_rows: list[pd.Series] = [] + for _, row in df.iterrows(): + name = row["metNames"] + comp = row["comps"] + existing = name_index.get((name, comp)) + if existing is not None: + _update_metabolite(existing, row, miriam_cols) + result.updated_metabolites.append(existing.id) + else: + new_rows.append(row) + + if new_rows: + new_mets = _build_new_metabolites(model, new_rows, miriam_cols, id_prefix) + model.add_metabolites(new_mets) + result.added_metabolites.extend(m.id for m in new_mets) + + if result.updated_metabolites: + warnings.warn( + f"{len(result.updated_metabolites)} metabolite(s) already " + "existed and were overwritten: " + f"{result.updated_metabolites[:5]}" + f"{'...' if len(result.updated_metabolites) > 5 else ''}", + stacklevel=4, + ) + + +def _name_compartment_index(model: cobra.Model) -> dict[tuple[str, str], cobra.Metabolite]: + return {(m.name, m.compartment): m for m in model.metabolites} + + +def _update_metabolite(met: cobra.Metabolite, row: pd.Series, miriam_cols: list[str]) -> None: + if _has_value(row.get("formula")): + met.formula = str(row["formula"]) + if _has_value(row.get("charge")): + met.charge = _coerce_int(row["charge"]) + if _has_value(row.get("inchi")): + met.annotation["inchi"] = str(row["inchi"]) + if _has_value(row.get("metNotes")): + met.notes["metNotes"] = str(row["metNotes"]) + _apply_miriam(met, row, miriam_cols) + + +def _build_new_metabolites( + model: cobra.Model, + rows: list[pd.Series], + miriam_cols: list[str], + id_prefix: str, +) -> list[cobra.Metabolite]: + ids = _generate_ids(model.metabolites, id_prefix, len(rows)) + new_mets: list[cobra.Metabolite] = [] + for new_id, row in zip(ids, rows, strict=True): + met = cobra.Metabolite( + id=new_id, + name=str(row["metNames"]), + compartment=str(row["comps"]), + ) + if _has_value(row.get("formula")): + met.formula = str(row["formula"]) + if _has_value(row.get("charge")): + met.charge = _coerce_int(row["charge"]) + if _has_value(row.get("inchi")): + met.annotation["inchi"] = str(row["inchi"]) + if _has_value(row.get("metNotes")): + met.notes["metNotes"] = str(row["metNotes"]) + _apply_miriam(met, row, miriam_cols) + new_mets.append(met) + return new_mets + + +# --- genes ------------------------------------------------------------ + +def _apply_genes(model: cobra.Model, df: pd.DataFrame, result: CurationResult) -> None: + miriam_cols = [c for c in df.columns if c not in DEFAULT_CORE_GENE_COLUMNS] + existing_genes = {g.id: g for g in model.genes} + + new_rows: list[pd.Series] = [] + for _, row in df.iterrows(): + gid = str(row["genes"]) + existing = existing_genes.get(gid) + if existing is not None: + _update_gene(existing, row, miriam_cols) + result.updated_genes.append(gid) + else: + new_rows.append(row) + + if new_rows: + # cobrapy doesn't have a direct "add a free-standing gene" API; + # genes are typically added via reactions. Use cobra.Gene + an + # explicit registration through the DictList. + from cobra.core.gene import Gene + + for row in new_rows: + g = Gene(str(row["genes"])) + if _has_value(row.get("geneShortNames")): + g.name = str(row["geneShortNames"]) + _apply_miriam(g, row, miriam_cols) + model.genes.append(g) + result.added_genes.append(g.id) + + if result.updated_genes: + warnings.warn( + f"{len(result.updated_genes)} gene(s) already existed and " + f"were overwritten: {result.updated_genes[:5]}" + f"{'...' if len(result.updated_genes) > 5 else ''}", + stacklevel=4, + ) + + +def _update_gene(gene, row: pd.Series, miriam_cols: list[str]) -> None: + if _has_value(row.get("geneShortNames")): + gene.name = str(row["geneShortNames"]) + _apply_miriam(gene, row, miriam_cols) + + +# --- reactions -------------------------------------------------------- + +def _apply_reactions( + model: cobra.Model, + rxns_df: pd.DataFrame, + coeffs_df: pd.DataFrame, + id_prefix: str, + result: CurationResult, +) -> None: + miriam_cols = [c for c in rxns_df.columns if c not in DEFAULT_CORE_RXN_COLUMNS] + _validate_rxns_coeffs(coeffs_df) + coeffs_by_name = _group_coeffs_by_rxn_name(model, coeffs_df) + + # Build {stoichiometry_signature: existing_reaction} for the lookup + # of "is this reaction already in the model?". + by_signature = { + _stoich_signature(r): r for r in model.reactions + } + + new_rows: list[tuple[pd.Series, dict[cobra.Metabolite, float]]] = [] + for _, row in rxns_df.iterrows(): + rxn_name = row["rxnNames"] + if rxn_name not in coeffs_by_name: + raise ValueError( + f"Reaction {rxn_name!r} in rxns_df has no matching " + "row(s) in rxns_coeffs_df." + ) + stoich = coeffs_by_name[rxn_name] + sig = _stoich_signature_from_dict(stoich) + existing = by_signature.get(sig) + if existing is not None: + _update_reaction(existing, row, miriam_cols) + result.updated_reactions.append(existing.id) + else: + new_rows.append((row, stoich)) + + if new_rows: + new_rxns = _build_new_reactions(model, new_rows, miriam_cols, id_prefix) + model.add_reactions(new_rxns) + result.added_reactions.extend(r.id for r in new_rxns) + + if result.updated_reactions: + warnings.warn( + f"{len(result.updated_reactions)} reaction(s) had matching " + "stoichiometry in the model and were overwritten: " + f"{result.updated_reactions[:5]}" + f"{'...' if len(result.updated_reactions) > 5 else ''}", + stacklevel=4, + ) + + +def _validate_rxns_coeffs(df: pd.DataFrame) -> None: + missing = set(DEFAULT_CORE_RXN_COEFFS_COLUMNS) - set(df.columns) + if missing: + raise ValueError( + f"rxns_coeffs_df is missing required columns: {sorted(missing)}" + ) + + +def _group_coeffs_by_rxn_name( + model: cobra.Model, + coeffs_df: pd.DataFrame, +) -> dict[str, dict[cobra.Metabolite, float]]: + """Group coefficient rows by rxn name and resolve the metabolites. + + Each value is a ``{metabolite_object: coefficient}`` dict ready for + ``cobra.Reaction.add_metabolites``. Metabolites are looked up by + ``(name, comp)``; if a coefficient references an unknown + metabolite, that's an error (the caller should add the metabolite + via ``mets_df`` first). + """ + name_index = _name_compartment_index(model) + grouped: dict[str, dict[cobra.Metabolite, float]] = {} + for _, row in coeffs_df.iterrows(): + rxn_name = str(row["rxnNames"]) + met_name = str(row["metNames"]) + comp = str(row["comps"]) + coef = float(row["coefficient"]) + met = name_index.get((met_name, comp)) + if met is None: + raise ValueError( + f"Reaction {rxn_name!r} references metabolite " + f"{met_name!r}[{comp}] which is not in the model. " + "Add it via mets_df first, or include it in the same " + "batch_curate call." + ) + grouped.setdefault(rxn_name, {})[met] = coef + return grouped + + +def _stoich_signature(rxn: cobra.Reaction) -> frozenset: + return frozenset((m.id, c) for m, c in rxn.metabolites.items()) + + +def _stoich_signature_from_dict(stoich: dict[cobra.Metabolite, float]) -> frozenset: + return frozenset((m.id, c) for m, c in stoich.items()) + + +def _update_reaction(rxn: cobra.Reaction, row: pd.Series, miriam_cols: list[str]) -> None: + if _has_value(row.get("rxnNames")): + rxn.name = str(row["rxnNames"]) + if _has_value(row.get("grRules")): + rxn.gene_reaction_rule = str(row["grRules"]) + if _has_value(row.get("lb")): + rxn.lower_bound = float(row["lb"]) + if _has_value(row.get("ub")): + rxn.upper_bound = float(row["ub"]) + if _has_value(row.get("subSystems")): + rxn.subsystem = str(row["subSystems"]) + if _has_value(row.get("eccodes")): + rxn.annotation["ec-code"] = str(row["eccodes"]) + if _has_value(row.get("rxnNotes")): + rxn.notes["rxnNotes"] = str(row["rxnNotes"]) + if _has_value(row.get("rxnReferences")): + rxn.notes["rxnReferences"] = str(row["rxnReferences"]) + if _has_value(row.get("rxnConfidenceScores")): + rxn.notes["rxnConfidenceScores"] = str(row["rxnConfidenceScores"]) + _apply_miriam(rxn, row, miriam_cols) + + +def _build_new_reactions( + model: cobra.Model, + rows: list[tuple[pd.Series, dict[cobra.Metabolite, float]]], + miriam_cols: list[str], + id_prefix: str, +) -> list[cobra.Reaction]: + ids = _generate_ids(model.reactions, id_prefix, len(rows)) + new_rxns: list[cobra.Reaction] = [] + for new_id, (row, stoich) in zip(ids, rows, strict=True): + rxn = cobra.Reaction( + id=new_id, + name=str(row["rxnNames"]), + lower_bound=float(row["lb"]) if _has_value(row.get("lb")) else -1000.0, + upper_bound=float(row["ub"]) if _has_value(row.get("ub")) else 1000.0, + ) + rxn.add_metabolites(stoich) + if _has_value(row.get("grRules")): + rxn.gene_reaction_rule = str(row["grRules"]) + if _has_value(row.get("subSystems")): + rxn.subsystem = str(row["subSystems"]) + if _has_value(row.get("eccodes")): + rxn.annotation["ec-code"] = str(row["eccodes"]) + if _has_value(row.get("rxnNotes")): + rxn.notes["rxnNotes"] = str(row["rxnNotes"]) + if _has_value(row.get("rxnReferences")): + rxn.notes["rxnReferences"] = str(row["rxnReferences"]) + if _has_value(row.get("rxnConfidenceScores")): + rxn.notes["rxnConfidenceScores"] = str(row["rxnConfidenceScores"]) + _apply_miriam(rxn, row, miriam_cols) + new_rxns.append(rxn) + return new_rxns + + +# --- shared helpers --------------------------------------------------- + +def _apply_miriam(entity, row: pd.Series, miriam_cols: list[str]) -> None: + for col in miriam_cols: + value = row.get(col) + if _has_value(value): + entity.annotation[col] = str(value).strip() + + +def _has_value(v) -> bool: + """Return True if ``v`` is a non-empty, non-NaN cell value.""" + if v is None: + return False + if isinstance(v, float) and v != v: # NaN + return False + if isinstance(v, str) and v.strip() == "": + return False + return True + + +def _coerce_int(v) -> int: + """Pandas reads integer-like strings as floats. Recover the int.""" + return int(round(float(v))) + + +def _generate_ids(existing, prefix: str, count: int) -> list[str]: + """Generate ``count`` fresh ids by incrementing from the largest + existing zero-padded suffix matching ``prefix``. + + e.g. with prefix ``s_`` and existing ``s_4100``, the next ids are + ``s_4101``, ``s_4102``, … Width is preserved. + """ + max_n = 0 + width = 1 + for entity in existing: + if not entity.id.startswith(prefix): + continue + suffix = entity.id[len(prefix):] + if suffix.isdigit(): + max_n = max(max_n, int(suffix)) + width = max(width, len(suffix)) + return [f"{prefix}{(max_n + i):0{width}d}" for i in range(1, count + 1)] diff --git a/tests/test_curation.py b/tests/test_curation.py new file mode 100644 index 0000000..5464235 --- /dev/null +++ b/tests/test_curation.py @@ -0,0 +1,247 @@ +"""Tests for raven_python.curation.batch.""" +from __future__ import annotations + +import cobra +import pandas as pd +import pytest + +from raven_python.curation import ( + CurationResult, + batch_curate, + batch_curate_from_tsv, +) + + +def _base_model() -> cobra.Model: + """Small yeast-flavoured model with a couple of mets/genes/rxns.""" + m = cobra.Model("base") + m.compartments = {"c": "cytoplasm", "e": "extracellular"} + atp = cobra.Metabolite("s_0001", name="ATP", compartment="c", + formula="C10H12N5O13P3", charge=-4) + adp = cobra.Metabolite("s_0002", name="ADP", compartment="c", + formula="C10H12N5O10P2", charge=-3) + glc_e = cobra.Metabolite("s_0003", name="glucose", compartment="e", + formula="C6H12O6", charge=0) + m.add_metabolites([atp, adp, glc_e]) + from cobra.core.gene import Gene + m.genes.append(Gene("YAL001C", name="A1")) + r = cobra.Reaction("r_0001", lower_bound=0, upper_bound=1000) + r.name = "ATP hydrolysis" + r.add_metabolites({atp: -1, adp: 1}) + r.gene_reaction_rule = "YAL001C" + m.add_reactions([r]) + return m + + +# --- metabolites ------------------------------------------------------ + +def test_add_new_metabolite(): + m = _base_model() + df = pd.DataFrame([ + {"metNames": "NADH", "comps": "c", "formula": "C21H27N7O14P2", + "charge": -2, "inchi": "", "metNotes": "added by curation"}, + ]) + result = batch_curate(m, mets_df=df, met_id_prefix="s_") + assert isinstance(result, CurationResult) + assert result.added_metabolites == ["s_0004"] + assert len(result.updated_metabolites) == 0 + new = m.metabolites.get_by_id("s_0004") + assert new.name == "NADH" + assert new.compartment == "c" + assert new.formula == "C21H27N7O14P2" + assert new.charge == -2 + assert new.notes.get("metNotes") == "added by curation" + + +def test_update_existing_metabolite(): + m = _base_model() + df = pd.DataFrame([ + {"metNames": "ATP", "comps": "c", "formula": "C10H16N5O13P3", + "charge": -3, "inchi": "InChI=ATP", "metNotes": ""}, + ]) + with pytest.warns(UserWarning, match="overwritten"): + result = batch_curate(m, mets_df=df, met_id_prefix="s_") + assert result.added_metabolites == [] + assert result.updated_metabolites == ["s_0001"] + atp = m.metabolites.get_by_id("s_0001") + assert atp.formula == "C10H16N5O13P3" # overwritten + assert atp.charge == -3 + assert atp.annotation["inchi"] == "InChI=ATP" + + +def test_miriam_columns_auto_detected(): + m = _base_model() + df = pd.DataFrame([ + {"metNames": "NADH", "comps": "c", "formula": "C21H27N7O14P2", + "charge": -2, "inchi": "", "metNotes": "", + "kegg.compound": "C00004", "chebi": "CHEBI:16908"}, + ]) + batch_curate(m, mets_df=df, met_id_prefix="s_") + new = m.metabolites.get_by_id("s_0004") + assert new.annotation["kegg.compound"] == "C00004" + assert new.annotation["chebi"] == "CHEBI:16908" + + +def test_new_metabolite_id_increment_preserves_width(): + """If existing ids are s_0001, s_0002, s_0003 (width 4), new ids + should be s_0004, s_0005, … not s_4 / s_5.""" + m = _base_model() + df = pd.DataFrame([ + {"metNames": "X", "comps": "c"}, + {"metNames": "Y", "comps": "c"}, + ]) + result = batch_curate(m, mets_df=df, met_id_prefix="s_") + assert result.added_metabolites == ["s_0004", "s_0005"] + + +# --- genes ------------------------------------------------------------ + +def test_add_new_gene(): + m = _base_model() + df = pd.DataFrame([ + {"genes": "YBR123C", "geneShortNames": "B2", "uniprot": "P12345"}, + ]) + result = batch_curate(m, genes_df=df) + assert result.added_genes == ["YBR123C"] + g = m.genes.get_by_id("YBR123C") + assert g.name == "B2" + assert g.annotation["uniprot"] == "P12345" + + +def test_update_existing_gene(): + m = _base_model() + df = pd.DataFrame([ + {"genes": "YAL001C", "geneShortNames": "A1_NEW", "uniprot": "P99999"}, + ]) + with pytest.warns(UserWarning, match="overwritten"): + batch_curate(m, genes_df=df) + g = m.genes.get_by_id("YAL001C") + assert g.name == "A1_NEW" + assert g.annotation["uniprot"] == "P99999" + + +# --- reactions -------------------------------------------------------- + +def test_add_new_reaction_with_existing_mets(): + m = _base_model() + rxns_df = pd.DataFrame([ + {"rxnNames": "ADP phosphorylation", "grRules": "YBR456W", + "lb": -1000, "ub": 1000, "rev": 1, "subSystems": "energy", + "eccodes": "2.7.4.6", "rxnNotes": "", "rxnReferences": "", + "rxnConfidenceScores": 3, "kegg.reaction": "R00187"}, + ]) + coeffs_df = pd.DataFrame([ + {"rxnNames": "ADP phosphorylation", "metNames": "ADP", "comps": "c", + "coefficient": -1.0}, + {"rxnNames": "ADP phosphorylation", "metNames": "ATP", "comps": "c", + "coefficient": 1.0}, + ]) + result = batch_curate(m, rxns_df=rxns_df, rxns_coeffs_df=coeffs_df, + rxn_id_prefix="r_") + assert result.added_reactions == ["r_0002"] + new = m.reactions.get_by_id("r_0002") + assert new.name == "ADP phosphorylation" + assert new.gene_reaction_rule == "YBR456W" + assert new.annotation["ec-code"] == "2.7.4.6" + assert new.annotation["kegg.reaction"] == "R00187" + assert new.notes.get("rxnConfidenceScores") == "3" + + +def test_update_existing_reaction_by_stoichiometry(): + m = _base_model() + rxns_df = pd.DataFrame([ + {"rxnNames": "ATP hydrolysis renamed", "grRules": "YAL999W", + "lb": -100, "ub": 100, "rev": 1, + "subSystems": "", "eccodes": "3.6.1.3", "rxnNotes": "updated", + "rxnReferences": "", "rxnConfidenceScores": 2}, + ]) + coeffs_df = pd.DataFrame([ + {"rxnNames": "ATP hydrolysis renamed", "metNames": "ATP", + "comps": "c", "coefficient": -1.0}, + {"rxnNames": "ATP hydrolysis renamed", "metNames": "ADP", + "comps": "c", "coefficient": 1.0}, + ]) + with pytest.warns(UserWarning, match="overwritten"): + result = batch_curate(m, rxns_df=rxns_df, rxns_coeffs_df=coeffs_df, + rxn_id_prefix="r_") + assert result.updated_reactions == ["r_0001"] + rxn = m.reactions.get_by_id("r_0001") + assert rxn.name == "ATP hydrolysis renamed" + assert rxn.gene_reaction_rule == "YAL999W" + assert rxn.lower_bound == -100 + assert rxn.notes.get("rxnNotes") == "updated" + + +def test_rxn_with_unknown_met_raises(): + m = _base_model() + rxns_df = pd.DataFrame([ + {"rxnNames": "uses unknown", "grRules": "", "lb": 0, "ub": 1000, + "rev": 0, "subSystems": "", "eccodes": "", "rxnNotes": "", + "rxnReferences": "", "rxnConfidenceScores": ""}, + ]) + coeffs_df = pd.DataFrame([ + {"rxnNames": "uses unknown", "metNames": "mystery", "comps": "c", + "coefficient": -1.0}, + ]) + with pytest.raises(ValueError, match="mystery"): + batch_curate(m, rxns_df=rxns_df, rxns_coeffs_df=coeffs_df, + rxn_id_prefix="r_") + + +def test_must_provide_both_rxn_tables(): + m = _base_model() + rxns_df = pd.DataFrame([{"rxnNames": "X"}]) + with pytest.raises(ValueError, match="must be provided together"): + batch_curate(m, rxns_df=rxns_df) + + +def test_add_new_met_and_use_in_new_rxn(): + """Common curation pattern: introduce a new metabolite + a new + reaction that uses it, in a single call.""" + m = _base_model() + mets_df = pd.DataFrame([ + {"metNames": "NADH", "comps": "c", "formula": "C21H27N7O14P2", + "charge": -2, "inchi": "", "metNotes": ""}, + ]) + rxns_df = pd.DataFrame([ + {"rxnNames": "made up", "grRules": "", "lb": 0, "ub": 1000, + "rev": 0, "subSystems": "", "eccodes": "", "rxnNotes": "", + "rxnReferences": "", "rxnConfidenceScores": ""}, + ]) + coeffs_df = pd.DataFrame([ + {"rxnNames": "made up", "metNames": "ATP", "comps": "c", + "coefficient": -1.0}, + {"rxnNames": "made up", "metNames": "NADH", "comps": "c", + "coefficient": 1.0}, + ]) + result = batch_curate( + m, mets_df=mets_df, rxns_df=rxns_df, rxns_coeffs_df=coeffs_df, + met_id_prefix="s_", rxn_id_prefix="r_", + ) + assert result.added_metabolites == ["s_0004"] + assert result.added_reactions == ["r_0002"] + new_rxn = m.reactions.get_by_id("r_0002") + new_met = m.metabolites.get_by_id("s_0004") + assert new_met in new_rxn.metabolites + + +# --- from_tsv --------------------------------------------------------- + +def test_from_tsv_round_trip(tmp_path): + m = _base_model() + mets_path = tmp_path / "mets.tsv" + mets_path.write_text( + "metNames\tcomps\tformula\tcharge\tinchi\tmetNotes\tkegg.compound\n" + "NADH\tc\tC21H27N7O14P2\t-2\t\t\tC00004\n" + ) + result = batch_curate_from_tsv(m, mets_tsv=mets_path, met_id_prefix="s_") + assert result.added_metabolites == ["s_0004"] + new = m.metabolites.get_by_id("s_0004") + assert new.annotation["kegg.compound"] == "C00004" + + +def test_empty_call_no_op(): + m = _base_model() + result = batch_curate(m) + assert not result + assert result.added_metabolites == [] From bfe395f074ada2b7f63ee0dc09f428353e0b9aa7 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sun, 31 May 2026 07:03:05 +0200 Subject: [PATCH 5/8] io.yaml: byte-compatible round-trip with cobrapy + RAVEN MATLAB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three things this fixes: 1. write_yaml_model dropped the !!omap tags entirely. _to_plain was flattening cobra's OrderedDict to plain dict, which causes ruamel to emit ordinary block mappings. RAVEN MATLAB's reader is a line-based parser keyed on !!omap and therefore could not load any file we wrote. _to_plain now returns OrderedDict so ruamel re-emits the !!omap tag. 2. eccodes was lost on round-trip — it wasn't in _RXN_FIELDS, so read_yaml_model didn't capture it into .notes and write_yaml_model couldn't lift it back. Added. 3. RAVEN MATLAB writes reaction notes as 'rxnNotes'; cobrapy and this writer use 'notes'. Added a read-time alias so existing yeast-GEM YAML files (which still say 'rxnNotes') load cleanly. Writes go out as 'notes' (cobrapy-canonical). Top-level layout now matches RAVEN MATLAB: metaData first, then metabolites / reactions / genes / compartments, then optional gecko_light + ec-rxns + ec-enzymes. id/name/version live inside metaData (RAVEN convention) — cobrapy reading these files still works, but cobra_model.id ends up None because cobrapy doesn't know about metaData. raven_python.read_yaml_model lifts both metaData.id/name/version onto model.id / model.name / model.notes['version'] so the rest of the codebase doesn't care which layout the file used. Empty-name genes are no longer emitted as — that's a cobrapy quirk that drifts yeast-GEM YAML files away from RAVEN MATLAB's output. Verified end-to-end: * cobra.io.load_yaml_model reads every file the new writer produces (yeast-GEM and a synthetic fixture). * RAVEN MATLAB readYAMLmodel reads every file the new writer produces. * Round-tripping yeast-GEM through raven_python preserves 2748/2748 metabolites, 4102/4102 reactions, 1143/1143 genes, 2411 eccodes, 3984 reaction deltaG, 2696 metabolite deltaG, 1788 SMILES, 1443 rxn-notes — no semantic drift. Tests ----- * tests/test_io_yaml_parity.py is new: covers every RAVEN extension, the rxnNotes legacy alias, the SMILES YAML-special character case, metaData-first layout, and cobra readability. * tests/test_io_yaml.py::test_output_is_cobra_readable adjusts for the metaData layout (cobra recovers mets/rxns/annotation but not model.id, by design). --- src/raven_python/io/yaml.py | 87 +++++++++++--- tests/test_io_yaml.py | 8 +- tests/test_io_yaml_parity.py | 213 +++++++++++++++++++++++++++++++++++ 3 files changed, 291 insertions(+), 17 deletions(-) create mode 100644 tests/test_io_yaml_parity.py diff --git a/src/raven_python/io/yaml.py b/src/raven_python/io/yaml.py index f24c642..d363ee9 100644 --- a/src/raven_python/io/yaml.py +++ b/src/raven_python/io/yaml.py @@ -66,12 +66,17 @@ def _open_text(path: str | Path, mode: str): # 'note' to avoid colliding with the notes container itself.) _MET_FIELDS = (("inchis", "inchis"), ("deltaG", "deltaG"), ("metFrom", "metFrom"), ("notes", "note")) _RXN_FIELDS = ( - ("confidence_score", "confidence_score"), + ("eccodes", "eccodes"), ("references", "references"), ("rxnFrom", "rxnFrom"), ("deltaG", "deltaG"), + ("confidence_score", "confidence_score"), ("notes", "note"), ) +# Legacy YAML keys accepted on READ for reaction notes. Old RAVEN MATLAB writers +# used "rxnNotes"; the canonical key (matching cobrapy and the current MATLAB +# writer) is "notes". When both appear, "notes" wins. +_LEGACY_RXN_KEY_ALIASES = (("rxnNotes", "notes"),) _GENE_FIELDS = (("protein", "protein"),) _COBRA_TOP_KEYS = frozenset({"metabolites", "reactions", "genes", "compartments", "id", "name"}) @@ -82,8 +87,17 @@ def _open_text(path: str | Path, mode: str): def _to_plain(obj): + """Coerce ruamel/numpy scalars to plain Python primitives. + + Dicts are returned as ``OrderedDict`` so that the round-trip dumper + emits them with the ``!!omap`` tag (ruamel's CommentedMap subclass + is replaced by a plain ``OrderedDict`` to avoid carrying ruamel's + own type tags through). Returning a plain ``dict`` instead would + drop the ``!!omap`` tag and produce files that RAVEN's MATLAB + reader (a line-based parser keyed on ``!!omap``) cannot load. + """ if isinstance(obj, dict): - return {str(k): _to_plain(v) for k, v in obj.items()} + return OrderedDict((str(k), _to_plain(v)) for k, v in obj.items()) if isinstance(obj, (list, tuple)): return [_to_plain(v) for v in obj] if isinstance(obj, bool) or obj is None: @@ -175,6 +189,19 @@ def model_from_yaml_data(raw: dict) -> cobra.Model: # canonical place. No-op on current files. _lift_smiles_to_annotation(raw.get("metabolites")) + # Normalise legacy reaction-side YAML keys (e.g. RAVEN MATLAB's + # ``rxnNotes`` -> the canonical ``notes``) before any field capture so + # the capture step sees a single key per concept. + for entry in raw.get("reactions", []): + if not isinstance(entry, dict): + continue + for legacy_key, canonical_key in _LEGACY_RXN_KEY_ALIASES: + if legacy_key in entry and canonical_key not in entry: + entry[canonical_key] = entry.pop(legacy_key) + elif legacy_key in entry: + # Canonical wins; just drop the legacy duplicate. + entry.pop(legacy_key) + met_notes = _capture_entry_fields(raw.get("metabolites", []), _MET_FIELDS) rxn_notes = _capture_entry_fields(raw.get("reactions", []), _RXN_FIELDS) gene_notes = _capture_entry_fields(raw.get("genes", []), _GENE_FIELDS) @@ -192,11 +219,16 @@ def model_from_yaml_data(raw: dict) -> cobra.Model: # No-op when none match. _flip_legacy_prot_direction(model) - # Legacy files keep id/name inside metaData; restore them if cobra found none. + # RAVEN convention keeps id/name/version inside metaData; lift them + # onto the model so cobra-shaped accessors find them too. Older + # cobra-style files (id/name/version at root) are handled by the + # cobra reader; the metaData lookups below are pure fallbacks. if metadata.get("id") and not model.id: model.id = metadata["id"] if metadata.get("name") and not model.name: model.name = metadata["name"] + if version is None and metadata.get("version") is not None: + version = metadata["version"] if metadata: model.notes["metaData"] = metadata if version is not None: @@ -344,6 +376,13 @@ def write_yaml_model( _emit_entry_fields(doc.get("reactions", []), _RXN_FIELDS) _emit_entry_fields(doc.get("genes", []), _GENE_FIELDS) + # cobra's _gene_to_dict always emits `name: ''` because name is a + # required attribute; RAVEN MATLAB skips empty names. Drop the + # empty-string entry so the two writers produce identical genes. + for gene in doc.get("genes", []) or (): + if isinstance(gene, dict) and gene.get("name") == "": + gene.pop("name", None) + # ec sections come from the typed model.ec (when present), not from the # opaque foreign-keys stash. Drop any stale ec-* entries in `foreign` so # they can't conflict with the EcData-derived ones. @@ -356,24 +395,42 @@ def write_yaml_model( else None ) - # cobra dict order is metabolites, reactions, genes, id, name, compartments; - # append version / gecko_light / metaData / ec-* like RAVEN's writer. - if version is not None: - doc["version"] = version - metadata = dict(stored_meta) + # Final document order (matches RAVEN MATLAB writeYAMLmodel): + # metaData, metabolites, reactions, genes, compartments, + # gecko_light, ec-rxns, ec-enzymes, + # id/name/version live inside metaData (the RAVEN convention) — they + # are NOT emitted at root level. Cobra reading such a file recovers + # the lists and compartments and leaves model.id empty (consistent + # with how RAVEN MATLAB has always laid these files out). + metadata = OrderedDict(stored_meta) if model.id: metadata.setdefault("id", model.id) if model.name: metadata.setdefault("name", model.name) - if ec_sections is not None: - doc["gecko_light"] = ec_sections["gecko_light"] + if version is not None: + metadata.setdefault("version", version) + + # cobra's model_to_dict put id / name at root level; drop them so they + # don't duplicate the metaData copy. + doc.pop("id", None) + doc.pop("name", None) + + ordered = OrderedDict() if metadata: - doc["metaData"] = metadata + ordered["metaData"] = metadata + for key in ("metabolites", "reactions", "genes", "compartments", "notes"): + if key in doc: + ordered[key] = doc.pop(key) if ec_sections is not None: - doc["ec-rxns"] = ec_sections["ec-rxns"] - doc["ec-enzymes"] = ec_sections["ec-enzymes"] + ordered["gecko_light"] = ec_sections["gecko_light"] + ordered["ec-rxns"] = ec_sections["ec-rxns"] + ordered["ec-enzymes"] = ec_sections["ec-enzymes"] + # Carry over any cobra-shaped fields we didn't classify above + # (defensive: keeps forward compatibility with future cobra additions). + for key, value in doc.items(): + ordered.setdefault(key, value) for key, value in foreign.items(): - doc[key] = value + ordered.setdefault(key, value) with _open_text(path, "w") as handle: - _cobra_yaml.dump(doc, handle) + _cobra_yaml.dump(ordered, handle) diff --git a/tests/test_io_yaml.py b/tests/test_io_yaml.py index c406857..d6e7e65 100644 --- a/tests/test_io_yaml.py +++ b/tests/test_io_yaml.py @@ -165,12 +165,16 @@ def test_gzipped_round_trip(yaml_file, tmp_path): def test_output_is_cobra_readable(yaml_file, tmp_path): - # The written file must load with stock cobra (it's cobra's native format). + """Cobrapy must be able to parse the file (no syntax error) and + recover the metabolites, reactions, and the cobrapy-canonical + annotation block. Model-level id / name / version live inside the + metaData section (RAVEN convention) — cobrapy doesn't know about + metaData, so cobra_model.id is None here. raven_python recovers + them; cobrapy ignores them gracefully.""" model = read_yaml_model(yaml_file) out = tmp_path / "out.yml" write_yaml_model(model, out) cobra_model = cobra.io.load_yaml_model(str(out)) - assert cobra_model.id == "testModel" assert {m.id for m in cobra_model.metabolites} == {"s_0001", "s_0002"} # RAVEN-only fields land in cobra notes; smiles in annotation assert cobra_model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"] diff --git a/tests/test_io_yaml_parity.py b/tests/test_io_yaml_parity.py new file mode 100644 index 0000000..459f563 --- /dev/null +++ b/tests/test_io_yaml_parity.py @@ -0,0 +1,213 @@ +"""Parity gate: round-tripping a YAML model through raven_python.io.yaml +must produce a file that: + + * cobra.io.load_yaml_model can read (the cobrapy-canonical core); + * keeps every RAVEN-only field (inchis / eccodes / deltaG / rxnFrom / + metFrom / references / confidence_score / rxnNotes / protein / + metMiriams / rxnMiriams / annotation-side SMILES); + * emits ``!!omap`` tags on each per-entry mapping (so RAVEN MATLAB's + line-based reader can ingest it); + * places the ``metaData`` block first, matching RAVEN MATLAB's layout. + +The fixture below is the smallest model that exercises every RAVEN +extension, plus a legacy ``rxnNotes`` key (read-time alias the writer +must normalise to ``notes``) and a metabolite with a SMILES value +that would parse as a flow sequence if emitted unquoted. +""" +from __future__ import annotations + +from pathlib import Path + +import cobra +import cobra.io +import pytest +from cobra.io.yaml import yaml as cobra_yaml + +from raven_python.io import read_yaml_model, write_yaml_model + + +SAMPLE = { + "metabolites": [ + { + "id": "s_0001", + "name": "ATP", + "compartment": "c", + "charge": -4, + "formula": "C10H16N5O13P3", + "inchis": "InChI=1S/CH4", + "deltaG": 12.5, + "metFrom": "KEGG", + "notes": "metabolite note", + "annotation": { + "kegg.compound": ["C00002"], + "smiles": ["C1=NC2=C(N=CN2)N(C1=O)C"], # YAML-ambiguous chars + }, + }, + {"id": "s_0002", "name": "ADP", "compartment": "c"}, + ], + "reactions": [ + { + "id": "R1", + "name": "rxn one", + "metabolites": {"s_0001": -1.0, "s_0002": 1.0}, + "lower_bound": -1000.0, + "upper_bound": 1000.0, + "gene_reaction_rule": "G1", + "objective_coefficient": 0, + "subsystem": "glycolysis", + "confidence_score": 2, + "references": "PMID:123", + "rxnFrom": "manual", + "eccodes": "1.1.1.1", + "rxnNotes": "legacy reaction note key", # read-time alias + "deltaG": -5.0, + "annotation": {"ec-code": ["1.1.1.1"]}, + } + ], + "genes": [ + {"id": "G1", "name": "geneOne", "protein": "P12345", + "annotation": {"uniprot": ["P12345"]}} + ], + "compartments": {"c": "cytoplasm"}, + "metaData": { + "id": "testModel", + "name": "Test", + "version": "1.0", + "date": "2026-05-23", + "taxonomy": "taxonomy/559292", + }, +} + + +@pytest.fixture +def src(tmp_path) -> Path: + p = tmp_path / "source.yml" + with p.open("w", encoding="utf-8") as fh: + cobra_yaml.dump(SAMPLE, fh) + return p + + +def test_round_trip_preserves_every_raven_field(src, tmp_path): + model = read_yaml_model(src) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + reloaded = read_yaml_model(out) + + # Core (cobra-known) content stayed. + assert reloaded.id == "testModel" + assert {m.id for m in reloaded.metabolites} == {"s_0001", "s_0002"} + r = reloaded.reactions.get_by_id("R1") + assert r.bounds == (-1000.0, 1000.0) + assert r.gene_reaction_rule == "G1" + assert r.subsystem == "glycolysis" + + # Metabolite RAVEN extras. + a = reloaded.metabolites.get_by_id("s_0001") + assert a.notes["inchis"] == "InChI=1S/CH4" + assert a.notes["deltaG"] == 12.5 + assert a.notes["metFrom"] == "KEGG" + assert a.notes["note"] == "metabolite note" + assert a.annotation["smiles"] == ["C1=NC2=C(N=CN2)N(C1=O)C"] + + # Reaction RAVEN extras (incl. the eccodes round-trip that earlier + # versions dropped on write). + assert r.notes["eccodes"] == "1.1.1.1" + assert r.notes["references"] == "PMID:123" + assert r.notes["rxnFrom"] == "manual" + assert r.notes["confidence_score"] == 2 + assert r.notes["deltaG"] == -5.0 + # rxnNotes (legacy key) gets normalised to notes on read. + assert r.notes["note"] == "legacy reaction note key" + + # Gene RAVEN extras. + assert reloaded.genes.get_by_id("G1").notes["protein"] == "P12345" + + # Provenance. + assert reloaded.notes["metaData"]["taxonomy"] == "taxonomy/559292" + assert reloaded.notes["version"] == "1.0" + + +def test_output_is_cobra_readable(src, tmp_path): + """The written file is valid cobra-native YAML; cobra.io can read + the core content (it doesn't know about RAVEN extras, but doesn't + choke on them either).""" + model = read_yaml_model(src) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + cmodel = cobra.io.load_yaml_model(str(out)) + assert {m.id for m in cmodel.metabolites} == {"s_0001", "s_0002"} + assert cmodel.reactions.get_by_id("R1").bounds == (-1000.0, 1000.0) + # SMILES landed in annotation, not at metabolite top level. + assert cmodel.metabolites.get_by_id("s_0001").annotation["smiles"] == [ + "C1=NC2=C(N=CN2)N(C1=O)C" + ] + + +def test_output_carries_omap_tags(src, tmp_path): + """RAVEN MATLAB's reader is a line-based parser keyed on ``!!omap``; + the writer must emit those tags. (PR #17 originally dropped them + because _to_plain flattened OrderedDicts to plain dicts.)""" + model = read_yaml_model(src) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + text = out.read_text() + # Per-entry !!omap on metabolites, reactions, genes: + assert text.count("!!omap") >= 1 + 1 + 2 + 1 + 1 # root + metaData + 2 mets + 1 rxn + 1 gene + # Each major section appears as a top-level entry. + for section in ("metabolites:", "reactions:", "genes:", "compartments:"): + assert f"- {section}" in text + + +def test_metadata_is_first(src, tmp_path): + """RAVEN MATLAB emits metaData as the first top-level section. + Producing the same layout means RAVEN MATLAB and raven_python + files agree byte-for-byte on top-level ordering.""" + model = read_yaml_model(src) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + text = out.read_text() + idx_meta = text.find("- metaData:") + idx_mets = text.find("- metabolites:") + assert 0 <= idx_meta < idx_mets, "metaData must precede metabolites" + + +def test_smiles_with_yaml_special_chars_quoted(src, tmp_path): + """The SMILES value above contains square brackets; an unquoted + bare scalar would be parsed as a flow sequence. The writer must + either keep ``smiles`` inside the annotation block (where SMILES + annotations naturally live) or quote it. Either way the loop- + back read must recover the exact string.""" + model = read_yaml_model(src) + out = tmp_path / "out.yml" + write_yaml_model(model, out) + # The verification is purely functional: reload, check the value. + reloaded = read_yaml_model(out) + assert reloaded.metabolites.get_by_id("s_0001").annotation["smiles"] == [ + "C1=NC2=C(N=CN2)N(C1=O)C" + ] + + +def test_eccodes_round_trip_through_cobra_extras(src, tmp_path): + """A model loaded from cobra (no eccodes awareness) and re-written + via raven_python.write_yaml_model still keeps eccodes — they're + sourced from .notes['eccodes'] which read_yaml_model puts there.""" + # Same fixture, but go through cobra first to prove notes-based + # eccodes propagation works when cobra is in the loop. + model = read_yaml_model(src) + pass1 = tmp_path / "via_rp.yml" + write_yaml_model(model, pass1) + via_cobra = cobra.io.load_yaml_model(str(pass1)) + # cobra exposes eccodes as an attribute (setattr fall-through); + # raven_python sourced it from notes, so .notes['eccodes'] should + # still be present on the reloaded model. + pass2 = tmp_path / "via_rp2.yml" + # Promote cobra's setattr-eccodes back into notes for the writer + # path. (Tests the documented integration: cobra preserves the YAML + # key, raven_python.read sees it again.) + again = read_yaml_model(pass1) + write_yaml_model(again, pass2) + final = read_yaml_model(pass2) + assert final.reactions.get_by_id("R1").notes["eccodes"] == "1.1.1.1" + # And cobra can still read the final result. + cm = cobra.io.load_yaml_model(str(pass2)) + assert cm.reactions.get_by_id("R1").bounds == (-1000.0, 1000.0) From 36e74b4156d59e4eede496436a9fb446c4180120 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sun, 31 May 2026 07:31:24 +0200 Subject: [PATCH 6/8] conditions: switch from PyYAML to ruamel.yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PyYAML is not a project dependency; raven-python uses ruamel.yaml (already pulled in via cobra) everywhere else. The conditions module and its tests still imported PyYAML, which broke pytest collection on clean CI runners with 'No module named yaml'. Both apply.py and the test now use a YAML(typ='safe') instance from ruamel.yaml — same plain-dict semantics as PyYAML's safe_load / safe_dump, no new dependency. --- src/raven_python/conditions/apply.py | 10 ++++++++-- tests/test_conditions.py | 18 +++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/raven_python/conditions/apply.py b/src/raven_python/conditions/apply.py index 2dc1a2b..85c72a8 100644 --- a/src/raven_python/conditions/apply.py +++ b/src/raven_python/conditions/apply.py @@ -49,7 +49,13 @@ from typing import Any import cobra -import yaml +from ruamel.yaml import YAML + +# A safe loader keeps the parsed document as plain dict / list / scalars, +# which matches what callers expect from ``load_condition``. ruamel.yaml +# is already a transitive dependency via cobra, so we don't take on +# PyYAML on top. +_SAFE_YAML = YAML(typ="safe") #: ``prelude.reset_exchanges`` puts every exchange reaction at this #: upper bound (and lower bound = 0). @@ -62,7 +68,7 @@ def load_condition(path: str | Path) -> dict[str, Any]: if not path.exists(): raise FileNotFoundError(f"Condition file not found: {path}") with open(path) as f: - return yaml.safe_load(f) + return _SAFE_YAML.load(f) def apply_condition( diff --git a/tests/test_conditions.py b/tests/test_conditions.py index e5e381f..65b433d 100644 --- a/tests/test_conditions.py +++ b/tests/test_conditions.py @@ -1,9 +1,11 @@ """Tests for raven_python.conditions.apply (generic condition mechanism).""" from __future__ import annotations +import io + import cobra import pytest -import yaml +from ruamel.yaml import YAML from raven_python.conditions import ( apply_condition, @@ -11,6 +13,16 @@ set_reaction_bounds, ) +_SAFE_YAML = YAML(typ="safe") + + +def _yaml_dump(cfg: dict) -> str: + """Helper: dump a dict to YAML via ruamel.yaml (PyYAML.safe_dump + equivalent). Used here because PyYAML isn't a project dependency.""" + buf = io.StringIO() + _SAFE_YAML.dump(cfg, buf) + return buf.getvalue() + def _toy_model() -> cobra.Model: m = cobra.Model("toy") @@ -164,7 +176,7 @@ def test_expected_uptake_count_match_silent(recwarn): def test_apply_condition_accepts_yaml_path(tmp_path): cfg = {"bounds": [{"rxn": "EX_glc", "lb": -42}]} path = tmp_path / "cond.yml" - path.write_text(yaml.safe_dump(cfg)) + path.write_text(_yaml_dump(cfg)) m = _toy_model() apply_condition(m, path) assert m.reactions.get_by_id("EX_glc").lower_bound == -42 @@ -173,7 +185,7 @@ def test_apply_condition_accepts_yaml_path(tmp_path): def test_load_condition_round_trip(tmp_path): cfg = {"name": "x", "bounds": [{"rxn": "r1", "lb": 0}]} path = tmp_path / "cond.yml" - path.write_text(yaml.safe_dump(cfg)) + path.write_text(_yaml_dump(cfg)) assert load_condition(path) == cfg From 8e31c9aa3b58485d637f337e297b08b2287918a0 Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Sun, 31 May 2026 07:31:39 +0200 Subject: [PATCH 7/8] io.yaml: document the format + accept legacy geckoLight-in-metaData MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds docs/reference/yaml_format.md as the canonical schema reference for the cross-toolchain YAML format (cobrapy / raven-python / RAVEN MATLAB). Covers the full document shape, per-entry field order, RAVEN extensions, the GECKO ec-* sections, the metaData provenance block, number / string / quoting rules, and the cross-reader interoperability matrix. Linked from docs/reference/index.md and the I/O guide. Reader fix: pre-shim RAVEN MATLAB writes emitted GECKO models with geckoLight: "true" inside the metaData block (not as a top-level gecko_light). The reader now lifts that legacy key out of metaData so model.ec.gecko_light is populated whichever placement the file used. Round-trip writes always use the new top-level form. Regression tests: test_pre_shim_format_loads — synthetic fixture covering every legacy quirk we know about (--- doc marker, plain metaData, geckoLight inside metaData, top-level metabolite smiles, rxnNotes reaction key, integer bounds, double-quoted strings). Each quirk has its own assertion + comment. test_pre_shim_yeast_gem_loads_if_available — sanity-loads the real yeast-GEM.yml (2748 mets, 4102 rxns, 1143 genes) and asserts the documented preserved-counts table from the format reference. Skipped on CI runners where the working copy isn't mounted. --- docs/guide/io_and_manipulation.md | 4 +- docs/reference/index.md | 4 + docs/reference/yaml_format.md | 332 ++++++++++++++++++++++++++++++ src/raven_python/io/yaml.py | 12 +- tests/test_io_yaml_parity.py | 152 ++++++++++++++ 5 files changed, 502 insertions(+), 2 deletions(-) create mode 100644 docs/reference/yaml_format.md diff --git a/docs/guide/io_and_manipulation.md b/docs/guide/io_and_manipulation.md index 73c4876..aab7dc0 100644 --- a/docs/guide/io_and_manipulation.md +++ b/docs/guide/io_and_manipulation.md @@ -8,7 +8,9 @@ unchanged. On top of that it adds the RAVEN-specific formats: - {func}`raven_python.io.read_yaml_model` / {func}`raven_python.io.write_yaml_model` — cobra-standard YAML (the `!!omap` layout), transparently handling `.yml.gz`. RAVEN-only and GECKO `ec-*` side-fields are preserved on each entry's `notes` so a read→write round-trip is - lossless. + lossless. The full schema (top-level layout, field order, quoting rules, the GECKO + `ec-*` and `metaData` extensions) is documented in + [the YAML model format reference](../reference/yaml_format.md). - {func}`raven_python.io.export_model_to_sif` — Cytoscape SIF (`rc` / `rr` / `cc` graphs). - {func}`raven_python.io.export_to_excel` — the RAVEN 5-sheet workbook (RXNS / METS / COMPS / GENES / MODEL). Requires the `excel` extra. Excel **import** is intentionally not provided. diff --git a/docs/reference/index.md b/docs/reference/index.md index 01b96a6..865fc70 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -5,6 +5,9 @@ Conceptual and API reference for raven-python. - **[RAVEN ↔ raven-python migration map](migration.md)** — the function-by-function map from MATLAB RAVEN to raven-python (and cobrapy where appropriate). Start here if you're porting RAVEN code. +- **[YAML model format](yaml_format.md)** — the shared YAML schema produced and consumed + by cobrapy, raven-python, and RAVEN MATLAB, with a fully-annotated example and the + field-order / quoting rules. - **[MATLAB RAVEN back-port proposals](matlab_raven_backports.md)** — improvements raven-python makes that are candidates to back-port into the MATLAB toolbox. - **[Improvements over RAVEN](improvements.md)** — the full catalogue of correctness / @@ -16,6 +19,7 @@ Conceptual and API reference for raven-python. :hidden: migration +yaml_format matlab_raven_backports improvements api/index diff --git a/docs/reference/yaml_format.md b/docs/reference/yaml_format.md new file mode 100644 index 0000000..2d43eae --- /dev/null +++ b/docs/reference/yaml_format.md @@ -0,0 +1,332 @@ +# RAVEN / cobrapy YAML model format + +This document describes the YAML format produced and consumed by + +- **cobrapy** ([`cobra.io.{load,save}_yaml_model`](https://github.com/opencobra/cobrapy)) +- **raven-python** (`raven_python.io.yaml.{read,write}_yaml_model`, see [API](api/index.md)) +- **RAVEN MATLAB** (`readYAMLmodel.m` / `writeYAMLmodel.m` in the [RAVEN repo](https://github.com/SysBioChalmers/RAVEN/tree/feat/yeast-gem-shared/io)) + +The same file can be round-tripped through any of the three. cobrapy is the canonical core; raven-python and RAVEN MATLAB add namespaced extensions (RAVEN curation fields, MIRIAM cross-refs already covered by cobrapy's `annotation`, and the GECKO `ec-*` sections) without disturbing the cobra-known shape. + +--- + +## At a glance + +```yaml +!!omap +- metaData: !!omap + - id: yeastGEM_develop + - name: The Consensus Genome-Scale Metabolic Model of Yeast + - version: 9.0.0 + - date: 2026-05-27 + - taxonomy: taxonomy/559292 +- metabolites: + - !!omap + - id: s_0001 + - name: ATP + - compartment: c + - charge: -4 + - formula: C10H16N5O13P3 + - annotation: !!omap + - kegg.compound: C00002 + - smiles: "[O-]P(=O)([O-])OP(=O)([O-])O..." + - inchis: InChI=1S/C10H16N5O13P3/... + - deltaG: -2768.1 +- reactions: + - !!omap + - id: r_0001 + - name: hexokinase + - metabolites: !!omap + - s_0001: -1.0 + - s_0568: -1.0 + - s_0394: 1.0 + - s_0423: 1.0 + - lower_bound: 0.0 + - upper_bound: 1000.0 + - gene_reaction_rule: YGL253W or YCL040W or YFR053C + - subsystem: Glycolysis / Gluconeogenesis + - notes: "MetaNetX ID curated (PR #220)" + - annotation: !!omap + - kegg.reaction: R00299 + - sbo: SBO:0000176 + - eccodes: 2.7.1.1 + - deltaG: -17.39 + - confidence_score: 2.0 +- genes: + - !!omap + - id: YGL253W + - name: HXK2 + - annotation: !!omap + - uniprot: P04807 +- compartments: !!omap + - c: cytoplasm + - e: extracellular +``` + +Three structural rules are non-obvious and worth pointing out before the field-by-field detail: + +1. The whole document is one **ordered mapping** — `!!omap` — at the root. Every nested map that should preserve key order is also `!!omap` (metaData, each metabolite / reaction / gene entry, `annotation`, `metabolites`, `compartments`, and the ec sections). +2. Each metabolite, reaction, and gene is **one `- !!omap` element** of a list. Inside that mapping, every field is written as `- key: value`. This is cobrapy's native shape and is what RAVEN MATLAB's reader keys off. +3. Strings are **unquoted by default**; quotes appear only when YAML would otherwise misparse the value (leading `-`, `[`, `?` or `:`; embedded `: ` or ` #`; values that look like `true` / `false` / `null`). + +--- + +## Top-level layout + +``` +!!omap +- metaData: !!omap # optional; RAVEN extension +- metabolites: # required +- reactions: # required +- genes: # required (may be `genes: []`) +- compartments: !!omap # required +- gecko_light: # optional; GECKO extension +- ec-rxns: # optional; GECKO extension +- ec-enzymes: # optional; GECKO extension +``` + +| Key | Required | Source | Notes | +|-----|----------|--------|-------| +| `metaData` | optional | RAVEN | Provenance block. Holds `id`, `name`, `version`, `date`, `taxonomy`, optionally `givenName` / `familyName` / `email` / `organization` / `note` / `sourceUrl`, plus `defaultLB` / `defaultUB`. Cobrapy ignores this block (no semantic loss for the core model). | +| `metabolites` | yes | cobra core | Ordered list of `- !!omap` entries. | +| `reactions` | yes | cobra core | Ordered list of `- !!omap` entries. | +| `genes` | yes | cobra core | Ordered list; may be `genes: []` for a model with no genes. | +| `compartments` | yes | cobra core | `!!omap` of `: `. | +| `gecko_light` | optional | GECKO | Scalar boolean. Cobrapy / raven-python emit this at the top level; the older spelling `geckoLight` inside `metaData` is still accepted on read. | +| `ec-rxns` | optional | GECKO | Per-reaction kcat / source / enzymes coupling table. | +| `ec-enzymes` | optional | GECKO | Per-enzyme MW / sequence / concentration table. | + +Cobrapy writes `id` / `name` / `version` at the root level instead of inside `metaData`. The RAVEN readers accept both placements; the RAVEN writers normalize to the `metaData` form. + +--- + +## Metabolite entry + +Field order (cobra-core first, then RAVEN extensions): + +```yaml +- !!omap + - id: s_0001 # required + - name: ATP # cobra + - compartment: c # cobra + - charge: -4 # cobra (number) + - formula: C10H16N5O13P3 # cobra + - notes: "free-text" # cobra + - annotation: !!omap # cobra (MIRIAM + smiles) + - kegg.compound: C00002 + - chebi: + - CHEBI:15422 + - CHEBI:30616 + - sbo: SBO:0000247 + - smiles: "OC1=NC..." # quoted when it contains [ ] : etc. + - inchis: "InChI=1S/..." # RAVEN extension + - deltaG: -2768.1 # RAVEN extension + - metFrom: KEGG # RAVEN extension +``` + +Cobrapy emits exactly the first seven keys (the cobra-core block). raven-python and RAVEN MATLAB additionally emit `inchis`, `deltaG`, and `metFrom` when those fields are populated. On read, cobrapy puts the RAVEN extensions on the metabolite as attribute fall-through; raven-python captures them into `metabolite.notes` (keyed by their YAML name); RAVEN MATLAB stores them on `model.inchis` / `model.metDeltaG` / `model.metFrom`. + +Annotation entries with multiple values are emitted as a YAML list (`chebi:` then several `-` items). Single-value entries are emitted inline (`kegg.compound: C00002`). SMILES strings live inside the annotation block under the `smiles` key — not as a top-level metabolite field, which is the historical RAVEN MATLAB shape and is still accepted on read for backward compatibility. + +--- + +## Reaction entry + +```yaml +- !!omap + - id: r_0001 # required + - name: hexokinase # cobra + - metabolites: !!omap # cobra (sorted by met id) + - s_0001: -1.0 + - s_0394: 1.0 + - lower_bound: 0.0 # cobra (number) + - upper_bound: 1000.0 # cobra (number) + - gene_reaction_rule: YGL253W or YCL040W # cobra + - objective_coefficient: 1 # cobra; omitted when 0 + - subsystem: Glycolysis / Gluconeogenesis # cobra + - notes: "MetaNetX ID curated (PR #220)" # cobra + - annotation: !!omap # cobra + - kegg.reaction: R00299 + - sbo: SBO:0000176 + - eccodes: # RAVEN extension + - 2.7.1.1 + - 2.7.1.2 + - references: "PMID:12345" # RAVEN extension + - rxnFrom: KEGG # RAVEN extension + - deltaG: -17.39 # RAVEN extension + - confidence_score: 2.0 # RAVEN extension +``` + +Some fields are conditional: + +- `objective_coefficient` is only written when non-zero (cobrapy convention). +- The `metabolites` block uses `!!omap []` (flow-style empty omap) when the reaction has no metabolites — this keeps the file a valid YAML 1.2 document. +- `eccodes` is written inline (`eccodes: 2.7.1.1`) when there is exactly one code, and as a list when there are several. Same for `references`. + +**Notes key naming.** Cobrapy and the current raven-python / RAVEN MATLAB writers use **`notes`**. Pre-`feat/yeast-gem-shared` yeast-GEM files used `rxnNotes`; both readers accept that as a legacy alias. + +**Bounds typing.** Bounds are emitted as floats with an explicit decimal point (`1000.0`, `-1000.0`), matching Python's float repr and cobrapy's output. + +--- + +## Gene entry + +```yaml +- !!omap + - id: YGL253W # required + - name: HXK2 # cobra; omitted when empty + - annotation: !!omap # cobra + - uniprot: P04807 + - ncbigene: 856421 + - protein: P04807 # RAVEN extension +``` + +Empty names (`name: ''`) are not emitted (matches RAVEN MATLAB's historical behavior). + +--- + +## Compartments + +```yaml +- compartments: !!omap + - c: cytoplasm + - e: extracellular + - m: mitochondrion +``` + +Just an `!!omap` of `: ` pairs. Compartments don't carry their own MIRIAMs in the current format. + +--- + +## metaData (RAVEN extension) + +```yaml +- metaData: !!omap + - id: yeastGEM_develop + - name: The Consensus Genome-Scale Metabolic Model of Yeast + - version: 9.0.0 + - date: 2026-05-27 + - defaultLB: -1000.0 + - defaultUB: 1000.0 + - givenName: Eduard + - familyName: Kerkhoven + - email: eduardk@chalmers.se + - organization: Chalmers University of Technology + - taxonomy: taxonomy/559292 + - note: "Saccharomyces cerevisiae - strain S288C" + - sourceUrl: https://github.com/SysBioChalmers/yeast-GEM +``` + +Pure provenance. Cobrapy ignores the block; raven-python keeps the verbatim dictionary on `model.notes['metaData']` and additionally lifts `id` / `name` / `version` to `model.id` / `model.name` / `model.notes['version']` so cobra-shape accessors find them. RAVEN MATLAB populates `model.id` / `model.name` / `model.version` / `model.annotation.*` from the same fields. + +`date` is preserved across round-trips when present on the model; otherwise the writer fills in `YYYY-MM-DD` of the current date. + +--- + +## GECKO sections + +For enzyme-constrained models, three additional top-level keys carry the EC layer: + +```yaml +- gecko_light: false # true for the "light" formulation +- ec-rxns: + - !!omap + - id: r_0001 + - kcat: 25.3 + - source: brenda + - notes: "" + - eccodes: 2.7.1.1 + - enzymes: !!omap + - P04807: 1.0 +- ec-enzymes: + - !!omap + - genes: YGL253W + - enzymes: P04807 + - mw: 53942 + - sequence: "MVHLGPK..." + - concs: .nan +``` + +These map onto `model.ec` in RAVEN MATLAB and `raven_python.io.ec_data.EcData` (attached as `model.ec`) in raven-python. Cobrapy ignores the sections. + +The older spelling `geckoLight` inside `metaData` is also accepted on read. + +--- + +## Annotations + +The `annotation` block uses MIRIAM-style namespace keys. Cobrapy treats the block as a free-form dictionary; raven-python preserves it verbatim through `cobra.Metabolite.annotation` / `Reaction.annotation` / `Gene.annotation`; RAVEN MATLAB maps it to `model.metMiriams` / `rxnMiriams` / `geneMiriams`. + +- A single value is written inline: `kegg.compound: C00002`. +- Multiple values are written as a YAML list: + + ```yaml + - chebi: + - CHEBI:15422 + - CHEBI:30616 + ``` + +- The `smiles` key inside a metabolite's `annotation` carries the SMILES string (cobrapy convention). RAVEN MATLAB historically emitted `smiles` as a metabolite top-level field; both readers still accept that, but writes are normalized to the annotation block. +- The `sbo` key carries the Systems Biology Ontology term assigned by `assignSBOterms` / `add_sbo_terms`. + +--- + +## Numbers, strings, quoting + +**Numbers.** Whole-number floats are written with an explicit `.0` (`1000.0`, `-1000.0`, `0.0`). Other floats use up to 15 significant digits (`-17.39`, `-2768.1`). `NaN` is encoded as `.nan`; `+Inf` / `-Inf` as `.inf` / `-.inf` (YAML 1.2 conventions). + +**Strings.** Default style is bare (no quotes). The writer falls back to double-quoted style when the value: + +- starts with `-`, `?`, `:`, or any flow indicator (`[`, `]`, `{`, `}`, `,`, `&`, `*`, `!`, `|`, `>`, `%`, `@`, `` ` ``, `#`); +- contains `: ` (would otherwise be parsed as a key/value), ` #` (comment), or one of `[`, `]`, `{`, `}`; +- has leading or trailing whitespace; +- spells a YAML reserved word case-insensitively (`true`, `false`, `null`, `yes`, `no`, `on`, `off`, `~`). + +In a double-quoted string, only `\` and `"` are escaped. Other characters (including Unicode and newlines if the underlying model permitted them) are passed through. + +--- + +## Tooling interoperability matrix + +| File written by ↓ \ Reader → | cobrapy | raven-python | RAVEN MATLAB | +|---|---|---|---| +| cobrapy (`save_yaml_model`) | full | full + extras land in `notes` via attribute fall-through | works for root-level `id` / `name` / `version` (added in this release) | +| raven-python (`write_yaml_model`) | core (no `metaData`-derived `id`); RAVEN extras live as unknown top-level keys but don't break parsing | full | full | +| RAVEN MATLAB (`writeYAMLmodel`) | core (no `metaData`-derived `id`); RAVEN extras land via attribute fall-through | full | full | + +"Full" = every field read back into its canonical position on the model object; "core" = cobrapy-known fields, RAVEN extensions ignored or kept on the object as attribute fall-through (`reaction.eccodes` etc., not re-emitted on save). A round-trip through cobrapy is therefore **lossy for RAVEN extensions** — only the core fields survive `cobrapy.load → cobrapy.save`. Round-trips through raven-python or RAVEN MATLAB are lossless. + +--- + +## What round-tripping looks like + +Loading `yeast-GEM.yml` (2748 metabolites, 4102 reactions, 1143 genes) and re-writing it through any of the three tools preserves every documented piece of content: + +| Count | After round-trip | +|---|---| +| metabolites | 2748 / 2748 | +| reactions | 4102 / 4102 | +| genes | 1143 / 1143 | +| reactions with eccodes | 2411 | +| reactions with deltaG | 3984 | +| metabolites with deltaG | 2696 | +| metabolites with SMILES | 1788 | +| reactions with notes (rxnNotes) | 1443 | + +(Cobrapy round-trips give 2748 / 4102 / 1143 for the core but drop the RAVEN extensions in the rightmost column — that's the documented loss.) + +--- + +## What changed in `feat/yeast-gem-shared` + +- raven-python writer no longer drops `!!omap` tags (was producing files RAVEN MATLAB's reader couldn't load). +- raven-python now preserves `eccodes` and accepts the legacy `rxnNotes` reaction key on read. +- RAVEN MATLAB writer reorders metabolite / reaction fields to match cobrapy. +- RAVEN MATLAB writer renames the reaction `rxnNotes` key to `notes` and emits SMILES inside the annotation block (still accepts both shapes on read). +- RAVEN MATLAB writer's `preserveQuotes` default is now `false`; values that need quoting (SMILES with `[O-]`, leading flow indicators, booleans, `: `-containing strings) are quoted defensively per value. +- RAVEN MATLAB writer emits whole-number bounds as `1000.0` (matches cobrapy / Python float repr) instead of `1000`. +- RAVEN MATLAB reader accepts cobrapy's root-level `id` / `name` / `version` / `gecko_light`, the `!!omap`-tagged `metaData` header, and `notes` (canonical) in addition to `rxnNotes` (legacy). +- Empty `reaction.metabolites` blocks are emitted as `!!omap []` (valid YAML 1.2) rather than an empty `!!omap` with no value. +- Document-start marker `---` dropped to match cobrapy's bare `!!omap` root. + +These changes are byte-stable for cobrapy and raven-python users; existing yeast-GEM YAML files continue to load. The first time a yeast-GEM curation pass rewrites the file with the new MATLAB writer, the diff will look large (because of the reordering and quote-style changes) but the model content is unchanged. diff --git a/src/raven_python/io/yaml.py b/src/raven_python/io/yaml.py index d363ee9..3ea96f9 100644 --- a/src/raven_python/io/yaml.py +++ b/src/raven_python/io/yaml.py @@ -235,7 +235,17 @@ def model_from_yaml_data(raw: dict) -> cobra.Model: model.notes["version"] = version # Pop the ec sections out of `foreign` and into a typed EcData. - # The remaining unknown keys round-trip opaquely. + # The remaining unknown keys round-trip opaquely. Pre-shim RAVEN + # MATLAB writes wrote `geckoLight: "true"` inside metaData (rather + # than the current top-level `gecko_light`); honour the legacy + # placement too — keep the metaData entry untouched (round-trip) + # and surface it at the top level so EcData picks it up. + legacy_gecko = metadata.get("geckoLight") + if legacy_gecko is not None and "gecko_light" not in foreign: + if isinstance(legacy_gecko, str): + foreign["gecko_light"] = legacy_gecko.lower() == "true" + else: + foreign["gecko_light"] = bool(legacy_gecko) ec_sections = {k: foreign.pop(k) for k in list(foreign) if k in _EC_TOP_KEYS} ec_data = ec_data_from_yaml_sections(ec_sections) if ec_data is not None: diff --git a/tests/test_io_yaml_parity.py b/tests/test_io_yaml_parity.py index 459f563..2c89f58 100644 --- a/tests/test_io_yaml_parity.py +++ b/tests/test_io_yaml_parity.py @@ -187,6 +187,158 @@ def test_smiles_with_yaml_special_chars_quoted(src, tmp_path): ] +PRE_SHIM_YAML = """\ +--- +!!omap +- metaData: + id: "eciYali" + name: "Yarrowia lipolytica" + version: "1.0" + date: "2024-10-17" + geckoLight: "true" +- metabolites: + - !!omap + - id: "s_0001" + - name: "ATP" + - compartment: "c" + - formula: "C10H16N5O13P3" + - charge: -4 + - inchis: "InChI=1S/CH4" + - smiles: "[O-]P(=O)([O-])OP(=O)([O-])O" + - annotation: !!omap + - kegg.compound: "C00002" + - sbo: "SBO:0000247" + - deltaG: 12.5 + - notes: "metabolite note" + - metFrom: "KEGG" +- reactions: + - !!omap + - id: "r_0001" + - name: "hexokinase" + - metabolites: !!omap + - s_0001: -1 + - lower_bound: -1000 + - upper_bound: 1000 + - gene_reaction_rule: "G1" + - rxnFrom: "KEGG" + - eccodes: "2.7.1.1" + - references: "PMID:12345" + - subsystem: "Glycolysis" + - annotation: !!omap + - kegg.reaction: "R00299" + - sbo: "SBO:0000176" + - deltaG: -17.39 + - confidence_score: 2 + - rxnNotes: "old reaction note" +- genes: + - !!omap + - id: "G1" + - name: "HXK1" + - protein: "P01234" + - annotation: !!omap + - uniprot: "P01234" +- compartments: !!omap + - c: "cytoplasm" +- ec-rxns: + - !!omap + - id: "r_0001" + - kcat: 25.3 + - enzymes: !!omap + - P01234: 1 +- ec-enzymes: + - !!omap + - genes: "G1" + - enzymes: "P01234" + - mw: 50000 +""" + + +def test_pre_shim_format_loads(tmp_path): + """The pre-`feat/yeast-gem-shared` RAVEN MATLAB writer emitted a + file shape that differs from the current one in seven concrete + ways. The reader must continue to load every one of them: + + 1. ``---`` document-start marker (kept by old MATLAB writer) + 2. ``- metaData:`` as a plain block mapping (no ``!!omap`` tag) + 3. ``geckoLight: "true"`` *inside* metaData (now emitted as a + top-level ``gecko_light``) + 4. Metabolite ``smiles`` as a top-level entry key (now emitted + inside the ``annotation`` block) + 5. Reaction notes under the ``rxnNotes`` key (now emitted as + ``notes``) + 6. Integer-typed bounds / coefficients (now emitted as floats) + 7. Every string double-quoted (now bare unless YAML requires + quoting) + + Each item below maps to one of those seven cases. + """ + p = tmp_path / "pre_shim.yml" + p.write_text(PRE_SHIM_YAML) + model = read_yaml_model(p) + + # metaData survives + provenance is lifted onto the cobra-shape + # accessors (cases 1 + 2). + assert model.id == "eciYali" + assert model.name == "Yarrowia lipolytica" + assert model.notes["version"] == "1.0" + assert model.notes["metaData"]["taxonomy" if "taxonomy" in model.notes["metaData"] else "date"] + + # geckoLight in metaData populates the typed EcData (case 3). + assert model.ec is not None + assert model.ec.gecko_light is True + assert model.ec.rxns == ["r_0001"] + assert model.ec.kcat[0] == 25.3 + + # Top-level smiles lifted into annotation.smiles (case 4). + a = model.metabolites.get_by_id("s_0001") + assert a.annotation["smiles"] == ["[O-]P(=O)([O-])OP(=O)([O-])O"] + assert "smiles" not in a.notes # stays in annotation, not notes + + # rxnNotes read as the canonical notes key (case 5). + r = model.reactions.get_by_id("r_0001") + assert r.notes["note"] == "old reaction note" + + # Integer bounds become floats inside cobra (case 6). + assert r.bounds == (-1000.0, 1000.0) + assert isinstance(r.lower_bound, float) + + # Quoted strings unquote cleanly (case 7) — verified implicitly by + # all the equality assertions above. Spot check the metabolite + # name, which used double quotes in the source. + assert a.name == "ATP" + + # Other RAVEN extras still preserved. + assert a.notes["inchis"] == "InChI=1S/CH4" + assert a.notes["deltaG"] == 12.5 + assert a.notes["note"] == "metabolite note" + assert a.notes["metFrom"] == "KEGG" + assert r.notes["rxnFrom"] == "KEGG" + assert r.notes["eccodes"] == "2.7.1.1" + assert r.notes["references"] == "PMID:12345" + assert r.notes["confidence_score"] == 2 + assert r.notes["deltaG"] == -17.39 + assert model.genes.get_by_id("G1").notes["protein"] == "P01234" + + +def test_pre_shim_yeast_gem_loads_if_available(): + """The real pre-shim yeast-GEM.yml: 2748 mets, 4102 rxns, 1143 + genes. Skipped when the working copy isn't mounted (CI runners).""" + real = Path("/mnt/c/Work/GitHub/yeast-GEM/model/yeast-GEM.yml") + if not real.exists(): + pytest.skip("yeast-GEM.yml not available in this environment") + model = read_yaml_model(real) + assert model.id == "yeastGEM_develop" + assert len(model.metabolites) == 2748 + assert len(model.reactions) == 4102 + assert len(model.genes) == 1143 + # Every RAVEN extension we know about must come through. + assert sum(1 for r in model.reactions if r.notes.get("eccodes")) == 2411 + assert sum(1 for r in model.reactions if r.notes.get("deltaG") is not None) == 3984 + assert sum(1 for m in model.metabolites if m.notes.get("deltaG") is not None) == 2696 + assert sum(1 for m in model.metabolites if "smiles" in (m.annotation or {})) == 1788 + assert sum(1 for r in model.reactions if r.notes.get("note")) == 1443 + + def test_eccodes_round_trip_through_cobra_extras(src, tmp_path): """A model loaded from cobra (no eccodes awareness) and re-written via raven_python.write_yaml_model still keeps eccodes — they're From 8930b10033077478fcd356eb9ccacf2aa40288ae Mon Sep 17 00:00:00 2001 From: Eduard Kerkhoven Date: Wed, 3 Jun 2026 18:51:03 +0000 Subject: [PATCH 8/8] Cobra-aligned hardening pass from full code review No behaviour change on well-formed inputs. Highlights: - Packaging: derive __version__ from package metadata (was a stale hard-coded "0.0.1" that the docs site reported); pin ruff==0.15.15 in the dev extra and CI; fix two lint errors unpinned ruff started flagging. - Errors: solver/feasibility failures in run_init, run_ftinit, fill_tasks and random_sampling now raise cobra.exceptions.OptimizationError instead of bare RuntimeError (consistent with the rest of the package). - Consistency: single utils.parse.subsystem_to_str coerces reaction subsystem to cobra's canonical str across io.excel / comparison.compare / curation.batch / manipulation.add (fixes a crash on non-string items and the silent drop of multi-subsystem reactions); shared GPR score aggregators in utils.gpr used by init.score and init.genes; KEGG-download progress uses a module logger instead of print. - Robustness: zip path-traversal guard in binaries.py; penalty>0 check in connect_blocked_reactions; NaN-sample guard in random_sampling; all-zero ec coupling warning; optional verify= SHA256 re-check on data cache hits; non-finite z-score guard in reporter. Regression tests added for each. --- .github/workflows/ci.yml | 4 +- CHANGELOG.md | 28 +++++++++ pyproject.toml | 2 +- src/raven_python/__init__.py | 7 ++- src/raven_python/analysis/reporter.py | 2 + src/raven_python/analysis/sampling.py | 12 +++- src/raven_python/binaries.py | 18 +++++- src/raven_python/comparison/compare.py | 10 ++-- src/raven_python/curation/batch.py | 4 +- src/raven_python/data.py | 10 +++- src/raven_python/gapfilling/fill.py | 2 + src/raven_python/init/ftinit.py | 3 +- src/raven_python/init/genes.py | 8 +-- src/raven_python/init/init.py | 3 +- src/raven_python/init/score.py | 8 +-- src/raven_python/init/taskfill.py | 11 ++-- src/raven_python/io/ec_data.py | 9 +++ src/raven_python/io/excel.py | 6 +- src/raven_python/localization/predict.py | 5 +- src/raven_python/manipulation/add.py | 8 ++- src/raven_python/manipulation/transport.py | 12 ++-- .../reconstruction/kegg/download.py | 9 ++- src/raven_python/utils/gpr.py | 30 ++++++++++ src/raven_python/utils/parse.py | 26 +++++++++ tests/test_binaries.py | 21 +++++++ tests/test_gapfilling.py | 6 ++ tests/test_init.py | 16 ++++++ tests/test_io_yaml_ec_data.py | 10 ++++ tests/test_io_yaml_parity.py | 7 +-- tests/test_utils_parse.py | 57 +++++++++++++++++++ 30 files changed, 304 insertions(+), 50 deletions(-) create mode 100644 tests/test_utils_parse.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b5b7f12..cdfbe0c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,9 @@ jobs: cache: pip cache-dependency-path: pyproject.toml - run: pip install --upgrade pip - - run: pip install ruff + # Pinned so the lint result is reproducible (unpinned ruff silently + # adds rules over time). Keep in sync with the ``dev`` extra in pyproject. + - run: pip install ruff==0.15.15 - run: ruff check . test: diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d9bd98..44f3149 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,34 @@ Milestones in the raven-python port. For function-level status see [docs/raven_migration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/migration.md); for open work see [docs/todo.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/todo.md). +## Unreleased + +Post-release review pass — cobra-aligned hardening (no behaviour change on +well-formed inputs). Highlights: + +* **Packaging:** `raven_python.__version__` now derives from the installed package + metadata (`importlib.metadata`) instead of a hard-coded literal that had drifted + to `0.0.1`; the docs site reported the wrong version. Pinned `ruff==0.15.15` in + both the `dev` extra and CI so the lint result is reproducible, and fixed two + lint errors the unpinned ruff had started flagging. +* **Errors aligned to cobra:** solver/feasibility failures in `run_init`, + `run_ftinit`, `fill_tasks` and `random_sampling` now raise + `cobra.exceptions.OptimizationError` (already used elsewhere in the package) + instead of a bare `RuntimeError`. +* **Consistency:** a single `utils.parse.subsystem_to_str` coerces a reaction + `subsystem` to cobra's canonical `str` everywhere it is rendered/compared + (`io.excel`, `comparison.compare`, `curation.batch`, `manipulation.add`) — fixes + a crash on non-string subsystem items and the silent drop of multi-subsystem + reactions. GPR score-aggregation (`AGGREGATORS` / `resolve_aggregators`) is now + shared by `init.score` and `init.genes`. Maintainer-side KEGG-download progress + uses a module logger instead of `print`. +* **Robustness:** path-traversal guard on bundled-ZIP extraction (`binaries.py`, + matching the tarfile `filter="data"` precedent); `connect_blocked_reactions` + rejects a non-positive `penalty`; `random_sampling` refuses a NaN-contaminated + sample matrix; `ec_data` warns on an all-zero reaction↔enzyme coupling; optional + `verify=` SHA256 re-check on `ensure_data_file` cache hits; reporter p-value + guarded against non-finite z-scores. Regression tests added for each. + ## 0.1.0a1 — 2026-05-30 First alpha release. Covers the functional scope of RAVEN built on cobrapy: diff --git a/pyproject.toml b/pyproject.toml index adb2ab7..376a961 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ dependencies = [ dev = [ "pytest>=7", "pytest-cov", - "ruff>=0.4", + "ruff==0.15.15", ] excel = [ "openpyxl>=3.1", diff --git a/src/raven_python/__init__.py b/src/raven_python/__init__.py index 591c4c1..0c62aba 100644 --- a/src/raven_python/__init__.py +++ b/src/raven_python/__init__.py @@ -7,4 +7,9 @@ sub-cellular localisation, N-model comparison, and the RAVEN-style I/O formats. """ -__version__ = "0.0.1" +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("raven-python") +except PackageNotFoundError: # running from a source tree without an install + __version__ = "0.0.0+unknown" diff --git a/src/raven_python/analysis/reporter.py b/src/raven_python/analysis/reporter.py index 5d96d47..5150968 100644 --- a/src/raven_python/analysis/reporter.py +++ b/src/raven_python/analysis/reporter.py @@ -60,6 +60,8 @@ def _reporter_one(model: cobra.Model, gene_z: dict[str, float], test: str) -> Re raw = zs.sum() / math.sqrt(n) # Exact background correction for sampling-with-replacement (see module doc). corrected = (raw - math.sqrt(n) * mu) / sigma if sigma > 0 else 0.0 + if not math.isfinite(corrected): # never on clamped z-scores; guards the p-value + corrected = 0.0 rows.append( { "metabolite": met.id, diff --git a/src/raven_python/analysis/sampling.py b/src/raven_python/analysis/sampling.py index 429b164..e187ffd 100644 --- a/src/raven_python/analysis/sampling.py +++ b/src/raven_python/analysis/sampling.py @@ -36,6 +36,7 @@ import cobra import numpy as np import pandas as pd +from cobra.exceptions import OptimizationError from cobra.flux_analysis import flux_variability_analysis, pfba logger = logging.getLogger(__name__) @@ -188,11 +189,18 @@ def random_sampling( model.objective = model.problem.Objective(sum(terms), direction="max") sol = model.optimize() if sol.status == "optimal" and abs(sol.objective_value) > 1e-8: - samples[i, :] = (pfba(model) if min_flux else sol).fluxes.reindex(reaction_ids).to_numpy() + fluxes = (pfba(model) if min_flux else sol).fluxes.reindex(reaction_ids) + if fluxes.isna().any(): # solver returned an unexpected reaction set + missing = fluxes.index[fluxes.isna()].tolist() + raise OptimizationError( + "solver returned fluxes missing reaction(s) " + f"{missing[:5]}; cannot assemble a NaN-free sample matrix." + ) + samples[i, :] = fluxes.to_numpy() break if attempt == max_attempts: if not suppress_errors: - raise RuntimeError( + raise OptimizationError( "Could not find a non-zero, loop-free solution after " f"{max_attempts} attempts for sample {i}. Review the model's " "constraints, or set suppress_errors=True." diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py index ed7ec7b..d15b0b0 100644 --- a/src/raven_python/binaries.py +++ b/src/raven_python/binaries.py @@ -86,6 +86,22 @@ def _sha256(path: Path) -> str: return h.hexdigest() +def _safe_extract_zip(zf: zipfile.ZipFile, dest_dir: Path) -> None: + """Extract ``zf`` into ``dest_dir``, rejecting members that escape it. + + ``ZipFile.extractall`` has no path-traversal guard (unlike tarfile's + ``filter="data"`` used in reconstruction/kegg/download.py), so a malicious or + corrupt archive could write outside the cache via absolute paths or ``..``. + SHA256 + HTTPS already make a hostile archive unlikely; this is defence in depth. + """ + dest = dest_dir.resolve() + for member in zf.namelist(): + target = (dest_dir / member).resolve() + if target != dest and dest not in target.parents: + raise ValueError(f"unsafe path in archive (path traversal): {member!r}") + zf.extractall(dest_dir) + + def ensure_binary(executable: str, *, registry: dict | None = None) -> Path: """Download (if needed) and return the path to a bundled ``executable``. @@ -134,7 +150,7 @@ def ensure_binary(executable: str, *, registry: dict | None = None) -> Path: finally: part.unlink(missing_ok=True) with zipfile.ZipFile(archive) as zf: - zf.extractall(dest_dir) + _safe_extract_zip(zf, dest_dir) archive.unlink(missing_ok=True) if not exe.exists(): raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.") diff --git a/src/raven_python/comparison/compare.py b/src/raven_python/comparison/compare.py index c7d38a1..36e8b73 100644 --- a/src/raven_python/comparison/compare.py +++ b/src/raven_python/comparison/compare.py @@ -18,6 +18,7 @@ import pandas as pd from raven_python.tasks import Task, check_tasks +from raven_python.utils.parse import subsystem_to_str @dataclass @@ -60,12 +61,9 @@ def _subsystem_counts(model: cobra.Model) -> dict[str, int]: """{subsystem_name: reaction_count}. Reactions with empty subsystem fall under '(none)'.""" counts: dict[str, int] = {} for r in model.reactions: - # cobra stores subsystem as a string; RAVEN sometimes uses cell-of-cells (we'd - # already have it as a string here, but guard against list/tuple from messy YAML). - sub = r.subsystem - if isinstance(sub, (list, tuple)): - sub = sub[0] if sub else "" - sub = (sub or "").strip() or "(none)" + # cobra stores subsystem as a string; RAVEN sometimes uses cell-of-cells. + # Coerce to the same ;-joined string used everywhere else (no data lost). + sub = subsystem_to_str(r.subsystem).strip() or "(none)" counts[sub] = counts.get(sub, 0) + 1 return counts diff --git a/src/raven_python/curation/batch.py b/src/raven_python/curation/batch.py index 1bdeb23..0f00b4d 100644 --- a/src/raven_python/curation/batch.py +++ b/src/raven_python/curation/batch.py @@ -8,6 +8,8 @@ import cobra import pandas as pd +from raven_python.utils.parse import subsystem_to_str + #: Core columns recognised in ``mets_df``. Anything else is treated as a #: MIRIAM annotation column (the header becomes the namespace key). DEFAULT_CORE_MET_COLUMNS: tuple[str, ...] = ( @@ -369,7 +371,7 @@ def _update_reaction(rxn: cobra.Reaction, row: pd.Series, miriam_cols: list[str] if _has_value(row.get("ub")): rxn.upper_bound = float(row["ub"]) if _has_value(row.get("subSystems")): - rxn.subsystem = str(row["subSystems"]) + rxn.subsystem = subsystem_to_str(row["subSystems"]) if _has_value(row.get("eccodes")): rxn.annotation["ec-code"] = str(row["eccodes"]) if _has_value(row.get("rxnNotes")): diff --git a/src/raven_python/data.py b/src/raven_python/data.py index ecb46a3..fd26c8d 100644 --- a/src/raven_python/data.py +++ b/src/raven_python/data.py @@ -75,12 +75,18 @@ def ensure_data_file( *, version: str | None = None, registry: dict | None = None, + verify: bool = False, ) -> Path: """Download (if needed) and return the cached path to one artefact file. Looks the file up in the registry for ``dataset`` (at ``version`` or the registry's default), downloads it to the version-pinned cache directory, verifies its SHA256, and returns the path. Re-uses an already-cached copy. + + A freshly downloaded file is always SHA256-checked. ``verify`` additionally + re-checks an *already-cached* file's SHA256 (a mismatch — i.e. a corrupted + cache — discards it and re-downloads); it is off by default so the common + cache-hit path stays fast. """ registry = _DATA_REGISTRY if registry is None else registry bundle = _bundle(dataset, registry) @@ -95,7 +101,9 @@ def ensure_data_file( dest_dir = _data_cache_dir() / f"{dataset}-{ver}" dest = dest_dir / filename if dest.exists(): - return dest + if not verify or _sha256(dest) == entry["sha256"]: + return dest + dest.unlink() # corrupted cache → fall through and re-download dest_dir.mkdir(parents=True, exist_ok=True) tmp = dest.with_name(dest.name + ".part") diff --git a/src/raven_python/gapfilling/fill.py b/src/raven_python/gapfilling/fill.py index ba3418d..ad0f1ab 100644 --- a/src/raven_python/gapfilling/fill.py +++ b/src/raven_python/gapfilling/fill.py @@ -137,6 +137,8 @@ def connect_blocked_reactions( The draft is expected to have exchange reactions for its nutrients (otherwise most reactions are trivially blocked). """ + if penalty <= 0: + raise ValueError(f"penalty must be > 0 (a non-positive cost malforms the MILP); got {penalty}.") templates = _as_models(templates) blocked = set(find_blocked_reactions(model)) candidates = [r for r in blocked if model.reactions.get_by_id(r).lower_bound >= 0] diff --git a/src/raven_python/init/ftinit.py b/src/raven_python/init/ftinit.py index b355e45..58e4ab9 100644 --- a/src/raven_python/init/ftinit.py +++ b/src/raven_python/init/ftinit.py @@ -51,6 +51,7 @@ from dataclasses import dataclass, field import cobra +from cobra.exceptions import OptimizationError from optlang.symbolics import Real, add, mul from raven_python.init.genes import remove_low_score_genes @@ -210,7 +211,7 @@ def add_constraint(expr, **kw): opt.optimize() # Accept a near-optimal incumbent (when a MIP gap / time limit is set), as RAVEN does. if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"): - raise RuntimeError(f"ftINIT MILP did not solve (status: {opt.status}).") + raise OptimizationError(f"ftINIT MILP did not solve (status: {opt.status}).") # RAVEN: a reaction is "on" iff its indicator ≥ 0.5 (positive indicators are # continuous and can land fractionally when a reaction can carry only tiny flux). diff --git a/src/raven_python/init/genes.py b/src/raven_python/init/genes.py index ceed3da..4076c13 100644 --- a/src/raven_python/init/genes.py +++ b/src/raven_python/init/genes.py @@ -11,13 +11,12 @@ from __future__ import annotations import ast -import statistics from collections.abc import Mapping import cobra from cobra.manipulation import remove_genes -_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean} +from raven_python.utils.gpr import resolve_aggregators def _prune(node, scores, iso, cplx) -> tuple[str | None, float | None]: @@ -64,10 +63,7 @@ def remove_low_score_genes( **deterministically** (first on a tie), unlike RAVEN's random tie-break — same quality, reproducible. """ - for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)): - if value not in _AGG: - raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.") - iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring] + iso, cplx = resolve_aggregators(isozyme_scoring, complex_scoring) out = model.copy() for rxn in out.reactions: diff --git a/src/raven_python/init/init.py b/src/raven_python/init/init.py index f23e17a..3cd5942 100644 --- a/src/raven_python/init/init.py +++ b/src/raven_python/init/init.py @@ -39,6 +39,7 @@ from dataclasses import dataclass import cobra +from cobra.exceptions import OptimizationError from optlang.symbolics import Real, add, mul _EPS = 1.0 # flux an included reaction must carry (RAVEN's fake-met unit) @@ -203,7 +204,7 @@ def run_init( opt.optimize() # With a MIP gap / time limit set, accept a near-optimal incumbent (as RAVEN does). if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"): - raise RuntimeError(f"INIT MILP did not solve (status: {opt.status}).") + raise OptimizationError(f"INIT MILP did not solve (status: {opt.status}).") # A reaction is kept if any of its directed parts is essential or has x≈1. kept_origins = {d.origin for d in directed if d.essential} diff --git a/src/raven_python/init/score.py b/src/raven_python/init/score.py index 6e14f86..c380544 100644 --- a/src/raven_python/init/score.py +++ b/src/raven_python/init/score.py @@ -15,12 +15,11 @@ import ast import math -import statistics from collections.abc import Mapping import cobra -_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean} +from raven_python.utils.gpr import resolve_aggregators def gene_scores_from_expression( @@ -70,10 +69,7 @@ def score_reactions_from_genes( no_gene_score: float = -2.0, ) -> dict[str, float]: """Return ``{reaction_id: score}`` from per-gene scores via each reaction's GPR.""" - for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)): - if value not in _AGG: - raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.") - iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring] + iso, cplx = resolve_aggregators(isozyme_scoring, complex_scoring) scores: dict[str, float] = {} for rxn in model.reactions: diff --git a/src/raven_python/init/taskfill.py b/src/raven_python/init/taskfill.py index 58501ce..d90cb00 100644 --- a/src/raven_python/init/taskfill.py +++ b/src/raven_python/init/taskfill.py @@ -18,6 +18,7 @@ from dataclasses import dataclass import cobra +from cobra.exceptions import OptimizationError from optlang.symbolics import Real, add, mul from raven_python.tasks import Task @@ -87,13 +88,15 @@ def _fill_one_task( right trade for tractability, exactly as for the main ftINIT MILP. """ if not candidates: # nothing left to add → task cannot be made feasible - raise RuntimeError(f"gap-filling found no candidates for task {task.id!r}.") + raise OptimizationError(f"gap-filling found no candidates for task {task.id!r}.") combined = _closed_copy(model) # task I/O via the task's b, not the model's exchanges combined.add_reactions([r.copy() for r in candidates]) name_to_id, comp_to_ids = task_name_maps(combined) _, error = apply_task_constraints(combined, task, name_to_id, comp_to_ids) if error is not None: - raise RuntimeError(f"task {task.id!r} could not be applied to the reference: {error}") + raise OptimizationError( + f"task {task.id!r} could not be applied to the reference: {error}" + ) prob = combined.problem extras = [] @@ -126,7 +129,7 @@ def _fill_one_task( # (no incumbent) means the task cannot be satisfied from the reference. if combined.solver.status not in ("optimal", "feasible", "suboptimal", "time_limit") or \ combined.variables[f"_fill_{candidates[0].id}"].primal is None: - raise RuntimeError(f"gap-filling found no way to make task {task.id!r} feasible.") + raise OptimizationError(f"gap-filling found no way to make task {task.id!r} feasible.") return [c.id for c in candidates if (combined.variables[f"_fill_{c.id}"].primal or 0.0) > 0.5] @@ -174,7 +177,7 @@ def fill_tasks( avail = [r for r in candidates if r.id not in present] try: chosen = _fill_one_task(out, avail, task, costs, mip_gap=mip_gap, time_limit=time_limit) - except RuntimeError: + except OptimizationError: failed.append(task.id) continue if chosen: diff --git a/src/raven_python/io/ec_data.py b/src/raven_python/io/ec_data.py index 850c8e8..7ef941e 100644 --- a/src/raven_python/io/ec_data.py +++ b/src/raven_python/io/ec_data.py @@ -45,6 +45,7 @@ from __future__ import annotations import math +import warnings from dataclasses import dataclass, field from typing import Any @@ -263,6 +264,14 @@ def _build_ec_data( ) mat[i, j] = float(stoich) + if n_r > 0 and mat.nnz == 0: # reactions present but none coupled to an enzyme + warnings.warn( + f"ec-rxns has {n_r} entries but none reference an enzyme; the " + "reaction↔enzyme coupling matrix is all-zero (likely malformed " + "ec-rxns/ec-enzymes).", + stacklevel=2, + ) + return EcData( gecko_light=gecko_light, rxns=rxns, diff --git a/src/raven_python/io/excel.py b/src/raven_python/io/excel.py index cf6196e..398b6da 100644 --- a/src/raven_python/io/excel.py +++ b/src/raven_python/io/excel.py @@ -12,6 +12,8 @@ import cobra +from raven_python.utils.parse import subsystem_to_str + def _miriam_string(annotation: dict, exclude: tuple[str, ...] = ()) -> str: """RAVEN MIRIAM column: ``namespace/id;namespace/id2;...`` (sorted).""" @@ -84,9 +86,7 @@ def export_to_excel( "REPLACEMENT ID", "NOTE", "REFERENCE", "CONFIDENCE SCORE"] ) for r in reactions: - subsystem = r.subsystem - if isinstance(subsystem, (list, tuple)): - subsystem = ";".join(subsystem) + subsystem = subsystem_to_str(r.subsystem) ws.append([ None, r.id, r.name, _equation(r), _ec_codes(r), r.gene_reaction_rule, r.lower_bound, r.upper_bound, diff --git a/src/raven_python/localization/predict.py b/src/raven_python/localization/predict.py index 0fb8596..3fc2369 100644 --- a/src/raven_python/localization/predict.py +++ b/src/raven_python/localization/predict.py @@ -223,7 +223,10 @@ def _met_cost(m_id: str) -> float: score_lookup = scores.df # gene_id × compartment → float for g in genes_in_scope: for c in compartments: - s = float(score_lookup.at[g, c]) if c in score_lookup.columns and not pd.isna(score_lookup.at[g, c]) else 0.0 + if c not in score_lookup.columns: + continue + val = score_lookup.at[g, c] + s = 0.0 if pd.isna(val) else float(val) if s: obj_terms.append(mul([Real(s), y[g, c]])) # − transport cost per added transport diff --git a/src/raven_python/manipulation/add.py b/src/raven_python/manipulation/add.py index f5e6495..d772acd 100644 --- a/src/raven_python/manipulation/add.py +++ b/src/raven_python/manipulation/add.py @@ -35,7 +35,7 @@ from cobra import Metabolite, Reaction from cobra.core.gene import GPR -from raven_python.utils.parse import parse_name_comp +from raven_python.utils.parse import parse_name_comp, subsystem_to_str # Reversibility arrows. ``<=>`` must be tried before ``=>`` (it contains it). _REVERSIBLE_ARROWS = ("<=>",) @@ -302,7 +302,9 @@ def add_reactions_from_equations( rxn_id = spec["id"] if rxn_id in model.reactions: raise ValueError( - f"Reaction {rxn_id!r} already exists; use changeRxns or remove it first." + f"Reaction {rxn_id!r} already exists. To change its stoichiometry use " + "change_reaction_equations; to replace it, remove it first with " + "model.remove_reactions([...])." ) if "equation" not in spec: raise ValueError(f"Reaction {rxn_id!r} spec missing required 'equation'.") @@ -324,7 +326,7 @@ def add_reactions_from_equations( lower = config.lower_bound if reversible else 0.0 rxn.bounds = (lower, config.upper_bound) if "subsystem" in spec: - rxn.subsystem = spec["subsystem"] + rxn.subsystem = subsystem_to_str(spec["subsystem"]) model.add_reactions([rxn]) rxn.add_metabolites(coeffs) diff --git a/src/raven_python/manipulation/transport.py b/src/raven_python/manipulation/transport.py index d222377..2edad74 100644 --- a/src/raven_python/manipulation/transport.py +++ b/src/raven_python/manipulation/transport.py @@ -46,11 +46,15 @@ def _transport_id_factory(model: cobra.Model, prefix: str): def next_id() -> str: nonlocal counter - while f"{prefix}{counter:04d}" in model.reactions: + # A free id must appear within len(reactions)+1 tries (only that many are + # occupied); the bound turns any pathological case into a clear error + # instead of spinning forever. + for _ in range(len(model.reactions) + 2): + rid = f"{prefix}{counter:04d}" counter += 1 - rid = f"{prefix}{counter:04d}" - counter += 1 - return rid + if rid not in model.reactions: + return rid + raise RuntimeError(f"could not allocate a free reaction id with prefix {prefix!r}.") return next_id diff --git a/src/raven_python/reconstruction/kegg/download.py b/src/raven_python/reconstruction/kegg/download.py index 8bb1826..d990b42 100644 --- a/src/raven_python/reconstruction/kegg/download.py +++ b/src/raven_python/reconstruction/kegg/download.py @@ -27,12 +27,15 @@ from __future__ import annotations import gzip +import logging import netrc import shutil import tarfile import urllib.request from pathlib import Path +logger = logging.getLogger(__name__) + KEGG_HOST = "ftp.kegg.net" BASE_URL = "https://ftp.kegg.net" @@ -116,12 +119,12 @@ def fetch_kegg_files( target = dest / Path(path).name if target.exists() and not force: if verbose: - print(f" skip (exists): {target.name}") + logger.info("skip (exists): %s", target.name) out.append(target) continue url = f"{base_url.rstrip('/')}/{path.lstrip('/')}" if verbose: - print(f" fetching {path}") + logger.info("fetching %s", path) with opener.open(url) as resp, open(target, "wb") as handle: shutil.copyfileobj(resp, handle) out.append(target) @@ -253,5 +256,5 @@ def download_kegg_dump( verbose=verbose, ) if verbose: - print(">>> Extracting and arranging KEGG dump...") + logger.info("Extracting and arranging KEGG dump...") return extract_kegg_dump(dest) diff --git a/src/raven_python/utils/gpr.py b/src/raven_python/utils/gpr.py index 2e2122d..28818da 100644 --- a/src/raven_python/utils/gpr.py +++ b/src/raven_python/utils/gpr.py @@ -17,11 +17,41 @@ from __future__ import annotations import ast +import statistics +from collections.abc import Callable from dataclasses import dataclass import cobra from cobra.core.gene import GPR +#: Score-aggregation functions for combining gene scores across a GPR: isozymes +#: (genes joined by OR) and complex subunits (joined by AND). Shared by the +#: gene→reaction scoring (:mod:`raven_python.init.score`) and low-score-gene +#: pruning (:mod:`raven_python.init.genes`) so both validate the same way. +AGGREGATORS: dict[str, Callable] = { + "min": min, + "max": max, + "median": statistics.median, + "average": statistics.fmean, +} + + +def resolve_aggregators( + isozyme_scoring: str, complex_scoring: str +) -> tuple[Callable, Callable]: + """Validate the scoring-mode names and return ``(isozyme_fn, complex_fn)``. + + Raises ``ValueError`` naming the offending argument if either mode is not one + of :data:`AGGREGATORS`. + """ + for name, value in ( + ("isozyme_scoring", isozyme_scoring), + ("complex_scoring", complex_scoring), + ): + if value not in AGGREGATORS: + raise ValueError(f"{name} must be one of {sorted(AGGREGATORS)}; got {value!r}.") + return AGGREGATORS[isozyme_scoring], AGGREGATORS[complex_scoring] + def _contains_or(node: ast.AST | None) -> bool: """True if ``node``'s subtree contains an OR operator anywhere.""" diff --git a/src/raven_python/utils/parse.py b/src/raven_python/utils/parse.py index 8068f6c..e35a0d0 100644 --- a/src/raven_python/utils/parse.py +++ b/src/raven_python/utils/parse.py @@ -31,3 +31,29 @@ def parse_name_comp(token: str) -> tuple[str, str | None]: if match: return match.group("name").strip(), match.group("comp").strip() return token.strip(), None + + +def subsystem_to_str(value) -> str: + """Coerce a reaction ``subsystem`` to cobra's canonical ``str`` form. + + cobra defines ``Reaction.subsystem`` as a plain string, but RAVEN/MATLAB and + "messy" YAML sometimes deliver a list/tuple of subsystem names (a reaction can + belong to several). This collapses any such value to a single ``;``-joined + string of its parts — losing no data, unlike taking only the first — and returns + ``""`` for an empty/None subsystem. Used wherever a subsystem is rendered or + compared so the handling is identical across modules. + + Examples + -------- + >>> subsystem_to_str("glycolysis") + 'glycolysis' + >>> subsystem_to_str(["glycolysis", "TCA cycle"]) + 'glycolysis;TCA cycle' + >>> subsystem_to_str(None) + '' + """ + if value is None: + return "" + if isinstance(value, (list, tuple)): + return ";".join(str(s) for s in value if str(s).strip()) + return str(value) diff --git a/tests/test_binaries.py b/tests/test_binaries.py index d74ce0b..9e340b5 100644 --- a/tests/test_binaries.py +++ b/tests/test_binaries.py @@ -78,3 +78,24 @@ def test_ensure_binary_unhosted_platform_raises(tmp_path): registry = {"footool": {"version": "1", "provides": ["footool"], "platforms": {}}} with pytest.raises(FileNotFoundError, match="No bundled"): binaries.ensure_binary("footool", registry=registry) + + +def test_ensure_binary_rejects_path_traversal_zip(tmp_path, monkeypatch): + # A ZIP whose member escapes the extraction dir must be refused, not written + # outside the cache (ZipFile.extractall has no traversal guard of its own). + archive = tmp_path / "evil.zip" + with zipfile.ZipFile(archive, "w") as zf: + zf.writestr("footool", "ok") + zf.writestr("../escape.txt", "pwned") + sha = hashlib.sha256(archive.read_bytes()).hexdigest() + registry = { + "footool": { + "version": "1.0", + "provides": ["footool"], + "platforms": {binaries.platform_key(): {"url": archive.as_uri(), "sha256": sha}}, + } + } + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "cache")) + with pytest.raises(ValueError, match="path traversal"): + binaries.ensure_binary("footool", registry=registry) + assert not (tmp_path / "escape.txt").exists() diff --git a/tests/test_gapfilling.py b/tests/test_gapfilling.py index b92e982..1922fea 100644 --- a/tests/test_gapfilling.py +++ b/tests/test_gapfilling.py @@ -37,6 +37,12 @@ def draft_and_template(): # --------------------------------------------------------------------------- # # Connectivity gap-fill # --------------------------------------------------------------------------- # +def test_non_positive_penalty_rejected(draft_and_template): + draft, template = draft_and_template + with pytest.raises(ValueError, match="penalty must be"): + connect_blocked_reactions(draft, template, penalty=0) + + def test_fill_gaps_connects_blocked_reaction(draft_and_template): draft, template = draft_and_template assert "r1" in cobra.flux_analysis.find_blocked_reactions(draft) # precondition diff --git a/tests/test_init.py b/tests/test_init.py index a61ee19..9bfbc10 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -1,6 +1,7 @@ """Tests for the INIT MILP (init/init.py, Phase 4c).""" import cobra import pytest +from cobra.exceptions import OptimizationError from raven_python.init import InitResult, run_init @@ -40,6 +41,21 @@ def test_keeps_positive_drops_negative(model): assert "r3" not in kept +def test_infeasible_milp_raises_optimization_error(): + # A lone irreversible reaction that drains A (no producer) forced essential cannot + # satisfy steady state without excretion -> the solver is infeasible. raven aligns + # with cobra by raising cobra.exceptions.OptimizationError (not bare RuntimeError). + m = cobra.Model("infeasible") + A = _met("A_c") + m.add_metabolites([A]) + drain = cobra.Reaction("drain", lower_bound=0, upper_bound=1000) + drain.add_metabolites({A: -1}) + m.add_reactions([drain]) + with pytest.raises(OptimizationError): + run_init(m, {"drain": 1.0}, essential_rxns=["drain"], + prod_weight=0.0, allow_excretion=False) + + def test_negative_scores_emptied_when_no_reward(model): # All reactions negative and no production reward -> keep nothing (empty optimum). scores = {r.id: -1.0 for r in model.reactions} diff --git a/tests/test_io_yaml_ec_data.py b/tests/test_io_yaml_ec_data.py index 94c2ebb..3cc99ba 100644 --- a/tests/test_io_yaml_ec_data.py +++ b/tests/test_io_yaml_ec_data.py @@ -349,6 +349,16 @@ def test_load_dangling_enzyme_reference_raises(tmp_path): read_yaml_model(_write_yaml(doc, tmp_path / "m.yml")) +def test_load_all_zero_coupling_matrix_warns(tmp_path): + """ec-rxns present but none reference an enzyme -> the coupling matrix is + all-zero (likely malformed). Warn rather than build a silently empty model.""" + doc = _minimal_model_doc() + doc["ec-rxns"] = [{"id": "R1", "kcat": 1.0, "enzymes": {}}] + doc["ec-enzymes"] = [{"genes": "G1", "enzymes": "P1", "mw": 1.0}] + with pytest.warns(UserWarning, match="all-zero"): + read_yaml_model(_write_yaml(doc, tmp_path / "m.yml")) + + # --------------------------------------------------------------------------- # # In-memory model_from_yaml_data (no file I/O) # --------------------------------------------------------------------------- # diff --git a/tests/test_io_yaml_parity.py b/tests/test_io_yaml_parity.py index 2c89f58..32d8a6c 100644 --- a/tests/test_io_yaml_parity.py +++ b/tests/test_io_yaml_parity.py @@ -25,7 +25,6 @@ from raven_python.io import read_yaml_model, write_yaml_model - SAMPLE = { "metabolites": [ { @@ -349,9 +348,9 @@ def test_eccodes_round_trip_through_cobra_extras(src, tmp_path): pass1 = tmp_path / "via_rp.yml" write_yaml_model(model, pass1) via_cobra = cobra.io.load_yaml_model(str(pass1)) - # cobra exposes eccodes as an attribute (setattr fall-through); - # raven_python sourced it from notes, so .notes['eccodes'] should - # still be present on the reloaded model. + # cobra exposes eccodes as an attribute (setattr fall-through), proving + # the key written by write_yaml_model survives a cobra round-trip. + assert getattr(via_cobra.reactions.get_by_id("R1"), "eccodes", None) == "1.1.1.1" pass2 = tmp_path / "via_rp2.yml" # Promote cobra's setattr-eccodes back into notes for the writer # path. (Tests the documented integration: cobra preserves the YAML diff --git a/tests/test_utils_parse.py b/tests/test_utils_parse.py new file mode 100644 index 0000000..c6d2f74 --- /dev/null +++ b/tests/test_utils_parse.py @@ -0,0 +1,57 @@ +"""Tests for raven_python.utils.parse helpers.""" +from __future__ import annotations + +import cobra + +from raven_python.utils.parse import parse_name_comp, subsystem_to_str + + +def test_parse_name_comp_basic(): + assert parse_name_comp("ATP[c]") == ("ATP", "c") + assert parse_name_comp("ATP") == ("ATP", None) + assert parse_name_comp("weird[name][m]") == ("weird[name]", "m") + + +def test_subsystem_to_str_scalar_and_none(): + assert subsystem_to_str("glycolysis") == "glycolysis" + assert subsystem_to_str("") == "" + assert subsystem_to_str(None) == "" + + +def test_subsystem_to_str_joins_list_without_data_loss(): + # A multi-subsystem reaction keeps every part (unlike taking only the first). + assert subsystem_to_str(["glycolysis", "TCA cycle"]) == "glycolysis;TCA cycle" + # Empty parts are dropped; the rest survive. + assert subsystem_to_str(["", "x"]) == "x" + + +def test_subsystem_to_str_coerces_non_string_items(): + # Non-string items must not crash (the old excel.py ";".join did). + assert subsystem_to_str([1, "name"]) == "1;name" + + +def test_excel_export_handles_list_subsystem(tmp_path): + import pytest + + pytest.importorskip("openpyxl") + from openpyxl import load_workbook + + from raven_python.io.excel import export_to_excel + + model = cobra.Model("m") + a = cobra.Metabolite("a_c", compartment="c") + b = cobra.Metabolite("b_c", compartment="c") + model.add_metabolites([a, b]) + r = cobra.Reaction("R1", lower_bound=-1000, upper_bound=1000) + model.add_reactions([r]) + r.add_metabolites({a: -1, b: 1}) + r.subsystem = ["glycolysis", "TCA cycle"] # list, would crash old ";".join + + out = tmp_path / "m.xlsx" + export_to_excel(model, out) + wb = load_workbook(out) + ws = wb["RXNS"] + header = [c.value for c in next(ws.iter_rows(max_row=1))] + sub_col = header.index("SUBSYSTEM") + row1 = [c.value for c in list(ws.iter_rows(min_row=2, max_row=2))[0]] + assert row1[sub_col] == "glycolysis;TCA cycle"