From 5520991db675b359b8ade1831181817d0fb2b145 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Sat, 30 May 2026 01:59:44 +0200
Subject: [PATCH 1/7] Add diff_models, annotation, and conditions modules for
 yeast-GEM port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lands the upstream-shareable pieces that yeast-GEM has been implementing
locally during its Python port (see yeast-GEM/code/python/PORTING_PLAN.md
and UPSTREAM_CANDIDATES.md). These are organism-agnostic; yeast-GEM
will consume them via a Python dependency on raven-python.

New modules
-----------
raven_python.comparison.diff
    diff_models(a, b, ...) -> DiffReport — strict two-model semantic-
    equality diff. Complements the existing compare_models (N-model
    presence-matrix overview). Used as a CI gate to verify that two
    toolchains (e.g. MATLAB RAVEN vs raven_python, pre/post refactor
    of one toolchain) produce equivalent models. Includes a
    python -m raven_python.comparison.diff CLI.

raven_python.annotation.sbo
    add_sbo_terms — SBO term assignment with "fill" semantic. Default
    parameter set reproduces yeast-GEM's behaviour; biomass metabolite
    names, biomass/NGAM reaction names, and pseudoreaction substrings
    are overridable. Transport detection is pluggable (default: same-
    met-name in two compartments). Includes an `only_last_reaction_
    for_pseudo` legacy bug-compat flag for yeast-GEM's lock-step
    migration; off by default for any new caller.

raven_python.annotation.delta_g
    load_delta_g_csv / save_delta_g_csv — generic side-car CSV
    persistence for scalar notes (ΔG by default, but the note key,
    column names, and id/value mapping are all configurable).

raven_python.conditions.apply
    apply_condition(model, yaml_or_dict) — generic "apply this YAML
    condition" loader. Schema: prelude (reset_exchanges),
    cofactor_pseudoreaction (remove_mets + charge_balance_met),
    biomass_stoichiometry_delta, per-rxn bounds, expected_uptake_count.
    Project-specific extensions (e.g. yeast-GEM's amino_acid_ratio)
    are handled by the caller before/after this function — kept
    upstream-narrow on purpose. Also exposes set_reaction_bounds
    helper that bypasses cobra's lb<=ub validator for the (legitimate)
    cases where a condition lands on an infeasible bound state.

Tests
-----
46 new tests across the three modules; full pre-existing raven-python
suite still passes (519 passed; 1 unrelated pre-existing openpyxl
ImportError in tests/test_io_git.py; 2 skipped). ruff clean.

Not in this commit
------------------
The biomass / GAM / chemostat / fit_gam modules are still tracked as
upstream candidates in yeast-GEM/code/python/UPSTREAM_CANDIDATES.md
and remain local in yeast-GEM until phase 4 of the port (which would
ideally land them directly here).
---
 src/raven_python/annotation/__init__.py |  24 ++
 src/raven_python/annotation/delta_g.py  | 115 ++++++++++
 src/raven_python/annotation/sbo.py      | 180 +++++++++++++++
 src/raven_python/comparison/__init__.py |  22 +-
 src/raven_python/comparison/diff.py     | 285 ++++++++++++++++++++++++
 src/raven_python/conditions/__init__.py |  25 +++
 src/raven_python/conditions/apply.py    | 186 ++++++++++++++++
 tests/test_annotation.py                | 204 +++++++++++++++++
 tests/test_comparison_diff.py           | 146 ++++++++++++
 tests/test_conditions.py                | 199 +++++++++++++++++
 10 files changed, 1383 insertions(+), 3 deletions(-)
 create mode 100644 src/raven_python/annotation/__init__.py
 create mode 100644 src/raven_python/annotation/delta_g.py
 create mode 100644 src/raven_python/annotation/sbo.py
 create mode 100644 src/raven_python/comparison/diff.py
 create mode 100644 src/raven_python/conditions/__init__.py
 create mode 100644 src/raven_python/conditions/apply.py
 create mode 100644 tests/test_annotation.py
 create mode 100644 tests/test_comparison_diff.py
 create mode 100644 tests/test_conditions.py

diff --git a/src/raven_python/annotation/__init__.py b/src/raven_python/annotation/__init__.py
new file mode 100644
index 0000000..c2b4c17
--- /dev/null
+++ b/src/raven_python/annotation/__init__.py
@@ -0,0 +1,24 @@
+"""Annotation helpers — SBO term assignment, ΔG CSV persistence.
+
+These are the pieces of yeast-GEM's ``missingFields`` module that are
+organism-agnostic enough to live upstream. Default parameter values
+match the RAVEN/yeast convention so the functions are immediately
+useful on the standard layout; consumers with different naming pass
+overrides.
+"""
+from raven_python.annotation.delta_g import load_delta_g_csv, save_delta_g_csv
+from raven_python.annotation.sbo import (
+    DEFAULT_BIOMASS_MET_NAMES,
+    DEFAULT_BIOMASS_RXN_NAME,
+    DEFAULT_NGAM_RXN_NAME,
+    add_sbo_terms,
+)
+
+__all__ = [
+    "DEFAULT_BIOMASS_MET_NAMES",
+    "DEFAULT_BIOMASS_RXN_NAME",
+    "DEFAULT_NGAM_RXN_NAME",
+    "add_sbo_terms",
+    "load_delta_g_csv",
+    "save_delta_g_csv",
+]
diff --git a/src/raven_python/annotation/delta_g.py b/src/raven_python/annotation/delta_g.py
new file mode 100644
index 0000000..e0a90e0
--- /dev/null
+++ b/src/raven_python/annotation/delta_g.py
@@ -0,0 +1,115 @@
+"""Persist scalar annotations (Gibbs free energy, etc.) as a side-car CSV.
+
+The committed SBML standard has no slot for thermodynamic ΔG values, so
+projects like yeast-GEM keep them in a paired CSV instead. This module
+provides the generic load / save round-trip — the file format is a two
+column table ``<id>, <value>`` — and exposes the per-call schema so
+existing projects can configure their conventions.
+
+The functions operate on cobra ``.notes`` (string round-tripped through
+SBML), keeping any other notes untouched.
+"""
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable
+from pathlib import Path
+
+import cobra
+import pandas as pd
+
+
+def load_delta_g_csv(
+    entities: Iterable,
+    path: str | Path,
+    *,
+    id_column: str = "Var1",
+    value_column: str = "Var2",
+    note_key: str = "deltaG",
+    verbose: bool = False,
+) -> int:
+    """Stamp ``note_key`` on each entity from a CSV of ``id → value``.
+
+    Parameters
+    ----------
+    entities
+        Cobra entities (``model.metabolites`` or ``model.reactions``).
+    path
+        Path to the CSV.
+    id_column, value_column
+        Column labels in the CSV. Defaults match the yeast-GEM
+        convention (``Var1`` / ``Var2`` from MATLAB's ``array2table``).
+    note_key
+        Key under which the value is stored on ``entity.notes``.
+        Default ``"deltaG"``.
+    verbose
+        Print a summary of unmatched entity ids.
+
+    Returns
+    -------
+    The number of entities that were stamped (i.e. matched the CSV).
+    """
+    df = pd.read_csv(path)
+    if id_column not in df.columns or value_column not in df.columns:
+        raise ValueError(
+            f"{path} columns {list(df.columns)} do not contain "
+            f"both {id_column!r} and {value_column!r}"
+        )
+    lookup = dict(zip(df[id_column], df[value_column], strict=True))
+
+    stamped = 0
+    missing: list[str] = []
+    for entity in entities:
+        value = lookup.get(entity.id)
+        if value is None or (isinstance(value, float) and math.isnan(value)):
+            missing.append(entity.id)
+            continue
+        entity.notes[note_key] = str(value)
+        stamped += 1
+
+    if verbose and missing:
+        print(
+            f"{len(missing)} entity ids were not in {path} "
+            f"(e.g. {missing[:3]}); left untouched."
+        )
+    return stamped
+
+
+def save_delta_g_csv(
+    entities: Iterable,
+    path: str | Path,
+    *,
+    id_column: str = "Var1",
+    value_column: str = "Var2",
+    note_key: str = "deltaG",
+) -> int:
+    """Dump ``entity.notes[note_key]`` for each entity to a CSV.
+
+    Entities without ``note_key`` set get ``NaN`` written, preserving
+    one-row-per-entity ordering (mirrors MATLAB's ``array2table([ids,
+    values])`` behaviour).
+
+    Returns
+    -------
+    The number of rows written (= number of entities).
+    """
+    rows: list[tuple[str, float]] = []
+    for entity in entities:
+        raw = entity.notes.get(note_key)
+        if raw is None:
+            value: float = math.nan
+        else:
+            try:
+                value = float(raw)
+            except ValueError:
+                value = math.nan
+        rows.append((entity.id, value))
+    pd.DataFrame(rows, columns=[id_column, value_column]).to_csv(path, index=False)
+    return len(rows)
+
+
+# Re-export the cobra Model type for type-checker friendliness; helps
+# IDEs surface the right hints to callers that hand us model.metabolites
+# / model.reactions directly.
+__all__ = ["load_delta_g_csv", "save_delta_g_csv"]
+_ = cobra  # silence "imported but unused" — used for typing context above
diff --git a/src/raven_python/annotation/sbo.py b/src/raven_python/annotation/sbo.py
new file mode 100644
index 0000000..d0306cb
--- /dev/null
+++ b/src/raven_python/annotation/sbo.py
@@ -0,0 +1,180 @@
+"""SBO term assignment.
+
+Port of the generic core of yeast-GEM's ``code/missingFields/addSBOterms.m``.
+Defaults reproduce the yeast-GEM behaviour; overrides parameterise the
+pseudo-metabolite / pseudo-reaction name conventions, the transport
+detector, and a yeast-specific bug-compat flag.
+"""
+from __future__ import annotations
+
+from collections.abc import Callable, Iterable
+
+import cobra
+
+DEFAULT_BIOMASS_MET_NAMES: frozenset[str] = frozenset(
+    {"biomass", "DNA", "RNA", "protein", "carbohydrate",
+     "lipid", "cofactor", "ion"}
+)
+DEFAULT_BIOMASS_RXN_NAME: str = "biomass pseudoreaction"
+DEFAULT_NGAM_RXN_NAME: str = "non-growth associated maintenance reaction"
+
+
+def add_sbo_terms(
+    model: cobra.Model,
+    *,
+    biomass_met_names: Iterable[str] = DEFAULT_BIOMASS_MET_NAMES,
+    biomass_met_suffixes: Iterable[str] = (" backbone", " chain"),
+    biomass_rxn_name: str = DEFAULT_BIOMASS_RXN_NAME,
+    ngam_rxn_name: str = DEFAULT_NGAM_RXN_NAME,
+    pseudoreaction_name_substrings: Iterable[str] = ("pseudoreaction", "SLIME rxn"),
+    transport_detector: Callable[[cobra.Model], set[str]] | None = None,
+    only_last_reaction_for_pseudo: bool = False,
+) -> cobra.Model:
+    """Assign SBO terms to metabolites and reactions in-place.
+
+    Metabolite SBO assignment
+        - SBO:0000649 (Biomass) for metabolites whose ``name`` is in
+          ``biomass_met_names`` or ends with any of
+          ``biomass_met_suffixes`` (default: ``" backbone"`` / ``" chain"``).
+        - SBO:0000247 (Simple chemical) otherwise.
+
+    Reaction SBO assignment
+        - SBO:0000176 (Metabolic reaction) default.
+        - Single-reactant reactions (exchange / sink / demand):
+            * extracellular → SBO:0000627 (exchange)
+            * coefficient < 0 → SBO:0000632 (sink)
+            * else → SBO:0000628 (demand)
+        - Transport reactions → SBO:0000655 (default detector: same
+          metabolite name appearing in ≥ 2 compartments).
+        - The reaction whose ``name`` matches ``biomass_rxn_name`` →
+          SBO:0000629.
+        - The reaction whose ``name`` matches ``ngam_rxn_name`` →
+          SBO:0000630.
+        - Other reactions whose ``name`` contains any
+          ``pseudoreaction_name_substrings`` → SBO:0000395.
+
+    "fill" semantic: SBO is written to ``annotation['sbo']`` only when
+    that key is missing or empty — mirrors RAVEN's
+    ``editMiriam(..., 'fill')`` mode.
+
+    Parameters
+    ----------
+    transport_detector
+        Optional callable that takes the model and returns the set of
+        reaction ids classified as transport. When ``None``, the default
+        same-met-name-in-two-compartments heuristic is used. Pass a
+        custom detector to use a different convention (e.g. RAVEN's
+        ``getTransportRxns`` via the MATLAB engine).
+    only_last_reaction_for_pseudo
+        **Bug-compatibility flag**, off by default. The legacy yeast-GEM
+        MATLAB ``addSBOterms.m`` contains a typo (``for i = numel(...)``
+        rather than ``for i = 1:numel(...)``) that causes the pseudo-
+        reaction SBO assignments to run only on the very last reaction
+        in the model. When ``True``, this flag replicates that bug so
+        callers can migrate without altering the committed model. Leave
+        ``False`` for new uses.
+    """
+    biomass_met_names = frozenset(biomass_met_names)
+    biomass_met_suffixes = tuple(biomass_met_suffixes)
+    pseudoreaction_name_substrings = tuple(pseudoreaction_name_substrings)
+    if transport_detector is None:
+        transport_detector = _default_transport_detector
+
+    _assign_metabolite_sbo(model, biomass_met_names, biomass_met_suffixes)
+    _assign_reaction_sbo(
+        model,
+        biomass_rxn_name=biomass_rxn_name,
+        ngam_rxn_name=ngam_rxn_name,
+        pseudo_substrings=pseudoreaction_name_substrings,
+        transport_detector=transport_detector,
+        only_last_reaction_for_pseudo=only_last_reaction_for_pseudo,
+    )
+    return model
+
+
+# --- internals ---------------------------------------------------------
+
+def _assign_metabolite_sbo(
+    model: cobra.Model,
+    biomass_met_names: frozenset[str],
+    biomass_met_suffixes: tuple[str, ...],
+) -> None:
+    for met in model.metabolites:
+        if met.name in biomass_met_names or met.name.endswith(biomass_met_suffixes):
+            sbo = "SBO:0000649"
+        else:
+            sbo = "SBO:0000247"
+        _fill_sbo(met, sbo)
+
+
+def _assign_reaction_sbo(
+    model: cobra.Model,
+    *,
+    biomass_rxn_name: str,
+    ngam_rxn_name: str,
+    pseudo_substrings: tuple[str, ...],
+    transport_detector: Callable[[cobra.Model], set[str]],
+    only_last_reaction_for_pseudo: bool,
+) -> None:
+    """Compute the SBO for each reaction via successive overrides, then
+    fill into the model in a single pass — mirrors the MATLAB
+    ``rxnSBO``-array + ``editMiriam(..., 'fill')`` two-step.
+    """
+    transport_set = transport_detector(model)
+    rxns = list(model.reactions)
+
+    sbo_for_rxn: dict[str, str] = {}
+    for rxn in rxns:
+        sbo = "SBO:0000176"
+        if len(rxn.metabolites) == 1:
+            (met,) = rxn.metabolites
+            coef = rxn.metabolites[met]
+            if met.compartment == "e" or model.compartments.get(
+                met.compartment
+            ) == "extracellular":
+                sbo = "SBO:0000627"
+            elif coef < 0:
+                sbo = "SBO:0000632"
+            else:
+                sbo = "SBO:0000628"
+        sbo_for_rxn[rxn.id] = sbo
+
+    # Transport override
+    for rxn_id in transport_set:
+        sbo_for_rxn[rxn_id] = "SBO:0000655"
+
+    # Pseudoreaction override. The legacy bug-compat flag scopes this
+    # loop to the last reaction only; the default walks the whole list.
+    pseudo_targets = [rxns[-1]] if only_last_reaction_for_pseudo else rxns
+    for rxn in pseudo_targets:
+        if rxn.name == biomass_rxn_name:
+            sbo_for_rxn[rxn.id] = "SBO:0000629"
+        elif rxn.name == ngam_rxn_name:
+            sbo_for_rxn[rxn.id] = "SBO:0000630"
+        elif any(s in rxn.name for s in pseudo_substrings):
+            sbo_for_rxn[rxn.id] = "SBO:0000395"
+
+    for rxn in rxns:
+        _fill_sbo(rxn, sbo_for_rxn[rxn.id])
+
+
+def _fill_sbo(entity, sbo: str) -> None:
+    """Set ``annotation['sbo']`` only if it is missing or empty."""
+    if not entity.annotation.get("sbo"):
+        entity.annotation["sbo"] = sbo
+
+
+def _default_transport_detector(model: cobra.Model) -> set[str]:
+    """Same-met-name-in-two-compartments heuristic for transport detection.
+
+    Cheap analogue of RAVEN's ``getTransportRxns``. Pass a custom
+    callable to :func:`add_sbo_terms` to override.
+    """
+    out: set[str] = set()
+    for rxn in model.reactions:
+        by_name: dict[str, set[str | None]] = {}
+        for met in rxn.metabolites:
+            by_name.setdefault(met.name, set()).add(met.compartment)
+        if any(len(comps) >= 2 for comps in by_name.values()):
+            out.add(rxn.id)
+    return out
diff --git a/src/raven_python/comparison/__init__.py b/src/raven_python/comparison/__init__.py
index e4b4c19..968b7f0 100644
--- a/src/raven_python/comparison/__init__.py
+++ b/src/raven_python/comparison/__init__.py
@@ -1,7 +1,23 @@
-"""Structural and functional comparison across multiple models.
+"""Model comparison utilities.
 
-See :func:`raven_python.comparison.compare.compare_models`.
+Two flavours:
+
+* :func:`compare_models` — N-model presence-matrix overview (RAVEN's
+  ``compareMultipleModels`` analogue). "How do these models relate?"
+* :func:`diff_models` — strict two-model semantic-equality diff for CI
+  gates. "Are these two models the same?"
 """
 from raven_python.comparison.compare import ModelComparison, compare_models
+from raven_python.comparison.diff import (
+    DEFAULT_ANNOTATION_KEYS,
+    DiffReport,
+    diff_models,
+)
 
-__all__ = ["ModelComparison", "compare_models"]
+__all__ = [
+    "DEFAULT_ANNOTATION_KEYS",
+    "DiffReport",
+    "ModelComparison",
+    "compare_models",
+    "diff_models",
+]
diff --git a/src/raven_python/comparison/diff.py b/src/raven_python/comparison/diff.py
new file mode 100644
index 0000000..2a399eb
--- /dev/null
+++ b/src/raven_python/comparison/diff.py
@@ -0,0 +1,285 @@
+"""Strict two-model semantic-equality diff.
+
+Complements :func:`raven_python.comparison.compare.compare_models`, which
+takes ≥2 models and produces a presence-matrix overview. Where
+``compare_models`` answers *"how do these models relate?"*,
+:func:`diff_models` answers *"are these two models the same?"* — used as a
+CI gate to verify that two toolchains (e.g. MATLAB RAVEN vs raven_python,
+or pre/post refactor of one toolchain) produce equivalent models.
+
+Diff scope: reaction / metabolite / gene id sets, stoichiometry (within
+tolerance), bounds, objective coefficients, GPR rules, metabolite
+formula/charge/compartment, and a configurable set of annotation keys.
+Formatting differences (key ordering, whitespace, float repr) are
+explicitly **not** failures.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+
+import cobra
+
+# Annotation keys checked by default. Add via ``extra_annotations`` or
+# remove via ``ignore_annotations`` in :func:`diff_models`.
+DEFAULT_ANNOTATION_KEYS: tuple[str, ...] = (
+    "sbo",
+    "ec-code",
+    "kegg.reaction",
+    "kegg.compound",
+    "metanetx.reaction",
+    "metanetx.chemical",
+    "chebi",
+    "bigg.reaction",
+    "bigg.metabolite",
+)
+
+
+@dataclass
+class DiffReport:
+    """Result of :func:`diff_models`.
+
+    Truthy if the models are semantically equal. Iterate ``differences``
+    for human-readable messages.
+    """
+
+    equal: bool
+    differences: list[str] = field(default_factory=list)
+
+    def __bool__(self) -> bool:
+        return self.equal
+
+    def __str__(self) -> str:
+        if self.equal:
+            return "Models are semantically equal."
+        head = f"Models differ ({len(self.differences)} differences):"
+        body = "\n".join(f"  - {d}" for d in self.differences)
+        return f"{head}\n{body}"
+
+
+def diff_models(
+    a: cobra.Model,
+    b: cobra.Model,
+    *,
+    stoichiometry_tol: float = 1e-9,
+    ignore_annotations: Iterable[str] = (),
+    extra_annotations: Iterable[str] = (),
+    max_per_category: int = 50,
+) -> DiffReport:
+    """Compare two cobra models for semantic equality.
+
+    Checks:
+
+    * reaction, metabolite and gene id sets (exact)
+    * stoichiometry per shared reaction (within ``stoichiometry_tol``)
+    * lower/upper bounds and objective coefficients (exact)
+    * GPR rules (whitespace- and case-insensitive string comparison)
+    * metabolite formula, charge, compartment (exact)
+    * a default set of annotation keys (see ``DEFAULT_ANNOTATION_KEYS``)
+      plus any ``extra_annotations``, minus any ``ignore_annotations``
+
+    ``max_per_category`` caps the per-category diff list so a wholesale
+    mismatch produces a digestible report rather than 10,000 lines.
+    """
+    diffs: list[str] = []
+    keys = (set(DEFAULT_ANNOTATION_KEYS) | set(extra_annotations)) - set(ignore_annotations)
+
+    _diff_id_sets(a, b, diffs)
+
+    common_rxns = {r.id for r in a.reactions} & {r.id for r in b.reactions}
+    _diff_reactions(a, b, sorted(common_rxns), stoichiometry_tol, keys, diffs, max_per_category)
+
+    common_mets = {m.id for m in a.metabolites} & {m.id for m in b.metabolites}
+    _diff_metabolites(a, b, sorted(common_mets), keys, diffs, max_per_category)
+
+    return DiffReport(equal=not diffs, differences=diffs)
+
+
+# --- internals ---------------------------------------------------------
+
+def _diff_id_sets(a: cobra.Model, b: cobra.Model, diffs: list[str]) -> None:
+    for label, get in (
+        ("reactions", lambda m: m.reactions),
+        ("metabolites", lambda m: m.metabolites),
+        ("genes", lambda m: m.genes),
+    ):
+        a_ids = {x.id for x in get(a)}
+        b_ids = {x.id for x in get(b)}
+        only_a = sorted(a_ids - b_ids)
+        only_b = sorted(b_ids - a_ids)
+        if only_a:
+            diffs.append(f"{label} only in A ({len(only_a)}): {only_a[:10]}{_more(only_a, 10)}")
+        if only_b:
+            diffs.append(f"{label} only in B ({len(only_b)}): {only_b[:10]}{_more(only_b, 10)}")
+
+
+def _diff_reactions(
+    a: cobra.Model,
+    b: cobra.Model,
+    common: list[str],
+    tol: float,
+    anno_keys: set[str],
+    diffs: list[str],
+    cap: int,
+) -> None:
+    counters = {"stoich": 0, "bounds": 0, "objective": 0, "gpr": 0, "anno": 0}
+    for rxn_id in common:
+        ra = a.reactions.get_by_id(rxn_id)
+        rb = b.reactions.get_by_id(rxn_id)
+
+        sa = {m.id: c for m, c in ra.metabolites.items()}
+        sb = {m.id: c for m, c in rb.metabolites.items()}
+        if set(sa) != set(sb):
+            _push(diffs, counters, "stoich", cap, f"{rxn_id}: stoichiometry has different mets")
+        else:
+            for mid in sa:
+                if abs(sa[mid] - sb[mid]) > tol:
+                    _push(
+                        diffs, counters, "stoich", cap,
+                        f"{rxn_id}: coef[{mid}] A={sa[mid]:.6g} B={sb[mid]:.6g}",
+                    )
+
+        if ra.lower_bound != rb.lower_bound or ra.upper_bound != rb.upper_bound:
+            _push(
+                diffs, counters, "bounds", cap,
+                f"{rxn_id}: bounds A=({ra.lower_bound}, {ra.upper_bound}) "
+                f"B=({rb.lower_bound}, {rb.upper_bound})",
+            )
+
+        if ra.objective_coefficient != rb.objective_coefficient:
+            _push(
+                diffs, counters, "objective", cap,
+                f"{rxn_id}: objective A={ra.objective_coefficient} B={rb.objective_coefficient}",
+            )
+
+        ga = _normalise_gpr(ra.gene_reaction_rule)
+        gb = _normalise_gpr(rb.gene_reaction_rule)
+        if ga != gb:
+            _push(diffs, counters, "gpr", cap, f"{rxn_id}: GPR A={ga!r} B={gb!r}")
+
+        _diff_annotations(
+            f"rxn {rxn_id}", ra.annotation, rb.annotation,
+            anno_keys, diffs, counters, cap,
+        )
+
+
+def _diff_metabolites(
+    a: cobra.Model,
+    b: cobra.Model,
+    common: list[str],
+    anno_keys: set[str],
+    diffs: list[str],
+    cap: int,
+) -> None:
+    counters = {"formula": 0, "charge": 0, "compartment": 0, "anno": 0}
+    for met_id in common:
+        ma = a.metabolites.get_by_id(met_id)
+        mb = b.metabolites.get_by_id(met_id)
+
+        if (ma.formula or None) != (mb.formula or None):
+            _push(
+                diffs, counters, "formula", cap,
+                f"met {met_id}: formula A={ma.formula} B={mb.formula}",
+            )
+
+        ca = ma.charge if ma.charge is not None else 0
+        cb = mb.charge if mb.charge is not None else 0
+        if ca != cb:
+            _push(
+                diffs, counters, "charge", cap,
+                f"met {met_id}: charge A={ma.charge} B={mb.charge}",
+            )
+
+        if ma.compartment != mb.compartment:
+            _push(
+                diffs, counters, "compartment", cap,
+                f"met {met_id}: compartment A={ma.compartment} B={mb.compartment}",
+            )
+
+        _diff_annotations(
+            f"met {met_id}", ma.annotation, mb.annotation,
+            anno_keys, diffs, counters, cap,
+        )
+
+
+def _diff_annotations(
+    label: str,
+    aa: dict,
+    ba: dict,
+    keys: set[str],
+    diffs: list[str],
+    counters: dict,
+    cap: int,
+) -> None:
+    for k in keys:
+        va = _normalise_annotation_value(aa.get(k))
+        vb = _normalise_annotation_value(ba.get(k))
+        if va != vb:
+            _push(diffs, counters, "anno", cap, f"{label}.annotation[{k!r}]: A={va} B={vb}")
+
+
+def _normalise_gpr(rule: str) -> str:
+    """Lowercase + collapse whitespace.
+
+    A more robust comparator would parse to a GPR AST and compare
+    structures; this is the cheap heuristic that catches the formatting
+    drift we see between different SBML writers.
+    """
+    if not rule:
+        return ""
+    return " ".join(rule.lower().split())
+
+
+def _normalise_annotation_value(v):
+    if v is None:
+        return None
+    if isinstance(v, list):
+        return tuple(sorted(str(x) for x in v))
+    if isinstance(v, tuple):
+        return tuple(sorted(str(x) for x in v))
+    return str(v)
+
+
+def _push(diffs: list[str], counters: dict, key: str, cap: int, msg: str) -> None:
+    counters[key] = counters.get(key, 0) + 1
+    if counters[key] <= cap:
+        diffs.append(msg)
+    elif counters[key] == cap + 1:
+        diffs.append(f"... ({key} category truncated at {cap} entries)")
+
+
+def _more(seq: list, shown: int) -> str:
+    return "" if len(seq) <= shown else f" ... (+{len(seq) - shown} more)"
+
+
+# --- CLI ---------------------------------------------------------------
+
+def _main() -> int:
+    import argparse
+
+    from cobra.io import read_sbml_model
+
+    parser = argparse.ArgumentParser(
+        description="Diff two SBML models for semantic equality."
+    )
+    parser.add_argument("a", help="path to first model (SBML)")
+    parser.add_argument("b", help="path to second model (SBML)")
+    parser.add_argument(
+        "--stoichiometry-tol",
+        type=float,
+        default=1e-9,
+        help="absolute tolerance on stoichiometric coefficients (default 1e-9)",
+    )
+    args = parser.parse_args()
+
+    report = diff_models(
+        read_sbml_model(args.a),
+        read_sbml_model(args.b),
+        stoichiometry_tol=args.stoichiometry_tol,
+    )
+    print(report)
+    return 0 if report.equal else 1
+
+
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(_main())
diff --git a/src/raven_python/conditions/__init__.py b/src/raven_python/conditions/__init__.py
new file mode 100644
index 0000000..8465081
--- /dev/null
+++ b/src/raven_python/conditions/__init__.py
@@ -0,0 +1,25 @@
+"""Apply named "condition" presets — bound diffs, biomass edits, etc.
+
+A *condition* is a YAML file (or pre-parsed dict) describing a set of
+deterministic modifications to apply to a cobra model: a prelude that
+resets exchange bounds, edits to a cofactor pseudoreaction
+(metabolite removals + automatic charge rebalancing), a per-reaction
+bounds diff, and an optional biomass-stoichiometry delta.
+
+Yeast-GEM was the first consumer; the schema is documented in
+:func:`apply_condition` and is meant to be reusable for any GEM that
+wants to keep its condition presets as data rather than code.
+"""
+from raven_python.conditions.apply import (
+    DEFAULT_RESET_EXCHANGES_UPPER_BOUND,
+    apply_condition,
+    load_condition,
+    set_reaction_bounds,
+)
+
+__all__ = [
+    "DEFAULT_RESET_EXCHANGES_UPPER_BOUND",
+    "apply_condition",
+    "load_condition",
+    "set_reaction_bounds",
+]
diff --git a/src/raven_python/conditions/apply.py b/src/raven_python/conditions/apply.py
new file mode 100644
index 0000000..2dc1a2b
--- /dev/null
+++ b/src/raven_python/conditions/apply.py
@@ -0,0 +1,186 @@
+"""Generic "apply this YAML condition" loader.
+
+Schema (all keys optional):
+
+.. code-block:: yaml
+
+    name: my_condition
+    description: free-form text.
+
+    # Set every exchange reaction to (0, ub) before any per-rxn override.
+    prelude:
+      reset_exchanges: out                # str | bool truthy
+
+    # Remove metabolites from a pseudoreaction and rebalance H+
+    # (or any other "balance" met) so total charge stays zero.
+    cofactor_pseudoreaction:
+      rxn_id: r_4598
+      remove_mets:
+        - { met: s_3714 }                 # comment field is informational
+      charge_balance_met: s_0794          # optional
+
+    # Add (combine=True) to a reaction's stoichiometry.
+    biomass_stoichiometry_delta:
+      rxn_id: r_4041
+      add:
+        - { met: s_0689, coef:  0.08 }
+        - { met: s_0687, coef: -0.08 }
+        - { met: s_0794, coef: -0.16 }
+
+    # Per-reaction bounds. lb / ub omitted means "leave unchanged".
+    bounds:
+      - { rxn: r_1654, lb: -1000 }
+      - { rxn: r_1992, lb: 0 }
+      - { rxn: r_1663, lb: 0, ub: 0 }
+
+    # Sanity-check counter. Emits a warning if the actual count of
+    # ``lb == -1000`` bound entries doesn't match.
+    expected_uptake_count: 15
+
+The schema is intentionally narrow: deterministic, idempotent, easy to
+diff in code review. Project-specific extensions (e.g. yeast-GEM's
+``amino_acid_ratio``) are handled by the *caller* before / after this
+function — keep the upstream generic.
+"""
+from __future__ import annotations
+
+import warnings
+from pathlib import Path
+from typing import Any
+
+import cobra
+import yaml
+
+#: ``prelude.reset_exchanges`` puts every exchange reaction at this
+#: upper bound (and lower bound = 0).
+DEFAULT_RESET_EXCHANGES_UPPER_BOUND: float = 1000.0
+
+
+def load_condition(path: str | Path) -> dict[str, Any]:
+    """Parse a condition YAML file into a plain dict."""
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"Condition file not found: {path}")
+    with open(path) as f:
+        return yaml.safe_load(f)
+
+
+def apply_condition(
+    model: cobra.Model,
+    condition: dict[str, Any] | str | Path,
+) -> cobra.Model:
+    """Apply a parsed condition (or a YAML file path) to ``model`` in place.
+
+    Returns the same model object for chaining.
+    """
+    cfg = condition if isinstance(condition, dict) else load_condition(condition)
+
+    _apply_prelude(model, cfg)
+    _apply_cofactor_pseudoreaction(model, cfg)
+    _apply_biomass_stoichiometry_delta(model, cfg)
+    n_uptake = _apply_bounds(model, cfg)
+    _check_uptake_count(cfg, n_uptake)
+
+    return model
+
+
+def set_reaction_bounds(rxn: cobra.Reaction, lb: float, ub: float) -> None:
+    """Set both bounds, bypassing cobra's ``lb <= ub`` validator.
+
+    Some conditions intentionally land on an infeasible ``lb > ub`` state
+    (e.g. forcing flux via a sentinel bound). cobra's setter rejects
+    that. This helper writes the underlying private attributes when
+    needed so the model can match the caller's exact intent.
+    """
+    if lb > ub:
+        rxn._lower_bound = lb  # noqa: SLF001 — intentional bypass
+        rxn._upper_bound = ub  # noqa: SLF001 — intentional bypass
+    else:
+        rxn.bounds = (lb, ub)
+
+
+# --- step implementations ---------------------------------------------
+
+def _apply_prelude(model: cobra.Model, cfg: dict[str, Any]) -> None:
+    prelude = cfg.get("prelude") or {}
+    if prelude.get("reset_exchanges"):
+        # cobrapy doesn't distinguish RAVEN's in / out directions; we
+        # reset every exchange reaction.
+        for rxn in model.exchanges:
+            rxn.lower_bound = 0
+            rxn.upper_bound = DEFAULT_RESET_EXCHANGES_UPPER_BOUND
+
+
+def _apply_cofactor_pseudoreaction(model: cobra.Model, cfg: dict[str, Any]) -> None:
+    cp = cfg.get("cofactor_pseudoreaction")
+    if not cp:
+        return
+    rxn = model.reactions.get_by_id(cp["rxn_id"])
+    for entry in cp.get("remove_mets", []):
+        met = model.metabolites.get_by_id(entry["met"])
+        _set_coefficient(rxn, met, 0.0)
+    balance_met_id = cp.get("charge_balance_met")
+    if balance_met_id:
+        balance_met = model.metabolites.get_by_id(balance_met_id)
+        _set_coefficient(rxn, balance_met, 0.0)
+        total_charge = sum(
+            (m.charge or 0) * coef
+            for m, coef in rxn.metabolites.items()
+        )
+        _set_coefficient(rxn, balance_met, -total_charge)
+
+
+def _apply_biomass_stoichiometry_delta(model: cobra.Model, cfg: dict[str, Any]) -> None:
+    delta = cfg.get("biomass_stoichiometry_delta")
+    if not delta:
+        return
+    rxn = model.reactions.get_by_id(delta["rxn_id"])
+    for entry in delta.get("add", []):
+        met = model.metabolites.get_by_id(entry["met"])
+        rxn.add_metabolites({met: float(entry["coef"])}, combine=True)
+
+
+def _apply_bounds(model: cobra.Model, cfg: dict[str, Any]) -> int:
+    n_uptake = 0
+    for entry in cfg.get("bounds", []):
+        try:
+            rxn = model.reactions.get_by_id(entry["rxn"])
+        except KeyError:
+            warnings.warn(
+                f"Reaction {entry['rxn']!r} not found in model; skipping.",
+                stacklevel=3,
+            )
+            continue
+        new_lb = float(entry["lb"]) if "lb" in entry else rxn.lower_bound
+        new_ub = float(entry["ub"]) if "ub" in entry else rxn.upper_bound
+        set_reaction_bounds(rxn, new_lb, new_ub)
+        if entry.get("lb") == -1000:
+            n_uptake += 1
+    return n_uptake
+
+
+def _check_uptake_count(cfg: dict[str, Any], n_uptake: int) -> None:
+    expected = cfg.get("expected_uptake_count")
+    if expected is None:
+        return
+    if n_uptake != expected:
+        warnings.warn(
+            f"Expected {expected} uptake reactions, applied {n_uptake}. "
+            "Some referenced reactions may be missing from the model.",
+            stacklevel=3,
+        )
+
+
+# --- helpers ----------------------------------------------------------
+
+def _set_coefficient(rxn: cobra.Reaction, met: cobra.Metabolite, value: float) -> None:
+    """Set the stoichiometric coefficient of ``met`` in ``rxn`` to ``value``.
+
+    cobra's only public mutation point is ``add_metabolites``; we go via
+    ``combine=True`` with ``(value - current)`` to land on the desired
+    value without depending on cobra's removal-on-zero behaviour.
+    """
+    current = rxn.metabolites.get(met, 0.0)
+    delta = float(value) - current
+    if delta != 0:
+        rxn.add_metabolites({met: delta}, combine=True)
diff --git a/tests/test_annotation.py b/tests/test_annotation.py
new file mode 100644
index 0000000..fbab70b
--- /dev/null
+++ b/tests/test_annotation.py
@@ -0,0 +1,204 @@
+"""Tests for raven_python.annotation (SBO terms + ΔG CSV persistence)."""
+from __future__ import annotations
+
+import math
+
+import cobra
+import pandas as pd
+
+from raven_python.annotation import (
+    DEFAULT_BIOMASS_MET_NAMES,
+    add_sbo_terms,
+    load_delta_g_csv,
+    save_delta_g_csv,
+)
+from raven_python.annotation.sbo import _default_transport_detector
+
+# --- shared tiny model -------------------------------------------------
+
+def _toy_model() -> cobra.Model:
+    """Toy yeast-flavoured model with: extracellular exchange, transport,
+    metabolic, biomass, NGAM, generic pseudo, lipid-backbone biomass met."""
+    m = cobra.Model("toy")
+    m.compartments = {"c": "cytoplasm", "e": "extracellular", "x": "peroxisome"}
+
+    mets = {
+        "atp_c":      cobra.Metabolite("atp_c",      name="ATP",      compartment="c", charge=-4, formula="C10H12N5O13P3"),
+        "atp_x":      cobra.Metabolite("atp_x",      name="ATP",      compartment="x", charge=-4, formula="C10H12N5O13P3"),
+        "glc_e":      cobra.Metabolite("glc_e",      name="glucose",  compartment="e", charge=0,  formula="C6H12O6"),
+        "biomass":    cobra.Metabolite("biomass_c",  name="biomass",  compartment="c"),
+        "lipid_bb":   cobra.Metabolite("lbb_c",      name="lipid backbone", compartment="c"),
+    }
+    m.add_metabolites(list(mets.values()))
+
+    # Exchange (single met in extracellular)
+    ex = cobra.Reaction("EX_glc", lower_bound=-10, upper_bound=1000)
+    ex.add_metabolites({mets["glc_e"]: -1})
+    # Transport (ATP_c ↔ ATP_x — same met name, two compartments)
+    tr = cobra.Reaction("ATP_tx", lower_bound=-1000, upper_bound=1000)
+    tr.add_metabolites({mets["atp_c"]: -1, mets["atp_x"]: 1})
+    # Normal metabolic reaction
+    met = cobra.Reaction("metab1", lower_bound=0, upper_bound=1000)
+    met.add_metabolites({mets["glc_e"]: -1, mets["atp_c"]: 1})
+    # Biomass pseudoreaction (must be last for the bug-compat test)
+    bio = cobra.Reaction("biomass_rxn", lower_bound=0, upper_bound=1000)
+    bio.name = "biomass pseudoreaction"
+    bio.add_metabolites({mets["atp_c"]: -1, mets["biomass"]: 1})
+    m.add_reactions([ex, tr, met, bio])
+    return m
+
+
+# --- add_sbo_terms -----------------------------------------------------
+
+def test_default_biomass_names_set_is_reasonable():
+    assert {"biomass", "DNA", "RNA", "protein", "lipid"} <= DEFAULT_BIOMASS_MET_NAMES
+
+
+def test_simple_chemical_gets_simple_sbo():
+    m = _toy_model()
+    add_sbo_terms(m)
+    assert m.metabolites.get_by_id("glc_e").annotation["sbo"] == "SBO:0000247"
+
+
+def test_biomass_pseudo_metabolite_gets_biomass_sbo():
+    m = _toy_model()
+    add_sbo_terms(m)
+    assert m.metabolites.get_by_id("biomass_c").annotation["sbo"] == "SBO:0000649"
+
+
+def test_lipid_backbone_suffix_match():
+    m = _toy_model()
+    add_sbo_terms(m)
+    assert m.metabolites.get_by_id("lbb_c").annotation["sbo"] == "SBO:0000649"
+
+
+def test_exchange_gets_exchange_sbo():
+    m = _toy_model()
+    add_sbo_terms(m)
+    assert m.reactions.get_by_id("EX_glc").annotation["sbo"] == "SBO:0000627"
+
+
+def test_transport_gets_transport_sbo():
+    m = _toy_model()
+    add_sbo_terms(m)
+    assert m.reactions.get_by_id("ATP_tx").annotation["sbo"] == "SBO:0000655"
+
+
+def test_biomass_pseudoreaction_gets_pseudo_sbo_by_default():
+    m = _toy_model()
+    add_sbo_terms(m)
+    assert m.reactions.get_by_id("biomass_rxn").annotation["sbo"] == "SBO:0000629"
+
+
+def test_legacy_bug_flag_scopes_pseudo_to_last_reaction():
+    """The yeast-GEM compat flag must restrict the pseudo override to
+    the last reaction; if biomass_rxn is last, it still wins."""
+    m = _toy_model()
+    add_sbo_terms(m, only_last_reaction_for_pseudo=True)
+    # biomass_rxn was added last → still gets the pseudo SBO.
+    assert m.reactions.get_by_id("biomass_rxn").annotation["sbo"] == "SBO:0000629"
+
+
+def test_legacy_bug_flag_skips_pseudo_when_not_last():
+    """Reorder: a non-pseudo reaction last; the bug flag should leave
+    the pseudoreaction with whatever non-pseudo SBO it already had."""
+    m = _toy_model()
+    # Swap last two reactions so the pseudo is NOT last.
+    rxns = list(m.reactions)
+    new_order = rxns[:-2] + rxns[-1:] + rxns[-2:-1]
+    m.reactions._generate_index()  # noqa: SLF001 (rebuild before reorder)
+    # Easier: copy bounds, set order via remove+re-add
+    m.remove_reactions([rxns[-2], rxns[-1]])
+    m.add_reactions(new_order[-2:])
+    add_sbo_terms(m, only_last_reaction_for_pseudo=True)
+    bio = m.reactions.get_by_id("biomass_rxn")
+    # The biomass reaction has a single bounded reactant set (atp_c -1,
+    # biomass +1) → 2 mets → falls to default SBO:0000176.
+    assert bio.annotation["sbo"] == "SBO:0000176"
+
+
+def test_fill_semantic_preserves_existing():
+    m = _toy_model()
+    rxn = m.reactions.get_by_id("metab1")
+    rxn.annotation["sbo"] = "SBO:0009999"
+    add_sbo_terms(m)
+    assert rxn.annotation["sbo"] == "SBO:0009999"
+
+
+def test_custom_transport_detector_overrides_default():
+    m = _toy_model()
+    add_sbo_terms(m, transport_detector=lambda _m: set())
+    # ATP_tx is no longer flagged transport → falls through to default
+    # (it has 2 metabolites so it is not exchange/sink/demand).
+    assert m.reactions.get_by_id("ATP_tx").annotation["sbo"] == "SBO:0000176"
+
+
+def test_default_transport_detector_finds_transport():
+    m = _toy_model()
+    transports = _default_transport_detector(m)
+    assert "ATP_tx" in transports
+    assert "EX_glc" not in transports
+
+
+# --- ΔG CSV round-trip ------------------------------------------------
+
+def test_save_then_load_round_trip(tmp_path):
+    m = _toy_model()
+    m.metabolites.get_by_id("atp_c").notes["deltaG"] = "-12.34"
+    m.metabolites.get_by_id("glc_e").notes["deltaG"] = "0.5"
+
+    csv = tmp_path / "met_dg.csv"
+    written = save_delta_g_csv(m.metabolites, csv)
+    assert written == len(m.metabolites)
+    df = pd.read_csv(csv)
+    assert list(df.columns) == ["Var1", "Var2"]
+
+    # Reload on a fresh model
+    fresh = _toy_model()
+    stamped = load_delta_g_csv(fresh.metabolites, csv)
+    assert stamped == 2
+    assert fresh.metabolites.get_by_id("atp_c").notes["deltaG"] == "-12.34"
+    assert fresh.metabolites.get_by_id("glc_e").notes["deltaG"] == "0.5"
+
+
+def test_save_writes_nan_for_missing_notes(tmp_path):
+    m = _toy_model()  # no notes set
+    csv = tmp_path / "met_dg.csv"
+    save_delta_g_csv(m.metabolites, csv)
+    df = pd.read_csv(csv)
+    assert len(df) == len(m.metabolites)
+    assert df["Var2"].apply(lambda v: math.isnan(v)).all()
+
+
+def test_load_skips_nan_rows(tmp_path):
+    """A NaN in the CSV must NOT clobber the entity's existing note."""
+    m = _toy_model()
+    m.metabolites.get_by_id("atp_c").notes["deltaG"] = "preserved"
+
+    csv = tmp_path / "met_dg.csv"
+    pd.DataFrame(
+        {"Var1": ["atp_c", "glc_e"], "Var2": [math.nan, 1.0]}
+    ).to_csv(csv, index=False)
+
+    load_delta_g_csv(m.metabolites, csv)
+    assert m.metabolites.get_by_id("atp_c").notes["deltaG"] == "preserved"
+    assert m.metabolites.get_by_id("glc_e").notes["deltaG"] == "1.0"
+
+
+def test_custom_columns_and_note_key(tmp_path):
+    m = _toy_model()
+    m.metabolites.get_by_id("atp_c").notes["dG_kJ"] = "-30.5"
+    csv = tmp_path / "out.csv"
+    save_delta_g_csv(
+        m.metabolites, csv,
+        id_column="id", value_column="dG", note_key="dG_kJ",
+    )
+    df = pd.read_csv(csv)
+    assert list(df.columns) == ["id", "dG"]
+
+    fresh = _toy_model()
+    load_delta_g_csv(
+        fresh.metabolites, csv,
+        id_column="id", value_column="dG", note_key="dG_kJ",
+    )
+    assert fresh.metabolites.get_by_id("atp_c").notes["dG_kJ"] == "-30.5"
diff --git a/tests/test_comparison_diff.py b/tests/test_comparison_diff.py
new file mode 100644
index 0000000..c8556c1
--- /dev/null
+++ b/tests/test_comparison_diff.py
@@ -0,0 +1,146 @@
+"""Tests for raven_python.comparison.diff — strict 2-model equality."""
+from __future__ import annotations
+
+import cobra
+import pytest
+
+from raven_python.comparison import DiffReport, diff_models
+from raven_python.comparison.diff import _normalise_gpr
+
+
+def _mini_model(model_id: str = "m") -> cobra.Model:
+    """Tiny but realistic model: one reaction, one extracellular met."""
+    m = cobra.Model(model_id)
+    a = cobra.Metabolite("a_c", name="A", compartment="c", charge=0, formula="C1")
+    b = cobra.Metabolite("b_e", name="B", compartment="e", charge=0, formula="C2")
+    m.add_metabolites([a, b])
+    r = cobra.Reaction("r1", lower_bound=-1000, upper_bound=1000)
+    r.add_metabolites({a: -1, b: 1})
+    r.gene_reaction_rule = "g1 AND g2"
+    r.annotation["sbo"] = "SBO:0000176"
+    m.add_reactions([r])
+    return m
+
+
+def test_model_equal_to_itself():
+    m = _mini_model()
+    report = diff_models(m, m)
+    assert isinstance(report, DiffReport)
+    assert report.equal
+
+
+def test_independent_copies_are_equal():
+    a = _mini_model()
+    b = _mini_model()
+    assert diff_models(a, b).equal
+
+
+def test_dropped_reaction_is_detected():
+    a = _mini_model()
+    b = _mini_model()
+    b.remove_reactions([b.reactions[0]])
+    report = diff_models(a, b)
+    assert not report.equal
+    assert any("reactions only in A" in d for d in report.differences)
+
+
+def test_bound_diff_is_detected():
+    a = _mini_model()
+    b = _mini_model()
+    b.reactions[0].lower_bound = -42
+    report = diff_models(a, b)
+    assert not report.equal
+    assert any("bounds" in d for d in report.differences)
+
+
+def test_stoich_within_tolerance_is_equal():
+    a = _mini_model()
+    b = _mini_model()
+    met = next(iter(b.reactions[0].metabolites))
+    b.reactions[0].add_metabolites({met: 1e-12}, combine=True)
+    assert diff_models(a, b, stoichiometry_tol=1e-9).equal
+
+
+def test_stoich_outside_tolerance_is_not_equal():
+    a = _mini_model()
+    b = _mini_model()
+    met = next(iter(b.reactions[0].metabolites))
+    b.reactions[0].add_metabolites({met: 5.0}, combine=False)
+    report = diff_models(a, b, stoichiometry_tol=1e-9)
+    assert not report.equal
+    assert any("coef[" in d for d in report.differences)
+
+
+def test_annotation_diff_detected():
+    a = _mini_model()
+    b = _mini_model()
+    b.reactions[0].annotation["sbo"] = "SBO:0009999"
+    report = diff_models(a, b)
+    assert not report.equal
+    assert any("annotation['sbo']" in d for d in report.differences)
+
+
+def test_ignore_annotations_suppresses_diff():
+    a = _mini_model()
+    b = _mini_model()
+    b.reactions[0].annotation["sbo"] = "SBO:0009999"
+    assert diff_models(a, b, ignore_annotations={"sbo"}).equal
+
+
+def test_extra_annotations_picked_up():
+    a = _mini_model()
+    b = _mini_model()
+    a.reactions[0].annotation["custom-key"] = "v1"
+    b.reactions[0].annotation["custom-key"] = "v2"
+    # Default key set ignores custom-key → equal.
+    assert diff_models(a, b).equal
+    # Pull it in → diff.
+    assert not diff_models(a, b, extra_annotations={"custom-key"}).equal
+
+
+@pytest.mark.parametrize(
+    "ga, gb",
+    [
+        ("A and B", "a AND b"),
+        ("A  and   B", "A and B"),
+        ("(A or B) and C", "(a OR b) AND c"),
+    ],
+)
+def test_gpr_normalisation(ga, gb):
+    assert _normalise_gpr(ga) == _normalise_gpr(gb)
+
+
+def test_max_per_category_truncates():
+    """The per-category cap kicks in on per-reaction (and per-met) diffs,
+    not on the id-set summary line. Build a scenario with many bound
+    diffs across shared reactions."""
+    a = cobra.Model("a")
+    b = cobra.Model("b")
+    mets_a = [cobra.Metabolite(f"m{i}_c", compartment="c") for i in range(60)]
+    mets_b = [cobra.Metabolite(f"m{i}_c", compartment="c") for i in range(60)]
+    a.add_metabolites(mets_a)
+    b.add_metabolites(mets_b)
+    for i in range(60):
+        ra = cobra.Reaction(f"r{i}", lower_bound=0, upper_bound=1000)
+        ra.add_metabolites({mets_a[i]: -1})
+        a.add_reactions([ra])
+        rb = cobra.Reaction(f"r{i}", lower_bound=-7, upper_bound=1000)  # differs
+        rb.add_metabolites({mets_b[i]: -1})
+        b.add_reactions([rb])
+    report = diff_models(a, b, max_per_category=5)
+    assert not report.equal
+    assert any("truncated at 5" in d for d in report.differences)
+
+
+def test_report_str_lists_differences():
+    a = _mini_model()
+    b = _mini_model()
+    b.remove_reactions([b.reactions[0]])
+    text = str(diff_models(a, b))
+    assert "Models differ" in text
+    assert "reactions only in A" in text
+
+
+def test_report_str_when_equal():
+    m = _mini_model()
+    assert str(diff_models(m, m)) == "Models are semantically equal."
diff --git a/tests/test_conditions.py b/tests/test_conditions.py
new file mode 100644
index 0000000..e5e381f
--- /dev/null
+++ b/tests/test_conditions.py
@@ -0,0 +1,199 @@
+"""Tests for raven_python.conditions.apply (generic condition mechanism)."""
+from __future__ import annotations
+
+import cobra
+import pytest
+import yaml
+
+from raven_python.conditions import (
+    apply_condition,
+    load_condition,
+    set_reaction_bounds,
+)
+
+
+def _toy_model() -> cobra.Model:
+    m = cobra.Model("toy")
+    m.compartments = {"c": "cytoplasm", "e": "extracellular"}
+
+    mets = {
+        "atp_c": cobra.Metabolite("atp_c", name="ATP", compartment="c", charge=-4),
+        "glc_e": cobra.Metabolite("glc_e", name="glucose", compartment="e", charge=0),
+        "h_c":   cobra.Metabolite("h_c",   name="H+",      compartment="c", charge=1),
+        "fad":   cobra.Metabolite("fad",   name="FAD",     compartment="c", charge=-2),
+        "fadh2": cobra.Metabolite("fadh2", name="FADH2",   compartment="c", charge=0),
+        "heme":  cobra.Metabolite("heme",  name="heme a",  compartment="c", charge=-2),
+    }
+    m.add_metabolites(list(mets.values()))
+
+    ex = cobra.Reaction("EX_glc", lower_bound=-10, upper_bound=1000)
+    ex.add_metabolites({mets["glc_e"]: -1})
+    cofactor = cobra.Reaction("cofac", lower_bound=0, upper_bound=1000)
+    cofactor.add_metabolites({mets["heme"]: -1, mets["fad"]: -1, mets["h_c"]: 3})
+    biomass = cobra.Reaction("bio", lower_bound=0, upper_bound=1000)
+    biomass.add_metabolites({mets["atp_c"]: -1})
+    blocked = cobra.Reaction("blocked", lower_bound=-1000, upper_bound=1000)
+    blocked.add_metabolites({mets["atp_c"]: -1, mets["glc_e"]: 1})
+    m.add_reactions([ex, cofactor, biomass, blocked])
+    return m
+
+
+# --- prelude ----------------------------------------------------------
+
+def test_prelude_reset_exchanges_zeroes_lb_and_caps_ub():
+    m = _toy_model()
+    apply_condition(m, {"prelude": {"reset_exchanges": "out"}})
+    ex = m.reactions.get_by_id("EX_glc")
+    assert ex.lower_bound == 0
+    assert ex.upper_bound == 1000
+
+
+# --- bounds -----------------------------------------------------------
+
+def test_bounds_apply_lb_only():
+    m = _toy_model()
+    apply_condition(m, {"bounds": [{"rxn": "EX_glc", "lb": -1000}]})
+    assert m.reactions.get_by_id("EX_glc").lower_bound == -1000
+    # ub preserved
+    assert m.reactions.get_by_id("EX_glc").upper_bound == 1000
+
+
+def test_bounds_apply_both():
+    m = _toy_model()
+    apply_condition(m, {"bounds": [{"rxn": "blocked", "lb": 0, "ub": 0}]})
+    assert m.reactions.get_by_id("blocked").bounds == (0, 0)
+
+
+def test_bounds_lb_gt_ub_bypasses_cobra_validator():
+    m = _toy_model()
+    apply_condition(m, {"bounds": [{"rxn": "blocked", "lb": 1000, "ub": 0}]})
+    rxn = m.reactions.get_by_id("blocked")
+    assert rxn.lower_bound == 1000
+    assert rxn.upper_bound == 0
+
+
+def test_bounds_missing_rxn_warns_and_continues():
+    m = _toy_model()
+    with pytest.warns(UserWarning, match="not_a_real_rxn"):
+        apply_condition(m, {"bounds": [{"rxn": "not_a_real_rxn", "lb": 0}]})
+
+
+# --- cofactor pseudoreaction -----------------------------------------
+
+def test_remove_met_zeroes_coefficient():
+    m = _toy_model()
+    apply_condition(
+        m,
+        {
+            "cofactor_pseudoreaction": {
+                "rxn_id": "cofac",
+                "remove_mets": [{"met": "heme"}],
+            }
+        },
+    )
+    rxn = m.reactions.get_by_id("cofac")
+    heme = m.metabolites.get_by_id("heme")
+    assert rxn.metabolites.get(heme, 0) == 0
+
+
+def test_charge_balance_recomputed_after_removal():
+    m = _toy_model()
+    apply_condition(
+        m,
+        {
+            "cofactor_pseudoreaction": {
+                "rxn_id": "cofac",
+                "remove_mets": [{"met": "heme"}],
+                "charge_balance_met": "h_c",
+            }
+        },
+    )
+    rxn = m.reactions.get_by_id("cofac")
+    # heme (-1·-2) and fad (-1·-2) contributed +4 charge.
+    # After heme removal: only fad (-1·-2) contributes +2.
+    # H+ (charge +1) coef should be -2 to zero the net.
+    h_c = m.metabolites.get_by_id("h_c")
+    assert rxn.metabolites[h_c] == -2
+
+
+# --- biomass stoichiometry delta -------------------------------------
+
+def test_biomass_stoichiometry_delta_combines_with_existing():
+    m = _toy_model()
+    bio = m.reactions.get_by_id("bio")
+    atp = m.metabolites.get_by_id("atp_c")
+    before = bio.metabolites[atp]
+    apply_condition(
+        m,
+        {
+            "biomass_stoichiometry_delta": {
+                "rxn_id": "bio",
+                "add": [{"met": "atp_c", "coef": 0.5}],
+            }
+        },
+    )
+    assert bio.metabolites[atp] == pytest.approx(before + 0.5)
+
+
+# --- expected_uptake_count -------------------------------------------
+
+def test_expected_uptake_count_mismatch_warns():
+    m = _toy_model()
+    cfg = {
+        "bounds": [{"rxn": "EX_glc", "lb": -1000}],
+        "expected_uptake_count": 5,
+    }
+    with pytest.warns(UserWarning, match="Expected 5 uptake reactions, applied 1"):
+        apply_condition(m, cfg)
+
+
+def test_expected_uptake_count_match_silent(recwarn):
+    m = _toy_model()
+    apply_condition(
+        m,
+        {
+            "bounds": [{"rxn": "EX_glc", "lb": -1000}],
+            "expected_uptake_count": 1,
+        },
+    )
+    assert not any("Expected" in str(w.message) for w in recwarn)
+
+
+# --- yaml path entrypoint --------------------------------------------
+
+def test_apply_condition_accepts_yaml_path(tmp_path):
+    cfg = {"bounds": [{"rxn": "EX_glc", "lb": -42}]}
+    path = tmp_path / "cond.yml"
+    path.write_text(yaml.safe_dump(cfg))
+    m = _toy_model()
+    apply_condition(m, path)
+    assert m.reactions.get_by_id("EX_glc").lower_bound == -42
+
+
+def test_load_condition_round_trip(tmp_path):
+    cfg = {"name": "x", "bounds": [{"rxn": "r1", "lb": 0}]}
+    path = tmp_path / "cond.yml"
+    path.write_text(yaml.safe_dump(cfg))
+    assert load_condition(path) == cfg
+
+
+def test_load_condition_missing_path_raises(tmp_path):
+    with pytest.raises(FileNotFoundError):
+        load_condition(tmp_path / "nope.yml")
+
+
+# --- set_reaction_bounds public helper -------------------------------
+
+def test_set_reaction_bounds_normal_case():
+    m = _toy_model()
+    rxn = m.reactions.get_by_id("EX_glc")
+    set_reaction_bounds(rxn, -5, 5)
+    assert rxn.bounds == (-5, 5)
+
+
+def test_set_reaction_bounds_infeasible_case():
+    m = _toy_model()
+    rxn = m.reactions.get_by_id("EX_glc")
+    set_reaction_bounds(rxn, 100, 0)
+    assert rxn.lower_bound == 100
+    assert rxn.upper_bound == 0

From 3bf0de2d16ecd3f1f2a1a77091fbba33bcf2ef96 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Sat, 30 May 2026 18:38:08 +0200
Subject: [PATCH 2/7] =?UTF-8?q?Add=20raven=5Fpython.biomass=20=E2=80=94=20?=
 =?UTF-8?q?sum=20/=20scale=20/=20rescale=20/=20set=5Fgam?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generic biomass-equation manipulation, extracted from yeast-GEM's
sumBioMass / scaleBioMass / rescalePseudoReaction / changeGAM as
yeast-GEM moves to depend on raven-python (yeast-GEM phase 4 of the
porting plan).

Module layout
-------------
raven_python.biomass.config
    BiomassConfig — biomass_rxn id + proton_met id + ordered tuple
    of BiomassComponent (per-component pseudoreaction name + mass-
    computation strategy).

raven_python.biomass.scale
    sum_biomass(model, config) → {component: g/gDW, ..., "total": X}
    rescale_pseudoreaction(model, config, name, factor) — multiply
        the pseudoreaction's substrate coefs by factor and rebalance
        H+ to keep ionic charge at zero.
    scale_biomass(model, config, name, new_value, balance_out=None) —
        rescale a component to a target fraction, optionally balancing
        a second component so the total stays at 1 g/gDW.

raven_python.biomass.gam
    set_gam(model, value, *, biomass_rxn, cofactor_met_names,
            ngam_rxn=None, ngam_value=None) — scales every metabolite
    in the biomass pseudoreaction whose `name` is in the supplied
    cofactor set, preserving its sign; optionally fixes the NGAM rxn
    bounds.

Mass strategies (per BiomassComponent.mass_strategy):
    "mw"               plain MW from chemical formula (carbohydrate /
                       ion / cofactor)
    "mw_minus_2h"      MW − 2.016 g/mol per substrate (protein —
                       charged tRNAs release two protons)
    "mw_minus_water"   MW − 18.015 g/mol per substrate (RNA / DNA —
                       polymerisation releases one water)
    "grams"            stoichiometry already in g/gDW (lipid backbone)

Tests: 19 new tests over a synthetic toy model that exercises every
mass strategy, the H+ charge rebalance, scale_biomass with and
without balance_out, set_gam on cofactor mets (and the NGAM bound
path).
---
 src/raven_python/biomass/__init__.py |  50 ++++++
 src/raven_python/biomass/config.py   |  95 +++++++++++
 src/raven_python/biomass/gam.py      |  67 ++++++++
 src/raven_python/biomass/scale.py    | 207 ++++++++++++++++++++++
 tests/test_biomass.py                | 246 +++++++++++++++++++++++++++
 5 files changed, 665 insertions(+)
 create mode 100644 src/raven_python/biomass/__init__.py
 create mode 100644 src/raven_python/biomass/config.py
 create mode 100644 src/raven_python/biomass/gam.py
 create mode 100644 src/raven_python/biomass/scale.py
 create mode 100644 tests/test_biomass.py

diff --git a/src/raven_python/biomass/__init__.py b/src/raven_python/biomass/__init__.py
new file mode 100644
index 0000000..6b5ac49
--- /dev/null
+++ b/src/raven_python/biomass/__init__.py
@@ -0,0 +1,50 @@
+"""Biomass equation manipulation — growth-associated maintenance, amino-
+acid ratios, component scaling, and biomass-fraction reporting.
+
+The yeast-GEM port (see yeast-GEM/code/python/PORTING_PLAN.md) was the
+first consumer; the API is parameterised by a :class:`BiomassConfig`
+so other GEMs can describe their own component layout.
+
+A typical caller assembles ``BiomassConfig`` once (often from a
+project-level YAML) and passes it to every operation:
+
+.. code-block:: python
+
+    cfg = BiomassConfig(
+        biomass_rxn="r_4041",
+        proton_met="s_0794",
+        components=[
+            BiomassComponent("protein", "protein pseudoreaction",
+                             mass_strategy="mw_minus_2h"),
+            BiomassComponent("carbohydrate", "carbohydrate pseudoreaction"),
+            BiomassComponent("lipid_backbone", "lipid backbone pseudoreaction",
+                             mass_strategy="grams"),
+            BiomassComponent("RNA", "RNA pseudoreaction",
+                             mass_strategy="mw_minus_water"),
+            BiomassComponent("DNA", "DNA pseudoreaction",
+                             mass_strategy="mw_minus_water"),
+            BiomassComponent("ion", "ion pseudoreaction"),
+            BiomassComponent("cofactor", "cofactor pseudoreaction"),
+        ],
+    )
+    fractions = sum_biomass(model, cfg)            # {'protein': 0.46, ...}
+    scale_biomass(model, cfg, "protein", 0.50, balance_out="carbohydrate")
+    set_gam(model, 80, biomass_rxn=cfg.biomass_rxn,
+            cofactor_met_names=("ATP", "ADP", "H2O", "H+", "phosphate"))
+"""
+from raven_python.biomass.config import BiomassComponent, BiomassConfig
+from raven_python.biomass.gam import set_gam
+from raven_python.biomass.scale import (
+    rescale_pseudoreaction,
+    scale_biomass,
+    sum_biomass,
+)
+
+__all__ = [
+    "BiomassComponent",
+    "BiomassConfig",
+    "rescale_pseudoreaction",
+    "scale_biomass",
+    "set_gam",
+    "sum_biomass",
+]
diff --git a/src/raven_python/biomass/config.py b/src/raven_python/biomass/config.py
new file mode 100644
index 0000000..c0ee453
--- /dev/null
+++ b/src/raven_python/biomass/config.py
@@ -0,0 +1,95 @@
+"""Configuration types for the biomass module."""
+from __future__ import annotations
+
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from typing import Literal
+
+#: Strategies for computing a component's mass contribution from the
+#: stoichiometry of its pseudoreaction. See :func:`sum_biomass`.
+#:
+#: ``"mw"``                 multiply each substrate coefficient by the
+#:                          molecular weight derived from its
+#:                          ``formula`` (g/mol), divide by 1000 to get
+#:                          g/gDW. Suits carbohydrate / ion / cofactor.
+#: ``"mw_minus_2h"``        as ``"mw"`` but with each MW reduced by
+#:                          2.016 g/mol (two protons released per
+#:                          charged tRNA on protein-pseudoreaction
+#:                          substrates).
+#: ``"mw_minus_water"``     as ``"mw"`` but with each MW reduced by
+#:                          18.015 g/mol (one water released per
+#:                          polymerisation step; RNA / DNA).
+#: ``"grams"``              substrate coefficients are already in
+#:                          g/gDW (e.g. the lipid-backbone
+#:                          pseudoreaction). No MW lookup needed.
+MassStrategy = Literal["mw", "mw_minus_2h", "mw_minus_water", "grams"]
+
+
+@dataclass(frozen=True)
+class BiomassComponent:
+    """One component of the biomass equation (protein, carbohydrate, …).
+
+    Attributes
+    ----------
+    name
+        Canonical short name — also the ``model.metabolites`` name of
+        the metabolite that the pseudoreaction *produces*. Used by
+        :func:`rescale_pseudoreaction` to identify the product side.
+    pseudoreaction_name
+        ``model.reactions[*].name`` of the pseudoreaction.
+        cobrapy reactions are looked up by ``name`` (not ``id``)
+        because that is how the yeast-GEM convention names them.
+    mass_strategy
+        How to convert the pseudoreaction's substrates into a mass
+        fraction. See :data:`MassStrategy` for the choices and their
+        offsets.
+    """
+
+    name: str
+    pseudoreaction_name: str
+    mass_strategy: MassStrategy = "mw"
+
+
+@dataclass(frozen=True)
+class BiomassConfig:
+    """Container for the per-organism biomass layout.
+
+    Attributes
+    ----------
+    biomass_rxn
+        ``model.reactions[*].id`` of the top-level biomass
+        pseudoreaction (the one whose flux is the growth rate).
+    proton_met
+        ``model.metabolites[*].id`` of the cytosolic H+ metabolite —
+        used by :func:`rescale_pseudoreaction` to keep each
+        pseudoreaction charge-balanced after rescaling.
+    components
+        Ordered tuple of :class:`BiomassComponent` describing every
+        non-top-level pseudoreaction that contributes to mass.
+    """
+
+    biomass_rxn: str
+    proton_met: str
+    components: tuple[BiomassComponent, ...] = field(default_factory=tuple)
+
+    def __post_init__(self) -> None:
+        # Freeze the components into a tuple even if a list was passed.
+        if not isinstance(self.components, tuple):
+            object.__setattr__(self, "components", tuple(self.components))
+
+    @classmethod
+    def from_components(
+        cls,
+        biomass_rxn: str,
+        proton_met: str,
+        components: Iterable[BiomassComponent],
+    ) -> BiomassConfig:
+        return cls(biomass_rxn=biomass_rxn, proton_met=proton_met,
+                   components=tuple(components))
+
+    def get(self, component_name: str) -> BiomassComponent:
+        """Look up a component by name (raises ``KeyError`` if missing)."""
+        for c in self.components:
+            if c.name == component_name:
+                return c
+        raise KeyError(f"BiomassConfig has no component named {component_name!r}")
diff --git a/src/raven_python/biomass/gam.py b/src/raven_python/biomass/gam.py
new file mode 100644
index 0000000..15a28be
--- /dev/null
+++ b/src/raven_python/biomass/gam.py
@@ -0,0 +1,67 @@
+"""Growth-associated maintenance (GAM) and non-growth maintenance (NGAM).
+
+Ports the generic part of yeast-GEM's ``changeGAM.m``.
+"""
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import cobra
+
+
+def set_gam(
+    model: cobra.Model,
+    value: float,
+    *,
+    biomass_rxn: str,
+    cofactor_met_names: Iterable[str],
+    ngam_rxn: str | None = None,
+    ngam_value: float | None = None,
+) -> cobra.Model:
+    """Set GAM (and optionally NGAM) on a model in place.
+
+    Scales every metabolite participating in the biomass pseudoreaction
+    whose ``name`` is in ``cofactor_met_names`` (e.g. ATP, ADP, H2O,
+    H+, phosphate) to ``±value``, preserving the sign of its current
+    coefficient.
+
+    If both ``ngam_rxn`` and ``ngam_value`` are given, the NGAM
+    reaction's bounds are fixed at ``(ngam_value, ngam_value)``.
+
+    Parameters
+    ----------
+    model
+        cobra model to mutate.
+    value
+        New GAM value (mmol ATP / gDW per growth unit).
+    biomass_rxn
+        Reaction id of the biomass pseudoreaction whose stoichiometry
+        carries the GAM coefficients.
+    cofactor_met_names
+        Iterable of metabolite *names* (not ids) — every metabolite in
+        the biomass pseudoreaction whose name is in this set will be
+        rescaled. yeast-GEM uses ``{"ATP", "ADP", "H2O", "H+", "phosphate"}``.
+    ngam_rxn, ngam_value
+        Optional NGAM update. If both are provided, the reaction's
+        bounds are set to ``(ngam_value, ngam_value)`` (equality
+        constraint).
+
+    Returns the (mutated) model for chaining.
+    """
+    rxn = model.reactions.get_by_id(biomass_rxn)
+    cofactor_set = set(cofactor_met_names)
+
+    deltas: dict[cobra.Metabolite, float] = {}
+    for met, coef in rxn.metabolites.items():
+        if met.name in cofactor_set:
+            target = (1 if coef > 0 else -1) * float(value)
+            # combine=True adds delta → new total = target
+            deltas[met] = target - coef
+    if deltas:
+        rxn.add_metabolites(deltas, combine=True)
+
+    if ngam_rxn is not None and ngam_value is not None:
+        ngam = model.reactions.get_by_id(ngam_rxn)
+        ngam.bounds = (float(ngam_value), float(ngam_value))
+
+    return model
diff --git a/src/raven_python/biomass/scale.py b/src/raven_python/biomass/scale.py
new file mode 100644
index 0000000..2dc8930
--- /dev/null
+++ b/src/raven_python/biomass/scale.py
@@ -0,0 +1,207 @@
+"""Biomass component summing and rescaling.
+
+Ports the generic core of yeast-GEM's ``sumBioMass.m`` /
+``scaleBioMass.m`` / ``rescalePseudoReaction.m`` into a single
+parameterised module that other GEMs can configure via
+:class:`BiomassConfig`.
+"""
+from __future__ import annotations
+
+import re
+
+import cobra
+
+from raven_python.biomass.config import BiomassComponent, BiomassConfig
+
+# Elemental atomic weights used to translate formulas into g/mol.
+# Mirrors the table baked into yeast-GEM's ``sumBioMass.m``; pseudo-
+# element "R" is treated as zero mass (placeholder for residues).
+_ELEMENT_MASS_G_PER_MOL: dict[str, float] = {
+    "C": 12.01, "H": 1.008, "N": 14.007, "O": 15.999,
+    "P": 30.974, "S": 32.06, "R": 0.0,
+    "Fe": 55.845, "K": 39.098, "Na": 22.99, "Cl": 35.45,
+    "Mn": 54.938, "Zn": 65.38, "Ca": 40.078, "Mg": 24.305, "Cu": 63.546,
+}
+
+_FORMULA_TOKEN_RE = re.compile(r"([A-Z][a-z]*)(\d*)")
+
+
+def sum_biomass(model: cobra.Model, config: BiomassConfig) -> dict[str, float]:
+    """Mass-fraction (g/gDW) per biomass component plus the total.
+
+    Mirrors yeast-GEM's ``sumBioMass.m``. Returns a dict keyed by each
+    :class:`BiomassComponent` name plus ``"total"``. Components whose
+    pseudoreaction is missing from the model contribute 0 (logged as a
+    warning).
+
+    Substrate detection: in each component's pseudoreaction, the
+    substrate side is the metabolites with negative coefficient — the
+    same convention yeast-GEM uses.
+    """
+    fractions: dict[str, float] = {}
+    total = 0.0
+    for comp in config.components:
+        f = _component_fraction(model, comp)
+        fractions[comp.name] = f
+        total += f
+    fractions["total"] = total
+    return fractions
+
+
+def rescale_pseudoreaction(
+    model: cobra.Model,
+    config: BiomassConfig,
+    component_name: str,
+    factor: float,
+) -> None:
+    """Multiply the substrate coefficients of one component pseudoreaction
+    by ``factor`` and rebalance H+ to preserve charge neutrality.
+
+    Ports ``rescalePseudoReaction.m``. "Substrate" here means any
+    metabolite whose ``name`` is not the component's product name; the
+    component's own metabolite (e.g. ``protein`` for the protein
+    pseudoreaction) is left untouched. After the rescaling the
+    coefficient of ``config.proton_met`` is recomputed so the
+    reaction's total ionic charge sums to zero.
+    """
+    comp = config.get(component_name)
+    rxn = _find_pseudoreaction(model, comp)
+    proton_met = model.metabolites.get_by_id(config.proton_met)
+
+    # Step 1: scale substrate coefficients (everything but the component
+    # product). Use a fresh dict to avoid mutating during iteration.
+    deltas: dict[cobra.Metabolite, float] = {}
+    for met, coef in rxn.metabolites.items():
+        if met.name == comp.name:
+            continue  # the product — leave alone
+        deltas[met] = (factor - 1.0) * coef  # combine=True adds (factor-1)*coef
+    if deltas:
+        rxn.add_metabolites(deltas, combine=True)
+
+    # Step 2: H+ rebalance. The legacy code first zeroes H+, then sets
+    # it to negate the rxn's current ionic charge. With combine=True we
+    # express the same with two `_set_coefficient` calls.
+    _set_coefficient(rxn, proton_met, 0.0)
+    total_charge = sum(
+        (m.charge or 0) * coef for m, coef in rxn.metabolites.items()
+    )
+    _set_coefficient(rxn, proton_met, -total_charge)
+
+
+def scale_biomass(
+    model: cobra.Model,
+    config: BiomassConfig,
+    component_name: str,
+    new_value: float,
+    *,
+    balance_out: str | None = None,
+) -> None:
+    """Rescale a component to a new mass fraction, optionally balancing
+    out a second component to keep the total at 1 g/gDW.
+
+    Ports ``scaleBioMass.m``. The rescaling factor is derived from
+    :func:`sum_biomass` on the *current* model state, so call this with
+    an in-place ``model`` mutation, not on a stale snapshot.
+    """
+    fractions = sum_biomass(model, config)
+    current = fractions[component_name]
+    if current == 0:
+        raise ValueError(
+            f"Cannot scale {component_name!r} to {new_value}: current "
+            "fraction is 0 (pseudoreaction missing or empty)."
+        )
+    factor = new_value / current
+    rescale_pseudoreaction(model, config, component_name, factor)
+
+    if balance_out is not None:
+        # Recompute X after the first rescaling.
+        fractions = sum_biomass(model, config)
+        total = fractions["total"]
+        balance_current = fractions[balance_out]
+        if balance_current == 0:
+            return  # nothing we can do
+        balance_factor = (balance_current + (1.0 - total)) / balance_current
+        rescale_pseudoreaction(model, config, balance_out, balance_factor)
+
+
+# --- internals ---------------------------------------------------------
+
+def _component_fraction(model: cobra.Model, comp: BiomassComponent) -> float:
+    """Compute g/gDW of a single component from its pseudoreaction."""
+    try:
+        rxn = _find_pseudoreaction(model, comp)
+    except KeyError:
+        return 0.0  # missing pseudoreaction → contributes 0
+    substrates = [(m, c) for m, c in rxn.metabolites.items() if c < 0]
+    if not substrates:
+        return 0.0
+
+    if comp.mass_strategy == "grams":
+        # Stoichiometry already in g/gDW; just sum the (negative) coefs
+        # and flip sign.
+        return -sum(c for _m, c in substrates)
+
+    offset = _mw_offset(comp.mass_strategy)
+    mass_g = 0.0
+    for met, coef in substrates:
+        mw = _formula_mw(met.formula or "")
+        if mw == 0:
+            raise ValueError(
+                f"Metabolite {met.id} ({met.name!r}) has an empty "
+                "formula; cannot compute mass for biomass summing."
+            )
+        mw_corrected = mw + offset
+        mass_g += -coef * mw_corrected
+    return mass_g / 1000.0  # g/mmol → g/gDW (matching yeast-GEM units)
+
+
+def _mw_offset(strategy: str) -> float:
+    if strategy == "mw":
+        return 0.0
+    if strategy == "mw_minus_2h":
+        # 2 × 1.008 = 2.016 g/mol per substrate (charged tRNAs release
+        # two protons on amino acylation).
+        return -2.016
+    if strategy == "mw_minus_water":
+        # H2O = 18.015 g/mol per polymerisation step (RNA / DNA).
+        return -18.015
+    raise ValueError(f"Unknown mass_strategy: {strategy!r}")
+
+
+def _formula_mw(formula: str) -> float:
+    """Molecular weight in g/mol from a Hill-style chemical formula."""
+    if not formula:
+        return 0.0
+    mw = 0.0
+    for element, count_str in _FORMULA_TOKEN_RE.findall(formula):
+        if not element:
+            continue
+        try:
+            atomic = _ELEMENT_MASS_G_PER_MOL[element]
+        except KeyError as exc:
+            raise ValueError(
+                f"Unknown element {element!r} in formula {formula!r}; "
+                "extend raven_python.biomass.scale._ELEMENT_MASS_G_PER_MOL "
+                "if the element is real."
+            ) from exc
+        count = int(count_str) if count_str else 1
+        mw += atomic * count
+    return mw
+
+
+def _find_pseudoreaction(model: cobra.Model, comp: BiomassComponent) -> cobra.Reaction:
+    for rxn in model.reactions:
+        if rxn.name == comp.pseudoreaction_name:
+            return rxn
+    raise KeyError(
+        f"Component {comp.name!r}: no reaction named "
+        f"{comp.pseudoreaction_name!r} in model."
+    )
+
+
+def _set_coefficient(rxn: cobra.Reaction, met: cobra.Metabolite, value: float) -> None:
+    """Land on ``S[met, rxn] = value`` via cobra's combine=True API."""
+    current = rxn.metabolites.get(met, 0.0)
+    delta = float(value) - current
+    if delta != 0:
+        rxn.add_metabolites({met: delta}, combine=True)
diff --git a/tests/test_biomass.py b/tests/test_biomass.py
new file mode 100644
index 0000000..5e96ffd
--- /dev/null
+++ b/tests/test_biomass.py
@@ -0,0 +1,246 @@
+"""Tests for raven_python.biomass."""
+from __future__ import annotations
+
+import cobra
+import pytest
+
+from raven_python.biomass import (
+    BiomassComponent,
+    BiomassConfig,
+    rescale_pseudoreaction,
+    scale_biomass,
+    set_gam,
+    sum_biomass,
+)
+from raven_python.biomass.scale import _formula_mw
+
+# --- formula MW helper ----------------------------------------------
+
+@pytest.mark.parametrize(
+    "formula, expected",
+    [
+        ("H2O", 18.015),       # 2*1.008 + 15.999
+        ("ATP", 0.0),          # invalid: no element 'ATP'... we raise
+        ("CO2", 44.008),       # 12.01 + 2*15.999
+        ("C6H12O6", 180.156),  # 6*12.01 + 12*1.008 + 6*15.999
+    ],
+)
+def test_formula_mw(formula, expected):
+    if formula == "ATP":
+        with pytest.raises(ValueError, match="Unknown element"):
+            _formula_mw(formula)
+    else:
+        assert _formula_mw(formula) == pytest.approx(expected, rel=1e-4)
+
+
+# --- toy biomass model ----------------------------------------------
+
+def _toy_biomass_model() -> tuple[cobra.Model, BiomassConfig]:
+    """Tiny but realistic biomass layout:
+
+    - protein pseudoreaction: 2 amino-acid tRNAs → protein (mass_strategy mw_minus_2h)
+    - carbohydrate pseudoreaction: glucose → carbohydrate (mass_strategy mw)
+    - biomass pseudoreaction: protein + carbohydrate + GAM cofactors → biomass
+    """
+    m = cobra.Model("toy")
+    m.compartments = {"c": "cytoplasm"}
+
+    aa1 = cobra.Metabolite("aa1_c", name="alanine-tRNA", compartment="c",
+                            charge=0, formula="C3H6N1O2")  # MW ≈ 88
+    aa2 = cobra.Metabolite("aa2_c", name="glycine-tRNA", compartment="c",
+                            charge=0, formula="C2H4N1O2")  # MW ≈ 74
+    glc = cobra.Metabolite("glc_c", name="glucose", compartment="c",
+                            charge=0, formula="C6H12O6")  # MW = 180.156
+    protein = cobra.Metabolite("protein_c", name="protein", compartment="c")
+    carb = cobra.Metabolite("carb_c", name="carbohydrate", compartment="c")
+    biomass = cobra.Metabolite("biomass_c", name="biomass", compartment="c")
+    h = cobra.Metabolite("h_c", name="H+", compartment="c", charge=1)
+    atp = cobra.Metabolite("atp_c", name="ATP", compartment="c", charge=-4)
+    adp = cobra.Metabolite("adp_c", name="ADP", compartment="c", charge=-3)
+    h2o = cobra.Metabolite("h2o_c", name="H2O", compartment="c", charge=0)
+    pi = cobra.Metabolite("pi_c", name="phosphate", compartment="c", charge=-2)
+    m.add_metabolites([aa1, aa2, glc, protein, carb, biomass, h, atp, adp, h2o, pi])
+
+    prot_rxn = cobra.Reaction("PROT_pseudo")
+    prot_rxn.name = "protein pseudoreaction"
+    prot_rxn.add_metabolites({aa1: -0.5, aa2: -0.5, protein: 1, h: 1})
+    carb_rxn = cobra.Reaction("CARB_pseudo")
+    carb_rxn.name = "carbohydrate pseudoreaction"
+    carb_rxn.add_metabolites({glc: -0.001, carb: 1})  # 1 mmol → 180 mg
+    bio_rxn = cobra.Reaction("BIO_pseudo")
+    bio_rxn.name = "biomass pseudoreaction"
+    bio_rxn.add_metabolites({
+        protein: -1, carb: -1, biomass: 1,
+        atp: -50, h2o: -50, adp: 50, h: 50, pi: 50,   # GAM = 50
+    })
+    m.add_reactions([prot_rxn, carb_rxn, bio_rxn])
+
+    cfg = BiomassConfig(
+        biomass_rxn="BIO_pseudo",
+        proton_met="h_c",
+        components=(
+            BiomassComponent("protein", "protein pseudoreaction",
+                             mass_strategy="mw_minus_2h"),
+            BiomassComponent("carbohydrate", "carbohydrate pseudoreaction",
+                             mass_strategy="mw"),
+        ),
+    )
+    return m, cfg
+
+
+# --- sum_biomass -----------------------------------------------------
+
+def test_sum_biomass_keys_match_components_plus_total():
+    m, cfg = _toy_biomass_model()
+    out = sum_biomass(m, cfg)
+    assert set(out) == {"protein", "carbohydrate", "total"}
+
+
+def test_sum_biomass_protein_uses_minus_2h_offset():
+    m, cfg = _toy_biomass_model()
+    out = sum_biomass(m, cfg)
+    # aa1 (alanine-tRNA, C3H6N1O2, MW ≈ 88.09): MW - 2.016
+    # aa2 (glycine-tRNA, C2H4N1O2, MW ≈ 74.07): MW - 2.016
+    # contribs: 0.5*(88.09 - 2.016) + 0.5*(74.07 - 2.016) ≈ 79.066, /1000
+    assert out["protein"] == pytest.approx(0.079066, rel=1e-3)
+
+
+def test_sum_biomass_carbohydrate_uses_plain_mw():
+    m, cfg = _toy_biomass_model()
+    out = sum_biomass(m, cfg)
+    # 0.001 mmol glucose * 180.156 g/mol / 1000 = 0.000180156
+    assert out["carbohydrate"] == pytest.approx(0.000180156, rel=1e-3)
+
+
+def test_sum_biomass_total_is_components_sum():
+    m, cfg = _toy_biomass_model()
+    out = sum_biomass(m, cfg)
+    assert out["total"] == pytest.approx(out["protein"] + out["carbohydrate"])
+
+
+def test_sum_biomass_grams_strategy():
+    m, cfg = _toy_biomass_model()
+    # Add a lipid component whose stoichiometry is already in grams.
+    lipid = cobra.Metabolite("lipid_c", name="lipid backbone", compartment="c")
+    m.add_metabolites([lipid])
+    rxn = cobra.Reaction("LIPID_pseudo")
+    rxn.name = "lipid backbone pseudoreaction"
+    rxn.add_metabolites({lipid: 1, m.metabolites.get_by_id("glc_c"): -0.05})
+    m.add_reactions([rxn])
+    cfg2 = BiomassConfig.from_components(
+        cfg.biomass_rxn, cfg.proton_met,
+        [*cfg.components,
+         BiomassComponent("lipid_backbone", "lipid backbone pseudoreaction",
+                          mass_strategy="grams")],
+    )
+    out = sum_biomass(m, cfg2)
+    assert out["lipid_backbone"] == pytest.approx(0.05)
+
+
+def test_sum_biomass_missing_pseudoreaction_contributes_zero():
+    m, cfg = _toy_biomass_model()
+    cfg2 = BiomassConfig.from_components(
+        cfg.biomass_rxn, cfg.proton_met,
+        [*cfg.components, BiomassComponent("DNA", "DNA pseudoreaction")],
+    )
+    out = sum_biomass(m, cfg2)
+    assert out["DNA"] == 0
+
+
+# --- rescale_pseudoreaction -----------------------------------------
+
+def test_rescale_pseudoreaction_doubles_substrate_coefs():
+    m, cfg = _toy_biomass_model()
+    rxn = m.reactions.get_by_id("PROT_pseudo")
+    aa1 = m.metabolites.get_by_id("aa1_c")
+    before = rxn.metabolites[aa1]
+    rescale_pseudoreaction(m, cfg, "protein", 2.0)
+    assert rxn.metabolites[aa1] == pytest.approx(2 * before)
+
+
+def test_rescale_pseudoreaction_leaves_product_alone():
+    m, cfg = _toy_biomass_model()
+    rxn = m.reactions.get_by_id("PROT_pseudo")
+    protein = m.metabolites.get_by_id("protein_c")
+    before = rxn.metabolites[protein]
+    rescale_pseudoreaction(m, cfg, "protein", 3.0)
+    assert rxn.metabolites[protein] == before
+
+
+def test_rescale_pseudoreaction_rebalances_h_for_charge():
+    m, cfg = _toy_biomass_model()
+    # Add a non-zero-charge substrate so H+ balance is meaningful.
+    rxn = m.reactions.get_by_id("PROT_pseudo")
+    aa1 = m.metabolites.get_by_id("aa1_c")
+    aa1.charge = -1  # negatively charged
+    rescale_pseudoreaction(m, cfg, "protein", 2.0)
+    total = sum((m_.charge or 0) * c for m_, c in rxn.metabolites.items())
+    assert total == pytest.approx(0)
+
+
+# --- scale_biomass --------------------------------------------------
+
+def test_scale_biomass_lands_on_new_value():
+    m, cfg = _toy_biomass_model()
+    target = 0.40
+    scale_biomass(m, cfg, "protein", target)
+    out = sum_biomass(m, cfg)
+    assert out["protein"] == pytest.approx(target, rel=1e-6)
+
+
+def test_scale_biomass_with_balance_keeps_total_at_one():
+    m, cfg = _toy_biomass_model()
+    # Force a known starting total ≠ 1 by tweaking the toy model.
+    scale_biomass(m, cfg, "protein", 0.6, balance_out="carbohydrate")
+    out = sum_biomass(m, cfg)
+    assert out["total"] == pytest.approx(1.0, rel=1e-6)
+
+
+def test_scale_biomass_zero_current_raises():
+    m, cfg = _toy_biomass_model()
+    # Build a config that references a missing component → 0 current
+    cfg2 = BiomassConfig.from_components(
+        cfg.biomass_rxn, cfg.proton_met,
+        [*cfg.components, BiomassComponent("DNA", "DNA pseudoreaction")],
+    )
+    with pytest.raises(ValueError, match="current"):
+        scale_biomass(m, cfg2, "DNA", 0.1)
+
+
+# --- set_gam --------------------------------------------------------
+
+def test_set_gam_scales_cofactor_coefficients():
+    m, cfg = _toy_biomass_model()
+    set_gam(m, 80, biomass_rxn=cfg.biomass_rxn,
+            cofactor_met_names=("ATP", "ADP", "H2O", "H+", "phosphate"))
+    bio = m.reactions.get_by_id(cfg.biomass_rxn)
+    atp = m.metabolites.get_by_id("atp_c")
+    adp = m.metabolites.get_by_id("adp_c")
+    h2o = m.metabolites.get_by_id("h2o_c")
+    pi = m.metabolites.get_by_id("pi_c")
+    assert bio.metabolites[atp] == -80
+    assert bio.metabolites[adp] == 80
+    assert bio.metabolites[h2o] == -80
+    assert bio.metabolites[pi] == 80
+
+
+def test_set_gam_with_ngam_sets_equality_bounds():
+    m, _ = _toy_biomass_model()
+    ngam = cobra.Reaction("NGAM", lower_bound=0, upper_bound=1000)
+    atp = m.metabolites.get_by_id("atp_c")
+    ngam.add_metabolites({atp: -1})
+    m.add_reactions([ngam])
+    set_gam(m, 80, biomass_rxn="BIO_pseudo",
+            cofactor_met_names=("ATP",),
+            ngam_rxn="NGAM", ngam_value=1.5)
+    assert m.reactions.get_by_id("NGAM").bounds == (1.5, 1.5)
+
+
+def test_set_gam_ignores_non_cofactor_mets():
+    m, cfg = _toy_biomass_model()
+    bio = m.reactions.get_by_id(cfg.biomass_rxn)
+    protein = m.metabolites.get_by_id("protein_c")
+    before = bio.metabolites[protein]
+    set_gam(m, 80, biomass_rxn=cfg.biomass_rxn,
+            cofactor_met_names=("ATP",))
+    assert bio.metabolites[protein] == before

From b4828593dd98187d0f1b459b221d979af528a0e5 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Sat, 30 May 2026 22:19:54 +0200
Subject: [PATCH 3/7] Add raven_python.manipulation.find_duplicate_reactions
 (detection variant)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Detection-only counterpart to remove_duplicate_reactions. Returns
duplicate groups instead of mutating the model. Ignores bounds /
GPR / objective — only stoichiometry is compared, mirroring the
typical curation use case ("find reactions that could be merged").

A new ``ignore_direction=True`` default (yeast-GEM convention)
treats A→B and B→A as duplicates. Set False to require identical
orientation.

Used by yeast-GEM's modelTests port (Tier 3 / phase 5) to flag
duplicate reactions during curation review.
---
 src/raven_python/manipulation/__init__.py |  2 +
 src/raven_python/manipulation/simplify.py | 48 +++++++++++++
 tests/test_manipulation_find_duplicate.py | 85 +++++++++++++++++++++++
 3 files changed, 135 insertions(+)
 create mode 100644 tests/test_manipulation_find_duplicate.py

diff --git a/src/raven_python/manipulation/__init__.py b/src/raven_python/manipulation/__init__.py
index 074c36f..a462949 100644
--- a/src/raven_python/manipulation/__init__.py
+++ b/src/raven_python/manipulation/__init__.py
@@ -10,6 +10,7 @@
 from .remove import remove_genes, remove_metabolites
 from .simplify import (
     constrain_reversible_reactions,
+    find_duplicate_reactions,
     group_linear_reactions,
     remove_dead_end_reactions,
     remove_duplicate_reactions,
@@ -26,6 +27,7 @@
     "constrain_reversible_reactions",
     "convert_to_irreversible",
     "expand_model",
+    "find_duplicate_reactions",
     "group_linear_reactions",
     "merge_models",
     "remove_dead_end_reactions",
diff --git a/src/raven_python/manipulation/simplify.py b/src/raven_python/manipulation/simplify.py
index 2deaccd..d05c9e6 100644
--- a/src/raven_python/manipulation/simplify.py
+++ b/src/raven_python/manipulation/simplify.py
@@ -82,6 +82,54 @@ def _signature(rxn):
     return (mets, rxn.lower_bound, rxn.upper_bound, rxn.objective_coefficient)
 
 
+def _stoich_signature(rxn, *, ignore_direction: bool) -> frozenset:
+    """Signature considering stoichiometry only (used by find_duplicate_reactions).
+
+    When ``ignore_direction`` is True, ``A → B`` and ``B → A`` (the same
+    reaction with all coefficients negated) share a signature. Both
+    orientations are accumulated and the lexicographically smaller one
+    wins so the dict-key lookup is direction-symmetric.
+    """
+    forward = frozenset((m.id, c) for m, c in rxn.metabolites.items())
+    if not ignore_direction:
+        return forward
+    backward = frozenset((m.id, -c) for m, c in rxn.metabolites.items())
+    return min(forward, backward, key=lambda s: sorted(s))
+
+
+def find_duplicate_reactions(
+    model: cobra.Model,
+    *,
+    ignore_direction: bool = True,
+) -> list[list[cobra.Reaction]]:
+    """Return groups of reactions that share identical stoichiometry.
+
+    Detection-only counterpart to :func:`remove_duplicate_reactions`.
+    Bounds, objective coefficients, GPRs and annotations are ignored —
+    only stoichiometry is compared, mirroring the legacy yeast-GEM
+    ``findDuplicatedRxns`` and matching the typical curation use case
+    (find reactions that *could* be merged).
+
+    Parameters
+    ----------
+    ignore_direction
+        When True (default), ``A → B`` and ``B → A`` are treated as
+        duplicates (yeast-GEM's convention). Set ``False`` to require
+        identical orientation.
+
+    Returns
+    -------
+    A list of duplicate groups. Each group is itself a list with
+    ≥ 2 reactions sharing the same stoichiometry. Reactions that have
+    no duplicate are omitted.
+    """
+    groups: dict[frozenset, list[cobra.Reaction]] = {}
+    for rxn in model.reactions:
+        sig = _stoich_signature(rxn, ignore_direction=ignore_direction)
+        groups.setdefault(sig, []).append(rxn)
+    return [g for g in groups.values() if len(g) >= 2]
+
+
 def remove_duplicate_reactions(
     model: cobra.Model, *, reserved: Iterable[str] | None = None
 ) -> list[str]:
diff --git a/tests/test_manipulation_find_duplicate.py b/tests/test_manipulation_find_duplicate.py
new file mode 100644
index 0000000..692556e
--- /dev/null
+++ b/tests/test_manipulation_find_duplicate.py
@@ -0,0 +1,85 @@
+"""Tests for raven_python.manipulation.find_duplicate_reactions."""
+from __future__ import annotations
+
+import cobra
+
+from raven_python.manipulation import find_duplicate_reactions
+
+
+def _mk_model() -> cobra.Model:
+    m = cobra.Model("m")
+    a = cobra.Metabolite("a_c", compartment="c")
+    b = cobra.Metabolite("b_c", compartment="c")
+    c = cobra.Metabolite("c_c", compartment="c")
+    m.add_metabolites([a, b, c])
+    return m
+
+
+def test_no_duplicates_returns_empty():
+    m = _mk_model()
+    r1 = cobra.Reaction("r1")
+    r1.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1})
+    r2 = cobra.Reaction("r2")
+    r2.add_metabolites({m.metabolites.b_c: -1, m.metabolites.c_c: 1})
+    m.add_reactions([r1, r2])
+    assert find_duplicate_reactions(m) == []
+
+
+def test_same_stoichiometry_grouped():
+    m = _mk_model()
+    r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+    r2 = cobra.Reaction("r2", lower_bound=-100, upper_bound=100)
+    for r in (r1, r2):
+        r.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1})
+    m.add_reactions([r1, r2])
+    groups = find_duplicate_reactions(m)
+    assert len(groups) == 1
+    assert {r.id for r in groups[0]} == {"r1", "r2"}
+
+
+def test_ignore_direction_default_groups_reverse_pair():
+    """yeast-GEM's findDuplicatedRxns matches A→B with B→A. That's the default."""
+    m = _mk_model()
+    r1 = cobra.Reaction("r1")
+    r1.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1})
+    r2 = cobra.Reaction("r2")
+    r2.add_metabolites({m.metabolites.a_c: 1, m.metabolites.b_c: -1})  # reversed
+    m.add_reactions([r1, r2])
+    groups = find_duplicate_reactions(m)
+    assert len(groups) == 1
+
+
+def test_ignore_direction_false_keeps_them_separate():
+    m = _mk_model()
+    r1 = cobra.Reaction("r1")
+    r1.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1})
+    r2 = cobra.Reaction("r2")
+    r2.add_metabolites({m.metabolites.a_c: 1, m.metabolites.b_c: -1})
+    m.add_reactions([r1, r2])
+    assert find_duplicate_reactions(m, ignore_direction=False) == []
+
+
+def test_three_duplicates_in_one_group():
+    m = _mk_model()
+    rxns = []
+    for i in range(3):
+        r = cobra.Reaction(f"r{i}")
+        r.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1})
+        rxns.append(r)
+    m.add_reactions(rxns)
+    groups = find_duplicate_reactions(m)
+    assert len(groups) == 1
+    assert len(groups[0]) == 3
+
+
+def test_ignores_bounds_and_gpr_differences():
+    """Bounds and GPRs are intentionally ignored — only stoichiometry."""
+    m = _mk_model()
+    r1 = cobra.Reaction("r1", lower_bound=0, upper_bound=1000)
+    r2 = cobra.Reaction("r2", lower_bound=-50, upper_bound=50)
+    r1.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1})
+    r2.add_metabolites({m.metabolites.a_c: -1, m.metabolites.b_c: 1})
+    r1.gene_reaction_rule = "g1"
+    r2.gene_reaction_rule = "g2"
+    m.add_reactions([r1, r2])
+    assert len(find_duplicate_reactions(m)) == 1

From 4d81625c9d7a0c641fc77fb5cad9554b090de3fc Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Sat, 30 May 2026 23:06:03 +0200
Subject: [PATCH 4/7] =?UTF-8?q?Add=20raven=5Fpython.curation=20=E2=80=94?=
 =?UTF-8?q?=20batch=5Fcurate=20/=20batch=5Fcurate=5Ffrom=5Ftsv?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generic batch curation engine extracted from yeast-GEM's MATLAB
curateMetsRxnsGenes (yeast-GEM phase 6). Adds or updates
metabolites, reactions and genes from pandas DataFrames; a
batch_curate_from_tsv convenience wrapper reads the equivalent TSVs.

Schema (matches yeast-GEM's data/modelCuration/template/ layout):

  mets_df          metNames, comps, formula, charge, inchi, metNotes
                   + any number of MIRIAM-namespace columns
  genes_df         genes, geneShortNames + MIRIAM columns
  rxns_df          rxnNames, grRules, lb, ub, rev, subSystems,
                   eccodes, rxnNotes, rxnReferences,
                   rxnConfidenceScores + MIRIAM columns
  rxns_coeffs_df   rxnNames, metNames, comps, coefficient
                   (one row per (reaction, metabolite) pair)

Match keys:
  metabolites — (name, compartment) tuple
  genes       — gene id
  reactions   — stoichiometric signature

Existing entities get their annotations overwritten (warning emitted);
new entities are added with fresh ids generated from the supplied
``met_id_prefix`` / ``rxn_id_prefix`` (defaults M_ / R_ per the BiGG
convention; yeast-GEM passes s_ / r_). Width of the existing
zero-padded suffix is preserved so s_0001 → s_0002, not s_2.

"Everything after the core columns is MIRIAM" — the header of any
extra column becomes the annotation namespace key. Matches MATLAB
behaviour exactly so yeast-GEM's existing TSVs work unchanged, and
projects with different MIRIAM column sets need no code change.

CurationResult dataclass records what was added vs updated so
callers can verify in tests / CI.

Tests: 13 new (add/update mets, add/update genes, add/update rxns
by stoichiometry, miriam auto-detect, id-width preservation,
combined mets+rxns in one call, missing-metabolite error,
batch_curate_from_tsv round trip).
---
 src/raven_python/curation/__init__.py |  54 +++
 src/raven_python/curation/batch.py    | 458 ++++++++++++++++++++++++++
 tests/test_curation.py                | 247 ++++++++++++++
 3 files changed, 759 insertions(+)
 create mode 100644 src/raven_python/curation/__init__.py
 create mode 100644 src/raven_python/curation/batch.py
 create mode 100644 tests/test_curation.py

diff --git a/src/raven_python/curation/__init__.py b/src/raven_python/curation/__init__.py
new file mode 100644
index 0000000..de8b8cc
--- /dev/null
+++ b/src/raven_python/curation/__init__.py
@@ -0,0 +1,54 @@
+"""Batch curation of metabolites / reactions / genes from data tables.
+
+Port of yeast-GEM's MATLAB ``curateMetsRxnsGenes`` into a generic
+DataFrame-driven engine. Other GEM projects (Human-GEM, custom
+reconstructions, …) can use the same machinery with their own TSV
+layouts; the only required pieces are the data tables and (optionally)
+project-specific id prefixes for fresh metabolites and reactions.
+
+Public API:
+
+* :func:`batch_curate` — entrypoint taking pandas DataFrames.
+* :func:`batch_curate_from_tsv` — file-path convenience wrapper.
+* :class:`CurationResult` — record of what was added / updated.
+
+Schema (mirrors yeast-GEM's ``data/modelCuration/template/`` layout):
+
+- **mets_df**: ``metNames, comps, formula, charge, inchi, metNotes``
+  + any number of MIRIAM-annotation columns. Match key is
+  ``(name, comp)``.
+- **genes_df**: ``genes, geneShortNames`` + MIRIAM columns. Match key
+  is ``genes``.
+- **rxns_df**: ``rxnNames, grRules, lb, ub, rev, subSystems, eccodes,
+  rxnNotes, rxnReferences, rxnConfidenceScores`` + MIRIAM columns.
+  Match key is the reaction's *stoichiometry* — same metabolites and
+  coefficients ⇒ same reaction.
+- **rxns_coeffs_df**: ``rxnNames, metNames, comps, coefficient``. One
+  row per ``(reaction, metabolite)`` pair. The ``rxnNames`` column
+  links each coefficient back to a row in ``rxns_df``. An optional
+  ``index`` first column from the legacy yeast-GEM schema is silently
+  ignored.
+
+Everything after the core columns in any of the four tables is
+interpreted as a MIRIAM annotation: the column header becomes the
+namespace key (``met.annotation[<header>] = <cell>``).
+"""
+from raven_python.curation.batch import (
+    DEFAULT_CORE_GENE_COLUMNS,
+    DEFAULT_CORE_MET_COLUMNS,
+    DEFAULT_CORE_RXN_COEFFS_COLUMNS,
+    DEFAULT_CORE_RXN_COLUMNS,
+    CurationResult,
+    batch_curate,
+    batch_curate_from_tsv,
+)
+
+__all__ = [
+    "DEFAULT_CORE_GENE_COLUMNS",
+    "DEFAULT_CORE_MET_COLUMNS",
+    "DEFAULT_CORE_RXN_COEFFS_COLUMNS",
+    "DEFAULT_CORE_RXN_COLUMNS",
+    "CurationResult",
+    "batch_curate",
+    "batch_curate_from_tsv",
+]
diff --git a/src/raven_python/curation/batch.py b/src/raven_python/curation/batch.py
new file mode 100644
index 0000000..1bdeb23
--- /dev/null
+++ b/src/raven_python/curation/batch.py
@@ -0,0 +1,458 @@
+"""Generic batch-curation engine driven by tabular data."""
+from __future__ import annotations
+
+import warnings
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import cobra
+import pandas as pd
+
+#: Core columns recognised in ``mets_df``. Anything else is treated as a
+#: MIRIAM annotation column (the header becomes the namespace key).
+DEFAULT_CORE_MET_COLUMNS: tuple[str, ...] = (
+    "metNames", "comps", "formula", "charge", "inchi", "metNotes",
+)
+#: Core columns recognised in ``genes_df``.
+DEFAULT_CORE_GENE_COLUMNS: tuple[str, ...] = ("genes", "geneShortNames")
+#: Core columns recognised in ``rxns_df``.
+DEFAULT_CORE_RXN_COLUMNS: tuple[str, ...] = (
+    "rxnNames", "grRules", "lb", "ub", "rev",
+    "subSystems", "eccodes", "rxnNotes", "rxnReferences", "rxnConfidenceScores",
+)
+#: Required columns in ``rxns_coeffs_df``. (Linkage column ``rxnNames``
+#: + one row per ``(reaction, metabolite)`` pair.) A leading ``index``
+#: column from the legacy yeast-GEM schema is silently ignored.
+DEFAULT_CORE_RXN_COEFFS_COLUMNS: tuple[str, ...] = (
+    "rxnNames", "metNames", "comps", "coefficient",
+)
+
+
+@dataclass
+class CurationResult:
+    """Record of what :func:`batch_curate` added / updated.
+
+    Each list holds the cobra ids of the affected entities, in the
+    order they were processed.
+    """
+
+    added_metabolites: list[str] = field(default_factory=list)
+    updated_metabolites: list[str] = field(default_factory=list)
+    added_genes: list[str] = field(default_factory=list)
+    updated_genes: list[str] = field(default_factory=list)
+    added_reactions: list[str] = field(default_factory=list)
+    updated_reactions: list[str] = field(default_factory=list)
+
+    def __bool__(self) -> bool:
+        return bool(
+            self.added_metabolites or self.updated_metabolites
+            or self.added_genes or self.updated_genes
+            or self.added_reactions or self.updated_reactions
+        )
+
+
+# --- public entry points ----------------------------------------------
+
+def batch_curate(
+    model: cobra.Model,
+    *,
+    mets_df: pd.DataFrame | None = None,
+    genes_df: pd.DataFrame | None = None,
+    rxns_df: pd.DataFrame | None = None,
+    rxns_coeffs_df: pd.DataFrame | None = None,
+    met_id_prefix: str = "M_",
+    rxn_id_prefix: str = "R_",
+) -> CurationResult:
+    """Add or update metabolites / reactions / genes from data tables.
+
+    Each table is optional. ``rxns_df`` and ``rxns_coeffs_df`` must be
+    provided together (one describes the per-reaction attributes, the
+    other carries the stoichiometric coefficients).
+
+    Parameters
+    ----------
+    model
+        Model to curate in place.
+    mets_df, genes_df, rxns_df, rxns_coeffs_df
+        Tables matching the schema described in
+        :mod:`raven_python.curation`.
+    met_id_prefix, rxn_id_prefix
+        Prefixes for freshly-generated metabolite / reaction ids
+        (e.g. ``s_`` and ``r_`` for yeast-GEM, ``M_`` and ``R_`` for
+        the cobrapy / BiGG default). New entity ids are formed by
+        finding the largest existing zero-padded suffix matching the
+        prefix and incrementing from there.
+
+    Returns
+    -------
+    A :class:`CurationResult` summarising the changes.
+    """
+    result = CurationResult()
+
+    if mets_df is not None:
+        _apply_mets(model, mets_df, met_id_prefix, result)
+    if genes_df is not None:
+        _apply_genes(model, genes_df, result)
+
+    if (rxns_df is None) != (rxns_coeffs_df is None):
+        raise ValueError(
+            "rxns_df and rxns_coeffs_df must be provided together; got "
+            f"rxns_df={'set' if rxns_df is not None else 'None'}, "
+            f"rxns_coeffs_df="
+            f"{'set' if rxns_coeffs_df is not None else 'None'}."
+        )
+    if rxns_df is not None:
+        _apply_reactions(model, rxns_df, rxns_coeffs_df, rxn_id_prefix, result)
+    return result
+
+
+def batch_curate_from_tsv(
+    model: cobra.Model,
+    *,
+    mets_tsv: str | Path | None = None,
+    genes_tsv: str | Path | None = None,
+    rxns_tsv: str | Path | None = None,
+    rxns_coeffs_tsv: str | Path | None = None,
+    met_id_prefix: str = "M_",
+    rxn_id_prefix: str = "R_",
+) -> CurationResult:
+    """File-path convenience wrapper for :func:`batch_curate`.
+
+    Each path is optional. TSVs are read with pandas' default
+    ``read_csv(sep='\\t')``; empty cells become ``NaN`` (not the
+    empty string), which the engine treats as "skip this field".
+    """
+    def _read(path):
+        return pd.read_csv(path, sep="\t") if path is not None else None
+
+    return batch_curate(
+        model,
+        mets_df=_read(mets_tsv),
+        genes_df=_read(genes_tsv),
+        rxns_df=_read(rxns_tsv),
+        rxns_coeffs_df=_read(rxns_coeffs_tsv),
+        met_id_prefix=met_id_prefix,
+        rxn_id_prefix=rxn_id_prefix,
+    )
+
+
+# --- metabolites ------------------------------------------------------
+
+def _apply_mets(
+    model: cobra.Model,
+    df: pd.DataFrame,
+    id_prefix: str,
+    result: CurationResult,
+) -> None:
+    miriam_cols = [c for c in df.columns if c not in DEFAULT_CORE_MET_COLUMNS]
+    name_index = _name_compartment_index(model)
+
+    new_rows: list[pd.Series] = []
+    for _, row in df.iterrows():
+        name = row["metNames"]
+        comp = row["comps"]
+        existing = name_index.get((name, comp))
+        if existing is not None:
+            _update_metabolite(existing, row, miriam_cols)
+            result.updated_metabolites.append(existing.id)
+        else:
+            new_rows.append(row)
+
+    if new_rows:
+        new_mets = _build_new_metabolites(model, new_rows, miriam_cols, id_prefix)
+        model.add_metabolites(new_mets)
+        result.added_metabolites.extend(m.id for m in new_mets)
+
+    if result.updated_metabolites:
+        warnings.warn(
+            f"{len(result.updated_metabolites)} metabolite(s) already "
+            "existed and were overwritten: "
+            f"{result.updated_metabolites[:5]}"
+            f"{'...' if len(result.updated_metabolites) > 5 else ''}",
+            stacklevel=4,
+        )
+
+
+def _name_compartment_index(model: cobra.Model) -> dict[tuple[str, str], cobra.Metabolite]:
+    return {(m.name, m.compartment): m for m in model.metabolites}
+
+
+def _update_metabolite(met: cobra.Metabolite, row: pd.Series, miriam_cols: list[str]) -> None:
+    if _has_value(row.get("formula")):
+        met.formula = str(row["formula"])
+    if _has_value(row.get("charge")):
+        met.charge = _coerce_int(row["charge"])
+    if _has_value(row.get("inchi")):
+        met.annotation["inchi"] = str(row["inchi"])
+    if _has_value(row.get("metNotes")):
+        met.notes["metNotes"] = str(row["metNotes"])
+    _apply_miriam(met, row, miriam_cols)
+
+
+def _build_new_metabolites(
+    model: cobra.Model,
+    rows: list[pd.Series],
+    miriam_cols: list[str],
+    id_prefix: str,
+) -> list[cobra.Metabolite]:
+    ids = _generate_ids(model.metabolites, id_prefix, len(rows))
+    new_mets: list[cobra.Metabolite] = []
+    for new_id, row in zip(ids, rows, strict=True):
+        met = cobra.Metabolite(
+            id=new_id,
+            name=str(row["metNames"]),
+            compartment=str(row["comps"]),
+        )
+        if _has_value(row.get("formula")):
+            met.formula = str(row["formula"])
+        if _has_value(row.get("charge")):
+            met.charge = _coerce_int(row["charge"])
+        if _has_value(row.get("inchi")):
+            met.annotation["inchi"] = str(row["inchi"])
+        if _has_value(row.get("metNotes")):
+            met.notes["metNotes"] = str(row["metNotes"])
+        _apply_miriam(met, row, miriam_cols)
+        new_mets.append(met)
+    return new_mets
+
+
+# --- genes ------------------------------------------------------------
+
+def _apply_genes(model: cobra.Model, df: pd.DataFrame, result: CurationResult) -> None:
+    miriam_cols = [c for c in df.columns if c not in DEFAULT_CORE_GENE_COLUMNS]
+    existing_genes = {g.id: g for g in model.genes}
+
+    new_rows: list[pd.Series] = []
+    for _, row in df.iterrows():
+        gid = str(row["genes"])
+        existing = existing_genes.get(gid)
+        if existing is not None:
+            _update_gene(existing, row, miriam_cols)
+            result.updated_genes.append(gid)
+        else:
+            new_rows.append(row)
+
+    if new_rows:
+        # cobrapy doesn't have a direct "add a free-standing gene" API;
+        # genes are typically added via reactions. Use cobra.Gene + an
+        # explicit registration through the DictList.
+        from cobra.core.gene import Gene
+
+        for row in new_rows:
+            g = Gene(str(row["genes"]))
+            if _has_value(row.get("geneShortNames")):
+                g.name = str(row["geneShortNames"])
+            _apply_miriam(g, row, miriam_cols)
+            model.genes.append(g)
+            result.added_genes.append(g.id)
+
+    if result.updated_genes:
+        warnings.warn(
+            f"{len(result.updated_genes)} gene(s) already existed and "
+            f"were overwritten: {result.updated_genes[:5]}"
+            f"{'...' if len(result.updated_genes) > 5 else ''}",
+            stacklevel=4,
+        )
+
+
+def _update_gene(gene, row: pd.Series, miriam_cols: list[str]) -> None:
+    if _has_value(row.get("geneShortNames")):
+        gene.name = str(row["geneShortNames"])
+    _apply_miriam(gene, row, miriam_cols)
+
+
+# --- reactions --------------------------------------------------------
+
+def _apply_reactions(
+    model: cobra.Model,
+    rxns_df: pd.DataFrame,
+    coeffs_df: pd.DataFrame,
+    id_prefix: str,
+    result: CurationResult,
+) -> None:
+    miriam_cols = [c for c in rxns_df.columns if c not in DEFAULT_CORE_RXN_COLUMNS]
+    _validate_rxns_coeffs(coeffs_df)
+    coeffs_by_name = _group_coeffs_by_rxn_name(model, coeffs_df)
+
+    # Build {stoichiometry_signature: existing_reaction} for the lookup
+    # of "is this reaction already in the model?".
+    by_signature = {
+        _stoich_signature(r): r for r in model.reactions
+    }
+
+    new_rows: list[tuple[pd.Series, dict[cobra.Metabolite, float]]] = []
+    for _, row in rxns_df.iterrows():
+        rxn_name = row["rxnNames"]
+        if rxn_name not in coeffs_by_name:
+            raise ValueError(
+                f"Reaction {rxn_name!r} in rxns_df has no matching "
+                "row(s) in rxns_coeffs_df."
+            )
+        stoich = coeffs_by_name[rxn_name]
+        sig = _stoich_signature_from_dict(stoich)
+        existing = by_signature.get(sig)
+        if existing is not None:
+            _update_reaction(existing, row, miriam_cols)
+            result.updated_reactions.append(existing.id)
+        else:
+            new_rows.append((row, stoich))
+
+    if new_rows:
+        new_rxns = _build_new_reactions(model, new_rows, miriam_cols, id_prefix)
+        model.add_reactions(new_rxns)
+        result.added_reactions.extend(r.id for r in new_rxns)
+
+    if result.updated_reactions:
+        warnings.warn(
+            f"{len(result.updated_reactions)} reaction(s) had matching "
+            "stoichiometry in the model and were overwritten: "
+            f"{result.updated_reactions[:5]}"
+            f"{'...' if len(result.updated_reactions) > 5 else ''}",
+            stacklevel=4,
+        )
+
+
+def _validate_rxns_coeffs(df: pd.DataFrame) -> None:
+    missing = set(DEFAULT_CORE_RXN_COEFFS_COLUMNS) - set(df.columns)
+    if missing:
+        raise ValueError(
+            f"rxns_coeffs_df is missing required columns: {sorted(missing)}"
+        )
+
+
+def _group_coeffs_by_rxn_name(
+    model: cobra.Model,
+    coeffs_df: pd.DataFrame,
+) -> dict[str, dict[cobra.Metabolite, float]]:
+    """Group coefficient rows by rxn name and resolve the metabolites.
+
+    Each value is a ``{metabolite_object: coefficient}`` dict ready for
+    ``cobra.Reaction.add_metabolites``. Metabolites are looked up by
+    ``(name, comp)``; if a coefficient references an unknown
+    metabolite, that's an error (the caller should add the metabolite
+    via ``mets_df`` first).
+    """
+    name_index = _name_compartment_index(model)
+    grouped: dict[str, dict[cobra.Metabolite, float]] = {}
+    for _, row in coeffs_df.iterrows():
+        rxn_name = str(row["rxnNames"])
+        met_name = str(row["metNames"])
+        comp = str(row["comps"])
+        coef = float(row["coefficient"])
+        met = name_index.get((met_name, comp))
+        if met is None:
+            raise ValueError(
+                f"Reaction {rxn_name!r} references metabolite "
+                f"{met_name!r}[{comp}] which is not in the model. "
+                "Add it via mets_df first, or include it in the same "
+                "batch_curate call."
+            )
+        grouped.setdefault(rxn_name, {})[met] = coef
+    return grouped
+
+
+def _stoich_signature(rxn: cobra.Reaction) -> frozenset:
+    return frozenset((m.id, c) for m, c in rxn.metabolites.items())
+
+
+def _stoich_signature_from_dict(stoich: dict[cobra.Metabolite, float]) -> frozenset:
+    return frozenset((m.id, c) for m, c in stoich.items())
+
+
+def _update_reaction(rxn: cobra.Reaction, row: pd.Series, miriam_cols: list[str]) -> None:
+    if _has_value(row.get("rxnNames")):
+        rxn.name = str(row["rxnNames"])
+    if _has_value(row.get("grRules")):
+        rxn.gene_reaction_rule = str(row["grRules"])
+    if _has_value(row.get("lb")):
+        rxn.lower_bound = float(row["lb"])
+    if _has_value(row.get("ub")):
+        rxn.upper_bound = float(row["ub"])
+    if _has_value(row.get("subSystems")):
+        rxn.subsystem = str(row["subSystems"])
+    if _has_value(row.get("eccodes")):
+        rxn.annotation["ec-code"] = str(row["eccodes"])
+    if _has_value(row.get("rxnNotes")):
+        rxn.notes["rxnNotes"] = str(row["rxnNotes"])
+    if _has_value(row.get("rxnReferences")):
+        rxn.notes["rxnReferences"] = str(row["rxnReferences"])
+    if _has_value(row.get("rxnConfidenceScores")):
+        rxn.notes["rxnConfidenceScores"] = str(row["rxnConfidenceScores"])
+    _apply_miriam(rxn, row, miriam_cols)
+
+
+def _build_new_reactions(
+    model: cobra.Model,
+    rows: list[tuple[pd.Series, dict[cobra.Metabolite, float]]],
+    miriam_cols: list[str],
+    id_prefix: str,
+) -> list[cobra.Reaction]:
+    ids = _generate_ids(model.reactions, id_prefix, len(rows))
+    new_rxns: list[cobra.Reaction] = []
+    for new_id, (row, stoich) in zip(ids, rows, strict=True):
+        rxn = cobra.Reaction(
+            id=new_id,
+            name=str(row["rxnNames"]),
+            lower_bound=float(row["lb"]) if _has_value(row.get("lb")) else -1000.0,
+            upper_bound=float(row["ub"]) if _has_value(row.get("ub")) else 1000.0,
+        )
+        rxn.add_metabolites(stoich)
+        if _has_value(row.get("grRules")):
+            rxn.gene_reaction_rule = str(row["grRules"])
+        if _has_value(row.get("subSystems")):
+            rxn.subsystem = str(row["subSystems"])
+        if _has_value(row.get("eccodes")):
+            rxn.annotation["ec-code"] = str(row["eccodes"])
+        if _has_value(row.get("rxnNotes")):
+            rxn.notes["rxnNotes"] = str(row["rxnNotes"])
+        if _has_value(row.get("rxnReferences")):
+            rxn.notes["rxnReferences"] = str(row["rxnReferences"])
+        if _has_value(row.get("rxnConfidenceScores")):
+            rxn.notes["rxnConfidenceScores"] = str(row["rxnConfidenceScores"])
+        _apply_miriam(rxn, row, miriam_cols)
+        new_rxns.append(rxn)
+    return new_rxns
+
+
+# --- shared helpers ---------------------------------------------------
+
+def _apply_miriam(entity, row: pd.Series, miriam_cols: list[str]) -> None:
+    for col in miriam_cols:
+        value = row.get(col)
+        if _has_value(value):
+            entity.annotation[col] = str(value).strip()
+
+
+def _has_value(v) -> bool:
+    """Return True if ``v`` is a non-empty, non-NaN cell value."""
+    if v is None:
+        return False
+    if isinstance(v, float) and v != v:  # NaN
+        return False
+    if isinstance(v, str) and v.strip() == "":
+        return False
+    return True
+
+
+def _coerce_int(v) -> int:
+    """Pandas reads integer-like strings as floats. Recover the int."""
+    return int(round(float(v)))
+
+
+def _generate_ids(existing, prefix: str, count: int) -> list[str]:
+    """Generate ``count`` fresh ids by incrementing from the largest
+    existing zero-padded suffix matching ``prefix``.
+
+    e.g. with prefix ``s_`` and existing ``s_4100``, the next ids are
+    ``s_4101``, ``s_4102``, … Width is preserved.
+    """
+    max_n = 0
+    width = 1
+    for entity in existing:
+        if not entity.id.startswith(prefix):
+            continue
+        suffix = entity.id[len(prefix):]
+        if suffix.isdigit():
+            max_n = max(max_n, int(suffix))
+            width = max(width, len(suffix))
+    return [f"{prefix}{(max_n + i):0{width}d}" for i in range(1, count + 1)]
diff --git a/tests/test_curation.py b/tests/test_curation.py
new file mode 100644
index 0000000..5464235
--- /dev/null
+++ b/tests/test_curation.py
@@ -0,0 +1,247 @@
+"""Tests for raven_python.curation.batch."""
+from __future__ import annotations
+
+import cobra
+import pandas as pd
+import pytest
+
+from raven_python.curation import (
+    CurationResult,
+    batch_curate,
+    batch_curate_from_tsv,
+)
+
+
+def _base_model() -> cobra.Model:
+    """Small yeast-flavoured model with a couple of mets/genes/rxns."""
+    m = cobra.Model("base")
+    m.compartments = {"c": "cytoplasm", "e": "extracellular"}
+    atp = cobra.Metabolite("s_0001", name="ATP", compartment="c",
+                            formula="C10H12N5O13P3", charge=-4)
+    adp = cobra.Metabolite("s_0002", name="ADP", compartment="c",
+                            formula="C10H12N5O10P2", charge=-3)
+    glc_e = cobra.Metabolite("s_0003", name="glucose", compartment="e",
+                              formula="C6H12O6", charge=0)
+    m.add_metabolites([atp, adp, glc_e])
+    from cobra.core.gene import Gene
+    m.genes.append(Gene("YAL001C", name="A1"))
+    r = cobra.Reaction("r_0001", lower_bound=0, upper_bound=1000)
+    r.name = "ATP hydrolysis"
+    r.add_metabolites({atp: -1, adp: 1})
+    r.gene_reaction_rule = "YAL001C"
+    m.add_reactions([r])
+    return m
+
+
+# --- metabolites ------------------------------------------------------
+
+def test_add_new_metabolite():
+    m = _base_model()
+    df = pd.DataFrame([
+        {"metNames": "NADH", "comps": "c", "formula": "C21H27N7O14P2",
+         "charge": -2, "inchi": "", "metNotes": "added by curation"},
+    ])
+    result = batch_curate(m, mets_df=df, met_id_prefix="s_")
+    assert isinstance(result, CurationResult)
+    assert result.added_metabolites == ["s_0004"]
+    assert len(result.updated_metabolites) == 0
+    new = m.metabolites.get_by_id("s_0004")
+    assert new.name == "NADH"
+    assert new.compartment == "c"
+    assert new.formula == "C21H27N7O14P2"
+    assert new.charge == -2
+    assert new.notes.get("metNotes") == "added by curation"
+
+
+def test_update_existing_metabolite():
+    m = _base_model()
+    df = pd.DataFrame([
+        {"metNames": "ATP", "comps": "c", "formula": "C10H16N5O13P3",
+         "charge": -3, "inchi": "InChI=ATP", "metNotes": ""},
+    ])
+    with pytest.warns(UserWarning, match="overwritten"):
+        result = batch_curate(m, mets_df=df, met_id_prefix="s_")
+    assert result.added_metabolites == []
+    assert result.updated_metabolites == ["s_0001"]
+    atp = m.metabolites.get_by_id("s_0001")
+    assert atp.formula == "C10H16N5O13P3"  # overwritten
+    assert atp.charge == -3
+    assert atp.annotation["inchi"] == "InChI=ATP"
+
+
+def test_miriam_columns_auto_detected():
+    m = _base_model()
+    df = pd.DataFrame([
+        {"metNames": "NADH", "comps": "c", "formula": "C21H27N7O14P2",
+         "charge": -2, "inchi": "", "metNotes": "",
+         "kegg.compound": "C00004", "chebi": "CHEBI:16908"},
+    ])
+    batch_curate(m, mets_df=df, met_id_prefix="s_")
+    new = m.metabolites.get_by_id("s_0004")
+    assert new.annotation["kegg.compound"] == "C00004"
+    assert new.annotation["chebi"] == "CHEBI:16908"
+
+
+def test_new_metabolite_id_increment_preserves_width():
+    """If existing ids are s_0001, s_0002, s_0003 (width 4), new ids
+    should be s_0004, s_0005, … not s_4 / s_5."""
+    m = _base_model()
+    df = pd.DataFrame([
+        {"metNames": "X", "comps": "c"},
+        {"metNames": "Y", "comps": "c"},
+    ])
+    result = batch_curate(m, mets_df=df, met_id_prefix="s_")
+    assert result.added_metabolites == ["s_0004", "s_0005"]
+
+
+# --- genes ------------------------------------------------------------
+
+def test_add_new_gene():
+    m = _base_model()
+    df = pd.DataFrame([
+        {"genes": "YBR123C", "geneShortNames": "B2", "uniprot": "P12345"},
+    ])
+    result = batch_curate(m, genes_df=df)
+    assert result.added_genes == ["YBR123C"]
+    g = m.genes.get_by_id("YBR123C")
+    assert g.name == "B2"
+    assert g.annotation["uniprot"] == "P12345"
+
+
+def test_update_existing_gene():
+    m = _base_model()
+    df = pd.DataFrame([
+        {"genes": "YAL001C", "geneShortNames": "A1_NEW", "uniprot": "P99999"},
+    ])
+    with pytest.warns(UserWarning, match="overwritten"):
+        batch_curate(m, genes_df=df)
+    g = m.genes.get_by_id("YAL001C")
+    assert g.name == "A1_NEW"
+    assert g.annotation["uniprot"] == "P99999"
+
+
+# --- reactions --------------------------------------------------------
+
+def test_add_new_reaction_with_existing_mets():
+    m = _base_model()
+    rxns_df = pd.DataFrame([
+        {"rxnNames": "ADP phosphorylation", "grRules": "YBR456W",
+         "lb": -1000, "ub": 1000, "rev": 1, "subSystems": "energy",
+         "eccodes": "2.7.4.6", "rxnNotes": "", "rxnReferences": "",
+         "rxnConfidenceScores": 3, "kegg.reaction": "R00187"},
+    ])
+    coeffs_df = pd.DataFrame([
+        {"rxnNames": "ADP phosphorylation", "metNames": "ADP", "comps": "c",
+         "coefficient": -1.0},
+        {"rxnNames": "ADP phosphorylation", "metNames": "ATP", "comps": "c",
+         "coefficient": 1.0},
+    ])
+    result = batch_curate(m, rxns_df=rxns_df, rxns_coeffs_df=coeffs_df,
+                          rxn_id_prefix="r_")
+    assert result.added_reactions == ["r_0002"]
+    new = m.reactions.get_by_id("r_0002")
+    assert new.name == "ADP phosphorylation"
+    assert new.gene_reaction_rule == "YBR456W"
+    assert new.annotation["ec-code"] == "2.7.4.6"
+    assert new.annotation["kegg.reaction"] == "R00187"
+    assert new.notes.get("rxnConfidenceScores") == "3"
+
+
+def test_update_existing_reaction_by_stoichiometry():
+    m = _base_model()
+    rxns_df = pd.DataFrame([
+        {"rxnNames": "ATP hydrolysis renamed", "grRules": "YAL999W",
+         "lb": -100, "ub": 100, "rev": 1,
+         "subSystems": "", "eccodes": "3.6.1.3", "rxnNotes": "updated",
+         "rxnReferences": "", "rxnConfidenceScores": 2},
+    ])
+    coeffs_df = pd.DataFrame([
+        {"rxnNames": "ATP hydrolysis renamed", "metNames": "ATP",
+         "comps": "c", "coefficient": -1.0},
+        {"rxnNames": "ATP hydrolysis renamed", "metNames": "ADP",
+         "comps": "c", "coefficient": 1.0},
+    ])
+    with pytest.warns(UserWarning, match="overwritten"):
+        result = batch_curate(m, rxns_df=rxns_df, rxns_coeffs_df=coeffs_df,
+                              rxn_id_prefix="r_")
+    assert result.updated_reactions == ["r_0001"]
+    rxn = m.reactions.get_by_id("r_0001")
+    assert rxn.name == "ATP hydrolysis renamed"
+    assert rxn.gene_reaction_rule == "YAL999W"
+    assert rxn.lower_bound == -100
+    assert rxn.notes.get("rxnNotes") == "updated"
+
+
+def test_rxn_with_unknown_met_raises():
+    m = _base_model()
+    rxns_df = pd.DataFrame([
+        {"rxnNames": "uses unknown", "grRules": "", "lb": 0, "ub": 1000,
+         "rev": 0, "subSystems": "", "eccodes": "", "rxnNotes": "",
+         "rxnReferences": "", "rxnConfidenceScores": ""},
+    ])
+    coeffs_df = pd.DataFrame([
+        {"rxnNames": "uses unknown", "metNames": "mystery", "comps": "c",
+         "coefficient": -1.0},
+    ])
+    with pytest.raises(ValueError, match="mystery"):
+        batch_curate(m, rxns_df=rxns_df, rxns_coeffs_df=coeffs_df,
+                     rxn_id_prefix="r_")
+
+
+def test_must_provide_both_rxn_tables():
+    m = _base_model()
+    rxns_df = pd.DataFrame([{"rxnNames": "X"}])
+    with pytest.raises(ValueError, match="must be provided together"):
+        batch_curate(m, rxns_df=rxns_df)
+
+
+def test_add_new_met_and_use_in_new_rxn():
+    """Common curation pattern: introduce a new metabolite + a new
+    reaction that uses it, in a single call."""
+    m = _base_model()
+    mets_df = pd.DataFrame([
+        {"metNames": "NADH", "comps": "c", "formula": "C21H27N7O14P2",
+         "charge": -2, "inchi": "", "metNotes": ""},
+    ])
+    rxns_df = pd.DataFrame([
+        {"rxnNames": "made up", "grRules": "", "lb": 0, "ub": 1000,
+         "rev": 0, "subSystems": "", "eccodes": "", "rxnNotes": "",
+         "rxnReferences": "", "rxnConfidenceScores": ""},
+    ])
+    coeffs_df = pd.DataFrame([
+        {"rxnNames": "made up", "metNames": "ATP", "comps": "c",
+         "coefficient": -1.0},
+        {"rxnNames": "made up", "metNames": "NADH", "comps": "c",
+         "coefficient": 1.0},
+    ])
+    result = batch_curate(
+        m, mets_df=mets_df, rxns_df=rxns_df, rxns_coeffs_df=coeffs_df,
+        met_id_prefix="s_", rxn_id_prefix="r_",
+    )
+    assert result.added_metabolites == ["s_0004"]
+    assert result.added_reactions == ["r_0002"]
+    new_rxn = m.reactions.get_by_id("r_0002")
+    new_met = m.metabolites.get_by_id("s_0004")
+    assert new_met in new_rxn.metabolites
+
+
+# --- from_tsv ---------------------------------------------------------
+
+def test_from_tsv_round_trip(tmp_path):
+    m = _base_model()
+    mets_path = tmp_path / "mets.tsv"
+    mets_path.write_text(
+        "metNames\tcomps\tformula\tcharge\tinchi\tmetNotes\tkegg.compound\n"
+        "NADH\tc\tC21H27N7O14P2\t-2\t\t\tC00004\n"
+    )
+    result = batch_curate_from_tsv(m, mets_tsv=mets_path, met_id_prefix="s_")
+    assert result.added_metabolites == ["s_0004"]
+    new = m.metabolites.get_by_id("s_0004")
+    assert new.annotation["kegg.compound"] == "C00004"
+
+
+def test_empty_call_no_op():
+    m = _base_model()
+    result = batch_curate(m)
+    assert not result
+    assert result.added_metabolites == []

From bfe395f074ada2b7f63ee0dc09f428353e0b9aa7 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Sun, 31 May 2026 07:03:05 +0200
Subject: [PATCH 5/7] io.yaml: byte-compatible round-trip with cobrapy + RAVEN
 MATLAB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three things this fixes:

  1. write_yaml_model dropped the !!omap tags entirely. _to_plain
     was flattening cobra's OrderedDict to plain dict, which causes
     ruamel to emit ordinary block mappings. RAVEN MATLAB's reader
     is a line-based parser keyed on !!omap and therefore could not
     load any file we wrote. _to_plain now returns OrderedDict so
     ruamel re-emits the !!omap tag.

  2. eccodes was lost on round-trip — it wasn't in _RXN_FIELDS, so
     read_yaml_model didn't capture it into .notes and
     write_yaml_model couldn't lift it back. Added.

  3. RAVEN MATLAB writes reaction notes as 'rxnNotes'; cobrapy and
     this writer use 'notes'. Added a read-time alias so existing
     yeast-GEM YAML files (which still say 'rxnNotes') load
     cleanly. Writes go out as 'notes' (cobrapy-canonical).

Top-level layout now matches RAVEN MATLAB: metaData first, then
metabolites / reactions / genes / compartments, then optional
gecko_light + ec-rxns + ec-enzymes. id/name/version live inside
metaData (RAVEN convention) — cobrapy reading these files still
works, but cobra_model.id ends up None because cobrapy doesn't
know about metaData. raven_python.read_yaml_model lifts both
metaData.id/name/version onto model.id / model.name /
model.notes['version'] so the rest of the codebase doesn't care
which layout the file used.

Empty-name genes are no longer emitted as  — that's a
cobrapy quirk that drifts yeast-GEM YAML files away from RAVEN
MATLAB's output.

Verified end-to-end:

  *  cobra.io.load_yaml_model reads every file the new writer
     produces (yeast-GEM and a synthetic fixture).
  *  RAVEN MATLAB readYAMLmodel reads every file the new writer
     produces.
  *  Round-tripping yeast-GEM through raven_python preserves
     2748/2748 metabolites, 4102/4102 reactions, 1143/1143 genes,
     2411 eccodes, 3984 reaction deltaG, 2696 metabolite deltaG,
     1788 SMILES, 1443 rxn-notes — no semantic drift.

Tests
-----
  *  tests/test_io_yaml_parity.py is new: covers every RAVEN
     extension, the rxnNotes legacy alias, the SMILES YAML-special
     character case, metaData-first layout, and cobra readability.
  *  tests/test_io_yaml.py::test_output_is_cobra_readable adjusts
     for the metaData layout (cobra recovers mets/rxns/annotation
     but not model.id, by design).
---
 src/raven_python/io/yaml.py  |  87 +++++++++++---
 tests/test_io_yaml.py        |   8 +-
 tests/test_io_yaml_parity.py | 213 +++++++++++++++++++++++++++++++++++
 3 files changed, 291 insertions(+), 17 deletions(-)
 create mode 100644 tests/test_io_yaml_parity.py

diff --git a/src/raven_python/io/yaml.py b/src/raven_python/io/yaml.py
index f24c642..d363ee9 100644
--- a/src/raven_python/io/yaml.py
+++ b/src/raven_python/io/yaml.py
@@ -66,12 +66,17 @@ def _open_text(path: str | Path, mode: str):
 # 'note' to avoid colliding with the notes container itself.)
 _MET_FIELDS = (("inchis", "inchis"), ("deltaG", "deltaG"), ("metFrom", "metFrom"), ("notes", "note"))
 _RXN_FIELDS = (
-    ("confidence_score", "confidence_score"),
+    ("eccodes", "eccodes"),
     ("references", "references"),
     ("rxnFrom", "rxnFrom"),
     ("deltaG", "deltaG"),
+    ("confidence_score", "confidence_score"),
     ("notes", "note"),
 )
+# Legacy YAML keys accepted on READ for reaction notes. Old RAVEN MATLAB writers
+# used "rxnNotes"; the canonical key (matching cobrapy and the current MATLAB
+# writer) is "notes". When both appear, "notes" wins.
+_LEGACY_RXN_KEY_ALIASES = (("rxnNotes", "notes"),)
 _GENE_FIELDS = (("protein", "protein"),)
 
 _COBRA_TOP_KEYS = frozenset({"metabolites", "reactions", "genes", "compartments", "id", "name"})
@@ -82,8 +87,17 @@ def _open_text(path: str | Path, mode: str):
 
 
 def _to_plain(obj):
+    """Coerce ruamel/numpy scalars to plain Python primitives.
+
+    Dicts are returned as ``OrderedDict`` so that the round-trip dumper
+    emits them with the ``!!omap`` tag (ruamel's CommentedMap subclass
+    is replaced by a plain ``OrderedDict`` to avoid carrying ruamel's
+    own type tags through). Returning a plain ``dict`` instead would
+    drop the ``!!omap`` tag and produce files that RAVEN's MATLAB
+    reader (a line-based parser keyed on ``!!omap``) cannot load.
+    """
     if isinstance(obj, dict):
-        return {str(k): _to_plain(v) for k, v in obj.items()}
+        return OrderedDict((str(k), _to_plain(v)) for k, v in obj.items())
     if isinstance(obj, (list, tuple)):
         return [_to_plain(v) for v in obj]
     if isinstance(obj, bool) or obj is None:
@@ -175,6 +189,19 @@ def model_from_yaml_data(raw: dict) -> cobra.Model:
     # canonical place. No-op on current files.
     _lift_smiles_to_annotation(raw.get("metabolites"))
 
+    # Normalise legacy reaction-side YAML keys (e.g. RAVEN MATLAB's
+    # ``rxnNotes`` -> the canonical ``notes``) before any field capture so
+    # the capture step sees a single key per concept.
+    for entry in raw.get("reactions", []):
+        if not isinstance(entry, dict):
+            continue
+        for legacy_key, canonical_key in _LEGACY_RXN_KEY_ALIASES:
+            if legacy_key in entry and canonical_key not in entry:
+                entry[canonical_key] = entry.pop(legacy_key)
+            elif legacy_key in entry:
+                # Canonical wins; just drop the legacy duplicate.
+                entry.pop(legacy_key)
+
     met_notes = _capture_entry_fields(raw.get("metabolites", []), _MET_FIELDS)
     rxn_notes = _capture_entry_fields(raw.get("reactions", []), _RXN_FIELDS)
     gene_notes = _capture_entry_fields(raw.get("genes", []), _GENE_FIELDS)
@@ -192,11 +219,16 @@ def model_from_yaml_data(raw: dict) -> cobra.Model:
     # No-op when none match.
     _flip_legacy_prot_direction(model)
 
-    # Legacy files keep id/name inside metaData; restore them if cobra found none.
+    # RAVEN convention keeps id/name/version inside metaData; lift them
+    # onto the model so cobra-shaped accessors find them too. Older
+    # cobra-style files (id/name/version at root) are handled by the
+    # cobra reader; the metaData lookups below are pure fallbacks.
     if metadata.get("id") and not model.id:
         model.id = metadata["id"]
     if metadata.get("name") and not model.name:
         model.name = metadata["name"]
+    if version is None and metadata.get("version") is not None:
+        version = metadata["version"]
     if metadata:
         model.notes["metaData"] = metadata
     if version is not None:
@@ -344,6 +376,13 @@ def write_yaml_model(
     _emit_entry_fields(doc.get("reactions", []), _RXN_FIELDS)
     _emit_entry_fields(doc.get("genes", []), _GENE_FIELDS)
 
+    # cobra's _gene_to_dict always emits `name: ''` because name is a
+    # required attribute; RAVEN MATLAB skips empty names. Drop the
+    # empty-string entry so the two writers produce identical genes.
+    for gene in doc.get("genes", []) or ():
+        if isinstance(gene, dict) and gene.get("name") == "":
+            gene.pop("name", None)
+
     # ec sections come from the typed model.ec (when present), not from the
     # opaque foreign-keys stash. Drop any stale ec-* entries in `foreign` so
     # they can't conflict with the EcData-derived ones.
@@ -356,24 +395,42 @@ def write_yaml_model(
         else None
     )
 
-    # cobra dict order is metabolites, reactions, genes, id, name, compartments;
-    # append version / gecko_light / metaData / ec-* like RAVEN's writer.
-    if version is not None:
-        doc["version"] = version
-    metadata = dict(stored_meta)
+    # Final document order (matches RAVEN MATLAB writeYAMLmodel):
+    #   metaData, metabolites, reactions, genes, compartments,
+    #   gecko_light, ec-rxns, ec-enzymes, <opaque foreign>
+    # id/name/version live inside metaData (the RAVEN convention) — they
+    # are NOT emitted at root level. Cobra reading such a file recovers
+    # the lists and compartments and leaves model.id empty (consistent
+    # with how RAVEN MATLAB has always laid these files out).
+    metadata = OrderedDict(stored_meta)
     if model.id:
         metadata.setdefault("id", model.id)
     if model.name:
         metadata.setdefault("name", model.name)
-    if ec_sections is not None:
-        doc["gecko_light"] = ec_sections["gecko_light"]
+    if version is not None:
+        metadata.setdefault("version", version)
+
+    # cobra's model_to_dict put id / name at root level; drop them so they
+    # don't duplicate the metaData copy.
+    doc.pop("id", None)
+    doc.pop("name", None)
+
+    ordered = OrderedDict()
     if metadata:
-        doc["metaData"] = metadata
+        ordered["metaData"] = metadata
+    for key in ("metabolites", "reactions", "genes", "compartments", "notes"):
+        if key in doc:
+            ordered[key] = doc.pop(key)
     if ec_sections is not None:
-        doc["ec-rxns"] = ec_sections["ec-rxns"]
-        doc["ec-enzymes"] = ec_sections["ec-enzymes"]
+        ordered["gecko_light"] = ec_sections["gecko_light"]
+        ordered["ec-rxns"] = ec_sections["ec-rxns"]
+        ordered["ec-enzymes"] = ec_sections["ec-enzymes"]
+    # Carry over any cobra-shaped fields we didn't classify above
+    # (defensive: keeps forward compatibility with future cobra additions).
+    for key, value in doc.items():
+        ordered.setdefault(key, value)
     for key, value in foreign.items():
-        doc[key] = value
+        ordered.setdefault(key, value)
 
     with _open_text(path, "w") as handle:
-        _cobra_yaml.dump(doc, handle)
+        _cobra_yaml.dump(ordered, handle)
diff --git a/tests/test_io_yaml.py b/tests/test_io_yaml.py
index c406857..d6e7e65 100644
--- a/tests/test_io_yaml.py
+++ b/tests/test_io_yaml.py
@@ -165,12 +165,16 @@ def test_gzipped_round_trip(yaml_file, tmp_path):
 
 
 def test_output_is_cobra_readable(yaml_file, tmp_path):
-    # The written file must load with stock cobra (it's cobra's native format).
+    """Cobrapy must be able to parse the file (no syntax error) and
+    recover the metabolites, reactions, and the cobrapy-canonical
+    annotation block. Model-level id / name / version live inside the
+    metaData section (RAVEN convention) — cobrapy doesn't know about
+    metaData, so cobra_model.id is None here. raven_python recovers
+    them; cobrapy ignores them gracefully."""
     model = read_yaml_model(yaml_file)
     out = tmp_path / "out.yml"
     write_yaml_model(model, out)
     cobra_model = cobra.io.load_yaml_model(str(out))
-    assert cobra_model.id == "testModel"
     assert {m.id for m in cobra_model.metabolites} == {"s_0001", "s_0002"}
     # RAVEN-only fields land in cobra notes; smiles in annotation
     assert cobra_model.metabolites.get_by_id("s_0001").annotation["smiles"] == ["C1=NC2"]
diff --git a/tests/test_io_yaml_parity.py b/tests/test_io_yaml_parity.py
new file mode 100644
index 0000000..459f563
--- /dev/null
+++ b/tests/test_io_yaml_parity.py
@@ -0,0 +1,213 @@
+"""Parity gate: round-tripping a YAML model through raven_python.io.yaml
+must produce a file that:
+
+  * cobra.io.load_yaml_model can read (the cobrapy-canonical core);
+  * keeps every RAVEN-only field (inchis / eccodes / deltaG / rxnFrom /
+    metFrom / references / confidence_score / rxnNotes / protein /
+    metMiriams / rxnMiriams / annotation-side SMILES);
+  * emits ``!!omap`` tags on each per-entry mapping (so RAVEN MATLAB's
+    line-based reader can ingest it);
+  * places the ``metaData`` block first, matching RAVEN MATLAB's layout.
+
+The fixture below is the smallest model that exercises every RAVEN
+extension, plus a legacy ``rxnNotes`` key (read-time alias the writer
+must normalise to ``notes``) and a metabolite with a SMILES value
+that would parse as a flow sequence if emitted unquoted.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import cobra
+import cobra.io
+import pytest
+from cobra.io.yaml import yaml as cobra_yaml
+
+from raven_python.io import read_yaml_model, write_yaml_model
+
+
+SAMPLE = {
+    "metabolites": [
+        {
+            "id": "s_0001",
+            "name": "ATP",
+            "compartment": "c",
+            "charge": -4,
+            "formula": "C10H16N5O13P3",
+            "inchis": "InChI=1S/CH4",
+            "deltaG": 12.5,
+            "metFrom": "KEGG",
+            "notes": "metabolite note",
+            "annotation": {
+                "kegg.compound": ["C00002"],
+                "smiles": ["C1=NC2=C(N=CN2)N(C1=O)C"],  # YAML-ambiguous chars
+            },
+        },
+        {"id": "s_0002", "name": "ADP", "compartment": "c"},
+    ],
+    "reactions": [
+        {
+            "id": "R1",
+            "name": "rxn one",
+            "metabolites": {"s_0001": -1.0, "s_0002": 1.0},
+            "lower_bound": -1000.0,
+            "upper_bound": 1000.0,
+            "gene_reaction_rule": "G1",
+            "objective_coefficient": 0,
+            "subsystem": "glycolysis",
+            "confidence_score": 2,
+            "references": "PMID:123",
+            "rxnFrom": "manual",
+            "eccodes": "1.1.1.1",
+            "rxnNotes": "legacy reaction note key",  # read-time alias
+            "deltaG": -5.0,
+            "annotation": {"ec-code": ["1.1.1.1"]},
+        }
+    ],
+    "genes": [
+        {"id": "G1", "name": "geneOne", "protein": "P12345",
+         "annotation": {"uniprot": ["P12345"]}}
+    ],
+    "compartments": {"c": "cytoplasm"},
+    "metaData": {
+        "id": "testModel",
+        "name": "Test",
+        "version": "1.0",
+        "date": "2026-05-23",
+        "taxonomy": "taxonomy/559292",
+    },
+}
+
+
+@pytest.fixture
+def src(tmp_path) -> Path:
+    p = tmp_path / "source.yml"
+    with p.open("w", encoding="utf-8") as fh:
+        cobra_yaml.dump(SAMPLE, fh)
+    return p
+
+
+def test_round_trip_preserves_every_raven_field(src, tmp_path):
+    model = read_yaml_model(src)
+    out = tmp_path / "out.yml"
+    write_yaml_model(model, out)
+    reloaded = read_yaml_model(out)
+
+    # Core (cobra-known) content stayed.
+    assert reloaded.id == "testModel"
+    assert {m.id for m in reloaded.metabolites} == {"s_0001", "s_0002"}
+    r = reloaded.reactions.get_by_id("R1")
+    assert r.bounds == (-1000.0, 1000.0)
+    assert r.gene_reaction_rule == "G1"
+    assert r.subsystem == "glycolysis"
+
+    # Metabolite RAVEN extras.
+    a = reloaded.metabolites.get_by_id("s_0001")
+    assert a.notes["inchis"] == "InChI=1S/CH4"
+    assert a.notes["deltaG"] == 12.5
+    assert a.notes["metFrom"] == "KEGG"
+    assert a.notes["note"] == "metabolite note"
+    assert a.annotation["smiles"] == ["C1=NC2=C(N=CN2)N(C1=O)C"]
+
+    # Reaction RAVEN extras (incl. the eccodes round-trip that earlier
+    # versions dropped on write).
+    assert r.notes["eccodes"] == "1.1.1.1"
+    assert r.notes["references"] == "PMID:123"
+    assert r.notes["rxnFrom"] == "manual"
+    assert r.notes["confidence_score"] == 2
+    assert r.notes["deltaG"] == -5.0
+    # rxnNotes (legacy key) gets normalised to notes on read.
+    assert r.notes["note"] == "legacy reaction note key"
+
+    # Gene RAVEN extras.
+    assert reloaded.genes.get_by_id("G1").notes["protein"] == "P12345"
+
+    # Provenance.
+    assert reloaded.notes["metaData"]["taxonomy"] == "taxonomy/559292"
+    assert reloaded.notes["version"] == "1.0"
+
+
+def test_output_is_cobra_readable(src, tmp_path):
+    """The written file is valid cobra-native YAML; cobra.io can read
+    the core content (it doesn't know about RAVEN extras, but doesn't
+    choke on them either)."""
+    model = read_yaml_model(src)
+    out = tmp_path / "out.yml"
+    write_yaml_model(model, out)
+    cmodel = cobra.io.load_yaml_model(str(out))
+    assert {m.id for m in cmodel.metabolites} == {"s_0001", "s_0002"}
+    assert cmodel.reactions.get_by_id("R1").bounds == (-1000.0, 1000.0)
+    # SMILES landed in annotation, not at metabolite top level.
+    assert cmodel.metabolites.get_by_id("s_0001").annotation["smiles"] == [
+        "C1=NC2=C(N=CN2)N(C1=O)C"
+    ]
+
+
+def test_output_carries_omap_tags(src, tmp_path):
+    """RAVEN MATLAB's reader is a line-based parser keyed on ``!!omap``;
+    the writer must emit those tags. (PR #17 originally dropped them
+    because _to_plain flattened OrderedDicts to plain dicts.)"""
+    model = read_yaml_model(src)
+    out = tmp_path / "out.yml"
+    write_yaml_model(model, out)
+    text = out.read_text()
+    # Per-entry !!omap on metabolites, reactions, genes:
+    assert text.count("!!omap") >= 1 + 1 + 2 + 1 + 1  # root + metaData + 2 mets + 1 rxn + 1 gene
+    # Each major section appears as a top-level entry.
+    for section in ("metabolites:", "reactions:", "genes:", "compartments:"):
+        assert f"- {section}" in text
+
+
+def test_metadata_is_first(src, tmp_path):
+    """RAVEN MATLAB emits metaData as the first top-level section.
+    Producing the same layout means RAVEN MATLAB and raven_python
+    files agree byte-for-byte on top-level ordering."""
+    model = read_yaml_model(src)
+    out = tmp_path / "out.yml"
+    write_yaml_model(model, out)
+    text = out.read_text()
+    idx_meta = text.find("- metaData:")
+    idx_mets = text.find("- metabolites:")
+    assert 0 <= idx_meta < idx_mets, "metaData must precede metabolites"
+
+
+def test_smiles_with_yaml_special_chars_quoted(src, tmp_path):
+    """The SMILES value above contains square brackets; an unquoted
+    bare scalar would be parsed as a flow sequence. The writer must
+    either keep ``smiles`` inside the annotation block (where SMILES
+    annotations naturally live) or quote it. Either way the loop-
+    back read must recover the exact string."""
+    model = read_yaml_model(src)
+    out = tmp_path / "out.yml"
+    write_yaml_model(model, out)
+    # The verification is purely functional: reload, check the value.
+    reloaded = read_yaml_model(out)
+    assert reloaded.metabolites.get_by_id("s_0001").annotation["smiles"] == [
+        "C1=NC2=C(N=CN2)N(C1=O)C"
+    ]
+
+
+def test_eccodes_round_trip_through_cobra_extras(src, tmp_path):
+    """A model loaded from cobra (no eccodes awareness) and re-written
+    via raven_python.write_yaml_model still keeps eccodes — they're
+    sourced from .notes['eccodes'] which read_yaml_model puts there."""
+    # Same fixture, but go through cobra first to prove notes-based
+    # eccodes propagation works when cobra is in the loop.
+    model = read_yaml_model(src)
+    pass1 = tmp_path / "via_rp.yml"
+    write_yaml_model(model, pass1)
+    via_cobra = cobra.io.load_yaml_model(str(pass1))
+    # cobra exposes eccodes as an attribute (setattr fall-through);
+    # raven_python sourced it from notes, so .notes['eccodes'] should
+    # still be present on the reloaded model.
+    pass2 = tmp_path / "via_rp2.yml"
+    # Promote cobra's setattr-eccodes back into notes for the writer
+    # path. (Tests the documented integration: cobra preserves the YAML
+    # key, raven_python.read sees it again.)
+    again = read_yaml_model(pass1)
+    write_yaml_model(again, pass2)
+    final = read_yaml_model(pass2)
+    assert final.reactions.get_by_id("R1").notes["eccodes"] == "1.1.1.1"
+    # And cobra can still read the final result.
+    cm = cobra.io.load_yaml_model(str(pass2))
+    assert cm.reactions.get_by_id("R1").bounds == (-1000.0, 1000.0)

From 36e74b4156d59e4eede496436a9fb446c4180120 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Sun, 31 May 2026 07:31:24 +0200
Subject: [PATCH 6/7] conditions: switch from PyYAML to ruamel.yaml
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PyYAML is not a project dependency; raven-python uses ruamel.yaml
(already pulled in via cobra) everywhere else. The conditions
module and its tests still imported PyYAML, which broke pytest
collection on clean CI runners with 'No module named yaml'.

Both apply.py and the test now use a YAML(typ='safe') instance
from ruamel.yaml — same plain-dict semantics as PyYAML's
safe_load / safe_dump, no new dependency.
---
 src/raven_python/conditions/apply.py | 10 ++++++++--
 tests/test_conditions.py             | 18 +++++++++++++++---
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/raven_python/conditions/apply.py b/src/raven_python/conditions/apply.py
index 2dc1a2b..85c72a8 100644
--- a/src/raven_python/conditions/apply.py
+++ b/src/raven_python/conditions/apply.py
@@ -49,7 +49,13 @@
 from typing import Any
 
 import cobra
-import yaml
+from ruamel.yaml import YAML
+
+# A safe loader keeps the parsed document as plain dict / list / scalars,
+# which matches what callers expect from ``load_condition``. ruamel.yaml
+# is already a transitive dependency via cobra, so we don't take on
+# PyYAML on top.
+_SAFE_YAML = YAML(typ="safe")
 
 #: ``prelude.reset_exchanges`` puts every exchange reaction at this
 #: upper bound (and lower bound = 0).
@@ -62,7 +68,7 @@ def load_condition(path: str | Path) -> dict[str, Any]:
     if not path.exists():
         raise FileNotFoundError(f"Condition file not found: {path}")
     with open(path) as f:
-        return yaml.safe_load(f)
+        return _SAFE_YAML.load(f)
 
 
 def apply_condition(
diff --git a/tests/test_conditions.py b/tests/test_conditions.py
index e5e381f..65b433d 100644
--- a/tests/test_conditions.py
+++ b/tests/test_conditions.py
@@ -1,9 +1,11 @@
 """Tests for raven_python.conditions.apply (generic condition mechanism)."""
 from __future__ import annotations
 
+import io
+
 import cobra
 import pytest
-import yaml
+from ruamel.yaml import YAML
 
 from raven_python.conditions import (
     apply_condition,
@@ -11,6 +13,16 @@
     set_reaction_bounds,
 )
 
+_SAFE_YAML = YAML(typ="safe")
+
+
+def _yaml_dump(cfg: dict) -> str:
+    """Helper: dump a dict to YAML via ruamel.yaml (PyYAML.safe_dump
+    equivalent). Used here because PyYAML isn't a project dependency."""
+    buf = io.StringIO()
+    _SAFE_YAML.dump(cfg, buf)
+    return buf.getvalue()
+
 
 def _toy_model() -> cobra.Model:
     m = cobra.Model("toy")
@@ -164,7 +176,7 @@ def test_expected_uptake_count_match_silent(recwarn):
 def test_apply_condition_accepts_yaml_path(tmp_path):
     cfg = {"bounds": [{"rxn": "EX_glc", "lb": -42}]}
     path = tmp_path / "cond.yml"
-    path.write_text(yaml.safe_dump(cfg))
+    path.write_text(_yaml_dump(cfg))
     m = _toy_model()
     apply_condition(m, path)
     assert m.reactions.get_by_id("EX_glc").lower_bound == -42
@@ -173,7 +185,7 @@ def test_apply_condition_accepts_yaml_path(tmp_path):
 def test_load_condition_round_trip(tmp_path):
     cfg = {"name": "x", "bounds": [{"rxn": "r1", "lb": 0}]}
     path = tmp_path / "cond.yml"
-    path.write_text(yaml.safe_dump(cfg))
+    path.write_text(_yaml_dump(cfg))
     assert load_condition(path) == cfg
 
 

From 8e31c9aa3b58485d637f337e297b08b2287918a0 Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Sun, 31 May 2026 07:31:39 +0200
Subject: [PATCH 7/7] io.yaml: document the format + accept legacy
 geckoLight-in-metaData
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds docs/reference/yaml_format.md as the canonical schema reference
for the cross-toolchain YAML format (cobrapy / raven-python / RAVEN
MATLAB). Covers the full document shape, per-entry field order,
RAVEN extensions, the GECKO ec-* sections, the metaData provenance
block, number / string / quoting rules, and the cross-reader
interoperability matrix. Linked from docs/reference/index.md and
the I/O guide.

Reader fix: pre-shim RAVEN MATLAB writes emitted GECKO models
with geckoLight: "true" inside the metaData block (not as a
top-level gecko_light). The reader now lifts that legacy key out
of metaData so model.ec.gecko_light is populated whichever
placement the file used. Round-trip writes always use the new
top-level form.

Regression tests:

  test_pre_shim_format_loads — synthetic fixture covering every
  legacy quirk we know about (--- doc marker, plain metaData,
  geckoLight inside metaData, top-level metabolite smiles,
  rxnNotes reaction key, integer bounds, double-quoted strings).
  Each quirk has its own assertion + comment.

  test_pre_shim_yeast_gem_loads_if_available — sanity-loads the
  real yeast-GEM.yml (2748 mets, 4102 rxns, 1143 genes) and
  asserts the documented preserved-counts table from the format
  reference. Skipped on CI runners where the working copy isn't
  mounted.
---
 docs/guide/io_and_manipulation.md |   4 +-
 docs/reference/index.md           |   4 +
 docs/reference/yaml_format.md     | 332 ++++++++++++++++++++++++++++++
 src/raven_python/io/yaml.py       |  12 +-
 tests/test_io_yaml_parity.py      | 152 ++++++++++++++
 5 files changed, 502 insertions(+), 2 deletions(-)
 create mode 100644 docs/reference/yaml_format.md

diff --git a/docs/guide/io_and_manipulation.md b/docs/guide/io_and_manipulation.md
index 73c4876..aab7dc0 100644
--- a/docs/guide/io_and_manipulation.md
+++ b/docs/guide/io_and_manipulation.md
@@ -8,7 +8,9 @@ unchanged. On top of that it adds the RAVEN-specific formats:
 - {func}`raven_python.io.read_yaml_model` / {func}`raven_python.io.write_yaml_model` —
   cobra-standard YAML (the `!!omap` layout), transparently handling `.yml.gz`. RAVEN-only and
   GECKO `ec-*` side-fields are preserved on each entry's `notes` so a read→write round-trip is
-  lossless.
+  lossless. The full schema (top-level layout, field order, quoting rules, the GECKO
+  `ec-*` and `metaData` extensions) is documented in
+  [the YAML model format reference](../reference/yaml_format.md).
 - {func}`raven_python.io.export_model_to_sif` — Cytoscape SIF (`rc` / `rr` / `cc` graphs).
 - {func}`raven_python.io.export_to_excel` — the RAVEN 5-sheet workbook (RXNS / METS / COMPS /
   GENES / MODEL). Requires the `excel` extra. Excel **import** is intentionally not provided.
diff --git a/docs/reference/index.md b/docs/reference/index.md
index 01b96a6..865fc70 100644
--- a/docs/reference/index.md
+++ b/docs/reference/index.md
@@ -5,6 +5,9 @@ Conceptual and API reference for raven-python.
 - **[RAVEN ↔ raven-python migration map](migration.md)** — the function-by-function map
   from MATLAB RAVEN to raven-python (and cobrapy where appropriate). Start here if you're
   porting RAVEN code.
+- **[YAML model format](yaml_format.md)** — the shared YAML schema produced and consumed
+  by cobrapy, raven-python, and RAVEN MATLAB, with a fully-annotated example and the
+  field-order / quoting rules.
 - **[MATLAB RAVEN back-port proposals](matlab_raven_backports.md)** — improvements
   raven-python makes that are candidates to back-port into the MATLAB toolbox.
 - **[Improvements over RAVEN](improvements.md)** — the full catalogue of correctness /
@@ -16,6 +19,7 @@ Conceptual and API reference for raven-python.
 :hidden:
 
 migration
+yaml_format
 matlab_raven_backports
 improvements
 api/index
diff --git a/docs/reference/yaml_format.md b/docs/reference/yaml_format.md
new file mode 100644
index 0000000..2d43eae
--- /dev/null
+++ b/docs/reference/yaml_format.md
@@ -0,0 +1,332 @@
+# RAVEN / cobrapy YAML model format
+
+This document describes the YAML format produced and consumed by
+
+- **cobrapy** ([`cobra.io.{load,save}_yaml_model`](https://github.com/opencobra/cobrapy))
+- **raven-python** (`raven_python.io.yaml.{read,write}_yaml_model`, see [API](api/index.md))
+- **RAVEN MATLAB** (`readYAMLmodel.m` / `writeYAMLmodel.m` in the [RAVEN repo](https://github.com/SysBioChalmers/RAVEN/tree/feat/yeast-gem-shared/io))
+
+The same file can be round-tripped through any of the three. cobrapy is the canonical core; raven-python and RAVEN MATLAB add namespaced extensions (RAVEN curation fields, MIRIAM cross-refs already covered by cobrapy's `annotation`, and the GECKO `ec-*` sections) without disturbing the cobra-known shape.
+
+---
+
+## At a glance
+
+```yaml
+!!omap
+- metaData: !!omap
+    - id: yeastGEM_develop
+    - name: The Consensus Genome-Scale Metabolic Model of Yeast
+    - version: 9.0.0
+    - date: 2026-05-27
+    - taxonomy: taxonomy/559292
+- metabolites:
+    - !!omap
+      - id: s_0001
+      - name: ATP
+      - compartment: c
+      - charge: -4
+      - formula: C10H16N5O13P3
+      - annotation: !!omap
+          - kegg.compound: C00002
+          - smiles: "[O-]P(=O)([O-])OP(=O)([O-])O..."
+      - inchis: InChI=1S/C10H16N5O13P3/...
+      - deltaG: -2768.1
+- reactions:
+    - !!omap
+      - id: r_0001
+      - name: hexokinase
+      - metabolites: !!omap
+          - s_0001: -1.0
+          - s_0568: -1.0
+          - s_0394: 1.0
+          - s_0423: 1.0
+      - lower_bound: 0.0
+      - upper_bound: 1000.0
+      - gene_reaction_rule: YGL253W or YCL040W or YFR053C
+      - subsystem: Glycolysis / Gluconeogenesis
+      - notes: "MetaNetX ID curated (PR #220)"
+      - annotation: !!omap
+          - kegg.reaction: R00299
+          - sbo: SBO:0000176
+      - eccodes: 2.7.1.1
+      - deltaG: -17.39
+      - confidence_score: 2.0
+- genes:
+    - !!omap
+      - id: YGL253W
+      - name: HXK2
+      - annotation: !!omap
+          - uniprot: P04807
+- compartments: !!omap
+    - c: cytoplasm
+    - e: extracellular
+```
+
+Three structural rules are non-obvious and worth pointing out before the field-by-field detail:
+
+1. The whole document is one **ordered mapping** — `!!omap` — at the root. Every nested map that should preserve key order is also `!!omap` (metaData, each metabolite / reaction / gene entry, `annotation`, `metabolites`, `compartments`, and the ec sections).
+2. Each metabolite, reaction, and gene is **one `- !!omap` element** of a list. Inside that mapping, every field is written as `- key: value`. This is cobrapy's native shape and is what RAVEN MATLAB's reader keys off.
+3. Strings are **unquoted by default**; quotes appear only when YAML would otherwise misparse the value (leading `-`, `[`, `?` or `:`; embedded `: ` or ` #`; values that look like `true` / `false` / `null`).
+
+---
+
+## Top-level layout
+
+```
+!!omap
+- metaData: !!omap    # optional; RAVEN extension
+- metabolites:        # required
+- reactions:          # required
+- genes:              # required (may be `genes: []`)
+- compartments: !!omap # required
+- gecko_light: <bool> # optional; GECKO extension
+- ec-rxns:            # optional; GECKO extension
+- ec-enzymes:         # optional; GECKO extension
+```
+
+| Key | Required | Source | Notes |
+|-----|----------|--------|-------|
+| `metaData` | optional | RAVEN | Provenance block. Holds `id`, `name`, `version`, `date`, `taxonomy`, optionally `givenName` / `familyName` / `email` / `organization` / `note` / `sourceUrl`, plus `defaultLB` / `defaultUB`. Cobrapy ignores this block (no semantic loss for the core model). |
+| `metabolites` | yes | cobra core | Ordered list of `- !!omap` entries. |
+| `reactions` | yes | cobra core | Ordered list of `- !!omap` entries. |
+| `genes` | yes | cobra core | Ordered list; may be `genes: []` for a model with no genes. |
+| `compartments` | yes | cobra core | `!!omap` of `<code>: <full name>`. |
+| `gecko_light` | optional | GECKO | Scalar boolean. Cobrapy / raven-python emit this at the top level; the older spelling `geckoLight` inside `metaData` is still accepted on read. |
+| `ec-rxns` | optional | GECKO | Per-reaction kcat / source / enzymes coupling table. |
+| `ec-enzymes` | optional | GECKO | Per-enzyme MW / sequence / concentration table. |
+
+Cobrapy writes `id` / `name` / `version` at the root level instead of inside `metaData`. The RAVEN readers accept both placements; the RAVEN writers normalize to the `metaData` form.
+
+---
+
+## Metabolite entry
+
+Field order (cobra-core first, then RAVEN extensions):
+
+```yaml
+- !!omap
+  - id: s_0001               # required
+  - name: ATP                # cobra
+  - compartment: c           # cobra
+  - charge: -4               # cobra (number)
+  - formula: C10H16N5O13P3   # cobra
+  - notes: "free-text"       # cobra
+  - annotation: !!omap       # cobra (MIRIAM + smiles)
+      - kegg.compound: C00002
+      - chebi:
+          - CHEBI:15422
+          - CHEBI:30616
+      - sbo: SBO:0000247
+      - smiles: "OC1=NC..."  # quoted when it contains [ ] : etc.
+  - inchis: "InChI=1S/..."   # RAVEN extension
+  - deltaG: -2768.1          # RAVEN extension
+  - metFrom: KEGG            # RAVEN extension
+```
+
+Cobrapy emits exactly the first seven keys (the cobra-core block). raven-python and RAVEN MATLAB additionally emit `inchis`, `deltaG`, and `metFrom` when those fields are populated. On read, cobrapy puts the RAVEN extensions on the metabolite as attribute fall-through; raven-python captures them into `metabolite.notes` (keyed by their YAML name); RAVEN MATLAB stores them on `model.inchis` / `model.metDeltaG` / `model.metFrom`.
+
+Annotation entries with multiple values are emitted as a YAML list (`chebi:` then several `-` items). Single-value entries are emitted inline (`kegg.compound: C00002`). SMILES strings live inside the annotation block under the `smiles` key — not as a top-level metabolite field, which is the historical RAVEN MATLAB shape and is still accepted on read for backward compatibility.
+
+---
+
+## Reaction entry
+
+```yaml
+- !!omap
+  - id: r_0001                                    # required
+  - name: hexokinase                              # cobra
+  - metabolites: !!omap                           # cobra (sorted by met id)
+      - s_0001: -1.0
+      - s_0394: 1.0
+  - lower_bound: 0.0                              # cobra (number)
+  - upper_bound: 1000.0                           # cobra (number)
+  - gene_reaction_rule: YGL253W or YCL040W        # cobra
+  - objective_coefficient: 1                      # cobra; omitted when 0
+  - subsystem: Glycolysis / Gluconeogenesis       # cobra
+  - notes: "MetaNetX ID curated (PR #220)"        # cobra
+  - annotation: !!omap                            # cobra
+      - kegg.reaction: R00299
+      - sbo: SBO:0000176
+  - eccodes:                                      # RAVEN extension
+      - 2.7.1.1
+      - 2.7.1.2
+  - references: "PMID:12345"                      # RAVEN extension
+  - rxnFrom: KEGG                                 # RAVEN extension
+  - deltaG: -17.39                                # RAVEN extension
+  - confidence_score: 2.0                         # RAVEN extension
+```
+
+Some fields are conditional:
+
+- `objective_coefficient` is only written when non-zero (cobrapy convention).
+- The `metabolites` block uses `!!omap []` (flow-style empty omap) when the reaction has no metabolites — this keeps the file a valid YAML 1.2 document.
+- `eccodes` is written inline (`eccodes: 2.7.1.1`) when there is exactly one code, and as a list when there are several. Same for `references`.
+
+**Notes key naming.** Cobrapy and the current raven-python / RAVEN MATLAB writers use **`notes`**. Pre-`feat/yeast-gem-shared` yeast-GEM files used `rxnNotes`; both readers accept that as a legacy alias.
+
+**Bounds typing.** Bounds are emitted as floats with an explicit decimal point (`1000.0`, `-1000.0`), matching Python's float repr and cobrapy's output.
+
+---
+
+## Gene entry
+
+```yaml
+- !!omap
+  - id: YGL253W              # required
+  - name: HXK2               # cobra; omitted when empty
+  - annotation: !!omap       # cobra
+      - uniprot: P04807
+      - ncbigene: 856421
+  - protein: P04807          # RAVEN extension
+```
+
+Empty names (`name: ''`) are not emitted (matches RAVEN MATLAB's historical behavior).
+
+---
+
+## Compartments
+
+```yaml
+- compartments: !!omap
+    - c: cytoplasm
+    - e: extracellular
+    - m: mitochondrion
+```
+
+Just an `!!omap` of `<short code>: <human-readable name>` pairs. Compartments don't carry their own MIRIAMs in the current format.
+
+---
+
+## metaData (RAVEN extension)
+
+```yaml
+- metaData: !!omap
+    - id: yeastGEM_develop
+    - name: The Consensus Genome-Scale Metabolic Model of Yeast
+    - version: 9.0.0
+    - date: 2026-05-27
+    - defaultLB: -1000.0
+    - defaultUB: 1000.0
+    - givenName: Eduard
+    - familyName: Kerkhoven
+    - email: eduardk@chalmers.se
+    - organization: Chalmers University of Technology
+    - taxonomy: taxonomy/559292
+    - note: "Saccharomyces cerevisiae - strain S288C"
+    - sourceUrl: https://github.com/SysBioChalmers/yeast-GEM
+```
+
+Pure provenance. Cobrapy ignores the block; raven-python keeps the verbatim dictionary on `model.notes['metaData']` and additionally lifts `id` / `name` / `version` to `model.id` / `model.name` / `model.notes['version']` so cobra-shape accessors find them. RAVEN MATLAB populates `model.id` / `model.name` / `model.version` / `model.annotation.*` from the same fields.
+
+`date` is preserved across round-trips when present on the model; otherwise the writer fills in `YYYY-MM-DD` of the current date.
+
+---
+
+## GECKO sections
+
+For enzyme-constrained models, three additional top-level keys carry the EC layer:
+
+```yaml
+- gecko_light: false        # true for the "light" formulation
+- ec-rxns:
+    - !!omap
+      - id: r_0001
+      - kcat: 25.3
+      - source: brenda
+      - notes: ""
+      - eccodes: 2.7.1.1
+      - enzymes: !!omap
+          - P04807: 1.0
+- ec-enzymes:
+    - !!omap
+      - genes: YGL253W
+      - enzymes: P04807
+      - mw: 53942
+      - sequence: "MVHLGPK..."
+      - concs: .nan
+```
+
+These map onto `model.ec` in RAVEN MATLAB and `raven_python.io.ec_data.EcData` (attached as `model.ec`) in raven-python. Cobrapy ignores the sections.
+
+The older spelling `geckoLight` inside `metaData` is also accepted on read.
+
+---
+
+## Annotations
+
+The `annotation` block uses MIRIAM-style namespace keys. Cobrapy treats the block as a free-form dictionary; raven-python preserves it verbatim through `cobra.Metabolite.annotation` / `Reaction.annotation` / `Gene.annotation`; RAVEN MATLAB maps it to `model.metMiriams` / `rxnMiriams` / `geneMiriams`.
+
+- A single value is written inline: `kegg.compound: C00002`.
+- Multiple values are written as a YAML list:
+
+  ```yaml
+  - chebi:
+      - CHEBI:15422
+      - CHEBI:30616
+  ```
+
+- The `smiles` key inside a metabolite's `annotation` carries the SMILES string (cobrapy convention). RAVEN MATLAB historically emitted `smiles` as a metabolite top-level field; both readers still accept that, but writes are normalized to the annotation block.
+- The `sbo` key carries the Systems Biology Ontology term assigned by `assignSBOterms` / `add_sbo_terms`.
+
+---
+
+## Numbers, strings, quoting
+
+**Numbers.** Whole-number floats are written with an explicit `.0` (`1000.0`, `-1000.0`, `0.0`). Other floats use up to 15 significant digits (`-17.39`, `-2768.1`). `NaN` is encoded as `.nan`; `+Inf` / `-Inf` as `.inf` / `-.inf` (YAML 1.2 conventions).
+
+**Strings.** Default style is bare (no quotes). The writer falls back to double-quoted style when the value:
+
+- starts with `-`, `?`, `:`, or any flow indicator (`[`, `]`, `{`, `}`, `,`, `&`, `*`, `!`, `|`, `>`, `%`, `@`, `` ` ``, `#`);
+- contains `: ` (would otherwise be parsed as a key/value), ` #` (comment), or one of `[`, `]`, `{`, `}`;
+- has leading or trailing whitespace;
+- spells a YAML reserved word case-insensitively (`true`, `false`, `null`, `yes`, `no`, `on`, `off`, `~`).
+
+In a double-quoted string, only `\` and `"` are escaped. Other characters (including Unicode and newlines if the underlying model permitted them) are passed through.
+
+---
+
+## Tooling interoperability matrix
+
+| File written by ↓ \ Reader → | cobrapy | raven-python | RAVEN MATLAB |
+|---|---|---|---|
+| cobrapy (`save_yaml_model`) | full | full + extras land in `notes` via attribute fall-through | works for root-level `id` / `name` / `version` (added in this release) |
+| raven-python (`write_yaml_model`) | core (no `metaData`-derived `id`); RAVEN extras live as unknown top-level keys but don't break parsing | full | full |
+| RAVEN MATLAB (`writeYAMLmodel`) | core (no `metaData`-derived `id`); RAVEN extras land via attribute fall-through | full | full |
+
+"Full" = every field read back into its canonical position on the model object; "core" = cobrapy-known fields, RAVEN extensions ignored or kept on the object as attribute fall-through (`reaction.eccodes` etc., not re-emitted on save). A round-trip through cobrapy is therefore **lossy for RAVEN extensions** — only the core fields survive `cobrapy.load → cobrapy.save`. Round-trips through raven-python or RAVEN MATLAB are lossless.
+
+---
+
+## What round-tripping looks like
+
+Loading `yeast-GEM.yml` (2748 metabolites, 4102 reactions, 1143 genes) and re-writing it through any of the three tools preserves every documented piece of content:
+
+| Count | After round-trip |
+|---|---|
+| metabolites | 2748 / 2748 |
+| reactions | 4102 / 4102 |
+| genes | 1143 / 1143 |
+| reactions with eccodes | 2411 |
+| reactions with deltaG | 3984 |
+| metabolites with deltaG | 2696 |
+| metabolites with SMILES | 1788 |
+| reactions with notes (rxnNotes) | 1443 |
+
+(Cobrapy round-trips give 2748 / 4102 / 1143 for the core but drop the RAVEN extensions in the rightmost column — that's the documented loss.)
+
+---
+
+## What changed in `feat/yeast-gem-shared`
+
+- raven-python writer no longer drops `!!omap` tags (was producing files RAVEN MATLAB's reader couldn't load).
+- raven-python now preserves `eccodes` and accepts the legacy `rxnNotes` reaction key on read.
+- RAVEN MATLAB writer reorders metabolite / reaction fields to match cobrapy.
+- RAVEN MATLAB writer renames the reaction `rxnNotes` key to `notes` and emits SMILES inside the annotation block (still accepts both shapes on read).
+- RAVEN MATLAB writer's `preserveQuotes` default is now `false`; values that need quoting (SMILES with `[O-]`, leading flow indicators, booleans, `: `-containing strings) are quoted defensively per value.
+- RAVEN MATLAB writer emits whole-number bounds as `1000.0` (matches cobrapy / Python float repr) instead of `1000`.
+- RAVEN MATLAB reader accepts cobrapy's root-level `id` / `name` / `version` / `gecko_light`, the `!!omap`-tagged `metaData` header, and `notes` (canonical) in addition to `rxnNotes` (legacy).
+- Empty `reaction.metabolites` blocks are emitted as `!!omap []` (valid YAML 1.2) rather than an empty `!!omap` with no value.
+- Document-start marker `---` dropped to match cobrapy's bare `!!omap` root.
+
+These changes are byte-stable for cobrapy and raven-python users; existing yeast-GEM YAML files continue to load. The first time a yeast-GEM curation pass rewrites the file with the new MATLAB writer, the diff will look large (because of the reordering and quote-style changes) but the model content is unchanged.
diff --git a/src/raven_python/io/yaml.py b/src/raven_python/io/yaml.py
index d363ee9..3ea96f9 100644
--- a/src/raven_python/io/yaml.py
+++ b/src/raven_python/io/yaml.py
@@ -235,7 +235,17 @@ def model_from_yaml_data(raw: dict) -> cobra.Model:
         model.notes["version"] = version
 
     # Pop the ec sections out of `foreign` and into a typed EcData.
-    # The remaining unknown keys round-trip opaquely.
+    # The remaining unknown keys round-trip opaquely. Pre-shim RAVEN
+    # MATLAB writes wrote `geckoLight: "true"` inside metaData (rather
+    # than the current top-level `gecko_light`); honour the legacy
+    # placement too — keep the metaData entry untouched (round-trip)
+    # and surface it at the top level so EcData picks it up.
+    legacy_gecko = metadata.get("geckoLight")
+    if legacy_gecko is not None and "gecko_light" not in foreign:
+        if isinstance(legacy_gecko, str):
+            foreign["gecko_light"] = legacy_gecko.lower() == "true"
+        else:
+            foreign["gecko_light"] = bool(legacy_gecko)
     ec_sections = {k: foreign.pop(k) for k in list(foreign) if k in _EC_TOP_KEYS}
     ec_data = ec_data_from_yaml_sections(ec_sections)
     if ec_data is not None:
diff --git a/tests/test_io_yaml_parity.py b/tests/test_io_yaml_parity.py
index 459f563..2c89f58 100644
--- a/tests/test_io_yaml_parity.py
+++ b/tests/test_io_yaml_parity.py
@@ -187,6 +187,158 @@ def test_smiles_with_yaml_special_chars_quoted(src, tmp_path):
     ]
 
 
+PRE_SHIM_YAML = """\
+---
+!!omap
+- metaData:
+    id: "eciYali"
+    name: "Yarrowia lipolytica"
+    version: "1.0"
+    date: "2024-10-17"
+    geckoLight: "true"
+- metabolites:
+    - !!omap
+      - id: "s_0001"
+      - name: "ATP"
+      - compartment: "c"
+      - formula: "C10H16N5O13P3"
+      - charge: -4
+      - inchis: "InChI=1S/CH4"
+      - smiles: "[O-]P(=O)([O-])OP(=O)([O-])O"
+      - annotation: !!omap
+          - kegg.compound: "C00002"
+          - sbo: "SBO:0000247"
+      - deltaG: 12.5
+      - notes: "metabolite note"
+      - metFrom: "KEGG"
+- reactions:
+    - !!omap
+      - id: "r_0001"
+      - name: "hexokinase"
+      - metabolites: !!omap
+          - s_0001: -1
+      - lower_bound: -1000
+      - upper_bound: 1000
+      - gene_reaction_rule: "G1"
+      - rxnFrom: "KEGG"
+      - eccodes: "2.7.1.1"
+      - references: "PMID:12345"
+      - subsystem: "Glycolysis"
+      - annotation: !!omap
+          - kegg.reaction: "R00299"
+          - sbo: "SBO:0000176"
+      - deltaG: -17.39
+      - confidence_score: 2
+      - rxnNotes: "old reaction note"
+- genes:
+    - !!omap
+      - id: "G1"
+      - name: "HXK1"
+      - protein: "P01234"
+      - annotation: !!omap
+          - uniprot: "P01234"
+- compartments: !!omap
+    - c: "cytoplasm"
+- ec-rxns:
+    - !!omap
+      - id: "r_0001"
+      - kcat: 25.3
+      - enzymes: !!omap
+          - P01234: 1
+- ec-enzymes:
+    - !!omap
+      - genes: "G1"
+      - enzymes: "P01234"
+      - mw: 50000
+"""
+
+
+def test_pre_shim_format_loads(tmp_path):
+    """The pre-`feat/yeast-gem-shared` RAVEN MATLAB writer emitted a
+    file shape that differs from the current one in seven concrete
+    ways. The reader must continue to load every one of them:
+
+      1. ``---`` document-start marker (kept by old MATLAB writer)
+      2. ``- metaData:`` as a plain block mapping (no ``!!omap`` tag)
+      3. ``geckoLight: "true"`` *inside* metaData (now emitted as a
+         top-level ``gecko_light``)
+      4. Metabolite ``smiles`` as a top-level entry key (now emitted
+         inside the ``annotation`` block)
+      5. Reaction notes under the ``rxnNotes`` key (now emitted as
+         ``notes``)
+      6. Integer-typed bounds / coefficients (now emitted as floats)
+      7. Every string double-quoted (now bare unless YAML requires
+         quoting)
+
+    Each item below maps to one of those seven cases.
+    """
+    p = tmp_path / "pre_shim.yml"
+    p.write_text(PRE_SHIM_YAML)
+    model = read_yaml_model(p)
+
+    # metaData survives + provenance is lifted onto the cobra-shape
+    # accessors (cases 1 + 2).
+    assert model.id == "eciYali"
+    assert model.name == "Yarrowia lipolytica"
+    assert model.notes["version"] == "1.0"
+    assert model.notes["metaData"]["taxonomy" if "taxonomy" in model.notes["metaData"] else "date"]
+
+    # geckoLight in metaData populates the typed EcData (case 3).
+    assert model.ec is not None
+    assert model.ec.gecko_light is True
+    assert model.ec.rxns == ["r_0001"]
+    assert model.ec.kcat[0] == 25.3
+
+    # Top-level smiles lifted into annotation.smiles (case 4).
+    a = model.metabolites.get_by_id("s_0001")
+    assert a.annotation["smiles"] == ["[O-]P(=O)([O-])OP(=O)([O-])O"]
+    assert "smiles" not in a.notes  # stays in annotation, not notes
+
+    # rxnNotes read as the canonical notes key (case 5).
+    r = model.reactions.get_by_id("r_0001")
+    assert r.notes["note"] == "old reaction note"
+
+    # Integer bounds become floats inside cobra (case 6).
+    assert r.bounds == (-1000.0, 1000.0)
+    assert isinstance(r.lower_bound, float)
+
+    # Quoted strings unquote cleanly (case 7) — verified implicitly by
+    # all the equality assertions above. Spot check the metabolite
+    # name, which used double quotes in the source.
+    assert a.name == "ATP"
+
+    # Other RAVEN extras still preserved.
+    assert a.notes["inchis"] == "InChI=1S/CH4"
+    assert a.notes["deltaG"] == 12.5
+    assert a.notes["note"] == "metabolite note"
+    assert a.notes["metFrom"] == "KEGG"
+    assert r.notes["rxnFrom"] == "KEGG"
+    assert r.notes["eccodes"] == "2.7.1.1"
+    assert r.notes["references"] == "PMID:12345"
+    assert r.notes["confidence_score"] == 2
+    assert r.notes["deltaG"] == -17.39
+    assert model.genes.get_by_id("G1").notes["protein"] == "P01234"
+
+
+def test_pre_shim_yeast_gem_loads_if_available():
+    """The real pre-shim yeast-GEM.yml: 2748 mets, 4102 rxns, 1143
+    genes. Skipped when the working copy isn't mounted (CI runners)."""
+    real = Path("/mnt/c/Work/GitHub/yeast-GEM/model/yeast-GEM.yml")
+    if not real.exists():
+        pytest.skip("yeast-GEM.yml not available in this environment")
+    model = read_yaml_model(real)
+    assert model.id == "yeastGEM_develop"
+    assert len(model.metabolites) == 2748
+    assert len(model.reactions) == 4102
+    assert len(model.genes) == 1143
+    # Every RAVEN extension we know about must come through.
+    assert sum(1 for r in model.reactions if r.notes.get("eccodes")) == 2411
+    assert sum(1 for r in model.reactions if r.notes.get("deltaG") is not None) == 3984
+    assert sum(1 for m in model.metabolites if m.notes.get("deltaG") is not None) == 2696
+    assert sum(1 for m in model.metabolites if "smiles" in (m.annotation or {})) == 1788
+    assert sum(1 for r in model.reactions if r.notes.get("note")) == 1443
+
+
 def test_eccodes_round_trip_through_cobra_extras(src, tmp_path):
     """A model loaded from cobra (no eccodes awareness) and re-written
     via raven_python.write_yaml_model still keeps eccodes — they're