Skip to content
Merged
54 changes: 51 additions & 3 deletions src/raven_python/io/yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@

* older RAVEN files with ``id`` / ``name`` nested in ``metaData``;
* per-metabolite top-level ``smiles`` (lifted into ``annotation['smiles']``);
* per-reaction top-level ``eccodes`` (lifted into ``annotation['ec-code']`` —
the cobra-standard place where geckopy reads EC numbers);
* very old RAVEN files written as a bare ``-`` sequence of single-key mappings
rather than one big mapping;
* MATLAB GECKO ecModels whose ``usage_prot_*`` and ``prot_pool_exchange``
Expand Down Expand Up @@ -66,7 +68,6 @@ def _open_text(path: str | Path, mode: str):
# 'note' to avoid colliding with the notes container itself.)
_MET_FIELDS = (("inchis", "inchis"), ("deltaG", "deltaG"), ("metFrom", "metFrom"), ("notes", "note"))
_RXN_FIELDS = (
("eccodes", "eccodes"),
("references", "references"),
("rxnFrom", "rxnFrom"),
("deltaG", "deltaG"),
Expand Down Expand Up @@ -168,8 +169,9 @@ def model_from_yaml_data(raw: dict) -> cobra.Model:
out of legacy ``metaData``; preserves ``version`` and ``metaData``
on ``model.notes`` for round-trip.
2. **legacy quirks:** lifts per-metabolite top-level ``smiles`` into
``annotation['smiles']`` and per-reaction top-level ``eccodes`` into
``annotation['ec-code']`` (older RAVEN/MATLAB GECKO files emitted
these at the top level); flips the older reverse-direction
``usage_prot_*`` / ``prot_pool_exchange`` convention to the
forward convention.
3. **GECKO ec sections:** when ``ec-rxns`` / ``ec-enzymes`` are
Expand All @@ -189,6 +191,13 @@ def model_from_yaml_data(raw: dict) -> cobra.Model:
# canonical place. No-op on current files.
_lift_smiles_to_annotation(raw.get("metabolites"))

# Legacy quirk: per-reaction top-level `eccodes` -> annotation['ec-code'].
# EC numbers are standard cobra annotation; older RAVEN/MATLAB files put
# them at the reaction top level, which hid them from cobra/geckopy (which
# read annotation['ec-code']). Lift before model_from_dict. No-op on
# current cobra-shaped files.
_lift_eccodes_to_annotation(raw.get("reactions"))

# Normalise legacy reaction-side YAML keys (e.g. RAVEN MATLAB's
# ``rxnNotes`` -> the canonical ``notes``) before any field capture so
# the capture step sees a single key per concept.
Expand Down Expand Up @@ -284,6 +293,45 @@ def _lift_smiles_to_annotation(metabolites) -> None:
)


def _lift_eccodes_to_annotation(reactions) -> None:
    """Relocate each reaction's legacy ``eccodes`` key into its annotation.

    Older RAVEN/MATLAB writers stored EC numbers as a top-level ``eccodes``
    entry on the reaction mapping (either a ``;``-joined string or a list).
    The cobra-standard home for them is ``annotation['ec-code']``, which is
    where cobra and geckopy look them up, so this moves them there.

    The raw reaction mappings are edited in place and nothing is returned:

    * anything that is not a list of reactions is ignored;
    * entries that are not mappings, or carry no ``eccodes`` key, are skipped;
    * the ``eccodes`` key is always removed once seen, even if it turns out
      to hold no usable codes;
    * a missing or non-mapping ``annotation`` is replaced by a fresh dict,
      but only when there is at least one code to store;
    * an ``annotation['ec-code']`` that already exists is kept as-is.
    """
    if not isinstance(reactions, list):
        return

    for entry in reactions:
        if not isinstance(entry, dict) or "eccodes" not in entry:
            continue

        # Always drop the legacy key, whether or not it yields any codes.
        ec_numbers = _eccodes_to_list(entry.pop("eccodes"))
        if ec_numbers:
            existing = entry.get("annotation")
            if isinstance(existing, dict):
                target = existing
            else:
                target = {}
                entry["annotation"] = target

            # A native ec-code annotation takes precedence over the lifted one.
            if "ec-code" not in target:
                target["ec-code"] = ec_numbers


def _eccodes_to_list(value) -> list:
"""Normalise a RAVEN ``eccodes`` value to a list of trimmed code strings.

Accepts a ``;``-joined string (RAVEN MATLAB's ``getECstring`` form) or an
already-split list; drops empty tokens.
"""
if value is None:
return []
items = value if isinstance(value, list) else str(value).split(";")
return [str(s).strip() for s in items if str(s).strip()]


def _flip_legacy_prot_direction(model: cobra.Model) -> None:
"""Flip pre-forward-direction protein reactions in place.

Expand Down
38 changes: 19 additions & 19 deletions tests/test_io_yaml_parity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
must produce a file that:

* cobra.io.load_yaml_model can read (the cobrapy-canonical core);
* keeps every RAVEN-only field (inchis / deltaG / rxnFrom /
metFrom / references / confidence_score / rxnNotes / protein /
metMiriams / rxnMiriams / annotation-side SMILES and EC codes);
* emits ``!!omap`` tags on each per-entry mapping (so RAVEN MATLAB's
line-based reader can ingest it);
* places the ``metaData`` block first, matching RAVEN MATLAB's layout.
Expand Down Expand Up @@ -108,9 +108,10 @@ def test_round_trip_preserves_every_raven_field(src, tmp_path):
assert a.notes["note"] == "metabolite note"
assert a.annotation["smiles"] == ["C1=NC2=C(N=CN2)N(C1=O)C"]

# Reaction RAVEN extras (incl. the eccodes round-trip that earlier
# versions dropped on write).
assert r.notes["eccodes"] == "1.1.1.1"
# EC codes round-trip through cobra annotation (the cobra-native place,
# where geckopy reads them), not a RAVEN-only top-level/notes field.
assert r.annotation["ec-code"] == ["1.1.1.1"]
assert "eccodes" not in r.notes
assert r.notes["references"] == "PMID:123"
assert r.notes["rxnFrom"] == "manual"
assert r.notes["confidence_score"] == 2
Expand Down Expand Up @@ -312,7 +313,9 @@ def test_pre_shim_format_loads(tmp_path):
assert a.notes["note"] == "metabolite note"
assert a.notes["metFrom"] == "KEGG"
assert r.notes["rxnFrom"] == "KEGG"
assert r.notes["eccodes"] == "2.7.1.1"
# legacy top-level eccodes lifted into the cobra-native annotation['ec-code']
assert r.annotation["ec-code"] == ["2.7.1.1"]
assert "eccodes" not in r.notes
assert r.notes["references"] == "PMID:12345"
assert r.notes["confidence_score"] == 2
assert r.notes["deltaG"] == -17.39
Expand All @@ -331,34 +334,31 @@ def test_pre_shim_yeast_gem_loads_if_available():
assert len(model.reactions) == 4102
assert len(model.genes) == 1143
# Every RAVEN extension we know about must come through.
assert sum(1 for r in model.reactions if r.notes.get("eccodes")) == 2411
assert sum(1 for r in model.reactions if r.annotation.get("ec-code")) == 2411
assert sum(1 for r in model.reactions if r.notes.get("deltaG") is not None) == 3984
assert sum(1 for m in model.metabolites if m.notes.get("deltaG") is not None) == 2696
assert sum(1 for m in model.metabolites if "smiles" in (m.annotation or {})) == 1788
assert sum(1 for r in model.reactions if r.notes.get("note")) == 1443


def test_eccodes_round_trip_through_cobra_extras(src, tmp_path):
    """EC codes round-trip as cobra annotation through a
    raven_python -> cobra -> raven_python loop. They live in
    ``annotation['ec-code']`` — the cobra-native place — so plain
    ``cobra.io`` preserves them with no RAVEN-specific handling, and
    geckopy (which reads ``annotation['ec-code']``) sees them."""
    model = read_yaml_model(src)
    pass1 = tmp_path / "via_rp.yml"
    write_yaml_model(model, pass1)
    # Plain cobra reads annotation['ec-code'] natively — this is the
    # interop the alignment guarantees.
    via_cobra = cobra.io.load_yaml_model(str(pass1))
    assert via_cobra.reactions.get_by_id("R1").annotation["ec-code"] == ["1.1.1.1"]
    pass2 = tmp_path / "via_rp2.yml"
    # Second raven_python pass: re-read the first output and write it again,
    # then confirm the EC codes are still carried in the annotation.
    again = read_yaml_model(pass1)
    write_yaml_model(again, pass2)
    final = read_yaml_model(pass2)
    assert final.reactions.get_by_id("R1").annotation["ec-code"] == ["1.1.1.1"]
    # And cobra can still read the final result.
    cm = cobra.io.load_yaml_model(str(pass2))
    assert cm.reactions.get_by_id("R1").bounds == (-1000.0, 1000.0)
Loading