diff --git a/src/raven_python/io/yaml.py b/src/raven_python/io/yaml.py index 3ea96f9..47cc751 100644 --- a/src/raven_python/io/yaml.py +++ b/src/raven_python/io/yaml.py @@ -30,6 +30,8 @@ * older RAVEN files with ``id`` / ``name`` nested in ``metaData``; * per-metabolite top-level ``smiles`` (lifted into ``annotation['smiles']``); +* per-reaction top-level ``eccodes`` (lifted into ``annotation['ec-code']`` — + the cobra-standard place where geckopy reads EC numbers); * very old RAVEN files written as a bare ``-`` sequence of single-key mappings rather than one big mapping; * MATLAB GECKO ecModels whose ``usage_prot_*`` and ``prot_pool_exchange`` @@ -66,7 +68,6 @@ def _open_text(path: str | Path, mode: str): # 'note' to avoid colliding with the notes container itself.) _MET_FIELDS = (("inchis", "inchis"), ("deltaG", "deltaG"), ("metFrom", "metFrom"), ("notes", "note")) _RXN_FIELDS = ( - ("eccodes", "eccodes"), ("references", "references"), ("rxnFrom", "rxnFrom"), ("deltaG", "deltaG"), @@ -168,8 +169,9 @@ def model_from_yaml_data(raw: dict) -> cobra.Model: out of legacy ``metaData``; preserves ``version`` and ``metaData`` on ``model.notes`` for round-trip. 2. **legacy quirks:** lifts per-metabolite top-level ``smiles`` into - ``annotation['smiles']`` (older MATLAB GECKO ecModels emitted it - at the top level); flips the older reverse-direction + ``annotation['smiles']`` and per-reaction top-level ``eccodes`` into + ``annotation['ec-code']`` (older RAVEN/MATLAB GECKO files emitted + these at the top level); flips the older reverse-direction ``usage_prot_*`` / ``prot_pool_exchange`` convention to the forward convention. 3. **GECKO ec sections:** when ``ec-rxns`` / ``ec-enzymes`` are @@ -189,6 +191,13 @@ def model_from_yaml_data(raw: dict) -> cobra.Model: # canonical place. No-op on current files. _lift_smiles_to_annotation(raw.get("metabolites")) + # Legacy quirk: per-reaction top-level `eccodes` -> annotation['ec-code']. 
def _lift_eccodes_to_annotation(reactions) -> None:
    """Move a reaction's legacy top-level ``eccodes`` into ``annotation['ec-code']``.

    Older RAVEN/MATLAB writers emitted EC numbers as a top-level ``eccodes``
    entry on each reaction (a ``;``-joined string or a list of codes). The
    canonical place is ``annotation['ec-code']``, so the value is lifted
    there before the raw mapping is turned into a model.

    Args:
        reactions: The raw ``reactions`` sequence from the parsed YAML
            document. Anything that is not a ``list`` is ignored, as are
            entries that are not ``dict`` or carry no ``eccodes`` key.

    Returns:
        ``None``. The reaction mappings are normalised in place:

        * the top-level ``eccodes`` key is always removed when present;
        * if it yields at least one code and the reaction has no usable
          ``annotation['ec-code']``, the codes are stored there as a list;
        * a non-empty native ``annotation['ec-code']`` wins and is left
          untouched.
    """
    if not isinstance(reactions, list):
        return
    for rxn in reactions:
        if not (isinstance(rxn, dict) and "eccodes" in rxn):
            continue
        # Pop unconditionally so the legacy key never reaches later field
        # capture, even when it turns out to hold no usable code.
        codes = _eccodes_to_list(rxn.pop("eccodes"))
        if not codes:
            continue
        annotation = rxn.get("annotation")
        if not isinstance(annotation, dict):
            annotation = {}
            rxn["annotation"] = annotation
        # An empty / None placeholder under 'ec-code' carries no information;
        # treating it as "present" would silently discard the legacy codes.
        if not annotation.get("ec-code"):
            annotation["ec-code"] = codes


def _eccodes_to_list(value) -> list[str]:
    """Normalise a RAVEN ``eccodes`` value to a list of trimmed code strings.

    Args:
        value: ``None``, a ``;``-joined string, a list/tuple of codes (each
            of which may itself be ``;``-joined), or any other scalar, which
            is coerced with ``str``.

    Returns:
        The codes in input order, whitespace-trimmed, with empty tokens and
        ``None`` entries dropped. Returns ``[]`` for ``None``.
    """
    if value is None:
        return []
    items = value if isinstance(value, (list, tuple)) else [value]
    codes: list[str] = []
    for item in items:
        # A None entry (e.g. an empty YAML sequence item) must not be
        # stringified into the bogus code "None".
        if item is None:
            continue
        for token in str(item).split(";"):
            token = token.strip()
            if token:
                codes.append(token)
    return codes
+ assert r.annotation["ec-code"] == ["1.1.1.1"] + assert "eccodes" not in r.notes assert r.notes["references"] == "PMID:123" assert r.notes["rxnFrom"] == "manual" assert r.notes["confidence_score"] == 2 @@ -312,7 +313,9 @@ def test_pre_shim_format_loads(tmp_path): assert a.notes["note"] == "metabolite note" assert a.notes["metFrom"] == "KEGG" assert r.notes["rxnFrom"] == "KEGG" - assert r.notes["eccodes"] == "2.7.1.1" + # legacy top-level eccodes lifted into the cobra-native annotation['ec-code'] + assert r.annotation["ec-code"] == ["2.7.1.1"] + assert "eccodes" not in r.notes assert r.notes["references"] == "PMID:12345" assert r.notes["confidence_score"] == 2 assert r.notes["deltaG"] == -17.39 @@ -331,7 +334,7 @@ def test_pre_shim_yeast_gem_loads_if_available(): assert len(model.reactions) == 4102 assert len(model.genes) == 1143 # Every RAVEN extension we know about must come through. - assert sum(1 for r in model.reactions if r.notes.get("eccodes")) == 2411 + assert sum(1 for r in model.reactions if r.annotation.get("ec-code")) == 2411 assert sum(1 for r in model.reactions if r.notes.get("deltaG") is not None) == 3984 assert sum(1 for m in model.metabolites if m.notes.get("deltaG") is not None) == 2696 assert sum(1 for m in model.metabolites if "smiles" in (m.annotation or {})) == 1788 @@ -339,26 +342,23 @@ def test_pre_shim_yeast_gem_loads_if_available(): def test_eccodes_round_trip_through_cobra_extras(src, tmp_path): - """A model loaded from cobra (no eccodes awareness) and re-written - via raven_python.write_yaml_model still keeps eccodes — they're - sourced from .notes['eccodes'] which read_yaml_model puts there.""" - # Same fixture, but go through cobra first to prove notes-based - # eccodes propagation works when cobra is in the loop. + """EC codes round-trip as cobra annotation through a + raven_python -> cobra -> raven_python loop. 
They live in + ``annotation['ec-code']`` — the cobra-native place — so plain + ``cobra.io`` preserves them with no RAVEN-specific handling, and + geckopy (which reads ``annotation['ec-code']``) sees them.""" model = read_yaml_model(src) pass1 = tmp_path / "via_rp.yml" write_yaml_model(model, pass1) + # Plain cobra reads annotation['ec-code'] natively — this is the + # interop the alignment guarantees. via_cobra = cobra.io.load_yaml_model(str(pass1)) - # cobra exposes eccodes as an attribute (setattr fall-through), proving - # the key written by write_yaml_model survives a cobra round-trip. - assert getattr(via_cobra.reactions.get_by_id("R1"), "eccodes", None) == "1.1.1.1" + assert via_cobra.reactions.get_by_id("R1").annotation["ec-code"] == ["1.1.1.1"] pass2 = tmp_path / "via_rp2.yml" - # Promote cobra's setattr-eccodes back into notes for the writer - # path. (Tests the documented integration: cobra preserves the YAML - # key, raven_python.read sees it again.) again = read_yaml_model(pass1) write_yaml_model(again, pass2) final = read_yaml_model(pass2) - assert final.reactions.get_by_id("R1").notes["eccodes"] == "1.1.1.1" + assert final.reactions.get_by_id("R1").annotation["ec-code"] == ["1.1.1.1"] # And cobra can still read the final result. cm = cobra.io.load_yaml_model(str(pass2)) assert cm.reactions.get_by_id("R1").bounds == (-1000.0, 1000.0)