SysBioChalmers · edkerk · Jun 8, 2026 · May 29, 2026 · May 30, 2026 · May 30, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -23,7 +23,9 @@ jobs:
           cache: pip
           cache-dependency-path: pyproject.toml
       - run: pip install --upgrade pip
-      - run: pip install ruff
+      # Pinned so the lint result is reproducible (unpinned ruff silently
+      # adds rules over time). Keep in sync with the ``dev`` extra in pyproject.
+      - run: pip install ruff==0.15.15
       - run: ruff check .
 
   test:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,34 @@ Milestones in the raven-python port. For function-level status see
 [docs/raven_migration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/migration.md); for open work see
 [docs/todo.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/todo.md).
 
+## Unreleased
+
+Post-release review pass — cobra-aligned hardening (no behaviour change on
+well-formed inputs). Highlights:
+
+* **Packaging:** `raven_python.__version__` now derives from the installed package
+  metadata (`importlib.metadata`) instead of a hard-coded literal that had drifted
+  to `0.0.1`; the docs site reported the wrong version. Pinned `ruff==0.15.15` in
+  both the `dev` extra and CI so the lint result is reproducible, and fixed two
+  lint errors the unpinned ruff had started flagging.
+* **Errors aligned to cobra:** solver/feasibility failures in `run_init`,
+  `run_ftinit`, `fill_tasks` and `random_sampling` now raise
+  `cobra.exceptions.OptimizationError` (already used elsewhere in the package)
+  instead of a bare `RuntimeError`.
+* **Consistency:** a single `utils.parse.subsystem_to_str` coerces a reaction
+  `subsystem` to cobra's canonical `str` everywhere it is rendered/compared
+  (`io.excel`, `comparison.compare`, `curation.batch`, `manipulation.add`) — fixes
+  a crash on non-string subsystem items and the silent drop of multi-subsystem
+  reactions. GPR score-aggregation (`AGGREGATORS` / `resolve_aggregators`) is now
+  shared by `init.score` and `init.genes`. Maintainer-side KEGG-download progress
+  uses a module logger instead of `print`.
+* **Robustness:** path-traversal guard on bundled-ZIP extraction (`binaries.py`,
+  matching the tarfile `filter="data"` precedent); `connect_blocked_reactions`
+  rejects a non-positive `penalty`; `random_sampling` refuses a NaN-contaminated
+  sample matrix; `ec_data` warns on an all-zero reaction↔enzyme coupling; optional
+  `verify=` SHA256 re-check on `ensure_data_file` cache hits; reporter p-value
+  guarded against non-finite z-scores. Regression tests added for each.
+
 ## 0.1.0a1 — 2026-05-30
 
 First alpha release. Covers the functional scope of RAVEN built on cobrapy:

diff --git a/pyproject.toml b/pyproject.toml
@@ -49,7 +49,7 @@ dependencies = [
 dev = [
     "pytest>=7",
     "pytest-cov",
-    "ruff>=0.4",
+    "ruff==0.15.15",
 ]
 excel = [
     "openpyxl>=3.1",

diff --git a/src/raven_python/__init__.py b/src/raven_python/__init__.py
@@ -7,4 +7,9 @@
 sub-cellular localisation, N-model comparison, and the RAVEN-style I/O formats.
 """
 
-__version__ = "0.0.1"
+from importlib.metadata import PackageNotFoundError, version
+
+try:
+    __version__ = version("raven-python")
+except PackageNotFoundError:  # running from a source tree without an install
+    __version__ = "0.0.0+unknown"
diff --git a/src/raven_python/analysis/reporter.py b/src/raven_python/analysis/reporter.py
@@ -60,6 +60,8 @@ def _reporter_one(model: cobra.Model, gene_z: dict[str, float], test: str) -> Re
         raw = zs.sum() / math.sqrt(n)
         # Exact background correction for sampling-with-replacement (see module doc).
         corrected = (raw - math.sqrt(n) * mu) / sigma if sigma > 0 else 0.0
+        if not math.isfinite(corrected):  # never on clamped z-scores; guards the p-value
+            corrected = 0.0
         rows.append(
             {
                 "metabolite": met.id,

diff --git a/src/raven_python/analysis/sampling.py b/src/raven_python/analysis/sampling.py
@@ -36,6 +36,7 @@
 import cobra
 import numpy as np
 import pandas as pd
+from cobra.exceptions import OptimizationError
 from cobra.flux_analysis import flux_variability_analysis, pfba
 
 logger = logging.getLogger(__name__)
@@ -188,11 +189,18 @@ def random_sampling(
             model.objective = model.problem.Objective(sum(terms), direction="max")
             sol = model.optimize()
             if sol.status == "optimal" and abs(sol.objective_value) > 1e-8:
-                samples[i, :] = (pfba(model) if min_flux else sol).fluxes.reindex(reaction_ids).to_numpy()
+                fluxes = (pfba(model) if min_flux else sol).fluxes.reindex(reaction_ids)
+                if fluxes.isna().any():  # solver returned an unexpected reaction set
+                    missing = fluxes.index[fluxes.isna()].tolist()
+                    raise OptimizationError(
+                        "solver returned fluxes missing reaction(s) "
+                        f"{missing[:5]}; cannot assemble a NaN-free sample matrix."
+                    )
+                samples[i, :] = fluxes.to_numpy()
                 break
             if attempt == max_attempts:
                 if not suppress_errors:
-                    raise RuntimeError(
+                    raise OptimizationError(
                         "Could not find a non-zero, loop-free solution after "
                         f"{max_attempts} attempts for sample {i}. Review the model's "
                         "constraints, or set suppress_errors=True."

diff --git a/src/raven_python/binaries.py b/src/raven_python/binaries.py
@@ -86,6 +86,22 @@ def _sha256(path: Path) -> str:
     return h.hexdigest()
 
 
+def _safe_extract_zip(zf: zipfile.ZipFile, dest_dir: Path) -> None:
+    """Extract ``zf`` into ``dest_dir``, rejecting members that escape it.
+
+    ``ZipFile.extractall`` has no path-traversal guard (unlike tarfile's
+    ``filter="data"`` used in reconstruction/kegg/download.py), so a malicious or
+    corrupt archive could write outside the cache via absolute paths or ``..``.
+    SHA256 + HTTPS already make a hostile archive unlikely; this is defence in depth.
+    """
+    dest = dest_dir.resolve()
+    for member in zf.namelist():
+        target = (dest_dir / member).resolve()
+        if target != dest and dest not in target.parents:
+            raise ValueError(f"unsafe path in archive (path traversal): {member!r}")
+    zf.extractall(dest_dir)
+
+
 def ensure_binary(executable: str, *, registry: dict | None = None) -> Path:
     """Download (if needed) and return the path to a bundled ``executable``.
 
@@ -134,7 +150,7 @@ def ensure_binary(executable: str, *, registry: dict | None = None) -> Path:
     finally:
         part.unlink(missing_ok=True)
     with zipfile.ZipFile(archive) as zf:
-        zf.extractall(dest_dir)
+        _safe_extract_zip(zf, dest_dir)
     archive.unlink(missing_ok=True)
     if not exe.exists():
         raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.")

diff --git a/src/raven_python/comparison/compare.py b/src/raven_python/comparison/compare.py
@@ -18,6 +18,7 @@
 import pandas as pd
 
 from raven_python.tasks import Task, check_tasks
+from raven_python.utils.parse import subsystem_to_str
 
 
 @dataclass
@@ -60,12 +61,9 @@ def _subsystem_counts(model: cobra.Model) -> dict[str, int]:
     """{subsystem_name: reaction_count}. Reactions with empty subsystem fall under '(none)'."""
     counts: dict[str, int] = {}
     for r in model.reactions:
-        # cobra stores subsystem as a string; RAVEN sometimes uses cell-of-cells (we'd
-        # already have it as a string here, but guard against list/tuple from messy YAML).
-        sub = r.subsystem
-        if isinstance(sub, (list, tuple)):
-            sub = sub[0] if sub else ""
-        sub = (sub or "").strip() or "(none)"
+        # cobra stores subsystem as a string; RAVEN sometimes uses cell-of-cells.
+        # Coerce to the same ;-joined string used everywhere else (no data lost).
+        sub = subsystem_to_str(r.subsystem).strip() or "(none)"
         counts[sub] = counts.get(sub, 0) + 1
     return counts
 

diff --git a/src/raven_python/curation/batch.py b/src/raven_python/curation/batch.py
@@ -8,6 +8,8 @@
 import cobra
 import pandas as pd
 
+from raven_python.utils.parse import subsystem_to_str
+
 #: Core columns recognised in ``mets_df``. Anything else is treated as a
 #: MIRIAM annotation column (the header becomes the namespace key).
 DEFAULT_CORE_MET_COLUMNS: tuple[str, ...] = (
@@ -369,7 +371,7 @@ def _update_reaction(rxn: cobra.Reaction, row: pd.Series, miriam_cols: list[str]
     if _has_value(row.get("ub")):
         rxn.upper_bound = float(row["ub"])
     if _has_value(row.get("subSystems")):
-        rxn.subsystem = str(row["subSystems"])
+        rxn.subsystem = subsystem_to_str(row["subSystems"])
     if _has_value(row.get("eccodes")):
         rxn.annotation["ec-code"] = str(row["eccodes"])
     if _has_value(row.get("rxnNotes")):

diff --git a/src/raven_python/data.py b/src/raven_python/data.py
@@ -75,12 +75,18 @@ def ensure_data_file(
     *,
     version: str | None = None,
     registry: dict | None = None,
+    verify: bool = False,
 ) -> Path:
     """Download (if needed) and return the cached path to one artefact file.
 
     Looks the file up in the registry for ``dataset`` (at ``version`` or the
     registry's default), downloads it to the version-pinned cache directory,
     verifies its SHA256, and returns the path. Re-uses an already-cached copy.
+
+    A freshly downloaded file is always SHA256-checked. ``verify`` additionally
+    re-checks an *already-cached* file's SHA256 (a mismatch — i.e. a corrupted
+    cache — discards it and re-downloads); it is off by default so the common
+    cache-hit path stays fast.
     """
     registry = _DATA_REGISTRY if registry is None else registry
     bundle = _bundle(dataset, registry)
@@ -95,7 +101,9 @@ def ensure_data_file(
     dest_dir = _data_cache_dir() / f"{dataset}-{ver}"
     dest = dest_dir / filename
     if dest.exists():
-        return dest
+        if not verify or _sha256(dest) == entry["sha256"]:
+            return dest
+        dest.unlink()  # corrupted cache → fall through and re-download
 
     dest_dir.mkdir(parents=True, exist_ok=True)
     tmp = dest.with_name(dest.name + ".part")

diff --git a/src/raven_python/gapfilling/fill.py b/src/raven_python/gapfilling/fill.py
@@ -137,6 +137,8 @@ def connect_blocked_reactions(
     The draft is expected to have exchange reactions for its nutrients (otherwise most
     reactions are trivially blocked).
     """
+    if penalty <= 0:
+        raise ValueError(f"penalty must be > 0 (a non-positive cost malforms the MILP); got {penalty}.")
     templates = _as_models(templates)
     blocked = set(find_blocked_reactions(model))
     candidates = [r for r in blocked if model.reactions.get_by_id(r).lower_bound >= 0]

diff --git a/src/raven_python/init/ftinit.py b/src/raven_python/init/ftinit.py
@@ -51,6 +51,7 @@
 from dataclasses import dataclass, field
 
 import cobra
+from cobra.exceptions import OptimizationError
 from optlang.symbolics import Real, add, mul
 
 from raven_python.init.genes import remove_low_score_genes
@@ -210,7 +211,7 @@ def add_constraint(expr, **kw):
     opt.optimize()
     # Accept a near-optimal incumbent (when a MIP gap / time limit is set), as RAVEN does.
     if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"):
-        raise RuntimeError(f"ftINIT MILP did not solve (status: {opt.status}).")
+        raise OptimizationError(f"ftINIT MILP did not solve (status: {opt.status}).")
 
     # RAVEN: a reaction is "on" iff its indicator ≥ 0.5 (positive indicators are
     # continuous and can land fractionally when a reaction can carry only tiny flux).

diff --git a/src/raven_python/init/genes.py b/src/raven_python/init/genes.py
@@ -11,13 +11,12 @@
 from __future__ import annotations
 
 import ast
-import statistics
 from collections.abc import Mapping
 
 import cobra
 from cobra.manipulation import remove_genes
 
-_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean}
+from raven_python.utils.gpr import resolve_aggregators
 
 
 def _prune(node, scores, iso, cplx) -> tuple[str | None, float | None]:
@@ -64,10 +63,7 @@ def remove_low_score_genes(
     **deterministically** (first on a tie), unlike RAVEN's random tie-break — same
     quality, reproducible.
     """
-    for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)):
-        if value not in _AGG:
-            raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.")
-    iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring]
+    iso, cplx = resolve_aggregators(isozyme_scoring, complex_scoring)
 
     out = model.copy()
     for rxn in out.reactions:

diff --git a/src/raven_python/init/init.py b/src/raven_python/init/init.py
@@ -39,6 +39,7 @@
 from dataclasses import dataclass
 
 import cobra
+from cobra.exceptions import OptimizationError
 from optlang.symbolics import Real, add, mul
 
 _EPS = 1.0  # flux an included reaction must carry (RAVEN's fake-met unit)
@@ -203,7 +204,7 @@ def run_init(
     opt.optimize()
     # With a MIP gap / time limit set, accept a near-optimal incumbent (as RAVEN does).
     if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"):
-        raise RuntimeError(f"INIT MILP did not solve (status: {opt.status}).")
+        raise OptimizationError(f"INIT MILP did not solve (status: {opt.status}).")
 
     # A reaction is kept if any of its directed parts is essential or has x≈1.
     kept_origins = {d.origin for d in directed if d.essential}

diff --git a/src/raven_python/init/score.py b/src/raven_python/init/score.py
@@ -15,12 +15,11 @@
 
 import ast
 import math
-import statistics
 from collections.abc import Mapping
 
 import cobra
 
-_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean}
+from raven_python.utils.gpr import resolve_aggregators
 
 
 def gene_scores_from_expression(
@@ -70,10 +69,7 @@ def score_reactions_from_genes(
     no_gene_score: float = -2.0,
 ) -> dict[str, float]:
     """Return ``{reaction_id: score}`` from per-gene scores via each reaction's GPR."""
-    for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)):
-        if value not in _AGG:
-            raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.")
-    iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring]
+    iso, cplx = resolve_aggregators(isozyme_scoring, complex_scoring)
 
     scores: dict[str, float] = {}
     for rxn in model.reactions:

diff --git a/src/raven_python/init/taskfill.py b/src/raven_python/init/taskfill.py
@@ -18,6 +18,7 @@
 from dataclasses import dataclass
 
 import cobra
+from cobra.exceptions import OptimizationError
 from optlang.symbolics import Real, add, mul
 
 from raven_python.tasks import Task
@@ -87,13 +88,15 @@ def _fill_one_task(
     right trade for tractability, exactly as for the main ftINIT MILP.
     """
     if not candidates:  # nothing left to add → task cannot be made feasible
-        raise RuntimeError(f"gap-filling found no candidates for task {task.id!r}.")
+        raise OptimizationError(f"gap-filling found no candidates for task {task.id!r}.")
     combined = _closed_copy(model)  # task I/O via the task's b, not the model's exchanges
     combined.add_reactions([r.copy() for r in candidates])
     name_to_id, comp_to_ids = task_name_maps(combined)
     _, error = apply_task_constraints(combined, task, name_to_id, comp_to_ids)
     if error is not None:
-        raise RuntimeError(f"task {task.id!r} could not be applied to the reference: {error}")
+        raise OptimizationError(
+            f"task {task.id!r} could not be applied to the reference: {error}"
+        )
 
     prob = combined.problem
     extras = []
@@ -126,7 +129,7 @@ def _fill_one_task(
     # (no incumbent) means the task cannot be satisfied from the reference.
     if combined.solver.status not in ("optimal", "feasible", "suboptimal", "time_limit") or \
             combined.variables[f"_fill_{candidates[0].id}"].primal is None:
-        raise RuntimeError(f"gap-filling found no way to make task {task.id!r} feasible.")
+        raise OptimizationError(f"gap-filling found no way to make task {task.id!r} feasible.")
     return [c.id for c in candidates
             if (combined.variables[f"_fill_{c.id}"].primal or 0.0) > 0.5]
 
@@ -174,7 +177,7 @@ def fill_tasks(
         avail = [r for r in candidates if r.id not in present]
         try:
             chosen = _fill_one_task(out, avail, task, costs, mip_gap=mip_gap, time_limit=time_limit)
-        except RuntimeError:
+        except OptimizationError:
             failed.append(task.id)
             continue
         if chosen:

diff --git a/src/raven_python/io/ec_data.py b/src/raven_python/io/ec_data.py
@@ -45,6 +45,7 @@
 from __future__ import annotations
 
 import math
+import warnings
 from dataclasses import dataclass, field
 from typing import Any
 
@@ -263,6 +264,14 @@ def _build_ec_data(
                 )
             mat[i, j] = float(stoich)
 
+    if n_r > 0 and mat.nnz == 0:  # reactions present but none coupled to an enzyme
+        warnings.warn(
+            f"ec-rxns has {n_r} entries but none reference an enzyme; the "
+            "reaction↔enzyme coupling matrix is all-zero (likely malformed "
+            "ec-rxns/ec-enzymes).",
+            stacklevel=2,
+        )
+
     return EcData(
         gecko_light=gecko_light,
         rxns=rxns,

diff --git a/src/raven_python/io/excel.py b/src/raven_python/io/excel.py
@@ -12,6 +12,8 @@
 
 import cobra
 
+from raven_python.utils.parse import subsystem_to_str
+
 
 def _miriam_string(annotation: dict, exclude: tuple[str, ...] = ()) -> str:
     """RAVEN MIRIAM column: ``namespace/id;namespace/id2;...`` (sorted)."""
@@ -84,9 +86,7 @@ def export_to_excel(
          "REPLACEMENT ID", "NOTE", "REFERENCE", "CONFIDENCE SCORE"]
     )
     for r in reactions:
-        subsystem = r.subsystem
-        if isinstance(subsystem, (list, tuple)):
-            subsystem = ";".join(subsystem)
+        subsystem = subsystem_to_str(r.subsystem)
         ws.append([
             None, r.id, r.name, _equation(r), _ec_codes(r), r.gene_reaction_rule,
             r.lower_bound, r.upper_bound,