Skip to content
Merged
4 changes: 3 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ jobs:
cache: pip
cache-dependency-path: pyproject.toml
- run: pip install --upgrade pip
- run: pip install ruff
# Pinned so the lint result is reproducible (unpinned ruff silently
# adds rules over time). Keep in sync with the ``dev`` extra in pyproject.
- run: pip install ruff==0.15.15
- run: ruff check .

test:
Expand Down
28 changes: 28 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,34 @@ Milestones in the raven-python port. For function-level status see
[docs/raven_migration.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/migration.md); for open work see
[docs/todo.md](https://github.com/SysBioChalmers/raven-python/blob/develop/docs/reference/todo.md).

## Unreleased

Post-release review pass — cobra-aligned hardening (no behaviour change on
well-formed inputs). Highlights:

* **Packaging:** `raven_python.__version__` now derives from the installed package
metadata (`importlib.metadata`) instead of a hard-coded literal that had drifted
to `0.0.1`; the docs site reported the wrong version. Pinned `ruff==0.15.15` in
both the `dev` extra and CI so the lint result is reproducible, and fixed two
lint errors the unpinned ruff had started flagging.
* **Errors aligned to cobra:** solver/feasibility failures in `run_init`,
`run_ftinit`, `fill_tasks` and `random_sampling` now raise
`cobra.exceptions.OptimizationError` (already used elsewhere in the package)
instead of a bare `RuntimeError`.
* **Consistency:** a single `utils.parse.subsystem_to_str` coerces a reaction
`subsystem` to cobra's canonical `str` everywhere it is rendered/compared
(`io.excel`, `comparison.compare`, `curation.batch`, `manipulation.add`) — fixes
a crash on non-string subsystem items and the silent drop of multi-subsystem
reactions. GPR score-aggregation (`AGGREGATORS` / `resolve_aggregators`) is now
shared by `init.score` and `init.genes`. Maintainer-side KEGG-download progress
uses a module logger instead of `print`.
* **Robustness:** path-traversal guard on bundled-ZIP extraction (`binaries.py`,
matching the tarfile `filter="data"` precedent); `connect_blocked_reactions`
rejects a non-positive `penalty`; `random_sampling` refuses a NaN-contaminated
sample matrix; `ec_data` warns on an all-zero reaction↔enzyme coupling; optional
`verify=` SHA256 re-check on `ensure_data_file` cache hits; reporter p-value
guarded against non-finite z-scores. Regression tests added for each.

## 0.1.0a1 — 2026-05-30

First alpha release. Covers the functional scope of RAVEN built on cobrapy:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ dependencies = [
dev = [
"pytest>=7",
"pytest-cov",
"ruff>=0.4",
"ruff==0.15.15",
]
excel = [
"openpyxl>=3.1",
Expand Down
7 changes: 6 additions & 1 deletion src/raven_python/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,9 @@
sub-cellular localisation, N-model comparison, and the RAVEN-style I/O formats.
"""

__version__ = "0.0.1"
from importlib.metadata import PackageNotFoundError, version

try:
__version__ = version("raven-python")
except PackageNotFoundError: # running from a source tree without an install
__version__ = "0.0.0+unknown"
2 changes: 2 additions & 0 deletions src/raven_python/analysis/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def _reporter_one(model: cobra.Model, gene_z: dict[str, float], test: str) -> Re
raw = zs.sum() / math.sqrt(n)
# Exact background correction for sampling-with-replacement (see module doc).
corrected = (raw - math.sqrt(n) * mu) / sigma if sigma > 0 else 0.0
if not math.isfinite(corrected): # never on clamped z-scores; guards the p-value
corrected = 0.0
rows.append(
{
"metabolite": met.id,
Expand Down
12 changes: 10 additions & 2 deletions src/raven_python/analysis/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import cobra
import numpy as np
import pandas as pd
from cobra.exceptions import OptimizationError
from cobra.flux_analysis import flux_variability_analysis, pfba

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -188,11 +189,18 @@ def random_sampling(
model.objective = model.problem.Objective(sum(terms), direction="max")
sol = model.optimize()
if sol.status == "optimal" and abs(sol.objective_value) > 1e-8:
samples[i, :] = (pfba(model) if min_flux else sol).fluxes.reindex(reaction_ids).to_numpy()
fluxes = (pfba(model) if min_flux else sol).fluxes.reindex(reaction_ids)
if fluxes.isna().any(): # solver returned an unexpected reaction set
missing = fluxes.index[fluxes.isna()].tolist()
raise OptimizationError(
"solver returned fluxes missing reaction(s) "
f"{missing[:5]}; cannot assemble a NaN-free sample matrix."
)
samples[i, :] = fluxes.to_numpy()
break
if attempt == max_attempts:
if not suppress_errors:
raise RuntimeError(
raise OptimizationError(
"Could not find a non-zero, loop-free solution after "
f"{max_attempts} attempts for sample {i}. Review the model's "
"constraints, or set suppress_errors=True."
Expand Down
18 changes: 17 additions & 1 deletion src/raven_python/binaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,22 @@ def _sha256(path: Path) -> str:
return h.hexdigest()


def _safe_extract_zip(zf: zipfile.ZipFile, dest_dir: Path) -> None:
"""Extract ``zf`` into ``dest_dir``, rejecting members that escape it.

``ZipFile.extractall`` has no path-traversal guard (unlike tarfile's
``filter="data"`` used in reconstruction/kegg/download.py), so a malicious or
corrupt archive could write outside the cache via absolute paths or ``..``.
SHA256 + HTTPS already make a hostile archive unlikely; this is defence in depth.
"""
dest = dest_dir.resolve()
for member in zf.namelist():
target = (dest_dir / member).resolve()
if target != dest and dest not in target.parents:
raise ValueError(f"unsafe path in archive (path traversal): {member!r}")
zf.extractall(dest_dir)


def ensure_binary(executable: str, *, registry: dict | None = None) -> Path:
"""Download (if needed) and return the path to a bundled ``executable``.

Expand Down Expand Up @@ -134,7 +150,7 @@ def ensure_binary(executable: str, *, registry: dict | None = None) -> Path:
finally:
part.unlink(missing_ok=True)
with zipfile.ZipFile(archive) as zf:
zf.extractall(dest_dir)
_safe_extract_zip(zf, dest_dir)
archive.unlink(missing_ok=True)
if not exe.exists():
raise FileNotFoundError(f"{executable!r} not found in the extracted bundle at {dest_dir}.")
Expand Down
10 changes: 4 additions & 6 deletions src/raven_python/comparison/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import pandas as pd

from raven_python.tasks import Task, check_tasks
from raven_python.utils.parse import subsystem_to_str


@dataclass
Expand Down Expand Up @@ -60,12 +61,9 @@ def _subsystem_counts(model: cobra.Model) -> dict[str, int]:
"""{subsystem_name: reaction_count}. Reactions with empty subsystem fall under '(none)'."""
counts: dict[str, int] = {}
for r in model.reactions:
# cobra stores subsystem as a string; RAVEN sometimes uses cell-of-cells (we'd
# already have it as a string here, but guard against list/tuple from messy YAML).
sub = r.subsystem
if isinstance(sub, (list, tuple)):
sub = sub[0] if sub else ""
sub = (sub or "").strip() or "(none)"
# cobra stores subsystem as a string; RAVEN sometimes uses cell-of-cells.
# Coerce to the same ;-joined string used everywhere else (no data lost).
sub = subsystem_to_str(r.subsystem).strip() or "(none)"
counts[sub] = counts.get(sub, 0) + 1
return counts

Expand Down
4 changes: 3 additions & 1 deletion src/raven_python/curation/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import cobra
import pandas as pd

from raven_python.utils.parse import subsystem_to_str

#: Core columns recognised in ``mets_df``. Anything else is treated as a
#: MIRIAM annotation column (the header becomes the namespace key).
DEFAULT_CORE_MET_COLUMNS: tuple[str, ...] = (
Expand Down Expand Up @@ -369,7 +371,7 @@ def _update_reaction(rxn: cobra.Reaction, row: pd.Series, miriam_cols: list[str]
if _has_value(row.get("ub")):
rxn.upper_bound = float(row["ub"])
if _has_value(row.get("subSystems")):
rxn.subsystem = str(row["subSystems"])
rxn.subsystem = subsystem_to_str(row["subSystems"])
if _has_value(row.get("eccodes")):
rxn.annotation["ec-code"] = str(row["eccodes"])
if _has_value(row.get("rxnNotes")):
Expand Down
10 changes: 9 additions & 1 deletion src/raven_python/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,18 @@ def ensure_data_file(
*,
version: str | None = None,
registry: dict | None = None,
verify: bool = False,
) -> Path:
"""Download (if needed) and return the cached path to one artefact file.

Looks the file up in the registry for ``dataset`` (at ``version`` or the
registry's default), downloads it to the version-pinned cache directory,
verifies its SHA256, and returns the path. Re-uses an already-cached copy.

A freshly downloaded file is always SHA256-checked. ``verify`` additionally
re-checks an *already-cached* file's SHA256 (a mismatch — i.e. a corrupted
cache — discards it and re-downloads); it is off by default so the common
cache-hit path stays fast.
"""
registry = _DATA_REGISTRY if registry is None else registry
bundle = _bundle(dataset, registry)
Expand All @@ -95,7 +101,9 @@ def ensure_data_file(
dest_dir = _data_cache_dir() / f"{dataset}-{ver}"
dest = dest_dir / filename
if dest.exists():
return dest
if not verify or _sha256(dest) == entry["sha256"]:
return dest
dest.unlink() # corrupted cache → fall through and re-download

dest_dir.mkdir(parents=True, exist_ok=True)
tmp = dest.with_name(dest.name + ".part")
Expand Down
2 changes: 2 additions & 0 deletions src/raven_python/gapfilling/fill.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ def connect_blocked_reactions(
The draft is expected to have exchange reactions for its nutrients (otherwise most
reactions are trivially blocked).
"""
if penalty <= 0:
raise ValueError(f"penalty must be > 0 (a non-positive cost malforms the MILP); got {penalty}.")
templates = _as_models(templates)
blocked = set(find_blocked_reactions(model))
candidates = [r for r in blocked if model.reactions.get_by_id(r).lower_bound >= 0]
Expand Down
3 changes: 2 additions & 1 deletion src/raven_python/init/ftinit.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from dataclasses import dataclass, field

import cobra
from cobra.exceptions import OptimizationError
from optlang.symbolics import Real, add, mul

from raven_python.init.genes import remove_low_score_genes
Expand Down Expand Up @@ -210,7 +211,7 @@ def add_constraint(expr, **kw):
opt.optimize()
# Accept a near-optimal incumbent (when a MIP gap / time limit is set), as RAVEN does.
if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"):
raise RuntimeError(f"ftINIT MILP did not solve (status: {opt.status}).")
raise OptimizationError(f"ftINIT MILP did not solve (status: {opt.status}).")

# RAVEN: a reaction is "on" iff its indicator ≥ 0.5 (positive indicators are
# continuous and can land fractionally when a reaction can carry only tiny flux).
Expand Down
8 changes: 2 additions & 6 deletions src/raven_python/init/genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,12 @@
from __future__ import annotations

import ast
import statistics
from collections.abc import Mapping

import cobra
from cobra.manipulation import remove_genes

_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean}
from raven_python.utils.gpr import resolve_aggregators


def _prune(node, scores, iso, cplx) -> tuple[str | None, float | None]:
Expand Down Expand Up @@ -64,10 +63,7 @@ def remove_low_score_genes(
**deterministically** (first on a tie), unlike RAVEN's random tie-break — same
quality, reproducible.
"""
for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)):
if value not in _AGG:
raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.")
iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring]
iso, cplx = resolve_aggregators(isozyme_scoring, complex_scoring)

out = model.copy()
for rxn in out.reactions:
Expand Down
3 changes: 2 additions & 1 deletion src/raven_python/init/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from dataclasses import dataclass

import cobra
from cobra.exceptions import OptimizationError
from optlang.symbolics import Real, add, mul

_EPS = 1.0 # flux an included reaction must carry (RAVEN's fake-met unit)
Expand Down Expand Up @@ -203,7 +204,7 @@ def run_init(
opt.optimize()
# With a MIP gap / time limit set, accept a near-optimal incumbent (as RAVEN does).
if opt.status not in ("optimal", "feasible", "suboptimal", "time_limit"):
raise RuntimeError(f"INIT MILP did not solve (status: {opt.status}).")
raise OptimizationError(f"INIT MILP did not solve (status: {opt.status}).")

# A reaction is kept if any of its directed parts is essential or has x≈1.
kept_origins = {d.origin for d in directed if d.essential}
Expand Down
8 changes: 2 additions & 6 deletions src/raven_python/init/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@

import ast
import math
import statistics
from collections.abc import Mapping

import cobra

_AGG = {"min": min, "max": max, "median": statistics.median, "average": statistics.fmean}
from raven_python.utils.gpr import resolve_aggregators


def gene_scores_from_expression(
Expand Down Expand Up @@ -70,10 +69,7 @@ def score_reactions_from_genes(
no_gene_score: float = -2.0,
) -> dict[str, float]:
"""Return ``{reaction_id: score}`` from per-gene scores via each reaction's GPR."""
for name, value in (("isozyme_scoring", isozyme_scoring), ("complex_scoring", complex_scoring)):
if value not in _AGG:
raise ValueError(f"{name} must be one of {sorted(_AGG)}; got {value!r}.")
iso, cplx = _AGG[isozyme_scoring], _AGG[complex_scoring]
iso, cplx = resolve_aggregators(isozyme_scoring, complex_scoring)

scores: dict[str, float] = {}
for rxn in model.reactions:
Expand Down
11 changes: 7 additions & 4 deletions src/raven_python/init/taskfill.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from dataclasses import dataclass

import cobra
from cobra.exceptions import OptimizationError
from optlang.symbolics import Real, add, mul

from raven_python.tasks import Task
Expand Down Expand Up @@ -87,13 +88,15 @@ def _fill_one_task(
right trade for tractability, exactly as for the main ftINIT MILP.
"""
if not candidates: # nothing left to add → task cannot be made feasible
raise RuntimeError(f"gap-filling found no candidates for task {task.id!r}.")
raise OptimizationError(f"gap-filling found no candidates for task {task.id!r}.")
combined = _closed_copy(model) # task I/O via the task's b, not the model's exchanges
combined.add_reactions([r.copy() for r in candidates])
name_to_id, comp_to_ids = task_name_maps(combined)
_, error = apply_task_constraints(combined, task, name_to_id, comp_to_ids)
if error is not None:
raise RuntimeError(f"task {task.id!r} could not be applied to the reference: {error}")
raise OptimizationError(
f"task {task.id!r} could not be applied to the reference: {error}"
)

prob = combined.problem
extras = []
Expand Down Expand Up @@ -126,7 +129,7 @@ def _fill_one_task(
# (no incumbent) means the task cannot be satisfied from the reference.
if combined.solver.status not in ("optimal", "feasible", "suboptimal", "time_limit") or \
combined.variables[f"_fill_{candidates[0].id}"].primal is None:
raise RuntimeError(f"gap-filling found no way to make task {task.id!r} feasible.")
raise OptimizationError(f"gap-filling found no way to make task {task.id!r} feasible.")
return [c.id for c in candidates
if (combined.variables[f"_fill_{c.id}"].primal or 0.0) > 0.5]

Expand Down Expand Up @@ -174,7 +177,7 @@ def fill_tasks(
avail = [r for r in candidates if r.id not in present]
try:
chosen = _fill_one_task(out, avail, task, costs, mip_gap=mip_gap, time_limit=time_limit)
except RuntimeError:
except OptimizationError:
failed.append(task.id)
continue
if chosen:
Expand Down
9 changes: 9 additions & 0 deletions src/raven_python/io/ec_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from __future__ import annotations

import math
import warnings
from dataclasses import dataclass, field
from typing import Any

Expand Down Expand Up @@ -263,6 +264,14 @@ def _build_ec_data(
)
mat[i, j] = float(stoich)

if n_r > 0 and mat.nnz == 0: # reactions present but none coupled to an enzyme
warnings.warn(
f"ec-rxns has {n_r} entries but none reference an enzyme; the "
"reaction↔enzyme coupling matrix is all-zero (likely malformed "
"ec-rxns/ec-enzymes).",
stacklevel=2,
)

return EcData(
gecko_light=gecko_light,
rxns=rxns,
Expand Down
6 changes: 3 additions & 3 deletions src/raven_python/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

import cobra

from raven_python.utils.parse import subsystem_to_str


def _miriam_string(annotation: dict, exclude: tuple[str, ...] = ()) -> str:
"""RAVEN MIRIAM column: ``namespace/id;namespace/id2;...`` (sorted)."""
Expand Down Expand Up @@ -84,9 +86,7 @@ def export_to_excel(
"REPLACEMENT ID", "NOTE", "REFERENCE", "CONFIDENCE SCORE"]
)
for r in reactions:
subsystem = r.subsystem
if isinstance(subsystem, (list, tuple)):
subsystem = ";".join(subsystem)
subsystem = subsystem_to_str(r.subsystem)
ws.append([
None, r.id, r.name, _equation(r), _ec_codes(r), r.gene_reaction_rule,
r.lower_bound, r.upper_bound,
Expand Down
Loading
Loading