From dee7e4c94eba61a2e4ba02c818fbcdf4be4d55bd Mon Sep 17 00:00:00 2001
From: Eduard Kerkhoven <eduardk@chalmers.se>
Date: Tue, 9 Jun 2026 23:21:55 +0200
Subject: [PATCH] Ship type information and enforce it; make gpr_to_dnf public

Three related "make the package's contracts real" changes:

- Add the PEP 561 py.typed marker so the package's extensive type hints are
  visible to downstream type checkers (geckopy included). The hatchling wheel
  ships raven_python/py.typed.
- Add mypy to the dev extra, a lenient [tool.mypy] config (ignore_missing_imports
  for the un-stubbed cobra/optlang/scipy/ruamel), and a mypy CI job. Fix the 36
  type errors this surfaced -- all type-only (Path vs str annotations, None-guards
  that match existing behaviour, optlang Variable typing, isinstance/cast
  narrowing). No runtime behaviour changes; the full test suite stays green.
- Promote manipulation.expand._gpr_to_dnf to a public gpr_to_dnf (re-exported
  from raven_python.manipulation). geckopy's call sites switch to it in lockstep
  (separate PR), so no deprecated alias is kept.
---
 .github/workflows/ci.yml                         | 16 ++++++++++++++++
 pyproject.toml                                   |  7 +++++++
 src/raven_python/comparison/compare.py           |  7 +++++--
 src/raven_python/gapfilling/fill.py              |  3 ++-
 src/raven_python/init/build.py                   |  4 ++--
 src/raven_python/init/ftinit.py                  |  7 ++++---
 src/raven_python/init/genes.py                   |  8 ++++----
 src/raven_python/init/init.py                    |  3 ++-
 src/raven_python/init/score.py                   |  3 +--
 src/raven_python/io/excel.py                     |  2 +-
 src/raven_python/localization/predict.py         |  7 ++++---
 src/raven_python/manipulation/__init__.py        |  3 ++-
 src/raven_python/manipulation/expand.py          |  6 ++++--
 src/raven_python/manipulation/remove.py          |  2 +-
 src/raven_python/py.typed                        |  0
 .../reconstruction/homology/blast.py             |  8 ++++----
 src/raven_python/reconstruction/kegg/download.py |  2 ++
 src/raven_python/reconstruction/kegg/hmm.py      |  6 +++---
 src/raven_python/tasks/check.py                  | 12 ++++++++----
 src/raven_python/utils/gpr.py                    |  7 +++----
 tests/test_manipulation_expand.py                |  4 ++--
 21 files changed, 77 insertions(+), 40 deletions(-)
 create mode 100644 src/raven_python/py.typed

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cdfbe0c..efa7b85 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,6 +28,22 @@ jobs:
       - run: pip install ruff==0.15.15
       - run: ruff check .
 
+  typecheck:
+    name: mypy
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+          cache-dependency-path: pyproject.toml
+      - run: pip install --upgrade pip
+      # Installs runtime deps + the dev extra (mypy). Third-party stubs are
+      # absent for cobra/optlang/scipy; ignore_missing_imports handles that.
+      - run: pip install -e ".[dev]"
+      - run: mypy
+
   test:
     name: pytest (py${{ matrix.python }})
     runs-on: ubuntu-latest
diff --git a/pyproject.toml b/pyproject.toml
index 376a961..16da355 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,7 @@ dev = [
     "pytest>=7",
     "pytest-cov",
     "ruff==0.15.15",
+    "mypy>=1.8",
 ]
 excel = [
     "openpyxl>=3.1",
@@ -82,3 +83,9 @@ ignore = [
 
 [tool.ruff.lint.per-file-ignores]
 "tests/*" = ["E402"]
+
+[tool.mypy]
+python_version = "3.11"
+files = ["src/raven_python"]
+# cobra / optlang / scipy / ruamel ship no type stubs; don't error on their imports.
+ignore_missing_imports = true
diff --git a/src/raven_python/comparison/compare.py b/src/raven_python/comparison/compare.py
index 36e8b73..f7eafde 100644
--- a/src/raven_python/comparison/compare.py
+++ b/src/raven_python/comparison/compare.py
@@ -13,6 +13,8 @@
 
 from collections.abc import Iterable
 from dataclasses import dataclass, field
+from pathlib import Path
+from typing import cast
 
 import cobra
 import pandas as pd
@@ -131,8 +133,9 @@ def compare_models(
         # raven_python.tasks.check_tasks accepts a path or an iterable of Task; preserve task
         # ids for the index. Capture the list once so all models test the same set.
         from raven_python.tasks.tasklist import parse_task_list
-        task_list = (parse_task_list(tasks) if isinstance(tasks, (str, bytes))
-                     or hasattr(tasks, "__fspath__") else list(tasks))
+        task_list = (parse_task_list(cast("str | Path", tasks))
+                     if isinstance(tasks, (str, bytes)) or hasattr(tasks, "__fspath__")
+                     else list(tasks))
         task_ids = [t.id for t in task_list]
         task_df = pd.DataFrame(False, index=task_ids, columns=model_ids, dtype=bool)
         for mid, m in zip(model_ids, models_list, strict=True):
diff --git a/src/raven_python/gapfilling/fill.py b/src/raven_python/gapfilling/fill.py
index ad0f1ab..798f1aa 100644
--- a/src/raven_python/gapfilling/fill.py
+++ b/src/raven_python/gapfilling/fill.py
@@ -20,6 +20,7 @@
 
 import cobra
 from cobra.flux_analysis import find_blocked_reactions, flux_variability_analysis
+from optlang.interface import Variable  # untyped (resolves to Any); used for container hints
 
 from raven_python.manipulation.transfer import add_reactions_from_model
 
@@ -74,7 +75,7 @@ def _solve_min_templates(
     the problem is infeasible.
     """
     prob = working.problem
-    indicators: dict[str, object] = {}
+    indicators: dict[str, Variable] = {}
     extra = []
     for rid in template_ids:
         rxn = working.reactions.get_by_id(rid)
diff --git a/src/raven_python/init/build.py b/src/raven_python/init/build.py
index a0d0538..f6aba56 100644
--- a/src/raven_python/init/build.py
+++ b/src/raven_python/init/build.py
@@ -69,8 +69,8 @@ def get_init_model(
             model, gene_scores, isozyme_scoring=isozyme_scoring,
             complex_scoring=complex_scoring, no_gene_score=no_gene_score,
         )
-    else:
-        scores = dict(rxn_scores)
+    else:  # rxn_scores is not None here (exactly-one-of check above); `or {}` placates mypy
+        scores = dict(rxn_scores or {})
 
     deleted_dead_end: list[str] = []
     if remove_dead_ends:
diff --git a/src/raven_python/init/ftinit.py b/src/raven_python/init/ftinit.py
index 58e4ab9..e46be94 100644
--- a/src/raven_python/init/ftinit.py
+++ b/src/raven_python/init/ftinit.py
@@ -52,6 +52,7 @@
 
 import cobra
 from cobra.exceptions import OptimizationError
+from optlang.interface import Variable  # untyped (resolves to Any); used for container hints
 from optlang.symbolics import Real, add, mul
 
 from raven_python.init.genes import remove_low_score_genes
@@ -112,8 +113,8 @@ def run_ftinit(
 
     variables: list = []
     constraints: list = []
-    flux_terms: dict[str, list[tuple[object, float]]] = {}  # rxn id -> [(var, sign)]
-    indicators: dict[str, tuple[object, float]] = {}  # rxn id -> (indicator var, score)
+    flux_terms: dict[str, list[tuple[Variable, float]]] = {}  # rxn id -> [(var, sign)]
+    indicators: dict[str, tuple[Variable, float]] = {}  # rxn id -> (indicator var, score)
     free_or_essential: set[str] = set()               # kept regardless of an indicator
 
     def add_constraint(expr, **kw):
@@ -218,7 +219,7 @@ def add_constraint(expr, **kw):
     on = {rid for rid, (ind, _) in indicators.items() if (ind.primal or 0.0) >= 0.5}
     kept = free_or_essential | on
     deleted = [r.id for r in model.reactions if r.id not in kept]
-    fluxes = {
+    fluxes: dict[str, float] = {
         rid: sum(sign * (var.primal or 0.0) for var, sign in terms)
         for rid, terms in flux_terms.items()
     }
diff --git a/src/raven_python/init/genes.py b/src/raven_python/init/genes.py
index 4076c13..abc9a5d 100644
--- a/src/raven_python/init/genes.py
+++ b/src/raven_python/init/genes.py
@@ -26,15 +26,15 @@ def _prune(node, scores, iso, cplx) -> tuple[str | None, float | None]:
     if not isinstance(node, ast.BoolOp):
         return None, None
 
-    children = [_prune(v, scores, iso, cplx) for v in node.values]
-    children = [(s, sc) for s, sc in children if s is not None]
+    pruned = [_prune(v, scores, iso, cplx) for v in node.values]
+    children: list[tuple[str, float | None]] = [(s, sc) for s, sc in pruned if s is not None]
 
     if isinstance(node.op, ast.And):  # complex: keep every subunit, prune nested ORs
         kept = children
     else:  # OR / isozymes: drop negative-scoring alternatives, keep at least one
         kept = [(s, sc) for s, sc in children if sc is None or sc >= 0]
-        if not kept:  # all negative → keep the least-negative
-            kept = [max(children, key=lambda c: c[1])]
+        if not kept:  # all negative → keep the least-negative (every sc is non-None here)
+            kept = [max(children, key=lambda c: c[1] if c[1] is not None else float("-inf"))]
 
     parts = [s for s, _ in kept]
     score_vals = [sc for _, sc in kept if sc is not None]
diff --git a/src/raven_python/init/init.py b/src/raven_python/init/init.py
index 3cd5942..05b6d84 100644
--- a/src/raven_python/init/init.py
+++ b/src/raven_python/init/init.py
@@ -40,6 +40,7 @@
 
 import cobra
 from cobra.exceptions import OptimizationError
+from optlang.interface import Variable  # untyped (resolves to Any); used for container hints
 from optlang.symbolics import Real, add, mul
 
 _EPS = 1.0  # flux an included reaction must carry (RAVEN's fake-met unit)
@@ -137,7 +138,7 @@ def run_init(
     flux = {d.key: prob.Variable(f"v_{d.key}", lb=0.0, ub=d.ub) for d in directed}
 
     # Binary include-indicators for non-essential reactions; eps*x <= v <= ub*x.
-    keep: dict[str, object] = {}
+    keep: dict[str, Variable] = {}
     gates = []
     for d in directed:
         if d.essential:
diff --git a/src/raven_python/init/score.py b/src/raven_python/init/score.py
index c380544..215912c 100644
--- a/src/raven_python/init/score.py
+++ b/src/raven_python/init/score.py
@@ -39,10 +39,9 @@ def gene_scores_from_expression(
     score is clamped to ``[min_score, max_score]``; non-positive level/reference (and
     missing reference) → ``min_score`` (RAVEN maps these NaNs to -5).
     """
-    scalar = isinstance(reference, (int, float))
     scores: dict[str, float] = {}
     for gene, level in expression.items():
-        ref = reference if scalar else reference.get(gene)
+        ref = reference if isinstance(reference, (int, float)) else reference.get(gene)
         if not level or not ref or level <= 0 or ref <= 0:
             scores[gene] = min_score
         else:
diff --git a/src/raven_python/io/excel.py b/src/raven_python/io/excel.py
index 398b6da..bf768a0 100644
--- a/src/raven_python/io/excel.py
+++ b/src/raven_python/io/excel.py
@@ -17,7 +17,7 @@
 
 def _miriam_string(annotation: dict, exclude: tuple[str, ...] = ()) -> str:
     """RAVEN MIRIAM column: ``namespace/id;namespace/id2;...`` (sorted)."""
-    parts = []
+    parts: list[str] = []
     for namespace in sorted(annotation):
         if namespace in exclude:
             continue
diff --git a/src/raven_python/localization/predict.py b/src/raven_python/localization/predict.py
index 3fc2369..f112927 100644
--- a/src/raven_python/localization/predict.py
+++ b/src/raven_python/localization/predict.py
@@ -31,6 +31,7 @@
 
 import cobra
 import pandas as pd
+from optlang.interface import Variable  # untyped (resolves to Any); used for container hints
 from optlang.symbolics import Real, add, mul
 
 from raven_python.localization.scores import LocalizationScores
@@ -165,12 +166,12 @@ def _met_cost(m_id: str) -> float:
     opt = prob.Model()
 
     # x[r, c] = 1 iff reaction r placed in c (only for r ∈ placeable)
-    x: dict[tuple[str, str], object] = {
+    x: dict[tuple[str, str], Variable] = {
         (r.id, c): prob.Variable(f"x_{r.id}_{c}", type="binary")
         for r in placeable for c in compartments
     }
     # y[g, c] = 1 iff gene g assigned to c
-    y: dict[tuple[str, str], object] = {
+    y: dict[tuple[str, str], Variable] = {
         (g, c): prob.Variable(f"y_{g}_{c}", type="binary")
         for g in genes_in_scope for c in compartments
     }
@@ -182,7 +183,7 @@ def _met_cost(m_id: str) -> float:
             for c in compartments:
                 if c != default_compartment:
                     met_keys.add((m.id, c))
-    t: dict[tuple[str, str], object] = {
+    t: dict[tuple[str, str], Variable] = {
         k: prob.Variable(f"t_{k[0]}_{k[1]}", type="binary") for k in met_keys
     }
 
diff --git a/src/raven_python/manipulation/__init__.py b/src/raven_python/manipulation/__init__.py
index a462949..587643e 100644
--- a/src/raven_python/manipulation/__init__.py
+++ b/src/raven_python/manipulation/__init__.py
@@ -3,7 +3,7 @@
 isozyme expansion, compartment merge / copy, and model merging by name."""
 from .add import add_reactions_from_equations
 from .change import change_gene_reaction_rules, change_reaction_equations
-from .expand import expand_model
+from .expand import expand_model, gpr_to_dnf
 from .irreversible import convert_to_irreversible
 from .merge import merge_models
 from .parameters import set_variance_bounds
@@ -28,6 +28,7 @@
     "convert_to_irreversible",
     "expand_model",
     "find_duplicate_reactions",
+    "gpr_to_dnf",
     "group_linear_reactions",
     "merge_models",
     "remove_dead_end_reactions",
diff --git a/src/raven_python/manipulation/expand.py b/src/raven_python/manipulation/expand.py
index 246f3b9..3fd6847 100644
--- a/src/raven_python/manipulation/expand.py
+++ b/src/raven_python/manipulation/expand.py
@@ -20,8 +20,10 @@
 import cobra
 from cobra.core.gene import GPR
 
+__all__ = ["expand_model", "gpr_to_dnf"]
 
-def _gpr_to_dnf(gpr: GPR) -> list[list[str]]:
+
+def gpr_to_dnf(gpr: GPR) -> list[list[str]]:
     """Convert a GPR to disjunctive normal form (list of AND-clauses).
 
     An empty GPR yields an empty list. A single clause (no OR anywhere)
@@ -88,7 +90,7 @@ def expand_model(model: cobra.Model) -> list[str]:
     for rxn in model.reactions:
         if not rxn.gene_reaction_rule:
             continue
-        clauses = _gpr_to_dnf(rxn.gpr)
+        clauses = gpr_to_dnf(rxn.gpr)
         if len(clauses) <= 1:
             continue
         expansions.append((rxn, clauses))
diff --git a/src/raven_python/manipulation/remove.py b/src/raven_python/manipulation/remove.py
index 492de36..43f966f 100644
--- a/src/raven_python/manipulation/remove.py
+++ b/src/raven_python/manipulation/remove.py
@@ -99,7 +99,7 @@ def remove_genes(
         return []
 
     # Reactions touched by these genes that currently have a GPR.
-    affected = set()
+    affected: set[str] = set()
     for gid in present:
         affected.update(r.id for r in model.genes.get_by_id(gid).reactions)
     had_gpr = {rid for rid in affected if model.reactions.get_by_id(rid).gene_reaction_rule}
diff --git a/src/raven_python/py.typed b/src/raven_python/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/src/raven_python/reconstruction/homology/blast.py b/src/raven_python/reconstruction/homology/blast.py
index 246ddab..fdffae1 100644
--- a/src/raven_python/reconstruction/homology/blast.py
+++ b/src/raven_python/reconstruction/homology/blast.py
@@ -72,8 +72,8 @@ def run_blast(
     outfmt = "10 " + " ".join(_OUTFMT_FIELDS)  # 10 = CSV
 
     frames = []
-    with tempfile.TemporaryDirectory() as tmp:
-        tmp = Path(tmp)
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp = Path(tmp_dir)
 
         def blastp_dir(query, subject_fasta, from_id, to_id):
             db = tmp / f"db_{from_id}_{to_id}"
@@ -113,8 +113,8 @@ def run_diamond(
     diamond = resolve_binary("diamond", binary=diamond)
 
     frames = []
-    with tempfile.TemporaryDirectory() as tmp:
-        tmp = Path(tmp)
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp = Path(tmp_dir)
 
         def diamond_dir(query, subject_fasta, from_id, to_id):
             db = tmp / f"db_{from_id}_{to_id}"
diff --git a/src/raven_python/reconstruction/kegg/download.py b/src/raven_python/reconstruction/kegg/download.py
index d990b42..f6102f8 100644
--- a/src/raven_python/reconstruction/kegg/download.py
+++ b/src/raven_python/reconstruction/kegg/download.py
@@ -195,6 +195,8 @@ def lift(rel: str, tmp: str) -> Path | None:
             f"KEGG archives did not yield required file(s): {missing}. "
             f"Check that the source .tar.gz archives are present in {dest}."
         )
+    # Narrowing for the type checker: the `missing` check above raised if any were None.
+    assert reaction is not None and compound is not None and ko is not None
 
     shutil.move(str(reaction), str(dest / "reaction"))
     shutil.move(str(ko), str(dest / "ko"))
diff --git a/src/raven_python/reconstruction/kegg/hmm.py b/src/raven_python/reconstruction/kegg/hmm.py
index 0e210b6..34afd8d 100644
--- a/src/raven_python/reconstruction/kegg/hmm.py
+++ b/src/raven_python/reconstruction/kegg/hmm.py
@@ -329,8 +329,8 @@ def build_ko_hmm(
         logger.info("[%s] start: %d sequences", label, n)
 
     hmmbuild = resolve_binary("hmmbuild", binary=hmmbuild)
-    with tempfile.TemporaryDirectory() as tmp:
-        tmp = Path(tmp)
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp = Path(tmp_dir)
         if n == 1:
             if verbose:
                 logger.info("[%s] single sequence: skipping CD-HIT/MAFFT", label)
@@ -408,7 +408,7 @@ def build_hmm_library(
     mafft: str | Path | None = None,
     hmmbuild: str | Path | None = None,
     hmmpress: str | Path | None = None,
-) -> dict[str, Path | list[Path]]:
+) -> dict[str, Path | list[Path] | None]:
     """Build a domain (``"prokaryotes"``/``"eukaryotes"``) HMM library.
 
     Restricts genes to the domain's organisms (from ``taxonomy``), builds a
diff --git a/src/raven_python/tasks/check.py b/src/raven_python/tasks/check.py
index 817bae5..0e66df2 100644
--- a/src/raven_python/tasks/check.py
+++ b/src/raven_python/tasks/check.py
@@ -17,6 +17,7 @@
 from collections.abc import Iterable
 from dataclasses import dataclass
 from pathlib import Path
+from typing import cast
 
 import cobra
 from cobra.exceptions import OptimizationError
@@ -51,11 +52,14 @@ def _set_constraint_bounds(constraint, lb: float, ub: float) -> None:
         constraint.ub = ub
 
 
-def _classify(token: str) -> tuple[str, str | None]:
-    """Return ``("all", None)``, ``("comp", COMP)``, or ``("met", token_upper)``."""
+def _classify(token: str) -> tuple[str, str]:
+    """Return ``("all", "")``, ``("comp", COMP)``, or ``("met", token_upper)``.
+
+    The arg is empty only for ``"all"``, which never reads it.
+    """
     upper = token.upper()
     if upper == _ALLMETS:
-        return "all", None
+        return "all", ""
     if upper.startswith(_ALLMETSIN + "[") and upper.endswith("]"):
         return "comp", upper[len(_ALLMETSIN) + 1: -1]
     return "met", upper  # incl. malformed ALLMETSIN[... → treated as a (missing) metabolite
@@ -210,7 +214,7 @@ def check_tasks(
 
 def _as_tasks(tasks: str | Iterable[Task]) -> list[Task]:
     if isinstance(tasks, (str, bytes)) or hasattr(tasks, "__fspath__"):
-        return parse_task_list(tasks)
+        return parse_task_list(cast("str | Path", tasks))  # guard above ⇒ path-like
     return list(tasks)
 
 
diff --git a/src/raven_python/utils/gpr.py b/src/raven_python/utils/gpr.py
index 28818da..64ecbdb 100644
--- a/src/raven_python/utils/gpr.py
+++ b/src/raven_python/utils/gpr.py
@@ -95,11 +95,10 @@ def is_dnf(gpr: GPR | str | None) -> bool:
     >>> is_dnf("(G1 or G2) and G3")
     False
     """
-    if isinstance(gpr, str):
-        gpr = GPR.from_string(gpr)
-    if gpr is None:
+    parsed = GPR.from_string(gpr) if isinstance(gpr, str) else gpr
+    if parsed is None:
         return True
-    return _is_dnf_node(gpr.body)
+    return _is_dnf_node(parsed.body)
 
 
 @dataclass(frozen=True)
diff --git a/tests/test_manipulation_expand.py b/tests/test_manipulation_expand.py
index 08cd2f2..2a5492d 100644
--- a/tests/test_manipulation_expand.py
+++ b/tests/test_manipulation_expand.py
@@ -5,7 +5,7 @@
 import cobra
 
 from raven_python.manipulation import expand_model
-from raven_python.manipulation.expand import _gpr_to_dnf
+from raven_python.manipulation.expand import gpr_to_dnf
 
 # --------------------------------------------------------------------------- #
 # DNF conversion (internal helper, worth testing directly)
@@ -15,7 +15,7 @@ def _dnf_from_gpr_string(gpr_str: str) -> list[list[str]]:
     from cobra.core.gene import GPR
 
     gpr = GPR.from_string(gpr_str)
-    return _gpr_to_dnf(gpr)
+    return gpr_to_dnf(gpr)
 
 
 def test_dnf_empty_gpr():