From c53563d9dabfb001d712a4c3946ae94b851a6c6b Mon Sep 17 00:00:00 2001
From: NCCU-Schultz-Lab
Date: Sat, 23 May 2026 00:34:40 -0400
Subject: [PATCH 01/33] Fix IR plot flicker by atomic output swap
Eliminate visible flicker on IR plot updates (BUG.9) by rendering Plotly HTML via a single atomic outputs assignment in _set_html_output so scripts execute and no intermediate empty state is exposed. Also prevent re-render storms by setting the IR FWHM slider continuous_update=False and give the IR Output a min_height of 300px to avoid container collapse between renders. Tests were added to guard these behaviors and validate the atomic outputs swap. Minor docs adjustments clarify repository scope vs cluster/SLURM features and update the scope table in .github/copilot-instructions.md; README wording about the cluster version was removed.
---
.github/copilot-instructions.md | 26 +++++++++---------
README.md | 9 -------
quantui/app.py | 20 +++++++++++---
quantui/app_builders.py | 13 ++++++++-
tests/test_app.py | 48 +++++++++++++++++++++++++++++++++
5 files changed, 89 insertions(+), 27 deletions(-)
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 269a6e9..5e36262 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -12,10 +12,8 @@
QuantUI is an interactive Jupyter/Voilà platform for running PySCF quantum
chemistry workflows end-to-end inside one app: setup, execution, analysis,
visualization, and comparison. It is local-first today (no cluster account, no
-SLURM required for normal use), and is designed to evolve toward optional
-cluster-backed execution through interactive Jupyter/HPC environments. It is a
-downstream port of the cluster-focused
-`QuantUI` repo with all SLURM infrastructure removed.
+SLURM required), and a future roadmap item is to add optional cluster-backed
+execution through interactive Jupyter/HPC environments.
**Primary users:** Undergraduate chemistry students and researchers at North Carolina
Central University and collaborators. The UI runs as a Voilà app so users can run
@@ -701,15 +699,17 @@ across kernel restarts and are accessible from the host (home dir is bind-mounte
---
-## Relationship to Source Repo
+## Scope Notes — Intentionally Out of Repo
-QuantUI is a downstream port of `NCCU-Schultz-Lab/QuantUI` (the cluster version).
-Bug fixes and module updates originate in `QuantUI` and are ported here.
+The following module/file names are deliberately absent from `quantui/` and
+should not be reintroduced without an explicit roadmap milestone. They would
+only make sense once cluster-backed execution is added (a future roadmap
+item, not currently scoped).
-| Removed from source | Reason |
+| File / module | Why it's not here |
| --- | --- |
-| `job_manager.py` | SLURM batch submission |
-| `storage.py` | SLURM job metadata |
-| `slurm_errors.py` | SLURM error translation |
-| `visualization.py` | PlotlyMol fallback (excluded here) |
-| SLURM templates in `config.py` | No cluster |
+| `job_manager.py` | SLURM batch submission belongs to the future cluster-execution path |
+| `storage.py` | SLURM job-metadata persistence — same future scope |
+| `slurm_errors.py` | SLURM error translation — same future scope |
+| `visualization.py` (the PlotlyMol-fallback module) | Superseded by `viz_backend_router.py` + `visualization_py3dmol.py` |
+| SLURM-related templates in `config.py` | No cluster orchestration today |
diff --git a/README.md b/README.md
index 5a6641c..6a9c36f 100644
--- a/README.md
+++ b/README.md
@@ -306,15 +306,6 @@ CHANGELOG.md Release history (Keep a Changelog format)
---
-## Relationship to the cluster version
-
-QuantUI (this repo) is a downstream port of the cluster-based
-[QuantUI-cluster](https://github.com/The-Schultz-Lab/QuantUI) repository. All SLURM
-infrastructure (job manager, job storage, batch templates) has been removed.
-Bug fixes flow from the cluster repo into this one, not the other way around.
-
----
-
## License
[MIT](LICENSE) — Copyright 2026 The Schultz Lab, North Carolina Central University
diff --git a/quantui/app.py b/quantui/app.py
index b6ff120..8924bf8 100644
--- a/quantui/app.py
+++ b/quantui/app.py
@@ -1939,19 +1939,31 @@ def _apply_plotly_theme(self, fig) -> None:
)
def _set_html_output(self, out: widgets.Output, html: str) -> None:
- """Render HTML into an Output widget.
+ """Render HTML into an Output widget via an atomic outputs swap.
Plotly HTML contains
+ # that contains "Plotly". We expect exactly one such inline bundle.
+ assert "Plotly" in html
+ # Sanity: file is non-trivial size (plotly inline is ~3MB).
+ assert len(html) > 100_000
+
+ def test_dashboard_resilient_to_partial_records(self, isolated_log_dir):
+ # Records missing fields (early app version, partial writes) must
+ # not crash the dashboard build.
+ records = [
+ {"timestamp": "2026-05-25T12:00:00+00:00"}, # bare minimum
+ _rec(), # full
+ ]
+ _write_perf_log(isolated_log_dir, records)
+ out = analytics.build_dashboard()
+ assert out is not None
+ assert out.exists()
+
+
+class TestFormatHelpers:
+ def test_format_seconds_under_minute(self):
+ assert analytics._format_seconds(45.0) == "45.0 s"
+
+ def test_format_seconds_minutes(self):
+ assert analytics._format_seconds(90.0) == "1.5 min"
+
+ def test_format_seconds_hours(self):
+ assert analytics._format_seconds(7200.0) == "2.0 h"
+
+ def test_counts_by_drops_missing(self):
+ records = [{"method": "B3LYP"}, {"method": ""}, {"method": "MP2"}, {}]
+ counts = analytics._counts_by(records, "method")
+ assert counts == {"B3LYP": 1, "MP2": 1}
diff --git a/tests/test_bug_regressions_2026_05_25.py b/tests/test_bug_regressions_2026_05_25.py
new file mode 100644
index 0000000..368d1e5
--- /dev/null
+++ b/tests/test_bug_regressions_2026_05_25.py
@@ -0,0 +1,184 @@
+"""Regression tests for the four bugs reported in session 55 (2026-05-25).
+
+Bug A — GPU-run results saved with no MO data
+ ``_run_session_calc_body`` extracts ``mf.mo_energy`` / ``mo_coeff`` /
+ ``mo_occ`` via ``numpy.array(...)``. With a GPU-offloaded ``mf`` those
+ are CuPy arrays — numpy refuses implicit device transfers, so the
+ bare ``except`` swallowed a ``TypeError`` and the SessionResult
+ shipped with all MO fields ``None``. That made ``save_orbitals``
+ no-op and history replay of any GPU-run SP/GeoOpt rendered "Not
+ available" in Energies + Isosurface panels.
+
+Bug B1/B2/B3 — Calculate-tab molecule viewer used the
+ ``with self.viz_output: display_molecule(...)`` pattern. Symptoms:
+ initial render wouldn't appear after a PubChem search (B1);
+ PlotlyMol RDKit valence errors spilled out as red logger lines
+ around the viewer (B2); generic ``logger.info`` lines from the
+ renderer were captured into the Output widget (B3). Fix migrates
+ to ``_refresh_calc_mol_viewer`` which renders HTML outside any
+ Output context and atomic-swaps into ``viz_output``.
+
+Bug C — Frequency pre-opt on benzene crashed the whole calc with
+ "singular matrix" in PySCF's ``cho_solve``. Three pre-opt sites
+ in ``_do_run`` now ``try/except`` around ``optimize_geometry`` and
+ fall back to the user-provided geometry on failure.
+"""
+
+from __future__ import annotations
+
+import inspect
+
+import numpy as np
+import pytest
+
+# =====================================================================
+# Bug A — cupy-aware MO array extraction in session_calc
+# =====================================================================
+
+
+class _FakeCupyArray:
+ """A minimal stand-in for a CuPy array: numpy refuses to convert it
+ directly, but it exposes ``.get()`` (sync device→host copy) and
+ its ``type(...).__module__`` starts with ``"cupy"`` — the two
+ properties the fix probes."""
+
+ def __init__(self, host_data):
+ self._host = np.asarray(host_data)
+
+ def get(self):
+ return self._host
+
+ # numpy.asarray on a non-array-like falls back to object dtype unless
+ # we make the conversion explicitly fail like the real cupy.
+ def __array__(self, dtype=None):
+ raise TypeError(
+ "Implicit conversion to a NumPy array is not allowed. "
+ "Please use `.get()` to construct a NumPy array explicitly."
+ )
+
+
+# Pin __module__ so the type probe matches.
+_FakeCupyArray.__module__ = "cupy._core.core"
+
+
+def _extract_to_numpy(arr):
+ """Re-implementation of the closure to keep the test independent of
+ session_calc's import side effects. Mirrors the production helper:
+ detect CuPy by ``.get()`` callable + module prefix, otherwise pass
+ through ``np.asarray``."""
+ if arr is None:
+ return None
+ get = getattr(arr, "get", None)
+ if callable(get) and type(arr).__module__.startswith("cupy"):
+ return np.asarray(get())
+ return np.asarray(arr)
+
+
+class TestBugA_CupyAwareConversion:
+ def test_none_passes_through(self):
+ assert _extract_to_numpy(None) is None
+
+ def test_numpy_array_passes_through(self):
+ a = np.array([1.0, 2.0, 3.0])
+ out = _extract_to_numpy(a)
+ np.testing.assert_array_equal(out, a)
+
+ def test_cupy_like_is_converted_via_get(self):
+ fake = _FakeCupyArray([4.0, 5.0, 6.0])
+ out = _extract_to_numpy(fake)
+ assert isinstance(out, np.ndarray)
+ np.testing.assert_array_equal(out, [4.0, 5.0, 6.0])
+
+ def test_bare_numpy_conversion_of_cupy_like_raises(self):
+ # Sanity: the production fix is needed precisely because the
+ # naive call (pre-fix code) raises. If this test ever stops
+ # raising, the regression guard is moot.
+ fake = _FakeCupyArray([1.0])
+ with pytest.raises(TypeError):
+ np.array(fake)
+
+ def test_production_helper_uses_to_numpy_array(self):
+ # Confirm the actual session_calc body contains the
+ # ``_to_numpy_array`` helper (so a future refactor that drops it
+ # breaks this test loudly).
+ from quantui import session_calc
+
+ src = inspect.getsource(session_calc)
+ assert "_to_numpy_array" in src
+ assert "cupy" in src.lower()
+
+
+# =====================================================================
+# Bug B — Calculate-tab molecule viewer uses atomic HTML swap
+# =====================================================================
+
+
+class TestBugB_AtomicMolViewerSwap:
+ def test_app_has_refresh_calc_mol_viewer(self):
+ from quantui.app import QuantUIApp
+
+ app = QuantUIApp()
+ assert hasattr(app, "_refresh_calc_mol_viewer")
+
+ def test_refresh_calc_mol_viewer_handles_none_molecule(self):
+ from quantui.app import QuantUIApp
+
+ app = QuantUIApp()
+ # No molecule loaded yet → must return cleanly, not raise.
+ assert app._molecule is None
+ app._refresh_calc_mol_viewer() # should not raise
+
+ def test_calc_tab_does_not_use_with_viz_output_display_pattern(self):
+ # The BUG.7 pattern (Analysis tab) and this bug-batch's fix both
+ # forbid the ``with self.viz_output: display_molecule(...)``
+ # idiom. Verify no occurrence remains in the migrated section.
+ from quantui import app as _app_mod
+
+ src = inspect.getsource(_app_mod)
+ # ``_display_molecule`` is the imported alias; the fix removed
+ # all 5 of its call sites. The module may still import it for
+ # backwards compat, so we only check that the buggy
+ # idiom (``with self.viz_output:`` followed by a
+ # ``_display_molecule`` call) is gone.
+ idx = 0
+ while True:
+ idx = src.find("with self.viz_output:", idx)
+ if idx < 0:
+ break
+            # Look at the next ~400 characters for a _display_molecule
+ # call. If we find one, the bad idiom is still present.
+ window = src[idx : idx + 400]
+ assert "_display_molecule(" not in window, (
+ "Found ``with self.viz_output: _display_molecule(...)`` "
+ "idiom; should be migrated to _refresh_calc_mol_viewer "
+ "(BUG B1/B2/B3)."
+ )
+ idx += 1
+
+
+# =====================================================================
+# Bug C — Pre-opt failures fall back to user geometry instead of crashing
+# =====================================================================
+
+
+class TestBugC_PreoptFailureFallback:
+ def test_freq_preopt_block_has_try_except(self):
+ # Confirm the source contains the new fallback paths. Reading
+ # the source is the most direct way to assert this; running the
+ # actual freq calc would require PySCF.
+ from quantui import app as _app_mod
+
+ src = inspect.getsource(_app_mod)
+ assert "Pre-optimisation failed" in src
+ # The exception variable name (_pre_exc) is unique to the new
+ # try/except wrapping all three pre-opt sites.
+ assert src.count("except Exception as _pre_exc") >= 3
+
+ def test_freq_preopt_fallback_uses_user_geometry(self):
+ # The fallback message should make it clear the calc continues
+ # with the user-provided geometry — that's the contract the bug
+ # report asked for.
+ from quantui import app as _app_mod
+
+ src = inspect.getsource(_app_mod)
+ assert "user-provided geometry" in src or "seed geometry as-is" in src
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..c2f24ca
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,301 @@
+"""Tests for the ``quantui`` CLI (``quantui/cli.py``).
+
+All tests are platform-independent. The CLI reads from
+``~/.quantui/logs/event_log.jsonl`` by default, so each test overrides
+``QUANTUI_LOG_DIR`` via ``monkeypatch`` to point at a ``tmp_path`` so we
+never touch the real user log.
+"""
+
+from __future__ import annotations
+
+import io
+import json
+import sys
+
+import pytest
+
+from quantui import cli
+
+
+@pytest.fixture
+def isolated_log_dir(tmp_path, monkeypatch):
+ """Point QuantUI's event log at a fresh tmp directory for one test."""
+ monkeypatch.setenv("QUANTUI_LOG_DIR", str(tmp_path))
+ return tmp_path
+
+
+def _write_event_log(log_dir, events):
+ path = log_dir / "event_log.jsonl"
+ with path.open("w", encoding="utf-8") as fh:
+ for ev in events:
+ fh.write(json.dumps(ev) + "\n")
+ return path
+
+
+def _capture(argv):
+ """Run cli.main with argv and return (exit_code, stdout, stderr)."""
+ out, err = io.StringIO(), io.StringIO()
+ real_out, real_err = sys.stdout, sys.stderr
+ sys.stdout, sys.stderr = out, err
+ try:
+ rc = cli.main(argv)
+ finally:
+ sys.stdout, sys.stderr = real_out, real_err
+ return rc, out.getvalue(), err.getvalue()
+
+
+class TestLogTail:
+ def test_missing_log_returns_zero_with_msg(self, isolated_log_dir):
+ rc, out, err = _capture(["log", "tail"])
+ assert rc == 0
+ assert out == ""
+ assert "no event log" in err
+
+ def test_empty_log_returns_zero_with_msg(self, isolated_log_dir):
+ _write_event_log(isolated_log_dir, [])
+ rc, out, err = _capture(["log", "tail"])
+ assert rc == 0
+ assert out == ""
+ assert "empty" in err
+
+ def test_default_n_is_20(self, isolated_log_dir):
+ events = [
+ {
+ "timestamp": f"2026-05-25T12:00:{i:02d}+00:00",
+ "event": "tick",
+ "message": f"msg-{i}",
+ }
+ for i in range(30)
+ ]
+ _write_event_log(isolated_log_dir, events)
+ rc, out, _ = _capture(["log", "tail"])
+ assert rc == 0
+ # 20 lines printed; verify the LAST 20 are kept (msg-10..msg-29).
+ lines = [ln for ln in out.splitlines() if ln.strip()]
+ assert len(lines) == 20
+ assert "msg-10" in lines[0]
+ assert "msg-29" in lines[-1]
+
+ def test_n_flag_overrides(self, isolated_log_dir):
+ events = [
+ {
+ "timestamp": f"2026-05-25T12:00:{i:02d}+00:00",
+ "event": "tick",
+ "message": f"m{i}",
+ }
+ for i in range(10)
+ ]
+ _write_event_log(isolated_log_dir, events)
+ rc, out, _ = _capture(["log", "tail", "-n", "3"])
+ assert rc == 0
+ lines = [ln for ln in out.splitlines() if ln.strip()]
+ assert len(lines) == 3
+ assert "m7" in lines[0]
+ assert "m9" in lines[-1]
+
+ def test_extras_appended_as_kv(self, isolated_log_dir):
+ events = [
+ {
+ "timestamp": "2026-05-25T12:00:00+00:00",
+ "event": "calc_done",
+ "message": "B3LYP/STO-3G on H2O",
+ "elapsed_ms": 4321,
+ "gpu_used": True,
+ },
+ ]
+ _write_event_log(isolated_log_dir, events)
+ rc, out, _ = _capture(["log", "tail"])
+ assert rc == 0
+ # Both extras appear in k=v form.
+ assert "elapsed_ms=4321" in out
+ assert "gpu_used=True" in out
+ # Core fields appear once.
+ assert "calc_done" in out
+ assert "B3LYP/STO-3G on H2O" in out
+
+
+class TestCliParser:
+ def test_no_args_exits_nonzero(self, isolated_log_dir):
+ # argparse exits 2 when a required subparser is missing.
+ with pytest.raises(SystemExit) as exc:
+ _capture([])
+ assert exc.value.code == 2
+
+ def test_unknown_subcommand_exits_nonzero(self, isolated_log_dir):
+ with pytest.raises(SystemExit) as exc:
+ _capture(["bogus"])
+ assert exc.value.code == 2
+
+ def test_log_without_subcommand_exits_nonzero(self, isolated_log_dir):
+ with pytest.raises(SystemExit) as exc:
+ _capture(["log"])
+ assert exc.value.code == 2
+
+
+def test_fmt_event_renders_minimal_record():
+ line = cli._fmt_event(
+ {
+ "timestamp": "2026-05-25T12:00:00+00:00",
+ "event": "startup",
+ "message": "QuantUI 0.2.0",
+ }
+ )
+ assert "2026-05-25T12:00:00+00:00" in line
+ assert "startup" in line
+ assert "QuantUI 0.2.0" in line
+
+
+def test_fmt_event_handles_missing_fields():
+ # Should not raise even on a malformed record.
+ line = cli._fmt_event({})
+ assert "?" in line # default event
+
+
+class TestGpuCheck:
+ """`quantui gpu check` — exit 0 when GPU available, 1 otherwise."""
+
+ def test_disabled_via_env_var(self, monkeypatch, isolated_log_dir):
+ monkeypatch.setenv("QUANTUI_DISABLE_GPU", "1")
+ rc, out, err = _capture(["gpu", "check"])
+ assert rc == 1
+ assert "not available" in err
+ assert "QUANTUI_DISABLE_GPU" in err
+
+ def test_reports_missing_gpu4pyscf(self, monkeypatch, isolated_log_dir):
+        # Pretend gpu4pyscf isn't installed. The GPU detector is
+        # @lru_cached, so clear its cache first, then make the import
+        # fail by patching builtins.__import__ for that one module.
+ import quantui.gpu_offload as _gpuo
+
+ _gpuo.is_gpu_available.cache_clear()
+
+ # Make is_gpu_available return (False, None) and arrange gpu4pyscf
+ # import to fail inside the CLI's reason-probe path.
+ def _fake_import(name, *args, **kwargs):
+ if name == "gpu4pyscf":
+ raise ImportError("simulated")
+ return _real_import(name, *args, **kwargs)
+
+ import builtins as _bi
+
+ _real_import = _bi.__import__
+ monkeypatch.setattr(_bi, "__import__", _fake_import)
+ rc, out, err = _capture(["gpu", "check"])
+ assert rc == 1
+ assert "gpu4pyscf not installed" in err
+
+ def test_happy_path_when_gpu_detected(self, monkeypatch, isolated_log_dir):
+ import quantui.gpu_offload as _gpuo
+
+ # Replace the lru_cache-decorated function with a plain callable
+ # that mimics the (.cache_clear()) attribute the CLI calls.
+ def _fake():
+ return (True, "NVIDIA Test GPU")
+
+ _fake.cache_clear = lambda: None # type: ignore[attr-defined]
+ monkeypatch.setattr(_gpuo, "is_gpu_available", _fake)
+ rc, out, err = _capture(["gpu", "check"])
+ assert rc == 0
+ assert "GPU offload available" in out
+ assert "NVIDIA Test GPU" in out
+
+
+class TestAnalyticsBuild:
+ """`quantui analytics build` — wraps analytics.build_dashboard."""
+
+ def test_empty_perf_log_returns_zero_with_msg(self, isolated_log_dir):
+ rc, out, err = _capture(["analytics", "build"])
+ assert rc == 0
+ assert "perf log is empty" in err
+
+ def test_writes_file_at_explicit_path(self, isolated_log_dir, tmp_path):
+ # Seed perf log so the dashboard has data.
+ perf_path = isolated_log_dir / "perf_log.jsonl"
+ perf_path.write_text(
+ json.dumps(
+ {
+ "timestamp": "2026-05-25T12:00:00+00:00",
+ "formula": "H2O",
+ "method": "B3LYP",
+ "basis": "STO-3G",
+ "elapsed_s": 1.0,
+ "converged": True,
+ "gpu_used": True,
+ }
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+ target = tmp_path / "report.html"
+ rc, out, _ = _capture(["analytics", "build", "-o", str(target)])
+ assert rc == 0
+ assert target.exists()
+ assert "Wrote" in out
+ assert str(target) in out
+
+ def test_open_flag_calls_webbrowser(self, isolated_log_dir, tmp_path, monkeypatch):
+ # Seed perf log with a single record so build succeeds.
+ perf_path = isolated_log_dir / "perf_log.jsonl"
+ perf_path.write_text(
+ json.dumps(
+ {
+ "timestamp": "2026-05-25T12:00:00+00:00",
+ "formula": "H2O",
+ "method": "B3LYP",
+ "basis": "STO-3G",
+ "elapsed_s": 1.0,
+ "converged": True,
+ }
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+ target = tmp_path / "report.html"
+
+ # Capture webbrowser.open invocations rather than launching one.
+ opened_urls: list[str] = []
+ import webbrowser as _wb
+
+ def _fake_open(url, *_args, **_kwargs):
+ opened_urls.append(url)
+ return True
+
+ monkeypatch.setattr(_wb, "open", _fake_open)
+
+ rc, _, _ = _capture(["analytics", "build", "-o", str(target), "--open"])
+ assert rc == 0
+ assert target.exists()
+ # The URL should be a file:// URI pointing at the written report.
+ assert len(opened_urls) == 1
+ assert opened_urls[0].startswith("file:")
+ assert "report.html" in opened_urls[0]
+
+ def test_open_flag_handles_browser_failure_gracefully(
+ self, isolated_log_dir, tmp_path, monkeypatch
+ ):
+ perf_path = isolated_log_dir / "perf_log.jsonl"
+ perf_path.write_text(
+ json.dumps(
+ {
+ "timestamp": "2026-05-25T12:00:00+00:00",
+ "formula": "H2O",
+ "method": "B3LYP",
+ "basis": "STO-3G",
+ "elapsed_s": 1.0,
+ "converged": True,
+ }
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+ target = tmp_path / "report.html"
+
+ import webbrowser as _wb
+
+ # Headless systems can return False from webbrowser.open.
+ monkeypatch.setattr(_wb, "open", lambda *a, **k: False)
+
+ rc, _, err = _capture(["analytics", "build", "-o", str(target), "--open"])
+ # Exit code must remain 0 — the dashboard was written successfully.
+ assert rc == 0
+ assert "could not auto-open" in err
From e9837d525c83899a3893bdf376dbfcaca34e03c5 Mon Sep 17 00:00:00 2001
From: NCCU-Schultz-Lab
Date: Mon, 25 May 2026 11:01:19 -0400
Subject: [PATCH 21/33] Add WSL-aware browser opener and tests
Introduce WSL-aware opening logic by adding _is_wsl and _open_in_browser, and update _cmd_analytics_build to use it (falling back gracefully if open fails). Update docs to describe WSL behavior and recommend wslu/explorer.exe as fallbacks. Refactor tests with a _seed_perf_log helper and add TestWslAwareOpener covering wslview/explorer.exe ordering, failure cases, and the non-WSL webbrowser path.
---
docs/CLI.md | 17 ++++--
quantui/cli.py | 89 +++++++++++++++++++++++-----
tests/test_cli.py | 146 ++++++++++++++++++++++++++++++++++++++++------
3 files changed, 214 insertions(+), 38 deletions(-)
diff --git a/docs/CLI.md b/docs/CLI.md
index 70e6338..a835585 100644
--- a/docs/CLI.md
+++ b/docs/CLI.md
@@ -185,7 +185,7 @@ browser tab.
| Flag | Default | Description |
| --- | --- | --- |
| `-o PATH`, `--output PATH` | `~/.quantui/dashboard.html` | Output HTML path |
-| `--open` | off | After writing, open the dashboard in the default browser |
+| `--open` | off | After writing, open the dashboard in the default browser (WSL-aware — uses `wslview` / `explorer.exe` on WSL) |
### Examples
@@ -209,9 +209,18 @@ quantui analytics build -o ~/projects/lab-share/quantui-report.html --open
Wrote /home/schul/.quantui/dashboard.html
```
-With `--open`, the CLI then attempts `webbrowser.open(...)`. If your
-environment is headless (e.g. WSL without a configured `BROWSER`
-variable) you'll see an additional note:
+With `--open`, the CLI picks the right opener for your environment:
+
+- **WSL**: tries `wslview` first (bundled with the `wslu` package),
+ then falls back to `explorer.exe`. Both delegate to your **Windows
+ default browser** via WSL interop — no Linux-side browser install
+ needed. If neither is available, `sudo apt install wslu` fixes it
+ in one step.
+- **Linux native**: stdlib `webbrowser.open` (which uses `xdg-open`).
+- **macOS / Windows native**: stdlib `webbrowser.open`.
+
+If no opener succeeds — e.g. a headless container with no display —
+you'll see:
```
Wrote /home/schul/.quantui/dashboard.html
diff --git a/quantui/cli.py b/quantui/cli.py
index c143088..9d8b5ad 100644
--- a/quantui/cli.py
+++ b/quantui/cli.py
@@ -138,6 +138,74 @@ def _cmd_gpu_check(args: argparse.Namespace) -> int:
return 1
+def _is_wsl() -> bool:
+ """Return True when running inside Windows Subsystem for Linux.
+
+ Checks the cheap signal first (``WSL_DISTRO_NAME`` env var, set on
+ every WSL2 distro) before falling back to a ``/proc/version`` read
+ (covers WSL1 + edge cases where the env var is unset). Returns
+ ``False`` on any IO error rather than raising.
+ """
+ import os as _os
+
+ if _os.environ.get("WSL_DISTRO_NAME"):
+ return True
+ try:
+ with open("/proc/version", encoding="utf-8", errors="ignore") as fh:
+ return "microsoft" in fh.read().lower()
+ except OSError:
+ return False
+
+
+def _open_in_browser(path: Path) -> tuple[bool, Optional[str]]:
+ """Cross-platform "open this file in the user's browser".
+
+ On WSL, ``webbrowser.open`` ultimately calls ``xdg-open`` which fails
+ on minimal Ubuntu installs ("no method available for opening...") —
+ there's no native Linux browser and xdg-open doesn't know to bridge
+ to the Windows host. So on WSL we prefer the WSL-aware openers in
+ order: ``wslview`` (canonical xdg-open replacement, from the ``wslu``
+ package), then ``explorer.exe`` (always available via WSL interop).
+
+ Off WSL, defer to Python's stdlib ``webbrowser`` module which has the
+ right per-platform handling for macOS / native Linux / Windows.
+
+ Returns ``(success, tool_name)``. ``tool_name`` is ``None`` when no
+ opener succeeded.
+ """
+ import subprocess
+
+ if _is_wsl():
+        # ``wslview`` takes a Linux path directly; ``explorer.exe`` takes
+        # a Windows path or a Linux file:// URL, but the Linux path also
+        # works via WSL interop, so both get the path as-is. NOTE(review):
+        # explorer.exe may exit non-zero even on success — confirm rc check.
+ for tool in ("wslview", "explorer.exe"):
+ try:
+ rc = subprocess.run(
+ [tool, str(path)],
+ check=False,
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ ).returncode
+ if rc == 0:
+ return (True, tool)
+ except FileNotFoundError:
+ continue
+ except Exception:
+ continue
+ return (False, None)
+
+ import webbrowser
+
+ try:
+ if webbrowser.open(path.as_uri()):
+ return (True, "webbrowser")
+ except Exception:
+ pass
+ return (False, None)
+
+
def _cmd_analytics_build(args: argparse.Namespace) -> int:
"""Build the HTML analytics dashboard from the perf log."""
from quantui.analytics import build_dashboard
@@ -152,22 +220,13 @@ def _cmd_analytics_build(args: argparse.Namespace) -> int:
return 0
print(f"Wrote {result}")
if getattr(args, "open_after", False):
- # ``webbrowser.open`` accepts a file:// URL. ``Path.as_uri()`` builds
- # the cross-platform form. Failure (e.g. headless WSL with no
- # ``BROWSER`` env var, no $DISPLAY) is non-fatal — the path was
- # already printed above so the user can copy-paste it manually.
- import webbrowser
-
- try:
- opened = webbrowser.open(result.as_uri())
- if not opened:
- print(
- f"(could not auto-open browser — open {result} manually)",
- file=sys.stderr,
- )
- except Exception as exc:
+ # Cross-platform open: WSL → wslview / explorer.exe; otherwise
+ # stdlib webbrowser. Failure is non-fatal (the path was already
+ # printed) so users can always copy-paste manually.
+ opened, tool = _open_in_browser(result)
+ if not opened:
print(
- f"(open failed: {exc}; open {result} manually)",
+ f"(could not auto-open browser — open {result} manually)",
file=sys.stderr,
)
return 0
diff --git a/tests/test_cli.py b/tests/test_cli.py
index c2f24ca..cad6083 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -233,9 +233,9 @@ def test_writes_file_at_explicit_path(self, isolated_log_dir, tmp_path):
assert "Wrote" in out
assert str(target) in out
- def test_open_flag_calls_webbrowser(self, isolated_log_dir, tmp_path, monkeypatch):
- # Seed perf log with a single record so build succeeds.
- perf_path = isolated_log_dir / "perf_log.jsonl"
+ def _seed_perf_log(self, log_dir):
+ """Helper: write one perf record so build_dashboard has data."""
+ perf_path = log_dir / "perf_log.jsonl"
perf_path.write_text(
json.dumps(
{
@@ -250,9 +250,15 @@ def test_open_flag_calls_webbrowser(self, isolated_log_dir, tmp_path, monkeypatc
+ "\n",
encoding="utf-8",
)
+
+ def test_open_flag_calls_webbrowser_off_wsl(
+ self, isolated_log_dir, tmp_path, monkeypatch
+ ):
+ # Force the non-WSL branch so the test runs the webbrowser path.
+ monkeypatch.setattr(cli, "_is_wsl", lambda: False)
+ self._seed_perf_log(isolated_log_dir)
target = tmp_path / "report.html"
- # Capture webbrowser.open invocations rather than launching one.
opened_urls: list[str] = []
import webbrowser as _wb
@@ -273,21 +279,8 @@ def _fake_open(url, *_args, **_kwargs):
def test_open_flag_handles_browser_failure_gracefully(
self, isolated_log_dir, tmp_path, monkeypatch
):
- perf_path = isolated_log_dir / "perf_log.jsonl"
- perf_path.write_text(
- json.dumps(
- {
- "timestamp": "2026-05-25T12:00:00+00:00",
- "formula": "H2O",
- "method": "B3LYP",
- "basis": "STO-3G",
- "elapsed_s": 1.0,
- "converged": True,
- }
- )
- + "\n",
- encoding="utf-8",
- )
+ monkeypatch.setattr(cli, "_is_wsl", lambda: False)
+ self._seed_perf_log(isolated_log_dir)
target = tmp_path / "report.html"
import webbrowser as _wb
@@ -299,3 +292,118 @@ def test_open_flag_handles_browser_failure_gracefully(
# Exit code must remain 0 — the dashboard was written successfully.
assert rc == 0
assert "could not auto-open" in err
+
+
+class TestWslAwareOpener:
+ """`_open_in_browser` chooses wslview / explorer.exe on WSL."""
+
+ def test_is_wsl_detects_env_var(self, monkeypatch):
+ monkeypatch.setenv("WSL_DISTRO_NAME", "Ubuntu")
+ assert cli._is_wsl() is True
+
+ def test_is_wsl_false_when_env_and_proc_missing(self, monkeypatch):
+ # Both signals absent → must return False, not raise.
+ monkeypatch.delenv("WSL_DISTRO_NAME", raising=False)
+ import builtins
+
+ original = builtins.open
+
+ def _fail_open(*args, **kwargs):
+ if args and args[0] == "/proc/version":
+ raise OSError("simulated absence")
+ return original(*args, **kwargs)
+
+ monkeypatch.setattr(builtins, "open", _fail_open)
+ assert cli._is_wsl() is False
+
+ def test_wsl_prefers_wslview(self, monkeypatch, tmp_path):
+ """On WSL, wslview is tried first and wins when it returns 0."""
+ monkeypatch.setattr(cli, "_is_wsl", lambda: True)
+
+ calls: list[list[str]] = []
+
+ class _FakeRun:
+ def __init__(self, returncode):
+ self.returncode = returncode
+
+ def _fake_subprocess_run(cmd, **_kwargs):
+ calls.append(list(cmd))
+ return _FakeRun(0)
+
+ import subprocess
+
+ monkeypatch.setattr(subprocess, "run", _fake_subprocess_run)
+ target = tmp_path / "report.html"
+ target.write_text("x", encoding="utf-8")
+
+ ok, tool = cli._open_in_browser(target)
+ assert ok is True
+ assert tool == "wslview"
+ assert len(calls) == 1
+ assert calls[0][0] == "wslview"
+ assert str(target) in calls[0]
+
+ def test_wsl_falls_back_to_explorer_when_wslview_missing(
+ self, monkeypatch, tmp_path
+ ):
+ """When wslview isn't installed (FileNotFoundError), explorer.exe runs."""
+ monkeypatch.setattr(cli, "_is_wsl", lambda: True)
+
+ calls: list[str] = []
+
+ class _FakeRun:
+ def __init__(self, returncode):
+ self.returncode = returncode
+
+ def _fake_subprocess_run(cmd, **_kwargs):
+ tool = cmd[0]
+ calls.append(tool)
+ if tool == "wslview":
+ raise FileNotFoundError("not installed")
+ return _FakeRun(0)
+
+ import subprocess
+
+ monkeypatch.setattr(subprocess, "run", _fake_subprocess_run)
+ target = tmp_path / "report.html"
+ target.write_text("x", encoding="utf-8")
+
+ ok, tool = cli._open_in_browser(target)
+ assert ok is True
+ assert tool == "explorer.exe"
+ assert calls == ["wslview", "explorer.exe"]
+
+ def test_wsl_returns_false_when_all_openers_fail(self, monkeypatch, tmp_path):
+ monkeypatch.setattr(cli, "_is_wsl", lambda: True)
+
+ import subprocess
+
+ def _fake_run(cmd, **_kwargs):
+ raise FileNotFoundError(f"{cmd[0]} not installed")
+
+ monkeypatch.setattr(subprocess, "run", _fake_run)
+ target = tmp_path / "report.html"
+ target.write_text("x", encoding="utf-8")
+
+ ok, tool = cli._open_in_browser(target)
+ assert ok is False
+ assert tool is None
+
+ def test_non_wsl_uses_webbrowser(self, monkeypatch, tmp_path):
+ monkeypatch.setattr(cli, "_is_wsl", lambda: False)
+
+ opened: list[str] = []
+ import webbrowser
+
+ def _fake_open(url, *_args, **_kwargs):
+ opened.append(url)
+ return True
+
+ monkeypatch.setattr(webbrowser, "open", _fake_open)
+ target = tmp_path / "report.html"
+ target.write_text("x", encoding="utf-8")
+
+ ok, tool = cli._open_in_browser(target)
+ assert ok is True
+ assert tool == "webbrowser"
+ assert opened[0].startswith("file:")
From 49d74400cf9e1d7e525b4a6681d3463d59b67d8e Mon Sep 17 00:00:00 2001
From: NCCU-Schultz-Lab
Date: Mon, 25 May 2026 11:30:54 -0400
Subject: [PATCH 22/33] Surface errors: add logging and CI check
Replace many silent broad-except/pass patterns with logged diagnostics and explicit noqa justifications across calculation modules, and add a CI lint test to prevent new silent broad-excepts in high-risk files.
Changes:
- quantui/freq_calc.py, tddft_calc.py, session_calc.py, optimizer.py, nmr_calc.py, gpu_offload.py: import/create module loggers and replace bare except/pass blocks with logger.debug/logger.warning calls (or add ``# noqa: BLE001`` where the silence is explicitly justified). Add a telemetry log_event call in session_calc when MO extraction fails to surface regressions. Improve messaging for GPU import/probe and mf.to_gpu() fallbacks.
- quantui/gpu_offload.py: log non-ImportError import failures, cupy probe errors, and GPU offload migration failures so offload fallbacks are diagnosable.
- tests/test_code_quality.py: introduce _HIGH_RISK_FILES set and add test_no_silent_broad_except_in_high_risk_files to fail CI if a new broad-except+pass appears in a high-risk file without a nearby log call or a ``# noqa: BLE001`` justification. Also add a meta-guard test to ensure the new check flags a known-bad example.
Rationale: avoid silently swallowing exceptions that can produce subtly incorrect results (bug class causing missing MO arrays / energies), and make it easier to diagnose offload/import issues via logs. The tests enforce the error-surfacing convention for critical code paths.
---
quantui/freq_calc.py | 22 ++++---
quantui/gpu_offload.py | 21 +++++--
quantui/nmr_calc.py | 11 ++--
quantui/optimizer.py | 17 +++--
quantui/session_calc.py | 47 +++++++++++---
quantui/tddft_calc.py | 8 +--
tests/test_code_quality.py | 126 +++++++++++++++++++++++++++++++++++++
7 files changed, 217 insertions(+), 35 deletions(-)
diff --git a/quantui/freq_calc.py b/quantui/freq_calc.py
index 9789407..4627fcd 100644
--- a/quantui/freq_calc.py
+++ b/quantui/freq_calc.py
@@ -208,7 +208,7 @@ def _status(msg: str) -> None:
"""Emit a status marker line consumable by QuantUI's log capture."""
try:
stream.write(f"\n[QuantUI_STATUS] {msg}\n")
- except Exception:
+ except Exception: # noqa: BLE001 — cleanup (stream may be closed)
pass
# ── Build Mole object ────────────────────────────────────────────────────
@@ -261,8 +261,8 @@ def _status(msg: str) -> None:
homo_lumo_gap_ev = float(
(mo_e_ref[n_occ] - mo_e_ref[n_occ - 1]) * HARTREE_TO_EV
)
- except Exception:
- pass
+ except Exception as exc:
+ logger.debug("HOMO-LUMO gap extraction failed in freq calc: %s", exc)
# ── MO data for orbital energy diagram (best-effort) ─────────────────────
mo_energy_hartree: Optional[List] = None
@@ -278,8 +278,15 @@ def _status(msg: str) -> None:
mo_energy_hartree = _np_mo.asarray(_moe, dtype=float).tolist()
mo_occ_list = _np_mo.asarray(_moo, dtype=float).tolist()
pyscf_mol_atom = [(str(s), list(map(float, c))) for s, c in mol._atom]
- except Exception:
- pass
+ except Exception as exc:
+ # Same class as session_calc bug-A: silent failure here ships
+ # a FreqResult with no MO data, breaking the Energies panel on
+ # history replay. Log to surface in the Log tab.
+ logger.warning(
+ "MO data extraction failed in freq calc for %s: %s",
+ molecule.get_formula(),
+ exc,
+ )
# ── Hessian + frequency analysis ─────────────────────────────────────────
frequencies_cm1: List[float] = []
@@ -329,7 +336,8 @@ def _status(msg: str) -> None:
if nm.ndim == 2:
nm = nm.reshape(n_modes_out, n_atoms, 3)
displacements = nm.tolist()
- except Exception:
+ except Exception as exc:
+ logger.debug("Normal-mode displacement extraction failed: %s", exc)
displacements = None
# Numerical IR intensities via finite-difference dipole derivatives.
@@ -614,7 +622,7 @@ def _tv(v):
if progress_stream is not None:
try:
progress_stream.write(f"\n⚠ Hessian failed: {exc}\n")
- except Exception:
+ except Exception: # noqa: BLE001 — cleanup (stream may be closed)
pass
return FreqResult(
diff --git a/quantui/gpu_offload.py b/quantui/gpu_offload.py
index a7b05d9..79b1f2e 100644
--- a/quantui/gpu_offload.py
+++ b/quantui/gpu_offload.py
@@ -28,10 +28,13 @@
from __future__ import annotations
+import logging
import os
from functools import lru_cache
from typing import Any, Optional, Tuple
+logger = logging.getLogger(__name__)
+
# Methods for which gpu4pyscf has zero or known-broken support. ``CCSD(T)``
# is documented as unsupported in the gpu4pyscf README; double hybrids are
# also listed but QuantUI doesn't expose any double-hybrid methods today.
@@ -65,10 +68,13 @@ def is_gpu_available() -> Tuple[bool, Optional[str]]:
import gpu4pyscf # noqa: F401
except ImportError:
return (False, None)
- except Exception:
+ except (
+ Exception
+ ) as exc: # noqa: BLE001 — fall-back to CPU on any import-chain breakage
# Any other import-time error (broken cupy → broken gpu4pyscf
# import-chain, mismatched cuda libs, etc.) is treated as
- # "no GPU available".
+ # "no GPU available". Log so `quantui log tail` reveals why.
+ logger.debug("gpu4pyscf import raised non-ImportError: %s", exc)
return (False, None)
try:
@@ -84,7 +90,10 @@ def is_gpu_available() -> Tuple[bool, Optional[str]]:
else:
name = str(name_raw)
return (True, name)
- except Exception:
+ except (
+ Exception
+ ) as exc: # noqa: BLE001 — fall-back to CPU on any cupy probe failure
+ logger.debug("cupy device probe failed: %s", exc)
return (False, None)
@@ -119,8 +128,10 @@ def try_to_gpu(mf: Any, method_upper: str) -> Tuple[Any, bool, Optional[str]]:
try:
mf_gpu = mf.to_gpu()
return (mf_gpu, True, gpu_name)
- except Exception:
+ except Exception as exc:
# gpu4pyscf migration can fail for many reasons (unsupported method
# variant, density-fitting requirement, basis-set quirk). On any
- # failure we silently fall back to CPU — the calc still runs.
+ # failure we fall back to CPU — the calc still runs. Log so the
+ # user can `quantui log tail` and see why offload didn't happen.
+ logger.warning("mf.to_gpu() migration failed, falling back to CPU: %s", exc)
return (mf, False, None)
diff --git a/quantui/nmr_calc.py b/quantui/nmr_calc.py
index 5cc2b92..2bb604e 100644
--- a/quantui/nmr_calc.py
+++ b/quantui/nmr_calc.py
@@ -15,12 +15,15 @@
from __future__ import annotations
+import logging
import sys
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple
from .molecule import Molecule
+logger = logging.getLogger(__name__)
+
@dataclass
class NMRResult:
@@ -198,8 +201,8 @@ def vind(mo1):
return vind
_prop_nmr_rhf.gen_vind = _fixed_gen_vind
- except Exception:
- pass
+ except (ImportError, AttributeError) as exc: # noqa: BLE001 — optional probe
+ logger.debug("pyscf.prop.nmr.rhf.gen_vind patch not applied: %s", exc)
# pyscf-properties 0.1.0 get_vxc_giao computes
# blksize = min(int(X*BLKSIZE)*BLKSIZE, ngrids)
@@ -285,8 +288,8 @@ def _fixed_get_vxc_giao(
return vmat - vmat.transpose(0, 2, 1)
_prop_nmr_rks.get_vxc_giao = _fixed_get_vxc_giao
- except Exception:
- pass
+ except (ImportError, AttributeError) as exc: # noqa: BLE001 — optional probe
+ logger.debug("pyscf.prop.nmr.rks.get_vxc_giao patch not applied: %s", exc)
try:
if method_upper == "RHF":
diff --git a/quantui/optimizer.py b/quantui/optimizer.py
index 360487f..42347f1 100644
--- a/quantui/optimizer.py
+++ b/quantui/optimizer.py
@@ -420,7 +420,7 @@ def optimize_geometry(
try:
e_ev = frame.get_potential_energy()
energies_hartree.append(e_ev / HARTREE_TO_EV)
- except Exception:
+ except Exception: # noqa: BLE001 — NaN fallback for missing per-frame energy
energies_hartree.append(float("nan"))
if not trajectory:
@@ -429,7 +429,7 @@ def optimize_geometry(
try:
e_ev = atoms.get_potential_energy()
energies_hartree = [e_ev / HARTREE_TO_EV]
- except Exception:
+ except Exception: # noqa: BLE001 — NaN fallback for missing final energy
energies_hartree = [float("nan")]
n_steps = max(0, len(trajectory) - 1)
@@ -452,8 +452,15 @@ def optimize_geometry(
_opt_mo_coeff = _np_mo.array(_last_mf.mo_coeff)
_opt_mol_atom = _last_atom_list
_opt_mol_basis = basis
- except Exception:
- pass
+ except Exception as exc:
+ # Bug-A class — silent failure here ships an OptimizationResult
+ # with no MO data, breaking Energies + Isosurface panels on
+ # history replay. (Same root-cause class as session_calc.)
+ logger.warning(
+ "Final-step MO extraction failed in optimizer for %s: %s",
+ molecule.get_formula(),
+ exc,
+ )
# Write a final MO summary to the progress stream (replaces per-step verbose output
# which is suppressed to avoid thousands of SCF lines for long optimizations).
@@ -499,7 +506,7 @@ def optimize_geometry(
_stream.write(
f" All MO energies (eV): {' '.join(f'{e:.3f}' for e in _e_ev_1d)}\n"
)
- except Exception:
+ except Exception: # noqa: BLE001 — cleanup (stream may be closed)
pass
logger.info(
diff --git a/quantui/session_calc.py b/quantui/session_calc.py
index 6f7756c..052417a 100644
--- a/quantui/session_calc.py
+++ b/quantui/session_calc.py
@@ -303,7 +303,12 @@ def _run_session_calc_body(
mf = _PCM(mf)
mf.with_solvent.eps = _eps
- except Exception:
+ except (
+ Exception
+ ) as exc: # noqa: BLE001 — optional probe (PySCF version drift)
+ logger.debug(
+ "PCM solvent unavailable, falling back to gas phase: %s", exc
+ )
if progress_stream is not None:
progress_stream.write(
"\n⚠ PCM solvent unavailable — running in gas phase.\n"
@@ -321,7 +326,7 @@ def _run_session_calc_body(
if gpu_used and progress_stream is not None:
try:
progress_stream.write(f"\n🚀 GPU offload active — running on {gpu_name}\n")
- except Exception:
+ except Exception: # noqa: BLE001 — cleanup (progress stream may be closed)
pass
# --- Run SCF ---
@@ -403,8 +408,8 @@ def _run_session_calc_body(
homo_lumo_gap_ev = float(
(mo_energy_ref[n_occ] - mo_energy_ref[n_occ - 1]) * HARTREE_TO_EV
)
- except Exception:
- pass # gap stays None — non-fatal
+ except Exception as exc:
+ logger.debug("HOMO-LUMO gap extraction failed (non-fatal): %s", exc)
mulliken_charges: Optional[List[float]] = None
dipole_moment_debye: Optional[float] = None
@@ -412,15 +417,15 @@ def _run_session_calc_body(
try:
_, chg = mf.mulliken_pop(verbose=0)
mulliken_charges = [float(c) for c in chg]
- except Exception:
- pass
+ except Exception as exc:
+ logger.debug("Mulliken population extraction failed: %s", exc)
try:
import numpy as _np2
dip = mf.dip_moment(verbose=0)
dipole_moment_debye = float(_np2.linalg.norm(dip))
- except Exception:
- pass
+ except Exception as exc:
+ logger.debug("Dipole moment extraction failed: %s", exc)
# MO arrays for orbital visualization (non-fatal if extraction fails).
#
@@ -461,8 +466,30 @@ def _to_numpy_array(arr: Any) -> Any:
for atom, coords in zip(molecule.atoms, molecule.coordinates)
]
_pyscf_mol_basis = basis
- except Exception:
- pass
+ except Exception as exc:
+ # Bug-A class (session 55): a silent failure here ships a
+ # SessionResult with mo_coeff=None, which makes save_orbitals
+ # no-op and breaks Energies + Isosurface panels on history
+ # replay. Surface to the event log so a future regression is
+ # visible in `quantui log tail` immediately.
+ logger.warning(
+ "MO array extraction failed for %s (%s/%s): %s",
+ molecule.get_formula(),
+ method,
+ basis,
+ exc,
+ )
+ try:
+ from . import calc_log as _clog
+
+ _clog.log_event(
+ "mo_array_extract_failed",
+ f"{method}/{basis} on {molecule.get_formula()}",
+ error=str(exc)[:300],
+ gpu_used=gpu_used,
+ )
+ except Exception: # noqa: BLE001 — telemetry self-guard
+ pass
formula = molecule.get_formula()
logger.info(
diff --git a/quantui/tddft_calc.py b/quantui/tddft_calc.py
index 0c4abd1..65567a9 100644
--- a/quantui/tddft_calc.py
+++ b/quantui/tddft_calc.py
@@ -205,7 +205,7 @@ def _run_tddft_calc_body(
"For a proper TD-DFT UV-Vis spectrum, use a DFT functional\n"
"such as B3LYP or PBE0 in the Method dropdown.\n\n"
)
- except Exception:
+ except Exception: # noqa: BLE001 — cleanup (stream may be closed)
pass
try:
@@ -236,8 +236,8 @@ def _run_tddft_calc_body(
homo_lumo_gap_ev = float(
(mo_e_ref[n_occ] - mo_e_ref[n_occ - 1]) * HARTREE_TO_EV
)
- except Exception:
- pass
+ except Exception as exc:
+ logger.debug("HOMO-LUMO gap extraction failed in TD-DFT calc: %s", exc)
# ── TD-DFT / TDHF ────────────────────────────────────────────────────────
excitation_energies_ev: List[float] = []
@@ -259,7 +259,7 @@ def _run_tddft_calc_body(
if progress_stream is not None:
try:
progress_stream.write(f"\n⚠ TD-DFT failed: {exc}\n")
- except Exception:
+ except Exception: # noqa: BLE001 — cleanup (stream may be closed)
pass
return TDDFTResult(
diff --git a/tests/test_code_quality.py b/tests/test_code_quality.py
index d9999d0..a695205 100644
--- a/tests/test_code_quality.py
+++ b/tests/test_code_quality.py
@@ -5,6 +5,29 @@
SRC = Path(__file__).parent.parent / "quantui"
+# Files where silent failure is most dangerous — numeric/data extraction
+# paths where a swallowed exception ships subtly-wrong results downstream
+# (bug-A class: cupy TypeError swallow in session_calc.py, session 55).
+#
+# Every broad-except + pass in these files must EITHER:
+# - have a log call (logger.*, calc_log.log_event, _clog.log_event)
+# within 10 lines after the ``except`` (window allows for multi-line
+# log messages — see session_calc.py:455 MO-extract for an example), OR
+# - carry a ``# noqa: BLE001 — <reason>`` comment on the ``except`` line
+# justifying the silence (cleanup, telemetry self-guard, optional probe).
+#
+# See reflections/03-error-surfacing.md Rule 1 for the categorization rubric
+# and BARE-EXCEPT-AUDIT-2026-05-25.md for the originating audit.
+_HIGH_RISK_FILES = {
+ "session_calc.py",
+ "freq_calc.py",
+ "tddft_calc.py",
+ "nmr_calc.py",
+ "optimizer.py",
+ "gpu_offload.py",
+ "analytics.py",
+}
+
def _grep(pattern: str) -> list[str]:
hits = []
@@ -27,3 +50,106 @@ def test_no_bare_except_pass():
assert not hits, "Bare except/pass detected (swallows all errors):\n" + "\n".join(
hits
)
+
+
+def test_no_silent_broad_except_in_high_risk_files():
+ """Fail CI when a new broad-except + pass lands in a high-risk file
+ without either a log call within 10 lines or a ``# noqa: BLE001 — <reason>``
+ annotation on the ``except`` line.
+
+ "Broad" means ``except Exception:`` (with or without ``as <name>``) or
+ truly-bare ``except:``. Narrower catches (``except ImportError:``,
+ ``except (KeyError, ValueError):``, etc.) are not flagged — the whole
+ point of narrowing is to be explicit about the failure mode.
+
+ "Silent" means the body is ``pass`` (or assignment-only without a log
+ call) within the next 10 source lines.
+
+ A line carrying ``# noqa: BLE001`` is treated as explicitly-justified
+ and skipped. The convention requires a ``— <reason>`` suffix; this
+ test does not enforce the format (too easy to game) — reviewers do.
+ """
+ except_re = re.compile(r"^\s*except\s*(Exception(\s+as\s+\w+)?)?\s*:\s*(#.*)?$")
+ log_call_re = re.compile(
+ r"\b(logger\.|_clog\.|calc_log\.log_event|log_event\(|"
+ r"_log_event|warnings\.warn)"
+ )
+
+ violations: list[str] = []
+ for path in SRC.rglob("*.py"):
+ if path.name not in _HIGH_RISK_FILES:
+ continue
+ lines = path.read_text(encoding="utf-8").splitlines()
+ for i, line in enumerate(lines):
+ m = except_re.match(line)
+ if not m:
+ continue
+ # Explicit noqa annotation = justified. Reviewers enforce
+ # that the trailing reason is present + sensible.
+ if "noqa: BLE001" in line:
+ continue
+ # Look at the body (the next 10 source lines, blank or not) for a log call.
+ # If none, the block is silent — flag it. 10 is generous enough
+ # to allow multi-line log message arguments.
+ body = lines[i + 1 : i + 11]
+ if any(log_call_re.search(b) for b in body):
+ continue
+ # Also accept if the body re-raises (still surfaces the error).
+ if any("raise" in b for b in body[:2]):
+ continue
+ violations.append(
+ f"{path.relative_to(SRC.parent)}:{i + 1}: {line.strip()}\n"
+ f" (body: {body[0].strip() if body else ''})"
+ )
+
+ assert not violations, (
+ "Silent broad-except detected in a high-risk file. Either add a "
+ "log call (logger.X / calc_log.log_event) within 10 lines of the "
+ "``except``, narrow the exception type, or annotate with\n"
+ " ``# noqa: BLE001 — ``\n"
+ "where is one of: cleanup, telemetry self-guard, optional probe.\n"
+ "See reflections/03-error-surfacing.md Rule 1.\n\n" + "\n".join(violations)
+ )
+
+
+def test_silent_broad_except_guard_actually_catches_violations(tmp_path):
+ """Meta-guard: confirm the lint check above isn't trivially passing.
+
+ Builds a temporary high-risk-looking source file containing a known-bad
+ silent broad-except + pass and verifies the regex / logic flags it.
+ Without this test, an accidental regex break would silently accept
+ everything and we wouldn't notice.
+ """
+ bad_source = (
+ "def foo():\n"
+ " try:\n"
+ " risky()\n"
+ " except Exception:\n"
+ " pass\n"
+ )
+ # Re-implement the matcher inline as a hand-kept copy of the production logic.
+ # NOTE: production edits are NOT picked up here — update this copy alongside them.
+ except_re = re.compile(r"^\s*except\s*(Exception(\s+as\s+\w+)?)?\s*:\s*(#.*)?$")
+ log_call_re = re.compile(
+ r"\b(logger\.|_clog\.|calc_log\.log_event|log_event\(|"
+ r"_log_event|warnings\.warn)"
+ )
+
+ lines = bad_source.splitlines()
+ flagged = False
+ for i, line in enumerate(lines):
+ if not except_re.match(line):
+ continue
+ if "noqa: BLE001" in line:
+ continue
+ body = lines[i + 1 : i + 11]
+ if any(log_call_re.search(b) for b in body):
+ continue
+ if any("raise" in b for b in body[:2]):
+ continue
+ flagged = True
+ assert flagged, (
+ "The lint guard didn't flag a known-bad ``except Exception: pass`` "
+ "block. The regex or window logic has regressed — fix it before "
+ "trusting test_no_silent_broad_except_in_high_risk_files."
+ )
From be7dd860fbc3754a156f83f155004612069d8091 Mon Sep 17 00:00:00 2001
From: NCCU-Schultz-Lab
Date: Mon, 25 May 2026 13:07:26 -0400
Subject: [PATCH 23/33] Add 4-tier calibration and subprocess worker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduce a four-tier calibration flow and make calibration robust and observable. UI: replace short/long toggle with a 4-option tier selector and update panel copy/styles. Benchmarks: add tier3/tier4 suites (geometry optimizations, frequency, MP2/CCSD anchors), keep tier1/tier2 aliases, and provide _normalize_entry and mode→suite mapping. Runflow: wrap calibration in activity begin/end, add per-tier timeout map, show live per-step status lines, and predict GPU usage for estimates. Calibration runner: run each step in a subprocess worker that appends to a per-run log (tail-polled for live updates), allow immediate termination on Stop, persist calibration.json after every step, and record calc_type in results. Calc_log: add IQR outlier filtering, coefficient-of-variation confidence labeling, GPU-aware candidate partitioning with graceful fallback, and use filtered pools when computing medians. Misc: small config tweaks for XC aliasing / D3 handling and add new tests for calibration/estimation behavior.
---
quantui/app_builders.py | 37 +-
quantui/app_runflow.py | 106 ++-
quantui/benchmarks.py | 850 ++++++++++++++++++++---
quantui/calc_log.py | 153 +++-
quantui/config.py | 24 +-
quantui/freq_calc.py | 8 +-
quantui/nmr_calc.py | 10 +-
quantui/optimizer.py | 8 +-
quantui/session_calc.py | 92 ++-
quantui/tddft_calc.py | 7 +-
tests/test_est_calibration_resilience.py | 270 +++++++
tests/test_est_calibration_tiers.py | 185 +++++
tests/test_est_estimator.py | 316 +++++++++
tests/test_xc_resolution.py | 247 +++++++
14 files changed, 2165 insertions(+), 148 deletions(-)
create mode 100644 tests/test_est_calibration_resilience.py
create mode 100644 tests/test_est_calibration_tiers.py
create mode 100644 tests/test_est_estimator.py
create mode 100644 tests/test_xc_resolution.py
diff --git a/quantui/app_builders.py b/quantui/app_builders.py
index b85d858..f66ef38 100644
--- a/quantui/app_builders.py
+++ b/quantui/app_builders.py
@@ -231,12 +231,21 @@ def build_history_section(
tooltip="Open the full PySCF output log in the Output tab",
)
+ # M-EST / EST.4: 4-tier calibration selector. ToggleButtons works for
+ # 4 options; switch to a Dropdown if a 5th tier is ever added. Tier 3
+ # / tier 4 require PySCF (the geom-opt + freq dispatch); tier 1 / 2
+ # are SP-only and gated separately by the run button.
app._cal_mode_toggle = widgets.ToggleButtons(
- options=[("Quick (~10 s)", "short"), ("Full (~5 min)", "long")],
- value="short",
+ options=[
+ ("Tier 1 — Quick (~15 s)", "tier1"),
+ ("Tier 2 — Standard (~3–5 min)", "tier2"),
+ ("Tier 3 — Mixed (~10–15 min)", "tier3"),
+ ("Tier 4 — Deep (~30 min)", "tier4"),
+ ],
+ value="tier1",
description="",
button_style="",
- style={"description_width": "0px", "button_width": "140px"},
+ style={"description_width": "0px", "button_width": "200px"},
layout=layout_fn(margin="0 0 8px"),
)
app._cal_run_btn = widgets.Button(
@@ -339,15 +348,31 @@ def build_history_section(
if cal_last
else ""
)
+ # M-EST / EST.4: import tier sizes lazily so we can refer to all four
+ # in the panel blurb. ``benchmark_suite`` / ``benchmark_suite_long``
+ # are kept as positional args for back-compat but new code prefers
+ # the four named tiers.
+ from quantui.benchmarks import (
+ BENCHMARK_SUITE_TIER3 as _T3,
+ )
+ from quantui.benchmarks import (
+ BENCHMARK_SUITE_TIER4 as _T4,
+ )
+
cal_panel = widgets.VBox(
[
widgets.HTML(
f''
f"Benchmark this machine so the time estimator uses basis-function "
f"scaling (Nβ) rather than generic defaults. "
- f"Quick runs {len(benchmark_suite)} small calculations (~10 s). "
- f"Full runs {len(benchmark_suite_long)} calculations spanning "
- f"all common molecule sizes and methods (~5 min).
" + cal_note
+ f"Tier 1 ({len(benchmark_suite)} calcs, ~15 s) is a quick "
+ f"SP-only smoke test; tier 2 ({len(benchmark_suite_long)} calcs, "
+ f"~3–5 min) expands the SP grid; "
+ f"tier 3 ({len(_T3)} calcs, ~10–15 min) adds small geometry "
+ f"optimizations + frequency calcs; "
+ f"tier 4 ({len(_T4)} calcs, up to ~30 min) anchors every "
+ f"calc-type × device combo for the most accurate predictions.
"
+ + cal_note
),
app._cal_mode_toggle,
widgets.HBox(
diff --git a/quantui/app_runflow.py b/quantui/app_runflow.py
index 6938c4e..86fce22 100644
--- a/quantui/app_runflow.py
+++ b/quantui/app_runflow.py
@@ -4,7 +4,7 @@
import threading
import time
-from typing import Any
+from typing import Any, Optional
import ipywidgets as widgets
from IPython.display import HTML, Javascript, display
@@ -670,30 +670,90 @@ def on_cal_stop(app: Any, btn: Any) -> None:
def do_calibration(app: Any, *, pyscf_available: bool) -> None:
- """Run calibration suite and render calibration summary table."""
+ """Run calibration suite and render calibration summary table.
+
+ Fixes shipped 2026-05-25 (session 55 user report — tier 4 stuck the
+ user with no progress signal):
+
+ - Wraps the whole run in ``_activity_begin/_end`` so the toolbar
+ activity badge stops reading "Idle" while calibration is busy.
+ - Per-step ``progress_cb`` now writes a multi-line status block
+ (live tail of the per-step PySCF / SCF log) so the user can see
+ where a slow step is rather than guess whether it froze.
+ """
from quantui.benchmarks import run_calibration
mode = app._cal_mode_toggle.value
+ # Per-tier timeout budget. Tier 3 + tier 4 have freq/geo-opt anchors
+ # that run for minutes (900 s / 1800 s); SP-only tier 1 / tier 2 get 120 s / 300 s.
+ _timeout_map = {
+ "tier1": 120.0,
+ "short": 120.0,
+ "tier2": 300.0,
+ "long": 300.0,
+ "tier3": 900.0,
+ "tier4": 1800.0,
+ }
+ timeout_per_step = _timeout_map.get(mode, 120.0)
+
+ # M-EST follow-up (2026-05-25): keep the toolbar activity badge red
+ # for the duration of the calibration so the user knows the kernel
+ # is busy. Without this it reads "Idle" while the worker thread
+ # burns CPU for tier 3/4 (~10-30 min).
+ app._activity_begin(f"Calibrating ({mode})…", kind="compute")
def _progress(
- step_n: int, total: int, label: str, status: str, elapsed: float
+ step_n: int,
+ total: int,
+ label: str,
+ status: str,
+ elapsed: float,
+ *,
+ live_message: Optional[str] = None,
) -> None:
- icon = {"ok": "✓", "timed_out": "⏱", "stopped": "⛔", "error": "✗"}.get(
- status, "?"
+ """Per-step progress callback.
+
+ Two call modes:
+ - Step-finish: status is one of ok/timed_out/stopped/error;
+ ``live_message`` is None. Updates the progress bar.
+ - Live-tick: status is "running"; ``live_message`` carries the
+ latest ``[QuantUI_STATUS]`` marker from inside the step (set
+ by freq_calc / optimizer during long inner loops). Updates
+ the step label only.
+ """
+ icon = {
+ "ok": "✓",
+ "timed_out": "⏱",
+ "stopped": "⛔",
+ "error": "✗",
+ "running": "▶",
+ }.get(status, "?")
+ if status != "running":
+ app._cal_progress.value = step_n
+ # Multi-line block: top line = step + status; second line = the
+ # most recent live message (if any). Keeps the user oriented
+ # during the slow tier-4 freq anchors.
+ live_line = (
+ f'
{live_message}'
+ if live_message
+ else ""
)
- app._cal_progress.value = step_n
app._cal_step_label.value = (
f''
f"Step {step_n} / {total} — {label} "
f"[{icon} {elapsed:.1f} s]"
+ f"{live_line}"
)
- result = run_calibration(
- progress_cb=_progress,
- stop_event=app._cal_stop_event,
- timeout_per_step=300.0 if mode == "long" else 120.0,
- mode=mode,
- )
+ try:
+ result = run_calibration(
+ progress_cb=_progress,
+ stop_event=app._cal_stop_event,
+ timeout_per_step=timeout_per_step,
+ mode=mode,
+ )
+ finally:
+ app._activity_end(kind="compute")
rows = "".join(
f""
@@ -789,6 +849,27 @@ def update_estimate(app: Any, *, calc_log_mod: Any, change: Any = None) -> None:
n_basis = calc_log_mod.count_basis_functions(
app._molecule.atoms, app.basis_dd.value
)
+ # M-EST / EST.1: predict the device the upcoming run will use so
+ # the estimator can partition history by GPU vs CPU. The method
+ # also matters — gpu4pyscf doesn't support CCSD(T), so even on a
+ # GPU machine that calc will run CPU-side.
+ _predicted_gpu_used: Optional[bool] = None
+ try:
+ from quantui.gpu_offload import (
+ _GPU_UNSUPPORTED_METHODS as _GPU_NO,
+ )
+ from quantui.gpu_offload import (
+ is_gpu_available,
+ )
+
+ _gpu_avail, _ = is_gpu_available()
+ if _gpu_avail and app.method_dd.value.upper() not in _GPU_NO:
+ _predicted_gpu_used = True
+ else:
+ _predicted_gpu_used = False
+ except Exception: # noqa: BLE001 — fall back to device-agnostic prediction
+ _predicted_gpu_used = None
+
est = calc_log_mod.estimate_time(
n_atoms=len(app._molecule.atoms),
n_electrons=app._molecule.get_electron_count(),
@@ -796,6 +877,7 @@ def update_estimate(app: Any, *, calc_log_mod: Any, change: Any = None) -> None:
basis=app.basis_dd.value,
n_basis=n_basis,
calc_type=calc_type,
+ gpu_used=_predicted_gpu_used,
)
app.perf_estimate_html.value = calc_log_mod.format_estimate(est)
except Exception:
diff --git a/quantui/benchmarks.py b/quantui/benchmarks.py
index c4ab8f3..e84d3a9 100644
--- a/quantui/benchmarks.py
+++ b/quantui/benchmarks.py
@@ -7,6 +7,35 @@
:func:`~quantui.calc_log.estimate_time` immediately becomes useful on a
fresh install.
+Four tiers (M-EST / EST.4, 2026-05-25)
+--------------------------------------
+
+The calibration suite is now a **four-tier cascade** rather than the
+original short/long pair. Users pick the depth that matches their setup-
+time tolerance:
+
+- **Tier 1 — Quick** (~15 s): SP only, smoke-test PySCF + bootstrap
+ predictor. Same molecules as the historical "short" suite.
+- **Tier 2 — Standard** (~3–5 min): SP only, expanded method × basis
+ grid so the predictor has multiple anchors per `(method, basis)` tuple.
+- **Tier 3 — Mixed** (~10–15 min): tier 2 + 2–3 small geometry
+ optimizations + 1–2 small frequency calcs. First reliable GeoOpt +
+ Freq predictions.
+- **Tier 4 — Deep** (up to 30 min): tier 3 + medium GeoOpt + medium
+ Freq (ethanol, benzene) + MP2 / CCSD anchors. Lets the estimator
+ predict every calc-type × device combo within ±25%.
+
+Back-compat: the legacy ``mode="short"`` / ``mode="long"`` strings still
+work as aliases for tier 1 / tier 2 respectively. New code should use
+``mode="tier1"`` … ``mode="tier4"``.
+
+Entry format
+------------
+
+Each tier is a list of 7-tuples (single-point calcs) or 8-tuples (when
+the 8th element overrides the calc-type, e.g. ``"geometry_opt"`` /
+``"frequency"``). ``_normalize_entry()`` unpacks either shape.
+
Typical usage (from the UI)::
import threading
@@ -17,6 +46,7 @@
progress_cb=lambda *a: print(a),
stop_event=stop,
timeout_per_step=120,
+ mode="tier3", # or "tier1"/"tier2"/"tier4"
)
"""
@@ -290,8 +320,326 @@
"RHF",
"STO-3G",
),
+ # ── M-EST / EST.4 expansion (2026-05-25) ──────────────────────────────
+ # Additional SP entries that broaden the method × basis grid coverage,
+ # extending tier 2's expected wall-clock to the 3-5 min target.
+ (
+ "H₂O B3LYP/6-31G*",
+ ["O", "H", "H"],
+ [[0.0, 0.0, 0.0], [0.757, 0.587, 0.0], [-0.757, 0.587, 0.0]],
+ 0,
+ 1,
+ "B3LYP",
+ "6-31G*",
+ ),
+ (
+ "H₂O wB97X-D/6-31G*",
+ ["O", "H", "H"],
+ [[0.0, 0.0, 0.0], [0.757, 0.587, 0.0], [-0.757, 0.587, 0.0]],
+ 0,
+ 1,
+ "wB97X-D",
+ "6-31G*",
+ ),
+ (
+ "CH₄ B3LYP/6-31G*",
+ ["C", "H", "H", "H", "H"],
+ [
+ [0.0, 0.0, 0.0],
+ [0.629, 0.629, 0.629],
+ [-0.629, -0.629, 0.629],
+ [-0.629, 0.629, -0.629],
+ [0.629, -0.629, -0.629],
+ ],
+ 0,
+ 1,
+ "B3LYP",
+ "6-31G*",
+ ),
+ (
+ "NH₃ RHF/cc-pVDZ",
+ ["N", "H", "H", "H"],
+ [
+ [0.000, 0.000, 0.111],
+ [0.000, 0.940, -0.260],
+ [0.814, -0.470, -0.260],
+ [-0.814, -0.470, -0.260],
+ ],
+ 0,
+ 1,
+ "RHF",
+ "cc-pVDZ",
+ ),
+ (
+ "NH₃ B3LYP/cc-pVDZ",
+ ["N", "H", "H", "H"],
+ [
+ [0.000, 0.000, 0.111],
+ [0.000, 0.940, -0.260],
+ [0.814, -0.470, -0.260],
+ [-0.814, -0.470, -0.260],
+ ],
+ 0,
+ 1,
+ "B3LYP",
+ "cc-pVDZ",
+ ),
+ (
+ "H₂CO (formaldehyde) B3LYP/6-31G*",
+ ["C", "O", "H", "H"],
+ [
+ [0.000, 0.000, 0.000],
+ [0.000, 0.000, 1.207],
+ [0.000, 0.943, -0.589],
+ [0.000, -0.943, -0.589],
+ ],
+ 0,
+ 1,
+ "B3LYP",
+ "6-31G*",
+ ),
+]
+
+
+# ---------------------------------------------------------------------------
+# Tier 3 — Mixed (~10-15 min): tier 2 + small GeoOpts + small Freqs
+# ---------------------------------------------------------------------------
+#
+# 8-tuple entries override the default ``"single_point"`` calc-type. The 8th
+# element is one of ``"geometry_opt"`` / ``"frequency"``.
+#
+# Small geometry opts (3-5 atoms) and the cheapest realistic frequency calc
+# (H₂O / B3LYP / STO-3G) anchor the multi-calc-type predictions without
+# blowing the time budget.
+
+BENCHMARK_SUITE_TIER3: list[tuple] = [
+ *BENCHMARK_SUITE_LONG,
+ # ── Small GeoOpts ─────────────────────────────────────────────────────
+ (
+ "H₂O B3LYP/STO-3G [GeoOpt]",
+ ["O", "H", "H"],
+ [[0.0, 0.0, 0.0], [0.757, 0.587, 0.0], [-0.757, 0.587, 0.0]],
+ 0,
+ 1,
+ "B3LYP",
+ "STO-3G",
+ "geometry_opt",
+ ),
+ (
+ "H₂CO B3LYP/6-31G* [GeoOpt]",
+ ["C", "O", "H", "H"],
+ [
+ [0.000, 0.000, 0.000],
+ [0.000, 0.000, 1.207],
+ [0.000, 0.943, -0.589],
+ [0.000, -0.943, -0.589],
+ ],
+ 0,
+ 1,
+ "B3LYP",
+ "6-31G*",
+ "geometry_opt",
+ ),
+ (
+ "CH₄ B3LYP/6-31G* [GeoOpt]",
+ ["C", "H", "H", "H", "H"],
+ [
+ [0.0, 0.0, 0.0],
+ [0.629, 0.629, 0.629],
+ [-0.629, -0.629, 0.629],
+ [-0.629, 0.629, -0.629],
+ [0.629, -0.629, -0.629],
+ ],
+ 0,
+ 1,
+ "B3LYP",
+ "6-31G*",
+ "geometry_opt",
+ ),
+ # ── Small Freqs (cheapest realistic anchors for the 6N inner-SCF model) ──
+ (
+ "H₂O B3LYP/STO-3G [Freq]",
+ ["O", "H", "H"],
+ [[0.0, 0.0, 0.0], [0.757, 0.587, 0.0], [-0.757, 0.587, 0.0]],
+ 0,
+ 1,
+ "B3LYP",
+ "STO-3G",
+ "frequency",
+ ),
+ (
+ "H₂CO B3LYP/6-31G* [Freq]",
+ ["C", "O", "H", "H"],
+ [
+ [0.000, 0.000, 0.000],
+ [0.000, 0.000, 1.207],
+ [0.000, 0.943, -0.589],
+ [0.000, -0.943, -0.589],
+ ],
+ 0,
+ 1,
+ "B3LYP",
+ "6-31G*",
+ "frequency",
+ ),
]
+
+# ---------------------------------------------------------------------------
+# Tier 4 — Deep (up to 30 min): tier 3 + medium GeoOpt + medium Freq + MP2/CCSD
+# ---------------------------------------------------------------------------
+#
+# Medium-size geometry opt + medium-size frequency anchors the predictor
+# across realistic molecule sizes. MP2 + CCSD entries on H₂O / cc-pVDZ
+# anchor the β=5.0 (MP2) and β=6.0 (CCSD) scaling exponents in
+# ``calc_log._METHOD_SCALE_EXP``. The benzene frequency is the workhorse
+# parallel-IR test — 12 atoms × 6 = 72 inner SCFs.
+
+BENCHMARK_SUITE_TIER4: list[tuple] = [
+ *BENCHMARK_SUITE_TIER3,
+ # ── Medium GeoOpt ─────────────────────────────────────────────────────
+ (
+ "C₂H₆O (ethanol) B3LYP/6-31G* [GeoOpt]",
+ ["C", "C", "O", "H", "H", "H", "H", "H", "H"],
+ [
+ [-1.232, 0.026, 0.000],
+ [0.281, 0.026, 0.000],
+ [0.829, 1.310, 0.000],
+ [-1.566, 1.059, 0.000],
+ [-1.609, -0.506, 0.880],
+ [-1.609, -0.506, -0.880],
+ [0.668, -0.497, 0.890],
+ [0.668, -0.497, -0.890],
+ [1.802, 1.311, 0.000],
+ ],
+ 0,
+ 1,
+ "B3LYP",
+ "6-31G*",
+ "geometry_opt",
+ ),
+ # ── Medium Freq ───────────────────────────────────────────────────────
+ (
+ "C₂H₆O (ethanol) B3LYP/6-31G* [Freq]",
+ ["C", "C", "O", "H", "H", "H", "H", "H", "H"],
+ [
+ [-1.232, 0.026, 0.000],
+ [0.281, 0.026, 0.000],
+ [0.829, 1.310, 0.000],
+ [-1.566, 1.059, 0.000],
+ [-1.609, -0.506, 0.880],
+ [-1.609, -0.506, -0.880],
+ [0.668, -0.497, 0.890],
+ [0.668, -0.497, -0.890],
+ [1.802, 1.311, 0.000],
+ ],
+ 0,
+ 1,
+ "B3LYP",
+ "6-31G*",
+ "frequency",
+ ),
+ (
+ "C₆H₆ (benzene) B3LYP/6-31G* [Freq]",
+ ["C", "C", "C", "C", "C", "C", "H", "H", "H", "H", "H", "H"],
+ [
+ [1.395, 0.000, 0.000],
+ [0.698, 1.209, 0.000],
+ [-0.698, 1.209, 0.000],
+ [-1.395, 0.000, 0.000],
+ [-0.698, -1.209, 0.000],
+ [0.698, -1.209, 0.000],
+ [2.479, 0.000, 0.000],
+ [1.240, 2.147, 0.000],
+ [-1.240, 2.147, 0.000],
+ [-2.479, 0.000, 0.000],
+ [-1.240, -2.147, 0.000],
+ [1.240, -2.147, 0.000],
+ ],
+ 0,
+ 1,
+ "B3LYP",
+ "6-31G*",
+ "frequency",
+ ),
+ # ── Post-HF anchors ───────────────────────────────────────────────────
+ (
+ "H₂O MP2/cc-pVDZ",
+ ["O", "H", "H"],
+ [[0.0, 0.0, 0.0], [0.757, 0.587, 0.0], [-0.757, 0.587, 0.0]],
+ 0,
+ 1,
+ "MP2",
+ "cc-pVDZ",
+ ),
+ (
+ "H₂O CCSD/cc-pVDZ",
+ ["O", "H", "H"],
+ [[0.0, 0.0, 0.0], [0.757, 0.587, 0.0], [-0.757, 0.587, 0.0]],
+ 0,
+ 1,
+ "CCSD",
+ "cc-pVDZ",
+ ),
+]
+
+
+# Aliases — keep BENCHMARK_SUITE / BENCHMARK_SUITE_LONG for back-compat
+# (existing tests + app.py imports). New code should reference the
+# tier-named constants for clarity.
+BENCHMARK_SUITE_TIER1: list[tuple] = BENCHMARK_SUITE
+BENCHMARK_SUITE_TIER2: list[tuple] = BENCHMARK_SUITE_LONG
+
+
+# ---------------------------------------------------------------------------
+# Mode-string → suite mapping
+# ---------------------------------------------------------------------------
+#
+# ``run_calibration(mode=...)`` accepts any of these strings. The legacy
+# ``"short"`` / ``"long"`` aliases are kept so older callers (including
+# pinned UI state) keep working.
+
+_MODE_TO_SUITE: dict = {
+ "tier1": BENCHMARK_SUITE_TIER1,
+ "tier2": BENCHMARK_SUITE_TIER2,
+ "tier3": BENCHMARK_SUITE_TIER3,
+ "tier4": BENCHMARK_SUITE_TIER4,
+ "short": BENCHMARK_SUITE_TIER1,
+ "long": BENCHMARK_SUITE_TIER2,
+}
+
+
+def _normalize_entry(entry: tuple) -> dict:
+ """Unpack a 7-tuple or 8-tuple benchmark entry into a uniform dict.
+
+ 7-tuple: ``(label, atoms, coords, charge, mult, method, basis)`` —
+ defaults ``calc_type`` to ``"single_point"``.
+
+ 8-tuple: ``(label, atoms, coords, charge, mult, method, basis, calc_type)``
+ — used by tier 3 + tier 4 entries that need ``"geometry_opt"`` or
+ ``"frequency"`` dispatch.
+ """
+ if len(entry) == 7:
+ label, atoms, coords, charge, mult, method, basis = entry
+ calc_type = "single_point"
+ elif len(entry) == 8:
+ label, atoms, coords, charge, mult, method, basis, calc_type = entry
+ else:
+ raise ValueError(
+ f"Benchmark entry must have 7 or 8 fields, got {len(entry)}: {entry!r}"
+ )
+ return {
+ "label": label,
+ "atoms": atoms,
+ "coords": coords,
+ "charge": charge,
+ "multiplicity": mult,
+ "method": method,
+ "basis": basis,
+ "calc_type": calc_type,
+ }
+
+
# ---------------------------------------------------------------------------
# Result dataclass
# ---------------------------------------------------------------------------
@@ -315,6 +663,9 @@ class BenchmarkStep:
elapsed_s: float = 0.0
error_msg: str = ""
n_basis: Optional[int] = None
+ # M-EST / EST.4: track which calc-type this step ran so tier 3+4
+ # entries can be distinguished in summaries.
+ calc_type: str = "single_point"
@dataclass
@@ -324,7 +675,7 @@ class CalibrationResult:
timestamp: str
steps: List[BenchmarkStep] = field(default_factory=list)
stopped_early: bool = False
- mode: str = "short"
+ mode: str = "tier1"
@property
def n_completed(self) -> int:
@@ -332,7 +683,7 @@ def n_completed(self) -> int:
@property
def n_total(self) -> int:
- return len(BENCHMARK_SUITE if self.mode == "short" else BENCHMARK_SUITE_LONG)
+ return len(_MODE_TO_SUITE.get(self.mode, BENCHMARK_SUITE_TIER1))
# ---------------------------------------------------------------------------
@@ -368,32 +719,270 @@ def _count_electrons(atoms: list[str], charge: int) -> int:
return sum(_Z.get(a, 6) for a in atoms) - charge
+# ---------------------------------------------------------------------------
+# Subprocess worker (M-EST follow-up, 2026-05-25)
+# ---------------------------------------------------------------------------
+#
+# Originally calibration ran each step in a ThreadPoolExecutor with a
+# ``future.result(timeout=...)`` block. That had three blockers exposed by
+# the user's tier-4 attempt (session 55):
+#
+# 1. The Stop button only checked between steps, so an in-flight 5-minute
+# freq calc could not be killed mid-run.
+# 2. There was no per-step progress signal beyond a single "running"
+# label — the user couldn't tell whether a slow step had frozen the
+# kernel.
+# 3. ``calibration.json`` was only flushed at the END of the loop, so
+# stopping at step 25/30 lost the partial-state marker.
+#
+# The fix runs each step in a child process via ``multiprocessing.Process``
+# so ``worker.terminate()`` works reliably cross-platform. The worker pipes
+# PySCF's progress stream to a calibration log file the main process tails
+# every 500 ms for the live status display, and ``calibration.json`` is
+# rewritten after each completed step.
+
+
+def _calibration_worker(
+    atoms: list,
+    coords: list,
+    charge: int,
+    mult: int,
+    method: str,
+    basis: str,
+    calc_type: str,
+    log_path_str: str,
+    result_queue,
+) -> None:
+    """Run one calibration step in a child process.
+
+    Top-level function taking only primitives plus a queue, so it can be
+    used as a ``multiprocessing.Process`` target. Progress output is
+    appended to ``log_path_str`` so the parent can tail it.
+
+    Args:
+        atoms, coords, charge, mult: Passed to ``quantui.molecule.Molecule``.
+        method, basis: Level of theory forwarded to the calculation.
+        calc_type: ``"geometry_opt"`` or ``"frequency"``; any other value
+            runs a single-point calculation.
+        log_path_str: Per-run log file, opened in append mode. If it
+            cannot be opened the step still runs and progress output is
+            discarded — the parent treats a missing log as non-fatal, so
+            the worker must not turn it into a step failure.
+        result_queue: Receives one dict when the step finishes.
+
+    On success the dict carries ``status="ok"``, ``formula``,
+    ``converged``, ``n_iterations`` and ``elapsed_s``. On any exception
+    it carries ``status="error"``, ``error_msg`` (truncated to 500
+    characters) and ``elapsed_s``. The parent treats absence of a queue
+    entry (after worker exit) as a crashed worker — distinct from a
+    step-level error.
+    """
+    import os as _os
+    import time as _t
+    from datetime import datetime as _dt
+    from datetime import timezone as _tz
+    from pathlib import Path as _P
+
+    log_path = _P(log_path_str)
+    t0 = _t.perf_counter()
+    label = f"{method}/{basis} ({calc_type})"
+
+    try:
+        try:
+            # Line-buffered append so the parent's tail sees output as it
+            # arrives. ``buffering=1`` requires text mode (which we use).
+            log_fh = open(log_path, "a", encoding="utf-8", buffering=1)
+        except OSError:
+            # Log directory missing or unwritable: keep calibrating and
+            # just drop the progress trail.
+            log_fh = open(_os.devnull, "w", encoding="utf-8")
+
+        with log_fh:
+            # Timezone-aware replacement for the deprecated utcnow().
+            log_fh.write(
+                f"\n========= {_dt.now(_tz.utc).isoformat()} :: {label} =========\n"
+            )
+
+            from quantui.molecule import Molecule as _Molecule
+
+            mol = _Molecule(atoms, coords, charge=charge, multiplicity=mult)
+
+            if calc_type == "geometry_opt":
+                from quantui.optimizer import optimize_geometry as _opt
+
+                res = _opt(
+                    molecule=mol,
+                    method=method,
+                    basis=basis,
+                    progress_stream=log_fh,
+                )
+                formula = res.molecule.get_formula()
+                converged = bool(res.converged)
+                # -1 marks "step count not reported by the optimizer result".
+                n_iterations = int(getattr(res, "n_steps", -1))
+            elif calc_type == "frequency":
+                from quantui.freq_calc import run_freq_calc as _freq
+
+                res = _freq(
+                    molecule=mol,
+                    method=method,
+                    basis=basis,
+                    progress_stream=log_fh,
+                )
+                formula = res.formula
+                converged = bool(res.converged)
+                n_iterations = int(res.n_iterations)
+            else:  # single_point
+                from quantui.session_calc import run_in_session as _sp
+
+                # verbose=3 gives per-iteration SCF energies in the log —
+                # enough signal to confirm the worker hasn't frozen on a
+                # slow tier-4 entry. (Was verbose=0 pre-session-55.)
+                res = _sp(
+                    mol,
+                    method=method,
+                    basis=basis,
+                    verbose=3,
+                    progress_stream=log_fh,
+                )
+                formula = res.formula
+                converged = bool(res.converged)
+                n_iterations = int(res.n_iterations)
+
+            elapsed = _t.perf_counter() - t0
+            log_fh.write(f"\n[QuantUI_STATUS] COMPLETED in {elapsed:.2f} s\n")
+
+        result_queue.put(
+            {
+                "status": "ok",
+                "formula": formula,
+                "converged": converged,
+                "n_iterations": n_iterations,
+                "elapsed_s": elapsed,
+            }
+        )
+    except Exception as exc:
+        # Step-level failure: report it instead of dying silently, so the
+        # parent can tell an error apart from a crashed worker.
+        result_queue.put(
+            {
+                "status": "error",
+                "error_msg": str(exc)[:500],
+                "elapsed_s": _t.perf_counter() - t0,
+            }
+        )
+
+
+def _tail_last_status_line(log_path) -> str:
+    """Return the last meaningful progress line of the in-flight step.
+
+    The calibration log is shared by every step of a run; each step
+    starts with a ``========= <timestamp> :: <label> =========`` header.
+    The scan walks backwards from the end of the file and stops at that
+    header, so a ``[QuantUI_STATUS]`` marker left behind by an earlier
+    step (for example its ``COMPLETED`` line) is never reported as the
+    status of the step currently running.
+
+    Within the current step a ``[QuantUI_STATUS] ...`` line is preferred;
+    otherwise the last non-blank line is used, and if the step has
+    written nothing but its header the header itself is returned.
+
+    Returns:
+        The chosen line, stripped and truncated to its last 120
+        characters so the UI widget renders cleanly; ``""`` if the file
+        cannot be read or holds no non-blank line (best-effort).
+    """
+    try:
+        with open(log_path, encoding="utf-8", errors="replace") as fh:
+            lines = fh.readlines()
+    except OSError:
+        return ""
+    # Walk backwards looking for the best candidate.
+    status_line = ""
+    fallback_line = ""
+    for line in reversed(lines):
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if "[QuantUI_STATUS]" in stripped:
+            status_line = stripped
+            break
+        is_step_header = (
+            stripped.startswith("=========")
+            and stripped.endswith("=========")
+            and " :: " in stripped
+        )
+        if is_step_header:
+            # Everything above this line belongs to an earlier step.
+            if not fallback_line:
+                fallback_line = stripped
+            break
+        if not fallback_line:
+            fallback_line = stripped
+    best = status_line or fallback_line
+    if len(best) > 120:
+        best = best[-120:]
+    return best
+
+
+def _calibration_log_path(timestamp: str) -> Path:
+    """Build the log file path for one calibration run.
+
+    The directory is ``$QUANTUI_LOG_DIR`` when that variable is set to a
+    non-empty value, otherwise ``~/.quantui/logs``. The run timestamp is
+    embedded in the filename (with ``:`` and ``.`` replaced by ``-``) so
+    separate runs get separate files. Nothing is created on disk here.
+    """
+    import os as _os
+
+    override = _os.environ.get("QUANTUI_LOG_DIR")
+    if override:
+        log_dir = Path(override)
+    else:
+        log_dir = Path.home() / ".quantui" / "logs"
+    # ":" and "." are swapped for "-" to keep the name filename-safe.
+    safe_ts = timestamp.translate(str.maketrans(":.", "--"))
+    return log_dir / f"calibration_{safe_ts}.log"
+
+
+def _save_calibration_json(result: CalibrationResult, log_path: Path) -> None:
+    """Persist the current ``CalibrationResult`` snapshot to disk.
+
+    Writes ``~/.quantui/calibration.json``. Called after EVERY completed
+    step (not just at end-of-run) so an interrupted run still records
+    the partial-state marker the user can see next session. The log
+    file path is included so the "last calibration" UI can link to the
+    per-run log.
+
+    The snapshot is written to a sibling temp file and then moved into
+    place with ``os.replace``, so a process killed mid-write leaves the
+    previous complete snapshot rather than a truncated JSON file.
+
+    Args:
+        result: Run state to serialise (timestamp, mode, per-step rows).
+        log_path: Per-run log file, stored as a string in the JSON.
+
+    ``OSError`` is swallowed on purpose: this file is a best-effort UI
+    summary, not the canonical record.
+    """
+    import json as _json
+    import os as _os
+
+    cal_path = Path.home() / ".quantui" / "calibration.json"
+    tmp_path = cal_path.with_name(cal_path.name + ".tmp")
+    payload = _json.dumps(
+        {
+            "timestamp": result.timestamp,
+            "mode": result.mode,
+            "stopped_early": result.stopped_early,
+            "log_path": str(log_path),
+            "n_completed": result.n_completed,
+            "n_total": result.n_total,
+            "steps": [
+                {
+                    "label": s.label,
+                    "method": s.method,
+                    "basis": s.basis,
+                    "n_atoms": s.n_atoms,
+                    "n_electrons": s.n_electrons,
+                    "n_basis": s.n_basis,
+                    "status": s.status,
+                    "elapsed_s": round(s.elapsed_s, 3),
+                    "error_msg": s.error_msg,
+                    "calc_type": s.calc_type,
+                }
+                for s in result.steps
+            ],
+        },
+        indent=2,
+        ensure_ascii=False,
+    )
+    try:
+        cal_path.parent.mkdir(parents=True, exist_ok=True)
+        tmp_path.write_text(payload, encoding="utf-8")
+        # Atomic swap: readers see either the old or the new snapshot.
+        _os.replace(tmp_path, cal_path)
+    except OSError:
+        # Disk full / permission denied — best-effort. The perf log is
+        # the canonical record; calibration.json is just a UI summary.
+        pass
+
+
def run_calibration(
progress_cb: Optional[ProgressCallback] = None,
stop_event=None,
timeout_per_step: float = 120.0,
- mode: str = "short",
+ mode: str = "tier1",
) -> CalibrationResult:
"""Run the benchmark suite and populate ``perf_log.jsonl``.
+ Each step runs in a child process so the Stop button can terminate
+ a long-running calc mid-run. Per-step progress is piped to a log
+ file under ``~/.quantui/logs/calibration_<timestamp>.log`` and the
+ parent tails it every 500 ms to drive the live status display.
+ ``~/.quantui/calibration.json`` is rewritten after every completed
+ step, so an interrupted run still records partial state.
+
Args:
- progress_cb: Called after each step with
- ``(step_n, total, label, status, elapsed_s)``.
- stop_event: A :class:`threading.Event`; checked before each step.
- Set it to abort the suite cleanly.
- timeout_per_step: Wall-clock seconds allowed per step. Steps that
- exceed this are marked ``"timed_out"`` and skipped.
- mode: ``"short"`` (default, ~10 s) runs :data:`BENCHMARK_SUITE`;
- ``"long"`` (~3–6 min) runs :data:`BENCHMARK_SUITE_LONG`.
+ progress_cb: Called periodically with
+ ``(step_n, total, label, status, elapsed_s)`` and optionally
+ ``live_message=<latest log line>`` during slow steps. The
+ terminal call after each step uses status in
+ ``ok / timed_out / stopped / error``; intermediate "running"
+ ticks fire while the step is in-flight.
+ stop_event: A :class:`threading.Event`; checked every 500 ms.
+ When set, the in-flight worker is terminated immediately
+ and the current step is marked ``"stopped"``.
+ timeout_per_step: Wall-clock seconds allowed per step. Defaults
+ to 120 s — fine for tier 1 / tier 2 (SP only). Caller
+ should bump for tier 3 (~900 s) and tier 4 (~1800 s).
+ mode: One of ``"tier1"`` / ``"tier2"`` / ``"tier3"`` / ``"tier4"``.
+ Legacy aliases ``"short"`` / ``"long"`` map to tier1 / tier2.
+ Unknown modes fall back to tier1 with a warning.
Returns:
:class:`CalibrationResult` with per-step outcomes.
"""
- import concurrent.futures
- import json
+ import multiprocessing as _mp
+ import queue as _queue
+ import sys as _sys
from quantui import calc_log as _calc_log
- from quantui.molecule import Molecule
_pyscf_available = False
try:
@@ -403,15 +992,66 @@ def run_calibration(
except ImportError:
pass
- suite = BENCHMARK_SUITE if mode == "short" else BENCHMARK_SUITE_LONG
+ if mode not in _MODE_TO_SUITE:
+ import logging as _log
+
+ _log.getLogger(__name__).warning(
+ "run_calibration: unknown mode %r, falling back to tier1", mode
+ )
+ mode = "tier1"
+ suite = _MODE_TO_SUITE[mode]
timestamp = datetime.now(timezone.utc).isoformat()
result = CalibrationResult(timestamp=timestamp, mode=mode)
total = len(suite)
+ # Per-run calibration log file. The worker appends; the parent tails.
+ log_path = _calibration_log_path(timestamp)
+ try:
+ log_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(log_path, "w", encoding="utf-8") as fh:
+ fh.write(
+ f"QuantUI calibration log\n"
+ f"started : {timestamp}\n"
+ f"mode : {mode}\n"
+ f"suite size: {total} entries\n"
+ f"timeout/step: {timeout_per_step:.0f} s\n"
+ )
+ except OSError:
+ # No log file is non-fatal — calibration still runs, just without
+ # the per-step progress trail.
+ pass
+
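+ # ``fork`` is fast on Linux/macOS but unsupported on Windows; spawn
+ # is the portable fallback. ``forkserver`` is also available but
+ # slower than fork on Linux.
+ # NOTE(review): CPython has defaulted to ``spawn`` on macOS since 3.8
+ # because ``fork`` can crash the child there — consider limiting
+ # ``fork`` to Linux.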
+ _ctx_name = "spawn" if _sys.platform == "win32" else "fork"
+ _ctx = _mp.get_context(_ctx_name)
+
+ def _emit_progress(*args, live_message=None) -> None:
+ """Wrap progress_cb to tolerate callers that pre-date the
+ ``live_message`` kwarg (notably the test-suite lambdas that
+ accept ``*args`` only). Falls back to the old 5-arg form on
+ ``TypeError``."""
+ if progress_cb is None:
+ return
+ try:
+ progress_cb(*args, live_message=live_message)
+ except TypeError:
+ progress_cb(*args)
+
+ stopped_mid_step = False
for step_n, entry in enumerate(suite, start=1):
- label, atoms, coords, charge, mult, method, basis = entry
+ normalized = _normalize_entry(entry)
+ label = normalized["label"]
+ atoms = normalized["atoms"]
+ coords = normalized["coords"]
+ charge = normalized["charge"]
+ mult = normalized["multiplicity"]
+ method = normalized["method"]
+ basis = normalized["basis"]
+ calc_type = normalized["calc_type"]
- # --- honour stop request ---
+ # Honour stop request BEFORE starting a new step.
if stop_event is not None and stop_event.is_set():
result.stopped_early = True
break
@@ -425,98 +1065,116 @@ def run_calibration(
n_electrons=_count_electrons(atoms, charge),
status=_STATUS_ERROR,
n_basis=nb,
+ calc_type=calc_type,
)
if not _pyscf_available:
- step.status = _STATUS_ERROR
step.error_msg = "PySCF not available"
result.steps.append(step)
- if progress_cb is not None:
- progress_cb(step_n, total, label, step.status, 0.0)
+ _save_calibration_json(result, log_path)
+ _emit_progress(step_n, total, label, step.status, 0.0)
continue
- def _run_step(
- atoms=atoms,
- coords=coords,
- charge=charge,
- mult=mult,
- method=method,
- basis=basis,
- ):
- from quantui.session_calc import run_in_session
+ # Spawn the worker.
+ result_queue = _ctx.Queue()
+ worker = _ctx.Process(
+ target=_calibration_worker,
+ args=(
+ atoms,
+ coords,
+ charge,
+ mult,
+ method,
+ basis,
+ calc_type,
+ str(log_path),
+ result_queue,
+ ),
+ daemon=True,
+ )
+ t_start = time.perf_counter()
+ worker.start()
- mol = Molecule(atoms, coords, charge=charge, multiplicity=mult)
- t0 = time.perf_counter()
- res = run_in_session(mol, method=method, basis=basis, verbose=0)
- return res, time.perf_counter() - t0
+ # Poll loop — finish naturally OR hit timeout OR receive stop signal.
+ poll_interval = 0.5
+ worker_done_normally = False
+ while True:
+ worker.join(timeout=poll_interval)
+ elapsed = time.perf_counter() - t_start
- t_start = time.perf_counter()
- try:
- with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
- future = pool.submit(_run_step)
- try:
- res, elapsed = future.result(timeout=timeout_per_step)
- step.elapsed_s = elapsed
- step.status = _STATUS_OK
- # Log to perf_log.jsonl so estimate_time() can use it
- _calc_log.log_calculation(
- formula=res.formula,
- n_atoms=step.n_atoms,
- n_electrons=step.n_electrons,
- method=method,
- basis=basis,
- n_iterations=res.n_iterations,
- elapsed_s=elapsed,
- converged=res.converged,
- n_basis=step.n_basis,
- n_cores=1,
- calc_type="single_point",
- )
- except concurrent.futures.TimeoutError:
- step.status = _STATUS_TIMEOUT
- step.elapsed_s = time.perf_counter() - t_start
- except Exception as exc:
- step.status = _STATUS_ERROR
- step.error_msg = str(exc)
- step.elapsed_s = time.perf_counter() - t_start
+ if not worker.is_alive():
+ worker_done_normally = True
+ break
+
+ if elapsed > timeout_per_step:
+ worker.terminate()
+ worker.join(timeout=5)
+ step.status = _STATUS_TIMEOUT
+ step.elapsed_s = elapsed
+ step.error_msg = f"exceeded {timeout_per_step:.0f}s timeout"
+ break
+
+ if stop_event is not None and stop_event.is_set():
+ worker.terminate()
+ worker.join(timeout=5)
+ step.status = _STATUS_STOPPED
+ step.elapsed_s = elapsed
+ result.stopped_early = True
+ stopped_mid_step = True
+ break
+
+ # Live-tick: pull the latest log line for the UI.
+ live_msg = _tail_last_status_line(log_path)
+ _emit_progress(
+ step_n, total, label, "running", elapsed, live_message=live_msg
+ )
+
+ if worker_done_normally:
+ try:
+ msg = result_queue.get(timeout=2.0)
+ except _queue.Empty:
+ msg = {
+ "status": "error",
+ "error_msg": "worker exited without returning a result",
+ "elapsed_s": time.perf_counter() - t_start,
+ }
+ if msg.get("status") == "ok":
+ step.status = _STATUS_OK
+ step.elapsed_s = float(msg["elapsed_s"])
+ # Log to perf_log.jsonl so estimate_time() picks it up.
+ _calc_log.log_calculation(
+ formula=msg["formula"],
+ n_atoms=step.n_atoms,
+ n_electrons=step.n_electrons,
+ method=method,
+ basis=basis,
+ n_iterations=int(msg.get("n_iterations", -1)),
+ elapsed_s=float(msg["elapsed_s"]),
+ converged=bool(msg["converged"]),
+ n_basis=step.n_basis,
+ n_cores=1,
+ calc_type=calc_type,
+ )
+ else:
+ step.status = _STATUS_ERROR
+ step.error_msg = msg.get("error_msg", "unknown")
+ step.elapsed_s = float(
+ msg.get("elapsed_s", time.perf_counter() - t_start)
+ )
result.steps.append(step)
- if progress_cb is not None:
- progress_cb(step_n, total, label, step.status, step.elapsed_s)
+ # Fix 2: persist after EVERY step so an interrupt at step N
+ # still leaves a partial-state record on disk.
+ _save_calibration_json(result, log_path)
- # --- persist calibration summary ---
- _cal_path = Path.home() / ".quantui" / "calibration.json"
- try:
- _cal_path.parent.mkdir(parents=True, exist_ok=True)
- _cal_path.write_text(
- json.dumps(
- {
- "timestamp": result.timestamp,
- "mode": result.mode,
- "stopped_early": result.stopped_early,
- "steps": [
- {
- "label": s.label,
- "method": s.method,
- "basis": s.basis,
- "n_atoms": s.n_atoms,
- "n_electrons": s.n_electrons,
- "n_basis": s.n_basis,
- "status": s.status,
- "elapsed_s": round(s.elapsed_s, 3),
- "error_msg": s.error_msg,
- }
- for s in result.steps
- ],
- },
- indent=2,
- ensure_ascii=False,
- ),
- encoding="utf-8",
- )
- except OSError:
- pass
+ _emit_progress(step_n, total, label, step.status, step.elapsed_s)
+
+ if stopped_mid_step:
+ break
+ # Final write (idempotent — same content as the last per-step write
+ # unless the loop broke via the top-of-loop stop check).
+ _save_calibration_json(result, log_path)
return result
diff --git a/quantui/calc_log.py b/quantui/calc_log.py
index c64212d..130ef57 100644
--- a/quantui/calc_log.py
+++ b/quantui/calc_log.py
@@ -322,6 +322,85 @@ def count_basis_functions(atoms: list[str], basis: str) -> Optional[int]:
return total
+# ---------------------------------------------------------------------------
+# Statistical helpers (M-EST / EST.3, 2026-05-25)
+# ---------------------------------------------------------------------------
+
+
+def _iqr_filter(values: list[float]) -> list[float]:
+    """Drop values lying outside the Tukey fence [Q1 − 1.5·IQR, Q3 + 1.5·IQR].
+
+    Intended to remove one-off slow runs (cold caches, thermal
+    throttling) from a timing pool before a median is taken.
+
+    The input is returned as an unfiltered copy when it has fewer than
+    4 samples, or when the interquartile range is zero. Surviving
+    values keep their original order.
+    """
+    if len(values) < 4:
+        return list(values)
+    # "inclusive" anchors the quartiles at data points; with "exclusive"
+    # a single outlier in a small pool can pull Q3 up far enough to land
+    # inside its own fence, which would defeat the filter.
+    q1, _q2, q3 = statistics.quantiles(sorted(values), n=4, method="inclusive")
+    spread = q3 - q1
+    if spread == 0:
+        # Degenerate pool — nothing can be called an outlier.
+        return list(values)
+    lower_fence = q1 - 1.5 * spread
+    upper_fence = q3 + 1.5 * spread
+    return [v for v in values if lower_fence <= v <= upper_fence]
+
+
+def _coefficient_of_variation(values: list[float]) -> float:
+    """Return the sample standard deviation divided by ``|mean|``.
+
+    Yields ``0.0`` when there are fewer than two values or when the
+    mean is exactly zero.
+    """
+    if len(values) >= 2:
+        mu = statistics.mean(values)
+        if mu != 0:
+            return statistics.stdev(values) / abs(mu)
+    return 0.0
+
+
+def _confidence_label(values: list[float], n_samples: int) -> str:
+    """Grade an estimate as ``"high"``, ``"medium"`` or ``"low"`` confidence.
+
+    The grade starts from the coefficient of variation (CV) of
+    ``values``:
+
+    - CV < 0.15         → "high"
+    - 0.15 ≤ CV < 0.35  → "medium"
+    - otherwise         → "low"
+
+    and is then limited by ``n_samples``: fewer than 3 samples is always
+    "low" (the CV is not even computed), and fewer than 5 samples can be
+    at most "medium". A large but widely scattered pool therefore still
+    reports "low".
+    """
+    if n_samples < 3:
+        return "low"
+    cv = _coefficient_of_variation(values)
+    if cv < 0.15:
+        # Tight pool; only the sample-count cap can hold it back.
+        return "medium" if n_samples < 5 else "high"
+    return "medium" if cv < 0.35 else "low"
+
+
# ---------------------------------------------------------------------------
# Performance log
# ---------------------------------------------------------------------------
@@ -381,6 +460,7 @@ def estimate_time(
n_basis: Optional[int] = None,
n_cores: Optional[int] = None,
calc_type: Optional[str] = None,
+ gpu_used: Optional[bool] = None,
) -> Optional[dict]:
"""
Return a time estimate dict, or ``None`` if there is insufficient data.
@@ -417,6 +497,21 @@ def estimate_time(
(for example, Single Point). Legacy records without ``calc_type`` are
only included when estimating ``single_point``.
+ **GPU-aware filtering** (M-EST / EST.1, 2026-05-25): when ``gpu_used``
+ is passed, the candidate pool is partitioned by device — GPU-history
+ predicts GPU runs and CPU-history predicts CPU runs. Records written
+ before session 55 don't have ``gpu_used`` at all; those are treated
+ as "device unknown" and admitted only when ``gpu_used=False`` is
+ requested (the conservative assumption, since QuantUI was CPU-only
+ before M-GPU shipped). When ``gpu_used=None`` (default), the device
+ axis is ignored and all records are eligible — back-compat with
+ callers that don't know which device the upcoming run will use.
+
+ If GPU partitioning leaves fewer than 2 records in the pool, the
+ function falls back to the unpartitioned pool with the confidence
+ label downgraded one notch — better an approximate estimate from
+ cross-device data than no estimate at all.
+
Returns ``None`` when fewer than 2 converged records are available for
the scoped candidate pool.
"""
@@ -440,6 +535,32 @@ def estimate_time(
if len(scoped) < 2:
return None
+ # M-EST / EST.1: partition by device when the caller specified one.
+ # Records pre-dating session 55 don't carry ``gpu_used`` — admit them
+ # only into the CPU pool, since QuantUI was CPU-only when they were
+ # written. Track whether we downgraded for the fall-back path below.
+ _gpu_filtered = False
+ if gpu_used is True:
+ gpu_scoped = [r for r in scoped if r.get("gpu_used") is True]
+ if len(gpu_scoped) >= 2:
+ scoped = gpu_scoped
+ _gpu_filtered = True
+ # else: fall through to the unpartitioned pool; caller's
+ # confidence will be downgraded below.
+ elif gpu_used is False:
+ cpu_scoped = [
+ r for r in scoped if r.get("gpu_used") is False or "gpu_used" not in r
+ ]
+ if len(cpu_scoped) >= 2:
+ scoped = cpu_scoped
+ _gpu_filtered = True
+
+ def _maybe_downgrade(conf: str) -> str:
+ """Downgrade confidence one notch if device-partition fell back."""
+ if gpu_used is None or _gpu_filtered:
+ return conf
+ return {"high": "medium", "medium": "low", "low": "low"}[conf]
+
beta_new = _METHOD_SCALE_EXP.get(method, 3.5)
n_cores_current = n_cores if n_cores is not None else 1
@@ -465,23 +586,41 @@ def _eff(r: dict) -> Optional[float]:
]
effs = [e for r in exact_nb for e in [_eff(r)] if e is not None]
if len(effs) >= 2:
- predicted = statistics.median(effs) * (n_basis**beta_new) / n_cores_current
+ # EST.3: drop Tukey outliers before computing the predictor.
+ # The variance of the *filtered* pool drives confidence.
+ filtered_effs = _iqr_filter(effs)
+ predicted = (
+ statistics.median(filtered_effs) * (n_basis**beta_new) / n_cores_current
+ )
return {
"seconds": predicted,
- "confidence": "high" if len(effs) >= 5 else "medium",
- "n_samples": len(effs),
+ "confidence": _maybe_downgrade(
+ _confidence_label(filtered_effs, len(filtered_effs))
+ ),
+ "n_samples": len(filtered_effs),
}
# ── Strategy 2: exact method + basis, electron-count fallback ────────────
exact = [r for r in scoped if r.get("method") == method and r.get("basis") == basis]
if len(exact) >= 2:
- median_ne = statistics.median(r["n_electrons"] for r in exact)
- median_t = statistics.median(r["elapsed_s"] for r in exact)
+ elapsed_values = [float(r["elapsed_s"]) for r in exact]
+ filtered_elapsed = _iqr_filter(elapsed_values)
+ # Recompute electron-count median against the same filtered pool
+ # so the scale factor is consistent with the time median.
+ filtered_records = [
+ r for r in exact if float(r["elapsed_s"]) in filtered_elapsed
+ ]
+ median_ne = statistics.median(
+ r["n_electrons"] for r in (filtered_records or exact)
+ )
+ median_t = statistics.median(filtered_elapsed)
scale = (n_electrons / median_ne) ** 2.7 if median_ne > 0 else 1.0
return {
"seconds": median_t * scale,
- "confidence": "high" if len(exact) >= 5 else "medium",
- "n_samples": len(exact),
+ "confidence": _maybe_downgrade(
+ _confidence_label(filtered_elapsed, len(filtered_elapsed))
+ ),
+ "n_samples": len(filtered_elapsed),
}
# ── Strategy 3: same basis, any method, basis-function efficiency ─────────
diff --git a/quantui/config.py b/quantui/config.py
index 9ab61f0..784cfa6 100644
--- a/quantui/config.py
+++ b/quantui/config.py
@@ -631,6 +631,19 @@ def main():
try:
method = '{method}'
+ # Display name → PySCF xc string + external D3 dispersion. Matches
+ # quantui/session_calc.py resolve_xc + maybe_apply_d3. Important
+ # for methods that PySCF doesn't accept directly (notably
+ # wB97X-D — on dftd3's black-list; PBE-D3 — D3 must be applied
+ # externally via pyscf.dftd3).
+ _XC_ALIAS = {{
+ 'M06-L': 'm06l',
+ 'wB97X-D': 'wb97x',
+ 'CAM-B3LYP': 'camb3lyp',
+ 'PBE-D3': 'pbe',
+ }}
+ _NEEDS_D3 = {{'PBE-D3', 'wB97X-D'}}
+
if method == 'RHF':
mf = scf.RHF(mol)
elif method == 'UHF':
@@ -638,7 +651,16 @@ def main():
else:
# DFT: auto-select RKS/UKS based on spin
mf = dft.RKS(mol) if mol.spin == 0 else dft.UKS(mol)
- mf.xc = method
+ mf.xc = _XC_ALIAS.get(method, method)
+ if method in _NEEDS_D3:
+ try:
+ from pyscf import dftd3 as _dftd3
+ mf = _dftd3.dftd3(mf)
+ except ImportError:
+ print(
+ "WARNING: pyscf.dftd3 not available; "
+ "running {method} without D3 dispersion."
+ )
energy = mf.kernel()
diff --git a/quantui/freq_calc.py b/quantui/freq_calc.py
index 4627fcd..fe66bb8 100644
--- a/quantui/freq_calc.py
+++ b/quantui/freq_calc.py
@@ -228,8 +228,14 @@ def _status(msg: str) -> None:
elif method_upper == "UHF":
mf = scf.UHF(mol)
else:
+ # session 55: route through resolve_xc + maybe_apply_d3 so
+ # methods like wB97X-D (PySCF rejects "wb97x-d") map to the
+ # bare functional + external D3 dispersion.
+ from .session_calc import maybe_apply_d3, resolve_xc
+
mf = dft.RKS(mol) if mol.spin == 0 else dft.UKS(mol)
- mf.xc = method
+ mf.xc = resolve_xc(method)
+ mf = maybe_apply_d3(mf, method, progress_stream=stream)
try:
energy_hartree = float(mf.kernel())
diff --git a/quantui/nmr_calc.py b/quantui/nmr_calc.py
index 2bb604e..9eebf9d 100644
--- a/quantui/nmr_calc.py
+++ b/quantui/nmr_calc.py
@@ -125,7 +125,7 @@ def _run_nmr_calc_body(
import numpy as _np
from . import config as _config
- from .session_calc import _XC_ALIAS
+ from .session_calc import maybe_apply_d3, resolve_xc
mol = gto.Mole()
mol.atom = molecule.to_pyscf_format()
@@ -142,9 +142,13 @@ def _run_nmr_calc_body(
elif method_upper == "UHF":
mf = scf.UHF(mol)
else:
- xc_string = _XC_ALIAS.get(method, method)
+ # session 55: route through resolve_xc + maybe_apply_d3 so
+ # wB97X-D / PBE-D3 work for NMR calcs (was using raw _XC_ALIAS
+ # lookup before, which would fail for wB97X-D after the alias
+ # change to "wb97x" + external D3).
mf = dft.RKS(mol) if mol.spin == 0 else dft.UKS(mol)
- mf.xc = xc_string
+ mf.xc = resolve_xc(method)
+ mf = maybe_apply_d3(mf, method, progress_stream=stream)
try:
mf.kernel()
diff --git a/quantui/optimizer.py b/quantui/optimizer.py
index 42347f1..3a69924 100644
--- a/quantui/optimizer.py
+++ b/quantui/optimizer.py
@@ -144,9 +144,13 @@ def calculate(
elif method_upper == "UHF":
mf = scf.UHF(mol)
else:
- # DFT functional
+ # DFT functional. session 55: route through resolve_xc +
+ # maybe_apply_d3 so wB97X-D / PBE-D3 work mid-optimization.
+ from .session_calc import maybe_apply_d3, resolve_xc
+
mf = dft.RKS(mol) if mol.spin == 0 else dft.UKS(mol)
- mf.xc = self.method
+ mf.xc = resolve_xc(self.method)
+ mf = maybe_apply_d3(mf, self.method)
mf.verbose = 0
mf.stdout = _sink
diff --git a/quantui/session_calc.py b/quantui/session_calc.py
index 052417a..8a3a307 100644
--- a/quantui/session_calc.py
+++ b/quantui/session_calc.py
@@ -127,14 +127,80 @@ def summary(self) -> str:
# Maps QuantUI display names → PySCF xc strings where they differ.
+#
+# ``wB97X-D`` is a special case: PySCF + dftd3 cannot compose
+# ``mf.xc = "wb97x-d"`` cleanly (it's on dftd3's black-list — see
+# pyscf/pyscf#2069). The workaround that matches what our UI label
+# already claims ("wB97X-D — Range-Separated Hybrid + D3 Dispersion")
+# is to use the bare ``wb97x`` functional and apply D3 via dftd3
+# externally — same pattern as PBE-D3 below. This is D3, not the
+# original Chai 2008 D2; the empirical dispersion energies differ by
+# a few percent for most systems but the functional family is the same.
_XC_ALIAS: dict = {
"M06-L": "m06l",
- "wB97X-D": "wb97x-d",
+ "wB97X-D": "wb97x", # bare functional; D3 applied via _NEEDS_D3
"CAM-B3LYP": "camb3lyp",
"PBE-D3": "pbe", # base functional; D3 applied separately
}
# Methods that require Grimme D3 dispersion correction via pyscf.dftd3.
-_NEEDS_D3: frozenset = frozenset({"PBE-D3"})
+_NEEDS_D3: frozenset = frozenset({"PBE-D3", "wB97X-D"})
+
+
+def resolve_xc(method: str) -> str:
+    """Translate a QuantUI method name into the xc string given to PySCF.
+
+    ``_XC_ALIAS`` is searched case-insensitively, so both the display
+    spelling (``"wB97X-D"``) and an upper-cased one resolve to the same
+    entry. A method with no alias is returned unchanged.
+
+    This is meant to be the one place where display names become PySCF
+    xc names; DFT entry points should call it instead of assigning
+    ``method`` to ``mf.xc`` directly.
+    """
+    wanted = method.upper()
+    for display_name, xc_string in _XC_ALIAS.items():
+        if display_name.upper() == wanted:
+            return xc_string
+    return method
+
+
+def needs_d3(method: str) -> bool:
+    """Tell whether ``method`` needs an external D3 dispersion correction.
+
+    The name is first matched case-insensitively against the keys of
+    ``_XC_ALIAS`` to recover its canonical spelling; that spelling (or
+    ``method`` itself when nothing matches) is then looked up in
+    ``_NEEDS_D3``. Callers use the answer to decide whether to wrap the
+    SCF object with ``pyscf.dftd3.dftd3(mf)``.
+    """
+    wanted = method.upper()
+    for display_name in _XC_ALIAS:
+        if display_name.upper() == wanted:
+            return display_name in _NEEDS_D3
+    return method in _NEEDS_D3
+
+
+def maybe_apply_d3(mf, method: str, progress_stream=None):
+    """Return ``mf`` wrapped with D3 dispersion when ``method`` calls for it.
+
+    Args:
+        mf: SCF object to wrap.
+        method: QuantUI method name, checked with ``needs_d3``.
+        progress_stream: Optional writable stream for the user-facing
+            warning emitted when ``pyscf.dftd3`` cannot be imported.
+
+    Returns:
+        ``pyscf.dftd3.dftd3(mf)`` when D3 is required and available;
+        otherwise the original ``mf``. A failed warning write is ignored.
+    """
+    if not needs_d3(method):
+        return mf
+    try:
+        from pyscf import dftd3 as _dftd3
+
+        wrapped = _dftd3.dftd3(mf)
+    except ImportError:
+        # No dispersion backend: carry on uncorrected, but say so.
+        wrapped = mf
+        if progress_stream is not None:
+            try:
+                progress_stream.write(
+                    f"\n⚠ pyscf.dftd3 not available — running {method} "
+                    "without D3 correction.\n"
+                )
+            except Exception:  # noqa: BLE001 — cleanup (stream may be closed)
+                pass
+    return wrapped
def run_in_session(
@@ -257,8 +323,6 @@ def _run_session_calc_body(
# --- Select SCF method ---
method_upper = method.upper()
- # Normalise to the key used in _XC_ALIAS / _NEEDS_D3 (preserve original case)
- _method_key = next((k for k in _XC_ALIAS if k.upper() == method_upper), method)
if method_upper == "RHF":
mf = scf.RHF(mol)
@@ -272,25 +336,15 @@ def _run_session_calc_body(
# post-SCF below.
mf = scf.RHF(mol)
else:
- # DFT: resolve alias then auto-select RKS / UKS
- xc_string = _XC_ALIAS.get(_method_key, method)
+ # DFT: resolve alias then auto-select RKS / UKS. ``resolve_xc``
+ # handles the wB97X-D → wb97x + external D3 dispersion mapping
+ # (session 55 fix; see _XC_ALIAS docstring).
if mol.spin == 0:
mf = dft.RKS(mol)
else:
mf = dft.UKS(mol)
- mf.xc = xc_string
- # Apply D3 dispersion correction where needed
- if _method_key in _NEEDS_D3:
- try:
- from pyscf import dftd3 as _dftd3
-
- mf = _dftd3.dftd3(mf)
- except ImportError:
- if progress_stream is not None:
- progress_stream.write(
- f"\n⚠ pyscf.dftd3 not available — running {method} "
- "without D3 correction.\n"
- )
+ mf.xc = resolve_xc(method)
+ mf = maybe_apply_d3(mf, method, progress_stream=progress_stream)
# --- Wrap with implicit solvent (PCM) if requested ---
if solvent is not None:
diff --git a/quantui/tddft_calc.py b/quantui/tddft_calc.py
index 65567a9..1660652 100644
--- a/quantui/tddft_calc.py
+++ b/quantui/tddft_calc.py
@@ -195,8 +195,13 @@ def _run_tddft_calc_body(
elif method_upper == "UHF":
mf = scf.UHF(mol)
else:
+ # session 55: route through resolve_xc + maybe_apply_d3 so
+ # methods like wB97X-D (PySCF rejects "wb97x-d") map cleanly.
+ from .session_calc import maybe_apply_d3, resolve_xc
+
mf = dft.RKS(mol) if mol.spin == 0 else dft.UKS(mol)
- mf.xc = method
+ mf.xc = resolve_xc(method)
+ mf = maybe_apply_d3(mf, method, progress_stream=progress_stream)
if using_hf and progress_stream is not None:
try:
diff --git a/tests/test_est_calibration_resilience.py b/tests/test_est_calibration_resilience.py
new file mode 100644
index 0000000..4ba8d7e
--- /dev/null
+++ b/tests/test_est_calibration_resilience.py
@@ -0,0 +1,270 @@
+"""Tests for the calibration resilience fixes (session 55 user report).
+
+User-reported issues these tests guard against:
+
+1. Status indicator stayed "Idle" during calibration — covered by the
+ ``_activity_begin/_end`` wrapper in ``app_runflow.do_calibration``.
+ Not directly testable here (UI side); covered by the wrapper's
+ presence-in-source check below.
+2. No per-step progress visibility — ``_tail_last_status_line``
+ returns the most recent meaningful log line; tested directly.
+3. ``calibration.json`` dropped state on interrupt —
+ ``_save_calibration_json`` is now called after every step (not just
+ end-of-loop). Verified by reading source markers + a unit test on
+ the helper itself.
+4. Stop button didn't work mid-calc — ``run_calibration`` now uses
+ ``multiprocessing.Process`` so ``worker.terminate()`` cleanly
+ interrupts an in-flight step. The poll-loop logic is tested via
+ structure check; the actual termination is exercised by the
+ PySCF-gated integration test in ``test_benchmarks.py``.
+5. Calibration log file — ``_calibration_log_path`` returns a path
+ under ``QUANTUI_LOG_DIR``; tested directly.
+
+All tests are platform-independent.
+"""
+
+from __future__ import annotations
+
+import inspect
+import json
+
+import pytest
+
+from quantui import benchmarks
+from quantui.benchmarks import (
+ BenchmarkStep,
+ CalibrationResult,
+ _calibration_log_path,
+ _save_calibration_json,
+ _tail_last_status_line,
+)
+
+
+@pytest.fixture
+def isolated_log_dir(tmp_path, monkeypatch):
+ monkeypatch.setenv("QUANTUI_LOG_DIR", str(tmp_path))
+ return tmp_path
+
+
+# =====================================================================
+# _calibration_log_path
+# =====================================================================
+
+
+class TestCalibrationLogPath:
+ def test_respects_quantui_log_dir(self, isolated_log_dir):
+ path = _calibration_log_path("2026-05-25T12:00:00+00:00")
+ # Lives under QUANTUI_LOG_DIR exactly.
+ assert path.parent == isolated_log_dir
+
+ def test_filename_includes_timestamp(self, isolated_log_dir):
+ path = _calibration_log_path("2026-05-25T12:34:56+00:00")
+ assert path.name.startswith("calibration_")
+ assert path.name.endswith(".log")
+ # The timestamp is in the filename (sanitized — no colons since
+ # Windows file systems reject them).
+ assert ":" not in path.name
+ assert "2026-05-25" in path.name
+
+
+# =====================================================================
+# _tail_last_status_line
+# =====================================================================
+
+
+class TestTailLastStatusLine:
+ def test_missing_file_returns_empty(self, tmp_path):
+ assert _tail_last_status_line(tmp_path / "nope.log") == ""
+
+ def test_empty_file_returns_empty(self, tmp_path):
+ p = tmp_path / "empty.log"
+ p.write_text("", encoding="utf-8")
+ assert _tail_last_status_line(p) == ""
+
+ def test_prefers_quantui_status_marker(self, tmp_path):
+ p = tmp_path / "log.log"
+ p.write_text(
+ "some random PySCF output\n"
+ "[QuantUI_STATUS] Computing Hessian (3/12)\n"
+ "more PySCF noise after the marker\n",
+ encoding="utf-8",
+ )
+ out = _tail_last_status_line(p)
+ # The QuantUI_STATUS line wins even though it's not the last.
+ assert "[QuantUI_STATUS]" in out
+ assert "Hessian" in out
+
+ def test_falls_back_to_last_non_blank(self, tmp_path):
+ p = tmp_path / "log.log"
+ p.write_text(
+ "SCF iter 1 E=-1.0\n" "SCF iter 2 E=-1.5\n" "SCF converged\n" "\n",
+ encoding="utf-8",
+ )
+ # No status marker → return the last non-blank line.
+ assert _tail_last_status_line(p) == "SCF converged"
+
+ def test_truncates_long_lines(self, tmp_path):
+ p = tmp_path / "log.log"
+ long_line = "A" * 500
+ p.write_text(long_line + "\n", encoding="utf-8")
+ out = _tail_last_status_line(p)
+ # Hard cap is 120 chars in the helper.
+ assert len(out) <= 120
+
+
+# =====================================================================
+# _save_calibration_json
+# =====================================================================
+
+
+class TestSaveCalibrationJson:
+ def test_writes_to_user_home(self, monkeypatch, tmp_path):
+ # Redirect HOME so the helper writes into tmp_path, not
+ # ~/.quantui (which would clobber a real user setup).
+ monkeypatch.setenv("HOME", str(tmp_path))
+ monkeypatch.setenv("USERPROFILE", str(tmp_path)) # Windows
+ # Path.home() may not follow HOME/USERPROFILE on every platform; patch it too.
+ from pathlib import Path as _Path
+
+ monkeypatch.setattr(_Path, "home", lambda: tmp_path)
+
+ result = CalibrationResult(timestamp="2026-05-25T12:00:00+00:00", mode="tier1")
+ result.steps.append(
+ BenchmarkStep(
+ label="H2 RHF/STO-3G",
+ method="RHF",
+ basis="STO-3G",
+ n_atoms=2,
+ n_electrons=2,
+ status="ok",
+ elapsed_s=0.5,
+ n_basis=2,
+ calc_type="single_point",
+ )
+ )
+ log_path = tmp_path / "fake.log"
+
+ _save_calibration_json(result, log_path)
+ cal_path = tmp_path / ".quantui" / "calibration.json"
+ assert cal_path.exists()
+ data = json.loads(cal_path.read_text(encoding="utf-8"))
+ assert data["mode"] == "tier1"
+ assert data["n_completed"] == 1
+ assert data["steps"][0]["label"] == "H2 RHF/STO-3G"
+ assert data["log_path"] == str(log_path)
+
+ def test_partial_state_persisted_on_interrupt(self, monkeypatch, tmp_path):
+ # Simulates the user's scenario: tier 4 stopped at step 25/30.
+ # After the partial save, the on-disk record should show
+ # n_completed=24 (or however many ran) + stopped_early=True.
+ from pathlib import Path as _Path
+
+ monkeypatch.setattr(_Path, "home", lambda: tmp_path)
+
+ result = CalibrationResult(
+ timestamp="2026-05-25T12:00:00+00:00",
+ mode="tier4",
+ stopped_early=True,
+ )
+ # Add 24 ok steps + 1 stopped step.
+ for i in range(24):
+ result.steps.append(
+ BenchmarkStep(
+ label=f"step-{i}",
+ method="RHF",
+ basis="STO-3G",
+ n_atoms=2,
+ n_electrons=2,
+ status="ok",
+ elapsed_s=1.0,
+ n_basis=2,
+ calc_type="single_point",
+ )
+ )
+ result.steps.append(
+ BenchmarkStep(
+ label="step-stop",
+ method="B3LYP",
+ basis="6-31G*",
+ n_atoms=12,
+ n_electrons=42,
+ status="stopped",
+ elapsed_s=300.0,
+ n_basis=96,
+ calc_type="frequency",
+ )
+ )
+
+ _save_calibration_json(result, tmp_path / "fake.log")
+ cal_path = tmp_path / ".quantui" / "calibration.json"
+ data = json.loads(cal_path.read_text(encoding="utf-8"))
+
+ # User's actual complaint was that this dropped to None on
+ # interrupt. After the fix, the 24 completed runs must be on
+ # disk.
+ assert data["n_completed"] == 24
+ assert data["stopped_early"] is True
+ assert len(data["steps"]) == 25
+ # The stopped step is the last one.
+ assert data["steps"][-1]["status"] == "stopped"
+
+
+# =====================================================================
+# Source-level structure checks (defend against regression)
+# =====================================================================
+
+
+class TestRunCalibrationStructure:
+ """The fix touches ``run_calibration`` heavily. These tests assert
+ that key invariants of the new design are still present in the
+ source — so a future refactor that drops them fails loudly.
+ """
+
+ def test_uses_multiprocessing_process_not_thread_executor(self):
+ src = inspect.getsource(benchmarks.run_calibration)
+ # The Stop-button-mid-calc fix requires a process, not a
+ # ThreadPoolExecutor — threads can't be terminated externally.
+ assert "_mp.Process" not in src # we use _ctx.Process from a context
+ assert "Process" in src
+ assert "ThreadPoolExecutor" not in src
+
+ def test_poll_loop_checks_stop_event(self):
+ src = inspect.getsource(benchmarks.run_calibration)
+ # The poll loop must check ``stop_event.is_set()`` so the stop
+ # button reaches the worker within poll_interval (500 ms).
+ assert "stop_event" in src
+ assert "is_set()" in src
+ assert ".terminate()" in src
+
+ def test_saves_calibration_after_every_step(self):
+ src = inspect.getsource(benchmarks.run_calibration)
+ # Count textual occurrences of _save_calibration_json in the
+ # function source. Expect at least 3: one in the
+ # PySCF-unavailable branch, one after the main step completes,
+ # and the final idempotent write outside the loop.
+ n = src.count("_save_calibration_json")
+ assert n >= 3
+
+ def test_opens_log_file_at_start(self):
+ src = inspect.getsource(benchmarks.run_calibration)
+ # The per-run log file (the user requested this for tier 4)
+ # is opened with "w" mode at the top of the run.
+ assert "_calibration_log_path" in src
+ assert '"w"' in src or "'w'" in src
+
+
+class TestDoCalibrationStructure:
+ """``app_runflow.do_calibration`` got the ``_activity_begin/_end``
+ wrap so the toolbar badge stops reading 'Idle' during calibration.
+ """
+
+ def test_wraps_calibration_in_activity_markers(self):
+ from quantui import app_runflow
+
+ src = inspect.getsource(app_runflow.do_calibration)
+ # The Status-indicator-says-Idle fix (user's first complaint).
+ assert "_activity_begin" in src
+ assert "_activity_end" in src
+ # Must be in a try/finally so a calibration crash still flips
+ # the badge back.
+ assert "finally" in src
diff --git a/tests/test_est_calibration_tiers.py b/tests/test_est_calibration_tiers.py
new file mode 100644
index 0000000..79859c0
--- /dev/null
+++ b/tests/test_est_calibration_tiers.py
@@ -0,0 +1,185 @@
+"""Tests for M-EST / EST.4 — four-tier calibration suite.
+
+Covers:
+
+- Each of the 4 tier constants is well-formed (non-empty, each entry
+ has a valid 7- or 8-tuple shape).
+- The 8-tuple format (with explicit ``calc_type``) is correctly
+ normalized by ``_normalize_entry``.
+- Tier 3 contains at least one entry of each non-SP calc-type.
+- Tier 4 strictly contains tier 3 (and so on down to tier 1).
+- ``_MODE_TO_SUITE`` resolves all the mode strings — both the new
+ tier names and the legacy aliases.
+- ``run_calibration(mode="bogus")`` falls back to tier 1 without
+ crashing (graceful degradation).
+
+All tests are platform-independent. The PySCF-gated execution of
+``run_calibration`` itself lives in ``tests/test_benchmarks.py`` —
+this file checks the suite *shape* without running PySCF.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from quantui import benchmarks
+from quantui.benchmarks import (
+ _MODE_TO_SUITE,
+ BENCHMARK_SUITE,
+ BENCHMARK_SUITE_LONG,
+ BENCHMARK_SUITE_TIER1,
+ BENCHMARK_SUITE_TIER2,
+ BENCHMARK_SUITE_TIER3,
+ BENCHMARK_SUITE_TIER4,
+ _normalize_entry,
+)
+
+_SP = "single_point"
+_OPT = "geometry_opt"
+_FREQ = "frequency"
+
+
+class TestTierSuites:
+ def test_tier1_alias_matches_legacy_short(self):
+ # Back-compat: BENCHMARK_SUITE_TIER1 is the same object as
+ # BENCHMARK_SUITE (existing tests + app.py imports rely on this).
+ assert BENCHMARK_SUITE_TIER1 is BENCHMARK_SUITE
+
+ def test_tier2_alias_matches_legacy_long(self):
+ assert BENCHMARK_SUITE_TIER2 is BENCHMARK_SUITE_LONG
+
+ def test_tier2_extends_tier1(self):
+ # Tier 2 contains every tier-1 entry plus more.
+ assert len(BENCHMARK_SUITE_TIER2) > len(BENCHMARK_SUITE_TIER1)
+ for entry in BENCHMARK_SUITE_TIER1:
+ assert entry in BENCHMARK_SUITE_TIER2
+
+ def test_tier3_extends_tier2(self):
+ assert len(BENCHMARK_SUITE_TIER3) > len(BENCHMARK_SUITE_TIER2)
+ for entry in BENCHMARK_SUITE_TIER2:
+ assert entry in BENCHMARK_SUITE_TIER3
+
+ def test_tier4_extends_tier3(self):
+ assert len(BENCHMARK_SUITE_TIER4) > len(BENCHMARK_SUITE_TIER3)
+ for entry in BENCHMARK_SUITE_TIER3:
+ assert entry in BENCHMARK_SUITE_TIER4
+
+ def test_tier1_and_tier2_are_sp_only(self):
+ # Lower tiers stay 7-tuple (pure single-point) by design — the
+ # user explicitly wanted tier 2 to remain SP-only.
+ for entry in BENCHMARK_SUITE_TIER1:
+ assert len(entry) == 7
+ for entry in BENCHMARK_SUITE_TIER2:
+ assert len(entry) == 7
+
+ def test_tier3_introduces_geom_opt_and_freq(self):
+ # Tier 3 must add at least one geom-opt AND at least one freq.
+ calc_types = {_normalize_entry(e)["calc_type"] for e in BENCHMARK_SUITE_TIER3}
+ assert _OPT in calc_types
+ assert _FREQ in calc_types
+ # And keep the SP majority.
+ n_sp = sum(
+ 1 for e in BENCHMARK_SUITE_TIER3 if _normalize_entry(e)["calc_type"] == _SP
+ )
+ assert n_sp > len(BENCHMARK_SUITE_TIER3) // 2
+
+ def test_tier4_has_post_hf_anchors(self):
+ # Tier 4 must include MP2 + CCSD entries so the β=5.0 / β=6.0
+ # scaling exponents in calc_log have calibration data.
+ methods = {_normalize_entry(e)["method"] for e in BENCHMARK_SUITE_TIER4}
+ assert "MP2" in methods
+ assert "CCSD" in methods
+
+ def test_tier4_includes_benzene_freq(self):
+ # Benzene B3LYP/6-31G* frequency is the workhorse parallel-IR
+ # anchor (12 atoms × 6 = 72 inner SCFs).
+ labels = [_normalize_entry(e)["label"] for e in BENCHMARK_SUITE_TIER4]
+ assert any("benzene" in lbl.lower() and "freq" in lbl.lower() for lbl in labels)
+
+
+class TestNormalizeEntry:
+ def test_seven_tuple_defaults_to_single_point(self):
+ entry = (
+ "H₂ RHF/STO-3G",
+ ["H", "H"],
+ [[0, 0, 0], [0, 0, 0.74]],
+ 0,
+ 1,
+ "RHF",
+ "STO-3G",
+ )
+ out = _normalize_entry(entry)
+ assert out["calc_type"] == _SP
+ assert out["method"] == "RHF"
+ assert out["basis"] == "STO-3G"
+
+ def test_eight_tuple_overrides_calc_type(self):
+ entry = (
+ "H₂O B3LYP/STO-3G [GeoOpt]",
+ ["O", "H", "H"],
+ [[0, 0, 0], [0.7, 0.6, 0], [-0.7, 0.6, 0]],
+ 0,
+ 1,
+ "B3LYP",
+ "STO-3G",
+ "geometry_opt",
+ )
+ out = _normalize_entry(entry)
+ assert out["calc_type"] == "geometry_opt"
+
+ def test_invalid_length_raises_valueerror(self):
+ with pytest.raises(ValueError, match="7 or 8 fields"):
+ _normalize_entry(("label", ["H"])) # only 2 fields
+
+ def test_all_tier_entries_normalize_cleanly(self):
+ # Every entry in every tier must normalize without raising.
+ for tier in (
+ BENCHMARK_SUITE_TIER1,
+ BENCHMARK_SUITE_TIER2,
+ BENCHMARK_SUITE_TIER3,
+ BENCHMARK_SUITE_TIER4,
+ ):
+ for entry in tier:
+ out = _normalize_entry(entry)
+ assert out["calc_type"] in (_SP, _OPT, _FREQ)
+ assert len(out["atoms"]) == len(out["coords"])
+
+
+class TestModeToSuite:
+ def test_new_tier_names_resolve(self):
+ assert _MODE_TO_SUITE["tier1"] is BENCHMARK_SUITE_TIER1
+ assert _MODE_TO_SUITE["tier2"] is BENCHMARK_SUITE_TIER2
+ assert _MODE_TO_SUITE["tier3"] is BENCHMARK_SUITE_TIER3
+ assert _MODE_TO_SUITE["tier4"] is BENCHMARK_SUITE_TIER4
+
+ def test_legacy_short_long_aliases(self):
+ # Back-compat: any pinned UI state or older callers using "short"
+ # or "long" should still resolve.
+ assert _MODE_TO_SUITE["short"] is BENCHMARK_SUITE_TIER1
+ assert _MODE_TO_SUITE["long"] is BENCHMARK_SUITE_TIER2
+
+
+class TestUnknownModeFallback:
+    def test_unknown_mode_does_not_raise(self, monkeypatch, tmp_path):
+        # Keep this run's log file and calibration.json out of the
+        # developer's real QUANTUI_LOG_DIR / home directory.
+        monkeypatch.setenv("QUANTUI_LOG_DIR", str(tmp_path))
+        monkeypatch.setattr("pathlib.Path.home", lambda: tmp_path)
+        # A typo'd mode must fall back to tier 1 rather than crash.
+        assert benchmarks.run_calibration(mode="bogus_mode").mode == "tier1"
+
+
+class TestCalibrationResult:
+ def test_n_total_uses_active_mode(self):
+ from quantui.benchmarks import CalibrationResult
+
+ r1 = CalibrationResult(timestamp="t", mode="tier1")
+ r2 = CalibrationResult(timestamp="t", mode="tier2")
+ r3 = CalibrationResult(timestamp="t", mode="tier3")
+ r4 = CalibrationResult(timestamp="t", mode="tier4")
+ assert r1.n_total == len(BENCHMARK_SUITE_TIER1)
+ assert r2.n_total == len(BENCHMARK_SUITE_TIER2)
+ assert r3.n_total == len(BENCHMARK_SUITE_TIER3)
+ assert r4.n_total == len(BENCHMARK_SUITE_TIER4)
+ # Strict ordering by tier depth.
+ assert r1.n_total < r2.n_total < r3.n_total < r4.n_total
diff --git a/tests/test_est_estimator.py b/tests/test_est_estimator.py
new file mode 100644
index 0000000..b56ddf9
--- /dev/null
+++ b/tests/test_est_estimator.py
@@ -0,0 +1,316 @@
+"""Tests for M-EST estimator hardening.
+
+Covers:
+
+- **EST.1**: GPU-aware filtering — passing ``gpu_used`` partitions the
+ candidate pool so GPU-history predicts GPU runs and CPU-history
+ predicts CPU runs. Includes the partition-fallback path (insufficient
+ records → fall back to mixed pool, downgrade confidence).
+- **EST.3**: IQR outlier rejection — a single anomalously-slow record
+ no longer dominates the median.
+- **EST.3**: variance-aware confidence — high-variance pools report
+ "low" confidence even with many samples.
+
+All tests are platform-independent. ``perf_log.jsonl`` is redirected to
+``tmp_path`` via the ``QUANTUI_LOG_DIR`` env var so the user's real log
+is never touched.
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from quantui.calc_log import (
+ _coefficient_of_variation,
+ _confidence_label,
+ _iqr_filter,
+ estimate_time,
+)
+
+
+@pytest.fixture
+def isolated_log_dir(tmp_path, monkeypatch):
+ monkeypatch.setenv("QUANTUI_LOG_DIR", str(tmp_path))
+ return tmp_path
+
+
+def _seed_perf_log(log_dir, records):
+ path = log_dir / "perf_log.jsonl"
+ with path.open("w", encoding="utf-8") as fh:
+ for r in records:
+ fh.write(json.dumps(r) + "\n")
+ return path
+
+
+def _rec(
+ *,
+ elapsed_s: float,
+ gpu_used=None,
+ method="B3LYP",
+ basis="STO-3G",
+ n_basis=15,
+ n_electrons=10,
+ calc_type="single_point",
+ converged=True,
+ n_cores=1,
+):
+ r = {
+ "timestamp": "2026-05-25T12:00:00+00:00",
+ "formula": "H2O",
+ "n_atoms": 3,
+ "n_electrons": n_electrons,
+ "method": method,
+ "basis": basis,
+ "n_iterations": 10,
+ "elapsed_s": elapsed_s,
+ "converged": converged,
+ "n_basis": n_basis,
+ "n_cores": n_cores,
+ "calc_type": calc_type,
+ }
+ if gpu_used is not None:
+ r["gpu_used"] = gpu_used
+ return r
+
+
+# =====================================================================
+# EST.1 — GPU-aware filtering
+# =====================================================================
+
+
+class TestGpuAwareFiltering:
+ def test_gpu_pool_used_when_requested(self, isolated_log_dir):
+ # 5 GPU records (fast) + 5 CPU records (slow) for the same calc.
+ records = [_rec(elapsed_s=1.0, gpu_used=True) for _ in range(5)]
+ records += [_rec(elapsed_s=10.0, gpu_used=False) for _ in range(5)]
+ _seed_perf_log(isolated_log_dir, records)
+
+ gpu_est = estimate_time(
+ n_atoms=3,
+ n_electrons=10,
+ method="B3LYP",
+ basis="STO-3G",
+ n_basis=15,
+ calc_type="single_point",
+ gpu_used=True,
+ )
+ cpu_est = estimate_time(
+ n_atoms=3,
+ n_electrons=10,
+ method="B3LYP",
+ basis="STO-3G",
+ n_basis=15,
+ calc_type="single_point",
+ gpu_used=False,
+ )
+
+ assert gpu_est is not None
+ assert cpu_est is not None
+ # GPU prediction should land near 1.0 s; CPU near 10.0 s.
+ assert gpu_est["seconds"] < 3.0
+ assert cpu_est["seconds"] > 5.0
+ # And they should differ by roughly the recorded factor.
+ assert cpu_est["seconds"] / gpu_est["seconds"] > 3.0
+
+ def test_none_gpu_used_uses_full_pool(self, isolated_log_dir):
+ # Default callers (gpu_used=None) get the mixed-pool estimate.
+ records = [_rec(elapsed_s=1.0, gpu_used=True) for _ in range(3)]
+ records += [_rec(elapsed_s=11.0, gpu_used=False) for _ in range(3)]
+ _seed_perf_log(isolated_log_dir, records)
+
+ est = estimate_time(
+ n_atoms=3,
+ n_electrons=10,
+ method="B3LYP",
+ basis="STO-3G",
+ n_basis=15,
+ calc_type="single_point",
+ # gpu_used omitted → None → no partition
+ )
+ assert est is not None
+ # The mixed-pool median falls between the GPU and CPU clusters.
+ assert 1.0 < est["seconds"] < 11.0
+
+ def test_pre_session55_records_count_as_cpu(self, isolated_log_dir):
+ # Old records have no `gpu_used` key. Requesting gpu_used=False
+ # must still admit them (they predate GPU support; conservative
+ # assumption is they ran CPU-side).
+ records = [_rec(elapsed_s=10.0) for _ in range(5)]
+ # Remove the gpu_used key from each (already absent — _rec
+ # only adds it when explicit). Sanity check:
+ assert all("gpu_used" not in r for r in records)
+ _seed_perf_log(isolated_log_dir, records)
+
+ cpu_est = estimate_time(
+ n_atoms=3,
+ n_electrons=10,
+ method="B3LYP",
+ basis="STO-3G",
+ n_basis=15,
+ calc_type="single_point",
+ gpu_used=False,
+ )
+ assert cpu_est is not None
+ # Should predict roughly 10 s.
+ assert 5.0 < cpu_est["seconds"] < 20.0
+
+ def test_gpu_partition_fallback_downgrades_confidence(self, isolated_log_dir):
+ # Only 1 GPU record (not enough to partition) + 5 CPU records.
+ records = [_rec(elapsed_s=1.0, gpu_used=True)]
+ records += [_rec(elapsed_s=10.0, gpu_used=False) for _ in range(5)]
+ _seed_perf_log(isolated_log_dir, records)
+
+ gpu_est = estimate_time(
+ n_atoms=3,
+ n_electrons=10,
+ method="B3LYP",
+ basis="STO-3G",
+ n_basis=15,
+ calc_type="single_point",
+ gpu_used=True,
+ )
+ assert gpu_est is not None
+ # The mixed fallback pool has 6 entries (1 GPU + 5 CPU) → would
+ # normally be "high" or "medium"; with GPU fallback the
+ # confidence is downgraded one notch.
+ assert gpu_est["confidence"] in ("medium", "low")
+
+
+# =====================================================================
+# EST.3 — IQR outlier rejection
+# =====================================================================
+
+
+class TestIqrFilter:
+ def test_passes_through_small_pools(self):
+ # IQR isn't meaningful on N < 4 — preserve all values.
+ assert _iqr_filter([1.0, 2.0, 3.0]) == [1.0, 2.0, 3.0]
+
+ def test_drops_high_outlier(self):
+ # 4 values clustered near 10, one anomalous 100.
+ result = _iqr_filter([10.0, 10.5, 9.5, 10.2, 100.0])
+ assert 100.0 not in result
+ # The clustered values are preserved.
+ for v in (10.0, 10.5, 9.5, 10.2):
+ assert v in result
+
+ def test_drops_low_outlier(self):
+ result = _iqr_filter([100.0, 105.0, 95.0, 102.0, 1.0])
+ assert 1.0 not in result
+
+ def test_all_equal_pool_unchanged(self):
+ # IQR = 0 → no fence — return everything.
+ assert _iqr_filter([5.0, 5.0, 5.0, 5.0, 5.0]) == [5.0, 5.0, 5.0, 5.0, 5.0]
+
+
+class TestEstimatorOutlierRobustness:
+ def test_single_outlier_does_not_dominate_prediction(self, isolated_log_dir):
+ # 5 records ~1 s + 1 anomalous 100 s record. The naive median is
+ # ~1 s already (the outlier sits at position 6/6); but if the
+ # outlier is included the IQR-filtered median should still be 1 s.
+ records = [_rec(elapsed_s=1.0) for _ in range(5)]
+ records.append(_rec(elapsed_s=100.0))
+ _seed_perf_log(isolated_log_dir, records)
+
+ est = estimate_time(
+ n_atoms=3,
+ n_electrons=10,
+ method="B3LYP",
+ basis="STO-3G",
+ n_basis=15,
+ calc_type="single_point",
+ )
+ assert est is not None
+ # Five of the six records sit at 1.0 s, so even the unfiltered
+ # median is ~1 s. This is therefore a smoke test: it only checks
+ # that the 100 s outlier does not drag the estimate above 3 s.
+ # NOTE: nothing here asserts that the IQR filter dropped a
+ # record (no sample-count check) — see TestIqrFilter for that.
+ assert est["seconds"] < 3.0
+
+
+# =====================================================================
+# EST.3 — Variance-aware confidence
+# =====================================================================
+
+
+class TestCoefficientOfVariation:
+ def test_low_variance(self):
+ # All values within 1% of mean — CV ~ 0.005.
+ cv = _coefficient_of_variation([10.0, 10.05, 9.95, 10.02])
+ assert cv < 0.05
+
+ def test_high_variance(self):
+ # Values spanning 1-10s on a single (method, basis) — CV > 0.4.
+ cv = _coefficient_of_variation([1.0, 5.0, 10.0, 3.0, 8.0])
+ assert cv > 0.4
+
+ def test_zero_mean_returns_zero(self):
+ assert _coefficient_of_variation([0.0, 0.0, 0.0]) == 0.0
+
+ def test_single_value_returns_zero(self):
+ assert _coefficient_of_variation([5.0]) == 0.0
+
+
+class TestConfidenceLabel:
+ def test_low_variance_high_samples_yields_high(self):
+ # 6 samples, all ~10 s → CV < 0.15 → "high"
+ assert _confidence_label([10.0, 10.1, 9.9, 10.05, 9.95, 10.02], 6) == "high"
+
+ def test_high_variance_yields_low_even_with_many_samples(self):
+ # 10 samples spanning 1-30 → CV > 0.35 → "low"
+ wild = [1.0, 5.0, 30.0, 2.0, 25.0, 4.0, 28.0, 3.0, 20.0, 10.0]
+ assert _confidence_label(wild, len(wild)) == "low"
+
+ def test_few_samples_cap_at_medium(self):
+ # 3 samples is enough for CV but caps below "high"
+ assert _confidence_label([10.0, 10.05, 9.95], 3) == "medium"
+
+ def test_under_three_samples_always_low(self):
+ assert _confidence_label([10.0, 10.05], 2) == "low"
+
+ def test_medium_variance_yields_medium(self):
+ # CV around 0.25 — between the 0.15 and 0.35 thresholds → "medium"
+ med = [10.0, 14.0, 7.0, 12.0, 8.0, 11.0]
+ label = _confidence_label(med, len(med))
+ assert label == "medium"
+
+
+class TestEstimatorVarianceAwareConfidence:
+ def test_high_variance_pool_reports_low_confidence(self, isolated_log_dir):
+ # 6 records but with huge spread — confidence MUST be "low",
+ # not "high" just because n_samples >= 5.
+ records = [_rec(elapsed_s=t) for t in (1.0, 5.0, 30.0, 2.0, 25.0, 4.0)]
+ _seed_perf_log(isolated_log_dir, records)
+
+ est = estimate_time(
+ n_atoms=3,
+ n_electrons=10,
+ method="B3LYP",
+ basis="STO-3G",
+ n_basis=15,
+ calc_type="single_point",
+ )
+ assert est is not None
+ assert est["confidence"] == "low"
+
+ def test_tight_pool_with_many_samples_reports_high(self, isolated_log_dir):
+ # 10 tightly-clustered samples — confidence should be "high".
+ records = [
+ _rec(elapsed_s=t)
+ for t in (1.0, 1.02, 0.98, 1.01, 0.99, 1.03, 0.97, 1.0, 1.0, 1.0)
+ ]
+ _seed_perf_log(isolated_log_dir, records)
+
+ est = estimate_time(
+ n_atoms=3,
+ n_electrons=10,
+ method="B3LYP",
+ basis="STO-3G",
+ n_basis=15,
+ calc_type="single_point",
+ )
+ assert est is not None
+ assert est["confidence"] == "high"
diff --git a/tests/test_xc_resolution.py b/tests/test_xc_resolution.py
new file mode 100644
index 0000000..fe13fee
--- /dev/null
+++ b/tests/test_xc_resolution.py
@@ -0,0 +1,247 @@
+"""Tests for the session-55 xc-alias / D3-dispersion resolution helpers.
+
+The user's tier-3 calibration output showed ``H₂O wB97X-D/6-31G*`` erroring
+at 0.01 s — PySCF rejects ``mf.xc = "wb97x-d"`` because that composite
+name is on the dftd3 black-list (pyscf/pyscf#2069). The fix:
+
+- Alias ``wB97X-D`` to bare ``wb97x``.
+- Add ``wB97X-D`` to ``_NEEDS_D3`` so dispersion is applied via
+ ``pyscf.dftd3``, matching the UI label that already promises D3.
+- Extract ``resolve_xc()`` + ``maybe_apply_d3()`` so every DFT entry
+ point (session_calc / freq_calc / tddft_calc / optimizer / nmr_calc /
+ the script-export template) shares the same resolution logic. Before
+ session 55 only ``session_calc`` had the alias lookup, meaning
+ wB97X-D would have errored in EVERY non-SP workflow too.
+
+All tests here are platform-independent. PySCF-gated round-trip tests
+live in the other module suites that already gate on ``_PYSCF_AVAILABLE``.
+"""
+
+from __future__ import annotations
+
+import inspect
+
+from quantui.session_calc import (
+ _NEEDS_D3,
+ _XC_ALIAS,
+ maybe_apply_d3,
+ needs_d3,
+ resolve_xc,
+)
+
+# =====================================================================
+# resolve_xc — the core mapping
+# =====================================================================
+
+
+class TestResolveXc:
+ def test_wb97x_d_resolves_to_bare_wb97x(self):
+ # The session-55 bug: PySCF rejects "wb97x-d". Bare wb97x is
+ # the right xc string; D3 dispersion is applied separately.
+ assert resolve_xc("wB97X-D") == "wb97x"
+
+ def test_wb97x_d_case_insensitive(self):
+ # Users sometimes type "WB97X-D" or "wb97x-d" — all should resolve.
+ for spelling in ("wB97X-D", "WB97X-D", "wb97x-d", "Wb97x-D"):
+ assert resolve_xc(spelling) == "wb97x"
+
+ def test_pbe_d3_resolves_to_bare_pbe(self):
+ # PBE-D3 is the long-standing pattern this fix mirrors.
+ assert resolve_xc("PBE-D3") == "pbe"
+
+ def test_m06_l_aliased(self):
+ assert resolve_xc("M06-L") == "m06l"
+
+ def test_cam_b3lyp_aliased(self):
+ assert resolve_xc("CAM-B3LYP") == "camb3lyp"
+
+ def test_unaliased_methods_pass_through(self):
+ # B3LYP, PBE0, M06-2X, HSE06 — PySCF accepts them as-is.
+ for method in ("B3LYP", "PBE0", "M06-2X", "HSE06", "PBE", "B3PW91"):
+ assert resolve_xc(method) == method
+
+ def test_unknown_method_passes_through(self):
+ # Forward-compat: a new method not in the table returns unchanged
+ # so PySCF gets to decide whether to accept it.
+ assert resolve_xc("FUTURE-METHOD") == "FUTURE-METHOD"
+
+
+# =====================================================================
+# needs_d3 — gates external dispersion wrapping
+# =====================================================================
+
+
+class TestNeedsD3:
+ def test_wb97x_d_needs_d3(self):
+ # The session-55 fix: wB97X-D now needs external D3.
+ assert needs_d3("wB97X-D") is True
+
+ def test_pbe_d3_needs_d3(self):
+ assert needs_d3("PBE-D3") is True
+
+ def test_case_insensitive(self):
+ assert needs_d3("WB97X-D") is True
+ assert needs_d3("pbe-d3") is True
+
+ def test_dispersion_free_methods_dont_need_d3(self):
+ for method in ("RHF", "UHF", "B3LYP", "PBE0", "M06-2X", "HSE06"):
+ assert needs_d3(method) is False
+
+ def test_unknown_method_doesnt_need_d3(self):
+ # Default: only methods explicitly in _NEEDS_D3 get the wrap.
+ assert needs_d3("FUTURE-METHOD") is False
+
+
+# =====================================================================
+# maybe_apply_d3 — graceful degradation when dftd3 unavailable
+# =====================================================================
+
+
+class _FakeMf:
+ """Stand-in for a PySCF mf object — just needs to be identity-comparable."""
+
+ def __init__(self, label):
+ self.label = label
+
+
+class TestMaybeApplyD3:
+    def test_no_d3_method_returns_mf_unchanged(self):
+        mf = _FakeMf("B3LYP")
+        result = maybe_apply_d3(mf, "B3LYP")
+        assert result is mf
+
+    def test_d3_method_with_missing_pyscf_returns_mf_unchanged(self, monkeypatch):
+        # Simulate PySCF being absent (typical on Windows). The
+        # ``from pyscf import dftd3`` statement reaches ``__import__``
+        # with name="pyscf", so the whole package must be blocked.
+        import builtins
+
+        original_import = builtins.__import__
+
+        def _fake_import(name, *args, **kwargs):
+            if name == "pyscf" or name.startswith("pyscf."):
+                raise ImportError("simulated")
+            return original_import(name, *args, **kwargs)
+
+        monkeypatch.setattr(builtins, "__import__", _fake_import)
+
+        mf = _FakeMf("wB97X-D")
+        # Without progress_stream — must not raise.
+        result = maybe_apply_d3(mf, "wB97X-D")
+        assert result is mf
+
+    def test_d3_warning_written_to_progress_stream(self, monkeypatch):
+        import builtins
+        import io
+
+        original_import = builtins.__import__
+
+        def _fake_import(name, *args, **kwargs):
+            if name == "pyscf" or name.startswith("pyscf."):
+                raise ImportError("simulated")
+            return original_import(name, *args, **kwargs)
+
+        monkeypatch.setattr(builtins, "__import__", _fake_import)
+
+        stream = io.StringIO()
+        maybe_apply_d3(_FakeMf("wB97X-D"), "wB97X-D", progress_stream=stream)
+        out = stream.getvalue()
+        # User must see the missing-dispersion warning.
+        assert "dftd3 not available" in out
+        assert "wB97X-D" in out
+
+
+# =====================================================================
+# Coverage check — every DFT entry point uses the helpers
+# =====================================================================
+
+
+class TestEntryPointsUseHelpers:
+ """The bug bit because freq_calc / tddft_calc / optimizer / nmr_calc
+ bypassed the alias lookup. These source-level tests guard against
+ a regression that re-introduces ``mf.xc = method`` directly.
+ """
+
+ def test_session_calc_uses_resolve_xc(self):
+ # The real DFT branch lives in ``_run_session_calc_body`` (the
+ # inner function that ``run_in_session`` calls), so grep the whole
+ # module source rather than just the public wrapper.
+ from quantui import session_calc
+
+ src = inspect.getsource(session_calc)
+ assert "resolve_xc(method)" in src
+ assert "maybe_apply_d3(mf, method" in src
+
+ def test_freq_calc_uses_resolve_xc(self):
+ from quantui import freq_calc
+
+ # The full module source — covers both the outer SCF setup and
+ # any inner SCF helpers.
+ src = inspect.getsource(freq_calc)
+ assert "resolve_xc" in src
+ # The inner displaced-SCF helper reads mf.xc directly (which by
+ # then is already resolved), so maybe_apply_d3 only appears in
+ # the outer setup. One usage is enough.
+
+ def test_tddft_calc_uses_resolve_xc(self):
+ from quantui import tddft_calc
+
+ src = inspect.getsource(tddft_calc)
+ assert "resolve_xc" in src
+ assert "maybe_apply_d3" in src
+
+ def test_optimizer_uses_resolve_xc(self):
+ from quantui import optimizer
+
+ src = inspect.getsource(optimizer)
+ assert "resolve_xc" in src
+ assert "maybe_apply_d3" in src
+
+ def test_nmr_calc_uses_resolve_xc(self):
+ from quantui import nmr_calc
+
+ src = inspect.getsource(nmr_calc)
+ assert "resolve_xc" in src
+ assert "maybe_apply_d3" in src
+
+ def test_script_template_embeds_alias_resolution(self):
+ # The script-export template generates a standalone .py file
+ # — can't depend on quantui imports — so the alias table is
+ # inlined.
+ from quantui.config import PYSCF_SCRIPT_TEMPLATE
+
+ # The literal alias for wB97X-D in the template should be the
+ # bare functional (post-session-55 fix). Doubled-brace literals
+ # in the template appear as single braces in the output.
+ assert "'wB97X-D': 'wb97x'" in PYSCF_SCRIPT_TEMPLATE
+ assert "_NEEDS_D3" in PYSCF_SCRIPT_TEMPLATE
+ # The old (broken) "wb97x-d" string must NOT appear.
+ assert "'wB97X-D': 'wb97x-d'" not in PYSCF_SCRIPT_TEMPLATE
+
+
+# =====================================================================
+# Sanity: aliases stay in sync with config.SUPPORTED_METHODS
+# =====================================================================
+
+
+class TestAliasTableConsistency:
+ def test_every_d3_method_has_an_alias(self):
+ # If a method is in _NEEDS_D3 it MUST also be in _XC_ALIAS
+ # — otherwise resolve_xc passes the display name straight to
+ # PySCF, which is exactly the bug.
+ for method in _NEEDS_D3:
+ assert method in _XC_ALIAS, (
+ f"{method!r} is in _NEEDS_D3 but not in _XC_ALIAS — "
+ "PySCF will receive the display name and likely error."
+ )
+
+ def test_all_aliased_methods_in_supported_list(self):
+ # Sanity: every alias key is actually a method the UI exposes
+ # — otherwise the alias is dead code that no calc path can hit.
+ from quantui.config import SUPPORTED_METHODS
+
+ for method in _XC_ALIAS:
+ assert method in SUPPORTED_METHODS, (
+ f"{method!r} is aliased in _XC_ALIAS but not in "
+ f"config.SUPPORTED_METHODS — dead code or removed method."
+ )
From 0a46325c4e10a5b62b2e0ef8f25cf067057a550d Mon Sep 17 00:00:00 2001
From: NCCU-Schultz-Lab
Date: Mon, 25 May 2026 13:43:48 -0400
Subject: [PATCH 24/33] Polish UI text, calibration spawn & progress
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Rename and polish user-facing strings and behavior across the app: "Status" tab → "System Settings"; user-facing "Pre-optimisation" wording changed to "Geometry optimization" in app, analysis and saved-result notes (filenames kept for back-compat). Update Help toggle from "?" to a fuller "Help" button with an icon and wider layout. Benchmarks: always use multiprocessing spawn context to avoid fork/CUDA collisions, extend progress_cb wrapper to accept live_message and step kwargs (with fallbacks), surface richer worker-exit diagnostics, and pass the full BenchmarkStep to final progress calls. Increase history thumbnail resolution (larger figsize and dpi) for crisper text. Update tests to match the new wording and verify the new pre-opt exception guard.
---
quantui/app.py | 65 +++++++++++++--------
quantui/app_analysis.py | 20 +++++--
quantui/app_builders.py | 12 +++-
quantui/benchmarks.py | 72 +++++++++++++++++++-----
quantui/results_storage.py | 14 ++++-
tests/test_bug_regressions_2026_05_25.py | 5 +-
6 files changed, 137 insertions(+), 51 deletions(-)
diff --git a/quantui/app.py b/quantui/app.py
index 30c1004..5455a9c 100644
--- a/quantui/app.py
+++ b/quantui/app.py
@@ -1408,7 +1408,10 @@ def _assemble_tabs(self) -> None:
self.root_tab.set_title(4, "Compare")
self.root_tab.set_title(5, "Log")
self.root_tab.set_title(6, "Files")
- self.root_tab.set_title(7, "Status")
+ # POLISH.4 (M-POLISH, 2026-05-25): "Status" was ambiguous —
+ # status of what? "System Settings" is what the tab actually
+ # holds (env info + calibration + GPU status + UI prefs).
+ self.root_tab.set_title(7, "System Settings")
self.root_tab.observe(
self._safe_cb(self._on_root_tab_changed), names="selected_index"
)
@@ -3510,10 +3513,16 @@ def _run_required_final_single_point(target_mol, reason: str):
):
from quantui import optimize_geometry
- self.run_status.value = f"Pre-optimizing geometry before {ct}…"
+ # POLISH.9 (M-POLISH, 2026-05-25): rename user-facing
+ # "Pre-optimisation" → "Geometry optimization". The
+ # wrapped operation is the full DFT geom-opt at the
+ # user's selected method/basis — same code path as the
+ # standalone Geometry Opt calc-type. The LJ classical
+ # pre-opt earlier (around line 3488) keeps its name.
+ self.run_status.value = f"Optimizing geometry before {ct}…"
log.write(
- f"\n── Pre-optimisation (before {ct}) "
- f"────────────────────────────────────\n"
+ f"\n── Geometry optimization (before {ct}) "
+ f"────────────────────────────\n"
)
# BUG C (2026-05-25): catch numerical failures (e.g.
# singular matrix in cho_solve on tight rings) and fall
@@ -3531,22 +3540,22 @@ def _run_required_final_single_point(target_mol, reason: str):
"converged" if _pre_opt.converged else "did NOT fully converge"
)
log.write(
- f"\nPre-optimisation {_conv_str} in {_pre_opt.n_steps} steps."
+ f"\nGeometry optimization {_conv_str} in {_pre_opt.n_steps} steps."
f" E = {_pre_opt.energies_hartree[-1]:.8f} Ha\n\n"
)
if not _pre_opt.converged:
log.write(
- "⚠ Pre-optimisation did not fully converge — "
+ "⚠ Geometry optimization did not fully converge — "
"proceeding with best available geometry.\n\n"
)
if ct != "Single Point":
_run_required_final_single_point(
calc_mol,
- f"after pre-optimisation before {ct}",
+ f"after geometry optimization before {ct}",
)
except Exception as _pre_exc:
log.write(
- f"\n⚠ Pre-optimisation failed: {_pre_exc}\n"
+ f"\n⚠ Geometry optimization failed: {_pre_exc}\n"
" Proceeding with the user-provided geometry "
"as-is.\n\n"
)
@@ -3613,10 +3622,16 @@ def _run_required_final_single_point(target_mol, reason: str):
f"Atoms: {len(calc_mol.atoms)}\n\n"
)
- # ── Step 2: optional geometry pre-optimisation ────────────────
+ # ── Step 2: optional geometry optimization ────────────────────
#
- # BUG C (2026-05-25): pre-opt can hit a singular matrix in
- # PySCF's ``cho_solve`` on tight rings (e.g. aromatic
+ # POLISH.9 (M-POLISH, 2026-05-25): renamed from
+ # "pre-optimisation" — the wrapped operation is a full
+ # DFT geometry optimization at the user's selected
+ # method/basis. The LJ-classical pre-opt is in
+ # quantui/preopt.py and keeps its "pre-opt" name.
+ #
+ # BUG C (2026-05-25): geom-opt can hit a singular matrix
+ # in PySCF's ``cho_solve`` on tight rings (e.g. aromatic
# benzene with B3LYP/6-31G). That raises out of the
# optimizer and used to kill the whole calc. Wrap it: on
# any failure log to the user log, keep ``calc_mol`` as
@@ -3625,9 +3640,9 @@ def _run_required_final_single_point(target_mol, reason: str):
if self._freq_preopt_cb.value:
from quantui import optimize_geometry
- self.run_status.value = "Pre-optimizing geometry before frequency…"
+ self.run_status.value = "Optimizing geometry before frequency…"
log.write(
- "\n── Pre-optimisation (before frequency analysis) ──────────────────\n"
+ "\n── Geometry optimization (before frequency analysis) ──────────────────\n"
)
try:
_pre_opt = optimize_geometry(
@@ -3643,21 +3658,21 @@ def _run_required_final_single_point(target_mol, reason: str):
else "did NOT fully converge"
)
log.write(
- f"\nPre-optimisation {_conv_str} in {_pre_opt.n_steps} steps."
+ f"\nGeometry optimization {_conv_str} in {_pre_opt.n_steps} steps."
f" E = {_pre_opt.energies_hartree[-1]:.8f} Ha\n\n"
)
if not _pre_opt.converged:
log.write(
- "⚠ Pre-optimisation did not fully converge — "
+ "⚠ Geometry optimization did not fully converge — "
"proceeding with best available geometry.\n\n"
)
_run_required_final_single_point(
calc_mol,
- "after frequency pre-optimisation",
+ "after geometry optimization before frequency",
)
except Exception as _pre_exc:
log.write(
- f"\n⚠ Pre-optimisation failed: {_pre_exc}\n"
+ f"\n⚠ Geometry optimization failed: {_pre_exc}\n"
" Proceeding with the user-provided geometry "
"as-is; if the molecule was already near a "
"stationary point this is usually fine.\n\n"
@@ -3716,15 +3731,17 @@ def _run_required_final_single_point(target_mol, reason: str):
f"Atoms: {len(calc_mol.atoms)}\n\n"
)
- # ── Step 2: optional geometry pre-optimisation ────────────────
+ # ── Step 2: optional geometry optimization ────────────────────
+ # POLISH.9 (M-POLISH, 2026-05-25): renamed from
+ # "pre-optimisation" — DFT geom-opt is just geom-opt.
if self._freq_preopt_cb.value:
from quantui import optimize_geometry
self.run_status.value = (
- "Pre-optimizing geometry before UV-Vis (TD-DFT)…"
+ "Optimizing geometry before UV-Vis (TD-DFT)…"
)
log.write(
- "\n── Pre-optimisation (before UV-Vis (TD-DFT)) "
+ "\n── Geometry optimization (before UV-Vis (TD-DFT)) "
"─────────────\n"
)
# BUG C (2026-05-25): catch numerical failures and
@@ -3744,21 +3761,21 @@ def _run_required_final_single_point(target_mol, reason: str):
else "did NOT fully converge"
)
log.write(
- f"\nPre-optimisation {_conv_str} in {_pre_opt.n_steps} steps."
+ f"\nGeometry optimization {_conv_str} in {_pre_opt.n_steps} steps."
f" E = {_pre_opt.energies_hartree[-1]:.8f} Ha\n\n"
)
if not _pre_opt.converged:
log.write(
- "⚠ Pre-optimisation did not fully converge — "
+ "⚠ Geometry optimization did not fully converge — "
"proceeding with best available geometry.\n\n"
)
_run_required_final_single_point(
calc_mol,
- "after UV-Vis pre-optimisation",
+ "after geometry optimization before UV-Vis",
)
except Exception as _pre_exc:
log.write(
- f"\n⚠ Pre-optimisation failed: {_pre_exc}\n"
+ f"\n⚠ Geometry optimization failed: {_pre_exc}\n"
" Proceeding with the seed geometry as-is.\n\n"
)
diff --git a/quantui/app_analysis.py b/quantui/app_analysis.py
index 8833d02..65e453b 100644
--- a/quantui/app_analysis.py
+++ b/quantui/app_analysis.py
@@ -324,7 +324,15 @@ def pop_geo_trajectory(app: Any, ctx: Any) -> bool:
def pop_preopt_trajectory(app: Any, ctx: Any) -> bool:
- """Populate Trajectory panel for frequency pre-optimization contexts."""
+ """Populate Trajectory panel with the DFT geometry-optimization
+ trajectory that runs before a Frequency calculation.
+
+ POLISH.9 (2026-05-25): the wrapped operation is a full DFT geom-opt
+ at the user's method/basis, not the classical LJ pre-opt that lives
+ in ``quantui/preopt.py``. The function name + ``preopt_trajectory.json``
+ filename stay (renaming the saved file would break history replay of
+ older results) but user-facing strings now say "geometry optimization".
+ """
if ctx.source == "live":
pre = ctx.preopt_result
if pre is None:
@@ -341,7 +349,8 @@ def pop_preopt_trajectory(app: Any, ctx: Any) -> bool:
"Trajectory",
(
"Not available for this Frequency history result: "
- "preopt_trajectory.json is missing (pre-opt may have been disabled)."
+ "preopt_trajectory.json is missing (geometry "
+ "optimization may have been disabled)."
),
)
return False
@@ -363,7 +372,8 @@ def pop_preopt_trajectory(app: Any, ctx: Any) -> bool:
"Trajectory",
(
"Not available for this Frequency history result: "
- f"failed to load preopt trajectory ({type(exc).__name__})."
+ f"failed to load geometry-optimization trajectory "
+ f"({type(exc).__name__})."
),
)
return False
@@ -373,7 +383,7 @@ def pop_preopt_trajectory(app: Any, ctx: Any) -> bool:
"Trajectory",
(
"Not available for this Frequency result: "
- "pre-optimization trajectory has fewer than 2 frames."
+ "geometry-optimization trajectory has fewer than 2 frames."
),
)
return False
@@ -384,7 +394,7 @@ def pop_preopt_trajectory(app: Any, ctx: Any) -> bool:
)
app._pending_traj_result = stub
app._last_traj_result = stub
- app.traj_accordion.set_title(0, "Pre-optimization Trajectory")
+ app.traj_accordion.set_title(0, "Geometry Optimization Trajectory")
return True
diff --git a/quantui/app_builders.py b/quantui/app_builders.py
index f66ef38..815abf3 100644
--- a/quantui/app_builders.py
+++ b/quantui/app_builders.py
@@ -891,7 +891,7 @@ def build_welcome_header(app: Any) -> None:
f''
f"v{quantui.__version__} · "
f"Help tab for instructions · "
- f"Status tab for system info
"
+ f"System Settings tab for environment + calibration"
f""
f""
)
@@ -1855,11 +1855,17 @@ def build_help_section(app: Any, *, layout_fn: Any) -> None:
app.help_content_html = widgets.HTML()
app._render_help_topic()
+ # POLISH.2 (M-POLISH, 2026-05-25): the single-character "?" was
+ # visually noisy and hard to recognise as the global help toggle.
+ # Field-level "?" buttons (method_help_btn / basis_help_btn earlier
+ # in this file) keep the symbol — for inline-with-input help it's
+ # universally understood.
app._help_btn = widgets.Button(
- description="?",
+ description="Help",
button_style="",
+ icon="question-circle",
tooltip="Help topics",
- layout=layout_fn(width="34px", margin="0 0 0 8px"),
+ layout=layout_fn(width="80px", margin="0 0 0 8px"),
)
app._exit_btn = widgets.Button(
diff --git a/quantui/benchmarks.py b/quantui/benchmarks.py
index e84d3a9..c01ec96 100644
--- a/quantui/benchmarks.py
+++ b/quantui/benchmarks.py
@@ -980,7 +980,6 @@ def run_calibration(
"""
import multiprocessing as _mp
import queue as _queue
- import sys as _sys
from quantui import calc_log as _calc_log
@@ -1021,23 +1020,39 @@ def run_calibration(
# the per-step progress trail.
pass
- # ``fork`` is fast on Linux/macOS but unsupported on Windows; spawn
- # is the portable fallback. ``forkserver`` is also available but
- # slower than fork on Linux.
- _ctx_name = "spawn" if _sys.platform == "win32" else "fork"
- _ctx = _mp.get_context(_ctx_name)
-
- def _emit_progress(*args, live_message=None) -> None:
+ # Use ``spawn`` everywhere (session 55 follow-up): ``fork`` from a
+ # background thread (run_calibration runs inside ``_do_calibration``
+ # which is itself a daemon thread) collides hard with CUDA contexts
+ # that the parent process may have initialized via the GPU-detection
+ # probe — every step would die at ~0.04 s with no useful error.
+ # ``spawn`` adds ~1-2 s startup overhead per step but isolates the
+ # worker from the parent's interpreter state entirely, so CUDA / MPI /
+ # any C-extension global is freshly initialized. Sub-2-second-per-step
+ # overhead is a great trade for "the Stop button works AND nothing
+ # crashes for opaque reasons".
+ _ctx = _mp.get_context("spawn")
+
+ def _emit_progress(*args, live_message=None, step=None) -> None:
"""Wrap progress_cb to tolerate callers that pre-date the
- ``live_message`` kwarg (notably the test-suite lambdas that
- accept ``*args`` only). Falls back to the old 5-arg form on
- ``TypeError``."""
+ ``live_message`` / ``step`` kwargs (notably the test-suite
+ lambdas that accept ``*args`` only). Falls back through each
+ new kwarg in turn on ``TypeError``."""
if progress_cb is None:
return
+ # Try newest signature first, peel off kwargs the caller can't
+ # accept. Modern callers (do_calibration) take both; tests pass
+ # ``lambda *a: ...``.
+ try:
+ progress_cb(*args, live_message=live_message, step=step)
+ return
+ except TypeError:
+ pass
try:
progress_cb(*args, live_message=live_message)
+ return
except TypeError:
- progress_cb(*args)
+ pass
+ progress_cb(*args)
stopped_mid_step = False
for step_n, entry in enumerate(suite, start=1):
@@ -1072,7 +1087,7 @@ def _emit_progress(*args, live_message=None) -> None:
step.error_msg = "PySCF not available"
result.steps.append(step)
_save_calibration_json(result, log_path)
- _emit_progress(step_n, total, label, step.status, 0.0)
+ _emit_progress(step_n, total, label, step.status, 0.0, step=step)
continue
# Spawn the worker.
@@ -1133,9 +1148,34 @@ def _emit_progress(*args, live_message=None) -> None:
try:
msg = result_queue.get(timeout=2.0)
except _queue.Empty:
+ # Worker process exited (either crashed during import,
+ # raised before reaching the worker's try/except, or
+ # was killed by the OS) without putting anything on
+ # the queue. Capture the exit code + the tail of the
+ # calibration log so the user can see what actually
+ # happened — "worker exited without result" alone is
+ # useless for diagnosis (the original session-55
+ # symptom of every step failing at 0.04 s).
+ _exitcode = getattr(worker, "exitcode", None)
+ _tail = _tail_last_status_line(log_path) or "(no log output)"
+ _hint = ""
+ if _exitcode is not None and _exitcode != 0:
+ # On Unix, negative exit codes encode the signal that
+ # killed the process (-9 = SIGKILL, -11 = SIGSEGV).
+ if _exitcode < 0:
+ import signal as _sig
+
+ try:
+ _sig_name = _sig.Signals(-_exitcode).name
+ _hint = f" (killed by {_sig_name})"
+ except (ValueError, AttributeError):
+ _hint = f" (signal {-_exitcode})"
msg = {
"status": "error",
- "error_msg": "worker exited without returning a result",
+ "error_msg": (
+ f"worker exited (exitcode={_exitcode}){_hint}; "
+ f"last log line: {_tail}"
+ )[:500],
"elapsed_s": time.perf_counter() - t_start,
}
if msg.get("status") == "ok":
@@ -1167,7 +1207,9 @@ def _emit_progress(*args, live_message=None) -> None:
# still leaves a partial-state record on disk.
_save_calibration_json(result, log_path)
- _emit_progress(step_n, total, label, step.status, step.elapsed_s)
+ # Terminal call for this step — pass the full BenchmarkStep so
+ # the UI callback can append it to the incremental results table.
+ _emit_progress(step_n, total, label, step.status, step.elapsed_s, step=step)
if stopped_mid_step:
break
diff --git a/quantui/results_storage.py b/quantui/results_storage.py
index 3eeb4db..457513a 100644
--- a/quantui/results_storage.py
+++ b/quantui/results_storage.py
@@ -584,7 +584,10 @@ def save_trajectory(
List of total energies in Hartree, parallel to *trajectory*.
filename:
Output filename inside *result_dir*. Defaults to ``trajectory.json``.
- Pass ``preopt_trajectory.json`` for pre-optimisation steps.
+ Pass ``preopt_trajectory.json`` for the DFT-geometry-optimization
+ trajectory that runs before a Frequency / TD-DFT calc. (The
+ filename keeps the historical ``preopt_`` prefix for back-compat
+ with saved-result replay — renaming would break older results.)
"""
if not trajectory:
return
@@ -669,7 +672,12 @@ def save_thumbnail(result_dir: Path, data: dict) -> None:
fg, bg = _colors.get(ct, ("#555555", "#f3f4f6"))
ct_label = _ct_labels.get(ct, ct.replace("_", " ").title())
- fig = plt.figure(figsize=(2.4, 1.5), facecolor=bg)
+ # POLISH.7 (M-POLISH, 2026-05-25): bumped figsize 2.4→3.6 + dpi 72→144
+ # so the History-card text is readable on 1× displays. Source PNG goes
+ # from 173×108 px (~8 KB) to 518×324 px (~25 KB); the History dropdown
+ # downscales to its native ~250–300 px width, so the user sees crisp
+ # anti-aliased text rather than the blurry letters from the old config.
+ fig = plt.figure(figsize=(3.6, 2.25), facecolor=bg)
ax = fig.add_axes([0, 0, 1, 1])
ax.set_facecolor(bg)
ax.set_xlim(0, 1)
@@ -748,7 +756,7 @@ def save_thumbnail(result_dir: Path, data: dict) -> None:
try:
fig.savefig(
str(result_dir / "thumbnail.png"),
- dpi=72,
+ dpi=144,
bbox_inches="tight",
facecolor=bg,
pad_inches=0.05,
diff --git a/tests/test_bug_regressions_2026_05_25.py b/tests/test_bug_regressions_2026_05_25.py
index 368d1e5..b57dc47 100644
--- a/tests/test_bug_regressions_2026_05_25.py
+++ b/tests/test_bug_regressions_2026_05_25.py
@@ -166,10 +166,13 @@ def test_freq_preopt_block_has_try_except(self):
# Confirm the source contains the new fallback paths. Reading
# the source is the most direct way to assert this; running the
# actual freq calc would require PySCF.
+ #
+ # POLISH.9 (2026-05-25) renamed user-facing "Pre-optimisation"
+ # → "Geometry optimization"; update the guard string to match.
from quantui import app as _app_mod
src = inspect.getsource(_app_mod)
- assert "Pre-optimisation failed" in src
+ assert "Geometry optimization failed" in src
# The exception variable name (_pre_exc) is unique to the new
# try/except wrapping all three pre-opt sites.
assert src.count("except Exception as _pre_exc") >= 3
From 4111552580f055f4e4eea6ad3127df9709883d86 Mon Sep 17 00:00:00 2001
From: NCCU-Schultz-Lab
Date: Mon, 25 May 2026 14:02:18 -0400
Subject: [PATCH 25/33] Add animated logo + incremental calibration UI
Port inline SVG/CSS animations into the welcome header so the QuantUI orbital rings spin (with prefers-reduced-motion honored) and replace static rotate transforms with animated classes. Fix calibration runflow bugs and improve UX: use _MODE_TO_SUITE to select the correct benchmark suite, keep the activity badge active during calibration, and add incremental result rendering (new _cal_status_text and _cal_table_html helpers) so rows accumulate as steps finish. Show an in-flight "running" row, preserve a transparent live-message line to avoid accordion height flicker, re-render final table from canonical results, and include several related comment and UI tweaks.
---
quantui/app_builders.py | 30 +++++-
quantui/app_runflow.py | 204 ++++++++++++++++++++++++++++------------
2 files changed, 170 insertions(+), 64 deletions(-)
diff --git a/quantui/app_builders.py b/quantui/app_builders.py
index 815abf3..f2d8d29 100644
--- a/quantui/app_builders.py
+++ b/quantui/app_builders.py
@@ -836,11 +836,33 @@ def build_theme_selector(app: Any, *, layout_fn: Any) -> None:
def build_welcome_header(app: Any) -> None:
- """Build the static QuantUI welcome banner."""
+ """Build the QuantUI welcome banner.
+
+ POLISH.1 (M-POLISH, 2026-05-25): the inline SVG was already here but
+ static. Ported the CSS keyframe animations from ``docs/logo.svg`` so
+ the orbital rings spin at slightly different speeds + directions
+ (9 s / 13 s reverse / 17 s). ``prefers-reduced-motion`` is honoured.
+ Inline-SVG + inline-CSS works in ipywidgets.HTML because both pass
+ the Jupyter widget sanitizer (Voilà's HTML pipeline allows "
''
''
""
@@ -854,17 +876,17 @@ def build_welcome_header(app: Any) -> None:
""
''
- ''
+ ''
''
''
""
- ''
+ ''
''
''
""
- ''
+ ''
''
''
diff --git a/quantui/app_runflow.py b/quantui/app_runflow.py
index 86fce22..0a1e557 100644
--- a/quantui/app_runflow.py
+++ b/quantui/app_runflow.py
@@ -46,8 +46,11 @@ def on_calc_type_changed(app: Any, change: Any, *, layout_fn: Any) -> None:
"""Update extra options panel based on selected calculation type."""
ct = change["new"]
- # QM pre-optimization is meaningful for all workflows except Geometry Opt,
- # which is itself an optimization workflow.
+ # The "geometry optimization before this calc" checkbox is meaningful
+ # for all workflows except Geometry Opt itself (which IS the geom-opt
+ # workflow). POLISH.9: this was called "pre-optimisation" pre-2026-05-25;
+ # the underlying operation is a full DFT geom-opt — distinct from the
+ # LJ classical pre-opt in quantui/preopt.py.
if ct == "Geometry Opt":
app._freq_preopt_cb.value = False
app._freq_preopt_cb.layout.display = "none"
@@ -645,7 +648,14 @@ def on_cal_run(
"""Start async calibration run and initialize calibration UI state."""
_ = btn
mode = app._cal_mode_toggle.value
- suite = benchmark_suite if mode == "short" else benchmark_suite_long
+ # session 55 hotfix: the old ``"short" else "long"`` two-tier dispatch
+ # silently routed tier 3 / tier 4 (and tier 1!) to the tier-2 suite,
+ # which set ``progress_bar.max = 20`` while tier 1 only ran 8 steps
+ # — the bar froze at 40% on completion. Use the 4-tier lookup so
+ # ``max`` matches the actual step count.
+ from quantui.benchmarks import _MODE_TO_SUITE
+
+ suite = _MODE_TO_SUITE.get(mode, benchmark_suite)
app._cal_stop_event = threading.Event()
app._cal_run_btn.disabled = True
app._cal_mode_toggle.disabled = True
@@ -656,6 +666,9 @@ def on_cal_run(
app._cal_step_label.layout.display = ""
app._cal_step_label.value = (
'Starting…'
+ # Reserve a second invisible line so the live-message ticker
+ # doesn't jump the accordion height (session 55 user report).
+ '
.'
)
app._cal_results_html.value = ""
@@ -669,21 +682,95 @@ def on_cal_stop(app: Any, btn: Any) -> None:
app._cal_stop_event.set()
+def _cal_status_text(status: str) -> str:
+ """Render a benchmark-step status code as a glanceable HTML cell."""
+ return {
+ "ok": "✓",
+ "timed_out": "⏱ timed out",
+ "stopped": "⛔ stopped",
+ "error": "✗ error",
+ "running": "▶ running",
+ }.get(status, status)
+
+
+def _cal_table_html(steps_so_far, total: int, *, in_flight_step=None) -> str:
+ """Render the calibration results table.
+
+ Called incrementally — after every completed step — so the user sees
+ rows accumulate in real time instead of waiting for the whole tier
+ to finish (session 55 user request). ``steps_so_far`` is the list of
+ ``BenchmarkStep`` objects completed; ``in_flight_step`` (optional)
+ is a dict ``{label, n_electrons, n_basis, status, elapsed_s}`` that
+ appends a "running" row at the bottom while a step is mid-execution.
+ """
+ row_tpl = (
+ ""
+ '| {label} | '
+ '{ne} | '
+ '{nb} | '
+ '{t:.2f} s | '
+ '{status} | '
+ "
"
+ )
+ rows = "".join(
+ row_tpl.format(
+ label=s.label,
+ ne=s.n_electrons,
+ nb=s.n_basis if s.n_basis is not None else "—",
+ t=s.elapsed_s,
+ status=_cal_status_text(s.status),
+ )
+ for s in steps_so_far
+ )
+ if in_flight_step is not None:
+ rows += row_tpl.format(
+ label=in_flight_step["label"],
+ ne=in_flight_step.get("n_electrons", "—"),
+ nb=in_flight_step.get("n_basis", "—") or "—",
+ t=in_flight_step.get("elapsed_s", 0.0),
+ status=_cal_status_text("running"),
+ )
+
+ n_done = sum(1 for s in steps_so_far if s.status == "ok")
+ summary = f"Completed {n_done} / {total} steps."
+ return (
+ ''
+ f'
{summary}
'
+ '
'
+ ""
+ '| Calculation | '
+ 'e⁻ | '
+ 'Basis fns | '
+ 'Wall time | '
+ 'Status | '
+ "
"
+ f"{rows}
"
+ )
+
+
def do_calibration(app: Any, *, pyscf_available: bool) -> None:
"""Run calibration suite and render calibration summary table.
- Fixes shipped 2026-05-25 (session 55 user report — tier 4 stuck the
- user with no progress signal):
+ Fixes shipped 2026-05-25 (session 55 user reports):
- Wraps the whole run in ``_activity_begin/_end`` so the toolbar
activity badge stops reading "Idle" while calibration is busy.
- - Per-step ``progress_cb`` now writes a multi-line status block
- (live tail of the per-step PySCF / SCF log) so the user can see
- where a slow step is rather than guess whether it froze.
+ - Per-step ``progress_cb`` writes a multi-line status block (live
+ tail of the per-step PySCF / SCF log) so the user can see where
+ a slow step is rather than guess whether it froze.
+ - Table rows render incrementally (after each step completes)
+ instead of all at once at end-of-run.
+ - The live-message line is ALWAYS present (transparent placeholder
+ when there's no message yet) so the accordion height doesn't
+ flicker between one-line and two-line states.
"""
from quantui.benchmarks import run_calibration
mode = app._cal_mode_toggle.value
+ # Total-step count comes via the ``total`` arg of the ``_progress``
+ # callback; no need to compute it locally. (The earlier draft pulled
+ # it from ``_MODE_TO_SUITE`` but never used it — ruff F841.)
+
# Per-tier timeout budget. Tier 3 + tier 4 have freq/geo-opt anchors
# that run for minutes; tier 1 / tier 2 stay SP-only at 120 s/step.
_timeout_map = {
@@ -696,12 +783,17 @@ def do_calibration(app: Any, *, pyscf_available: bool) -> None:
}
timeout_per_step = _timeout_map.get(mode, 120.0)
- # M-EST follow-up (2026-05-25): keep the toolbar activity badge red
- # for the duration of the calibration so the user knows the kernel
- # is busy. Without this it reads "Idle" while the worker thread
- # burns CPU for tier 3/4 (~10-30 min).
+ # M-EST follow-up: keep the toolbar activity badge red for the
+ # duration of the calibration so the user knows the kernel is busy.
app._activity_begin(f"Calibrating ({mode})…", kind="compute")
+ # Per-step buffer of completed steps for incremental table rendering.
+ # Steps accumulate here as soon as each one finishes.
+ _completed_steps: list = []
+ # Buffer for the currently-running step so we can show a "running"
+ # row at the bottom of the table while it's in-flight.
+ _in_flight: dict = {}
+
def _progress(
step_n: int,
total: int,
@@ -710,16 +802,17 @@ def _progress(
elapsed: float,
*,
live_message: Optional[str] = None,
+ step: Any = None,
) -> None:
"""Per-step progress callback.
- Two call modes:
+ Two call modes:
+ - Live-tick: status is "running"; ``step`` is None. Updates
+ the step label and shows an "in flight" row at the bottom
+ of the table.
- Step-finish: status is one of ok/timed_out/stopped/error;
- ``live_message`` is None. Updates the progress bar.
- - Live-tick: status is "running"; ``live_message`` carries the
- latest ``[QuantUI_STATUS]`` marker from inside the step (set
- by freq_calc / optimizer during long inner loops). Updates
- the step label only.
+ ``step`` is the completed ``BenchmarkStep``. Appends to the
+ completed-steps buffer + re-renders the table.
"""
icon = {
"ok": "✓",
@@ -730,21 +823,33 @@ def _progress(
}.get(status, "?")
if status != "running":
app._cal_progress.value = step_n
- # Multi-line block: top line = step + status; second line = the
- # most recent live message (if any). Keeps the user oriented
- # during the slow tier-4 freq anchors.
- live_line = (
- f'
{live_message}'
- if live_message
- else ""
- )
+ if step is not None:
+ _completed_steps.append(step)
+ # ALWAYS render two lines so the accordion height doesn't
+ # flip-flop. Empty live-message becomes a transparent dot to
+ # preserve the line-height.
+ live_line_text = live_message if live_message else "."
+ live_line_color = "#64748b" if live_message else "transparent"
app._cal_step_label.value = (
f''
f"Step {step_n} / {total} — {label} "
f"[{icon} {elapsed:.1f} s]"
- f"{live_line}"
+ f'
'
+ f"{live_line_text}"
)
+ # Refresh in-flight buffer + the table snapshot.
+ if status == "running":
+ # Only the label + elapsed time are known mid-step; the table
+ # renders "—" in the electron-count / basis columns for this row.
+ _in_flight.update(label=label, elapsed_s=elapsed)
+ app._cal_results_html.value = _cal_table_html(
+ _completed_steps, total, in_flight_step=_in_flight or None
+ )
+ else:
+ _in_flight.clear()
+ app._cal_results_html.value = _cal_table_html(_completed_steps, total)
+
try:
result = run_calibration(
progress_cb=_progress,
@@ -752,46 +857,25 @@ def _progress(
timeout_per_step=timeout_per_step,
mode=mode,
)
+ # Belt-and-suspenders: re-render the table from the canonical
+ # ``result.steps`` in case any per-step callback was dropped
+ # (e.g. transient widget-update exception). The progress
+ # callback should have already kept _completed_steps in sync.
+ app._cal_results_html.value = _cal_table_html(
+ list(result.steps), result.n_total
+ )
finally:
app._activity_end(kind="compute")
- rows = "".join(
- f""
- f'| {s.label} | '
- f''
- f"{s.n_electrons} | "
- f''
- f"{s.n_basis if s.n_basis is not None else '—'} | "
- f''
- f"{s.elapsed_s:.2f} s | "
- f''
- f'{"✓" if s.status == "ok" else ("⏱ timed out" if s.status == "timed_out" else ("⛔ stopped" if s.status == "stopped" else "✗ error"))}'
- f" | "
- f"
"
- for s in result.steps
- )
- summary = f"Completed {result.n_completed} / {result.n_total} steps." + (
- " (stopped early)" if result.stopped_early else ""
- )
- app._cal_results_html.value = (
- f''
- f'
{summary}
'
- f'
'
- f""
- f'| Calculation | '
- f'e⁻ | '
- f'Basis fns | '
- f'Wall time | '
- f'Status | '
- f"
"
- f"{rows}
"
- )
-
app._cal_step_label.value = (
'Calibration complete. '
"Time estimates are now active."
+ '
.'
if result.n_completed > 0
- else 'No steps completed.'
+ else (
+ 'No steps completed.'
+ '
.'
+ )
)
app._cal_stop_btn.layout.display = "none"
app._cal_run_btn.disabled = not pyscf_available
From 0aea13cb35ca39d957444600b5af87e858391f23 Mon Sep 17 00:00:00 2001
From: NCCU-Schultz-Lab
Date: Mon, 25 May 2026 14:07:24 -0400
Subject: [PATCH 26/33] Add placeholder to results history dropdown
Update refresh_results_browser to prepend an explicit "(select a calculation to view)" placeholder to the History dropdown so ipywidgets doesn't auto-select the most-recent result on render. This clarifies that no calculation is loaded until the user clicks "View Results"/"View Analysis". Preserve existing behavior of keeping a previously-picked real result across refreshes, and fall back to the "(no saved results)" message when every load_result call fails (i.e. when the placeholder would be the only option).
---
quantui/app_runflow.py | 29 ++++++++++++++++++++++++++---
1 file changed, 26 insertions(+), 3 deletions(-)
diff --git a/quantui/app_runflow.py b/quantui/app_runflow.py
index 0a1e557..c56d61b 100644
--- a/quantui/app_runflow.py
+++ b/quantui/app_runflow.py
@@ -969,7 +969,24 @@ def update_estimate(app: Any, *, calc_log_mod: Any, change: Any = None) -> None:
def refresh_results_browser(app: Any) -> None:
- """Refresh the History dropdown with saved result directories."""
+ """Refresh the History dropdown with saved result directories.
+
+ POLISH.6 (M-POLISH, 2026-05-25): prepends a
+ ``"(select a calculation to view)"`` placeholder so the dropdown
+ opens in an explicit "no calc loaded yet" state. Without the
+ placeholder, ipywidgets auto-selected the most-recent entry as the
+ dropdown's ``value`` — visually implying the calc was loaded when
+ actually the user still has to click "View Results" / "View
+ Analysis" to populate the rest of the UI. The ``value`` observer
+ fires when options are reassigned (the result card *is* shown),
+ but no calc state is loaded into the app until the explicit
+ button-click, which mismatched user expectation.
+
+ The placeholder is always at index 0 of ``options`` so the
+ Dropdown widget's value-preservation behaviour kicks in: a
+ previously-picked real result survives a refresh, but the initial
+ render shows the placeholder.
+ """
try:
from quantui import list_results, load_result
except ImportError:
@@ -982,7 +999,8 @@ def refresh_results_browser(app: Any) -> None:
if not dirs:
app.past_dd.options = [("(no saved results)", "")]
return
- options = []
+ placeholder = ("(select a calculation to view)", "")
+ options = [placeholder]
for d in dirs:
try:
data = load_result(d)
@@ -995,7 +1013,12 @@ def refresh_results_browser(app: Any) -> None:
options.append((label, str(d)))
except Exception:
pass
- app.past_dd.options = options if options else [("(no saved results)", "")]
+ # If the only entry is the placeholder, fall back to the empty-list
+ # message — every load_result call failed and was silently swallowed above.
+ if len(options) == 1:
+ app.past_dd.options = [("(no saved results)", "")]
+ return
+ app.past_dd.options = options
if app.calc_type_dd.value == "Frequency":
app._refresh_freq_seed_options()
From 39023a26eb90043aaced9667f8563acc91ee860e Mon Sep 17 00:00:00 2001
From: NCCU-Schultz-Lab
Date: Mon, 25 May 2026 14:40:55 -0400
Subject: [PATCH 27/33] Save calibration steps, add skip & prediction logs
Save each calibration step as a regular result dir and improve calibration control/telemetry. Add a prediction log (log_prediction/get_prediction_history) and a dashboard "Prediction accuracy" section; the app captures the pre-run estimator output and persists predicted-vs-actual pairs. Introduce a Skip button and skip_event to abandon a single long-running calibration step (the default per-step timeout is removed; the timeout becomes optional). The calibration worker now tees PySCF output to an in-memory buffer and saves each step via _save_calibration_step (which uses save_result(extras={...}) to tag results with calibration_run_id); BenchmarkStep gains result_dir. Expand the GPU-unsupported methods list to avoid unstable GPU runs for MP2/CCSD/CCSD(T). Add the _TeeStream helper and tests covering save_result extras, _TeeStream, and _save_calibration_step plus related behavior. Misc: UI wiring for the Skip button, inline dashboard integration, and resilient best-effort logging throughout.
---
quantui/analytics.py | 186 ++++++++++++++-
quantui/app.py | 92 ++++++++
quantui/app_builders.py | 17 +-
quantui/app_runflow.py | 72 ++++--
quantui/benchmarks.py | 265 +++++++++++++++++++--
quantui/calc_log.py | 79 +++++++
quantui/gpu_offload.py | 19 +-
quantui/results_storage.py | 11 +
tests/test_calibration_save_results.py | 295 +++++++++++++++++++++++
tests/test_calibration_skip_and_gpu.py | 250 ++++++++++++++++++++
tests/test_est_prediction_log.py | 312 +++++++++++++++++++++++++
11 files changed, 1560 insertions(+), 38 deletions(-)
create mode 100644 tests/test_calibration_save_results.py
create mode 100644 tests/test_calibration_skip_and_gpu.py
create mode 100644 tests/test_est_prediction_log.py
diff --git a/quantui/analytics.py b/quantui/analytics.py
index e37ee25..99318eb 100644
--- a/quantui/analytics.py
+++ b/quantui/analytics.py
@@ -36,7 +36,7 @@
from pathlib import Path
from typing import Optional
-from quantui.calc_log import _log_dir, get_perf_history
+from quantui.calc_log import _log_dir, get_perf_history, get_prediction_history
# ---------------------------------------------------------------------------
# Internal helpers
@@ -352,6 +352,178 @@ def _timeline_html(records: list[dict], *, include_plotlyjs: bool) -> Optional[s
)
+# ---------------------------------------------------------------------------
+# Prediction-accuracy section (M-EST / EST.6, 2026-05-25)
+# ---------------------------------------------------------------------------
+
+
+def _prediction_accuracy_metrics(records: list[dict]) -> dict:
+ """Compute headline accuracy metrics from prediction-log records.
+
+ Records with ``predicted_s=None`` are "no-estimate" runs and counted
+ separately. For the median-error calculation we use absolute
+ percentage error (``|actual - predicted| / predicted * 100``), so
+ over- and under-predictions weigh the same; the dashboard shows
+ both the signed median (bias) and the absolute median (magnitude).
+ """
+ have_pred = [
+ r
+ for r in records
+ if r.get("predicted_s") is not None and r.get("error_pct") is not None
+ ]
+ no_pred = [r for r in records if r.get("predicted_s") is None]
+ abs_errs = [abs(float(r["error_pct"])) for r in have_pred]
+ signed_errs = [float(r["error_pct"]) for r in have_pred]
+ return {
+ "n_total": len(records),
+ "n_with_estimate": len(have_pred),
+ "n_no_estimate": len(no_pred),
+ "median_abs_error_pct": (statistics.median(abs_errs) if abs_errs else None),
+ "median_signed_error_pct": (
+ statistics.median(signed_errs) if signed_errs else None
+ ),
+ # "Within 25%" — a useful headline metric ("how often is the
+ # estimator usefully close?"). Roadmap target: ≥ 70% after a
+ # tier-4 calibration.
+ "pct_within_25": (
+ round(100.0 * sum(1 for e in abs_errs if e <= 25.0) / len(abs_errs), 1)
+ if abs_errs
+ else None
+ ),
+ }
+
+
+def _prediction_scatter_html(
+ records: list[dict], *, include_plotlyjs: bool
+) -> Optional[str]:
+ """Scatter of predicted_s vs actual_s with a y=x reference line."""
+ have_pred = [
+ r
+ for r in records
+ if r.get("predicted_s") is not None and r.get("actual_s") is not None
+ ]
+ if len(have_pred) < 2:
+ return None
+ try:
+ import plotly.graph_objects as go
+ import plotly.io as pio
+ except ImportError:
+ return None
+
+ # Hover labels show the calc spec so the user can identify outliers.
+ text_labels = [
+ f"{r.get('method', '?')}/{r.get('basis', '?')} on {r.get('formula', '?')}"
+ for r in have_pred
+ ]
+ predicted = [float(r["predicted_s"]) for r in have_pred]
+ actual = [float(r["actual_s"]) for r in have_pred]
+ max_val = max(max(predicted), max(actual), 1.0) * 1.1
+
+ fig = go.Figure()
+ # y=x reference line (perfect prediction).
+ fig.add_trace(
+ go.Scatter(
+ x=[0, max_val],
+ y=[0, max_val],
+ mode="lines",
+ name="perfect (y=x)",
+ line=dict(color="#94a3b8", dash="dash", width=1),
+ hoverinfo="skip",
+ )
+ )
+ fig.add_trace(
+ go.Scatter(
+ x=predicted,
+ y=actual,
+ mode="markers",
+ name="run",
+ text=text_labels,
+ marker=dict(size=9, color="#6366f1", opacity=0.75),
+ hovertemplate=(
+ "%{text}
predicted: %{x:.2f} s
actual: %{y:.2f} s"
+ ),
+ )
+ )
+ fig.update_layout(
+ height=420,
+ xaxis=dict(title="Predicted (s)", range=[0, max_val]),
+ yaxis=dict(title="Actual (s)", range=[0, max_val]),
+ margin=dict(l=60, r=20, t=10, b=50),
+ plot_bgcolor="#ffffff",
+ legend=dict(orientation="h", x=0, y=1.05),
+ )
+ return pio.to_html(
+ fig,
+ include_plotlyjs="inline" if include_plotlyjs else False,
+ full_html=False,
+ config={"displayModeBar": False},
+ )
+
+
+def _prediction_accuracy_section(
+ records: list[dict], scatter_html: Optional[str]
+) -> str:
+ """Render the "Prediction accuracy" section of the dashboard."""
+ if not records:
+ return (
+ "Prediction accuracy
"
+ 'No predictions logged yet — run a few '
+ "calculations and the estimator's track record will appear here.
"
+ ""
+ )
+
+ m = _prediction_accuracy_metrics(records)
+ median_abs = m["median_abs_error_pct"]
+ median_signed = m["median_signed_error_pct"]
+ within_25 = m["pct_within_25"]
+
+ # Banner when median absolute error exceeds 50%: estimator is in
+ # rough shape; re-running calibration usually helps.
+ banner = ""
+ if median_abs is not None and median_abs > 50.0:
+ banner = (
+ ''
+ f"⚠ Median absolute prediction error is {median_abs:.0f}%. "
+ "Re-running a deeper calibration tier (System Settings → Calibrate "
+ "time estimates) typically tightens this within ±25%."
+ "
"
+ )
+
+ cards = [
+ _card("Predictions logged", str(m["n_total"])),
+ _card(
+ "With estimate",
+ f"{m['n_with_estimate']} / {m['n_total']}",
+ ),
+ ]
+ if median_abs is not None:
+ cards.append(_card("Median |error|", f"{median_abs:.1f}%"))
+ if median_signed is not None:
+ sign = "+" if median_signed >= 0 else ""
+ cards.append(_card("Median bias", f"{sign}{median_signed:.1f}%"))
+ if within_25 is not None:
+ cards.append(_card("Within ±25%", f"{within_25:.0f}%"))
+ if m["n_no_estimate"]:
+ cards.append(_card("No estimate", str(m["n_no_estimate"])))
+
+ chart_block = (
+ scatter_html
+ if scatter_html
+ else (
+ 'Need at least 2 predictions with an estimate '
+ "before plotting accuracy.
"
+ )
+ )
+ return (
+ "Prediction accuracy
"
+ + banner
+ + f'{"".join(cards)}
'
+ + chart_block
+ + ""
+ )
+
+
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
@@ -383,6 +555,14 @@ def build_dashboard(out_path: Optional[Path] = None) -> Optional[Path]:
method_counts = _counts_by(records, "method")
calc_type_counts = _counts_by(records, "calc_type")
+ # M-EST / EST.6: prediction-accuracy data lives in its own log file.
+ # Best-effort read — older installs without the file produce an
+ # empty list and the section degrades to an empty-state message.
+ try:
+ prediction_records = get_prediction_history()
+ except Exception: # noqa: BLE001 — best-effort
+ prediction_records = []
+
# Inline plotly.js exactly once (in the first figure that renders).
# Subsequent figures pass include_plotlyjs=False so we don't ship
# the ~3 MB bundle three times.
@@ -393,6 +573,9 @@ def build_dashboard(out_path: Optional[Path] = None) -> Optional[Path]:
calc_type_counts, title="Calc-type distribution", include_plotlyjs=False
)
timeline = _timeline_html(records, include_plotlyjs=False)
+ prediction_scatter = _prediction_scatter_html(
+ prediction_records, include_plotlyjs=False
+ )
generated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
body = (
@@ -402,6 +585,7 @@ def build_dashboard(out_path: Optional[Path] = None) -> Optional[Path]:
f'Generated {generated} — {summary["total_runs"]} runs in perf log
'
+ _overview_section(summary)
+ _speedup_section(speedup_rows)
+ + _prediction_accuracy_section(prediction_records, prediction_scatter)
+ _figure_section(
"Method usage",
method_bar,
diff --git a/quantui/app.py b/quantui/app.py
index 5455a9c..cf5dd42 100644
--- a/quantui/app.py
+++ b/quantui/app.py
@@ -193,6 +193,9 @@
from quantui.app_runflow import (
on_cal_run as _run_on_cal_run,
)
+from quantui.app_runflow import (
+ on_cal_skip as _run_on_cal_skip,
+)
from quantui.app_runflow import (
on_cal_stop as _run_on_cal_stop,
)
@@ -1512,6 +1515,7 @@ def _wire_callbacks(self) -> None:
)
self._cal_run_btn.on_click(self._on_cal_run)
self._cal_stop_btn.on_click(self._on_cal_stop)
+ self._cal_skip_btn.on_click(self._on_cal_skip)
self.export_btn.on_click(self._on_export)
self.export_xyz_btn.on_click(self._on_export_xyz)
self.export_mol_btn.on_click(self._on_export_mol)
@@ -2910,6 +2914,9 @@ def _on_cal_run(self, btn) -> None:
def _on_cal_stop(self, btn) -> None:
_run_on_cal_stop(self, btn)
+ def _on_cal_skip(self, btn) -> None:
+ _run_on_cal_skip(self, btn)
+
def _do_calibration(self) -> None:
_run_do_calibration(self, pyscf_available=_PYSCF_AVAILABLE)
@@ -3418,6 +3425,70 @@ def _do_run(self) -> None:
_scf_converged_t: Optional[float] = None
_tail_marks: dict[str, float] = {}
+ # M-EST / EST.6 (2026-05-25): capture the estimator's pre-run
+ # prediction so we can write a (predicted, actual) record to
+ # ``prediction_log.jsonl`` after the calc completes. The
+ # estimator may return None (insufficient history); we record
+ # that as "no estimate" so the dashboard counts it separately
+ # from "estimate was wrong by N%".
+ _predicted_run_s: Optional[float] = None
+ _predicted_run_confidence: str = "unknown"
+ try:
+ _ct_for_est = {
+ "Single Point": "single_point",
+ "Geometry Opt": "geometry_opt",
+ "Frequency": "frequency",
+ "UV-Vis (TD-DFT)": "tddft",
+ "NMR Shielding": "nmr",
+ "PES Scan": "pes_scan",
+ }.get(self.calc_type_dd.value, "single_point")
+ _nb_for_est = _calc_log.count_basis_functions(
+ mol.atoms, self.basis_dd.value
+ )
+ # Match _update_estimate's GPU-prediction logic so the
+ # recorded predicted_s is what the user SAW in the UI
+ # before they hit Run.
+ _predicted_gpu_used: Optional[bool] = None
+ try:
+ from quantui.gpu_offload import (
+ _GPU_UNSUPPORTED_METHODS as _GPU_NO,
+ )
+ from quantui.gpu_offload import (
+ is_gpu_available,
+ )
+
+ _gpu_avail, _ = is_gpu_available()
+ if _gpu_avail and self.method_dd.value.upper() not in _GPU_NO:
+ _predicted_gpu_used = True
+ else:
+ _predicted_gpu_used = False
+ except Exception: # noqa: BLE001 — fall back to device-agnostic
+ _predicted_gpu_used = None
+
+ _est = _calc_log.estimate_time(
+ n_atoms=len(mol.atoms),
+ n_electrons=mol.get_electron_count(),
+ method=self.method_dd.value,
+ basis=self.basis_dd.value,
+ n_basis=_nb_for_est,
+ calc_type=_ct_for_est,
+ gpu_used=_predicted_gpu_used,
+ )
+ if _est is not None:
+ _predicted_run_s = float(_est["seconds"])
+ _predicted_run_confidence = str(_est.get("confidence", "unknown"))
+ except Exception as _est_exc:
+ # Estimator failure here is non-fatal — we just won't have a
+ # predicted_s to compare against. Log to event_log so the
+ # cause is at least surfaced for diagnosis.
+ try:
+ _calc_log.log_event(
+ "predict_capture_failed",
+ f"{type(_est_exc).__name__}: {_est_exc}"[:300],
+ )
+ except Exception: # noqa: BLE001 — telemetry self-guard
+ pass
+
def _mark(stage: str) -> None:
_tail_marks[stage] = time.perf_counter()
@@ -4115,6 +4186,27 @@ def _run_required_final_single_point(target_mol, reason: str):
gpu_used=bool(getattr(result, "gpu_used", False)),
gpu_name=getattr(result, "gpu_name", None),
)
+ # M-EST / EST.6: persist the (predicted, actual) pair to
+ # ``prediction_log.jsonl``. ``_predicted_run_s`` was
+ # captured at the top of _do_run via the same
+ # estimate_time(...) call that drives the UI estimate;
+ # ``_elapsed_for_est`` is the actual wall-time the calc
+ # took. The analytics dashboard reads both to surface
+ # accuracy metrics + the "consider re-calibrating"
+ # banner when the median error exceeds threshold.
+ try:
+ _calc_log.log_prediction(
+ predicted_s=_predicted_run_s,
+ actual_s=_elapsed_for_est,
+ method=result.method,
+ basis=result.basis,
+ calc_type=save_type,
+ formula=result.formula,
+ confidence=_predicted_run_confidence,
+ gpu_used=getattr(result, "gpu_used", None),
+ )
+ except Exception: # noqa: BLE001 — telemetry self-guard
+ pass
self._update_estimate()
except Exception:
pass
diff --git a/quantui/app_builders.py b/quantui/app_builders.py
index f2d8d29..84cd86f 100644
--- a/quantui/app_builders.py
+++ b/quantui/app_builders.py
@@ -264,8 +264,23 @@ def build_history_section(
description="Stop",
button_style="warning",
icon="stop",
+ tooltip="Abandon the rest of the calibration (current step is also killed).",
layout=layout_fn(width="90px", display="none"),
)
+ # session 55 user request: replaced the hard 1800 s per-step timeout
+ # with a Skip button so the user can abandon ONE step that's running
+ # too long without losing the whole run. Distinct from Stop (which
+ # abandons everything remaining).
+ app._cal_skip_btn = widgets.Button(
+ description="Skip step",
+ button_style="info",
+ icon="step-forward",
+ tooltip=(
+ "Abandon the current step and move on to the next. Other "
+ "completed steps stay; the calibration continues."
+ ),
+ layout=layout_fn(width="120px", display="none"),
+ )
app._cal_progress = widgets.IntProgress(
min=0,
max=len(benchmark_suite),
@@ -376,7 +391,7 @@ def build_history_section(
),
app._cal_mode_toggle,
widgets.HBox(
- [app._cal_run_btn, app._cal_stop_btn],
+ [app._cal_run_btn, app._cal_skip_btn, app._cal_stop_btn],
layout=layout_fn(gap="6px", align_items="center"),
),
app._cal_progress,
diff --git a/quantui/app_runflow.py b/quantui/app_runflow.py
index c56d61b..66458b8 100644
--- a/quantui/app_runflow.py
+++ b/quantui/app_runflow.py
@@ -657,9 +657,13 @@ def on_cal_run(
suite = _MODE_TO_SUITE.get(mode, benchmark_suite)
app._cal_stop_event = threading.Event()
+ # session 55 user request: skip-current-step event, separate from
+ # the whole-run stop event. Replaces the hard per-step timeout.
+ app._cal_skip_event = threading.Event()
app._cal_run_btn.disabled = True
app._cal_mode_toggle.disabled = True
app._cal_stop_btn.layout.display = ""
+ app._cal_skip_btn.layout.display = ""
app._cal_progress.max = len(suite)
app._cal_progress.value = 0
app._cal_progress.layout.display = ""
@@ -682,12 +686,27 @@ def on_cal_stop(app: Any, btn: Any) -> None:
app._cal_stop_event.set()
+def on_cal_skip(app: Any, btn: Any) -> None:
+ """Signal the active calibration to skip the CURRENT step + continue.
+
+ Replaces the per-step timeout (session 55 user request after a
+ near-finishing benzene B3LYP/6-31G* freq calc got cut off at the
+ 1800 s tier-4 cap). The worker is killed, the step is marked
+ ``skipped``, the event is cleared inside ``run_calibration``, and
+ the loop moves on to the next step.
+ """
+ _ = btn
+ if hasattr(app, "_cal_skip_event"):
+ app._cal_skip_event.set()
+
+
def _cal_status_text(status: str) -> str:
"""Render a benchmark-step status code as a glanceable HTML cell."""
return {
"ok": "✓",
"timed_out": "⏱ timed out",
"stopped": "⛔ stopped",
+ "skipped": "⏭ skipped",
"error": "✗ error",
"running": "▶ running",
}.get(status, status)
@@ -702,16 +721,38 @@ def _cal_table_html(steps_so_far, total: int, *, in_flight_step=None) -> str:
``BenchmarkStep`` objects completed; ``in_flight_step`` (optional)
is a dict ``{label, n_electrons, n_basis, status, elapsed_s}`` that
appends a "running" row at the bottom while a step is mid-execution.
+
+ For any finished non-ok step that carries an ``error_msg`` we render an inline
+ italic line below the status cell with a truncated ``error_msg``,
+ so the user can see WHY a step failed without having to open
+ ``calibration.json`` (session 55 user request after MP2/CCSD on
+ H₂O/cc-pVDZ silently 'errored' with no on-screen explanation).
"""
+ import html as _html_mod
+
row_tpl = (
""
'| {label} | '
'{ne} | '
'{nb} | '
'{t:.2f} s | '
- '{status} | '
+ '{status}{detail} | '
"
"
)
+
+ def _err_detail(s) -> str:
+ # Show err_msg inline only for non-ok terminal statuses.
+ msg = getattr(s, "error_msg", "") or ""
+ if not msg or s.status in ("ok", "running"):
+ return ""
+ # Truncate hard so a verbose PySCF traceback can't blow up the row.
+ if len(msg) > 140:
+ msg = msg[:137] + "…"
+ return (
+ '
'
+ f"{_html_mod.escape(msg)}"
+ )
+
rows = "".join(
row_tpl.format(
label=s.label,
@@ -719,6 +760,7 @@ def _cal_table_html(steps_so_far, total: int, *, in_flight_step=None) -> str:
nb=s.n_basis if s.n_basis is not None else "—",
t=s.elapsed_s,
status=_cal_status_text(s.status),
+ detail=_err_detail(s),
)
for s in steps_so_far
)
@@ -729,6 +771,7 @@ def _cal_table_html(steps_so_far, total: int, *, in_flight_step=None) -> str:
nb=in_flight_step.get("n_basis", "—") or "—",
t=in_flight_step.get("elapsed_s", 0.0),
status=_cal_status_text("running"),
+ detail="",
)
n_done = sum(1 for s in steps_so_far if s.status == "ok")
@@ -771,17 +814,13 @@ def do_calibration(app: Any, *, pyscf_available: bool) -> None:
# callback; no need to compute it locally. (The earlier draft pulled
# it from ``_MODE_TO_SUITE`` but never used it — ruff F841.)
- # Per-tier timeout budget. Tier 3 + tier 4 have freq/geo-opt anchors
- # that run for minutes; tier 1 / tier 2 stay SP-only at 120 s/step.
- _timeout_map = {
- "tier1": 120.0,
- "short": 120.0,
- "tier2": 300.0,
- "long": 300.0,
- "tier3": 900.0,
- "tier4": 1800.0,
- }
- timeout_per_step = _timeout_map.get(mode, 120.0)
+ # session 55 user request (after a near-finishing benzene
+ # B3LYP/6-31G* freq got cut off at the old 1800 s tier-4 cap):
+ # no automatic timeout — the user controls long-running steps via
+ # the Skip button. If they walk away from a runaway calc, the
+ # Stop button is still available. Headless callers that genuinely
+ # want a wall-clock cap can pass timeout_per_step explicitly.
+ timeout_per_step: Optional[float] = None
# M-EST follow-up: keep the toolbar activity badge red for the
# duration of the calibration so the user knows the kernel is busy.
@@ -856,6 +895,7 @@ def _progress(
stop_event=app._cal_stop_event,
timeout_per_step=timeout_per_step,
mode=mode,
+ skip_event=app._cal_skip_event,
)
# Belt-and-suspenders: re-render the table from the canonical
# ``result.steps`` in case any per-step callback was dropped
@@ -878,6 +918,7 @@ def _progress(
)
)
app._cal_stop_btn.layout.display = "none"
+ app._cal_skip_btn.layout.display = "none"
app._cal_run_btn.disabled = not pyscf_available
app._cal_mode_toggle.disabled = False
app._refresh_perf_stats()
@@ -1006,9 +1047,14 @@ def refresh_results_browser(app: Any) -> None:
data = load_result(d)
ts = data.get("timestamp", d.name)
calc_badge = _calc_type_badge(data.get("calc_type", ""))
+ # M-EST follow-up (2026-05-25): calibration-produced results
+ # get a 🔧 marker so the user can tell them apart from
+ # user-initiated calcs. The marker comes from result.json's
+ # ``calibration_run_id`` extras field written by the worker.
+ calib_marker = "🔧 " if data.get("calibration_run_id") else ""
label = (
f"{ts} · [{calc_badge}] "
- f"{data['formula']} {data['method']}/{data['basis']}"
+ f"{calib_marker}{data['formula']} {data['method']}/{data['basis']}"
)
options.append((label, str(d)))
except Exception:
diff --git a/quantui/benchmarks.py b/quantui/benchmarks.py
index c01ec96..fb09c3a 100644
--- a/quantui/benchmarks.py
+++ b/quantui/benchmarks.py
@@ -646,7 +646,8 @@ def _normalize_entry(entry: tuple) -> dict:
_STATUS_OK = "ok"
_STATUS_TIMEOUT = "timed_out"
-_STATUS_STOPPED = "stopped"
+_STATUS_STOPPED = "stopped" # whole-suite stop (e.g. Stop button)
+_STATUS_SKIPPED = "skipped" # single-step skip (e.g. Skip button)
_STATUS_ERROR = "error"
@@ -666,6 +667,12 @@ class BenchmarkStep:
# M-EST / EST.4: track which calc-type this step ran so tier 3+4
# entries can be distinguished in summaries.
calc_type: str = "single_point"
+ # M-EST follow-up (2026-05-25 user request): the calibration worker
+ # now saves each step as a real result directory (via save_result)
+ # so users can re-open them from the History tab like any other
+ # calc. ``None`` when save_result failed (best-effort) or the step
+ # itself errored before completion.
+ result_dir: Optional[str] = None
@dataclass
@@ -742,6 +749,165 @@ def _count_electrons(atoms: list[str], charge: int) -> int:
# rewritten after each completed step.
+class _TeeStream:
+ """Minimal text stream that fans writes to multiple destinations.
+
+ Used in the calibration worker so PySCF's ``progress_stream`` output
+ lands BOTH in the shared per-run calibration log (for the parent's
+ live tail) AND in an in-memory ``StringIO`` (so we can pass the
+ per-calc PySCF log text to ``save_result`` for the result dir's
+ ``pyscf.log`` file). Errors writing to any one stream are swallowed
+ — the goal is never to take down the calc because of a bad fanout.
+ """
+
+ def __init__(self, *streams) -> None:
+ self._streams = streams
+
+ def write(self, s) -> int:
+ for stream in self._streams:
+ try:
+ stream.write(s)
+ except Exception: # noqa: BLE001 — tee best-effort
+ pass
+ return len(s)
+
+ def flush(self) -> None:
+ for stream in self._streams:
+ try:
+ stream.flush()
+ except Exception: # noqa: BLE001 — tee best-effort
+ pass
+
+
+def _save_calibration_step(
+ res,
+ *,
+ calc_type: str,
+ pyscf_log: str,
+ calibration_run_id: str,
+ mol,
+):
+ """Save a completed calibration calc as a regular result directory.
+
+ Matches the save sequence from ``_do_run`` in ``app.py`` so the
+ History browser can load + replay calibration entries like any
+ user-initiated calc:
+
+ - ``save_result`` — base dir + result.json + pyscf.log. The
+ ``extras={"calibration_run_id": ...}`` tag lets the History
+ dropdown render a 🔧 marker beside calibration entries.
+ - ``save_thumbnail`` — the card shown in the History dropdown.
+ - For GeoOpt: ``save_trajectory`` (so the Trajectory panel works).
+ - For SP/GeoOpt/Freq with MO data: ``save_orbitals`` (so the
+ Energies + Isosurface panels work).
+ - For Freq: a ``spectra`` dict baked into result.json so the IR
+ + Vibrational panels work; ``displacements`` serialized to
+ nested lists.
+
+ Returns the result directory path, or ``None`` on save failure
+ (caller treats this as "calc succeeded but couldn't save — log it
+ but don't fail the step").
+ """
+ from quantui.results_storage import (
+ load_result,
+ save_orbitals,
+ save_result,
+ save_thumbnail,
+ save_trajectory,
+ )
+
+ # Build the spectra dict for Frequency calcs — must match what the
+ # Analysis tab's _pop_ir_spectrum / _pop_vibrational expect.
+ spectra: dict = {}
+ if calc_type == "frequency":
+ displacements_serialized = None
+ try:
+ import numpy as _np
+
+ if getattr(res, "displacements", None) is not None:
+ displacements_serialized = _np.asarray(res.displacements).tolist()
+ except Exception: # noqa: BLE001 — best-effort
+ pass
+ spectra = {
+ "ir": {
+ "frequencies_cm1": getattr(res, "frequencies_cm1", []),
+ "ir_intensities": getattr(res, "ir_intensities", []),
+ "zpve_hartree": getattr(res, "zpve_hartree", 0.0),
+ "displacements": displacements_serialized,
+ },
+ "molecule": {
+ "atoms": list(mol.atoms),
+ "coords": [list(map(float, row)) for row in mol.coordinates],
+ "charge": mol.charge,
+ "multiplicity": mol.multiplicity,
+ },
+ }
+
+ # For GeoOpt the ``res`` from optimize_geometry carries .method /
+ # .basis itself but exposes the formula only via res.molecule; save_result
+ # expects those attributes on the top-level result. Build a uniform shim.
+ if calc_type == "geometry_opt":
+ from types import SimpleNamespace
+
+ save_obj = SimpleNamespace(
+ formula=res.molecule.get_formula(),
+ method=res.method,
+ basis=res.basis,
+ energy_hartree=(
+ res.energies_hartree[-1] if res.energies_hartree else float("nan")
+ ),
+ converged=bool(res.converged),
+ n_iterations=int(getattr(res, "n_steps", -1)),
+ homo_lumo_gap_ev=None,
+ mo_energy_hartree=getattr(res, "mo_energy_hartree", None),
+ mo_occ=getattr(res, "mo_occ", None),
+ mo_coeff=getattr(res, "mo_coeff", None),
+ pyscf_mol_atom=getattr(res, "pyscf_mol_atom", None),
+ pyscf_mol_basis=getattr(res, "pyscf_mol_basis", None),
+ )
+ else:
+ save_obj = res
+
+ extras = {"calibration_run_id": calibration_run_id}
+ try:
+ saved_dir = save_result(
+ save_obj,
+ pyscf_log=pyscf_log,
+ calc_type=calc_type,
+ spectra=spectra or None,
+ extras=extras,
+ )
+ except Exception: # noqa: BLE001 — save is best-effort
+ return None
+
+ # Best-effort follow-on saves. None of these are required for the
+ # History card to render — they enrich the replay experience.
+ try:
+ saved_data = load_result(saved_dir)
+ save_thumbnail(saved_dir, saved_data)
+ except Exception: # noqa: BLE001 — thumbnail is purely cosmetic
+ pass
+
+ if calc_type == "geometry_opt":
+ try:
+ traj = getattr(res, "trajectory", None) or getattr(res, "molecule", None)
+ energies = list(getattr(res, "energies_hartree", []) or [])
+ if traj and not isinstance(traj, list):
+ traj = [traj]
+ if traj and len(traj) >= 1:
+ save_trajectory(saved_dir, traj, energies)
+ except Exception: # noqa: BLE001 — trajectory save is best-effort
+ pass
+
+ if calc_type in ("single_point", "geometry_opt", "frequency"):
+ try:
+ save_orbitals(saved_dir, save_obj)
+ except Exception: # noqa: BLE001 — orbital save is best-effort
+ pass
+
+ return saved_dir
+
+
def _calibration_worker(
atoms: list,
coords: list,
@@ -752,18 +918,24 @@ def _calibration_worker(
calc_type: str,
log_path_str: str,
result_queue,
+ calibration_run_id: str = "",
) -> None:
"""Run one calibration step in a child process.
Picklable (top-level function, primitive args + a Queue). Pipes
PySCF progress to ``log_path_str`` (append mode) so the parent can
- tail it. Puts a dict with status / formula / n_iterations /
- converged / elapsed_s on ``result_queue`` when done.
+ tail it AND to an in-memory buffer so the per-calc PySCF output
+ can be saved alongside the result.
+
+ On success: saves a real result directory via ``_save_calibration_step``
+ (tagged with ``calibration_run_id``) and puts a summary dict with
+ ``result_dir`` on ``result_queue``.
- On exception, puts ``{"status": "error", "error_msg": ...}``. The
- parent treats absence of a queue entry (after worker exit) as a
+ On exception: puts ``{"status": "error", "error_msg": ..., "result_dir": None}``.
+ The parent treats absence of a queue entry (after worker exit) as a
crashed worker — distinct from a step-level error.
"""
+ import io as _io
import time as _t
from datetime import datetime as _dt
from pathlib import Path as _P
@@ -775,10 +947,15 @@ def _calibration_worker(
try:
# Line-buffered append so the parent's tail sees output as it
# arrives. ``buffering=1`` requires text mode (which we use).
+ # The tee fans writes to both the shared log + an in-memory
+ # buffer so we can save the per-calc PySCF output to the
+ # result dir's pyscf.log.
with open(log_path, "a", encoding="utf-8", buffering=1) as log_fh:
log_fh.write(
f"\n========= {_dt.utcnow().isoformat()} :: {label} =========\n"
)
+ per_calc_buf = _io.StringIO()
+ stream = _TeeStream(log_fh, per_calc_buf)
from quantui.molecule import Molecule as _Molecule
@@ -791,7 +968,7 @@ def _calibration_worker(
molecule=mol,
method=method,
basis=basis,
- progress_stream=log_fh,
+ progress_stream=stream,
)
formula = res.molecule.get_formula()
converged = bool(res.converged)
@@ -803,7 +980,7 @@ def _calibration_worker(
molecule=mol,
method=method,
basis=basis,
- progress_stream=log_fh,
+ progress_stream=stream,
)
formula = res.formula
converged = bool(res.converged)
@@ -819,7 +996,7 @@ def _calibration_worker(
method=method,
basis=basis,
verbose=3,
- progress_stream=log_fh,
+ progress_stream=stream,
)
formula = res.formula
converged = bool(res.converged)
@@ -828,6 +1005,17 @@ def _calibration_worker(
elapsed = _t.perf_counter() - t0
log_fh.write(f"\n[QuantUI_STATUS] COMPLETED in {elapsed:.2f} s\n")
+ # Save as a regular result directory (M-EST follow-up,
+ # 2026-05-25 user request — tier 4's MP2 + CCSD + benzene
+ # freq are scientifically valuable; don't discard them).
+ saved_dir = _save_calibration_step(
+ res,
+ calc_type=calc_type,
+ pyscf_log=per_calc_buf.getvalue(),
+ calibration_run_id=calibration_run_id,
+ mol=mol,
+ )
+
result_queue.put(
{
"status": "ok",
@@ -835,6 +1023,7 @@ def _calibration_worker(
"converged": converged,
"n_iterations": n_iterations,
"elapsed_s": elapsed,
+ "result_dir": str(saved_dir) if saved_dir else None,
}
)
except Exception as exc:
@@ -843,6 +1032,7 @@ def _calibration_worker(
"status": "error",
"error_msg": str(exc)[:500],
"elapsed_s": _t.perf_counter() - t0,
+ "result_dir": None,
}
)
@@ -928,6 +1118,7 @@ def _save_calibration_json(result: CalibrationResult, log_path: Path) -> None:
"elapsed_s": round(s.elapsed_s, 3),
"error_msg": s.error_msg,
"calc_type": s.calc_type,
+ "result_dir": s.result_dir,
}
for s in result.steps
],
@@ -946,8 +1137,9 @@ def _save_calibration_json(result: CalibrationResult, log_path: Path) -> None:
def run_calibration(
progress_cb: Optional[ProgressCallback] = None,
stop_event=None,
- timeout_per_step: float = 120.0,
+ timeout_per_step: Optional[float] = None,
mode: str = "tier1",
+ skip_event=None,
) -> CalibrationResult:
"""Run the benchmark suite and populate ``perf_log.jsonl``.
@@ -963,17 +1155,28 @@ def run_calibration(
``(step_n, total, label, status, elapsed_s)`` and optionally
``live_message=`` during slow steps. The
terminal call after each step uses status in
- ``ok / timed_out / stopped / error``; intermediate "running"
- ticks fire while the step is in-flight.
+ ``ok / timed_out / stopped / skipped / error``; intermediate
+ "running" ticks fire while the step is in-flight.
stop_event: A :class:`threading.Event`; checked every 500 ms.
- When set, the in-flight worker is terminated immediately
- and the current step is marked ``"stopped"``.
- timeout_per_step: Wall-clock seconds allowed per step. Defaults
- to 120 s — fine for tier 1 / tier 2 (SP only). Caller
- should bump for tier 3 (~900 s) and tier 4 (~1800 s).
+ When set, the in-flight worker is terminated immediately,
+ the current step is marked ``"stopped"``, and remaining
+ steps are abandoned (no further work).
+ timeout_per_step: Wall-clock seconds allowed per step.
+ ``None`` (default) means no timeout — the user controls
+ stoppage via the Stop / Skip buttons. The session-55 tier-4
+ run had a benzene B3LYP/6-31G* freq calc finish at
+ ~1500 s but be cut off at the old 1800 s hard cap, losing
+ the data; the no-timeout default removes that footgun.
+ Pass a numeric value only when running headlessly (e.g. CI)
+ where you genuinely want a wall-clock cap.
mode: One of ``"tier1"`` / ``"tier2"`` / ``"tier3"`` / ``"tier4"``.
Legacy aliases ``"short"`` / ``"long"`` map to tier1 / tier2.
Unknown modes fall back to tier1 with a warning.
+ skip_event: A :class:`threading.Event`; checked every 500 ms.
+ When set, the in-flight worker is terminated, the current
+ step is marked ``"skipped"``, the event is cleared, and
+ the loop continues to the NEXT step. Distinct from
+ ``stop_event``: skip is one step, stop is the whole run.
Returns:
:class:`CalibrationResult` with per-step outcomes.
@@ -1005,6 +1208,11 @@ def run_calibration(
# Per-run calibration log file. The worker appends; the parent tails.
log_path = _calibration_log_path(timestamp)
+ timeout_str = (
+ f"{timeout_per_step:.0f} s"
+ if timeout_per_step is not None
+ else "none (user-controlled)"
+ )
try:
log_path.parent.mkdir(parents=True, exist_ok=True)
with open(log_path, "w", encoding="utf-8") as fh:
@@ -1013,7 +1221,7 @@ def run_calibration(
f"started : {timestamp}\n"
f"mode : {mode}\n"
f"suite size: {total} entries\n"
- f"timeout/step: {timeout_per_step:.0f} s\n"
+ f"timeout/step: {timeout_str}\n"
)
except OSError:
# No log file is non-fatal — calibration still runs, just without
@@ -1104,13 +1312,14 @@ def _emit_progress(*args, live_message=None, step=None) -> None:
calc_type,
str(log_path),
result_queue,
+ timestamp, # calibration_run_id — the parent's run timestamp
),
daemon=True,
)
t_start = time.perf_counter()
worker.start()
- # Poll loop — finish naturally OR hit timeout OR receive stop signal.
+ # Poll loop — finish naturally OR hit timeout OR stop OR skip.
poll_interval = 0.5
worker_done_normally = False
while True:
@@ -1121,7 +1330,10 @@ def _emit_progress(*args, live_message=None, step=None) -> None:
worker_done_normally = True
break
- if elapsed > timeout_per_step:
+ # Timeout is now opt-in (was a hard 1800 s for tier 4 which
+ # cut off a near-finishing benzene freq in session 55).
+ # ``None`` means "user controls; never auto-kill".
+ if timeout_per_step is not None and elapsed > timeout_per_step:
worker.terminate()
worker.join(timeout=5)
step.status = _STATUS_TIMEOUT
@@ -1138,6 +1350,20 @@ def _emit_progress(*args, live_message=None, step=None) -> None:
stopped_mid_step = True
break
+ # Skip = "abandon THIS step, continue to the next." Distinct
+ # from Stop. Clear the event after consuming so the next
+ # step starts fresh — the UI re-sets it if the user clicks
+ # Skip again. (session 55 user request — replaces the
+ # hard timeout that was cutting off near-finishing calcs.)
+ if skip_event is not None and skip_event.is_set():
+ worker.terminate()
+ worker.join(timeout=5)
+ step.status = _STATUS_SKIPPED
+ step.elapsed_s = elapsed
+ step.error_msg = f"skipped by user at {elapsed:.0f}s"
+ skip_event.clear()
+ break
+
# Live-tick: pull the latest log line for the UI.
live_msg = _tail_last_status_line(log_path)
_emit_progress(
@@ -1181,6 +1407,7 @@ def _emit_progress(*args, live_message=None, step=None) -> None:
if msg.get("status") == "ok":
step.status = _STATUS_OK
step.elapsed_s = float(msg["elapsed_s"])
+ step.result_dir = msg.get("result_dir")
# Log to perf_log.jsonl so estimate_time() picks it up.
_calc_log.log_calculation(
formula=msg["formula"],
diff --git a/quantui/calc_log.py b/quantui/calc_log.py
index 130ef57..53962e8 100644
--- a/quantui/calc_log.py
+++ b/quantui/calc_log.py
@@ -269,6 +269,19 @@ def _event_path() -> Path:
return _log_dir() / "event_log.jsonl"
+def _prediction_log_path() -> Path:
+ """Path to ``prediction_log.jsonl`` — the M-EST / EST.6 file
+ capturing one record per ``_do_run`` invocation with the
+ estimator's pre-run prediction and the actual wall-clock outcome.
+
+ Kept indefinitely (like ``perf_log.jsonl``) so the analytics
+ dashboard can plot prediction accuracy over time without manual
+ pruning. Lives in the same dir as the other logs; honours
+ ``QUANTUI_LOG_DIR`` for tests.
+ """
+ return _log_dir() / "prediction_log.jsonl"
+
+
def _append(path: Path, record: dict) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
line = json.dumps(record, ensure_ascii=False) + "\n"
@@ -704,6 +717,72 @@ def get_perf_history() -> list[dict]:
return _read_all(_perf_path())
+# ---------------------------------------------------------------------------
+# Prediction log (M-EST / EST.6, 2026-05-25)
+# ---------------------------------------------------------------------------
+#
+# Captures one record per ``_do_run`` invocation with the estimator's
+# pre-run prediction + the actual wall-clock outcome. Lets the analytics
+# dashboard show prediction accuracy over time, broken down by calc-type
+# and device, so the user can tell at a glance whether the estimator is
+# working or whether it's time to re-calibrate.
+
+
+def log_prediction(
+ predicted_s: Optional[float],
+ actual_s: float,
+ *,
+ method: str,
+ basis: str,
+ calc_type: str,
+ formula: str = "",
+ confidence: str = "unknown",
+ gpu_used: Optional[bool] = None,
+) -> None:
+ """Append one prediction record to ``prediction_log.jsonl``.
+
+ ``predicted_s`` is ``None`` when the estimator returned no estimate
+ (insufficient history at run-time). Both columns are still logged
+ so the dashboard can count "no-estimate" runs separately from
+ "estimate-was-way-off" runs — both are meaningful failure modes
+ for the predictor.
+
+ ``actual_s`` should match the value passed to ``log_calculation``
+ for the same run; the dashboard cross-references them via the
+ ``timestamp`` key. The two writes are not transactional — if one
+ side fails we'd rather have the perf-log record than no record
+ at all, so ``log_prediction`` is best-effort and the caller does
+ not depend on its return.
+ """
+ record: dict = {
+ "timestamp": datetime.now(timezone.utc).isoformat(),
+ "predicted_s": (
+ round(float(predicted_s), 3) if predicted_s is not None else None
+ ),
+ "actual_s": round(float(actual_s), 3),
+ "method": method,
+ "basis": basis,
+ "calc_type": calc_type,
+ "formula": formula,
+ "confidence": confidence,
+ }
+ if gpu_used is not None:
+ record["gpu_used"] = bool(gpu_used)
+ # Derived: signed error percentage. ``None`` when we had no estimate.
+ if predicted_s is not None and predicted_s > 0:
+ record["error_pct"] = round(
+ 100.0 * (float(actual_s) - float(predicted_s)) / float(predicted_s), 1
+ )
+ else:
+ record["error_pct"] = None
+ _append(_prediction_log_path(), record)
+
+
+def get_prediction_history() -> list[dict]:
+ """Return all records from ``prediction_log.jsonl`` as a list of dicts."""
+ return _read_all(_prediction_log_path())
+
+
def reset_perf_log() -> None:
"""Delete all records from ``perf_log.jsonl``.
diff --git a/quantui/gpu_offload.py b/quantui/gpu_offload.py
index 79b1f2e..3f7916d 100644
--- a/quantui/gpu_offload.py
+++ b/quantui/gpu_offload.py
@@ -35,10 +35,21 @@
logger = logging.getLogger(__name__)
-# Methods for which gpu4pyscf has zero or known-broken support. ``CCSD(T)``
-# is documented as unsupported in the gpu4pyscf README; double hybrids are
-# also listed but QuantUI doesn't expose any double-hybrid methods today.
-_GPU_UNSUPPORTED_METHODS: frozenset = frozenset({"CCSD(T)"})
+# Methods for which gpu4pyscf has zero or known-broken support.
+#
+# - ``CCSD(T)`` is documented as unsupported in the gpu4pyscf README.
+# - ``MP2`` and ``CCSD`` are labelled "experimental" by gpu4pyscf and
+# were observed (session 55, 2026-05-25 user tier-4 run) to fail
+# immediately after a successful RHF reference on GPU — the failure
+# fingerprint was "step completed in RHF wall time + small delta,
+# then errored", which fits the post-HF code choking on a
+# GPU-migrated mf object. Until the upstream support matures, route
+# these through CPU so calibration data accrues reliably. The RHF
+# reference still benefits from GPU because ``try_to_gpu`` only
+# short-circuits BEFORE the migration.
+# - Double-hybrids would belong here too, but QuantUI doesn't expose
+# any double-hybrid methods today.
+_GPU_UNSUPPORTED_METHODS: frozenset = frozenset({"MP2", "CCSD", "CCSD(T)"})
@lru_cache(maxsize=1)
diff --git a/quantui/results_storage.py b/quantui/results_storage.py
index 457513a..55cbcbb 100644
--- a/quantui/results_storage.py
+++ b/quantui/results_storage.py
@@ -52,6 +52,7 @@ def save_result(
results_dir: Optional[Path] = None,
calc_type: str = "single_point",
spectra: Optional[dict] = None,
+ extras: Optional[dict] = None,
) -> Path:
"""Write *result* to a new timestamped subdirectory of *results_dir*.
@@ -77,6 +78,14 @@ def save_result(
spectra:
Dict of spectra data (IR frequencies, UV-Vis excitations, …)
stored under the ``"spectra"`` key in ``result.json``.
+ extras:
+ Optional dict of additional fields to merge into ``result.json``.
+ Used by the calibration runner to tag results with a
+ ``calibration_run_id`` marker so the History browser can show
+ a small badge distinguishing them from user-initiated calcs.
+ Keys that clash with built-in result.json fields (``timestamp``,
+ ``formula``, etc.) overwrite them — by design, since the
+ caller is asserting they want to override.
Returns
-------
@@ -123,6 +132,8 @@ def save_result(
"n_iterations": getattr(result, "n_iterations", -1),
"spectra": spectra if spectra is not None else {},
}
+ if extras:
+ data.update(extras)
(dest / "result.json").write_text(json.dumps(data, indent=2))
if pyscf_log:
diff --git a/tests/test_calibration_save_results.py b/tests/test_calibration_save_results.py
new file mode 100644
index 0000000..753597a
--- /dev/null
+++ b/tests/test_calibration_save_results.py
@@ -0,0 +1,295 @@
+"""Tests for the M-EST follow-up: calibration results saved as job files.
+
+Session 55 (2026-05-25) user request:
+
+ > Are the calculations run as part of the calibration time estimates
+ > saved to job files so users can load the results as usual?
+
+Before this change, calibration steps only wrote to ``perf_log.jsonl``
+(for the estimator) and ``calibration.json`` (for the UI summary). The
+full result objects were discarded. Tier-4 in particular runs MP2 +
+CCSD on H₂O/cc-pVDZ plus benzene B3LYP/6-31G* frequency — those are
+real research-quality calcs and the user wanted them saved.
+
+This file tests the new save path WITHOUT running PySCF, by:
+
+1. Unit-testing ``save_result(..., extras={...})`` — the new kwarg that
+ embeds ``calibration_run_id`` (and any other extras) in result.json.
+2. Unit-testing the ``_TeeStream`` helper used to fan PySCF's
+ progress_stream to both the shared calibration log and an in-memory
+ buffer (so save_result has the per-calc PySCF log).
+3. Unit-testing ``_save_calibration_step`` against a fake result
+ object — confirms it writes a result_dir with the calibration tag.
+4. Structure-grep tests that the worker passes ``calibration_run_id``
+ to the helper and returns ``result_dir`` on the queue, and that
+ ``BenchmarkStep`` has the new ``result_dir`` field.
+
+All tests platform-independent. No PySCF required.
+"""
+
+from __future__ import annotations
+
+import inspect
+import io
+import json
+from types import SimpleNamespace
+
+# =====================================================================
+# save_result(..., extras=...) — new kwarg
+# =====================================================================
+
+
+class TestSaveResultExtras:
+ def test_extras_merged_into_result_json(self, tmp_path):
+ from quantui.results_storage import save_result
+
+ fake_result = SimpleNamespace(
+ formula="H2O",
+ method="RHF",
+ basis="STO-3G",
+ energy_hartree=-75.0,
+ energy_ev=-75.0 * 27.211386245988,
+ homo_lumo_gap_ev=10.0,
+ converged=True,
+ n_iterations=5,
+ )
+
+ out = save_result(
+ fake_result,
+ pyscf_log="line 1\nline 2\n",
+ results_dir=tmp_path,
+ calc_type="single_point",
+ extras={"calibration_run_id": "2026-05-25T12:00:00+00:00"},
+ )
+ data = json.loads((out / "result.json").read_text())
+ assert data["calibration_run_id"] == "2026-05-25T12:00:00+00:00"
+ # Existing fields still present.
+ assert data["formula"] == "H2O"
+ assert data["calc_type"] == "single_point"
+
+ def test_extras_can_overwrite_builtin_field(self, tmp_path):
+ # Documented behaviour: extras takes precedence. This is by
+ # design — calibration uses it deliberately and a future caller
+ # may want the same affordance.
+ from quantui.results_storage import save_result
+
+ fake_result = SimpleNamespace(
+ formula="H2O",
+ method="RHF",
+ basis="STO-3G",
+ energy_hartree=-75.0,
+ converged=True,
+ n_iterations=1,
+ )
+ out = save_result(
+ fake_result,
+ results_dir=tmp_path,
+ extras={"formula": "OVERRIDDEN"},
+ )
+ data = json.loads((out / "result.json").read_text())
+ assert data["formula"] == "OVERRIDDEN"
+
+ def test_extras_none_is_no_op(self, tmp_path):
+ # Existing callers that don't pass extras must keep working.
+ from quantui.results_storage import save_result
+
+ fake_result = SimpleNamespace(
+ formula="H2O",
+ method="RHF",
+ basis="STO-3G",
+ energy_hartree=-75.0,
+ converged=True,
+ n_iterations=1,
+ )
+ out = save_result(fake_result, results_dir=tmp_path)
+ data = json.loads((out / "result.json").read_text())
+ # No calibration_run_id when extras wasn't passed.
+ assert "calibration_run_id" not in data
+
+
+# =====================================================================
+# _TeeStream — fan progress to two destinations
+# =====================================================================
+
+
+class TestTeeStream:
+ def test_writes_to_all_streams(self):
+ from quantui.benchmarks import _TeeStream
+
+ a = io.StringIO()
+ b = io.StringIO()
+ tee = _TeeStream(a, b)
+ tee.write("hello\n")
+ tee.write("world\n")
+ assert a.getvalue() == "hello\nworld\n"
+ assert b.getvalue() == "hello\nworld\n"
+
+ def test_returns_len_of_written(self):
+ from quantui.benchmarks import _TeeStream
+
+ tee = _TeeStream(io.StringIO())
+ assert tee.write("abcde") == 5
+
+ def test_one_broken_stream_doesnt_kill_others(self):
+ from quantui.benchmarks import _TeeStream
+
+ class _Broken:
+ def write(self, _s):
+ raise RuntimeError("simulated")
+
+ def flush(self):
+ raise RuntimeError("simulated")
+
+ good = io.StringIO()
+ tee = _TeeStream(_Broken(), good)
+ tee.write("payload")
+ tee.flush()
+ # The good stream still got the data.
+ assert good.getvalue() == "payload"
+
+
+# =====================================================================
+# _save_calibration_step — the worker's save helper
+# =====================================================================
+
+
+class TestSaveCalibrationStep:
+ def test_single_point_creates_result_dir_with_tag(self, tmp_path, monkeypatch):
+ # Redirect the default results dir to tmp_path.
+ from pathlib import Path as _Path
+
+ monkeypatch.setattr(_Path, "home", lambda: tmp_path)
+
+ from quantui.benchmarks import _save_calibration_step
+
+ fake_result = SimpleNamespace(
+ formula="H2O",
+ method="B3LYP",
+ basis="STO-3G",
+ energy_hartree=-75.0,
+ energy_ev=-75.0 * 27.211386245988,
+ homo_lumo_gap_ev=10.0,
+ converged=True,
+ n_iterations=12,
+ )
+ fake_mol = SimpleNamespace(
+ atoms=["O", "H", "H"],
+ coordinates=[[0, 0, 0], [0.7, 0.6, 0], [-0.7, 0.6, 0]],
+ charge=0,
+ multiplicity=1,
+ )
+
+ saved = _save_calibration_step(
+ fake_result,
+ calc_type="single_point",
+ pyscf_log="some log",
+ calibration_run_id="2026-05-25T12:00:00+00:00",
+ mol=fake_mol,
+ )
+ assert saved is not None
+ assert saved.exists()
+ data = json.loads((saved / "result.json").read_text())
+ assert data["calibration_run_id"] == "2026-05-25T12:00:00+00:00"
+ assert data["calc_type"] == "single_point"
+ assert data["formula"] == "H2O"
+ # pyscf.log should be present from the worker's per-calc tee buffer.
+ assert (saved / "pyscf.log").exists()
+ assert "some log" in (saved / "pyscf.log").read_text()
+
+ def test_frequency_includes_spectra(self, tmp_path, monkeypatch):
+ from pathlib import Path as _Path
+
+ monkeypatch.setattr(_Path, "home", lambda: tmp_path)
+
+ from quantui.benchmarks import _save_calibration_step
+
+ fake_freq = SimpleNamespace(
+ formula="H2O",
+ method="B3LYP",
+ basis="STO-3G",
+ energy_hartree=-75.0,
+ energy_ev=-75.0 * 27.211386245988,
+ homo_lumo_gap_ev=10.0,
+ converged=True,
+ n_iterations=12,
+ frequencies_cm1=[1600.0, 3700.0, 3800.0],
+ ir_intensities=[80.0, 5.0, 50.0],
+ zpve_hartree=0.02,
+ displacements=None,
+ )
+ fake_mol = SimpleNamespace(
+ atoms=["O", "H", "H"],
+ coordinates=[[0, 0, 0], [0.7, 0.6, 0], [-0.7, 0.6, 0]],
+ charge=0,
+ multiplicity=1,
+ )
+
+ saved = _save_calibration_step(
+ fake_freq,
+ calc_type="frequency",
+ pyscf_log="",
+ calibration_run_id="tier4-run-1",
+ mol=fake_mol,
+ )
+ assert saved is not None
+ data = json.loads((saved / "result.json").read_text())
+ # The Analysis tab's IR + Vibrational panels read these keys.
+ assert "spectra" in data
+ assert "ir" in data["spectra"]
+ assert data["spectra"]["ir"]["frequencies_cm1"] == [1600.0, 3700.0, 3800.0]
+ assert "molecule" in data["spectra"]
+ assert data["spectra"]["molecule"]["atoms"] == ["O", "H", "H"]
+
+
+# =====================================================================
+# Worker + BenchmarkStep structural checks
+# =====================================================================
+
+
+class TestWorkerStructure:
+ def test_benchmark_step_has_result_dir_field(self):
+ from quantui.benchmarks import BenchmarkStep
+
+ s = BenchmarkStep(
+ label="x",
+ method="RHF",
+ basis="STO-3G",
+ n_atoms=2,
+ n_electrons=2,
+ status="ok",
+ )
+ # New field — default None.
+ assert s.result_dir is None
+
+ def test_calibration_worker_signature_accepts_run_id(self):
+ from quantui.benchmarks import _calibration_worker
+
+ sig = inspect.signature(_calibration_worker)
+ assert "calibration_run_id" in sig.parameters
+
+ def test_worker_source_calls_save_calibration_step(self):
+ from quantui import benchmarks
+
+ src = inspect.getsource(benchmarks._calibration_worker)
+ assert "_save_calibration_step" in src
+ # And the queue payload now carries result_dir.
+ assert "result_dir" in src
+
+ def test_save_calibration_json_includes_result_dir(self):
+ # The persisted calibration.json should expose result_dir per
+ # step so future tooling can find the saved results.
+ from quantui import benchmarks
+
+ src = inspect.getsource(benchmarks._save_calibration_json)
+ assert '"result_dir"' in src or "'result_dir'" in src
+
+
+class TestHistoryLabelMarker:
+ def test_refresh_results_browser_emits_calibration_marker(self):
+ from quantui import app_runflow
+
+ src = inspect.getsource(app_runflow.refresh_results_browser)
+ # The 🔧 marker is rendered when calibration_run_id is present
+ # on the saved result.json.
+ assert "calibration_run_id" in src
+ assert "🔧" in src or "calib_marker" in src
diff --git a/tests/test_calibration_skip_and_gpu.py b/tests/test_calibration_skip_and_gpu.py
new file mode 100644
index 0000000..e98f2f6
--- /dev/null
+++ b/tests/test_calibration_skip_and_gpu.py
@@ -0,0 +1,250 @@
+"""Tests for the session-55 calibration UX fixes:
+
+1. **Skip button**: replaces the per-step timeout. The user can abandon
+ ONE step without losing the whole calibration (the old hard 1800 s
+ tier-4 cap cut off a near-finishing benzene B3LYP/6-31G* freq).
+2. **MP2 + CCSD blocked on GPU**: gpu4pyscf's post-HF support is
+ experimental and was crashing immediately after the RHF reference.
+ Both methods now stay CPU-side via ``_GPU_UNSUPPORTED_METHODS``.
+3. **error_msg visible in calibration table**: failed steps now show
+ the captured error message inline (truncated) so the user knows
+ WHY a step failed.
+
+All tests platform-independent. No PySCF required.
+"""
+
+from __future__ import annotations
+
+import inspect
+
+# =====================================================================
+# Fix 2 — MP2 + CCSD on the GPU skip list
+# =====================================================================
+
+
+class TestGpuUnsupportedMethods:
+ def test_mp2_blocked_on_gpu(self):
+ from quantui.gpu_offload import _GPU_UNSUPPORTED_METHODS
+
+ assert "MP2" in _GPU_UNSUPPORTED_METHODS
+
+ def test_ccsd_blocked_on_gpu(self):
+ from quantui.gpu_offload import _GPU_UNSUPPORTED_METHODS
+
+ assert "CCSD" in _GPU_UNSUPPORTED_METHODS
+
+ def test_ccsd_t_still_blocked(self):
+ # Don't accidentally remove the original entry while adding new ones.
+ from quantui.gpu_offload import _GPU_UNSUPPORTED_METHODS
+
+ assert "CCSD(T)" in _GPU_UNSUPPORTED_METHODS
+
+ def test_try_to_gpu_returns_cpu_path_for_mp2(self):
+ # Direct functional check: try_to_gpu should short-circuit before
+ # calling .to_gpu() when the method is blocked. The "mf" we pass
+ # doesn't need to be real — try_to_gpu returns it unchanged.
+ from quantui.gpu_offload import try_to_gpu
+
+ sentinel = object()
+ mf, used_gpu, name = try_to_gpu(sentinel, "MP2")
+ assert mf is sentinel
+ assert used_gpu is False
+ assert name is None
+
+
+# =====================================================================
+# Fix 1 — Skip event + no-timeout default
+# =====================================================================
+
+
+class TestRunCalibrationSignature:
+ def test_run_calibration_accepts_skip_event(self):
+ from quantui.benchmarks import run_calibration
+
+ sig = inspect.signature(run_calibration)
+ assert "skip_event" in sig.parameters
+
+ def test_timeout_per_step_default_is_none(self):
+ # session 55 user request: no automatic timeout — Skip button
+ # is the user-facing control.
+ from quantui.benchmarks import run_calibration
+
+ sig = inspect.signature(run_calibration)
+ timeout_param = sig.parameters["timeout_per_step"]
+ assert timeout_param.default is None
+
+ def test_loop_handles_none_timeout_without_crashing(self):
+ # Most direct path: run_calibration with PySCF unavailable just
+ # iterates through the suite emitting PySCF-not-available errors.
+ # With timeout_per_step=None we must NOT hit the
+ # ``elapsed > timeout_per_step`` comparison (which would
+ # TypeError on None).
+ from quantui.benchmarks import run_calibration
+
+ # Smaller suite so the test stays fast.
+ result = run_calibration(mode="tier1", timeout_per_step=None)
+ # On Windows (no PySCF) every step is marked error.
+ # Function returns cleanly without exceptions.
+ assert result.mode == "tier1"
+
+ def test_skipped_status_constant_exists(self):
+ from quantui import benchmarks
+
+ assert hasattr(benchmarks, "_STATUS_SKIPPED")
+ assert benchmarks._STATUS_SKIPPED == "skipped"
+
+
+class TestSkipEventInPollLoop:
+ """Structural / source check: the poll loop now honours skip_event.
+
+ A full end-to-end skip test would require PySCF + spawning a real
+ worker; the source-grep test is the cheap regression guard.
+ """
+
+ def test_poll_loop_checks_skip_event(self):
+ from quantui import benchmarks
+
+ src = inspect.getsource(benchmarks.run_calibration)
+ # The new branch checks skip_event.is_set() and calls
+ # skip_event.clear() so the next step starts fresh.
+ assert "skip_event" in src
+ assert "skip_event.is_set()" in src
+ assert "skip_event.clear()" in src
+ assert "_STATUS_SKIPPED" in src
+
+ def test_no_unconditional_timeout_comparison(self):
+ # If someone reintroduces ``elapsed > timeout_per_step`` without
+ # a None guard, this test catches it.
+ from quantui import benchmarks
+
+ src = inspect.getsource(benchmarks.run_calibration)
+ # Either the comparison is guarded by a None check OR it's gone.
+ # Match the guard pattern explicitly.
+ assert "timeout_per_step is not None" in src
+
+
+# =====================================================================
+# Fix 3 — error_msg surfaced in the table
+# =====================================================================
+
+
+class TestCalTableShowsErrorMsg:
+ def test_error_row_includes_error_msg_text(self):
+ # Direct render-helper test: an error step should include the
+ # error_msg in the rendered HTML so users see WHY the step failed.
+ from types import SimpleNamespace
+
+ from quantui.app_runflow import _cal_table_html
+
+ bad_step = SimpleNamespace(
+ label="H₂O MP2/cc-pVDZ",
+ method="MP2",
+ basis="cc-pVDZ",
+ n_atoms=3,
+ n_electrons=10,
+ n_basis=24,
+ status="error",
+ elapsed_s=5.54,
+ error_msg="MP2 correction failed for H2O: foo bar baz",
+ calc_type="single_point",
+ result_dir=None,
+ )
+ html = _cal_table_html([bad_step], total=1)
+ assert "✗ error" in html
+ # The error message text appears in the rendered HTML.
+ assert "MP2 correction failed" in html
+
+ def test_ok_row_does_not_show_inline_detail(self):
+ from types import SimpleNamespace
+
+ from quantui.app_runflow import _cal_table_html
+
+ good_step = SimpleNamespace(
+ label="H₂ RHF/STO-3G",
+ method="RHF",
+ basis="STO-3G",
+ n_atoms=2,
+ n_electrons=2,
+ n_basis=2,
+ status="ok",
+ elapsed_s=0.5,
+ error_msg="",
+ calc_type="single_point",
+ result_dir=None,
+ )
+ html = _cal_table_html([good_step], total=1)
+ # No italic detail line for successful steps.
+ assert "font-style:italic" not in html or "color:#94a3b8" not in html
+
+ def test_long_error_msg_truncated(self):
+ from types import SimpleNamespace
+
+ from quantui.app_runflow import _cal_table_html
+
+ long_msg = "x" * 500
+ bad_step = SimpleNamespace(
+ label="bad",
+ method="MP2",
+ basis="cc-pVDZ",
+ n_atoms=3,
+ n_electrons=10,
+ n_basis=24,
+ status="error",
+ elapsed_s=1.0,
+ error_msg=long_msg,
+ calc_type="single_point",
+ result_dir=None,
+ )
+ html = _cal_table_html([bad_step], total=1)
+ # The 500-char message gets truncated with "…".
+ assert "…" in html
+ # And isn't dumped wholesale (would be > 200 chars of x's).
+ assert "x" * 200 not in html
+
+ def test_skipped_row_uses_skipped_label(self):
+ from types import SimpleNamespace
+
+ from quantui.app_runflow import _cal_status_text, _cal_table_html
+
+ # Direct check of the status renderer.
+ assert "skipped" in _cal_status_text("skipped").lower()
+
+ skipped_step = SimpleNamespace(
+ label="C₆H₆ B3LYP [Freq]",
+ method="B3LYP",
+ basis="6-31G*",
+ n_atoms=12,
+ n_electrons=42,
+ n_basis=96,
+ status="skipped",
+ elapsed_s=1500.0,
+ error_msg="skipped by user at 1500s",
+ calc_type="frequency",
+ result_dir=None,
+ )
+ html = _cal_table_html([skipped_step], total=1)
+ assert "⏭" in html or "skipped" in html
+
+
+# =====================================================================
+# UI wiring — Skip button + handler exist
+# =====================================================================
+
+
+class TestSkipButtonWiring:
+ def test_app_has_cal_skip_btn(self):
+ from quantui.app import QuantUIApp
+
+ app = QuantUIApp()
+ assert hasattr(app, "_cal_skip_btn")
+
+ def test_app_has_on_cal_skip_method(self):
+ from quantui.app import QuantUIApp
+
+ app = QuantUIApp()
+ assert callable(getattr(app, "_on_cal_skip", None))
+
+ def test_on_cal_skip_handler_in_app_runflow(self):
+ from quantui import app_runflow
+
+ assert callable(getattr(app_runflow, "on_cal_skip", None))
diff --git a/tests/test_est_prediction_log.py b/tests/test_est_prediction_log.py
new file mode 100644
index 0000000..6866858
--- /dev/null
+++ b/tests/test_est_prediction_log.py
@@ -0,0 +1,312 @@
+"""Tests for M-EST / EST.6 — predicted-vs-actual feedback log.
+
+After each ``_do_run``, QuantUI now writes a record to
+``prediction_log.jsonl`` with the estimator's pre-run prediction +
+the actual wall-clock outcome. The analytics dashboard surfaces:
+
+- headline cards (median absolute error %, % within 25%, bias, etc.)
+- a scatter of predicted vs actual with a y=x reference line
+- a "consider re-running calibration" banner when median |error| > 50%
+
+All tests are platform-independent. ``prediction_log.jsonl`` is
+redirected to ``tmp_path`` via ``QUANTUI_LOG_DIR``.
+"""
+
+from __future__ import annotations
+
+import inspect
+import json
+
+import pytest
+
+from quantui import analytics
+from quantui.calc_log import (
+ _prediction_log_path,
+ get_prediction_history,
+ log_prediction,
+)
+
+
+@pytest.fixture
+def isolated_log_dir(tmp_path, monkeypatch):
+ monkeypatch.setenv("QUANTUI_LOG_DIR", str(tmp_path))
+ return tmp_path
+
+
+# =====================================================================
+# log_prediction / get_prediction_history
+# =====================================================================
+
+
+class TestLogPrediction:
+ def test_writes_record_with_all_fields(self, isolated_log_dir):
+ log_prediction(
+ predicted_s=10.0,
+ actual_s=12.5,
+ method="B3LYP",
+ basis="6-31G*",
+ calc_type="single_point",
+ formula="H2O",
+ confidence="high",
+ gpu_used=False,
+ )
+ records = get_prediction_history()
+ assert len(records) == 1
+ r = records[0]
+ assert r["predicted_s"] == 10.0
+ assert r["actual_s"] == 12.5
+ assert r["method"] == "B3LYP"
+ assert r["calc_type"] == "single_point"
+ assert r["formula"] == "H2O"
+ assert r["confidence"] == "high"
+ assert r["gpu_used"] is False
+ # Derived field: signed error percentage.
+ assert r["error_pct"] == 25.0
+
+ def test_underprediction_yields_positive_error(self, isolated_log_dir):
+ # Predicted 1 min, took 5 min — error_pct should be +400% (actual
+ # is 5x the prediction, i.e. 400% larger).
+ log_prediction(
+ predicted_s=60.0,
+ actual_s=300.0,
+ method="B3LYP",
+ basis="6-31G*",
+ calc_type="frequency",
+ )
+ r = get_prediction_history()[0]
+ assert r["error_pct"] == 400.0
+
+ def test_overprediction_yields_negative_error(self, isolated_log_dir):
+ # Predicted 100 s, took 50 s — error_pct should be -50%.
+ log_prediction(
+ predicted_s=100.0,
+ actual_s=50.0,
+ method="RHF",
+ basis="STO-3G",
+ calc_type="single_point",
+ )
+ r = get_prediction_history()[0]
+ assert r["error_pct"] == -50.0
+
+ def test_no_estimate_records_none_error(self, isolated_log_dir):
+ # When the estimator returned no estimate (insufficient history),
+ # we still log the actual outcome so the dashboard counts the
+ # "no-estimate" runs separately.
+ log_prediction(
+ predicted_s=None,
+ actual_s=1.5,
+ method="B3LYP",
+ basis="STO-3G",
+ calc_type="single_point",
+ )
+ r = get_prediction_history()[0]
+ assert r["predicted_s"] is None
+ assert r["error_pct"] is None
+ assert r["actual_s"] == 1.5
+
+ def test_zero_predicted_does_not_div_by_zero(self, isolated_log_dir):
+ # Defensive: predicted_s=0 is nonsensical but mustn't crash.
+ log_prediction(
+ predicted_s=0.0,
+ actual_s=1.0,
+ method="RHF",
+ basis="STO-3G",
+ calc_type="single_point",
+ )
+ r = get_prediction_history()[0]
+ assert r["error_pct"] is None # zero-protected path
+
+ def test_path_honors_quantui_log_dir(self, isolated_log_dir):
+ # The fixture sets QUANTUI_LOG_DIR. The prediction log must
+ # land there, not in ~/.quantui/logs.
+ log_prediction(
+ predicted_s=1.0,
+ actual_s=1.0,
+ method="RHF",
+ basis="STO-3G",
+ calc_type="single_point",
+ )
+ assert _prediction_log_path().parent == isolated_log_dir
+
+
+# =====================================================================
+# Analytics metrics
+# =====================================================================
+
+
+class TestPredictionAccuracyMetrics:
+ def test_empty_records(self):
+ m = analytics._prediction_accuracy_metrics([])
+ assert m["n_total"] == 0
+ assert m["median_abs_error_pct"] is None
+ assert m["median_signed_error_pct"] is None
+ assert m["pct_within_25"] is None
+
+ def test_all_within_25_pct(self):
+ # Spread of 10% / 15% / 20% / 5% — all within 25%.
+ records = [
+ {"predicted_s": 1.0, "actual_s": 1.1, "error_pct": 10.0},
+ {"predicted_s": 1.0, "actual_s": 1.15, "error_pct": 15.0},
+ {"predicted_s": 1.0, "actual_s": 1.2, "error_pct": 20.0},
+ {"predicted_s": 1.0, "actual_s": 1.05, "error_pct": 5.0},
+ ]
+ m = analytics._prediction_accuracy_metrics(records)
+ assert m["pct_within_25"] == 100.0
+
+ def test_mixed_within_25(self):
+ # 2 of 4 within 25%, 2 outside (one +60%, one -40%).
+ records = [
+ {"predicted_s": 1.0, "actual_s": 1.1, "error_pct": 10.0},
+ {"predicted_s": 1.0, "actual_s": 1.2, "error_pct": 20.0},
+ {"predicted_s": 1.0, "actual_s": 1.6, "error_pct": 60.0},
+ {"predicted_s": 1.0, "actual_s": 0.6, "error_pct": -40.0},
+ ]
+ m = analytics._prediction_accuracy_metrics(records)
+ assert m["pct_within_25"] == 50.0
+
+ def test_signed_median_picks_up_bias(self):
+ # All four runs over-ran the prediction → positive bias.
+ records = [
+ {"predicted_s": 1.0, "actual_s": 1.5, "error_pct": 50.0},
+ {"predicted_s": 1.0, "actual_s": 1.6, "error_pct": 60.0},
+ {"predicted_s": 1.0, "actual_s": 1.4, "error_pct": 40.0},
+ {"predicted_s": 1.0, "actual_s": 1.7, "error_pct": 70.0},
+ ]
+ m = analytics._prediction_accuracy_metrics(records)
+ assert m["median_signed_error_pct"] is not None
+ assert m["median_signed_error_pct"] > 0 # positive bias
+
+ def test_no_estimate_records_excluded_from_error_stats(self):
+ # 2 records with no estimate + 2 with — the metrics use only
+ # the 2 that have data, and report the no-estimate count.
+ records = [
+ {"predicted_s": None, "actual_s": 1.0, "error_pct": None},
+ {"predicted_s": None, "actual_s": 2.0, "error_pct": None},
+ {"predicted_s": 1.0, "actual_s": 1.1, "error_pct": 10.0},
+ {"predicted_s": 1.0, "actual_s": 1.2, "error_pct": 20.0},
+ ]
+ m = analytics._prediction_accuracy_metrics(records)
+ assert m["n_total"] == 4
+ assert m["n_with_estimate"] == 2
+ assert m["n_no_estimate"] == 2
+ assert m["median_abs_error_pct"] == 15.0
+
+
+# =====================================================================
+# Dashboard rendering
+# =====================================================================
+
+
+def _seed_perf_log(log_dir):
+ """Seed perf_log so build_dashboard doesn't early-return None."""
+ p = log_dir / "perf_log.jsonl"
+ p.write_text(
+ json.dumps(
+ {
+ "timestamp": "2026-05-25T12:00:00+00:00",
+ "formula": "H2O",
+ "method": "B3LYP",
+ "basis": "STO-3G",
+ "elapsed_s": 1.0,
+ "converged": True,
+ }
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+
+
+def _seed_prediction_log(log_dir, records):
+ p = log_dir / "prediction_log.jsonl"
+ with p.open("w", encoding="utf-8") as fh:
+ for r in records:
+ fh.write(json.dumps(r) + "\n")
+
+
+class TestDashboardPredictionSection:
+ def test_section_present_when_predictions_exist(self, isolated_log_dir):
+ _seed_perf_log(isolated_log_dir)
+ _seed_prediction_log(
+ isolated_log_dir,
+ [
+ {
+ "timestamp": "2026-05-25T12:00:00+00:00",
+ "predicted_s": 1.0,
+ "actual_s": 1.1,
+ "error_pct": 10.0,
+ "method": "B3LYP",
+ "basis": "STO-3G",
+ "calc_type": "single_point",
+ },
+ {
+ "timestamp": "2026-05-25T12:01:00+00:00",
+ "predicted_s": 5.0,
+ "actual_s": 6.0,
+ "error_pct": 20.0,
+ "method": "B3LYP",
+ "basis": "STO-3G",
+ "calc_type": "single_point",
+ },
+ ],
+ )
+ out = analytics.build_dashboard()
+ assert out is not None
+ html = out.read_text(encoding="utf-8")
+ assert "Prediction accuracy" in html
+ # Headline metric should appear (median |error| = 15%).
+ assert "15.0%" in html
+
+ def test_empty_state_when_no_predictions(self, isolated_log_dir):
+ _seed_perf_log(isolated_log_dir)
+ # No prediction_log.jsonl written.
+ out = analytics.build_dashboard()
+ html = out.read_text(encoding="utf-8")
+ assert "Prediction accuracy" in html
+ assert "No predictions logged yet" in html
+
+ def test_banner_when_median_error_exceeds_threshold(self, isolated_log_dir):
+ _seed_perf_log(isolated_log_dir)
+ # All four predictions off by 100% → median |error| = 100% > 50%.
+ _seed_prediction_log(
+ isolated_log_dir,
+ [
+ {
+ "timestamp": f"2026-05-25T12:00:{i:02d}+00:00",
+ "predicted_s": 1.0,
+ "actual_s": 2.0,
+ "error_pct": 100.0,
+ "method": "B3LYP",
+ "basis": "STO-3G",
+ "calc_type": "single_point",
+ }
+ for i in range(4)
+ ],
+ )
+ out = analytics.build_dashboard()
+ html = out.read_text(encoding="utf-8")
+ # The re-calibrate banner kicks in at median |error| > 50%.
+ assert "Re-running a deeper calibration tier" in html
+
+
+# =====================================================================
+# _do_run wiring — source-level structure check
+# =====================================================================
+
+
+class TestDoRunWiring:
+ def test_do_run_captures_predicted_run_s(self):
+ from quantui import app as _app_mod
+
+ src = inspect.getsource(_app_mod)
+ # The capture variable name is unique to EST.6.
+ assert "_predicted_run_s" in src
+ # And log_prediction is called somewhere in the module (call order is not checked).
+ assert "log_prediction(" in src
+
+ def test_do_run_passes_gpu_used_to_estimator(self):
+ # The pre-run estimate must honour the device prediction so the
+ # logged predicted_s matches what the user saw in the UI.
+ from quantui import app as _app_mod
+
+ src = inspect.getsource(_app_mod)
+ assert "_predicted_gpu_used" in src
From c8659f727ba442f303a01db0a397384cc82ca39b Mon Sep 17 00:00:00 2001
From: NCCU-Schultz-Lab
Date: Mon, 25 May 2026 15:05:10 -0400
Subject: [PATCH 28/33] Polish UI: welcome header, GPU/docs, widget fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Update docs to advertise GPU support, CC methods, exports, CLI and new estimator/analytics features. Replace the welcome banner with a logo served via widgets.Image + text HBox (works around Voilà/Jupyter HTML sanitizer and preserves SVG animations), add a layout_fn parameter and wire the new header into app.py, and hide the logo on shutdown for proper centering. Tweak several ipywidgets Checkboxes/Dropdowns to remove the default description gutter (style.description_width='initial' and indent=False) to avoid unwanted indentation and horizontal scrollbars. Adjust shutdown HTML sizing. Update tests to expect the renamed "System Settings" tab and to skip a new dropdown placeholder when asserting result badges.
---
docs/index.html | 85 +++++++++++++++++++++-------
quantui/app.py | 4 +-
quantui/app_builders.py | 122 +++++++++++++++++++++++++++-------------
quantui/app_runflow.py | 15 +++--
tests/test_app.py | 16 ++++--
5 files changed, 169 insertions(+), 73 deletions(-)
diff --git a/docs/index.html b/docs/index.html
index bf5f5dd..71f79c9 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -3,10 +3,10 @@
- QuantUI — An open-source frontend for DFT and post-HF quantum chemistry
-
-
-
+ QuantUI — Free, open, and interactive quantum chemistry
+
+
+
@@ -354,16 +354,18 @@
- Open-source DFT frontend
+ Open-source PySCF frontend
No cluster required
+ GPU-ready
-
A powerful frontend for
open-source quantum chemistry
+
Free, open, and
interactive quantum chemistry
QuantUI puts PySCF
- behind an interactive Jupyter/Voilà UI. Run DFT, MP2, TD-DFT,
- NMR, geometry optimization, frequencies, and PES scans —
- visualize structures, orbitals, IR and UV-Vis spectra, all on
- your laptop.
+ behind an interactive Jupyter/Voilà UI. Run DFT, MP2, CCSD,
+ CCSD(T), TD-DFT, NMR, geometry optimization, frequencies, and
+ PES scans — visualize structures, orbitals, IR and UV-Vis
+ spectra, all on your laptop with optional NVIDIA GPU offload via
+ gpu4pyscf.