From 93a248057436a65eeae6ffedbf05527e96ee53cd Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 15:26:30 +0100 Subject: [PATCH 01/21] Attempt to add performance tests --- .circleci/config.yml | 67 +++++++++++ test-performance/timesInsert.py | 82 ------------- test_requirements.txt | 1 + tests/conftest.py | 20 ++++ tests/perf/__init__.py | 0 tests/perf/test_insert_select_bench.py | 158 +++++++++++++++++++++++++ 6 files changed, 246 insertions(+), 82 deletions(-) delete mode 100644 test-performance/timesInsert.py create mode 100644 tests/perf/__init__.py create mode 100644 tests/perf/test_insert_select_bench.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 4fc6d2e..4903d2d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -73,6 +73,69 @@ jobs: - after_failure: when : "on_fail" + perf_bench: + description: "Run insert/select benchmarks on master and on this branch, print the diff" + docker: + - image: nuodb/nuodb:latest + user: root + resource_class: medium + environment: + TZ : America/New_York + NUO_SET_TLS : disable + NUOCMD_CLIENT_KEY : "" + NUOCMD_VERIFY_SERVER : "" + NUOCMD_PLUGINS : "" + steps: + - checkout + - run: + name: Install build tools + command: | + PYVER=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') + dnf install -y make gcc "python${PYVER}-devel" + - run: + name: Install pip + command: | + curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py + python3 /tmp/get-pip.py --user + - run: + name: Make artifact directories + command: mkdir -p artifacts results + - run: + name: Start NuoDB Admin + command: | + sudo -u nuodb /opt/nuodb/etc/nuoadmin tls $NUO_SET_TLS + sudo -u nuodb /opt/nuodb/etc/nuoadmin tls status + sudo -u nuodb /opt/nuodb/etc/nuoadmin start + sudo -u nuodb /opt/nuodb/bin/nuocmd --show-json get effective-license + # Same runner, same NuoDB instance: run master first for a + # baseline, then this branch, then let pytest-benchmark print the + # per-test diff. Hardware noise cancels because both runs share + # the same container. + - run: + name: Baseline benchmarks on master + command: | + git worktree add /tmp/base origin/master + cd /tmp/base + make PYTHON=python3 install + $HOME/.local/bin/pip install pytest-benchmark + python3 -m pytest tests/perf --run-perf --benchmark-only \ + --benchmark-json=/tmp/baseline.json \ + --benchmark-columns=min,mean,median,stddev,rounds + - run: + name: Branch benchmarks + diff vs master + command: | + make PYTHON=python3 install + $HOME/.local/bin/pip install pytest-benchmark + python3 -m pytest tests/perf --run-perf --benchmark-only \ + --benchmark-json=artifacts/pr.json \ + --benchmark-compare=/tmp/baseline.json \ + --benchmark-columns=min,mean,median,stddev,rounds \ + | tee artifacts/perf_diff.txt + - store_artifacts: + path: artifacts + - after_failure: + when : "on_fail" + workflows: build-project: jobs: @@ -80,3 +143,7 @@ workflows: name: "Build and run regression tests" context: - common-config + - perf_bench: + name: "Insert/select benchmark comparison" + context: + - common-config diff --git a/test-performance/timesInsert.py b/test-performance/timesInsert.py deleted file mode 100644 index 9ee32bd..0000000 --- a/test-performance/timesInsert.py +++ /dev/null @@ -1,82 +0,0 @@ -# A database named test with user dba / password dba must be created first - -import os -import time - -import pynuodb - -smallIterations = 100 -largeIterations = smallIterations * 1000 - - -def gettime(): - return time.time() - - -def insert(count): - for i in range(count): - cursor.execute("INSERT INTO perf_test (a,b ) VALUES (%d,'A')" % i) - connection.commit() - - -def select(): - cursor.execute("select * from perf_test") - cursor.fetchall() - - -dropTable = "drop table perf_test cascade if exists" -createTable = "create table perf_test (a int,b char)" - -port = os.environ.get('NUODB_PORT') -if not port: - port = '48004' - -options = {} -trustStore = os.environ.get('NUOCMD_VERIFY_SERVER') -if trustStore: - options = {'trustStore': trustStore, 'verifyHostname': 'False'} - -connection = pynuodb.connect("test", "localhost:" + port, "dba", "dba", - options=options) -cursor = connection.cursor() -cursor.execute("use test") - -# Begin SMALL_INSERT_ITERATIONS test -cursor.execute(dropTable) -cursor.execute(createTable) -start = gettime() -insert(smallIterations) -smallInsertElapsed = gettime() - start - -print("Elapse time of SMALL_INSERT_ITERATIONS = %.4fs" % (smallInsertElapsed)) - -# Begin SMALL_SELECT_ITERATIONS test -start = gettime() -select() -smallSelectElapsed = gettime() - start -print("Elapse time of SMALL_SELECT_ITERATIONS = %.4fs" % (smallSelectElapsed)) - -# Begin LARGE_INSERT_ITERATIONS test -cursor.execute(dropTable) -cursor.execute(createTable) - -start = gettime() -insert(largeIterations) -largeInsertElapsed = gettime() - start - -print("Elapse time of LARGE_INSERT_ITERATIONS = %.4fs" % (largeInsertElapsed)) - -# Begin LARGE_SELECT_ITERATIONS test -start = gettime() -select() -largeSelectElapsed = gettime() - start - -print("Elapse time of LARGE_SELECT_ITERATIONS = %.4fs" % (largeSelectElapsed)) - -if largeInsertElapsed > smallInsertElapsed * 1000: - print("Insert is too slow!") - -if largeSelectElapsed > smallSelectElapsed * 1000: - print("Select is too slow!") - -print("\n") diff --git a/test_requirements.txt b/test_requirements.txt index ef90a4a..2328d32 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,6 +1,7 @@ mock>=1.0 nose>=1.3 pytest>=2.7 +pytest-benchmark>=4.0 coverage>=3.7 pytest-cov>=1.8.1 coveralls>=0.5 diff --git a/tests/conftest.py b/tests/conftest.py index fb7cd90..180e014 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,6 +30,26 @@ from . import nuocmd, cvtjson + +def pytest_addoption(parser): + parser.addoption("--run-perf", action="store_true", default=False, + help="run performance benchmarks under tests/perf") + + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "perf: performance benchmark; skipped unless --run-perf is passed") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--run-perf"): + return + skip = pytest.mark.skip(reason="need --run-perf to run performance tests") + for item in items: + if "perf" in item.keywords: + item.add_marker(skip) + _log = logging.getLogger("pynuodbtest") DB_OPTIONS = [] # type: List[str] diff --git a/tests/perf/__init__.py b/tests/perf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py new file mode 100644 index 0000000..2554b24 --- /dev/null +++ b/tests/perf/test_insert_select_bench.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +"""Insert / select micro-benchmarks. + +(C) Copyright 2025 Dassault Systemes SE. All Rights Reserved. + +This software is licensed under a BSD 3-Clause License. +See the LICENSE file provided with this software. + +Ported from test-performance/timesInsert.py so the numbers live alongside +the correctness suite and can be run via `pytest --benchmark-only`. + +Each test measures one operation the driver's hot paths care about: + + * bulk INSERT via executemany (encode path, session send) + * fetchall over a small result set (decode + per-row dispatch) + * fetchall over a large result set (batched decode + refill loop) + +pytest-benchmark auto-repeats each function and reports min / mean / +median / stddev. Numbers move meaningfully with the crypt, session, +cursor and Cython PRs; that's the point. +""" + +import pytest + +from tests import nuodb_base + + +# Skip this whole module unless `--run-perf` is passed on the pytest +# command line. We don't want `make fulltest` to sit through a 100k-row +# insert on every commit. +pytestmark = pytest.mark.perf + + +_DDL_DROP = "DROP TABLE IF EXISTS perf_bench" +_DDL_CREATE = "CREATE TABLE perf_bench (a INT, b VARCHAR(64))" + +_SMALL = 100 +_LARGE = 100_000 + + +def _rows(n): + return [(i, 'A dark and stormy night %d' % i) for i in range(n)] + + +class TestInsertSelectPerf(nuodb_base.NuoBase): + + def _reset(self, con): + cur = con.cursor() + cur.execute(_DDL_DROP) + cur.execute(_DDL_CREATE) + con.commit() + + def _seed(self, con, n): + self._reset(con) + con.cursor().executemany( + "INSERT INTO perf_bench (a, b) VALUES (?, ?)", _rows(n)) + con.commit() + + # -- INSERT --------------------------------------------------------- + + def test_insert_small(self, benchmark): + """100 rows via executemany. Sensitive to per-row putValue cost.""" + con = self._connect() + try: + self._reset(con) + cur = con.cursor() + rows = _rows(_SMALL) + + def target(): + cur.executemany( + "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows) + con.commit() + # Truncate between iterations so we measure a clean insert. + cur.execute(_DDL_DROP) + cur.execute(_DDL_CREATE) + con.commit() + + benchmark(target) + finally: + con.close() + + def test_insert_large(self, benchmark): + """100k rows via executemany """ + con = self._connect() + try: + self._reset(con) + cur = con.cursor() + rows = _rows(_LARGE) + + def target(): + cur.executemany( + "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows) + con.commit() + cur.execute(_DDL_DROP) + cur.execute(_DDL_CREATE) + con.commit() + + # Large insert is slow; cap repetitions so a benchmark run + # finishes in seconds rather than minutes. + benchmark.pedantic(target, rounds=3, iterations=1) + finally: + con.close() + + # -- SELECT --------------------------------------------------------- + + def test_fetchall_small(self, benchmark): + """fetchall over 100 rows. Sensitive to fixed per-query overhead.""" + con = self._connect() + try: + self._seed(con, _SMALL) + cur = con.cursor() + + def target(): + cur.execute("SELECT a, b FROM perf_bench") + return cur.fetchall() + + rows = benchmark(target) + assert len(rows) == _SMALL + finally: + con.close() + + def test_fetchall_large(self, benchmark): + """fetchall over 100k rows. """ + con = self._connect() + try: + self._seed(con, _LARGE) + cur = con.cursor() + + def target(): + cur.execute("SELECT a, b FROM perf_bench") + return cur.fetchall() + + rows = benchmark.pedantic(target, rounds=5, iterations=1) + assert len(rows) == _LARGE + finally: + con.close() + + def test_fetchmany_large(self, benchmark): + """fetchmany(1000) over 100k rows""" + con = self._connect() + try: + self._seed(con, _LARGE) + cur = con.cursor() + + def target(): + cur.execute("SELECT a, b FROM perf_bench") + total = 0 + while True: + batch = cur.fetchmany(1000) + if not batch: + break + total += len(batch) + return total + + total = benchmark.pedantic(target, rounds=5, iterations=1) + assert total == _LARGE + finally: + con.close() From 4dc039c3bb9592018d8e036c707e615ea77d1f2c Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 15:30:37 +0100 Subject: [PATCH 02/21] Fix more tings --- tests/perf/test_insert_select_bench.py | 96 ++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index 2554b24..3846eb4 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -156,3 +156,99 @@ def target(): assert total == _LARGE finally: con.close() + + def test_fetchone_loop_large(self, benchmark): + """fetchone() in a loop over 100k rows. Isolates per-row + overhead """ + con = self._connect() + try: + self._seed(con, _LARGE) + cur = con.cursor() + + def target(): + cur.execute("SELECT a, b FROM perf_bench") + n = 0 + while True: + row = cur.fetchone() + if row is None: + break + n += 1 + return n + + n = benchmark.pedantic(target, rounds=3, iterations=1) + assert n == _LARGE + finally: + con.close() + + # -- Wide rows / mixed types --------------------------------------- + + _WIDE_COLS = 50 + _WIDE_ROWS = 1000 + + def test_fetchall_wide(self, benchmark): + """50 columns x 1000 rows """ + cols = ["c%d INT" % i for i in range(self._WIDE_COLS)] + col_names = ", ".join("c%d" % i for i in range(self._WIDE_COLS)) + placeholders = ", ".join(["?"] * self._WIDE_COLS) + + con = self._connect() + try: + cur = con.cursor() + cur.execute("DROP TABLE IF EXISTS perf_wide") + cur.execute("CREATE TABLE perf_wide (%s)" % (", ".join(cols),)) + con.commit() + rows = [tuple(range(self._WIDE_COLS)) for _ in range(self._WIDE_ROWS)] + cur.executemany( + "INSERT INTO perf_wide (%s) VALUES (%s)" + % (col_names, placeholders), + rows) + con.commit() + + def target(): + cur.execute("SELECT %s FROM perf_wide" % col_names) + return cur.fetchall() + + result = benchmark.pedantic(target, rounds=5, iterations=1) + assert len(result) == self._WIDE_ROWS + assert len(result[0]) == self._WIDE_COLS + finally: + con.close() + + def test_fetchall_mixed_types(self, benchmark): + """SELECT with variety of types: int / decimal / double / timestamp / bool / + varchar / null. """ + con = self._connect() + try: + cur = con.cursor() + cur.execute("DROP TABLE IF EXISTS perf_mixed") + cur.execute( + "CREATE TABLE perf_mixed (" + " i INT," + " d DECIMAL(12, 4)," + " f DOUBLE," + " ts TIMESTAMP," + " bl BOOLEAN," + " s VARCHAR(64)," + " n INT" + ")") + con.commit() + rows = [ + (i, i * 1.25, i / 3.0, + '2024-01-01 12:34:56', bool(i & 1), + 'row #%d' % i, None) + for i in range(_LARGE) + ] + cur.executemany( + "INSERT INTO perf_mixed (i, d, f, ts, bl, s, n)" + " VALUES (?, ?, ?, ?, ?, ?, ?)", + rows) + con.commit() + + def target(): + cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed") + return cur.fetchall() + + result = benchmark.pedantic(target, rounds=5, iterations=1) + assert len(result) == _LARGE + finally: + con.close() From d08e258afb19b5d5f54efdedecf6051cce20409f Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 15:39:27 +0100 Subject: [PATCH 03/21] Add git --- .circleci/config.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4903d2d..dcff0ea 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -91,7 +91,7 @@ jobs: name: Install build tools command: | PYVER=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') - dnf install -y make gcc "python${PYVER}-devel" + dnf install -y git make gcc "python${PYVER}-devel" - run: name: Install pip command: | @@ -114,7 +114,8 @@ jobs: - run: name: Baseline benchmarks on master command: | - git worktree add /tmp/base origin/master + git fetch --no-tags --depth=1 origin master + git worktree add /tmp/base FETCH_HEAD cd /tmp/base make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark From 0068eea6839033ad36c8d8eab0624dc28c1aa777 Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 15:46:31 +0100 Subject: [PATCH 04/21] Try --- .circleci/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index dcff0ea..70a99c1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -114,6 +114,8 @@ jobs: - run: name: Baseline benchmarks on master command: | + mkdir -p ~/.ssh + ssh-keyscan -H github.com >> ~/.ssh/known_hosts 2>/dev/null git fetch --no-tags --depth=1 origin master git worktree add /tmp/base FETCH_HEAD cd /tmp/base From 9fd307b731d9ac63093dbd8c6558981ddcf6076e Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 16:08:18 +0100 Subject: [PATCH 05/21] Try something different --- .circleci/config.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 70a99c1..7661780 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -114,9 +114,8 @@ jobs: - run: name: Baseline benchmarks on master command: | - mkdir -p ~/.ssh - ssh-keyscan -H github.com >> ~/.ssh/known_hosts 2>/dev/null - git fetch --no-tags --depth=1 origin master + GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \ + git fetch --no-tags --depth=1 origin master git worktree add /tmp/base FETCH_HEAD cd /tmp/base make PYTHON=python3 install From 4f8c7cf94c1ae6c453ecdd494c9a2e32edd41d6e Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 16:15:21 +0100 Subject: [PATCH 06/21] ensure same tests run on both branches --- .circleci/config.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7661780..9eaea52 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -117,6 +117,10 @@ jobs: GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \ git fetch --no-tags --depth=1 origin master git worktree add /tmp/base FETCH_HEAD + # Copy the perf suite + conftest hooks from this branch so the + # master worktree's driver is exercised by the same benchmarks. + cp -a tests/perf /tmp/base/tests/ + cp tests/conftest.py /tmp/base/tests/conftest.py cd /tmp/base make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark From c24a84d33fcf5c4a1131e4c3c4dc071d26f1c94d Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 16:25:09 +0100 Subject: [PATCH 07/21] Try to make it gooder --- .circleci/config.yml | 8 ++-- tests/perf/compare.py | 55 ++++++++++++++++++++++++++ tests/perf/test_insert_select_bench.py | 14 +++---- 3 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 tests/perf/compare.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 9eaea52..a9be12f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -125,7 +125,7 @@ jobs: make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark python3 -m pytest tests/perf --run-perf --benchmark-only \ - --benchmark-json=/tmp/baseline.json \ + --benchmark-json=/tmp/master.json \ --benchmark-columns=min,mean,median,stddev,rounds - run: name: Branch benchmarks + diff vs master @@ -133,9 +133,9 @@ jobs: make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark python3 -m pytest tests/perf --run-perf --benchmark-only \ - --benchmark-json=artifacts/pr.json \ - --benchmark-compare=/tmp/baseline.json \ - --benchmark-columns=min,mean,median,stddev,rounds \ + --benchmark-json=artifacts/branch.json \ + --benchmark-columns=min,mean,median,stddev,rounds + python3 tests/perf/compare.py /tmp/master.json artifacts/branch.json \ | tee artifacts/perf_diff.txt - store_artifacts: path: artifacts diff --git a/tests/perf/compare.py b/tests/perf/compare.py new file mode 100644 index 0000000..e1ea790 --- /dev/null +++ b/tests/perf/compare.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +"""Compare two pytest-benchmark JSON files (master vs branch). + +Usage: python compare.py MASTER.json BRANCH.json + +Compares the `min` time of each shared test and prints a table with the +absolute and relative delta. We use min because it's the least noisy +statistic: it filters out GC pauses, kernel scheduling and JIT warmup. +""" +from __future__ import print_function + +import json +import sys + + +def _load(path): + with open(path) as f: + data = json.load(f) + return {b['name']: b['stats']['min'] for b in data['benchmarks']} + + +def main(master_path, branch_path): + master = _load(master_path) + branch = _load(branch_path) + + shared = sorted(set(master) & set(branch)) + rows = [] + for name in shared: + m_ms = master[name] * 1000.0 + b_ms = branch[name] * 1000.0 + delta = b_ms - m_ms + pct = (delta / m_ms) * 100.0 if m_ms else float('nan') + rows.append((name, m_ms, b_ms, delta, pct)) + + header = ("Test", "master min (ms)", "branch min (ms)", + "delta (ms)", "delta %") + print("%-40s %16s %16s %14s %10s" % header) + print("-" * 100) + for name, m, b, d, p in rows: + print("%-40s %16.3f %16.3f %+14.3f %+9.2f%%" % (name, m, b, d, p)) + + only_master = sorted(set(master) - set(branch)) + only_branch = sorted(set(branch) - set(master)) + if only_master: + print("\nOnly in master: %s" % ", ".join(only_master)) + if only_branch: + print("Only in branch: %s" % ", ".join(only_branch)) + + +if __name__ == '__main__': + if len(sys.argv) != 3: + print("usage: python compare.py MASTER.json BRANCH.json", + file=sys.stderr) + sys.exit(2) + main(sys.argv[1], sys.argv[2]) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index 3846eb4..3f81f02 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -95,9 +95,7 @@ def target(): cur.execute(_DDL_CREATE) con.commit() - # Large insert is slow; cap repetitions so a benchmark run - # finishes in seconds rather than minutes. - benchmark.pedantic(target, rounds=3, iterations=1) + benchmark.pedantic(target, rounds=10, iterations=1) finally: con.close() @@ -130,7 +128,7 @@ def target(): cur.execute("SELECT a, b FROM perf_bench") return cur.fetchall() - rows = benchmark.pedantic(target, rounds=5, iterations=1) + rows = benchmark.pedantic(target, rounds=10, iterations=1) assert len(rows) == _LARGE finally: con.close() @@ -152,7 +150,7 @@ def target(): total += len(batch) return total - total = benchmark.pedantic(target, rounds=5, iterations=1) + total = benchmark.pedantic(target, rounds=10, iterations=1) assert total == _LARGE finally: con.close() @@ -175,7 +173,7 @@ def target(): n += 1 return n - n = benchmark.pedantic(target, rounds=3, iterations=1) + n = benchmark.pedantic(target, rounds=10, iterations=1) assert n == _LARGE finally: con.close() @@ -208,7 +206,7 @@ def target(): cur.execute("SELECT %s FROM perf_wide" % col_names) return cur.fetchall() - result = benchmark.pedantic(target, rounds=5, iterations=1) + result = benchmark.pedantic(target, rounds=10, iterations=1) assert len(result) == self._WIDE_ROWS assert len(result[0]) == self._WIDE_COLS finally: @@ -248,7 +246,7 @@ def target(): cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed") return cur.fetchall() - result = benchmark.pedantic(target, rounds=5, iterations=1) + result = benchmark.pedantic(target, rounds=10, iterations=1) assert len(result) == _LARGE finally: con.close() From afdcc4965ca44f25825265e6eeba227cf939f48a Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 16:35:06 +0100 Subject: [PATCH 08/21] Try to reduce test noise --- .circleci/config.yml | 9 +++++++++ tests/perf/test_insert_select_bench.py | 24 ++++++++++++++++-------- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a9be12f..c0e7002 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -127,6 +127,15 @@ jobs: python3 -m pytest tests/perf --run-perf --benchmark-only \ --benchmark-json=/tmp/master.json \ --benchmark-columns=min,mean,median,stddev,rounds + - run: + name: Reset DB between baseline and branch + command: | + # Force a clean DB restart so the branch run doesn't inherit + # buffer-pool warmth (or any other state) from the master run. + # The conftest fixture already shuts it down, this is belt-and- + # suspenders in case a test aborted early. + sudo -u nuodb /opt/nuodb/bin/nuocmd shutdown database \ + --db-name pynuodb_test 2>/dev/null || true - run: name: Branch benchmarks + diff vs master command: | diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index 3f81f02..de33365 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -75,7 +75,8 @@ def target(): cur.execute(_DDL_CREATE) con.commit() - benchmark(target) + benchmark.pedantic(target, warmup_rounds=5, rounds=200, + iterations=1) finally: con.close() @@ -95,7 +96,8 @@ def target(): cur.execute(_DDL_CREATE) con.commit() - benchmark.pedantic(target, rounds=10, iterations=1) + benchmark.pedantic(target, warmup_rounds=2, rounds=10, + iterations=1) finally: con.close() @@ -112,7 +114,8 @@ def target(): cur.execute("SELECT a, b FROM perf_bench") return cur.fetchall() - rows = benchmark(target) + rows = benchmark.pedantic(target, warmup_rounds=5, rounds=200, + iterations=1) assert len(rows) == _SMALL finally: con.close() @@ -128,7 +131,8 @@ def target(): cur.execute("SELECT a, b FROM perf_bench") return cur.fetchall() - rows = benchmark.pedantic(target, rounds=10, iterations=1) + rows = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + iterations=1) assert len(rows) == _LARGE finally: con.close() @@ -150,7 +154,8 @@ def target(): total += len(batch) return total - total = benchmark.pedantic(target, rounds=10, iterations=1) + total = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + iterations=1) assert total == _LARGE finally: con.close() @@ -173,7 +178,8 @@ def target(): n += 1 return n - n = benchmark.pedantic(target, rounds=10, iterations=1) + n = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + iterations=1) assert n == _LARGE finally: con.close() @@ -206,7 +212,8 @@ def target(): cur.execute("SELECT %s FROM perf_wide" % col_names) return cur.fetchall() - result = benchmark.pedantic(target, rounds=10, iterations=1) + result = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + iterations=1) assert len(result) == self._WIDE_ROWS assert len(result[0]) == self._WIDE_COLS finally: @@ -246,7 +253,8 @@ def target(): cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed") return cur.fetchall() - result = benchmark.pedantic(target, rounds=10, iterations=1) + result = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + iterations=1) assert len(result) == _LARGE finally: con.close() From 68eb5736905c8715f257b83d5d14ea0fa9f56fef Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 17:27:32 +0100 Subject: [PATCH 09/21] Attempt to handle noise --- .circleci/config.yml | 39 +++++++--- tests/perf/calibrate.py | 102 ++++++++++++++++++++++++ tests/perf/compare.py | 104 ++++++++++++++++++------- tests/perf/test_insert_select_bench.py | 12 +-- 4 files changed, 211 insertions(+), 46 deletions(-) create mode 100644 tests/perf/calibrate.py diff --git a/.circleci/config.yml b/.circleci/config.yml index c0e7002..fd81c92 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -107,12 +107,13 @@ jobs: sudo -u nuodb /opt/nuodb/etc/nuoadmin tls status sudo -u nuodb /opt/nuodb/etc/nuoadmin start sudo -u nuodb /opt/nuodb/bin/nuocmd --show-json get effective-license - # Same runner, same NuoDB instance: run master first for a - # baseline, then this branch, then let pytest-benchmark print the - # per-test diff. Hardware noise cancels because both runs share - # the same container. + # We run master twice back-to-back on the same runner: the first run + # is the baseline that the branch is compared against, the second is + # used by calibrate.py to measure this runner's noise floor. Then + # we run the branch and let compare.py flag deltas that clear the + # floor. - run: - name: Baseline benchmarks on master + name: Baseline benchmarks on master (pass 1) command: | GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \ git fetch --no-tags --depth=1 origin master @@ -125,15 +126,23 @@ jobs: make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark python3 -m pytest tests/perf --run-perf --benchmark-only \ - --benchmark-json=/tmp/master.json \ + --benchmark-json=/tmp/master_1.json \ --benchmark-columns=min,mean,median,stddev,rounds - run: - name: Reset DB between baseline and branch + name: Reset DB + command: | + sudo -u nuodb /opt/nuodb/bin/nuocmd shutdown database \ + --db-name pynuodb_test 2>/dev/null || true + - run: + name: Baseline benchmarks on master (pass 2, for noise floor) + command: | + cd /tmp/base + python3 -m pytest tests/perf --run-perf --benchmark-only \ + --benchmark-json=/tmp/master_2.json \ + --benchmark-columns=min,mean,median,stddev,rounds + - run: + name: Reset DB command: | - # Force a clean DB restart so the branch run doesn't inherit - # buffer-pool warmth (or any other state) from the master run. - # The conftest fixture already shuts it down, this is belt-and- - # suspenders in case a test aborted early. sudo -u nuodb /opt/nuodb/bin/nuocmd shutdown database \ --db-name pynuodb_test 2>/dev/null || true - run: @@ -144,8 +153,14 @@ jobs: python3 -m pytest tests/perf --run-perf --benchmark-only \ --benchmark-json=artifacts/branch.json \ --benchmark-columns=min,mean,median,stddev,rounds - python3 tests/perf/compare.py /tmp/master.json artifacts/branch.json \ + python3 tests/perf/calibrate.py /tmp/noise_floor.json \ + /tmp/master_1.json /tmp/master_2.json \ + | tee artifacts/noise_floor.txt + python3 tests/perf/compare.py \ + /tmp/master_1.json artifacts/branch.json \ + --noise-floor /tmp/noise_floor.json \ | tee artifacts/perf_diff.txt + cp /tmp/noise_floor.json artifacts/ - store_artifacts: path: artifacts - after_failure: diff --git a/tests/perf/calibrate.py b/tests/perf/calibrate.py new file mode 100644 index 0000000..440bee9 --- /dev/null +++ b/tests/perf/calibrate.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +"""Compute a per-test noise floor from same-code benchmark runs. + +Takes 2+ pytest-benchmark JSON files produced by running the *same* code +on the same runner (e.g., master benchmarked twice back-to-back). For +each test, pools the raw per-round timings from all runs and runs a +permutation-style bootstrap: draw two disjoint groups of the size of a +typical single run, compute |delta%| between their mins, repeat many +times. The 95th percentile of that distribution is the noise floor: +the smallest |delta%| that noise alone can plausibly produce on this +runner. compare.py reads this file and flags PR deltas that clear it. + +Writes {test_name: floor_pct} to the output path. +""" +from __future__ import print_function + +import argparse +import json +import random + + +def _load_pooled(paths): + """Return (pooled_data, typical_size_per_test) for every shared test.""" + pooled = {} + sizes = {} + for path in paths: + with open(path) as f: + data = json.load(f) + for b in data['benchmarks']: + name = b['name'] + d = b['stats']['data'] + pooled.setdefault(name, []).extend(d) + sizes.setdefault(name, []).append(len(d)) + typical = {n: sum(s) // len(s) for n, s in sizes.items()} + return pooled, typical + + +def _floor(pooled, group_size, n_permutations=5000, quantile=0.95): + if len(pooled) < 2 * group_size: + # Not enough data to draw two disjoint groups: fall back to + # bootstrap-with-replacement. Rare, but keeps the script robust. + deltas = _bootstrap_with_replacement( + pooled, group_size, n_permutations) + else: + deltas = _permutation(pooled, group_size, n_permutations) + deltas.sort() + return deltas[int(len(deltas) * quantile)] + + +def _permutation(pooled, group_size, n): + deltas = [] + data = list(pooled) + two_groups = 2 * group_size + for _ in range(n): + random.shuffle(data) + a = data[:group_size] + b = data[group_size:two_groups] + m_a = min(a) + m_b = min(b) + deltas.append(abs((m_b - m_a) / m_a * 100.0)) + return deltas + + +def _bootstrap_with_replacement(pooled, group_size, n): + deltas = [] + for _ in range(n): + a = [random.choice(pooled) for _ in range(group_size)] + b = [random.choice(pooled) for _ in range(group_size)] + m_a = min(a) + m_b = min(b) + deltas.append(abs((m_b - m_a) / m_a * 100.0)) + return deltas + + +def main(args): + if len(args.inputs) < 2: + raise SystemExit("need >= 2 same-code JSON files to calibrate") + + pooled, typical = _load_pooled(args.inputs) + random.seed(1) + + floors = {n: _floor(pooled[n], typical[n]) for n in sorted(pooled)} + + with open(args.output, 'w') as f: + json.dump(floors, f, indent=2, sort_keys=True) + + print("Noise floor per test (95th %ile of |delta%%| under H0):") + for name in sorted(floors): + print(" %-40s %6.2f%%" % (name, floors[name])) + print("Wrote %s" % args.output) + + +def _parse_args(): + p = argparse.ArgumentParser() + p.add_argument('output', help='where to write noise_floor.json') + p.add_argument('inputs', nargs='+', + help='2+ same-code pytest-benchmark JSON files') + return p.parse_args() + + +if __name__ == '__main__': + main(_parse_args()) diff --git a/tests/perf/compare.py b/tests/perf/compare.py index e1ea790..1b1974f 100644 --- a/tests/perf/compare.py +++ b/tests/perf/compare.py @@ -1,43 +1,86 @@ # -*- coding: utf-8 -*- """Compare two pytest-benchmark JSON files (master vs branch). -Usage: python compare.py MASTER.json BRANCH.json +For each shared test: -Compares the `min` time of each shared test and prints a table with the -absolute and relative delta. We use min because it's the least noisy -statistic: it filters out GC pauses, kernel scheduling and JIT warmup. + * Reports master and branch min time (ms) + * Reports the point delta as a percentage + * Reports a bootstrap 95% CI on that delta, resampled from the raw + per-round timings in stats.data. We compare mins because the min is + the least contaminated statistic on a shared runner. + +If --noise-floor is passed, reads a JSON produced by calibrate.py and +flags tests whose CI clears +/- floor in either direction. A test whose +CI overlaps the floor is treated as noise, no matter how nice the point +delta looks. """ from __future__ import print_function +import argparse import json -import sys +import random def _load(path): with open(path) as f: data = json.load(f) - return {b['name']: b['stats']['min'] for b in data['benchmarks']} + return {b['name']: b['stats'] for b in data['benchmarks']} + + +def _bootstrap_delta_ci(master_data, branch_data, n=5000, ci=0.95): + """95% bootstrap CI on (min(branch) - min(master)) / min(master) * 100.""" + n_m = len(master_data) + n_b = len(branch_data) + deltas = [] + for _ in range(n): + m = min(random.choice(master_data) for _ in range(n_m)) + b = min(random.choice(branch_data) for _ in range(n_b)) + deltas.append((b - m) / m * 100.0) + deltas.sort() + lo_idx = int(n * (1 - ci) / 2) + hi_idx = int(n * (1 + ci) / 2) - 1 + return deltas[lo_idx], deltas[hi_idx] + +def _row_fmt(has_floor): + if has_floor: + return "%-40s %12s %12s %8s %20s %8s %5s" + return "%-40s %12s %12s %8s %20s" -def main(master_path, branch_path): - master = _load(master_path) - branch = _load(branch_path) - shared = sorted(set(master) & set(branch)) - rows = [] - for name in shared: - m_ms = master[name] * 1000.0 - b_ms = branch[name] * 1000.0 - delta = b_ms - m_ms - pct = (delta / m_ms) * 100.0 if m_ms else float('nan') - rows.append((name, m_ms, b_ms, delta, pct)) +def main(args): + master = _load(args.master) + branch = _load(args.branch) + floor = {} + if args.noise_floor: + with open(args.noise_floor) as f: + floor = json.load(f) - header = ("Test", "master min (ms)", "branch min (ms)", - "delta (ms)", "delta %") - print("%-40s %16s %16s %14s %10s" % header) - print("-" * 100) - for name, m, b, d, p in rows: - print("%-40s %16.3f %16.3f %+14.3f %+9.2f%%" % (name, m, b, d, p)) + random.seed(1) # deterministic across CI runs + + fmt = _row_fmt(bool(floor)) + header = ["Test", "master (ms)", "branch (ms)", "delta %", "95% CI"] + if floor: + header += ["floor %", "flag"] + print(fmt % tuple(header)) + print("-" * (len(fmt % tuple(header)))) + + for name in sorted(set(master) & set(branch)): + m_stats = master[name] + b_stats = branch[name] + m_ms = m_stats['min'] * 1000.0 + b_ms = b_stats['min'] * 1000.0 + delta_pct = (b_ms - m_ms) / m_ms * 100.0 + lo, hi = _bootstrap_delta_ci(m_stats['data'], b_stats['data']) + row = [name, "%.3f" % m_ms, "%.3f" % b_ms, + "%+.2f" % delta_pct, + "[%+6.2f, %+6.2f]" % (lo, hi)] + if floor: + f = float(floor.get(name, 0.0)) + # Flag when the CI sits entirely outside +/- floor. + flagged = (lo > f) or (hi < -f) + row += ["%.2f" % f, "*" if flagged else ""] + print(fmt % tuple(row)) only_master = sorted(set(master) - set(branch)) only_branch = sorted(set(branch) - set(master)) @@ -47,9 +90,14 @@ def main(master_path, branch_path): print("Only in branch: %s" % ", ".join(only_branch)) +def _parse_args(): + p = argparse.ArgumentParser() + p.add_argument('master', help='pytest-benchmark JSON for master') + p.add_argument('branch', help='pytest-benchmark JSON for this branch') + p.add_argument('--noise-floor', + help='JSON produced by calibrate.py; enables flag column') + return p.parse_args() + + if __name__ == '__main__': - if len(sys.argv) != 3: - print("usage: python compare.py MASTER.json BRANCH.json", - file=sys.stderr) - sys.exit(2) - main(sys.argv[1], sys.argv[2]) + main(_parse_args()) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index de33365..913847b 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -96,7 +96,7 @@ def target(): cur.execute(_DDL_CREATE) con.commit() - benchmark.pedantic(target, warmup_rounds=2, rounds=10, + benchmark.pedantic(target, warmup_rounds=2, rounds=30, iterations=1) finally: con.close() @@ -131,7 +131,7 @@ def target(): cur.execute("SELECT a, b FROM perf_bench") return cur.fetchall() - rows = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + rows = benchmark.pedantic(target, warmup_rounds=2, rounds=30, iterations=1) assert len(rows) == _LARGE finally: @@ -154,7 +154,7 @@ def target(): total += len(batch) return total - total = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + total = benchmark.pedantic(target, warmup_rounds=2, rounds=30, iterations=1) assert total == _LARGE finally: @@ -178,7 +178,7 @@ def target(): n += 1 return n - n = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + n = benchmark.pedantic(target, warmup_rounds=2, rounds=30, iterations=1) assert n == _LARGE finally: @@ -212,7 +212,7 @@ def target(): cur.execute("SELECT %s FROM perf_wide" % col_names) return cur.fetchall() - result = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + result = benchmark.pedantic(target, warmup_rounds=2, rounds=30, iterations=1) assert len(result) == self._WIDE_ROWS assert len(result[0]) == self._WIDE_COLS @@ -253,7 +253,7 @@ def target(): cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed") return cur.fetchall() - result = benchmark.pedantic(target, warmup_rounds=2, rounds=10, + result = benchmark.pedantic(target, warmup_rounds=2, rounds=30, iterations=1) assert len(result) == _LARGE finally: From 18c419ca57514ed46161e9e0d2a8647e754c42c4 Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 17:46:45 +0100 Subject: [PATCH 10/21] try bigger box --- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index fd81c92..770a2ff 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -74,11 +74,11 @@ jobs: when : "on_fail" perf_bench: - description: "Run insert/select benchmarks on master and on this branch, print the diff" + description: "Run insert/select performance benchmarks on master and on this branch, print the diff" docker: - image: nuodb/nuodb:latest user: root - resource_class: medium + resource_class: xlarge environment: TZ : America/New_York NUO_SET_TLS : disable @@ -174,6 +174,6 @@ workflows: context: - common-config - perf_bench: - name: "Insert/select benchmark comparison" + name: "Run insert/select performance benchmark, compared to master" context: - common-config From 1ba251a555c2363f99a72d9a56f79c2b667f0372 Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 19:42:41 +0100 Subject: [PATCH 11/21] try to clean up insert tests --- tests/perf/test_insert_select_bench.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index 913847b..7e02ecf 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -31,8 +31,9 @@ pytestmark = pytest.mark.perf -_DDL_DROP = "DROP TABLE IF EXISTS perf_bench" -_DDL_CREATE = "CREATE TABLE perf_bench (a INT, b VARCHAR(64))" +_DDL_DROP = "DROP TABLE IF EXISTS perf_bench" +_DDL_CREATE = "CREATE TABLE perf_bench (a INT, b VARCHAR(64))" +_DDL_TRUNCATE = "TRUNCATE TABLE perf_bench" _SMALL = 100 _LARGE = 100_000 @@ -70,13 +71,14 @@ def target(): cur.executemany( "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows) con.commit() - # Truncate between iterations so we measure a clean insert. - cur.execute(_DDL_DROP) - cur.execute(_DDL_CREATE) + + def setup(): + # Runs before each round but is NOT included in the timing. + cur.execute(_DDL_TRUNCATE) con.commit() - benchmark.pedantic(target, warmup_rounds=5, rounds=200, - iterations=1) + benchmark.pedantic(target, setup=setup, warmup_rounds=5, + rounds=200, iterations=1) finally: con.close() @@ -92,11 +94,13 @@ def target(): cur.executemany( "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows) con.commit() - cur.execute(_DDL_DROP) - cur.execute(_DDL_CREATE) + + def setup(): + cur.execute(_DDL_TRUNCATE) con.commit() - benchmark.pedantic(target, warmup_rounds=2, rounds=30, + benchmark.pedantic(target, setup=setup, warmup_rounds=2, + rounds=30, iterations=1) finally: con.close() From cd96993a649c6a27fbf24b6accf1562ba291163a Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 19:49:17 +0100 Subject: [PATCH 12/21] try to simplify --- .circleci/config.yml | 31 +++--------- tests/perf/calibrate.py | 102 ---------------------------------------- tests/perf/compare.py | 99 +++++++++----------------------------- 3 files changed, 28 insertions(+), 204 deletions(-) delete mode 100644 tests/perf/calibrate.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 770a2ff..529dd5f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -107,13 +107,11 @@ jobs: sudo -u nuodb /opt/nuodb/etc/nuoadmin tls status sudo -u nuodb /opt/nuodb/etc/nuoadmin start sudo -u nuodb /opt/nuodb/bin/nuocmd --show-json get effective-license - # We run master twice back-to-back on the same runner: the first run - # is the baseline that the branch is compared against, the second is - # used by calibrate.py to measure this runner's noise floor. Then - # we run the branch and let compare.py flag deltas that clear the - # floor. + # Run master then this branch on the same runner and let compare.py + # print a delta table. Hardware noise mostly cancels because both + # runs share the container; xlarge gives us dedicated cores. - run: - name: Baseline benchmarks on master (pass 1) + name: Baseline benchmarks on master command: | GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \ git fetch --no-tags --depth=1 origin master @@ -126,19 +124,7 @@ jobs: make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark python3 -m pytest tests/perf --run-perf --benchmark-only \ - --benchmark-json=/tmp/master_1.json \ - --benchmark-columns=min,mean,median,stddev,rounds - - run: - name: Reset DB - command: | - sudo -u nuodb /opt/nuodb/bin/nuocmd shutdown database \ - --db-name pynuodb_test 2>/dev/null || true - - run: - name: Baseline benchmarks on master (pass 2, for noise floor) - command: | - cd /tmp/base - python3 -m pytest tests/perf --run-perf --benchmark-only \ - --benchmark-json=/tmp/master_2.json \ + --benchmark-json=/tmp/master.json \ --benchmark-columns=min,mean,median,stddev,rounds - run: name: Reset DB @@ -153,14 +139,9 @@ jobs: python3 -m pytest tests/perf --run-perf --benchmark-only \ --benchmark-json=artifacts/branch.json \ --benchmark-columns=min,mean,median,stddev,rounds - python3 tests/perf/calibrate.py /tmp/noise_floor.json \ - /tmp/master_1.json /tmp/master_2.json \ - | tee artifacts/noise_floor.txt python3 tests/perf/compare.py \ - /tmp/master_1.json artifacts/branch.json \ - --noise-floor /tmp/noise_floor.json \ + /tmp/master.json artifacts/branch.json \ | tee artifacts/perf_diff.txt - cp /tmp/noise_floor.json artifacts/ - store_artifacts: path: artifacts - after_failure: diff --git a/tests/perf/calibrate.py b/tests/perf/calibrate.py deleted file mode 100644 index 440bee9..0000000 --- a/tests/perf/calibrate.py +++ /dev/null @@ -1,102 +0,0 @@ -# -*- coding: utf-8 -*- -"""Compute a per-test noise floor from same-code benchmark runs. - -Takes 2+ pytest-benchmark JSON files produced by running the *same* code -on the same runner (e.g., master benchmarked twice back-to-back). For -each test, pools the raw per-round timings from all runs and runs a -permutation-style bootstrap: draw two disjoint groups of the size of a -typical single run, compute |delta%| between their mins, repeat many -times. The 95th percentile of that distribution is the noise floor: -the smallest |delta%| that noise alone can plausibly produce on this -runner. compare.py reads this file and flags PR deltas that clear it. - -Writes {test_name: floor_pct} to the output path. -""" -from __future__ import print_function - -import argparse -import json -import random - - -def _load_pooled(paths): - """Return (pooled_data, typical_size_per_test) for every shared test.""" - pooled = {} - sizes = {} - for path in paths: - with open(path) as f: - data = json.load(f) - for b in data['benchmarks']: - name = b['name'] - d = b['stats']['data'] - pooled.setdefault(name, []).extend(d) - sizes.setdefault(name, []).append(len(d)) - typical = {n: sum(s) // len(s) for n, s in sizes.items()} - return pooled, typical - - -def _floor(pooled, group_size, n_permutations=5000, quantile=0.95): - if len(pooled) < 2 * group_size: - # Not enough data to draw two disjoint groups: fall back to - # bootstrap-with-replacement. Rare, but keeps the script robust. - deltas = _bootstrap_with_replacement( - pooled, group_size, n_permutations) - else: - deltas = _permutation(pooled, group_size, n_permutations) - deltas.sort() - return deltas[int(len(deltas) * quantile)] - - -def _permutation(pooled, group_size, n): - deltas = [] - data = list(pooled) - two_groups = 2 * group_size - for _ in range(n): - random.shuffle(data) - a = data[:group_size] - b = data[group_size:two_groups] - m_a = min(a) - m_b = min(b) - deltas.append(abs((m_b - m_a) / m_a * 100.0)) - return deltas - - -def _bootstrap_with_replacement(pooled, group_size, n): - deltas = [] - for _ in range(n): - a = [random.choice(pooled) for _ in range(group_size)] - b = [random.choice(pooled) for _ in range(group_size)] - m_a = min(a) - m_b = min(b) - deltas.append(abs((m_b - m_a) / m_a * 100.0)) - return deltas - - -def main(args): - if len(args.inputs) < 2: - raise SystemExit("need >= 2 same-code JSON files to calibrate") - - pooled, typical = _load_pooled(args.inputs) - random.seed(1) - - floors = {n: _floor(pooled[n], typical[n]) for n in sorted(pooled)} - - with open(args.output, 'w') as f: - json.dump(floors, f, indent=2, sort_keys=True) - - print("Noise floor per test (95th %ile of |delta%%| under H0):") - for name in sorted(floors): - print(" %-40s %6.2f%%" % (name, floors[name])) - print("Wrote %s" % args.output) - - -def _parse_args(): - p = argparse.ArgumentParser() - p.add_argument('output', help='where to write noise_floor.json') - p.add_argument('inputs', nargs='+', - help='2+ same-code pytest-benchmark JSON files') - return p.parse_args() - - -if __name__ == '__main__': - main(_parse_args()) diff --git a/tests/perf/compare.py b/tests/perf/compare.py index 1b1974f..4ce38b9 100644 --- a/tests/perf/compare.py +++ b/tests/perf/compare.py @@ -1,86 +1,36 @@ # -*- coding: utf-8 -*- """Compare two pytest-benchmark JSON files (master vs branch). -For each shared test: - - * Reports master and branch min time (ms) - * Reports the point delta as a percentage - * Reports a bootstrap 95% CI on that delta, resampled from the raw - per-round timings in stats.data. We compare mins because the min is - the least contaminated statistic on a shared runner. - -If --noise-floor is passed, reads a JSON produced by calibrate.py and -flags tests whose CI clears +/- floor in either direction. A test whose -CI overlaps the floor is treated as noise, no matter how nice the point -delta looks. +Prints a table of master min, branch min, and the absolute + percentage +delta per test. Min is the least noisy summary; on a quiet enough +runner the delta % is straight-up meaningful. """ from __future__ import print_function -import argparse import json -import random +import sys def _load(path): with open(path) as f: data = json.load(f) - return {b['name']: b['stats'] for b in data['benchmarks']} - - -def _bootstrap_delta_ci(master_data, branch_data, n=5000, ci=0.95): - """95% bootstrap CI on (min(branch) - min(master)) / min(master) * 100.""" - n_m = len(master_data) - n_b = len(branch_data) - deltas = [] - for _ in range(n): - m = min(random.choice(master_data) for _ in range(n_m)) - b = min(random.choice(branch_data) for _ in range(n_b)) - deltas.append((b - m) / m * 100.0) - deltas.sort() - lo_idx = int(n * (1 - ci) / 2) - hi_idx = int(n * (1 + ci) / 2) - 1 - return deltas[lo_idx], deltas[hi_idx] - + return {b['name']: b['stats']['min'] for b in data['benchmarks']} -def _row_fmt(has_floor): - if has_floor: - return "%-40s %12s %12s %8s %20s %8s %5s" - return "%-40s %12s %12s %8s %20s" +def main(master_path, branch_path): + master = _load(master_path) + branch = _load(branch_path) -def main(args): - master = _load(args.master) - branch = _load(args.branch) - floor = {} - if args.noise_floor: - with open(args.noise_floor) as f: - floor = json.load(f) - - random.seed(1) # deterministic across CI runs - - fmt = _row_fmt(bool(floor)) - header = ["Test", "master (ms)", "branch (ms)", "delta %", "95% CI"] - if floor: - header += ["floor %", "flag"] - print(fmt % tuple(header)) - print("-" * (len(fmt % tuple(header)))) - + print("%-40s %16s %16s %14s %10s" % ( + "Test", "master min (ms)", "branch min (ms)", + "delta (ms)", "delta %")) + print("-" * 100) for name in sorted(set(master) & set(branch)): - m_stats = master[name] - b_stats = branch[name] - m_ms = m_stats['min'] * 1000.0 - b_ms = b_stats['min'] * 1000.0 - delta_pct = (b_ms - m_ms) / m_ms * 100.0 - lo, hi = _bootstrap_delta_ci(m_stats['data'], b_stats['data']) - row = [name, "%.3f" % m_ms, "%.3f" % b_ms, - "%+.2f" % delta_pct, - "[%+6.2f, %+6.2f]" % (lo, hi)] - if floor: - f = float(floor.get(name, 0.0)) - # Flag when the CI sits entirely outside +/- floor. - flagged = (lo > f) or (hi < -f) - row += ["%.2f" % f, "*" if flagged else ""] - print(fmt % tuple(row)) + m = master[name] * 1000.0 + b = branch[name] * 1000.0 + d = b - m + p = (d / m) * 100.0 if m else float('nan') + print("%-40s %16.3f %16.3f %+14.3f %+9.2f%%" % (name, m, b, d, p)) only_master = sorted(set(master) - set(branch)) only_branch = sorted(set(branch) - set(master)) @@ -90,14 +40,9 @@ def main(args): print("Only in branch: %s" % ", ".join(only_branch)) -def _parse_args(): - p = argparse.ArgumentParser() - p.add_argument('master', help='pytest-benchmark JSON for master') - p.add_argument('branch', help='pytest-benchmark JSON for this branch') - p.add_argument('--noise-floor', - help='JSON produced by calibrate.py; enables flag column') - return p.parse_args() - - if __name__ == '__main__': - main(_parse_args()) + if len(sys.argv) != 3: + print("usage: python compare.py MASTER.json BRANCH.json", + file=sys.stderr) + sys.exit(2) + main(sys.argv[1], sys.argv[2]) From de4d7a83af2664ca4ca98b64cfd9cae9c108c02d Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 20:00:12 +0100 Subject: [PATCH 13/21] Stuff --- .circleci/config.yml | 2 +- tests/perf/compare.py | 40 ++++++++++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 529dd5f..f15d224 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -155,6 +155,6 @@ workflows: context: - common-config - perf_bench: - name: "Run insert/select performance benchmark, compared to master" + name: "Run performance benchmarks, comparing results to master" context: - common-config diff --git a/tests/perf/compare.py b/tests/perf/compare.py index 4ce38b9..6b34b25 100644 --- a/tests/perf/compare.py +++ b/tests/perf/compare.py @@ -2,11 +2,13 @@ """Compare two pytest-benchmark JSON files (master vs branch). Prints a table of master min, branch min, and the absolute + percentage -delta per test. Min is the least noisy summary; on a quiet enough -runner the delta % is straight-up meaningful. +delta per test. Exits non-zero if any test regressed by more than +--fail-threshold (default 10%), so CI turns a real regression into a +failed build. Improvements never fail the build. """ from __future__ import print_function +import argparse import json import sys @@ -17,20 +19,24 @@ def _load(path): return {b['name']: b['stats']['min'] for b in data['benchmarks']} -def main(master_path, branch_path): - master = _load(master_path) - branch = _load(branch_path) +def main(args): + master = _load(args.master) + branch = _load(args.branch) print("%-40s %16s %16s %14s %10s" % ( "Test", "master min (ms)", "branch min (ms)", "delta (ms)", "delta %")) print("-" * 100) + + regressed = [] for name in sorted(set(master) & set(branch)): m = master[name] * 1000.0 b = branch[name] * 1000.0 d = b - m p = (d / m) * 100.0 if m else float('nan') print("%-40s %16.3f %16.3f %+14.3f %+9.2f%%" % (name, m, b, d, p)) + if p > args.fail_threshold: + regressed.append((name, p)) only_master = sorted(set(master) - set(branch)) only_branch = sorted(set(branch) - set(master)) @@ -39,10 +45,24 @@ def main(master_path, branch_path): if only_branch: print("Only in branch: %s" % ", ".join(only_branch)) + print() + if regressed: + print("FAIL: %d test(s) regressed by more than %.2f%%:" + % (len(regressed), args.fail_threshold)) + for name, p in regressed: + print(" %s: %+.2f%%" % (name, p)) + sys.exit(1) + print("OK: no test regressed by more than %.2f%%" % args.fail_threshold) + + +def _parse_args(): + p = argparse.ArgumentParser() + p.add_argument('master', help='pytest-benchmark JSON for master') + p.add_argument('branch', help='pytest-benchmark JSON for this branch') + p.add_argument('--fail-threshold', type=float, default=10.0, + help='percent slowdown that fails the build (default 10)') + return p.parse_args() + if __name__ == '__main__': - if len(sys.argv) != 3: - print("usage: python compare.py MASTER.json BRANCH.json", - file=sys.stderr) - sys.exit(2) - main(sys.argv[1], sys.argv[2]) + main(_parse_args()) From 9e519852f1b1f13f8c2a25cfc727faa92dec6edf Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 20:18:10 +0100 Subject: [PATCH 14/21] try and find less noise --- .circleci/config.yml | 8 +--- tests/perf/run_perf.py | 105 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 6 deletions(-) create mode 100644 tests/perf/run_perf.py diff --git a/.circleci/config.yml b/.circleci/config.yml index f15d224..efb6d12 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -123,9 +123,7 @@ jobs: cd /tmp/base make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark - python3 -m pytest tests/perf --run-perf --benchmark-only \ - --benchmark-json=/tmp/master.json \ - --benchmark-columns=min,mean,median,stddev,rounds + python3 tests/perf/run_perf.py --output /tmp/master.json - run: name: Reset DB command: | @@ -136,9 +134,7 @@ jobs: command: | make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark - python3 -m pytest tests/perf --run-perf --benchmark-only \ - --benchmark-json=artifacts/branch.json \ - --benchmark-columns=min,mean,median,stddev,rounds + python3 tests/perf/run_perf.py --output artifacts/branch.json python3 tests/perf/compare.py \ /tmp/master.json artifacts/branch.json \ | tee artifacts/perf_diff.txt diff --git a/tests/perf/run_perf.py b/tests/perf/run_perf.py new file mode 100644 index 0000000..4898d50 --- /dev/null +++ b/tests/perf/run_perf.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +"""Run the perf suite, then re-run any test whose stddev is too high. + +pytest-benchmark reports stddev per test. If a test's coefficient of +variation (stddev / mean) is above --cv-threshold we consider that run +untrustworthy and rerun *just that test* (via pytest -k), keeping the +lowest observed min across attempts. We stop when every test is quiet +or --max-attempts is reached; either way we always produce an output +JSON so compare.py can run. + +CLI is deliberately narrow: point it at an output path and it does the +right pytest invocation for our perf suite. +""" +from __future__ import print_function + +import argparse +import json +import os +import subprocess +import sys + + +def _cv_pct(stats): + m = stats.get('mean', 0.0) + return (stats['stddev'] / m) * 100.0 if m else 0.0 + + +def _load(path): + with open(path) as f: + return json.load(f) + + +def _noisy(merged, threshold): + return [b['name'] for b in merged['benchmarks'] + if _cv_pct(b['stats']) > threshold] + + +def _merge_min(base, new): + """For each test, replace the base entry if `new` observed a lower min.""" + idx = {b['name']: i for i, b in enumerate(base['benchmarks'])} + for b in new['benchmarks']: + i = idx.get(b['name']) + if i is None: + base['benchmarks'].append(b) + idx[b['name']] = len(base['benchmarks']) - 1 + elif b['stats']['min'] < base['benchmarks'][i]['stats']['min']: + base['benchmarks'][i] = b + return base + + +def _pytest(output, k_filter=None): + cmd = [sys.executable, '-m', 'pytest', 'tests/perf', + '--run-perf', '--benchmark-only', + '--benchmark-json=' + output, + '--benchmark-columns=min,mean,median,stddev,rounds'] + if k_filter: + cmd += ['-k', k_filter] + r = subprocess.run(cmd) + if r.returncode != 0: + raise SystemExit(r.returncode) + + +def main(args): + tmp_dir = os.path.dirname(os.path.abspath(args.output)) or '.' + base_name = os.path.basename(args.output) + + first = os.path.join(tmp_dir, base_name + '.attempt_1') + _pytest(first) + merged = _load(first) + + for attempt in range(2, args.max_attempts + 1): + noisy = _noisy(merged, args.cv_threshold) + if not noisy: + print("All tests within CV %.1f%% after %d attempt(s)" + % (args.cv_threshold, attempt - 1)) + break + print("Attempt %d/%d: rerunning %s" + % (attempt, args.max_attempts, ", ".join(noisy))) + out = os.path.join(tmp_dir, base_name + ('.attempt_%d' % attempt)) + _pytest(out, k_filter=" or ".join(noisy)) + merged = _merge_min(merged, _load(out)) + else: + still_noisy = _noisy(merged, args.cv_threshold) + if still_noisy: + print("WARN: %d test(s) still above CV %.1f%% after %d attempts: %s" + % (len(still_noisy), args.cv_threshold, + args.max_attempts, ", ".join(still_noisy))) + + with open(args.output, 'w') as f: + json.dump(merged, f) + + +def _parse_args(): + p = argparse.ArgumentParser() + p.add_argument('--output', required=True, + help='final merged pytest-benchmark JSON path') + p.add_argument('--cv-threshold', type=float, default=5.0, + help='per-test CV%% ceiling before retrying (default 5)') + p.add_argument('--max-attempts', type=int, default=3, + help='max attempts including the first run (default 3)') + return p.parse_args() + + +if __name__ == '__main__': + main(_parse_args()) From f48a1216efce6252289dc179060e50ad97639fd4 Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 20:32:34 +0100 Subject: [PATCH 15/21] Try more rounds to get more stability --- .circleci/config.yml | 8 +- tests/perf/run_perf.py | 105 ------------------------- tests/perf/test_insert_select_bench.py | 22 +++--- 3 files changed, 17 insertions(+), 118 deletions(-) delete mode 100644 tests/perf/run_perf.py diff --git a/.circleci/config.yml b/.circleci/config.yml index efb6d12..f15d224 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -123,7 +123,9 @@ jobs: cd /tmp/base make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark - python3 tests/perf/run_perf.py --output /tmp/master.json + python3 -m pytest tests/perf --run-perf --benchmark-only \ + --benchmark-json=/tmp/master.json \ + --benchmark-columns=min,mean,median,stddev,rounds - run: name: Reset DB command: | @@ -134,7 +136,9 @@ jobs: command: | make PYTHON=python3 install $HOME/.local/bin/pip install pytest-benchmark - python3 tests/perf/run_perf.py --output artifacts/branch.json + python3 -m pytest tests/perf --run-perf --benchmark-only \ + --benchmark-json=artifacts/branch.json \ + --benchmark-columns=min,mean,median,stddev,rounds python3 tests/perf/compare.py \ /tmp/master.json artifacts/branch.json \ | tee artifacts/perf_diff.txt diff --git a/tests/perf/run_perf.py b/tests/perf/run_perf.py deleted file mode 100644 index 4898d50..0000000 --- a/tests/perf/run_perf.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- -"""Run the perf suite, then re-run any test whose stddev is too high. - -pytest-benchmark reports stddev per test. If a test's coefficient of -variation (stddev / mean) is above --cv-threshold we consider that run -untrustworthy and rerun *just that test* (via pytest -k), keeping the -lowest observed min across attempts. We stop when every test is quiet -or --max-attempts is reached; either way we always produce an output -JSON so compare.py can run. - -CLI is deliberately narrow: point it at an output path and it does the -right pytest invocation for our perf suite. -""" -from __future__ import print_function - -import argparse -import json -import os -import subprocess -import sys - - -def _cv_pct(stats): - m = stats.get('mean', 0.0) - return (stats['stddev'] / m) * 100.0 if m else 0.0 - - -def _load(path): - with open(path) as f: - return json.load(f) - - -def _noisy(merged, threshold): - return [b['name'] for b in merged['benchmarks'] - if _cv_pct(b['stats']) > threshold] - - -def _merge_min(base, new): - """For each test, replace the base entry if `new` observed a lower min.""" - idx = {b['name']: i for i, b in enumerate(base['benchmarks'])} - for b in new['benchmarks']: - i = idx.get(b['name']) - if i is None: - base['benchmarks'].append(b) - idx[b['name']] = len(base['benchmarks']) - 1 - elif b['stats']['min'] < base['benchmarks'][i]['stats']['min']: - base['benchmarks'][i] = b - return base - - -def _pytest(output, k_filter=None): - cmd = [sys.executable, '-m', 'pytest', 'tests/perf', - '--run-perf', '--benchmark-only', - '--benchmark-json=' + output, - '--benchmark-columns=min,mean,median,stddev,rounds'] - if k_filter: - cmd += ['-k', k_filter] - r = subprocess.run(cmd) - if r.returncode != 0: - raise SystemExit(r.returncode) - - -def main(args): - tmp_dir = os.path.dirname(os.path.abspath(args.output)) or '.' - base_name = os.path.basename(args.output) - - first = os.path.join(tmp_dir, base_name + '.attempt_1') - _pytest(first) - merged = _load(first) - - for attempt in range(2, args.max_attempts + 1): - noisy = _noisy(merged, args.cv_threshold) - if not noisy: - print("All tests within CV %.1f%% after %d attempt(s)" - % (args.cv_threshold, attempt - 1)) - break - print("Attempt %d/%d: rerunning %s" - % (attempt, args.max_attempts, ", ".join(noisy))) - out = os.path.join(tmp_dir, base_name + ('.attempt_%d' % attempt)) - _pytest(out, k_filter=" or ".join(noisy)) - merged = _merge_min(merged, _load(out)) - else: - still_noisy = _noisy(merged, args.cv_threshold) - if still_noisy: - print("WARN: %d test(s) still above CV %.1f%% after %d attempts: %s" - % (len(still_noisy), args.cv_threshold, - args.max_attempts, ", ".join(still_noisy))) - - with open(args.output, 'w') as f: - json.dump(merged, f) - - -def _parse_args(): - p = argparse.ArgumentParser() - p.add_argument('--output', required=True, - help='final merged pytest-benchmark JSON path') - p.add_argument('--cv-threshold', type=float, default=5.0, - help='per-test CV%% ceiling before retrying (default 5)') - p.add_argument('--max-attempts', type=int, default=3, - help='max attempts including the first run (default 3)') - return p.parse_args() - - -if __name__ == '__main__': - main(_parse_args()) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index 7e02ecf..21f6fb9 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -36,7 +36,7 @@ _DDL_TRUNCATE = "TRUNCATE TABLE perf_bench" _SMALL = 100 -_LARGE = 100_000 +_LARGE = 20_000 def _rows(n): @@ -83,7 +83,7 @@ def setup(): con.close() def test_insert_large(self, benchmark): - """100k rows via executemany """ + """20k rows via executemany """ con = self._connect() try: self._reset(con) @@ -100,7 +100,7 @@ def setup(): con.commit() benchmark.pedantic(target, setup=setup, warmup_rounds=2, - rounds=30, + rounds=100, iterations=1) finally: con.close() @@ -125,7 +125,7 @@ def target(): con.close() def test_fetchall_large(self, benchmark): - """fetchall over 100k rows. """ + """fetchall over 20k rows. """ con = self._connect() try: self._seed(con, _LARGE) @@ -135,14 +135,14 @@ def target(): cur.execute("SELECT a, b FROM perf_bench") return cur.fetchall() - rows = benchmark.pedantic(target, warmup_rounds=2, rounds=30, + rows = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert len(rows) == _LARGE finally: con.close() def test_fetchmany_large(self, benchmark): - """fetchmany(1000) over 100k rows""" + """fetchmany(1000) over 20k rows""" con = self._connect() try: self._seed(con, _LARGE) @@ -158,14 +158,14 @@ def target(): total += len(batch) return total - total = benchmark.pedantic(target, warmup_rounds=2, rounds=30, + total = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert total == _LARGE finally: con.close() def test_fetchone_loop_large(self, benchmark): - """fetchone() in a loop over 100k rows. Isolates per-row + """fetchone() in a loop over 20k rows. Isolates per-row overhead """ con = self._connect() try: @@ -182,7 +182,7 @@ def target(): n += 1 return n - n = benchmark.pedantic(target, warmup_rounds=2, rounds=30, + n = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert n == _LARGE finally: @@ -216,7 +216,7 @@ def target(): cur.execute("SELECT %s FROM perf_wide" % col_names) return cur.fetchall() - result = benchmark.pedantic(target, warmup_rounds=2, rounds=30, + result = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert len(result) == self._WIDE_ROWS assert len(result[0]) == self._WIDE_COLS @@ -257,7 +257,7 @@ def target(): cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed") return cur.fetchall() - result = benchmark.pedantic(target, warmup_rounds=2, rounds=30, + result = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert len(result) == _LARGE finally: From 7a5715ff97b1901786f7cab524bbca9bee2becc7 Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 20:41:48 +0100 Subject: [PATCH 16/21] Reduce failure threshold to 15% --- tests/perf/compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/perf/compare.py b/tests/perf/compare.py index 6b34b25..4f92af3 100644 --- a/tests/perf/compare.py +++ b/tests/perf/compare.py @@ -59,8 +59,8 @@ def _parse_args(): p = argparse.ArgumentParser() p.add_argument('master', help='pytest-benchmark JSON for master') p.add_argument('branch', help='pytest-benchmark JSON for this branch') - p.add_argument('--fail-threshold', type=float, default=10.0, - help='percent slowdown that fails the build (default 10)') + p.add_argument('--fail-threshold', type=float, default=15.0, + help='percent slowdown that fails the build (default 15)') return p.parse_args() From f0481594caed2c646e63bcb5c6887dd5180d25ce Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 20:52:59 +0100 Subject: [PATCH 17/21] Stuff --- tests/perf/test_insert_select_bench.py | 29 ++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index 21f6fb9..fc269c9 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -20,11 +20,33 @@ cursor and Cython PRs; that's the point. """ +import math +import time + import pytest from tests import nuodb_base +# Small tests have tiny absolute times (sub-ms to a few ms), so per-round +# jitter dominates unless we run *both* many rounds *and* for a long enough +# total wall-time to average out kernel/network noise. This helper probes +# a single call to size `rounds` so that rounds * per_call_time >= min_seconds, +# with a floor of `min_rounds`. +def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5): + total = 0.0 + for _ in range(probes): + if setup is not None: + setup() + t0 = time.perf_counter() + target() + total += time.perf_counter() - t0 + per_call = total / probes + if per_call <= 0: + return min_rounds + return max(min_rounds, int(math.ceil(min_seconds / per_call))) + + # Skip this whole module unless `--run-perf` is passed on the pytest # command line. We don't want `make fulltest` to sit through a 100k-row # insert on every commit. @@ -77,8 +99,10 @@ def setup(): cur.execute(_DDL_TRUNCATE) con.commit() + rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0, + setup=setup) benchmark.pedantic(target, setup=setup, warmup_rounds=5, - rounds=200, iterations=1) + rounds=rounds, iterations=1) finally: con.close() @@ -118,7 +142,8 @@ def target(): cur.execute("SELECT a, b FROM perf_bench") return cur.fetchall() - rows = benchmark.pedantic(target, warmup_rounds=5, rounds=200, + rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0) + rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds, iterations=1) assert len(rows) == _SMALL finally: From 03bc92740e6d4d432e49175185277d27690df867 Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 21:02:23 +0100 Subject: [PATCH 18/21] try to increase iterations to reduce noise --- tests/perf/test_insert_select_bench.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index fc269c9..8b24750 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -33,7 +33,8 @@ # total wall-time to average out kernel/network noise. This helper probes # a single call to size `rounds` so that rounds * per_call_time >= min_seconds, # with a floor of `min_rounds`. -def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5): +def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5, + iterations=1): total = 0.0 for _ in range(probes): if setup is not None: @@ -44,7 +45,8 @@ def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5): per_call = total / probes if per_call <= 0: return min_rounds - return max(min_rounds, int(math.ceil(min_seconds / per_call))) + per_round = per_call * iterations + return max(min_rounds, int(math.ceil(min_seconds / per_round))) # Skip this whole module unless `--run-perf` is passed on the pytest @@ -100,9 +102,9 @@ def setup(): con.commit() rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0, - setup=setup) + setup=setup, iterations=10) benchmark.pedantic(target, setup=setup, warmup_rounds=5, - rounds=rounds, iterations=1) + rounds=rounds, iterations=10) finally: con.close() @@ -142,9 +144,10 @@ def target(): cur.execute("SELECT a, b FROM perf_bench") return cur.fetchall() - rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0) + rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0, + iterations=10) rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds, - iterations=1) + iterations=10) assert len(rows) == _SMALL finally: con.close() From 19a8d1def79a27aca97546e1983a40765ce99895 Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 21:11:51 +0100 Subject: [PATCH 19/21] thing --- tests/perf/test_insert_select_bench.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index 8b24750..6285491 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -84,7 +84,13 @@ def _seed(self, con, n): # -- INSERT --------------------------------------------------------- def test_insert_small(self, benchmark): - """100 rows via executemany. Sensitive to per-row putValue cost.""" + """100 rows via executemany. Sensitive to per-row putValue cost. + + Uses iterations=10 to batch-average away kernel jitter that + otherwise dominates at these tiny per-call times. pytest-benchmark + forbids setup+iterations>1, so the TRUNCATE lives inside the timed + target — both master and branch pay it equally. + """ con = self._connect() try: self._reset(con) @@ -92,18 +98,14 @@ def test_insert_small(self, benchmark): rows = _rows(_SMALL) def target(): + cur.execute(_DDL_TRUNCATE) cur.executemany( "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows) con.commit() - def setup(): - # Runs before each round but is NOT included in the timing. - cur.execute(_DDL_TRUNCATE) - con.commit() - rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0, - setup=setup, iterations=10) - benchmark.pedantic(target, setup=setup, warmup_rounds=5, + iterations=10) + benchmark.pedantic(target, warmup_rounds=5, rounds=rounds, iterations=10) finally: con.close() From cd6975c947d1d51e651838857a551689d6b1112b Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 21:27:48 +0100 Subject: [PATCH 20/21] Try to stabilise --- tests/perf/test_insert_select_bench.py | 31 +++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index 6285491..86ba47d 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -59,7 +59,7 @@ def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5, _DDL_CREATE = "CREATE TABLE perf_bench (a INT, b VARCHAR(64))" _DDL_TRUNCATE = "TRUNCATE TABLE perf_bench" -_SMALL = 100 +_SMALL = 1000 _LARGE = 20_000 @@ -84,29 +84,31 @@ def _seed(self, con, n): # -- INSERT --------------------------------------------------------- def test_insert_small(self, benchmark): - """100 rows via executemany. Sensitive to per-row putValue cost. + """1000 rows via executemany. Sensitive to per-row putValue cost. - Uses iterations=10 to batch-average away kernel jitter that - otherwise dominates at these tiny per-call times. pytest-benchmark - forbids setup+iterations>1, so the TRUNCATE lives inside the timed - target — both master and branch pay it equally. + Sized so per-call time is a few tens of ms — small enough to + exercise the fixed per-query overhead, large enough that TRUNCATE + (out-of-band via setup) and kernel jitter don't dominate the min. """ con = self._connect() try: self._reset(con) cur = con.cursor() - rows = _rows(_SMALL) + rows = _rows(1000) def target(): - cur.execute(_DDL_TRUNCATE) cur.executemany( "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows) con.commit() + def setup(): + cur.execute(_DDL_TRUNCATE) + con.commit() + rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0, - iterations=10) - benchmark.pedantic(target, warmup_rounds=5, - rounds=rounds, iterations=10) + setup=setup) + benchmark.pedantic(target, setup=setup, warmup_rounds=5, + rounds=rounds, iterations=1) finally: con.close() @@ -136,7 +138,7 @@ def setup(): # -- SELECT --------------------------------------------------------- def test_fetchall_small(self, benchmark): - """fetchall over 100 rows. Sensitive to fixed per-query overhead.""" + """fetchall over 1000 rows. Sensitive to fixed per-query overhead.""" con = self._connect() try: self._seed(con, _SMALL) @@ -146,10 +148,9 @@ def target(): cur.execute("SELECT a, b FROM perf_bench") return cur.fetchall() - rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0, - iterations=10) + rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0) rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds, - iterations=10) + iterations=1) assert len(rows) == _SMALL finally: con.close() From 89406ed013da79ed5f19eb805fd9292b372710e0 Mon Sep 17 00:00:00 2001 From: Martin Gallwey Date: Wed, 1 Jul 2026 21:52:41 +0100 Subject: [PATCH 21/21] try to clean up a bit --- tests/perf/compare.py | 7 +-- tests/perf/test_insert_select_bench.py | 79 +++++++------------------- 2 files changed, 23 insertions(+), 63 deletions(-) diff --git a/tests/perf/compare.py b/tests/perf/compare.py index 4f92af3..82d83e7 100644 --- a/tests/perf/compare.py +++ b/tests/perf/compare.py @@ -23,9 +23,7 @@ def main(args): master = _load(args.master) branch = _load(args.branch) - print("%-40s %16s %16s %14s %10s" % ( - "Test", "master min (ms)", "branch min (ms)", - "delta (ms)", "delta %")) + print("%-40s %16s %16s %14s %10s" % ( "Test", "master min (ms)", "branch min (ms)", "delta (ms)", "delta %")) print("-" * 100) regressed = [] @@ -47,8 +45,7 @@ def main(args): print() if regressed: - print("FAIL: %d test(s) regressed by more than %.2f%%:" - % (len(regressed), args.fail_threshold)) + print("FAIL: %d test(s) regressed by more than %.2f%%:" % (len(regressed), args.fail_threshold)) for name, p in regressed: print(" %s: %+.2f%%" % (name, p)) sys.exit(1) diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py index 86ba47d..ab3d8b9 100644 --- a/tests/perf/test_insert_select_bench.py +++ b/tests/perf/test_insert_select_bench.py @@ -77,24 +77,18 @@ def _reset(self, con): def _seed(self, con, n): self._reset(con) - con.cursor().executemany( - "INSERT INTO perf_bench (a, b) VALUES (?, ?)", _rows(n)) + con.cursor().executemany("INSERT INTO perf_bench (a, b) VALUES (?, ?)", _rows(n)) con.commit() # -- INSERT --------------------------------------------------------- def test_insert_small(self, benchmark): - """1000 rows via executemany. Sensitive to per-row putValue cost. - - Sized so per-call time is a few tens of ms — small enough to - exercise the fixed per-query overhead, large enough that TRUNCATE - (out-of-band via setup) and kernel jitter don't dominate the min. - """ + """1000 rows via executemany. Sensitive to per-row putValue cost. """ con = self._connect() try: self._reset(con) cur = con.cursor() - rows = _rows(1000) + rows = _rows(_SMALL) def target(): cur.executemany( @@ -105,10 +99,8 @@ def setup(): cur.execute(_DDL_TRUNCATE) con.commit() - rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0, - setup=setup) - benchmark.pedantic(target, setup=setup, warmup_rounds=5, - rounds=rounds, iterations=1) + rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0, setup=setup) + benchmark.pedantic(target, setup=setup, warmup_rounds=5, rounds=rounds, iterations=1) finally: con.close() @@ -121,24 +113,21 @@ def test_insert_large(self, benchmark): rows = _rows(_LARGE) def target(): - cur.executemany( - "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows) + cur.executemany( "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows) con.commit() def setup(): cur.execute(_DDL_TRUNCATE) con.commit() - benchmark.pedantic(target, setup=setup, warmup_rounds=2, - rounds=100, - iterations=1) + benchmark.pedantic(target, setup=setup, warmup_rounds=2, rounds=100, iterations=1) finally: con.close() # -- SELECT --------------------------------------------------------- def test_fetchall_small(self, benchmark): - """fetchall over 1000 rows. Sensitive to fixed per-query overhead.""" + """fetchall over 1000 rows.""" con = self._connect() try: self._seed(con, _SMALL) @@ -149,8 +138,7 @@ def target(): return cur.fetchall() rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0) - rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds, - iterations=1) + rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds, iterations=1) assert len(rows) == _SMALL finally: con.close() @@ -166,8 +154,7 @@ def target(): cur.execute("SELECT a, b FROM perf_bench") return cur.fetchall() - rows = benchmark.pedantic(target, warmup_rounds=2, rounds=100, - iterations=1) + rows = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert len(rows) == _LARGE finally: con.close() @@ -189,15 +176,13 @@ def target(): total += len(batch) return total - total = benchmark.pedantic(target, warmup_rounds=2, rounds=100, - iterations=1) + total = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert total == _LARGE finally: con.close() def test_fetchone_loop_large(self, benchmark): - """fetchone() in a loop over 20k rows. Isolates per-row - overhead """ + """fetchone() in a loop over 20k rows. Isolates per-row overhead """ con = self._connect() try: self._seed(con, _LARGE) @@ -213,8 +198,7 @@ def target(): n += 1 return n - n = benchmark.pedantic(target, warmup_rounds=2, rounds=100, - iterations=1) + n = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert n == _LARGE finally: con.close() @@ -237,59 +221,38 @@ def test_fetchall_wide(self, benchmark): cur.execute("CREATE TABLE perf_wide (%s)" % (", ".join(cols),)) con.commit() rows = [tuple(range(self._WIDE_COLS)) for _ in range(self._WIDE_ROWS)] - cur.executemany( - "INSERT INTO perf_wide (%s) VALUES (%s)" - % (col_names, placeholders), - rows) + cur.executemany( "INSERT INTO perf_wide (%s) VALUES (%s)" % (col_names, placeholders), rows) con.commit() def target(): cur.execute("SELECT %s FROM perf_wide" % col_names) return cur.fetchall() - result = benchmark.pedantic(target, warmup_rounds=2, rounds=100, - iterations=1) + result = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert len(result) == self._WIDE_ROWS assert len(result[0]) == self._WIDE_COLS finally: con.close() def test_fetchall_mixed_types(self, benchmark): - """SELECT with variety of types: int / decimal / double / timestamp / bool / - varchar / null. """ + """SELECT with variety of types: int / decimal / double / timestamp / bool / varchar / null. """ con = self._connect() try: cur = con.cursor() cur.execute("DROP TABLE IF EXISTS perf_mixed") cur.execute( - "CREATE TABLE perf_mixed (" - " i INT," - " d DECIMAL(12, 4)," - " f DOUBLE," - " ts TIMESTAMP," - " bl BOOLEAN," - " s VARCHAR(64)," - " n INT" - ")") + "CREATE TABLE perf_mixed (i INT, d DECIMAL(12, 4), f DOUBLE, ts TIMESTAMP," + "bl BOOLEAN, s VARCHAR(64), n INT)") con.commit() - rows = [ - (i, i * 1.25, i / 3.0, - '2024-01-01 12:34:56', bool(i & 1), - 'row #%d' % i, None) - for i in range(_LARGE) - ] - cur.executemany( - "INSERT INTO perf_mixed (i, d, f, ts, bl, s, n)" - " VALUES (?, ?, ?, ?, ?, ?, ?)", - rows) + rows = [ (i, i * 1.25, i / 3.0, '2024-01-01 12:34:56', bool(i & 1), 'row #%d' % i, None) for i in range(_LARGE) ] + cur.executemany( "INSERT INTO perf_mixed (i, d, f, ts, bl, s, n)" " VALUES (?, ?, ?, ?, ?, ?, ?)", rows) con.commit() def target(): cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed") return cur.fetchall() - result = benchmark.pedantic(target, warmup_rounds=2, rounds=100, - iterations=1) + result = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1) assert len(result) == _LARGE finally: con.close()