From 93a248057436a65eeae6ffedbf05527e96ee53cd Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 15:26:30 +0100
Subject: [PATCH 01/21] Attempt to add performance tests

---
 .circleci/config.yml                   |  67 +++++++++++
 test-performance/timesInsert.py        |  82 -------------
 test_requirements.txt                  |   1 +
 tests/conftest.py                      |  20 ++++
 tests/perf/__init__.py                 |   0
 tests/perf/test_insert_select_bench.py | 158 +++++++++++++++++++++++++
 6 files changed, 246 insertions(+), 82 deletions(-)
 delete mode 100644 test-performance/timesInsert.py
 create mode 100644 tests/perf/__init__.py
 create mode 100644 tests/perf/test_insert_select_bench.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 4fc6d2e..4903d2d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -73,6 +73,69 @@ jobs:
       - after_failure:
           when : "on_fail"
 
+  perf_bench:
+    description: "Run insert/select benchmarks on master and on this branch, print the diff"
+    docker:
+      - image: nuodb/nuodb:latest
+        user: root
+    resource_class: medium
+    environment:
+      TZ : America/New_York
+      NUO_SET_TLS : disable
+      NUOCMD_CLIENT_KEY : ""
+      NUOCMD_VERIFY_SERVER : ""
+      NUOCMD_PLUGINS : ""
+    steps:
+      - checkout
+      - run:
+          name: Install build tools
+          command: |
+            PYVER=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
+            dnf install -y make gcc "python${PYVER}-devel"
+      - run:
+          name: Install pip
+          command: |
+            curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py
+            python3 /tmp/get-pip.py --user
+      - run:
+          name: Make artifact directories
+          command: mkdir -p artifacts results
+      - run:
+          name: Start NuoDB Admin
+          command: |
+            sudo -u nuodb /opt/nuodb/etc/nuoadmin tls $NUO_SET_TLS
+            sudo -u nuodb /opt/nuodb/etc/nuoadmin tls status
+            sudo -u nuodb /opt/nuodb/etc/nuoadmin start
+            sudo -u nuodb /opt/nuodb/bin/nuocmd --show-json get effective-license
+      # Same runner, same NuoDB instance: run master first for a
+      # baseline, then this branch, then let pytest-benchmark print the
+      # per-test diff.  Hardware noise cancels because both runs share
+      # the same container.
+      - run:
+          name: Baseline benchmarks on master
+          command: |
+            git worktree add /tmp/base origin/master
+            cd /tmp/base
+            make PYTHON=python3 install
+            $HOME/.local/bin/pip install pytest-benchmark
+            python3 -m pytest tests/perf --run-perf --benchmark-only \
+                --benchmark-json=/tmp/baseline.json \
+                --benchmark-columns=min,mean,median,stddev,rounds
+      - run:
+          name: Branch benchmarks + diff vs master
+          command: |
+            make PYTHON=python3 install
+            $HOME/.local/bin/pip install pytest-benchmark
+            python3 -m pytest tests/perf --run-perf --benchmark-only \
+                --benchmark-json=artifacts/pr.json \
+                --benchmark-compare=/tmp/baseline.json \
+                --benchmark-columns=min,mean,median,stddev,rounds \
+                | tee artifacts/perf_diff.txt
+      - store_artifacts:
+          path: artifacts
+      - after_failure:
+          when : "on_fail"
+
 workflows:
   build-project:
     jobs:
@@ -80,3 +143,7 @@ workflows:
           name: "Build and run regression tests"
           context:
             - common-config
+      - perf_bench:
+          name: "Insert/select benchmark comparison"
+          context:
+            - common-config
diff --git a/test-performance/timesInsert.py b/test-performance/timesInsert.py
deleted file mode 100644
index 9ee32bd..0000000
--- a/test-performance/timesInsert.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# A database named test with user dba / password dba must be created first
-
-import os
-import time
-
-import pynuodb
-
-smallIterations = 100
-largeIterations = smallIterations * 1000
-
-
-def gettime():
-    return time.time()
-
-
-def insert(count):
-    for i in range(count):
-        cursor.execute("INSERT INTO perf_test (a,b ) VALUES (%d,'A')" % i)
-    connection.commit()
-
-
-def select():
-    cursor.execute("select * from perf_test")
-    cursor.fetchall()
-
-
-dropTable = "drop table perf_test cascade if exists"
-createTable = "create table perf_test (a int,b char)"
-
-port = os.environ.get('NUODB_PORT')
-if not port:
-    port = '48004'
-
-options = {}
-trustStore = os.environ.get('NUOCMD_VERIFY_SERVER')
-if trustStore:
-    options = {'trustStore': trustStore, 'verifyHostname': 'False'}
-
-connection = pynuodb.connect("test", "localhost:" + port, "dba", "dba",
-                             options=options)
-cursor = connection.cursor()
-cursor.execute("use test")
-
-# Begin SMALL_INSERT_ITERATIONS test
-cursor.execute(dropTable)
-cursor.execute(createTable)
-start = gettime()
-insert(smallIterations)
-smallInsertElapsed = gettime() - start
-
-print("Elapse time of SMALL_INSERT_ITERATIONS = %.4fs" % (smallInsertElapsed))
-
-# Begin SMALL_SELECT_ITERATIONS test
-start = gettime()
-select()
-smallSelectElapsed = gettime() - start
-print("Elapse time of SMALL_SELECT_ITERATIONS = %.4fs" % (smallSelectElapsed))
-
-# Begin LARGE_INSERT_ITERATIONS test
-cursor.execute(dropTable)
-cursor.execute(createTable)
-
-start = gettime()
-insert(largeIterations)
-largeInsertElapsed = gettime() - start
-
-print("Elapse time of LARGE_INSERT_ITERATIONS = %.4fs" % (largeInsertElapsed))
-
-# Begin LARGE_SELECT_ITERATIONS test
-start = gettime()
-select()
-largeSelectElapsed = gettime() - start
-
-print("Elapse time of LARGE_SELECT_ITERATIONS = %.4fs" % (largeSelectElapsed))
-
-if largeInsertElapsed > smallInsertElapsed * 1000:
-    print("Insert is too slow!")
-
-if largeSelectElapsed > smallSelectElapsed * 1000:
-    print("Select is too slow!")
-
-print("\n")
diff --git a/test_requirements.txt b/test_requirements.txt
index ef90a4a..2328d32 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -1,6 +1,7 @@
 mock>=1.0
 nose>=1.3
 pytest>=2.7
+pytest-benchmark>=4.0
 coverage>=3.7
 pytest-cov>=1.8.1
 coveralls>=0.5
diff --git a/tests/conftest.py b/tests/conftest.py
index fb7cd90..180e014 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -30,6 +30,26 @@
 
 from . import nuocmd, cvtjson
 
+
+def pytest_addoption(parser):
+    parser.addoption("--run-perf", action="store_true", default=False,
+                     help="run performance benchmarks under tests/perf")
+
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers",
+        "perf: performance benchmark; skipped unless --run-perf is passed")
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--run-perf"):
+        return
+    skip = pytest.mark.skip(reason="need --run-perf to run performance tests")
+    for item in items:
+        if "perf" in item.keywords:
+            item.add_marker(skip)
+
 _log = logging.getLogger("pynuodbtest")
 
 DB_OPTIONS = []  # type: List[str]
diff --git a/tests/perf/__init__.py b/tests/perf/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
new file mode 100644
index 0000000..2554b24
--- /dev/null
+++ b/tests/perf/test_insert_select_bench.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+"""Insert / select micro-benchmarks.
+
+(C) Copyright 2025 Dassault Systemes SE.  All Rights Reserved.
+
+This software is licensed under a BSD 3-Clause License.
+See the LICENSE file provided with this software.
+
+Ported from test-performance/timesInsert.py so the numbers live alongside
+the correctness suite and can be run via `pytest --benchmark-only`.
+
+Each test measures one operation the driver's hot paths care about:
+
+    * bulk INSERT via executemany (encode path, session send)
+    * fetchall over a small result set (decode + per-row dispatch)
+    * fetchall over a large result set (batched decode + refill loop)
+
+pytest-benchmark auto-repeats each function and reports min / mean /
+median / stddev.  Numbers move meaningfully with the crypt, session,
+cursor and Cython PRs; that's the point.
+"""
+
+import pytest
+
+from tests import nuodb_base
+
+
+# Skip this whole module unless `--run-perf` is passed on the pytest
+# command line.  We don't want `make fulltest` to sit through a 100k-row
+# insert on every commit.
+pytestmark = pytest.mark.perf
+
+
+_DDL_DROP   = "DROP TABLE IF EXISTS perf_bench"
+_DDL_CREATE = "CREATE TABLE perf_bench (a INT, b VARCHAR(64))"
+
+_SMALL = 100
+_LARGE = 100_000
+
+
+def _rows(n):
+    return [(i, 'A dark and stormy night %d' % i) for i in range(n)]
+
+
+class TestInsertSelectPerf(nuodb_base.NuoBase):
+
+    def _reset(self, con):
+        cur = con.cursor()
+        cur.execute(_DDL_DROP)
+        cur.execute(_DDL_CREATE)
+        con.commit()
+
+    def _seed(self, con, n):
+        self._reset(con)
+        con.cursor().executemany(
+            "INSERT INTO perf_bench (a, b) VALUES (?, ?)", _rows(n))
+        con.commit()
+
+    # -- INSERT ---------------------------------------------------------
+
+    def test_insert_small(self, benchmark):
+        """100 rows via executemany.  Sensitive to per-row putValue cost."""
+        con = self._connect()
+        try:
+            self._reset(con)
+            cur = con.cursor()
+            rows = _rows(_SMALL)
+
+            def target():
+                cur.executemany(
+                    "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows)
+                con.commit()
+                # Truncate between iterations so we measure a clean insert.
+                cur.execute(_DDL_DROP)
+                cur.execute(_DDL_CREATE)
+                con.commit()
+
+            benchmark(target)
+        finally:
+            con.close()
+
+    def test_insert_large(self, benchmark):
+        """100k rows via executemany """
+        con = self._connect()
+        try:
+            self._reset(con)
+            cur = con.cursor()
+            rows = _rows(_LARGE)
+
+            def target():
+                cur.executemany(
+                    "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows)
+                con.commit()
+                cur.execute(_DDL_DROP)
+                cur.execute(_DDL_CREATE)
+                con.commit()
+
+            # Large insert is slow; cap repetitions so a benchmark run
+            # finishes in seconds rather than minutes.
+            benchmark.pedantic(target, rounds=3, iterations=1)
+        finally:
+            con.close()
+
+    # -- SELECT ---------------------------------------------------------
+
+    def test_fetchall_small(self, benchmark):
+        """fetchall over 100 rows.  Sensitive to fixed per-query overhead."""
+        con = self._connect()
+        try:
+            self._seed(con, _SMALL)
+            cur = con.cursor()
+
+            def target():
+                cur.execute("SELECT a, b FROM perf_bench")
+                return cur.fetchall()
+
+            rows = benchmark(target)
+            assert len(rows) == _SMALL
+        finally:
+            con.close()
+
+    def test_fetchall_large(self, benchmark):
+        """fetchall over 100k rows. """
+        con = self._connect()
+        try:
+            self._seed(con, _LARGE)
+            cur = con.cursor()
+
+            def target():
+                cur.execute("SELECT a, b FROM perf_bench")
+                return cur.fetchall()
+
+            rows = benchmark.pedantic(target, rounds=5, iterations=1)
+            assert len(rows) == _LARGE
+        finally:
+            con.close()
+
+    def test_fetchmany_large(self, benchmark):
+        """fetchmany(1000) over 100k rows"""
+        con = self._connect()
+        try:
+            self._seed(con, _LARGE)
+            cur = con.cursor()
+
+            def target():
+                cur.execute("SELECT a, b FROM perf_bench")
+                total = 0
+                while True:
+                    batch = cur.fetchmany(1000)
+                    if not batch:
+                        break
+                    total += len(batch)
+                return total
+
+            total = benchmark.pedantic(target, rounds=5, iterations=1)
+            assert total == _LARGE
+        finally:
+            con.close()

From 4dc039c3bb9592018d8e036c707e615ea77d1f2c Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 15:30:37 +0100
Subject: [PATCH 02/21] Add fetchone, wide-row and mixed-type benchmarks

---
 tests/perf/test_insert_select_bench.py | 96 ++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index 2554b24..3846eb4 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -156,3 +156,99 @@ def target():
             assert total == _LARGE
         finally:
             con.close()
+
+    def test_fetchone_loop_large(self, benchmark):
+        """fetchone() in a loop over 100k rows.  Isolates per-row
+        overhead """
+        con = self._connect()
+        try:
+            self._seed(con, _LARGE)
+            cur = con.cursor()
+
+            def target():
+                cur.execute("SELECT a, b FROM perf_bench")
+                n = 0
+                while True:
+                    row = cur.fetchone()
+                    if row is None:
+                        break
+                    n += 1
+                return n
+
+            n = benchmark.pedantic(target, rounds=3, iterations=1)
+            assert n == _LARGE
+        finally:
+            con.close()
+
+    # -- Wide rows / mixed types ---------------------------------------
+
+    _WIDE_COLS = 50
+    _WIDE_ROWS = 1000
+
+    def test_fetchall_wide(self, benchmark):
+        """50 columns x 1000 rows """
+        cols = ["c%d INT" % i for i in range(self._WIDE_COLS)]
+        col_names = ", ".join("c%d" % i for i in range(self._WIDE_COLS))
+        placeholders = ", ".join(["?"] * self._WIDE_COLS)
+
+        con = self._connect()
+        try:
+            cur = con.cursor()
+            cur.execute("DROP TABLE IF EXISTS perf_wide")
+            cur.execute("CREATE TABLE perf_wide (%s)" % (", ".join(cols),))
+            con.commit()
+            rows = [tuple(range(self._WIDE_COLS)) for _ in range(self._WIDE_ROWS)]
+            cur.executemany(
+                "INSERT INTO perf_wide (%s) VALUES (%s)"
+                % (col_names, placeholders),
+                rows)
+            con.commit()
+
+            def target():
+                cur.execute("SELECT %s FROM perf_wide" % col_names)
+                return cur.fetchall()
+
+            result = benchmark.pedantic(target, rounds=5, iterations=1)
+            assert len(result) == self._WIDE_ROWS
+            assert len(result[0]) == self._WIDE_COLS
+        finally:
+            con.close()
+
+    def test_fetchall_mixed_types(self, benchmark):
+        """SELECT with variety of types: int / decimal / double / timestamp / bool /
+        varchar / null. """
+        con = self._connect()
+        try:
+            cur = con.cursor()
+            cur.execute("DROP TABLE IF EXISTS perf_mixed")
+            cur.execute(
+                "CREATE TABLE perf_mixed ("
+                "  i INT,"
+                "  d DECIMAL(12, 4),"
+                "  f DOUBLE,"
+                "  ts TIMESTAMP,"
+                "  bl BOOLEAN,"
+                "  s VARCHAR(64),"
+                "  n INT"
+                ")")
+            con.commit()
+            rows = [
+                (i, i * 1.25, i / 3.0,
+                 '2024-01-01 12:34:56', bool(i & 1),
+                 'row #%d' % i, None)
+                for i in range(_LARGE)
+            ]
+            cur.executemany(
+                "INSERT INTO perf_mixed (i, d, f, ts, bl, s, n)"
+                " VALUES (?, ?, ?, ?, ?, ?, ?)",
+                rows)
+            con.commit()
+
+            def target():
+                cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed")
+                return cur.fetchall()
+
+            result = benchmark.pedantic(target, rounds=5, iterations=1)
+            assert len(result) == _LARGE
+        finally:
+            con.close()

From d08e258afb19b5d5f54efdedecf6051cce20409f Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 15:39:27 +0100
Subject: [PATCH 03/21] Install git in perf_bench and fetch master explicitly

---
 .circleci/config.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 4903d2d..dcff0ea 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -91,7 +91,7 @@ jobs:
           name: Install build tools
           command: |
             PYVER=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
-            dnf install -y make gcc "python${PYVER}-devel"
+            dnf install -y git make gcc "python${PYVER}-devel"
       - run:
           name: Install pip
           command: |
@@ -114,7 +114,8 @@ jobs:
       - run:
           name: Baseline benchmarks on master
           command: |
-            git worktree add /tmp/base origin/master
+            git fetch --no-tags --depth=1 origin master
+            git worktree add /tmp/base FETCH_HEAD
             cd /tmp/base
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark

From 0068eea6839033ad36c8d8eab0624dc28c1aa777 Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 15:46:31 +0100
Subject: [PATCH 04/21] Add github.com to known_hosts before fetching master

---
 .circleci/config.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index dcff0ea..70a99c1 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -114,6 +114,8 @@ jobs:
       - run:
           name: Baseline benchmarks on master
           command: |
+            mkdir -p ~/.ssh
+            ssh-keyscan -H github.com >> ~/.ssh/known_hosts 2>/dev/null
             git fetch --no-tags --depth=1 origin master
             git worktree add /tmp/base FETCH_HEAD
             cd /tmp/base

From 9fd307b731d9ac63093dbd8c6558981ddcf6076e Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 16:08:18 +0100
Subject: [PATCH 05/21] Disable SSH host key checking for the master fetch

---
 .circleci/config.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 70a99c1..7661780 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -114,9 +114,8 @@ jobs:
       - run:
           name: Baseline benchmarks on master
           command: |
-            mkdir -p ~/.ssh
-            ssh-keyscan -H github.com >> ~/.ssh/known_hosts 2>/dev/null
-            git fetch --no-tags --depth=1 origin master
+            GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
+                git fetch --no-tags --depth=1 origin master
             git worktree add /tmp/base FETCH_HEAD
             cd /tmp/base
             make PYTHON=python3 install

From 4f8c7cf94c1ae6c453ecdd494c9a2e32edd41d6e Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 16:15:21 +0100
Subject: [PATCH 06/21] ensure same tests run on both branches

---
 .circleci/config.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 7661780..9eaea52 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -117,6 +117,10 @@ jobs:
             GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
                 git fetch --no-tags --depth=1 origin master
             git worktree add /tmp/base FETCH_HEAD
+            # Copy the perf suite + conftest hooks from this branch so the
+            # master worktree's driver is exercised by the same benchmarks.
+            cp -a tests/perf /tmp/base/tests/
+            cp tests/conftest.py /tmp/base/tests/conftest.py
             cd /tmp/base
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark

From c24a84d33fcf5c4a1131e4c3c4dc071d26f1c94d Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 16:25:09 +0100
Subject: [PATCH 07/21] Add compare.py for master-vs-branch diff; use 10 rounds

---
 .circleci/config.yml                   |  8 ++--
 tests/perf/compare.py                  | 55 ++++++++++++++++++++++++++
 tests/perf/test_insert_select_bench.py | 14 +++----
 3 files changed, 65 insertions(+), 12 deletions(-)
 create mode 100644 tests/perf/compare.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9eaea52..a9be12f 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -125,7 +125,7 @@ jobs:
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark
             python3 -m pytest tests/perf --run-perf --benchmark-only \
-                --benchmark-json=/tmp/baseline.json \
+                --benchmark-json=/tmp/master.json \
                 --benchmark-columns=min,mean,median,stddev,rounds
       - run:
           name: Branch benchmarks + diff vs master
@@ -133,9 +133,9 @@ jobs:
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark
             python3 -m pytest tests/perf --run-perf --benchmark-only \
-                --benchmark-json=artifacts/pr.json \
-                --benchmark-compare=/tmp/baseline.json \
-                --benchmark-columns=min,mean,median,stddev,rounds \
+                --benchmark-json=artifacts/branch.json \
+                --benchmark-columns=min,mean,median,stddev,rounds
+            python3 tests/perf/compare.py /tmp/master.json artifacts/branch.json \
                 | tee artifacts/perf_diff.txt
       - store_artifacts:
           path: artifacts
diff --git a/tests/perf/compare.py b/tests/perf/compare.py
new file mode 100644
index 0000000..e1ea790
--- /dev/null
+++ b/tests/perf/compare.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+"""Compare two pytest-benchmark JSON files (master vs branch).
+
+Usage: python compare.py MASTER.json BRANCH.json
+
+Compares the `min` time of each shared test and prints a table with the
+absolute and relative delta.  We use min because it's the least noisy
+statistic: it filters out GC pauses, kernel scheduling and JIT warmup.
+"""
+from __future__ import print_function
+
+import json
+import sys
+
+
+def _load(path):
+    with open(path) as f:
+        data = json.load(f)
+    return {b['name']: b['stats']['min'] for b in data['benchmarks']}
+
+
+def main(master_path, branch_path):
+    master = _load(master_path)
+    branch = _load(branch_path)
+
+    shared = sorted(set(master) & set(branch))
+    rows = []
+    for name in shared:
+        m_ms = master[name] * 1000.0
+        b_ms = branch[name] * 1000.0
+        delta = b_ms - m_ms
+        pct = (delta / m_ms) * 100.0 if m_ms else float('nan')
+        rows.append((name, m_ms, b_ms, delta, pct))
+
+    header = ("Test", "master min (ms)", "branch min (ms)",
+              "delta (ms)", "delta %")
+    print("%-40s %16s %16s %14s %10s" % header)
+    print("-" * 100)
+    for name, m, b, d, p in rows:
+        print("%-40s %16.3f %16.3f %+14.3f %+9.2f%%" % (name, m, b, d, p))
+
+    only_master = sorted(set(master) - set(branch))
+    only_branch = sorted(set(branch) - set(master))
+    if only_master:
+        print("\nOnly in master: %s" % ", ".join(only_master))
+    if only_branch:
+        print("Only in branch: %s" % ", ".join(only_branch))
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print("usage: python compare.py MASTER.json BRANCH.json",
+              file=sys.stderr)
+        sys.exit(2)
+    main(sys.argv[1], sys.argv[2])
diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index 3846eb4..3f81f02 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -95,9 +95,7 @@ def target():
                 cur.execute(_DDL_CREATE)
                 con.commit()
 
-            # Large insert is slow; cap repetitions so a benchmark run
-            # finishes in seconds rather than minutes.
-            benchmark.pedantic(target, rounds=3, iterations=1)
+            benchmark.pedantic(target, rounds=10, iterations=1)
         finally:
             con.close()
 
@@ -130,7 +128,7 @@ def target():
                 cur.execute("SELECT a, b FROM perf_bench")
                 return cur.fetchall()
 
-            rows = benchmark.pedantic(target, rounds=5, iterations=1)
+            rows = benchmark.pedantic(target, rounds=10, iterations=1)
             assert len(rows) == _LARGE
         finally:
             con.close()
@@ -152,7 +150,7 @@ def target():
                     total += len(batch)
                 return total
 
-            total = benchmark.pedantic(target, rounds=5, iterations=1)
+            total = benchmark.pedantic(target, rounds=10, iterations=1)
             assert total == _LARGE
         finally:
             con.close()
@@ -175,7 +173,7 @@ def target():
                     n += 1
                 return n
 
-            n = benchmark.pedantic(target, rounds=3, iterations=1)
+            n = benchmark.pedantic(target, rounds=10, iterations=1)
             assert n == _LARGE
         finally:
             con.close()
@@ -208,7 +206,7 @@ def target():
                 cur.execute("SELECT %s FROM perf_wide" % col_names)
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, rounds=5, iterations=1)
+            result = benchmark.pedantic(target, rounds=10, iterations=1)
             assert len(result) == self._WIDE_ROWS
             assert len(result[0]) == self._WIDE_COLS
         finally:
@@ -248,7 +246,7 @@ def target():
                 cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed")
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, rounds=5, iterations=1)
+            result = benchmark.pedantic(target, rounds=10, iterations=1)
             assert len(result) == _LARGE
         finally:
             con.close()

From afdcc4965ca44f25825265e6eeba227cf939f48a Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 16:35:06 +0100
Subject: [PATCH 08/21] Try to reduce test noise

---
 .circleci/config.yml                   |  9 +++++++++
 tests/perf/test_insert_select_bench.py | 24 ++++++++++++++++--------
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index a9be12f..c0e7002 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -127,6 +127,15 @@ jobs:
             python3 -m pytest tests/perf --run-perf --benchmark-only \
                 --benchmark-json=/tmp/master.json \
                 --benchmark-columns=min,mean,median,stddev,rounds
+      - run:
+          name: Reset DB between baseline and branch
+          command: |
+            # Force a clean DB restart so the branch run doesn't inherit
+            # buffer-pool warmth (or any other state) from the master run.
+            # The conftest fixture already shuts it down, this is belt-and-
+            # suspenders in case a test aborted early.
+            sudo -u nuodb /opt/nuodb/bin/nuocmd shutdown database \
+                --db-name pynuodb_test 2>/dev/null || true
       - run:
           name: Branch benchmarks + diff vs master
           command: |
diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index 3f81f02..de33365 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -75,7 +75,8 @@ def target():
                 cur.execute(_DDL_CREATE)
                 con.commit()
 
-            benchmark(target)
+            benchmark.pedantic(target, warmup_rounds=5, rounds=200,
+                               iterations=1)
         finally:
             con.close()
 
@@ -95,7 +96,8 @@ def target():
                 cur.execute(_DDL_CREATE)
                 con.commit()
 
-            benchmark.pedantic(target, rounds=10, iterations=1)
+            benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+                               iterations=1)
         finally:
             con.close()
 
@@ -112,7 +114,8 @@ def target():
                 cur.execute("SELECT a, b FROM perf_bench")
                 return cur.fetchall()
 
-            rows = benchmark(target)
+            rows = benchmark.pedantic(target, warmup_rounds=5, rounds=200,
+                                      iterations=1)
             assert len(rows) == _SMALL
         finally:
             con.close()
@@ -128,7 +131,8 @@ def target():
                 cur.execute("SELECT a, b FROM perf_bench")
                 return cur.fetchall()
 
-            rows = benchmark.pedantic(target, rounds=10, iterations=1)
+            rows = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+                                      iterations=1)
             assert len(rows) == _LARGE
         finally:
             con.close()
@@ -150,7 +154,8 @@ def target():
                     total += len(batch)
                 return total
 
-            total = benchmark.pedantic(target, rounds=10, iterations=1)
+            total = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+                                       iterations=1)
             assert total == _LARGE
         finally:
             con.close()
@@ -173,7 +178,8 @@ def target():
                     n += 1
                 return n
 
-            n = benchmark.pedantic(target, rounds=10, iterations=1)
+            n = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+                                   iterations=1)
             assert n == _LARGE
         finally:
             con.close()
@@ -206,7 +212,8 @@ def target():
                 cur.execute("SELECT %s FROM perf_wide" % col_names)
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, rounds=10, iterations=1)
+            result = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+                                        iterations=1)
             assert len(result) == self._WIDE_ROWS
             assert len(result[0]) == self._WIDE_COLS
         finally:
@@ -246,7 +253,8 @@ def target():
                 cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed")
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, rounds=10, iterations=1)
+            result = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+                                        iterations=1)
             assert len(result) == _LARGE
         finally:
             con.close()

From 68eb5736905c8715f257b83d5d14ea0fa9f56fef Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 17:27:32 +0100
Subject: [PATCH 09/21] Attempt to handle noise

---
 .circleci/config.yml                   |  39 +++++++---
 tests/perf/calibrate.py                | 102 ++++++++++++++++++++++++
 tests/perf/compare.py                  | 104 ++++++++++++++++++-------
 tests/perf/test_insert_select_bench.py |  12 +--
 4 files changed, 211 insertions(+), 46 deletions(-)
 create mode 100644 tests/perf/calibrate.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index c0e7002..fd81c92 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -107,12 +107,13 @@ jobs:
             sudo -u nuodb /opt/nuodb/etc/nuoadmin tls status
             sudo -u nuodb /opt/nuodb/etc/nuoadmin start
             sudo -u nuodb /opt/nuodb/bin/nuocmd --show-json get effective-license
-      # Same runner, same NuoDB instance: run master first for a
-      # baseline, then this branch, then let pytest-benchmark print the
-      # per-test diff.  Hardware noise cancels because both runs share
-      # the same container.
+      # We run master twice back-to-back on the same runner: the first run
+      # is the baseline that the branch is compared against, the second is
+      # used by calibrate.py to measure this runner's noise floor.  Then
+      # we run the branch and let compare.py flag deltas that clear the
+      # floor.
       - run:
-          name: Baseline benchmarks on master
+          name: Baseline benchmarks on master (pass 1)
           command: |
             GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
                 git fetch --no-tags --depth=1 origin master
@@ -125,15 +126,23 @@ jobs:
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark
             python3 -m pytest tests/perf --run-perf --benchmark-only \
-                --benchmark-json=/tmp/master.json \
+                --benchmark-json=/tmp/master_1.json \
                 --benchmark-columns=min,mean,median,stddev,rounds
       - run:
-          name: Reset DB between baseline and branch
+          name: Reset DB
+          command: |
+            sudo -u nuodb /opt/nuodb/bin/nuocmd shutdown database \
+                --db-name pynuodb_test 2>/dev/null || true
+      - run:
+          name: Baseline benchmarks on master (pass 2, for noise floor)
+          command: |
+            cd /tmp/base
+            python3 -m pytest tests/perf --run-perf --benchmark-only \
+                --benchmark-json=/tmp/master_2.json \
+                --benchmark-columns=min,mean,median,stddev,rounds
+      - run:
+          name: Reset DB
           command: |
-            # Force a clean DB restart so the branch run doesn't inherit
-            # buffer-pool warmth (or any other state) from the master run.
-            # The conftest fixture already shuts it down, this is belt-and-
-            # suspenders in case a test aborted early.
             sudo -u nuodb /opt/nuodb/bin/nuocmd shutdown database \
                 --db-name pynuodb_test 2>/dev/null || true
       - run:
@@ -144,8 +153,14 @@ jobs:
             python3 -m pytest tests/perf --run-perf --benchmark-only \
                 --benchmark-json=artifacts/branch.json \
                 --benchmark-columns=min,mean,median,stddev,rounds
-            python3 tests/perf/compare.py /tmp/master.json artifacts/branch.json \
+            python3 tests/perf/calibrate.py /tmp/noise_floor.json \
+                /tmp/master_1.json /tmp/master_2.json \
+                | tee artifacts/noise_floor.txt
+            python3 tests/perf/compare.py \
+                /tmp/master_1.json artifacts/branch.json \
+                --noise-floor /tmp/noise_floor.json \
                 | tee artifacts/perf_diff.txt
+            cp /tmp/noise_floor.json artifacts/
       - store_artifacts:
           path: artifacts
       - after_failure:
diff --git a/tests/perf/calibrate.py b/tests/perf/calibrate.py
new file mode 100644
index 0000000..440bee9
--- /dev/null
+++ b/tests/perf/calibrate.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+"""Compute a per-test noise floor from same-code benchmark runs.
+
+Takes 2+ pytest-benchmark JSON files produced by running the *same* code
+on the same runner (e.g., master benchmarked twice back-to-back).  For
+each test, pools the raw per-round timings from all runs and runs a
+permutation-style bootstrap: draw two disjoint groups of the size of a
+typical single run, compute |delta%| between their mins, repeat many
+times.  The 95th percentile of that distribution is the noise floor:
+the smallest |delta%| that noise alone can plausibly produce on this
+runner.  compare.py reads this file and flags PR deltas that clear it.
+
+Writes {test_name: floor_pct} to the output path.
+"""
+from __future__ import print_function
+
+import argparse
+import json
+import random
+
+
+def _load_pooled(paths):
+    """Return (pooled_data, typical_size_per_test) for every shared test."""
+    pooled = {}
+    sizes = {}
+    for path in paths:
+        with open(path) as f:
+            data = json.load(f)
+        for b in data['benchmarks']:
+            name = b['name']
+            d = b['stats']['data']
+            pooled.setdefault(name, []).extend(d)
+            sizes.setdefault(name, []).append(len(d))
+    typical = {n: sum(s) // len(s) for n, s in sizes.items()}
+    return pooled, typical
+
+
+def _floor(pooled, group_size, n_permutations=5000, quantile=0.95):
+    if len(pooled) < 2 * group_size:
+        # Not enough data to draw two disjoint groups: fall back to
+        # bootstrap-with-replacement.  Rare, but keeps the script robust.
+        deltas = _bootstrap_with_replacement(
+            pooled, group_size, n_permutations)
+    else:
+        deltas = _permutation(pooled, group_size, n_permutations)
+    deltas.sort()
+    return deltas[int(len(deltas) * quantile)]
+
+
+def _permutation(pooled, group_size, n):
+    deltas = []
+    data = list(pooled)
+    two_groups = 2 * group_size
+    for _ in range(n):
+        random.shuffle(data)
+        a = data[:group_size]
+        b = data[group_size:two_groups]
+        m_a = min(a)
+        m_b = min(b)
+        deltas.append(abs((m_b - m_a) / m_a * 100.0))
+    return deltas
+
+
+def _bootstrap_with_replacement(pooled, group_size, n):
+    deltas = []
+    for _ in range(n):
+        a = [random.choice(pooled) for _ in range(group_size)]
+        b = [random.choice(pooled) for _ in range(group_size)]
+        m_a = min(a)
+        m_b = min(b)
+        deltas.append(abs((m_b - m_a) / m_a * 100.0))
+    return deltas
+
+
+def main(args):
+    if len(args.inputs) < 2:
+        raise SystemExit("need >= 2 same-code JSON files to calibrate")
+
+    pooled, typical = _load_pooled(args.inputs)
+    random.seed(1)
+
+    floors = {n: _floor(pooled[n], typical[n]) for n in sorted(pooled)}
+
+    with open(args.output, 'w') as f:
+        json.dump(floors, f, indent=2, sort_keys=True)
+
+    print("Noise floor per test (95th %ile of |delta%%| under H0):")
+    for name in sorted(floors):
+        print("  %-40s  %6.2f%%" % (name, floors[name]))
+    print("Wrote %s" % args.output)
+
+
+def _parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument('output', help='where to write noise_floor.json')
+    p.add_argument('inputs', nargs='+',
+                   help='2+ same-code pytest-benchmark JSON files')
+    return p.parse_args()
+
+
+if __name__ == '__main__':
+    main(_parse_args())
diff --git a/tests/perf/compare.py b/tests/perf/compare.py
index e1ea790..1b1974f 100644
--- a/tests/perf/compare.py
+++ b/tests/perf/compare.py
@@ -1,43 +1,86 @@
 # -*- coding: utf-8 -*-
 """Compare two pytest-benchmark JSON files (master vs branch).
 
-Usage: python compare.py MASTER.json BRANCH.json
+For each shared test:
 
-Compares the `min` time of each shared test and prints a table with the
-absolute and relative delta.  We use min because it's the least noisy
-statistic: it filters out GC pauses, kernel scheduling and JIT warmup.
+  * Reports master and branch min time (ms)
+  * Reports the point delta as a percentage
+  * Reports a bootstrap 95% CI on that delta, resampled from the raw
+    per-round timings in stats.data.  We compare mins because the min is
+    the least contaminated statistic on a shared runner.
+
+If --noise-floor is passed, reads a JSON produced by calibrate.py and
+flags tests whose CI clears +/- floor in either direction.  A test whose
+CI overlaps the floor is treated as noise, no matter how nice the point
+delta looks.
 """
 from __future__ import print_function
 
+import argparse
 import json
-import sys
+import random
 
 
 def _load(path):
     with open(path) as f:
         data = json.load(f)
-    return {b['name']: b['stats']['min'] for b in data['benchmarks']}
+    return {b['name']: b['stats'] for b in data['benchmarks']}
+
+
+def _bootstrap_delta_ci(master_data, branch_data, n=5000, ci=0.95):
+    """95% bootstrap CI on (min(branch) - min(master)) / min(master) * 100."""
+    n_m = len(master_data)
+    n_b = len(branch_data)
+    deltas = []
+    for _ in range(n):
+        m = min(random.choice(master_data) for _ in range(n_m))
+        b = min(random.choice(branch_data) for _ in range(n_b))
+        deltas.append((b - m) / m * 100.0)
+    deltas.sort()
+    lo_idx = int(n * (1 - ci) / 2)
+    hi_idx = int(n * (1 + ci) / 2) - 1
+    return deltas[lo_idx], deltas[hi_idx]
+
 
+def _row_fmt(has_floor):
+    if has_floor:
+        return "%-40s %12s %12s %8s %20s %8s %5s"
+    return "%-40s %12s %12s %8s %20s"
 
-def main(master_path, branch_path):
-    master = _load(master_path)
-    branch = _load(branch_path)
 
-    shared = sorted(set(master) & set(branch))
-    rows = []
-    for name in shared:
-        m_ms = master[name] * 1000.0
-        b_ms = branch[name] * 1000.0
-        delta = b_ms - m_ms
-        pct = (delta / m_ms) * 100.0 if m_ms else float('nan')
-        rows.append((name, m_ms, b_ms, delta, pct))
+def main(args):
+    master = _load(args.master)
+    branch = _load(args.branch)
+    floor = {}
+    if args.noise_floor:
+        with open(args.noise_floor) as f:
+            floor = json.load(f)
 
-    header = ("Test", "master min (ms)", "branch min (ms)",
-              "delta (ms)", "delta %")
-    print("%-40s %16s %16s %14s %10s" % header)
-    print("-" * 100)
-    for name, m, b, d, p in rows:
-        print("%-40s %16.3f %16.3f %+14.3f %+9.2f%%" % (name, m, b, d, p))
+    random.seed(1)  # deterministic across CI runs
+
+    fmt = _row_fmt(bool(floor))
+    header = ["Test", "master (ms)", "branch (ms)", "delta %", "95% CI"]
+    if floor:
+        header += ["floor %", "flag"]
+    print(fmt % tuple(header))
+    print("-" * (len(fmt % tuple(header))))
+
+    for name in sorted(set(master) & set(branch)):
+        m_stats = master[name]
+        b_stats = branch[name]
+        m_ms = m_stats['min'] * 1000.0
+        b_ms = b_stats['min'] * 1000.0
+        delta_pct = (b_ms - m_ms) / m_ms * 100.0
+        lo, hi = _bootstrap_delta_ci(m_stats['data'], b_stats['data'])
+        row = [name, "%.3f" % m_ms, "%.3f" % b_ms,
+               "%+.2f" % delta_pct,
+               "[%+6.2f, %+6.2f]" % (lo, hi)]
+        if floor:
+            f = float(floor.get(name, 0.0))
+            # Flag when the CI sits entirely outside +/- floor.
+            flagged = (lo > f) or (hi < -f)
+            row += ["%.2f" % f, "*" if flagged else ""]
+        print(fmt % tuple(row))
 
     only_master = sorted(set(master) - set(branch))
     only_branch = sorted(set(branch) - set(master))
@@ -47,9 +90,14 @@ def main(master_path, branch_path):
         print("Only in branch: %s" % ", ".join(only_branch))
 
 
+def _parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument('master', help='pytest-benchmark JSON for master')
+    p.add_argument('branch', help='pytest-benchmark JSON for this branch')
+    p.add_argument('--noise-floor',
+                   help='JSON produced by calibrate.py; enables flag column')
+    return p.parse_args()
+
+
 if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print("usage: python compare.py MASTER.json BRANCH.json",
-              file=sys.stderr)
-        sys.exit(2)
-    main(sys.argv[1], sys.argv[2])
+    main(_parse_args())
diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index de33365..913847b 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -96,7 +96,7 @@ def target():
                 cur.execute(_DDL_CREATE)
                 con.commit()
 
-            benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+            benchmark.pedantic(target, warmup_rounds=2, rounds=30,
                                iterations=1)
         finally:
             con.close()
@@ -131,7 +131,7 @@ def target():
                 cur.execute("SELECT a, b FROM perf_bench")
                 return cur.fetchall()
 
-            rows = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+            rows = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
                                       iterations=1)
             assert len(rows) == _LARGE
         finally:
@@ -154,7 +154,7 @@ def target():
                     total += len(batch)
                 return total
 
-            total = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+            total = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
                                        iterations=1)
             assert total == _LARGE
         finally:
@@ -178,7 +178,7 @@ def target():
                     n += 1
                 return n
 
-            n = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+            n = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
                                    iterations=1)
             assert n == _LARGE
         finally:
@@ -212,7 +212,7 @@ def target():
                 cur.execute("SELECT %s FROM perf_wide" % col_names)
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+            result = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
                                         iterations=1)
             assert len(result) == self._WIDE_ROWS
             assert len(result[0]) == self._WIDE_COLS
@@ -253,7 +253,7 @@ def target():
                 cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed")
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, warmup_rounds=2, rounds=10,
+            result = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
                                         iterations=1)
             assert len(result) == _LARGE
         finally:

From 18c419ca57514ed46161e9e0d2a8647e754c42c4 Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 17:46:45 +0100
Subject: [PATCH 10/21] Run perf_bench on the xlarge resource class

---
 .circleci/config.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index fd81c92..770a2ff 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -74,11 +74,11 @@ jobs:
           when : "on_fail"
 
   perf_bench:
-    description: "Run insert/select benchmarks on master and on this branch, print the diff"
+    description: "Run insert/select performance benchmarks on master and on this branch, print the diff"
     docker:
       - image: nuodb/nuodb:latest
         user: root
-    resource_class: medium
+    resource_class: xlarge
     environment:
       TZ : America/New_York
       NUO_SET_TLS : disable
@@ -174,6 +174,6 @@ workflows:
           context:
             - common-config
       - perf_bench:
-          name: "Insert/select benchmark comparison"
+          name: "Run insert/select performance benchmark, compared to master"
           context:
             - common-config

From 1ba251a555c2363f99a72d9a56f79c2b667f0372 Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 19:42:41 +0100
Subject: [PATCH 11/21] Move table reset out of the timed insert benchmarks

---
 tests/perf/test_insert_select_bench.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index 913847b..7e02ecf 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -31,8 +31,9 @@
 pytestmark = pytest.mark.perf
 
 
-_DDL_DROP   = "DROP TABLE IF EXISTS perf_bench"
-_DDL_CREATE = "CREATE TABLE perf_bench (a INT, b VARCHAR(64))"
+_DDL_DROP     = "DROP TABLE IF EXISTS perf_bench"
+_DDL_CREATE   = "CREATE TABLE perf_bench (a INT, b VARCHAR(64))"
+_DDL_TRUNCATE = "TRUNCATE TABLE perf_bench"
 
 _SMALL = 100
 _LARGE = 100_000
@@ -70,13 +71,14 @@ def target():
                 cur.executemany(
                     "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows)
                 con.commit()
-                # Truncate between iterations so we measure a clean insert.
-                cur.execute(_DDL_DROP)
-                cur.execute(_DDL_CREATE)
+
+            def setup():
+                # Runs before each round but is NOT included in the timing.
+                cur.execute(_DDL_TRUNCATE)
                 con.commit()
 
-            benchmark.pedantic(target, warmup_rounds=5, rounds=200,
-                               iterations=1)
+            benchmark.pedantic(target, setup=setup, warmup_rounds=5,
+                               rounds=200, iterations=1)
         finally:
             con.close()
 
@@ -92,11 +94,13 @@ def target():
                 cur.executemany(
                     "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows)
                 con.commit()
-                cur.execute(_DDL_DROP)
-                cur.execute(_DDL_CREATE)
+
+            def setup():
+                cur.execute(_DDL_TRUNCATE)
                 con.commit()
 
-            benchmark.pedantic(target, warmup_rounds=2, rounds=30,
+            benchmark.pedantic(target, setup=setup, warmup_rounds=2,
+                               rounds=30,
                                iterations=1)
         finally:
             con.close()

From cd96993a649c6a27fbf24b6accf1562ba291163a Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 19:49:17 +0100
Subject: [PATCH 12/21] Simplify perf comparison: drop calibrate.py and bootstrap CI

---
 .circleci/config.yml    |  31 +++---------
 tests/perf/calibrate.py | 102 ----------------------------------------
 tests/perf/compare.py   |  99 +++++++++-----------------------------
 3 files changed, 28 insertions(+), 204 deletions(-)
 delete mode 100644 tests/perf/calibrate.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 770a2ff..529dd5f 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -107,13 +107,11 @@ jobs:
             sudo -u nuodb /opt/nuodb/etc/nuoadmin tls status
             sudo -u nuodb /opt/nuodb/etc/nuoadmin start
             sudo -u nuodb /opt/nuodb/bin/nuocmd --show-json get effective-license
-      # We run master twice back-to-back on the same runner: the first run
-      # is the baseline that the branch is compared against, the second is
-      # used by calibrate.py to measure this runner's noise floor.  Then
-      # we run the branch and let compare.py flag deltas that clear the
-      # floor.
+      # Run master then this branch on the same runner and let compare.py
+      # print a delta table.  Hardware noise mostly cancels because both
+      # runs share the container; xlarge gives us dedicated cores.
       - run:
-          name: Baseline benchmarks on master (pass 1)
+          name: Baseline benchmarks on master
           command: |
             GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
                 git fetch --no-tags --depth=1 origin master
@@ -126,19 +124,7 @@ jobs:
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark
             python3 -m pytest tests/perf --run-perf --benchmark-only \
-                --benchmark-json=/tmp/master_1.json \
-                --benchmark-columns=min,mean,median,stddev,rounds
-      - run:
-          name: Reset DB
-          command: |
-            sudo -u nuodb /opt/nuodb/bin/nuocmd shutdown database \
-                --db-name pynuodb_test 2>/dev/null || true
-      - run:
-          name: Baseline benchmarks on master (pass 2, for noise floor)
-          command: |
-            cd /tmp/base
-            python3 -m pytest tests/perf --run-perf --benchmark-only \
-                --benchmark-json=/tmp/master_2.json \
+                --benchmark-json=/tmp/master.json \
                 --benchmark-columns=min,mean,median,stddev,rounds
       - run:
           name: Reset DB
@@ -153,14 +139,9 @@ jobs:
             python3 -m pytest tests/perf --run-perf --benchmark-only \
                 --benchmark-json=artifacts/branch.json \
                 --benchmark-columns=min,mean,median,stddev,rounds
-            python3 tests/perf/calibrate.py /tmp/noise_floor.json \
-                /tmp/master_1.json /tmp/master_2.json \
-                | tee artifacts/noise_floor.txt
             python3 tests/perf/compare.py \
-                /tmp/master_1.json artifacts/branch.json \
-                --noise-floor /tmp/noise_floor.json \
+                /tmp/master.json artifacts/branch.json \
                 | tee artifacts/perf_diff.txt
-            cp /tmp/noise_floor.json artifacts/
       - store_artifacts:
           path: artifacts
       - after_failure:
diff --git a/tests/perf/calibrate.py b/tests/perf/calibrate.py
deleted file mode 100644
index 440bee9..0000000
--- a/tests/perf/calibrate.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Compute a per-test noise floor from same-code benchmark runs.
-
-Takes 2+ pytest-benchmark JSON files produced by running the *same* code
-on the same runner (e.g., master benchmarked twice back-to-back).  For
-each test, pools the raw per-round timings from all runs and runs a
-permutation-style bootstrap: draw two disjoint groups of the size of a
-typical single run, compute |delta%| between their mins, repeat many
-times.  The 95th percentile of that distribution is the noise floor:
-the smallest |delta%| that noise alone can plausibly produce on this
-runner.  compare.py reads this file and flags PR deltas that clear it.
-
-Writes {test_name: floor_pct} to the output path.
-"""
-from __future__ import print_function
-
-import argparse
-import json
-import random
-
-
-def _load_pooled(paths):
-    """Return (pooled_data, typical_size_per_test) for every shared test."""
-    pooled = {}
-    sizes = {}
-    for path in paths:
-        with open(path) as f:
-            data = json.load(f)
-        for b in data['benchmarks']:
-            name = b['name']
-            d = b['stats']['data']
-            pooled.setdefault(name, []).extend(d)
-            sizes.setdefault(name, []).append(len(d))
-    typical = {n: sum(s) // len(s) for n, s in sizes.items()}
-    return pooled, typical
-
-
-def _floor(pooled, group_size, n_permutations=5000, quantile=0.95):
-    if len(pooled) < 2 * group_size:
-        # Not enough data to draw two disjoint groups: fall back to
-        # bootstrap-with-replacement.  Rare, but keeps the script robust.
-        deltas = _bootstrap_with_replacement(
-            pooled, group_size, n_permutations)
-    else:
-        deltas = _permutation(pooled, group_size, n_permutations)
-    deltas.sort()
-    return deltas[int(len(deltas) * quantile)]
-
-
-def _permutation(pooled, group_size, n):
-    deltas = []
-    data = list(pooled)
-    two_groups = 2 * group_size
-    for _ in range(n):
-        random.shuffle(data)
-        a = data[:group_size]
-        b = data[group_size:two_groups]
-        m_a = min(a)
-        m_b = min(b)
-        deltas.append(abs((m_b - m_a) / m_a * 100.0))
-    return deltas
-
-
-def _bootstrap_with_replacement(pooled, group_size, n):
-    deltas = []
-    for _ in range(n):
-        a = [random.choice(pooled) for _ in range(group_size)]
-        b = [random.choice(pooled) for _ in range(group_size)]
-        m_a = min(a)
-        m_b = min(b)
-        deltas.append(abs((m_b - m_a) / m_a * 100.0))
-    return deltas
-
-
-def main(args):
-    if len(args.inputs) < 2:
-        raise SystemExit("need >= 2 same-code JSON files to calibrate")
-
-    pooled, typical = _load_pooled(args.inputs)
-    random.seed(1)
-
-    floors = {n: _floor(pooled[n], typical[n]) for n in sorted(pooled)}
-
-    with open(args.output, 'w') as f:
-        json.dump(floors, f, indent=2, sort_keys=True)
-
-    print("Noise floor per test (95th %ile of |delta%%| under H0):")
-    for name in sorted(floors):
-        print("  %-40s  %6.2f%%" % (name, floors[name]))
-    print("Wrote %s" % args.output)
-
-
-def _parse_args():
-    p = argparse.ArgumentParser()
-    p.add_argument('output', help='where to write noise_floor.json')
-    p.add_argument('inputs', nargs='+',
-                   help='2+ same-code pytest-benchmark JSON files')
-    return p.parse_args()
-
-
-if __name__ == '__main__':
-    main(_parse_args())
diff --git a/tests/perf/compare.py b/tests/perf/compare.py
index 1b1974f..4ce38b9 100644
--- a/tests/perf/compare.py
+++ b/tests/perf/compare.py
@@ -1,86 +1,36 @@
 # -*- coding: utf-8 -*-
 """Compare two pytest-benchmark JSON files (master vs branch).
 
-For each shared test:
-
-  * Reports master and branch min time (ms)
-  * Reports the point delta as a percentage
-  * Reports a bootstrap 95% CI on that delta, resampled from the raw
-    per-round timings in stats.data.  We compare mins because the min is
-    the least contaminated statistic on a shared runner.
-
-If --noise-floor is passed, reads a JSON produced by calibrate.py and
-flags tests whose CI clears +/- floor in either direction.  A test whose
-CI overlaps the floor is treated as noise, no matter how nice the point
-delta looks.
+Prints a table of master min, branch min, and the absolute + percentage
+delta per test.  Min is the least noisy summary; on a quiet enough
+runner the delta % is straight-up meaningful.
 """
 from __future__ import print_function
 
-import argparse
 import json
-import random
+import sys
 
 
 def _load(path):
     with open(path) as f:
         data = json.load(f)
-    return {b['name']: b['stats'] for b in data['benchmarks']}
-
-
-def _bootstrap_delta_ci(master_data, branch_data, n=5000, ci=0.95):
-    """95% bootstrap CI on (min(branch) - min(master)) / min(master) * 100."""
-    n_m = len(master_data)
-    n_b = len(branch_data)
-    deltas = []
-    for _ in range(n):
-        m = min(random.choice(master_data) for _ in range(n_m))
-        b = min(random.choice(branch_data) for _ in range(n_b))
-        deltas.append((b - m) / m * 100.0)
-    deltas.sort()
-    lo_idx = int(n * (1 - ci) / 2)
-    hi_idx = int(n * (1 + ci) / 2) - 1
-    return deltas[lo_idx], deltas[hi_idx]
-
+    return {b['name']: b['stats']['min'] for b in data['benchmarks']}
 
-def _row_fmt(has_floor):
-    if has_floor:
-        return "%-40s %12s %12s %8s %20s %8s %5s"
-    return "%-40s %12s %12s %8s %20s"
 
+def main(master_path, branch_path):
+    master = _load(master_path)
+    branch = _load(branch_path)
 
-def main(args):
-    master = _load(args.master)
-    branch = _load(args.branch)
-    floor = {}
-    if args.noise_floor:
-        with open(args.noise_floor) as f:
-            floor = json.load(f)
-
-    random.seed(1)  # deterministic across CI runs
-
-    fmt = _row_fmt(bool(floor))
-    header = ["Test", "master (ms)", "branch (ms)", "delta %", "95% CI"]
-    if floor:
-        header += ["floor %", "flag"]
-    print(fmt % tuple(header))
-    print("-" * (len(fmt % tuple(header))))
-
+    print("%-40s %16s %16s %14s %10s" % (
+        "Test", "master min (ms)", "branch min (ms)",
+        "delta (ms)", "delta %"))
+    print("-" * 100)
     for name in sorted(set(master) & set(branch)):
-        m_stats = master[name]
-        b_stats = branch[name]
-        m_ms = m_stats['min'] * 1000.0
-        b_ms = b_stats['min'] * 1000.0
-        delta_pct = (b_ms - m_ms) / m_ms * 100.0
-        lo, hi = _bootstrap_delta_ci(m_stats['data'], b_stats['data'])
-        row = [name, "%.3f" % m_ms, "%.3f" % b_ms,
-               "%+.2f" % delta_pct,
-               "[%+6.2f, %+6.2f]" % (lo, hi)]
-        if floor:
-            f = float(floor.get(name, 0.0))
-            # Flag when the CI sits entirely outside +/- floor.
-            flagged = (lo > f) or (hi < -f)
-            row += ["%.2f" % f, "*" if flagged else ""]
-        print(fmt % tuple(row))
+        m = master[name] * 1000.0
+        b = branch[name] * 1000.0
+        d = b - m
+        p = (d / m) * 100.0 if m else float('nan')
+        print("%-40s %16.3f %16.3f %+14.3f %+9.2f%%" % (name, m, b, d, p))
 
     only_master = sorted(set(master) - set(branch))
     only_branch = sorted(set(branch) - set(master))
@@ -90,14 +40,9 @@ def main(args):
         print("Only in branch: %s" % ", ".join(only_branch))
 
 
-def _parse_args():
-    p = argparse.ArgumentParser()
-    p.add_argument('master', help='pytest-benchmark JSON for master')
-    p.add_argument('branch', help='pytest-benchmark JSON for this branch')
-    p.add_argument('--noise-floor',
-                   help='JSON produced by calibrate.py; enables flag column')
-    return p.parse_args()
-
-
 if __name__ == '__main__':
-    main(_parse_args())
+    if len(sys.argv) != 3:
+        print("usage: python compare.py MASTER.json BRANCH.json",
+              file=sys.stderr)
+        sys.exit(2)
+    main(sys.argv[1], sys.argv[2])

From de4d7a83af2664ca4ca98b64cfd9cae9c108c02d Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 20:00:12 +0100
Subject: [PATCH 13/21] compare.py: exit non-zero on regressions above --fail-threshold

---
 .circleci/config.yml  |  2 +-
 tests/perf/compare.py | 40 ++++++++++++++++++++++++++++++----------
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 529dd5f..f15d224 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -155,6 +155,6 @@ workflows:
           context:
             - common-config
       - perf_bench:
-          name: "Run insert/select performance benchmark, compared to master"
+          name: "Run performance benchmarks, comparing results to master"
           context:
             - common-config
diff --git a/tests/perf/compare.py b/tests/perf/compare.py
index 4ce38b9..6b34b25 100644
--- a/tests/perf/compare.py
+++ b/tests/perf/compare.py
@@ -2,11 +2,13 @@
 """Compare two pytest-benchmark JSON files (master vs branch).
 
 Prints a table of master min, branch min, and the absolute + percentage
-delta per test.  Min is the least noisy summary; on a quiet enough
-runner the delta % is straight-up meaningful.
+delta per test.  Exits non-zero if any test regressed by more than
+--fail-threshold (default 10%), so CI turns a real regression into a
+failed build.  Improvements never fail the build.
 """
 from __future__ import print_function
 
+import argparse
 import json
 import sys
 
@@ -17,20 +19,24 @@ def _load(path):
     return {b['name']: b['stats']['min'] for b in data['benchmarks']}
 
 
-def main(master_path, branch_path):
-    master = _load(master_path)
-    branch = _load(branch_path)
+def main(args):
+    master = _load(args.master)
+    branch = _load(args.branch)
 
     print("%-40s %16s %16s %14s %10s" % (
         "Test", "master min (ms)", "branch min (ms)",
         "delta (ms)", "delta %"))
     print("-" * 100)
+
+    regressed = []
     for name in sorted(set(master) & set(branch)):
         m = master[name] * 1000.0
         b = branch[name] * 1000.0
         d = b - m
         p = (d / m) * 100.0 if m else float('nan')
         print("%-40s %16.3f %16.3f %+14.3f %+9.2f%%" % (name, m, b, d, p))
+        if p > args.fail_threshold:
+            regressed.append((name, p))
 
     only_master = sorted(set(master) - set(branch))
     only_branch = sorted(set(branch) - set(master))
@@ -39,10 +45,24 @@ def main(master_path, branch_path):
     if only_branch:
         print("Only in branch: %s" % ", ".join(only_branch))
 
+    print()
+    if regressed:
+        print("FAIL: %d test(s) regressed by more than %.2f%%:"
+              % (len(regressed), args.fail_threshold))
+        for name, p in regressed:
+            print("  %s: %+.2f%%" % (name, p))
+        sys.exit(1)
+    print("OK: no test regressed by more than %.2f%%" % args.fail_threshold)
+
+
+def _parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument('master', help='pytest-benchmark JSON for master')
+    p.add_argument('branch', help='pytest-benchmark JSON for this branch')
+    p.add_argument('--fail-threshold', type=float, default=10.0,
+                   help='percent slowdown that fails the build (default 10)')
+    return p.parse_args()
+
 
 if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print("usage: python compare.py MASTER.json BRANCH.json",
-              file=sys.stderr)
-        sys.exit(2)
-    main(sys.argv[1], sys.argv[2])
+    main(_parse_args())

From 9e519852f1b1f13f8c2a25cfc727faa92dec6edf Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 20:18:10 +0100
Subject: [PATCH 14/21] try and find less noise

---
 .circleci/config.yml   |   8 +---
 tests/perf/run_perf.py | 105 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 6 deletions(-)
 create mode 100644 tests/perf/run_perf.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index f15d224..efb6d12 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -123,9 +123,7 @@ jobs:
             cd /tmp/base
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark
-            python3 -m pytest tests/perf --run-perf --benchmark-only \
-                --benchmark-json=/tmp/master.json \
-                --benchmark-columns=min,mean,median,stddev,rounds
+            python3 tests/perf/run_perf.py --output /tmp/master.json
       - run:
           name: Reset DB
           command: |
@@ -136,9 +134,7 @@ jobs:
           command: |
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark
-            python3 -m pytest tests/perf --run-perf --benchmark-only \
-                --benchmark-json=artifacts/branch.json \
-                --benchmark-columns=min,mean,median,stddev,rounds
+            python3 tests/perf/run_perf.py --output artifacts/branch.json
             python3 tests/perf/compare.py \
                 /tmp/master.json artifacts/branch.json \
                 | tee artifacts/perf_diff.txt
diff --git a/tests/perf/run_perf.py b/tests/perf/run_perf.py
new file mode 100644
index 0000000..4898d50
--- /dev/null
+++ b/tests/perf/run_perf.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+"""Run the perf suite, then re-run any test whose stddev is too high.
+
+pytest-benchmark reports stddev per test.  If a test's coefficient of
+variation (stddev / mean) is above --cv-threshold we consider that run
+untrustworthy and rerun *just that test* (via pytest -k), keeping the
+lowest observed min across attempts.  We stop when every test is quiet
+or --max-attempts is reached; either way we always produce an output
+JSON so compare.py can run.
+
+CLI is deliberately narrow: point it at an output path and it does the
+right pytest invocation for our perf suite.
+"""
+from __future__ import print_function
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+
+
+def _cv_pct(stats):
+    m = stats.get('mean', 0.0)
+    return (stats['stddev'] / m) * 100.0 if m else 0.0
+
+
+def _load(path):
+    with open(path) as f:
+        return json.load(f)
+
+
+def _noisy(merged, threshold):
+    return [b['name'] for b in merged['benchmarks']
+            if _cv_pct(b['stats']) > threshold]
+
+
+def _merge_min(base, new):
+    """For each test, replace the base entry if `new` observed a lower min."""
+    idx = {b['name']: i for i, b in enumerate(base['benchmarks'])}
+    for b in new['benchmarks']:
+        i = idx.get(b['name'])
+        if i is None:
+            base['benchmarks'].append(b)
+            idx[b['name']] = len(base['benchmarks']) - 1
+        elif b['stats']['min'] < base['benchmarks'][i]['stats']['min']:
+            base['benchmarks'][i] = b
+    return base
+
+
+def _pytest(output, k_filter=None):
+    cmd = [sys.executable, '-m', 'pytest', 'tests/perf',
+           '--run-perf', '--benchmark-only',
+           '--benchmark-json=' + output,
+           '--benchmark-columns=min,mean,median,stddev,rounds']
+    if k_filter:
+        cmd += ['-k', k_filter]
+    r = subprocess.run(cmd)
+    if r.returncode != 0:
+        raise SystemExit(r.returncode)
+
+
+def main(args):
+    tmp_dir = os.path.dirname(os.path.abspath(args.output)) or '.'
+    base_name = os.path.basename(args.output)
+
+    first = os.path.join(tmp_dir, base_name + '.attempt_1')
+    _pytest(first)
+    merged = _load(first)
+
+    for attempt in range(2, args.max_attempts + 1):
+        noisy = _noisy(merged, args.cv_threshold)
+        if not noisy:
+            print("All tests within CV %.1f%% after %d attempt(s)"
+                  % (args.cv_threshold, attempt - 1))
+            break
+        print("Attempt %d/%d: rerunning %s"
+              % (attempt, args.max_attempts, ", ".join(noisy)))
+        out = os.path.join(tmp_dir, base_name + ('.attempt_%d' % attempt))
+        _pytest(out, k_filter=" or ".join(noisy))
+        merged = _merge_min(merged, _load(out))
+    else:
+        still_noisy = _noisy(merged, args.cv_threshold)
+        if still_noisy:
+            print("WARN: %d test(s) still above CV %.1f%% after %d attempts: %s"
+                  % (len(still_noisy), args.cv_threshold,
+                     args.max_attempts, ", ".join(still_noisy)))
+
+    with open(args.output, 'w') as f:
+        json.dump(merged, f)
+
+
+def _parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument('--output', required=True,
+                   help='final merged pytest-benchmark JSON path')
+    p.add_argument('--cv-threshold', type=float, default=5.0,
+                   help='per-test CV%% ceiling before retrying (default 5)')
+    p.add_argument('--max-attempts', type=int, default=3,
+                   help='max attempts including the first run (default 3)')
+    return p.parse_args()
+
+
+if __name__ == '__main__':
+    main(_parse_args())

From f48a1216efce6252289dc179060e50ad97639fd4 Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 20:32:34 +0100
Subject: [PATCH 15/21] Try more rounds to get more stability

---
 .circleci/config.yml                   |   8 +-
 tests/perf/run_perf.py                 | 105 -------------------------
 tests/perf/test_insert_select_bench.py |  22 +++---
 3 files changed, 17 insertions(+), 118 deletions(-)
 delete mode 100644 tests/perf/run_perf.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index efb6d12..f15d224 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -123,7 +123,9 @@ jobs:
             cd /tmp/base
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark
-            python3 tests/perf/run_perf.py --output /tmp/master.json
+            python3 -m pytest tests/perf --run-perf --benchmark-only \
+                --benchmark-json=/tmp/master.json \
+                --benchmark-columns=min,mean,median,stddev,rounds
       - run:
           name: Reset DB
           command: |
@@ -134,7 +136,9 @@ jobs:
           command: |
             make PYTHON=python3 install
             $HOME/.local/bin/pip install pytest-benchmark
-            python3 tests/perf/run_perf.py --output artifacts/branch.json
+            python3 -m pytest tests/perf --run-perf --benchmark-only \
+                --benchmark-json=artifacts/branch.json \
+                --benchmark-columns=min,mean,median,stddev,rounds
             python3 tests/perf/compare.py \
                 /tmp/master.json artifacts/branch.json \
                 | tee artifacts/perf_diff.txt
diff --git a/tests/perf/run_perf.py b/tests/perf/run_perf.py
deleted file mode 100644
index 4898d50..0000000
--- a/tests/perf/run_perf.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Run the perf suite, then re-run any test whose stddev is too high.
-
-pytest-benchmark reports stddev per test.  If a test's coefficient of
-variation (stddev / mean) is above --cv-threshold we consider that run
-untrustworthy and rerun *just that test* (via pytest -k), keeping the
-lowest observed min across attempts.  We stop when every test is quiet
-or --max-attempts is reached; either way we always produce an output
-JSON so compare.py can run.
-
-CLI is deliberately narrow: point it at an output path and it does the
-right pytest invocation for our perf suite.
-"""
-from __future__ import print_function
-
-import argparse
-import json
-import os
-import subprocess
-import sys
-
-
-def _cv_pct(stats):
-    m = stats.get('mean', 0.0)
-    return (stats['stddev'] / m) * 100.0 if m else 0.0
-
-
-def _load(path):
-    with open(path) as f:
-        return json.load(f)
-
-
-def _noisy(merged, threshold):
-    return [b['name'] for b in merged['benchmarks']
-            if _cv_pct(b['stats']) > threshold]
-
-
-def _merge_min(base, new):
-    """For each test, replace the base entry if `new` observed a lower min."""
-    idx = {b['name']: i for i, b in enumerate(base['benchmarks'])}
-    for b in new['benchmarks']:
-        i = idx.get(b['name'])
-        if i is None:
-            base['benchmarks'].append(b)
-            idx[b['name']] = len(base['benchmarks']) - 1
-        elif b['stats']['min'] < base['benchmarks'][i]['stats']['min']:
-            base['benchmarks'][i] = b
-    return base
-
-
-def _pytest(output, k_filter=None):
-    cmd = [sys.executable, '-m', 'pytest', 'tests/perf',
-           '--run-perf', '--benchmark-only',
-           '--benchmark-json=' + output,
-           '--benchmark-columns=min,mean,median,stddev,rounds']
-    if k_filter:
-        cmd += ['-k', k_filter]
-    r = subprocess.run(cmd)
-    if r.returncode != 0:
-        raise SystemExit(r.returncode)
-
-
-def main(args):
-    tmp_dir = os.path.dirname(os.path.abspath(args.output)) or '.'
-    base_name = os.path.basename(args.output)
-
-    first = os.path.join(tmp_dir, base_name + '.attempt_1')
-    _pytest(first)
-    merged = _load(first)
-
-    for attempt in range(2, args.max_attempts + 1):
-        noisy = _noisy(merged, args.cv_threshold)
-        if not noisy:
-            print("All tests within CV %.1f%% after %d attempt(s)"
-                  % (args.cv_threshold, attempt - 1))
-            break
-        print("Attempt %d/%d: rerunning %s"
-              % (attempt, args.max_attempts, ", ".join(noisy)))
-        out = os.path.join(tmp_dir, base_name + ('.attempt_%d' % attempt))
-        _pytest(out, k_filter=" or ".join(noisy))
-        merged = _merge_min(merged, _load(out))
-    else:
-        still_noisy = _noisy(merged, args.cv_threshold)
-        if still_noisy:
-            print("WARN: %d test(s) still above CV %.1f%% after %d attempts: %s"
-                  % (len(still_noisy), args.cv_threshold,
-                     args.max_attempts, ", ".join(still_noisy)))
-
-    with open(args.output, 'w') as f:
-        json.dump(merged, f)
-
-
-def _parse_args():
-    p = argparse.ArgumentParser()
-    p.add_argument('--output', required=True,
-                   help='final merged pytest-benchmark JSON path')
-    p.add_argument('--cv-threshold', type=float, default=5.0,
-                   help='per-test CV%% ceiling before retrying (default 5)')
-    p.add_argument('--max-attempts', type=int, default=3,
-                   help='max attempts including the first run (default 3)')
-    return p.parse_args()
-
-
-if __name__ == '__main__':
-    main(_parse_args())
diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index 7e02ecf..21f6fb9 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -36,7 +36,7 @@
 _DDL_TRUNCATE = "TRUNCATE TABLE perf_bench"
 
 _SMALL = 100
-_LARGE = 100_000
+_LARGE = 20_000
 
 
 def _rows(n):
@@ -83,7 +83,7 @@ def setup():
             con.close()
 
     def test_insert_large(self, benchmark):
-        """100k rows via executemany """
+        """20k rows via executemany """
         con = self._connect()
         try:
             self._reset(con)
@@ -100,7 +100,7 @@ def setup():
                 con.commit()
 
             benchmark.pedantic(target, setup=setup, warmup_rounds=2,
-                               rounds=30,
+                               rounds=100,
                                iterations=1)
         finally:
             con.close()
@@ -125,7 +125,7 @@ def target():
             con.close()
 
     def test_fetchall_large(self, benchmark):
-        """fetchall over 100k rows. """
+        """fetchall over 20k rows. """
         con = self._connect()
         try:
             self._seed(con, _LARGE)
@@ -135,14 +135,14 @@ def target():
                 cur.execute("SELECT a, b FROM perf_bench")
                 return cur.fetchall()
 
-            rows = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
+            rows = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
                                       iterations=1)
             assert len(rows) == _LARGE
         finally:
             con.close()
 
     def test_fetchmany_large(self, benchmark):
-        """fetchmany(1000) over 100k rows"""
+        """fetchmany(1000) over 20k rows"""
         con = self._connect()
         try:
             self._seed(con, _LARGE)
@@ -158,14 +158,14 @@ def target():
                     total += len(batch)
                 return total
 
-            total = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
+            total = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
                                        iterations=1)
             assert total == _LARGE
         finally:
             con.close()
 
     def test_fetchone_loop_large(self, benchmark):
-        """fetchone() in a loop over 100k rows.  Isolates per-row
+        """fetchone() in a loop over 20k rows.  Isolates per-row
         overhead """
         con = self._connect()
         try:
@@ -182,7 +182,7 @@ def target():
                     n += 1
                 return n
 
-            n = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
+            n = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
                                    iterations=1)
             assert n == _LARGE
         finally:
@@ -216,7 +216,7 @@ def target():
                 cur.execute("SELECT %s FROM perf_wide" % col_names)
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
+            result = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
                                         iterations=1)
             assert len(result) == self._WIDE_ROWS
             assert len(result[0]) == self._WIDE_COLS
@@ -257,7 +257,7 @@ def target():
                 cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed")
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, warmup_rounds=2, rounds=30,
+            result = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
                                         iterations=1)
             assert len(result) == _LARGE
         finally:

From 7a5715ff97b1901786f7cab524bbca9bee2becc7 Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 20:41:48 +0100
Subject: [PATCH 16/21] Raise failure threshold from 10% to 15%

---
 tests/perf/compare.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/perf/compare.py b/tests/perf/compare.py
index 6b34b25..4f92af3 100644
--- a/tests/perf/compare.py
+++ b/tests/perf/compare.py
@@ -59,8 +59,8 @@ def _parse_args():
     p = argparse.ArgumentParser()
     p.add_argument('master', help='pytest-benchmark JSON for master')
     p.add_argument('branch', help='pytest-benchmark JSON for this branch')
-    p.add_argument('--fail-threshold', type=float, default=10.0,
-                   help='percent slowdown that fails the build (default 10)')
+    p.add_argument('--fail-threshold', type=float, default=15.0,
+                   help='percent slowdown that fails the build (default 15)')
     return p.parse_args()
 
 

From f0481594caed2c646e63bcb5c6887dd5180d25ce Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 20:52:59 +0100
Subject: [PATCH 17/21] Size rounds for small benchmarks from probed per-call time

---
 tests/perf/test_insert_select_bench.py | 29 ++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index 21f6fb9..fc269c9 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -20,11 +20,33 @@
 cursor and Cython PRs; that's the point.
 """
 
+import math
+import time
+
 import pytest
 
 from tests import nuodb_base
 
 
+# Small tests have tiny absolute times (sub-ms to a few ms), so per-round
+# jitter dominates unless we run *both* many rounds *and* for a long enough
+# total wall-time to average out kernel/network noise.  This helper probes
+# a single call to size `rounds` so that rounds * per_call_time >= min_seconds,
+# with a floor of `min_rounds`.
+def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5):
+    total = 0.0
+    for _ in range(probes):
+        if setup is not None:
+            setup()
+        t0 = time.perf_counter()
+        target()
+        total += time.perf_counter() - t0
+    per_call = total / probes
+    if per_call <= 0:
+        return min_rounds
+    return max(min_rounds, int(math.ceil(min_seconds / per_call)))
+
+
 # Skip this whole module unless `--run-perf` is passed on the pytest
 # command line.  We don't want `make fulltest` to sit through a 100k-row
 # insert on every commit.
@@ -77,8 +99,10 @@ def setup():
                 cur.execute(_DDL_TRUNCATE)
                 con.commit()
 
+            rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0,
+                                 setup=setup)
             benchmark.pedantic(target, setup=setup, warmup_rounds=5,
-                               rounds=200, iterations=1)
+                               rounds=rounds, iterations=1)
         finally:
             con.close()
 
@@ -118,7 +142,8 @@ def target():
                 cur.execute("SELECT a, b FROM perf_bench")
                 return cur.fetchall()
 
-            rows = benchmark.pedantic(target, warmup_rounds=5, rounds=200,
+            rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0)
+            rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds,
                                       iterations=1)
             assert len(rows) == _SMALL
         finally:

From 03bc92740e6d4d432e49175185277d27690df867 Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 21:02:23 +0100
Subject: [PATCH 18/21] try to increase iterations to reduce noise

---
 tests/perf/test_insert_select_bench.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index fc269c9..8b24750 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -33,7 +33,8 @@
 # total wall-time to average out kernel/network noise.  This helper probes
 # a single call to size `rounds` so that rounds * per_call_time >= min_seconds,
 # with a floor of `min_rounds`.
-def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5):
+def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5,
+                iterations=1):
     total = 0.0
     for _ in range(probes):
         if setup is not None:
@@ -44,7 +45,8 @@ def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5):
     per_call = total / probes
     if per_call <= 0:
         return min_rounds
-    return max(min_rounds, int(math.ceil(min_seconds / per_call)))
+    per_round = per_call * iterations
+    return max(min_rounds, int(math.ceil(min_seconds / per_round)))
 
 
 # Skip this whole module unless `--run-perf` is passed on the pytest
@@ -100,9 +102,9 @@ def setup():
                 con.commit()
 
             rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0,
-                                 setup=setup)
+                                 setup=setup, iterations=10)
             benchmark.pedantic(target, setup=setup, warmup_rounds=5,
-                               rounds=rounds, iterations=1)
+                               rounds=rounds, iterations=10)
         finally:
             con.close()
 
@@ -142,9 +144,10 @@ def target():
                 cur.execute("SELECT a, b FROM perf_bench")
                 return cur.fetchall()
 
-            rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0)
+            rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0,
+                                 iterations=10)
             rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds,
-                                      iterations=1)
+                                      iterations=10)
             assert len(rows) == _SMALL
         finally:
             con.close()

From 19a8d1def79a27aca97546e1983a40765ce99895 Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 21:11:51 +0100
Subject: [PATCH 19/21] Move TRUNCATE into timed target of test_insert_small

---
 tests/perf/test_insert_select_bench.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index 8b24750..6285491 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -84,7 +84,13 @@ def _seed(self, con, n):
     # -- INSERT ---------------------------------------------------------
 
     def test_insert_small(self, benchmark):
-        """100 rows via executemany.  Sensitive to per-row putValue cost."""
+        """100 rows via executemany.  Sensitive to per-row putValue cost.
+
+        Uses iterations=10 to batch-average away kernel jitter that
+        otherwise dominates at these tiny per-call times.  pytest-benchmark
+        forbids setup+iterations>1, so the TRUNCATE lives inside the timed
+        target — both master and branch pay it equally.
+        """
         con = self._connect()
         try:
             self._reset(con)
@@ -92,18 +98,14 @@ def test_insert_small(self, benchmark):
             rows = _rows(_SMALL)
 
             def target():
+                cur.execute(_DDL_TRUNCATE)
                 cur.executemany(
                     "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows)
                 con.commit()
 
-            def setup():
-                # Runs before each round but is NOT included in the timing.
-                cur.execute(_DDL_TRUNCATE)
-                con.commit()
-
             rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0,
-                                 setup=setup, iterations=10)
-            benchmark.pedantic(target, setup=setup, warmup_rounds=5,
+                                 iterations=10)
+            benchmark.pedantic(target, warmup_rounds=5,
                                rounds=rounds, iterations=10)
         finally:
             con.close()

From cd6975c947d1d51e651838857a551689d6b1112b Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 21:27:48 +0100
Subject: [PATCH 20/21] Try to stabilise

---
 tests/perf/test_insert_select_bench.py | 31 +++++++++++++-------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index 6285491..86ba47d 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -59,7 +59,7 @@ def _rounds_for(target, min_rounds, min_seconds, setup=None, probes=5,
 _DDL_CREATE   = "CREATE TABLE perf_bench (a INT, b VARCHAR(64))"
 _DDL_TRUNCATE = "TRUNCATE TABLE perf_bench"
 
-_SMALL = 100
+_SMALL = 1000
 _LARGE = 20_000
 
 
@@ -84,29 +84,31 @@ def _seed(self, con, n):
     # -- INSERT ---------------------------------------------------------
 
     def test_insert_small(self, benchmark):
-        """100 rows via executemany.  Sensitive to per-row putValue cost.
+        """1000 rows via executemany.  Sensitive to per-row putValue cost.
 
-        Uses iterations=10 to batch-average away kernel jitter that
-        otherwise dominates at these tiny per-call times.  pytest-benchmark
-        forbids setup+iterations>1, so the TRUNCATE lives inside the timed
-        target — both master and branch pay it equally.
+        Sized so per-call time is a few tens of ms — small enough to
+        exercise the fixed per-query overhead, large enough that TRUNCATE
+        (out-of-band via setup) and kernel jitter don't dominate the min.
         """
         con = self._connect()
         try:
             self._reset(con)
             cur = con.cursor()
-            rows = _rows(_SMALL)
+            rows = _rows(1000)
 
             def target():
-                cur.execute(_DDL_TRUNCATE)
                 cur.executemany(
                     "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows)
                 con.commit()
 
+            def setup():
+                cur.execute(_DDL_TRUNCATE)
+                con.commit()
+
             rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0,
-                                 iterations=10)
-            benchmark.pedantic(target, warmup_rounds=5,
-                               rounds=rounds, iterations=10)
+                                 setup=setup)
+            benchmark.pedantic(target, setup=setup, warmup_rounds=5,
+                               rounds=rounds, iterations=1)
         finally:
             con.close()
 
@@ -136,7 +138,7 @@ def setup():
     # -- SELECT ---------------------------------------------------------
 
     def test_fetchall_small(self, benchmark):
-        """fetchall over 100 rows.  Sensitive to fixed per-query overhead."""
+        """fetchall over 1000 rows.  Sensitive to fixed per-query overhead."""
         con = self._connect()
         try:
             self._seed(con, _SMALL)
@@ -146,10 +148,9 @@ def target():
                 cur.execute("SELECT a, b FROM perf_bench")
                 return cur.fetchall()
 
-            rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0,
-                                 iterations=10)
+            rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0)
             rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds,
-                                      iterations=10)
+                                      iterations=1)
             assert len(rows) == _SMALL
         finally:
             con.close()

From 89406ed013da79ed5f19eb805fd9292b372710e0 Mon Sep 17 00:00:00 2001
From: Martin Gallwey <mgy5@3ds.com>
Date: Wed, 1 Jul 2026 21:52:41 +0100
Subject: [PATCH 21/21] try to clean up a bit

---
 tests/perf/compare.py                  |  7 +--
 tests/perf/test_insert_select_bench.py | 79 +++++++-------------------
 2 files changed, 23 insertions(+), 63 deletions(-)

diff --git a/tests/perf/compare.py b/tests/perf/compare.py
index 4f92af3..82d83e7 100644
--- a/tests/perf/compare.py
+++ b/tests/perf/compare.py
@@ -23,9 +23,7 @@ def main(args):
     master = _load(args.master)
     branch = _load(args.branch)
 
-    print("%-40s %16s %16s %14s %10s" % (
-        "Test", "master min (ms)", "branch min (ms)",
-        "delta (ms)", "delta %"))
+    print("%-40s %16s %16s %14s %10s" % ( "Test", "master min (ms)", "branch min (ms)", "delta (ms)", "delta %"))
     print("-" * 100)
 
     regressed = []
@@ -47,8 +45,7 @@ def main(args):
 
     print()
     if regressed:
-        print("FAIL: %d test(s) regressed by more than %.2f%%:"
-              % (len(regressed), args.fail_threshold))
+        print("FAIL: %d test(s) regressed by more than %.2f%%:" % (len(regressed), args.fail_threshold))
         for name, p in regressed:
             print("  %s: %+.2f%%" % (name, p))
         sys.exit(1)
diff --git a/tests/perf/test_insert_select_bench.py b/tests/perf/test_insert_select_bench.py
index 86ba47d..ab3d8b9 100644
--- a/tests/perf/test_insert_select_bench.py
+++ b/tests/perf/test_insert_select_bench.py
@@ -77,24 +77,18 @@ def _reset(self, con):
 
     def _seed(self, con, n):
         self._reset(con)
-        con.cursor().executemany(
-            "INSERT INTO perf_bench (a, b) VALUES (?, ?)", _rows(n))
+        con.cursor().executemany("INSERT INTO perf_bench (a, b) VALUES (?, ?)", _rows(n))
         con.commit()
 
     # -- INSERT ---------------------------------------------------------
 
     def test_insert_small(self, benchmark):
-        """1000 rows via executemany.  Sensitive to per-row putValue cost.
-
-        Sized so per-call time is a few tens of ms — small enough to
-        exercise the fixed per-query overhead, large enough that TRUNCATE
-        (out-of-band via setup) and kernel jitter don't dominate the min.
-        """
+        """1000 rows via executemany.  Sensitive to per-row putValue cost.  """
         con = self._connect()
         try:
             self._reset(con)
             cur = con.cursor()
-            rows = _rows(1000)
+            rows = _rows(_SMALL)
 
             def target():
                 cur.executemany(
@@ -105,10 +99,8 @@ def setup():
                 cur.execute(_DDL_TRUNCATE)
                 con.commit()
 
-            rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0,
-                                 setup=setup)
-            benchmark.pedantic(target, setup=setup, warmup_rounds=5,
-                               rounds=rounds, iterations=1)
+            rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0, setup=setup)
+            benchmark.pedantic(target, setup=setup, warmup_rounds=5, rounds=rounds, iterations=1)
         finally:
             con.close()
 
@@ -121,24 +113,21 @@ def test_insert_large(self, benchmark):
             rows = _rows(_LARGE)
 
             def target():
-                cur.executemany(
-                    "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows)
+                cur.executemany( "INSERT INTO perf_bench (a, b) VALUES (?, ?)", rows)
                 con.commit()
 
             def setup():
                 cur.execute(_DDL_TRUNCATE)
                 con.commit()
 
-            benchmark.pedantic(target, setup=setup, warmup_rounds=2,
-                               rounds=100,
-                               iterations=1)
+            benchmark.pedantic(target, setup=setup, warmup_rounds=2, rounds=100, iterations=1)
         finally:
             con.close()
 
     # -- SELECT ---------------------------------------------------------
 
     def test_fetchall_small(self, benchmark):
-        """fetchall over 1000 rows.  Sensitive to fixed per-query overhead."""
+        """fetchall over 1000 rows."""
         con = self._connect()
         try:
             self._seed(con, _SMALL)
@@ -149,8 +138,7 @@ def target():
                 return cur.fetchall()
 
             rounds = _rounds_for(target, min_rounds=500, min_seconds=10.0)
-            rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds,
-                                      iterations=1)
+            rows = benchmark.pedantic(target, warmup_rounds=5, rounds=rounds, iterations=1)
             assert len(rows) == _SMALL
         finally:
             con.close()
@@ -166,8 +154,7 @@ def target():
                 cur.execute("SELECT a, b FROM perf_bench")
                 return cur.fetchall()
 
-            rows = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
-                                      iterations=1)
+            rows = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1)
             assert len(rows) == _LARGE
         finally:
             con.close()
@@ -189,15 +176,13 @@ def target():
                     total += len(batch)
                 return total
 
-            total = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
-                                       iterations=1)
+            total = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1)
             assert total == _LARGE
         finally:
             con.close()
 
     def test_fetchone_loop_large(self, benchmark):
-        """fetchone() in a loop over 20k rows.  Isolates per-row
-        overhead """
+        """fetchone() in a loop over 20k rows.  Isolates per-row overhead """
         con = self._connect()
         try:
             self._seed(con, _LARGE)
@@ -213,8 +198,7 @@ def target():
                     n += 1
                 return n
 
-            n = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
-                                   iterations=1)
+            n = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1)
             assert n == _LARGE
         finally:
             con.close()
@@ -237,59 +221,38 @@ def test_fetchall_wide(self, benchmark):
             cur.execute("CREATE TABLE perf_wide (%s)" % (", ".join(cols),))
             con.commit()
             rows = [tuple(range(self._WIDE_COLS)) for _ in range(self._WIDE_ROWS)]
-            cur.executemany(
-                "INSERT INTO perf_wide (%s) VALUES (%s)"
-                % (col_names, placeholders),
-                rows)
+            cur.executemany( "INSERT INTO perf_wide (%s) VALUES (%s)" % (col_names, placeholders), rows)
             con.commit()
 
             def target():
                 cur.execute("SELECT %s FROM perf_wide" % col_names)
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
-                                        iterations=1)
+            result = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1)
             assert len(result) == self._WIDE_ROWS
             assert len(result[0]) == self._WIDE_COLS
         finally:
             con.close()
 
     def test_fetchall_mixed_types(self, benchmark):
-        """SELECT with variety of types: int / decimal / double / timestamp / bool /
-        varchar / null. """
+        """SELECT with variety of types: int / decimal / double / timestamp / bool / varchar / null. """
         con = self._connect()
         try:
             cur = con.cursor()
             cur.execute("DROP TABLE IF EXISTS perf_mixed")
             cur.execute(
-                "CREATE TABLE perf_mixed ("
-                "  i INT,"
-                "  d DECIMAL(12, 4),"
-                "  f DOUBLE,"
-                "  ts TIMESTAMP,"
-                "  bl BOOLEAN,"
-                "  s VARCHAR(64),"
-                "  n INT"
-                ")")
+                "CREATE TABLE perf_mixed (i INT,  d DECIMAL(12, 4), f DOUBLE,  ts TIMESTAMP,"
+                "bl BOOLEAN, s VARCHAR(64), n INT)")
             con.commit()
-            rows = [
-                (i, i * 1.25, i / 3.0,
-                 '2024-01-01 12:34:56', bool(i & 1),
-                 'row #%d' % i, None)
-                for i in range(_LARGE)
-            ]
-            cur.executemany(
-                "INSERT INTO perf_mixed (i, d, f, ts, bl, s, n)"
-                " VALUES (?, ?, ?, ?, ?, ?, ?)",
-                rows)
+            rows = [ (i, i * 1.25, i / 3.0, '2024-01-01 12:34:56', bool(i & 1), 'row #%d' % i, None) for i in range(_LARGE) ]
+            cur.executemany( "INSERT INTO perf_mixed (i, d, f, ts, bl, s, n)" " VALUES (?, ?, ?, ?, ?, ?, ?)", rows)
             con.commit()
 
             def target():
                 cur.execute("SELECT i, d, f, ts, bl, s, n FROM perf_mixed")
                 return cur.fetchall()
 
-            result = benchmark.pedantic(target, warmup_rounds=2, rounds=100,
-                                        iterations=1)
+            result = benchmark.pedantic(target, warmup_rounds=2, rounds=100, iterations=1)
             assert len(result) == _LARGE
         finally:
             con.close()