From 8c2d759827d1c4b6d635ed03e706a9357fe42132 Mon Sep 17 00:00:00 2001 From: ernestprovo23 Date: Sun, 14 Jun 2026 13:13:42 -0400 Subject: [PATCH 1/2] docs(synthesizer): document + version + test synthesizer behavior (v1.0 #5) The synthesizer/judge path is the heart of conclave's "council" value prop but was undocumented and lightly tested, risking silent degradation. This makes it sound and observable for 1.0. Investigation (current behavior, unchanged): - synthesizer = constructor arg, else config `synthesizer:`, else built-in default "claude" (registry.DEFAULT_SYNTHESIZER). Same model judges in adversarial and consolidates in debate. - degraded paths were ALREADY observable, not silent: no-usable-answers, unkeyed synthesizer, and synthesizer-call-failure each set CouncilResult.synthesis_error (adversarial: AdversarialResult.verdict_error, mirrored). synthesis stays None; member answers preserved. No silent quiet-concat path existed to fix -- confirmed + pinned with tests. Changes: - version the synthesis prompt set: new conclave.prompts.SYNTHESIS_PROMPT_VERSION, re-exported from council, stamped onto every CouncilResult as `prompt_version` (lazy default_factory avoids the prompts<->models import cycle). Prompt text is byte-stable; the constant + text are pinned so a prompt change without a version bump fails CI. - document selection/default/configurability/fallback in the council module docstring + _synthesize docstring, and a README "Synthesizer behavior" section. - DOCUMENTATION_INDEX: new test file row, CouncilResult field note, changelog row. - tests/test_synthesizer.py (21 tests): default + arg/config/CLI override selection; observable degradation for synthesize, debate, and adversarial judge (unkeyed + call-failure); prompt-version stability across every mode. No non-synthesis behavior changed; happy-path synthesis output is byte-for-byte unchanged. Mocks at the existing httpx-transport boundary (offline). 
--- DOCUMENTATION_INDEX.md | 4 +- README.md | 39 ++++ src/conclave/council.py | 78 ++++++- src/conclave/models.py | 21 ++ src/conclave/prompts.py | 11 + tests/test_synthesizer.py | 424 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 575 insertions(+), 2 deletions(-) create mode 100644 tests/test_synthesizer.py diff --git a/DOCUMENTATION_INDEX.md b/DOCUMENTATION_INDEX.md index c1359d9..2d835f6 100644 --- a/DOCUMENTATION_INDEX.md +++ b/DOCUMENTATION_INDEX.md @@ -48,7 +48,7 @@ Package root: `src/conclave/` (installed as the `conclave` package; console scri | Gemini adapter | [`src/conclave/adapters/gemini.py`](src/conclave/adapters/gemini.py) | `GeminiAdapter` — native `generateContent`, OpenAI-role mapping, `usageMetadata`. | | Registry | [`src/conclave/registry.py`](src/conclave/registry.py) | Friendly-name → model-id defaults; provider → env-var mapping; key **presence** logic (never values). | | Config | [`src/conclave/config.py`](src/conclave/config.py) | Loads/merges `~/.conclave/config.yml` over defaults; resolves model ids and named/CSV councils; parses the `endpoints:` section (custom OpenAI-compatible providers). | -| Models | [`src/conclave/models.py`](src/conclave/models.py) | Pydantic result contract: `TokenUsage`, `ModelAnswer`, `StreamEvent`, `DebateRound`, `AdversarialResult`, `CouncilResult` (`mode`/`rounds`/`adversarial`). Stable downstream surface. | +| Models | [`src/conclave/models.py`](src/conclave/models.py) | Pydantic result contract: `TokenUsage`, `ModelAnswer`, `StreamEvent`, `DebateRound`, `AdversarialResult`, `CouncilResult` (`mode`/`rounds`/`adversarial`/`synthesis_error`/`prompt_version`). Stable downstream surface. | | CLI | [`src/conclave/cli.py`](src/conclave/cli.py) | `conclave ask` (synthesize/raw/debate/adversarial; `--rounds`/`--proposer`/`--stream`) + `conclave providers`; rich panels, live `--stream` output, and `--json`; never prints key values. 
| | Logging | [`src/conclave/logging.py`](src/conclave/logging.py) | Logger factory; stderr; verbosity via `CONCLAVE_LOG_LEVEL` (default `WARNING`). | @@ -57,6 +57,7 @@ Package root: `src/conclave/` (installed as the `conclave` package; console scri | File | Path | Covers | |------|------|--------| | Council tests | [`tests/test_council.py`](tests/test_council.py) | Fan-out, partial failure, synthesis behavior. | +| Synthesizer tests | [`tests/test_synthesizer.py`](tests/test_synthesizer.py) | Pins the synthesizer/judge contract: default + configurable (arg/config/CLI `--synthesizer`) selection; observable degradation (unkeyed/failed → `synthesis_error`/`verdict_error`, never silent) for synthesize, debate, and the adversarial judge; versioned synthesis prompt (`SYNTHESIS_PROMPT_VERSION` + `result.prompt_version`) with prompt-text + version pins. | | Modes tests | [`tests/test_modes.py`](tests/test_modes.py) | Debate multi-round flow, mid-round drop-out, peer anonymization; adversarial proposer/critic/verdict, proposal/critic failure paths, no-key judge, sync wrappers. | | Adapter tests | [`tests/test_adapters.py`](tests/test_adapters.py) | Per-adapter `build_request` + `parse_response` for openai-compat/anthropic/gemini: system-hoist, max_tokens, role mapping, usage parsing, empty/malformed/error-status raises. | | Provider highway tests | [`tests/test_providers.py`](tests/test_providers.py) | `resolve_adapter` (built-in prefixes, per-provider URLs, custom endpoints, unknown-prefix raise), end-to-end `call_model`, and `redact()` (bearer/`sk-`/env-var-value/`x-api-key` scrubbing; pre-redacted provider errors). | @@ -91,6 +92,7 @@ Run: `pytest` (config in `pyproject.toml`, `asyncio_mode = "auto"`). 
| Date | Change | |------|--------| +| 2026-06-14 | Documented + tested synthesizer behavior (v1.0 readiness must-do #5): README "Synthesizer behavior" section (selection precedence, observable degradation, versioned prompt); synthesis prompt set now versioned via `conclave.prompts.SYNTHESIS_PROMPT_VERSION`, stamped onto every `CouncilResult.prompt_version`; confirmed (not silent) degradation across synthesize/debate/adversarial-judge paths; new `tests/test_synthesizer.py` (21 tests). No non-synthesis behavior changed. | | 2026-06-09 | Roadmap features shipped: adversarial proposer resilience (#9), optional result cache (#6), debate convergence early-stop (#4), 4 first-class providers groq/deepseek/mistral/together (#5), streaming for synthesize/raw (#7); tests 121→191. #8 local-server-mode spike evaluated (no-go on HTTP). Doc sync: System Context diagram now shows all 9 providers; PDD §12 resolved questions archived to `docs/archive/pdd-resolved-questions-2026-06-09.md` (PDD back under 500 lines); `config.example.yml` stale "LiteLLM" comment fixed. | | 2026-06-08 | v0.3.0 version bump; CI foundation (Actions matrix, ruff, coverage floor, gitleaks, branch protection); redact() custom-endpoint key-leak fix (#14); status_error consolidation + conditional temperature (#16/#22); provider-metadata single-source + import-time drift guard + config memoization (#19/#15); CLI exit-code contract + httpx client lifecycle (#17/#20); transport/cli/logging test backfill (#18); public release + community files. | | 2026-06-08 | PDD §11 repositioned vs. new direct peers (`llm-council-core`, `the-llm-council`); §12 Q1/Q3/Q4/Q5 resolved. Index Tests table updated for the PR #2 split (`test_adapters.py`, `test_providers.py`). 
| diff --git a/README.md b/README.md index bb2e5a4..e1946b9 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,45 @@ print("VERDICT:\n", adv.adversarial.verdict) # also mirrored to adv.synthesis critiques populate `answers` and the verdict mirrors into `synthesis` — so code written against the v0.1 surface keeps working across every mode. +## Synthesizer behavior + +The synthesizer is the single model that merges the council's answers (and is the +**judge** in `adversarial` mode and the final consolidator in `debate`). It is +chosen by this precedence, highest first: + +1. the `synthesizer=` argument to `Council` (CLI: `--synthesizer/-s`); +2. the `synthesizer:` key in `~/.conclave/config.yml`; +3. the built-in default — **`claude`** (`anthropic/claude-sonnet-4-6`). + +```bash +conclave ask "..." --council grok,gemini --synthesizer openai # override per run +``` + +**Degradation is observable, never silent.** Synthesis is skipped — and the +reason is always surfaced on the result — in three cases: + +| Situation | What happens | +|---|---| +| No usable member answers (all errored/skipped) | `synthesis = None`, `synthesis_error = "no successful member answers…"` | +| Synthesizer has no API key | `synthesis = None`, `synthesis_error = "…has no API key; returning raw answers only"`; member answers preserved | +| Synthesizer call fails | `synthesis = None`, `synthesis_error =` the provider error | + +In every case the member answers are returned intact and a warning is logged, so +a caller can reliably detect a non-synthesis with +`result.synthesis is None and result.synthesis_error is not None`. There is **no +path** where concatenated or partial output is silently returned as if it were a +synthesis. In `adversarial` mode the same signal lands on +`adversarial.verdict_error` (mirrored to `synthesis_error`). 
+ +**The synthesis prompt is a versioned constant.** The synthesize-mode system +prompt is fixed in code (not built per call); the debate/judge prompts live in +`conclave.prompts`. The whole prompt set carries a version tag, +`conclave.prompts.SYNTHESIS_PROMPT_VERSION`, stamped onto **every** +`CouncilResult` as `result.prompt_version`. A downstream eval or regression suite +can compare it across runs to detect that the synthesis wording changed, instead +of silently attributing the shift to model drift. The test suite pins both the +prompt text and the version, so changing one without the other fails CI. + ## Config (optional) Create `~/.conclave/config.yml` to add models, define named councils, and set a diff --git a/src/conclave/council.py b/src/conclave/council.py index 6f7508f..47dd2c3 100644 --- a/src/conclave/council.py +++ b/src/conclave/council.py @@ -8,6 +8,45 @@ The deliberation modes (``debate``, ``adversarial``) live in :mod:`conclave.modes` and reuse this class's :meth:`Council.fan_out` primitive so the partial-failure handling is written exactly once. + +Synthesizer selection and degradation (the "council" value prop) +---------------------------------------------------------------- + +**Which model synthesizes.** Synthesis is performed by one *synthesizer* model, +separate from the council members (though a member may also be the synthesizer). +Selection precedence, highest first: + +1. the ``synthesizer=`` argument to :class:`Council` (the CLI ``--synthesizer/-s`` + flag wires straight through to this); +2. the ``synthesizer:`` key in ``~/.conclave/config.yml``; +3. the built-in default :data:`conclave.registry.DEFAULT_SYNTHESIZER` (``"claude"``, + i.e. ``anthropic/claude-sonnet-4-6``). + +The same model is the **judge** in ``adversarial`` mode and the final +consolidator in ``debate`` mode -- one selection drives all three. 
+ +**The fallback / degraded path is OBSERVABLE, never silent.** Synthesis can fail +to run for three reasons, and each one is signaled on the result rather than +silently swallowed: + +* *No usable member answers* (every member errored/skipped) -- nothing to merge; +* *The synthesizer has no API key* in the environment; +* *The synthesizer call itself fails* (provider error/timeout). + +In all three cases ``CouncilResult.synthesis`` stays ``None``, the member answers +are still returned intact, a warning is logged, and an actionable reason is set +on ``CouncilResult.synthesis_error`` (in ``adversarial`` mode the analogous +``AdversarialResult.verdict_error``, mirrored to ``synthesis_error``). A caller +can therefore always tell synthesis did **not** happen as expected by checking +``synthesis is None and synthesis_error is not None`` -- there is no path where +the council quietly returns concatenated/partial output dressed up as a synthesis. + +**The synthesis prompt is a versioned constant.** The synthesize-mode system +prompt is :data:`_SYNTH_SYSTEM` (the debate/judge prompts live in +:mod:`conclave.prompts`); the prompt *set* carries the version tag +:data:`conclave.prompts.SYNTHESIS_PROMPT_VERSION`, stamped onto every +:class:`~conclave.models.CouncilResult` as ``prompt_version`` so a prompt change +is detectable downstream instead of being silently absorbed as model drift. """ from __future__ import annotations @@ -20,6 +59,7 @@ from .config import ConclaveConfig, load_config from .logging import get_logger from .models import CouncilResult, ModelAnswer, StreamEvent +from .prompts import SYNTHESIS_PROMPT_VERSION from .providers import call_model from .registry import key_present @@ -30,6 +70,14 @@ # per member while sharing Council.fan_out's concurrency + partial-failure code. MessagesFor = Callable[[str, str], list[dict[str, str]]] +# The synthesize-mode system prompt. 
It is a stable module constant -- never +# built per-call -- so the wording the council synthesizes under is auditable and +# diffable. Any change to it (or to the debate/judge prompts in +# :mod:`conclave.prompts`) MUST be paired with a bump of +# :data:`conclave.prompts.SYNTHESIS_PROMPT_VERSION`, which is stamped onto every +# :class:`~conclave.models.CouncilResult` as ``prompt_version`` so a downstream +# eval can detect the change rather than silently absorb it. ``test_synthesizer`` +# pins both this text and the version, so editing one without the other fails CI. _SYNTH_SYSTEM = ( "You are the synthesizer of a council of AI models. You are given the same " "user prompt that was posed to several models, plus each model's answer. " @@ -38,6 +86,9 @@ "Do not invent a model's position; rely only on the answers provided." ) +# Re-exported for callers that want the version without importing prompts. +__all__ = ["Council", "SYNTHESIS_PROMPT_VERSION"] + class Council: """A council of foundation models with an optional synthesizer. @@ -358,7 +409,32 @@ def _replay_cached(result: CouncilResult) -> list[StreamEvent]: return events async def _synthesize(self, result: CouncilResult) -> None: - """Run the synthesizer over the successful answers, mutating ``result``.""" + """Run the synthesizer over the successful answers, mutating ``result``. + + This is the buffered (non-streaming) synthesize path; the streaming + counterpart :func:`conclave.streaming._stream_synthesis` mirrors it + short-circuit for short-circuit. The synthesizer model is + ``self.synthesizer`` (resolved per the precedence documented in the module + docstring: constructor arg, else config, else the ``"claude"`` default). + + Every degraded outcome is made observable on ``result`` -- none is + silent. 
On success ``result.synthesis`` holds the merged answer; on any + of the three short-circuits ``result.synthesis`` stays ``None`` and + ``result.synthesis_error`` carries the reason: + + * **no usable answers** -- every member failed/was skipped, so there is + nothing to merge; + * **synthesizer unkeyed** -- ``self.synthesizer``'s API key is absent, so + the raw member answers are returned with an explanatory error; + * **synthesizer call failed** -- the synthesizer provider errored, and its + error text is surfaced verbatim. + + The synthesizer identity (``synthesizer`` / ``synthesizer_model_id``) is + recorded on ``result`` before the key check so a consumer can see *which* + model was selected even when it could not run. The prompt used is the + versioned :data:`_SYNTH_SYSTEM`; the version tag already lives on + ``result.prompt_version``. + """ usable = result.successful_answers if not usable: result.synthesis_error = "no successful member answers to synthesize" diff --git a/src/conclave/models.py b/src/conclave/models.py index dec2693..bf5ba11 100644 --- a/src/conclave/models.py +++ b/src/conclave/models.py @@ -9,6 +9,19 @@ from pydantic import BaseModel, Field +def _default_prompt_version() -> str: + """Resolve the current synthesis-prompt version without an import cycle. + + ``conclave.prompts`` imports this module, so importing it at module load + would be circular. The import is deferred into this factory (run only when a + ``CouncilResult`` is constructed, by which point both modules are loaded), so + every result defaults to the live :data:`conclave.prompts.SYNTHESIS_PROMPT_VERSION`. + """ + from .prompts import SYNTHESIS_PROMPT_VERSION + + return SYNTHESIS_PROMPT_VERSION + + class TokenUsage(BaseModel): """Token accounting for a single model call.""" @@ -164,6 +177,13 @@ class CouncilResult(BaseModel): convergence_score: The convergence score (0.0--1.0) of the round that triggered an early stop, or ``None`` when no early stop occurred. 
Higher means more stable round-over-round (more converged). + prompt_version: The version tag of the synthesizer/judge prompt set used + for this run (:data:`conclave.prompts.SYNTHESIS_PROMPT_VERSION`). + Stamped on **every** result regardless of mode or whether synthesis + actually ran, so a downstream eval/regression suite can detect that + the synthesis prompt wording changed between two runs instead of + silently attributing the shift to model drift. Opaque string; only + equality is meaningful. """ prompt: str @@ -179,6 +199,7 @@ class CouncilResult(BaseModel): cached: bool = False converged: bool = False convergence_score: float | None = None + prompt_version: str = Field(default_factory=_default_prompt_version) @property def successful_answers(self) -> list[ModelAnswer]: diff --git a/src/conclave/prompts.py b/src/conclave/prompts.py index 7ce5b38..dc3ea07 100644 --- a/src/conclave/prompts.py +++ b/src/conclave/prompts.py @@ -10,6 +10,17 @@ from .models import ModelAnswer +# Version identifier for the synthesis/judge prompt *set*. Bump this string +# whenever ANY synthesizer-facing prompt changes -- the synthesize-mode system +# prompt (``conclave.council._SYNTH_SYSTEM``), the debate consolidation prompt +# (:data:`DEBATE_FINAL_SYSTEM`), or the adversarial judge prompt +# (:data:`JUDGE_SYSTEM`). It is surfaced on :class:`conclave.models.CouncilResult` +# (the ``prompt_version`` field) so a downstream eval or regression suite can +# detect that the wording the synthesis was produced under has shifted, rather +# than silently absorbing a prompt change as a quality regression. The value is +# opaque (a date-stamped tag); only equality/inequality is meaningful. +SYNTHESIS_PROMPT_VERSION = "2026-06-14" + # Stable position-based labels used to anonymize peers in debate rounds 2..N. 
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" diff --git a/tests/test_synthesizer.py b/tests/test_synthesizer.py new file mode 100644 index 0000000..b1c384c --- /dev/null +++ b/tests/test_synthesizer.py @@ -0,0 +1,424 @@ +"""Regression tests pinning the SYNTHESIZER behavior (readiness must-do #5). + +The synthesizer/judge path is the heart of conclave's "council" value prop, so +its contract is pinned here explicitly rather than left implicit across +``test_council``/``test_modes``: + +* **selection** -- which model synthesizes by default, and that the constructor + arg, config, and CLI ``--synthesizer`` all override it (a, b, c); +* **observable degradation** -- the synthesizer failing or being unkeyed is + signaled on the result (``synthesis_error`` / ``verdict_error``), never a + silent quiet-concat degrade (d, d'); +* **prompt versioning** -- the synthesis prompt is a stable, versioned constant, + stamped onto every result, so a wording change is detectable downstream (e). + +All tests run offline via the ``patch_call_model`` fixture (mocking at the same +httpx-transport boundary the rest of the suite uses); the CLI override test +drives Typer's ``CliRunner`` with no network and no real keys. 
+""" + +from __future__ import annotations + +import json + +import pytest +from typer.testing import CliRunner + +from conclave import Council, cli +from conclave.config import ConclaveConfig +from conclave.council import _SYNTH_SYSTEM +from conclave.council import SYNTHESIS_PROMPT_VERSION as COUNCIL_VERSION +from conclave.models import CouncilResult +from conclave.prompts import ( + DEBATE_FINAL_SYSTEM, + JUDGE_SYSTEM, + SYNTHESIS_PROMPT_VERSION, +) +from conclave.registry import DEFAULT_SYNTHESIZER +from tests.conftest import make_response + +runner = CliRunner() + + +def _all_keys(monkeypatch) -> None: + """Set every provider key to a dummy non-empty value.""" + for var in ( + "XAI_API_KEY", + "GEMINI_API_KEY", + "ANTHROPIC_API_KEY", + "PERPLEXITY_API_KEY", + "OPENAI_API_KEY", + ): + monkeypatch.setenv(var, "dummy-key") + + +def _config(synthesizer: str = "claude") -> ConclaveConfig: + """A deterministic config independent of any on-disk ~/.conclave file.""" + return ConclaveConfig( + models={ + "grok": "xai/grok-4.3", + "gemini": "gemini/gemini-2.5-pro", + "claude": "anthropic/claude-sonnet-4-6", + "perplexity": "perplexity/sonar-pro", + "openai": "openai/gpt-4.1", + }, + councils={"default": ["grok", "gemini", "claude", "perplexity"]}, + synthesizer=synthesizer, + ) + + +def _system_text(messages) -> str: + """Return the system-role content of a message list, or '' if none.""" + for m in messages: + if m.get("role") == "system": + return m.get("content", "") + return "" + + +# --------------------------------------------------------------------------- # +# (a) Default synthesizer selection +# --------------------------------------------------------------------------- # + + +def test_default_synthesizer_is_config_default(): + """No synthesizer arg -> the config's synthesizer is used (here 'claude').""" + council = Council(models=["grok", "gemini"], config=_config()) + assert council.synthesizer == "claude" + + +def 
test_default_synthesizer_falls_back_to_registry_default(): + """A config with the built-in default synthesizer resolves to 'claude'. + + Pins the bottom of the precedence chain: with no constructor arg and a config + whose synthesizer is the registry default, the council synthesizes with the + documented built-in (``DEFAULT_SYNTHESIZER``). + """ + council = Council( + models=["grok"], + config=ConclaveConfig(models={"grok": "xai/grok-4.3"}), # synthesizer defaults + ) + assert DEFAULT_SYNTHESIZER == "claude" + assert council.synthesizer == DEFAULT_SYNTHESIZER + + +async def test_default_synthesizer_runs_and_is_recorded(monkeypatch, patch_call_model): + """The default synthesizer actually performs the merge and is named on the result.""" + _all_keys(monkeypatch) + + def handler(model, messages, **kwargs): + # The synthesizer is anthropic/claude with the 2-message system+merge prompt. + if model == "anthropic/claude-sonnet-4-6" and _system_text(messages) == _SYNTH_SYSTEM: + return make_response("DEFAULT MERGE") + return make_response(f"answer from {model}") + + patch_call_model(handler) + + council = Council(models=["grok", "gemini"], config=_config()) # no synthesizer arg + result = await council.ask("q") + + assert result.synthesizer == "claude" + assert result.synthesizer_model_id == "anthropic/claude-sonnet-4-6" + assert result.synthesis == "DEFAULT MERGE" + + +# --------------------------------------------------------------------------- # +# (b) Configurable synthesizer override -- constructor arg + config +# --------------------------------------------------------------------------- # + + +def test_constructor_arg_overrides_config_synthesizer(): + """The constructor ``synthesizer=`` wins over the config default.""" + council = Council(models=["grok"], synthesizer="openai", config=_config("claude")) + assert council.synthesizer == "openai" + + +def test_config_synthesizer_used_when_no_arg(): + """With no constructor arg the config's synthesizer is honored (not 
the registry default).""" + council = Council(models=["grok"], config=_config("perplexity")) + assert council.synthesizer == "perplexity" + + +async def test_overridden_synthesizer_performs_the_merge(monkeypatch, patch_call_model): + """An overridden synthesizer (openai) is the model that runs the merge.""" + _all_keys(monkeypatch) + + def handler(model, messages, **kwargs): + if model == "openai/gpt-4.1" and _system_text(messages) == _SYNTH_SYSTEM: + return make_response("OPENAI MERGE") + return make_response(f"answer from {model}") + + patch_call_model(handler) + + council = Council(models=["grok", "gemini"], synthesizer="openai", config=_config("claude")) + result = await council.ask("q") + + assert result.synthesizer == "openai" + assert result.synthesizer_model_id == "openai/gpt-4.1" + assert result.synthesis == "OPENAI MERGE" + + +# --------------------------------------------------------------------------- # +# (c) Configurable synthesizer override -- CLI ``--synthesizer`` +# --------------------------------------------------------------------------- # + + +def test_cli_synthesizer_flag_overrides(monkeypatch, patch_call_model): + """``--synthesizer openai`` makes openai the synthesizer end-to-end via the CLI.""" + monkeypatch.setattr(cli, "load_config", lambda: _config("claude")) + for var in ("XAI_API_KEY", "GEMINI_API_KEY", "OPENAI_API_KEY"): + monkeypatch.setenv(var, "dummy-key") + # claude (the config default) intentionally has NO key: if the flag were + # ignored, synthesis would degrade to a no-key error instead of merging. 
+ + def handler(model, messages, **kwargs): + if model == "openai/gpt-4.1" and _system_text(messages) == _SYNTH_SYSTEM: + return make_response("CLI OPENAI MERGE") + return make_response(f"answer from {model}") + + patch_call_model(handler) + + result = runner.invoke( + cli.app, + ["ask", "q", "--council", "grok,gemini", "--synthesizer", "openai", "--json"], + ) + assert result.exit_code == 0 + payload = json.loads(result.stdout) + assert payload["synthesizer"] == "openai" + assert payload["synthesizer_model_id"] == "openai/gpt-4.1" + assert payload["synthesis"] == "CLI OPENAI MERGE" + + +# --------------------------------------------------------------------------- # +# (d) Degraded / fallback path is SIGNALED, never silent -- synthesize mode +# --------------------------------------------------------------------------- # + + +async def test_synthesizer_unkeyed_is_signaled_not_silent( + monkeypatch, patch_call_model, clear_keys +): + """Synthesizer with no key -> synthesis is None AND synthesis_error explains it. + + The degraded path must be observable: a caller can tell synthesis did not run + (no quietly-concatenated output masquerading as a synthesis). Member answers + are preserved; the selected synthesizer identity is still recorded. + """ + monkeypatch.setenv("XAI_API_KEY", "dummy") # only grok has a key; claude (synth) does not + + def handler(model, messages, **kwargs): + return make_response(f"answer from {model}") + + patch_call_model(handler) + + council = Council(models=["grok"], synthesizer="claude", config=_config()) + result = await council.ask("q") + + # Happy-path member output is untouched... + assert len(result.successful_answers) == 1 + # ...but synthesis is explicitly NOT produced, and the reason is observable. + assert result.synthesis is None + assert result.synthesis_error is not None + assert "no API key" in result.synthesis_error + # The selected synthesizer is still recorded even though it could not run. 
+ assert result.synthesizer == "claude" + assert result.synthesizer_model_id == "anthropic/claude-sonnet-4-6" + + +async def test_synthesizer_call_failure_is_signaled(monkeypatch, patch_call_model): + """Synthesizer keyed but the call errors -> synthesis None, error surfaced verbatim.""" + _all_keys(monkeypatch) + + def handler(model, messages, **kwargs): + if model == "anthropic/claude-sonnet-4-6" and _system_text(messages) == _SYNTH_SYSTEM: + raise RuntimeError("synthesizer 503 from provider") + return make_response(f"answer from {model}") + + patch_call_model(handler) + + council = Council(models=["grok", "gemini"], synthesizer="claude", config=_config()) + result = await council.ask("q") + + # Members succeeded; only the synthesis step failed, and it is signaled. + assert len(result.successful_answers) == 2 + assert result.synthesis is None + assert result.synthesis_error is not None + assert "synthesizer 503 from provider" in result.synthesis_error + + +async def test_no_usable_answers_is_signaled(monkeypatch, patch_call_model): + """Every member fails -> synthesis None with a 'nothing to merge' signal.""" + _all_keys(monkeypatch) + + def handler(model, messages, **kwargs): + raise RuntimeError("all members down") + + patch_call_model(handler) + + council = Council(models=["grok", "gemini"], synthesizer="claude", config=_config()) + result = await council.ask("q") + + assert result.synthesis is None + assert result.synthesis_error is not None + assert "no successful member answers" in result.synthesis_error + + +# --------------------------------------------------------------------------- # +# (d') Degraded path is SIGNALED -- adversarial JUDGE (the analogous role) +# --------------------------------------------------------------------------- # + + +async def test_adversarial_judge_unkeyed_is_signaled(monkeypatch, patch_call_model, clear_keys): + """Judge (synthesizer) with no key -> verdict None, verdict_error + mirror set. 
+ + The adversarial judge is the same model as the synthesizer; its degraded path + must be just as observable. The proposal and critiques survive; the missing + verdict is signaled on both ``adversarial.verdict_error`` and the mirrored + ``result.synthesis_error``. + """ + monkeypatch.setenv("XAI_API_KEY", "dummy") + monkeypatch.setenv("GEMINI_API_KEY", "dummy") + # claude (the judge) intentionally has no key. + + def handler(model, messages, **kwargs): + if "critic on an adversarial review" in _system_text(messages): + return make_response(f"crit {model}") + return make_response(f"prop {model}") + + patch_call_model(handler) + + council = Council(models=["grok", "gemini"], synthesizer="claude", config=_config()) + result = await council.adversarial("q") + + adv = result.adversarial + assert adv is not None + assert adv.proposal.ok # proposal survived + assert len(adv.successful_critiques) == 1 # a critique survived + assert adv.verdict is None + assert adv.verdict_error is not None + assert "no API key" in adv.verdict_error + # The selected judge identity is recorded, and the error mirrors to synthesis_error. 
+ assert adv.judge == "claude" + assert adv.judge_model_id == "anthropic/claude-sonnet-4-6" + assert result.synthesis_error == adv.verdict_error + + +async def test_adversarial_judge_call_failure_is_signaled(monkeypatch, patch_call_model): + """Judge keyed but the verdict call errors -> verdict None, error surfaced.""" + _all_keys(monkeypatch) + + def handler(model, messages, **kwargs): + system = _system_text(messages) + if "judge of an adversarial review" in system: + raise RuntimeError("judge 500 from provider") + if "critic on an adversarial review" in system: + return make_response(f"crit {model}") + return make_response(f"prop {model}") + + patch_call_model(handler) + + council = Council(models=["grok", "gemini"], synthesizer="claude", config=_config()) + result = await council.adversarial("q") + + adv = result.adversarial + assert adv is not None + assert adv.verdict is None + assert adv.verdict_error is not None + assert "judge 500 from provider" in adv.verdict_error + + +async def test_debate_synthesizer_unkeyed_is_signaled(monkeypatch, patch_call_model, clear_keys): + """Debate's final synthesizer with no key -> synthesis None + observable error.""" + monkeypatch.setenv("XAI_API_KEY", "dummy") # only grok; claude (synth) has no key + + def handler(model, messages, **kwargs): + return make_response(f"answer {model}") + + patch_call_model(handler) + + council = Council(models=["grok"], synthesizer="claude", config=_config()) + result = await council.debate("q", rounds=1) + + assert result.synthesis is None + assert result.synthesis_error is not None + assert "no API key" in result.synthesis_error + + +# --------------------------------------------------------------------------- # +# (e) Prompt-version constant is stable + asserted +# --------------------------------------------------------------------------- # + + +def test_prompt_version_is_a_stable_nonempty_string(): + """The version tag is a non-empty string and re-exported consistently.""" + assert 
isinstance(SYNTHESIS_PROMPT_VERSION, str) + assert SYNTHESIS_PROMPT_VERSION + # council re-exports the same object the prompts module owns. + assert COUNCIL_VERSION == SYNTHESIS_PROMPT_VERSION + + +def test_prompt_version_is_pinned(): + """Pin the exact version so a prompt change without a version bump fails CI. + + This is the tripwire: editing any synthesizer-facing prompt below WITHOUT + bumping ``SYNTHESIS_PROMPT_VERSION`` leaves this assertion (and the prompt-text + pins) inconsistent, so the change cannot land silently. + """ + assert SYNTHESIS_PROMPT_VERSION == "2026-06-14" + + +def test_synthesis_prompt_text_is_pinned(): + """Pin the synthesize/debate/judge prompt wording. + + Guards the happy-path output contract: the synthesizer-facing prompts are + byte-stable. Any intentional edit must update this test AND bump + ``SYNTHESIS_PROMPT_VERSION`` (see ``test_prompt_version_is_pinned``). + """ + assert _SYNTH_SYSTEM.startswith("You are the synthesizer of a council of AI models.") + assert "rely only on the answers provided" in _SYNTH_SYSTEM + assert DEBATE_FINAL_SYSTEM.startswith("You are the synthesizer concluding") + assert JUDGE_SYSTEM.startswith("You are the judge of an adversarial review.") + + +def test_every_result_carries_the_prompt_version(): + """A bare CouncilResult defaults prompt_version to the current tag.""" + result = CouncilResult(prompt="x") + assert result.prompt_version == SYNTHESIS_PROMPT_VERSION + + +async def test_live_run_stamps_prompt_version(monkeypatch, patch_call_model): + """A real synthesize run stamps the version onto the result (and into JSON).""" + _all_keys(monkeypatch) + + def handler(model, messages, **kwargs): + if model == "anthropic/claude-sonnet-4-6" and _system_text(messages) == _SYNTH_SYSTEM: + return make_response("MERGE") + return make_response(f"answer from {model}") + + patch_call_model(handler) + + council = Council(models=["grok", "gemini"], synthesizer="claude", config=_config()) + result = await 
council.ask("q") + + assert result.prompt_version == SYNTHESIS_PROMPT_VERSION + # The version survives JSON serialization for downstream eval pipelines. + assert result.model_dump(mode="json")["prompt_version"] == SYNTHESIS_PROMPT_VERSION + + +@pytest.mark.parametrize("mode", ["raw", "debate", "adversarial"]) +def test_prompt_version_stamped_in_every_mode(monkeypatch, patch_call_model, mode): + """Every mode's result carries prompt_version, even when synthesis does not run.""" + _all_keys(monkeypatch) + + def handler(model, messages, **kwargs): + return make_response(f"answer {model}") + + patch_call_model(handler) + + council = Council(models=["grok", "gemini"], synthesizer="claude", config=_config()) + if mode == "raw": + result = council.ask_sync("q", synthesize=False) + elif mode == "debate": + result = council.debate_sync("q", rounds=1) + else: + result = council.adversarial_sync("q") + + assert result.prompt_version == SYNTHESIS_PROMPT_VERSION From e0cf13838218e712ef76da7b228a70422e5efcfc Mon Sep 17 00:00:00 2001 From: ernestprovo23 Date: Sun, 14 Jun 2026 13:16:30 -0400 Subject: [PATCH 2/2] test(logging): make handler-count assertions robust to pytest 9.x log capture CI on this branch resolved pytest 9.1.0 (deps are pinned >=8.0.0; main's last green run predates the 9.1.0 release). pytest 9.x attaches its LogCaptureHandler (a StreamHandler subclass) directly to the non-propagating `conclave` logger during a run, so `len(logger.handlers) == 1` now sees 3 handlers and test_logging.py's one-shot-configuration assertions fail across 3.11/3.12/3.13. Count only conclave's own handler via `type(h) is logging.StreamHandler` (pytest's is a subclass) instead of all handlers. This preserves the test's intent exactly -- the factory installs one StreamHandler and never duplicates it -- while ignoring pytest-injected capture handlers, and is stable across pytest versions. No production code changed. 
--- tests/test_logging.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/test_logging.py b/tests/test_logging.py index bc413d9..388fecc 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -19,6 +19,21 @@ from conclave.logging import get_logger +def _own_handlers(logger: logging.Logger) -> list[logging.Handler]: + """Return only the handlers conclave installs, excluding pytest's capture ones. + + ``get_logger`` installs exactly one plain ``logging.StreamHandler`` on the + ``conclave`` root. Because that logger sets ``propagate = False``, pytest's + log-capture machinery attaches its own handlers (``LogCaptureHandler``, a + *subclass* of ``StreamHandler``) directly to it during a run -- the count of + which varies by pytest version. Selecting by exact type (``type(h) is + StreamHandler``) counts conclave's handler alone and ignores any injected + capture handler, so the one-shot-configuration assertions stay precise and + robust across pytest versions (pytest 9.x attaches more than older lines did). + """ + return [h for h in logger.handlers if type(h) is logging.StreamHandler] + + @pytest.fixture def fresh_logging(monkeypatch): """Reset the one-shot logger config so a fresh get_logger() reconfigures. 
@@ -54,8 +69,9 @@ def test_default_level_is_warning_when_env_unset(fresh_logging, monkeypatch): assert logger.name == "conclave" assert logger.level == logging.WARNING assert logger.propagate is False - assert len(logger.handlers) == 1 - assert isinstance(logger.handlers[0], logging.StreamHandler) + own = _own_handlers(logger) + assert len(own) == 1 + assert isinstance(own[0], logging.StreamHandler) def test_env_var_sets_level_case_insensitively(fresh_logging, monkeypatch): @@ -95,7 +111,7 @@ def test_configuration_happens_once(fresh_logging, monkeypatch): monkeypatch.setenv("CONCLAVE_LOG_LEVEL", "ERROR") first = get_logger() - assert len(first.handlers) == 1 + assert len(_own_handlers(first)) == 1 assert logging_mod._CONFIGURED is True # Changing the env now must have no effect -- the guard short-circuits. @@ -103,5 +119,5 @@ def test_configuration_happens_once(fresh_logging, monkeypatch): second = get_logger() assert second is first - assert len(second.handlers) == 1 # not duplicated + assert len(_own_handlers(second)) == 1 # not duplicated assert second.level == logging.ERROR # unchanged from first config