From 57a5f6c8c6ffde82b5fd279ec590eb22afca8b2e Mon Sep 17 00:00:00 2001 From: Ray Walker Date: Mon, 15 Jun 2026 17:38:40 +1000 Subject: [PATCH 1/2] fix(cachekitio): surface HTTP 413 as a clear permanent "value too large" error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the SaaS rejects an oversized value with 413, classify it PERMANENT (it already would via the generic 4xx branch) with an actionable message instead of "Client error: HTTP 413", so the graceful-degrade log reads "value too large" rather than a generic client error. Retrying never helps — the value must shrink — so the decorator degrades: runs uncached, once. Adds a unit regression (413 -> PERMANENT, message mentions "too large") and multi-MB SaaS E2E tests (chunked round-trip, overwrite-shrink, 413 ceiling) that exercise the server-side chunking landing in the SaaS backend. The E2E tests are sdk_e2e-marked and run against a live worker, not in default CI. --- .../backends/cachekitio/error_handler.py | 14 ++++ .../saas/test_sdk_data_handling.py | 81 +++++++++++++++++++ .../backends/test_cachekitio_error_handler.py | 9 +++ 3 files changed, 104 insertions(+) diff --git a/src/cachekit/backends/cachekitio/error_handler.py b/src/cachekit/backends/cachekitio/error_handler.py index 45bcaca..243752e 100644 --- a/src/cachekit/backends/cachekitio/error_handler.py +++ b/src/cachekit/backends/cachekitio/error_handler.py @@ -35,6 +35,7 @@ def classify_http_error( Classification rules: - HTTP 401/403: AUTHENTICATION (alert ops, don't retry) - HTTP 429: TRANSIENT (rate limit, exponential backoff) + - HTTP 413: PERMANENT (value too large — retrying never helps) - HTTP 5xx: TRANSIENT (server error, retry) - HTTP 4xx: PERMANENT (client error, don't retry) - TimeoutException: TIMEOUT (configurable retry) @@ -75,6 +76,19 @@ def classify_http_error( key=key, ) + # PERMANENT: value too large. 
A 413 would already classify PERMANENT via the generic + # 4xx branch below — this dedicated branch exists only to give an ACTIONABLE message + # ("value too large") instead of "Client error: HTTP 413". Retrying never helps (the + # value must shrink), so the decorator degrades: runs uncached, once. + if status == 413: + return BackendError( + "Value too large for cachekit.io backend (HTTP 413): value exceeds the server's maximum cache value size", + error_type=BackendErrorType.PERMANENT, + original_exception=exc, + operation=operation, + key=key, + ) + # PERMANENT: Client errors (don't retry) if 400 <= status < 500: return BackendError( diff --git a/tests/integration/saas/test_sdk_data_handling.py b/tests/integration/saas/test_sdk_data_handling.py index d3a2bb4..3e30922 100644 --- a/tests/integration/saas/test_sdk_data_handling.py +++ b/tests/integration/saas/test_sdk_data_handling.py @@ -174,6 +174,87 @@ def get_large_value(): assert len(result2) == len(large_string) +# ============================================================================ +# Multi-MB / value-chunking tests (P1) — regression for the evidence-explorer +# incident: a ~13.5 MB @cache.io value 500'd on PUT because the SaaS stored each +# value in a single 2 MB-capped SQLite cell. The backend now chunks transparently. +# These use INCOMPRESSIBLE random bytes so the *stored* payload actually exceeds +# 2 MB — a repetitive string would LZ4-compress to ~KB and never chunk. 
+# ============================================================================ + + +def _size_limit_key(namespace: str, name: str) -> str: + """Build a SaaS-valid cache key: ns::func::args::.""" + import hashlib + + args_hash = hashlib.blake2b(name.encode(), digest_size=32).hexdigest() + return f"ns:{namespace}:func:tests.e2e.size_limits.{name}:args:{args_hash}:1s" + + +def test_chunked_value_roundtrip(cache_io_decorator, clean_cache): + """A value larger than the backend's 2 MB cell limit round-trips intact through the + full SDK → Worker → DO path via transparent server-side chunking. + + Priority: P1 + """ + import os + + blob = os.urandom(13_500_000) # ~13.5 MB, incompressible → forces chunking + + @cache_io_decorator + def get_blob(): + return blob + + assert get_blob() == blob # miss → compute → chunked PUT + assert get_blob() == blob # hit → exact bytes preserved across reassembly + + +def test_chunked_roundtrip_and_overwrite_via_http(http_client, sdk_config, unique_namespace): + """Direct HTTP (bypasses SDK L1/serialization): a >2 MB value chunk-stores and + reassembles byte-identically, and overwriting it with a small value leaves no stale + chunks (GET returns exactly the small value — proves chunk cleanup on overwrite). + + Priority: P1 + """ + import os + + # Raw key in the path (colons unencoded) — matches how the SDK builds the URL + # (backend.py: f"/v1/cache/{key}"); the worker splits the path on literal ':'. + # (Percent-encoding the colons makes the worker see 1 component → 400.) 
+ key = _size_limit_key(unique_namespace, "roundtrip") + url = f"{sdk_config['api_url']}/v1/cache/{key}" + headers = {"Content-Type": "application/octet-stream", "X-TTL": "300"} + + big = os.urandom(13_500_000) + put = http_client.put(url, data=big, headers=headers) + assert put.status_code == 200, put.text + got = http_client.get(url) + assert got.status_code == 200 + assert got.content == big # chunk reassembly is byte-exact + + # Overwrite large → small: stale chunks must be cleared (no orphans, no corruption). + small = os.urandom(1024) + put2 = http_client.put(url, data=small, headers=headers) + assert put2.status_code == 200, put2.text + got2 = http_client.get(url) + assert got2.status_code == 200 + assert got2.content == small + + +def test_oversized_value_rejected_with_413(http_client, sdk_config, unique_namespace): + """A value above the 25 MB ceiling is rejected with a clean 413 (a permanent error), + NOT a 500 that the SDK would mis-classify as transient and retry. + + Priority: P1 + """ + # Raw key in the path (colons unencoded) — see test_chunked_roundtrip_and_overwrite_via_http. 
+ key = _size_limit_key(unique_namespace, "oversized") + url = f"{sdk_config['api_url']}/v1/cache/{key}" + body = b"\x00" * (26 * 1024 * 1024) # 26 MB > 25 MB ceiling (raw HTTP — not compressed) + resp = http_client.put(url, data=body, headers={"Content-Type": "application/octet-stream"}) + assert resp.status_code == 413 + + # ============================================================================ # Pydantic and Dataclass Tests (P1) # ============================================================================ diff --git a/tests/unit/backends/test_cachekitio_error_handler.py b/tests/unit/backends/test_cachekitio_error_handler.py index 1111126..ea53303 100644 --- a/tests/unit/backends/test_cachekitio_error_handler.py +++ b/tests/unit/backends/test_cachekitio_error_handler.py @@ -40,6 +40,15 @@ def test_client_errors_are_permanent(self, status: int) -> None: result = classify_http_error(exc, response=_response(status)) assert result.error_type == BackendErrorType.PERMANENT + def test_value_too_large_413_is_permanent(self) -> None: + # Regression: a too-large value previously surfaced as a 500 → TRANSIENT and was + # retried 3× before a silent graceful-degrade. 413 must be PERMANENT (retrying + # never helps — the value must shrink) with a clear "too large" message. 
+ exc = Exception("payload too large") + result = classify_http_error(exc, response=_response(413), operation="put") + assert result.error_type == BackendErrorType.PERMANENT + assert "too large" in str(result).lower() + class TestNetworkExceptionClassification: """Tests for network-level exception → error type mapping.""" From 9fd3583c039a10fae8fbc4618621b0d325f8f46f Mon Sep 17 00:00:00 2001 From: Ray Walker Date: Mon, 15 Jun 2026 18:32:49 +1000 Subject: [PATCH 2/2] test: describe large-value tests by SDK-observable behavior (remove backend internals) --- .../saas/test_sdk_data_handling.py | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/integration/saas/test_sdk_data_handling.py b/tests/integration/saas/test_sdk_data_handling.py index 3e30922..627c261 100644 --- a/tests/integration/saas/test_sdk_data_handling.py +++ b/tests/integration/saas/test_sdk_data_handling.py @@ -175,52 +175,50 @@ def get_large_value(): # ============================================================================ -# Multi-MB / value-chunking tests (P1) — regression for the evidence-explorer -# incident: a ~13.5 MB @cache.io value 500'd on PUT because the SaaS stored each -# value in a single 2 MB-capped SQLite cell. The backend now chunks transparently. -# These use INCOMPRESSIBLE random bytes so the *stored* payload actually exceeds -# 2 MB — a repetitive string would LZ4-compress to ~KB and never chunk. +# Large-value tests (P1) — multi-MB values must round-trip intact through the SDK, +# and values above the API's maximum size must be rejected with a permanent 413. +# These use INCOMPRESSIBLE random bytes so the payload is genuinely multi-MB on the +# wire (a repetitive string would compress to ~KB and never exercise the large path). 
# ============================================================================ def _size_limit_key(namespace: str, name: str) -> str: - """Build a SaaS-valid cache key: ns::func::args::.""" + """Build a valid cache key: ns:{namespace}:func:{function}:args:{args_hash}:{suffix}.""" import hashlib args_hash = hashlib.blake2b(name.encode(), digest_size=32).hexdigest() return f"ns:{namespace}:func:tests.e2e.size_limits.{name}:args:{args_hash}:1s" -def test_chunked_value_roundtrip(cache_io_decorator, clean_cache): - """A value larger than the backend's 2 MB cell limit round-trips intact through the - full SDK → Worker → DO path via transparent server-side chunking. +def test_large_value_roundtrip(cache_io_decorator, clean_cache): + """A multi-MB value round-trips byte-identically through the SDK. Priority: P1 """ import os - blob = os.urandom(13_500_000) # ~13.5 MB, incompressible → forces chunking + blob = os.urandom(13_500_000) # ~13.5 MB, incompressible → genuinely multi-MB on the wire @cache_io_decorator def get_blob(): return blob - assert get_blob() == blob # miss → compute → chunked PUT - assert get_blob() == blob # hit → exact bytes preserved across reassembly + assert get_blob() == blob # miss → compute → store + assert get_blob() == blob # hit → exact bytes preserved -def test_chunked_roundtrip_and_overwrite_via_http(http_client, sdk_config, unique_namespace): - """Direct HTTP (bypasses SDK L1/serialization): a >2 MB value chunk-stores and - reassembles byte-identically, and overwriting it with a small value leaves no stale - chunks (GET returns exactly the small value — proves chunk cleanup on overwrite). +def test_large_value_roundtrip_and_overwrite_via_http(http_client, sdk_config, unique_namespace): + """Direct HTTP (bypasses SDK L1/serialization): a multi-MB value round-trips + byte-identically, and overwriting it with a small value returns exactly the small + value (no stale bytes).
Priority: P1 """ import os # Raw key in the path (colons unencoded) — matches how the SDK builds the URL - # (backend.py: f"/v1/cache/{key}"); the worker splits the path on literal ':'. - # (Percent-encoding the colons makes the worker see 1 component → 400.) + # (backend.py: f"/v1/cache/{key}"); the API splits the path on literal ':'. + # (Percent-encoding the colons fails key-format validation.) key = _size_limit_key(unique_namespace, "roundtrip") url = f"{sdk_config['api_url']}/v1/cache/{key}" headers = {"Content-Type": "application/octet-stream", "X-TTL": "300"} @@ -230,9 +228,9 @@ def test_chunked_roundtrip_and_overwrite_via_http(http_client, sdk_config, uniqu assert put.status_code == 200, put.text got = http_client.get(url) assert got.status_code == 200 - assert got.content == big # chunk reassembly is byte-exact + assert got.content == big # round-trip is byte-exact - # Overwrite large → small: stale chunks must be cleared (no orphans, no corruption). + # Overwrite large → small: GET must return exactly the small value (no stale bytes). small = os.urandom(1024) put2 = http_client.put(url, data=small, headers=headers) assert put2.status_code == 200, put2.text @@ -242,15 +240,15 @@ def test_chunked_roundtrip_and_overwrite_via_http(http_client, sdk_config, uniqu def test_oversized_value_rejected_with_413(http_client, sdk_config, unique_namespace): - """A value above the 25 MB ceiling is rejected with a clean 413 (a permanent error), - NOT a 500 that the SDK would mis-classify as transient and retry. + """A value above the API's maximum size is rejected with a clean, permanent 413 + (not a 500 the SDK would mis-classify as transient and retry). Priority: P1 """ - # Raw key in the path (colons unencoded) — see test_chunked_roundtrip_and_overwrite_via_http. + # Raw key in the path (colons unencoded) — see test_large_value_roundtrip_and_overwrite_via_http. 
key = _size_limit_key(unique_namespace, "oversized") url = f"{sdk_config['api_url']}/v1/cache/{key}" - body = b"\x00" * (26 * 1024 * 1024) # 26 MB > 25 MB ceiling (raw HTTP — not compressed) + body = b"\x00" * (26 * 1024 * 1024) # exceeds the API maximum value size resp = http_client.put(url, data=body, headers={"Content-Type": "application/octet-stream"}) assert resp.status_code == 413