From a14b2a375ab1cabdbc355daf9688de7e156ef244 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 13 May 2026 00:13:46 -0400 Subject: [PATCH 1/2] Update US release bundle --- changelog.d/us-ss-spm-release.changed | 1 + pyproject.toml | 4 +- .../data/release_manifests/us.json | 28 ++++----- .../release_manifests/us.trace.tro.jsonld | 39 ++++++------ src/policyengine/provenance/bundle.py | 7 +++ src/policyengine/provenance/manifest.py | 21 +++++-- src/policyengine/provenance/trace.py | 10 +-- .../us_model_surface.json | 2 +- tests/test_bundle_refresh.py | 8 +++ tests/test_models.py | 6 +- tests/test_release_manifests.py | 63 ++++++++++++++----- tests/test_trace_tro.py | 25 ++++++++ tests/test_us_regions.py | 6 +- uv.lock | 12 ++-- 14 files changed, 154 insertions(+), 78 deletions(-) create mode 100644 changelog.d/us-ss-spm-release.changed diff --git a/changelog.d/us-ss-spm-release.changed b/changelog.d/us-ss-spm-release.changed new file mode 100644 index 00000000..49942632 --- /dev/null +++ b/changelog.d/us-ss-spm-release.changed @@ -0,0 +1 @@ +Update the bundled US release to policyengine-us 1.691.3 and policyengine-us-data 1.113.1. diff --git a/pyproject.toml b/pyproject.toml index b7a1ae9c..dd949fca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ uk = [ ] us = [ "policyengine_core>=3.26.1", - "policyengine-us==1.690.7", + "policyengine-us==1.691.3", ] dev = [ "pytest", @@ -61,7 +61,7 @@ dev = [ "ruff>=0.9.0", "policyengine_core>=3.26.1", "policyengine-uk==2.88.14", - "policyengine-us==1.690.7", + "policyengine-us==1.691.3", "towncrier>=24.8.0", "mypy>=1.11.0", "pytest-cov>=5.0.0", diff --git a/src/policyengine/data/release_manifests/us.json b/src/policyengine/data/release_manifests/us.json index feb8ad1d..2f948346 100644 --- a/src/policyengine/data/release_manifests/us.json +++ b/src/policyengine/data/release_manifests/us.json @@ -5,34 +5,34 @@ "policyengine_version": "4.4.3", "model_package": { "name": "policyengine-us", - "version": "1.690.7", - "sha256": "5a7a541efabac98fa069d6845902cf5924c81db67383234b55dcd2b8bfcfc3ca", - "wheel_url": "https://files.pythonhosted.org/packages/2a/02/52109bae5f4767237b43bd72ce0bc4edf7925650a788053b2bc168caa5ae/policyengine_us-1.690.7-py3-none-any.whl" + "version": "1.691.3", + "sha256": "c5d37aa4442f23d48bd5d587a02876c89d83c6135809f12988cc39bd3a47e8b2", + "wheel_url": "https://files.pythonhosted.org/packages/2a/03/e21c872664f90dcc99f1fcf29d1da71409c50cf8a7798ff0596ad10d9400/policyengine_us-1.691.3-py3-none-any.whl" }, "data_package": { "name": "policyengine-us-data", - "version": "1.110.12", + "version": "1.113.1", "repo_id": "policyengine/policyengine-us-data", - "release_manifest_path": "releases/1.110.12/release_manifest.json", - "release_manifest_revision": "3aac4505ec10d31efc1b3799a1e6458a15853ecc" + "release_manifest_path": "releases/1.113.1/release_manifest.json", + "release_manifest_revision": "99e0ec7e784cdba43dd21ff1d80a081599a7a537" }, "certified_data_artifact": { "data_package": { "name": "policyengine-us-data", - "version": "1.110.12" + "version": "1.113.1" }, - "build_id": "policyengine-us-data-1.110.12", + "build_id": "policyengine-us-data-1.113.1", "dataset": "enhanced_cps_2024", - "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.110.12", - "sha256": "58a6639f7511b8d804701417e2647f0c3a77f51a3d90441037eaf004b1f00761" + "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@99e0ec7e784cdba43dd21ff1d80a081599a7a537", + "sha256": "0ce549af18753287c097718362b8cd5cdccfc47953acf7f282709d604cf314d2" }, "certification": { "compatibility_basis": "exact_build_model_version", - "data_build_id": "policyengine-us-data-1.110.12", - "built_with_model_version": "1.690.7", - "certified_for_model_version": "1.690.7", + "data_build_id": "policyengine-us-data-1.113.1", + "built_with_model_version": "1.691.3", + "certified_for_model_version": "1.691.3", "certified_by": "policyengine.py bundled manifest", - "data_build_fingerprint": "sha256:9961ed1c5d00943a360724da560eee425eb9f99f91896f053dca74724c46e96e" + "data_build_fingerprint": "sha256:d891044ece8ec3338904771c879b98ec11a12f1090c074e5b8cee846825d8056" }, "default_dataset": "enhanced_cps_2024", "datasets": { diff --git a/src/policyengine/data/release_manifests/us.trace.tro.jsonld b/src/policyengine/data/release_manifests/us.trace.tro.jsonld index 87fb3aec..2bc386d7 100644 --- a/src/policyengine/data/release_manifests/us.trace.tro.jsonld +++ b/src/policyengine/data/release_manifests/us.trace.tro.jsonld @@ -17,7 +17,7 @@ "schema:name": "PolicyEngine", "schema:url": "https://policyengine.org" }, - "schema:dateCreated": "2026-05-11T18:53:05.508006Z", + "schema:dateCreated": "2026-05-13T03:56:37.150215Z", "schema:description": "TRACE TRO for certified runtime bundle us-4.4.3 covering the bundle manifest, the certified dataset artifact, the country model wheel, and the country data release manifest when it is available.", "schema:name": "policyengine us certified bundle TRO", "trov:createdWith": { @@ -45,7 +45,7 @@ "trov:hasArtifact": { "@id": "composition/1/artifact/data_release_manifest" }, - "trov:hasLocation": "https://huggingface.co/policyengine/policyengine-us-data/resolve/3aac4505ec10d31efc1b3799a1e6458a15853ecc/releases/1.110.12/release_manifest.json" + "trov:hasLocation": "https://huggingface.co/policyengine/policyengine-us-data/resolve/99e0ec7e784cdba43dd21ff1d80a081599a7a537/releases/1.113.1/release_manifest.json" }, { "@id": "arrangement/1/location/dataset", @@ -53,7 +53,7 @@ "trov:hasArtifact": { "@id": "composition/1/artifact/dataset" }, - "trov:hasLocation": "https://huggingface.co/policyengine/policyengine-us-data/resolve/1.110.12/enhanced_cps_2024.h5" + "trov:hasLocation": "https://huggingface.co/policyengine/policyengine-us-data/resolve/99e0ec7e784cdba43dd21ff1d80a081599a7a537/enhanced_cps_2024.h5" }, { "@id": "arrangement/1/location/model_wheel", @@ -61,7 +61,7 @@ "trov:hasArtifact": { "@id": "composition/1/artifact/model_wheel" }, - "trov:hasLocation": "https://files.pythonhosted.org/packages/2a/02/52109bae5f4767237b43bd72ce0bc4edf7925650a788053b2bc168caa5ae/policyengine_us-1.690.7-py3-none-any.whl" + "trov:hasLocation": "https://files.pythonhosted.org/packages/2a/03/e21c872664f90dcc99f1fcf29d1da71409c50cf8a7798ff0596ad10d9400/policyengine_us-1.691.3-py3-none-any.whl" } ] } @@ -75,54 +75,51 @@ "@type": "trov:ResearchArtifact", "schema:name": "policyengine.py bundle manifest for us", "trov:mimeType": "application/json", - "trov:sha256": "41e196a6263b8168d403058029c52ebab795e17024ac9ebef11ff876e36959e2" + "trov:sha256": "67d4d6505bed4af9bf2ec575d8b037e36be71b2f9a5afa9bb8cc695ec7a1e913" }, { "@id": "composition/1/artifact/data_release_manifest", "@type": "trov:ResearchArtifact", - "schema:name": "policyengine-us-data release manifest 1.110.12", + "schema:name": "policyengine-us-data release manifest 1.113.1", "trov:mimeType": "application/json", - "trov:sha256": "17cfd2fbb31064834ed82c0fd7d8ae5c272fe7f24b1e48b226a4acf97ff4c5dd" + "trov:sha256": "f73ccb685ee98ca7e9c4c3d30ff5f02f2b869ea8298b2be466baabdb1d18ec42" }, { "@id": "composition/1/artifact/dataset", "@type": "trov:ResearchArtifact", "schema:name": "enhanced_cps_2024", "trov:mimeType": "application/x-hdf5", - "trov:sha256": "58a6639f7511b8d804701417e2647f0c3a77f51a3d90441037eaf004b1f00761" + "trov:sha256": "0ce549af18753287c097718362b8cd5cdccfc47953acf7f282709d604cf314d2" }, { "@id": "composition/1/artifact/model_wheel", "@type": "trov:ResearchArtifact", - "schema:name": "policyengine-us==1.690.7 wheel", + "schema:name": "policyengine-us==1.691.3 wheel", "trov:mimeType": "application/zip", - "trov:sha256": "5a7a541efabac98fa069d6845902cf5924c81db67383234b55dcd2b8bfcfc3ca" + "trov:sha256": "c5d37aa4442f23d48bd5d587a02876c89d83c6135809f12988cc39bd3a47e8b2" } ], "trov:hasFingerprint": { "@id": "composition/1/fingerprint", "@type": "trov:CompositionFingerprint", - "trov:sha256": "b84e895b3f19ffee5ec299b94ae2155448a12af2b3dc61d00f3d17003ecdf14a" + "trov:sha256": "fdd760bdc354160c8f4fbf452ecd78ab24cfcb0fba9c50332a66ae74584f32e2" } }, "trov:hasPerformance": { "@id": "trp/1", "@type": "trov:TransparentResearchPerformance", - "pe:builtWithModelVersion": "1.690.7", + "pe:builtWithModelVersion": "1.691.3", "pe:certifiedBy": "policyengine.py bundled manifest", - "pe:certifiedForModelVersion": "1.690.7", - "pe:ciGitRef": "refs/heads/main", - "pe:ciGitSha": "1718b493e4749faf62f0ffdff480205abdd20011", - "pe:ciRunUrl": "https://github.com/PolicyEngine/policyengine.py/actions/runs/25692889721", + "pe:certifiedForModelVersion": "1.691.3", "pe:compatibilityBasis": "exact_build_model_version", - "pe:dataBuildFingerprint": "sha256:9961ed1c5d00943a360724da560eee425eb9f99f91896f053dca74724c46e96e", - "pe:dataBuildId": "policyengine-us-data-1.110.12", - "pe:emittedIn": "github-actions", - "rdfs:comment": "Certification of build policyengine-us-data-1.110.12 for policyengine-us 1.690.7.", + "pe:dataBuildFingerprint": "sha256:d891044ece8ec3338904771c879b98ec11a12f1090c074e5b8cee846825d8056", + "pe:dataBuildId": "policyengine-us-data-1.113.1", + "pe:emittedIn": "local", + "rdfs:comment": "Certification of build policyengine-us-data-1.113.1 for policyengine-us 1.691.3.", "trov:accessedArrangement": { "@id": "arrangement/1" }, - "trov:startedAtTime": "2026-05-11T18:53:05.508006Z", + "trov:startedAtTime": "2026-05-13T03:56:37.150215Z", "trov:wasConductedBy": { "@id": "trs" } diff --git a/src/policyengine/provenance/bundle.py b/src/policyengine/provenance/bundle.py index af11449c..a8f5f476 100644 --- a/src/policyengine/provenance/bundle.py +++ b/src/policyengine/provenance/bundle.py @@ -395,6 +395,13 @@ def refresh_release_bundle( dataset_repo_id = data_artifact_json.get("repo_id", repo_id) dataset_path = data_artifact_json.get("path", dataset_path) dataset_revision = data_artifact_json.get("revision", new_data) + if ( + release_manifest_json is not None + and new_release_manifest_revision is not None + and dataset_repo_id == repo_id + and dataset_revision == new_data + ): + dataset_revision = new_release_manifest_revision # Only hit HF if the data version actually changed. if new_data != old_data: diff --git a/src/policyengine/provenance/manifest.py b/src/policyengine/provenance/manifest.py index 061b0d20..c8ada970 100644 --- a/src/policyengine/provenance/manifest.py +++ b/src/policyengine/provenance/manifest.py @@ -180,9 +180,13 @@ def https_dataset_uri(repo_id: str, path_in_repo: str, revision: str) -> str: return f"https://huggingface.co/{repo_id}/resolve/{revision}/{path_in_repo}" +def _artifact_revision(data_package: "DataPackageVersion") -> str: + return data_package.release_manifest_revision or data_package.version + + def https_release_manifest_uri(data_package: "DataPackageVersion") -> str: """Return a dereferenceable HTTPS URI for a data release manifest.""" - revision = data_package.release_manifest_revision or data_package.version + revision = _artifact_revision(data_package) return ( f"https://huggingface.co/{data_package.repo_id}/resolve/" f"{revision}/{data_package.release_manifest_path}" @@ -267,7 +271,16 @@ def get_data_release_manifest(country_id: str) -> DataReleaseManifest: raise DataReleaseManifestUnavailableError( "Could not fetch the data release manifest from Hugging Face." ) from exc - return DataReleaseManifest.model_validate_json(response.text) + data_release_manifest = DataReleaseManifest.model_validate_json(response.text) + release_revision = country_manifest.data_package.release_manifest_revision + if release_revision is not None: + for artifact in data_release_manifest.artifacts.values(): + if ( + artifact.repo_id == country_manifest.data_package.repo_id + and artifact.revision == country_manifest.data_package.version + ): + artifact.revision = release_revision + return data_release_manifest def _specifier_matches(version: str, specifier: str) -> bool: @@ -404,7 +417,7 @@ def resolve_dataset_reference(country_id: str, dataset: str) -> str: return build_hf_uri( repo_id=manifest.data_package.repo_id, path_in_repo=path_reference.path, - revision=manifest.data_package.version, + revision=_artifact_revision(manifest.data_package), ) data_release_manifest = get_data_release_manifest(country_id) @@ -525,5 +538,5 @@ def resolve_region_dataset_path( return build_hf_uri( repo_id=manifest.data_package.repo_id, path_in_repo=resolved_path, - revision=manifest.data_package.version, + revision=_artifact_revision(manifest.data_package), ) diff --git a/src/policyengine/provenance/trace.py b/src/policyengine/provenance/trace.py index 341addb5..434ffa10 100644 --- a/src/policyengine/provenance/trace.py +++ b/src/policyengine/provenance/trace.py @@ -353,15 +353,7 @@ def build_trace_tro_from_release_bundle( if data_release_manifest is not None else None ) - dataset_location = ( - https_dataset_uri( - repo_id=dataset_artifact.repo_id, - path_in_repo=dataset_artifact.path, - revision=dataset_artifact.revision, - ) - if dataset_artifact is not None - else _dataset_location_from_uri(certified_artifact.uri) - ) + dataset_location = _dataset_location_from_uri(certified_artifact.uri) bundle_manifest_hash = hashlib.sha256( canonical_json_bytes(country_manifest.model_dump(mode="json")) diff --git a/tests/fixtures/household_calculator_snapshots/us_model_surface.json b/tests/fixtures/household_calculator_snapshots/us_model_surface.json index 076b79b1..fd14a29c 100644 --- a/tests/fixtures/household_calculator_snapshots/us_model_surface.json +++ b/tests/fixtures/household_calculator_snapshots/us_model_surface.json @@ -5,7 +5,7 @@ "has_income_tax": true, "has_region_registry": true, "model_package_name": "policyengine-us", - "num_parameters_bucketed_100s": 851, + "num_parameters_bucketed_100s": 852, "num_variables_bucketed_100s": 48, "region_registry_country": "us" } diff --git a/tests/test_bundle_refresh.py b/tests/test_bundle_refresh.py index 7a9c8581..ae8f2305 100644 --- a/tests/test_bundle_refresh.py +++ b/tests/test_bundle_refresh.py @@ -306,6 +306,10 @@ def fake_urlopen(request, *args, **kwargs): written["data_package"]["release_manifest_revision"] == "release-manifest-commit-sha" ) + assert ( + written["certified_data_artifact"]["uri"] + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@release-manifest-commit-sha" + ) def test__bump_data_only_falls_back_to_main_for_release_manifest( @@ -337,6 +341,10 @@ def fake_urlopen(request, *args, **kwargs): written["data_package"]["release_manifest_revision"] == "release-manifest-commit-sha" ) + assert ( + written["certified_data_artifact"]["uri"] + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@release-manifest-commit-sha" + ) def test__release_manifest_version_mismatch_raises(sandbox) -> None: diff --git a/tests/test_models.py b/tests/test_models.py index 5b4c2e7b..577b9886 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -113,12 +113,12 @@ def test_has_release_manifest_metadata(self): assert us_latest.release_manifest is not None assert us_latest.release_manifest.country_id == "us" assert us_latest.model_package.name == "policyengine-us" - assert us_latest.model_package.version == "1.690.7" + assert us_latest.model_package.version == "1.691.3" assert us_latest.data_package.name == "policyengine-us-data" - assert us_latest.data_package.version == "1.110.12" + assert us_latest.data_package.version == "1.113.1" assert ( us_latest.default_dataset_uri - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.110.12" + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@99e0ec7e784cdba43dd21ff1d80a081599a7a537" ) def test_has_hundreds_of_parameters(self): diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index 480ee41b..de5b7173 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -62,27 +62,27 @@ def test__given_us_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.country_id == "us" assert manifest.policyengine_version == POLICYENGINE_VERSION assert manifest.model_package.name == "policyengine-us" - assert manifest.model_package.version == "1.690.7" + assert manifest.model_package.version == "1.691.3" assert manifest.data_package.name == "policyengine-us-data" - assert manifest.data_package.version == "1.110.12" + assert manifest.data_package.version == "1.113.1" assert manifest.data_package.repo_id == "policyengine/policyengine-us-data" assert ( manifest.data_package.release_manifest_path - == "releases/1.110.12/release_manifest.json" + == "releases/1.113.1/release_manifest.json" ) assert ( manifest.data_package.release_manifest_revision - == "3aac4505ec10d31efc1b3799a1e6458a15853ecc" + == "99e0ec7e784cdba43dd21ff1d80a081599a7a537" ) assert manifest.certified_data_artifact is not None assert ( - manifest.certified_data_artifact.build_id == "policyengine-us-data-1.110.12" + manifest.certified_data_artifact.build_id == "policyengine-us-data-1.113.1" ) assert manifest.certified_data_artifact.dataset == "enhanced_cps_2024" assert manifest.certification is not None - assert manifest.certification.data_build_id == "policyengine-us-data-1.110.12" - assert manifest.certification.built_with_model_version == "1.690.7" - assert manifest.certification.certified_for_model_version == "1.690.7" + assert manifest.certification.data_build_id == "policyengine-us-data-1.113.1" + assert manifest.certification.built_with_model_version == "1.691.3" + assert manifest.certification.certified_for_model_version == "1.691.3" def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self): manifest = get_release_manifest("uk") @@ -117,7 +117,7 @@ def test__given_us_dataset_name__then_resolves_to_versioned_hf_url(self): assert ( resolved - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.110.12" + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@99e0ec7e784cdba43dd21ff1d80a081599a7a537" ) def test__given_uk_dataset_name__then_resolves_to_versioned_hf_url(self): @@ -238,8 +238,8 @@ def test__given_country__then_can_fetch_data_release_manifest(self): mock_get.assert_called_once() assert mock_get.call_args.args[0] == ( "https://huggingface.co/policyengine/policyengine-us-data/resolve/" - "3aac4505ec10d31efc1b3799a1e6458a15853ecc/" - "releases/1.110.12/release_manifest.json" + "99e0ec7e784cdba43dd21ff1d80a081599a7a537/" + "releases/1.113.1/release_manifest.json" ) def test__given_explicit_manifest_revision__then_builds_manifest_url(self): @@ -247,8 +247,41 @@ def test__given_explicit_manifest_revision__then_builds_manifest_url(self): assert https_release_manifest_uri(manifest.data_package) == ( "https://huggingface.co/policyengine/policyengine-us-data/resolve/" - "3aac4505ec10d31efc1b3799a1e6458a15853ecc/" - "releases/1.110.12/release_manifest.json" + "99e0ec7e784cdba43dd21ff1d80a081599a7a537/" + "releases/1.113.1/release_manifest.json" + ) + + def test__given_release_manifest_artifact_uses_version_tag__then_rewrites_to_commit( + self, + ): + get_data_release_manifest.cache_clear() + payload = { + "schema_version": 1, + "data_package": { + "name": "policyengine-us-data", + "version": "1.113.1", + }, + "artifacts": { + "enhanced_cps_2024": { + "kind": "microdata", + "path": "enhanced_cps_2024.h5", + "repo_id": "policyengine/policyengine-us-data", + "revision": "1.113.1", + "sha256": "abc", + "size_bytes": 123, + } + }, + } + + with patch( + "policyengine.provenance.manifest.requests.get", + return_value=_response_with_json(payload), + ): + manifest = get_data_release_manifest("us") + + assert ( + manifest.artifacts["enhanced_cps_2024"].uri + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@99e0ec7e784cdba43dd21ff1d80a081599a7a537" ) def test__given_missing_data_release_manifest__then_fetch_raises_unavailable(self): @@ -355,7 +388,7 @@ def test__given_private_manifest_unavailable__then_bundled_certification_is_used ): certification = certify_data_release_compatibility( "us", - runtime_model_version="1.690.7", + runtime_model_version="1.691.3", ) assert certification == get_release_manifest("us").certification @@ -371,7 +404,7 @@ def test__given_manifest_request_timeout__then_bundled_certification_is_used( ): certification = certify_data_release_compatibility( "us", - runtime_model_version="1.690.7", + runtime_model_version="1.691.3", ) assert certification == get_release_manifest("us").certification diff --git a/tests/test_trace_tro.py b/tests/test_trace_tro.py index 1070b9bf..6aab72e8 100644 --- a/tests/test_trace_tro.py +++ b/tests/test_trace_tro.py @@ -269,6 +269,31 @@ def test__given_artifact_locations__then_all_paths_are_https_or_local( for path in paths[1:]: assert path.startswith("https://"), path + def test__given_data_manifest_revision_is_unresolvable__then_dataset_location_uses_certified_artifact( + self, + ): + country_manifest = get_release_manifest("us") + data_manifest = _us_data_release_manifest() + data_manifest.artifacts["enhanced_cps_2024"].revision = "1.113.1" + + tro = build_trace_tro_from_release_bundle( + country_manifest, + data_manifest, + fetch_pypi=_fake_fetch_pypi, + ) + + locations = tro["@graph"][0]["trov:hasArrangement"][0][ + "trov:hasArtifactLocation" + ] + dataset_location = next( + loc for loc in locations if loc["@id"].endswith("dataset") + ) + assert ( + dataset_location["trov:hasLocation"] + == "https://huggingface.co/policyengine/policyengine-us-data/resolve/99e0ec7e784cdba43dd21ff1d80a081599a7a537/enhanced_cps_2024.h5" + ) + assert "/resolve/1.113.1/" not in dataset_location["trov:hasLocation"] + def test__given_certification__then_fields_are_machine_readable( self, us_bundle_tro ): diff --git a/tests/test_us_regions.py b/tests/test_us_regions.py index bd57e83e..6688b0ff 100644 --- a/tests/test_us_regions.py +++ b/tests/test_us_regions.py @@ -105,7 +105,7 @@ def test__given_us_registry__then_has_national_region(self): assert national.region_type == "national" assert ( national.dataset_path - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.110.12" + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@99e0ec7e784cdba43dd21ff1d80a081599a7a537" ) def test__given_us_registry__then_has_51_states(self): @@ -134,7 +134,7 @@ def test__given_california_region__then_has_correct_format(self): assert ca.parent_code == "us" assert ( ca.dataset_path - == "hf://policyengine/policyengine-us-data/states/CA.h5@1.110.12" + == "hf://policyengine/policyengine-us-data/states/CA.h5@99e0ec7e784cdba43dd21ff1d80a081599a7a537" ) assert ca.state_code == "CA" assert ca.state_name == "California" @@ -167,7 +167,7 @@ def test__given_ca_first_district__then_has_correct_format(self): assert ca01.parent_code == "state/ca" assert ( ca01.dataset_path - == "hf://policyengine/policyengine-us-data/districts/CA-01.h5@1.110.12" + == "hf://policyengine/policyengine-us-data/districts/CA-01.h5@99e0ec7e784cdba43dd21ff1d80a081599a7a537" ) assert ca01.state_code == "CA" assert not ca01.requires_filter diff --git a/uv.lock b/uv.lock index 22f8b0d1..469bc909 100644 --- a/uv.lock +++ b/uv.lock @@ -2411,7 +2411,7 @@ wheels = [ [[package]] name = "policyengine" -version = "4.4.2" +version = "4.4.3" source = { editable = "." } dependencies = [ { name = "jsonschema" }, @@ -2480,8 +2480,8 @@ requires-dist = [ { name = "policyengine-core", marker = "extra == 'us'", specifier = ">=3.26.1" }, { name = "policyengine-uk", marker = "extra == 'dev'", specifier = "==2.88.14" }, { name = "policyengine-uk", marker = "extra == 'uk'", specifier = "==2.88.14" }, - { name = "policyengine-us", marker = "extra == 'dev'", specifier = "==1.690.7" }, - { name = "policyengine-us", marker = "extra == 'us'", specifier = "==1.690.7" }, + { name = "policyengine-us", marker = "extra == 'dev'", specifier = "==1.691.3" }, + { name = "policyengine-us", marker = "extra == 'us'", specifier = "==1.691.3" }, { name = "psutil", specifier = ">=5.9.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pytest", marker = "extra == 'dev'" }, @@ -2544,7 +2544,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.690.7" +version = "1.691.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2556,9 +2556,9 @@ dependencies = [ { name = "tables", version = "3.11.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/81/df/60548e64a5ccf5f961a45608c2c6744833daf756c1c82d1e59e5bca14850/policyengine_us-1.690.7.tar.gz", hash = "sha256:3dbb1f54824902fcd6ae64d5879f36ce6b2372a42321c838c20c430fd1507a2e", size = 9479020, upload-time = "2026-05-10T22:27:01.776Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0c/04/5e74890db670d7eb736f8fcb6070e6866950ba2e1d79e1d74f0dc5cfc58c/policyengine_us-1.691.3.tar.gz", hash = "sha256:3475c0e71ebe3396cfa2d138c62eeaa22a912c301d941418238109cf51a89e86", size = 9493068, upload-time = "2026-05-12T16:54:08.94Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/02/52109bae5f4767237b43bd72ce0bc4edf7925650a788053b2bc168caa5ae/policyengine_us-1.690.7-py3-none-any.whl", hash = "sha256:5a7a541efabac98fa069d6845902cf5924c81db67383234b55dcd2b8bfcfc3ca", size = 9985671, upload-time = "2026-05-10T22:26:58.843Z" }, + { url = "https://files.pythonhosted.org/packages/2a/03/e21c872664f90dcc99f1fcf29d1da71409c50cf8a7798ff0596ad10d9400/policyengine_us-1.691.3-py3-none-any.whl", hash = "sha256:c5d37aa4442f23d48bd5d587a02876c89d83c6135809f12988cc39bd3a47e8b2", size = 10008634, upload-time = "2026-05-12T16:54:05.956Z" }, ] [[package]] From 51f8137757a8d75b5e6662de45760bb5c0e765f6 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 13 May 2026 08:14:39 -0400 Subject: [PATCH 2/2] Fix data release manifest TRO hashing --- .../release_manifests/us.trace.tro.jsonld | 4 +-- src/policyengine/provenance/manifest.py | 7 +++++ src/policyengine/provenance/trace.py | 3 +- tests/test_release_manifests.py | 6 ++++ tests/test_trace_tro.py | 28 +++++++++++++++++++ 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/src/policyengine/data/release_manifests/us.trace.tro.jsonld b/src/policyengine/data/release_manifests/us.trace.tro.jsonld index 2bc386d7..88c83901 100644 --- a/src/policyengine/data/release_manifests/us.trace.tro.jsonld +++ b/src/policyengine/data/release_manifests/us.trace.tro.jsonld @@ -82,7 +82,7 @@ "@type": "trov:ResearchArtifact", "schema:name": "policyengine-us-data release manifest 1.113.1", "trov:mimeType": "application/json", - "trov:sha256": "f73ccb685ee98ca7e9c4c3d30ff5f02f2b869ea8298b2be466baabdb1d18ec42" + "trov:sha256": "d6b29ceff0cbf6a5cff4de94362ebc533dc5044c6a4155a46da7143140a8cb5f" }, { "@id": "composition/1/artifact/dataset", @@ -102,7 +102,7 @@ "trov:hasFingerprint": { "@id": "composition/1/fingerprint", "@type": "trov:CompositionFingerprint", - "trov:sha256": "fdd760bdc354160c8f4fbf452ecd78ab24cfcb0fba9c50332a66ae74584f32e2" + "trov:sha256": "316e373ed13360efa12037200719c9621ce7bced6d80acc4dfa7bbb72962892f" } }, "trov:hasPerformance": { diff --git a/src/policyengine/provenance/manifest.py b/src/policyengine/provenance/manifest.py index c8ada970..c3f2b48f 100644 --- a/src/policyengine/provenance/manifest.py +++ b/src/policyengine/provenance/manifest.py @@ -1,3 +1,4 @@ +import hashlib import os from functools import lru_cache from importlib import import_module @@ -127,6 +128,8 @@ class DataReleaseManifest(BaseModel): can enclose the full set of artifacts published together). Distinct from per-artifact DOIs on ``DataReleaseArtifact.preservation_mirrors``. Populated when the release pipeline mirrors to a DOI-minting host.""" + source_sha256: Optional[str] = Field(default=None, exclude=True) + """Byte sha256 of the fetched manifest before runtime URI rewrites.""" class DataCertification(BaseModel): @@ -272,6 +275,10 @@ def get_data_release_manifest(country_id: str) -> DataReleaseManifest: "Could not fetch the data release manifest from Hugging Face." ) from exc data_release_manifest = DataReleaseManifest.model_validate_json(response.text) + source_bytes = response.content + if not isinstance(source_bytes, bytes): + source_bytes = response.text.encode("utf-8") + data_release_manifest.source_sha256 = hashlib.sha256(source_bytes).hexdigest() release_revision = country_manifest.data_package.release_manifest_revision if release_revision is not None: for artifact in data_release_manifest.artifacts.values(): diff --git a/src/policyengine/provenance/trace.py b/src/policyengine/provenance/trace.py index 434ffa10..d120c0a8 100644 --- a/src/policyengine/provenance/trace.py +++ b/src/policyengine/provenance/trace.py @@ -359,7 +359,8 @@ def build_trace_tro_from_release_bundle( canonical_json_bytes(country_manifest.model_dump(mode="json")) ).hexdigest() data_release_manifest_hash = ( - hashlib.sha256( + data_release_manifest.source_sha256 + or hashlib.sha256( canonical_json_bytes(data_release_manifest.model_dump(mode="json")) ).hexdigest() if data_release_manifest is not None diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index de5b7173..ced84651 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -1,5 +1,6 @@ """Tests for bundled compatibility manifests and data release manifests.""" +import hashlib import json import os import re @@ -43,6 +44,7 @@ def _response_with_json(payload: dict) -> MagicMock: response = MagicMock() response.status_code = 200 response.text = json.dumps(payload) + response.content = response.text.encode("utf-8") response.raise_for_status.return_value = None return response @@ -283,6 +285,10 @@ def test__given_release_manifest_artifact_uses_version_tag__then_rewrites_to_com manifest.artifacts["enhanced_cps_2024"].uri == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@99e0ec7e784cdba43dd21ff1d80a081599a7a537" ) + assert ( + manifest.source_sha256 + == hashlib.sha256(json.dumps(payload).encode("utf-8")).hexdigest() + ) def test__given_missing_data_release_manifest__then_fetch_raises_unavailable(self): get_data_release_manifest.cache_clear() diff --git a/tests/test_trace_tro.py b/tests/test_trace_tro.py index 6aab72e8..950b9d06 100644 --- a/tests/test_trace_tro.py +++ b/tests/test_trace_tro.py @@ -7,6 +7,7 @@ from __future__ import annotations +import hashlib import json from importlib.resources import files from pathlib import Path @@ -29,6 +30,7 @@ TRACE_TROV_NAMESPACE, build_simulation_trace_tro, build_trace_tro_from_release_bundle, + canonical_json_bytes, compute_trace_composition_fingerprint, extract_bundle_tro_reference, serialize_trace_tro, @@ -294,6 +296,32 @@ def test__given_data_manifest_revision_is_unresolvable__then_dataset_location_us ) assert "/resolve/1.113.1/" not in dataset_location["trov:hasLocation"] + def test__given_rewritten_data_manifest__then_tro_hashes_original_source( + self, + ): + country_manifest = get_release_manifest("us") + data_manifest = _us_data_release_manifest() + source_sha256 = hashlib.sha256( + canonical_json_bytes(data_manifest.model_dump(mode="json")) + ).hexdigest() + data_manifest.source_sha256 = source_sha256 + data_manifest.artifacts["enhanced_cps_2024"].revision = "1.113.1" + + tro = build_trace_tro_from_release_bundle( + country_manifest, + data_manifest, + fetch_pypi=_fake_fetch_pypi, + ) + + artifacts = tro["@graph"][0]["trov:hasComposition"]["trov:hasArtifact"] + data_manifest_artifact = next( + artifact + for artifact in artifacts + if artifact["@id"].endswith("data_release_manifest") + ) + + assert data_manifest_artifact["trov:sha256"] == source_sha256 + def test__given_certification__then_fields_are_machine_readable( self, us_bundle_tro ):