Skip to content

Commit 714ac86

Browse files
rsignell and claude committed
Use GITHUB_TOKEN to avoid rate limit on open-data-registry API calls
Passes the built-in Actions GITHUB_TOKEN as an Authorization header when querying the awslabs/open-data-registry GitHub API, raising the rate limit from 60 to 5000 req/hr and preventing intermittent 403 failures. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 3eeab43 commit 714ac86

2 files changed

Lines changed: 184 additions & 6 deletions

File tree

.github/workflows/build_dynamical_stac.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,13 @@ jobs:
3232
R2_ENDPOINT: ${{ secrets.R2_ENDPOINT }}
3333

3434
- name: Build catalog and publish to R2
35+
env:
36+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
3537
run: |
3638
micromamba run -n cloudify-notebooks python scripts/build_dynamical_stac.py \
3739
--catalog-bucket osc-pub \
3840
--catalog-prefix stac/dynamical \
3941
--profile osc-pub-r2 \
4042
--public-domain r2-pub.openscicomp.io \
41-
--geoparquet
43+
--geoparquet \
44+
--thumbnails

scripts/build_dynamical_stac.py

Lines changed: 180 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,18 @@
3838

3939
import argparse
4040
import logging
41+
import os
4142
import shutil
4243
import subprocess
4344
import sys
4445
import tempfile
4546
import warnings
4647
from pathlib import Path
4748

49+
import matplotlib
50+
matplotlib.use('Agg')
51+
import matplotlib.pyplot as plt
52+
4853
import icechunk
4954
import pystac
5055
import requests
@@ -84,7 +89,9 @@
8489
def fetch_registry_entries() -> list[dict]:
8590
"""Fetch and parse all dynamical-*.yaml entries from the AWS Open Data Registry."""
8691
log.info("Querying AWS Open Data Registry for dynamical.org datasets ...")
87-
resp = requests.get(_REGISTRY_API, timeout=30)
92+
token = os.environ.get("GITHUB_TOKEN")
93+
headers = {"Authorization": f"Bearer {token}"} if token else {}
94+
resp = requests.get(_REGISTRY_API, headers=headers, timeout=30)
8895
resp.raise_for_status()
8996
files = [
9097
f for f in resp.json()
@@ -94,7 +101,7 @@ def fetch_registry_entries() -> list[dict]:
94101

95102
entries = []
96103
for f in files:
97-
raw = requests.get(_REGISTRY_RAW.format(filename=f["name"]), timeout=30)
104+
raw = requests.get(_REGISTRY_RAW.format(filename=f["name"]), headers=headers, timeout=30)
98105
raw.raise_for_status()
99106
entry = yaml.safe_load(raw.text)
100107
entry["_filename"] = f["name"]
@@ -205,6 +212,151 @@ def xarray_open_snippet(item_id: str, catalog_url: str) -> str:
205212
)
206213

207214

215+
# ---------------------------------------------------------------------------
216+
# Thumbnail generation
217+
# ---------------------------------------------------------------------------
218+
219+
def generate_thumbnail(
    session,
    ds: "xr.Dataset",
    item_id: str,
    output_dir: Path,
    temporal_dimension: str,
) -> "Path | None":
    """Generate a PNG temperature map thumbnail for a dataset.

    Re-opens the store with dask chunks of size 1 on non-spatial dims so that
    only the exact slice is fetched from S3, not the full multi-dim zarr chunk.
    The most recent step along ``temporal_dimension`` is plotted; every other
    non-spatial dimension (``lead_time``, ``ensemble_member``, ...) is reduced
    to its first index so that a 2-D field is always handed to ``pcolormesh``.

    Generation is best-effort: any failure is logged as a warning and the
    function returns ``None`` instead of raising.

    Args:
        session: Object exposing a ``.store`` attribute that ``xr.open_zarr``
            can open (NOTE(review): presumably an icechunk session -- confirm
            against the caller).
        ds: Already-opened dataset, used to look up the variable and the
            dimension names.
        item_id: Identifier used for the plot title and the PNG file name.
        output_dir: Directory under which ``thumbnails/<item_id>.png`` is
            written.
        temporal_dimension: Name of the dimension whose last index is plotted.

    Returns:
        The path to the saved PNG, or ``None`` if ``temperature_2m`` is missing
        or generation fails.
    """
    if "temperature_2m" not in ds:
        log.warning(" No temperature_2m in %s -- skipping thumbnail", item_id)
        return None

    # Dimension names treated as horizontal axes; everything else is reduced
    # to a single index before plotting.
    spatial_dims = ("latitude", "longitude", "lat", "lon", "x", "y")
    # Unit spellings that mean the data is already in degrees Celsius.
    celsius_units = {
        "c", "°c", "degc", "deg_c", "celsius",
        "degree_celsius", "degrees_celsius",
        "degree celsius", "degrees celsius",
    }

    fig = None
    try:
        import cartopy.crs as ccrs
        import cartopy.feature as cfeature
        import numpy as np

        # Re-open with dask using chunk size 1 on non-spatial dims.
        # Without this, zarr loads the full stored chunk (which can span many
        # lead_time / ensemble_member slices = many GB) just to extract one slice.
        chunks = {d: 1 for d in ds.dims if d not in spatial_dims}
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="Numcodecs codecs are not in the Zarr version 3 specification.*",
            )
            ds_lazy = xr.open_zarr(
                session.store, chunks=chunks,
                consolidated=False, zarr_format=3
            )

        da = ds_lazy["temperature_2m"]
        log.info(" [thumb] dims=%s", dict(da.sizes))

        # Most recent step along the temporal dimension, first index along
        # every other non-spatial dimension (lead_time, ensemble_member, ...).
        # Building the indexer from the dims that actually exist avoids the
        # error isel() raises when asked for a dimension the array lacks.
        indexers = {d: 0 for d in da.dims if d not in spatial_dims}
        if temporal_dimension in indexers:
            indexers[temporal_dimension] = -1
        da = da.isel(indexers)

        # Subsample spatial dims for thumbnail (max ~360x180 points)
        lat_name = "latitude" if "latitude" in da.dims else "lat"
        lon_name = "longitude" if "longitude" in da.dims else "lon"
        if lat_name in da.dims and lon_name in da.dims:
            lat_step = max(1, da.sizes[lat_name] // 180)
            lon_step = max(1, da.sizes[lon_name] // 360)
            da = da.isel({
                lat_name: slice(None, None, lat_step),
                lon_name: slice(None, None, lon_step),
            })
        elif "y" in da.dims and "x" in da.dims:
            y_step = max(1, da.sizes["y"] // 300)
            x_step = max(1, da.sizes["x"] // 500)
            da = da.isel(y=slice(None, None, y_step),
                         x=slice(None, None, x_step))

        log.info(" [thumb] loading %s values...", dict(da.sizes))

        # .compute() with dask fetches only the chunks we need
        values = da.compute().values
        # Only convert when the data is not already declared as Celsius;
        # a missing or unrecognised ``units`` attribute keeps the historical
        # Kelvin assumption.
        units = str(da.attrs.get("units", "")).strip().lower()
        data_c = values if units in celsius_units else values - 273.15
        log.info(" [thumb] loaded shape=%s min=%.1f max=%.1f",
                 data_c.shape, data_c.min(), data_c.max())

        is_projected = "x" in ds.dims and "y" in ds.dims

        if is_projected:
            # NOTE(review): projection parameters and extent are hard-coded
            # for a CONUS-like domain -- confirm they suit every projected store.
            proj = ccrs.LambertConformal(
                central_longitude=-97.5, central_latitude=38.5
            )
            # Create the figure first so it can be closed even if adding the
            # map axes fails.
            fig = plt.figure(figsize=(10, 6))
            ax = fig.add_subplot(1, 1, 1, projection=proj)
            lons = da["longitude"].compute().values
            lats = da["latitude"].compute().values
            img = ax.pcolormesh(
                lons, lats, data_c,
                cmap="RdBu_r", vmin=-40, vmax=40,
                transform=ccrs.PlateCarree(),
                shading="auto",
            )
            ax.add_feature(cfeature.COASTLINE, linewidth=0.5)
            ax.add_feature(cfeature.STATES, linewidth=0.3)
            ax.set_extent([-130, -60, 20, 55], crs=ccrs.PlateCarree())
        else:
            proj = ccrs.PlateCarree()
            fig = plt.figure(figsize=(10, 5))
            ax = fig.add_subplot(1, 1, 1, projection=proj)
            lons = da[lon_name].values
            lats = da[lat_name].values
            img = ax.pcolormesh(
                lons, lats, data_c,
                cmap="RdBu_r", vmin=-40, vmax=40,
                transform=proj,
                shading="auto",
            )
            ax.set_global()
            ax.add_feature(cfeature.COASTLINE, linewidth=0.5)

        plt.colorbar(img, ax=ax, orientation="horizontal", pad=0.04,
                     label="2m Temperature (°C)", shrink=0.7)

        # Build timestamp string for title. The timestamp is cosmetic, so a
        # failure here must not discard an otherwise valid thumbnail.
        try:
            ts_val = da[temporal_dimension].values if temporal_dimension in da.coords else None
            ts_str = str(np.datetime_as_string(ts_val, unit="h")) if ts_val is not None else ""
        except Exception:
            ts_str = ""

        title = item_id
        if ts_str:
            title += f" | {ts_str}"
        ax.set_title(title, fontsize=9)

        thumb_dir = output_dir / "thumbnails"
        thumb_dir.mkdir(parents=True, exist_ok=True)
        out_path = thumb_dir / f"{item_id}.png"
        fig.savefig(out_path, dpi=100, bbox_inches="tight")
        log.info(" Thumbnail saved: %s", out_path)
        return out_path

    except Exception as exc:
        # Deliberately broad: a failed thumbnail must never abort the catalog
        # build. The traceback is available at debug level.
        log.warning(" Thumbnail generation failed for %s: %s", item_id, exc)
        log.debug("Thumbnail traceback for %s", item_id, exc_info=True)
        return None

    finally:
        # Close only the figure created here, on success and failure alike.
        if fig is not None:
            plt.close(fig)
358+
359+
208360
# ---------------------------------------------------------------------------
209361
# Per-store item building
210362
# ---------------------------------------------------------------------------
@@ -215,6 +367,8 @@ def build_item_for_store(
215367
region: str,
216368
entry: dict,
217369
catalog_url: str,
370+
output_dir: "Path | None" = None,
371+
thumbnail_url_base: "str | None" = None,
218372
) -> dict | None:
219373
"""Open one icechunk store and return a STAC item dict, or None on failure."""
220374
store_uri = f"s3://{bucket}/{prefix}"
@@ -266,6 +420,17 @@ def build_item_for_store(
266420
return None
267421

268422
log.info(" Built item: %s bbox=%s", item_id, item_dict["bbox"])
423+
424+
if output_dir and thumbnail_url_base:
425+
thumb_path = generate_thumbnail(session, ds, item_id, output_dir, temporal_dim)
426+
if thumb_path:
427+
item_dict["assets"]["thumbnail"] = {
428+
"href": f"{thumbnail_url_base}/{item_id}.png",
429+
"type": "image/png",
430+
"roles": ["thumbnail"],
431+
"title": "Latest 2m temperature map",
432+
}
433+
269434
return item_dict
270435

271436

@@ -277,9 +442,14 @@ def build_catalog(
277442
catalog_bucket: str,
278443
catalog_prefix: str,
279444
public_domain: str,
280-
) -> tuple[pystac.Catalog, str]:
445+
output_dir: "Path | None" = None,
446+
) -> "tuple[pystac.Catalog, str]":
281447
"""Discover all stores, build items, return (catalog, catalog_url)."""
282448
catalog_url = f"https://{public_domain}/{catalog_prefix}/catalog.json"
449+
thumbnail_url_base = (
450+
f"https://{public_domain}/{catalog_prefix}/thumbnails"
451+
if output_dir else None
452+
)
283453

284454
catalog = pystac.Catalog(
285455
id="dynamical-org-icechunk",
@@ -308,7 +478,9 @@ def build_catalog(
308478

309479
for prefix in prefixes:
310480
item_dict = build_item_for_store(
311-
bucket, prefix, region, entry, catalog_url
481+
bucket, prefix, region, entry, catalog_url,
482+
output_dir=output_dir,
483+
thumbnail_url_base=thumbnail_url_base,
312484
)
313485
if item_dict:
314486
catalog.add_item(pystac.Item.from_dict(item_dict))
@@ -353,7 +525,7 @@ def upload_to_s3(
353525
) -> None:
354526
fs = s3fs.S3FileSystem(profile=profile)
355527
log.info("\nUploading to s3://%s/%s ...", catalog_bucket, catalog_prefix)
356-
for pattern in ("**/*.json", "*.parquet"):
528+
for pattern in ("**/*.json", "*.parquet", "thumbnails/*.png"):
357529
for local_file in sorted(output_dir.glob(pattern)):
358530
rel = local_file.relative_to(output_dir)
359531
s3_dest = f"{catalog_bucket}/{catalog_prefix}/{rel}"
@@ -438,6 +610,8 @@ def main() -> None:
438610
"GitHub Pages hostname (e.g. myorg.github.io/myrepo).")
439611
parser.add_argument("--github-pages-push", action="store_true",
440612
help="Auto git-push after committing to the GitHub Pages repo.")
613+
parser.add_argument("--thumbnails", action="store_true",
614+
help="Generate temperature thumbnails and upload to R2")
441615
parser.add_argument("-v", "--verbose", action="store_true")
442616
args = parser.parse_args()
443617

@@ -455,6 +629,7 @@ def main() -> None:
455629
catalog_bucket=args.catalog_bucket,
456630
catalog_prefix=args.catalog_prefix,
457631
public_domain=args.public_domain,
632+
output_dir=output_dir if args.thumbnails else None,
458633
)
459634

460635
n_items = len(list(catalog.get_items()))

0 commit comments

Comments
 (0)