3838
3939import argparse
4040import logging
41+ import os
4142import shutil
4243import subprocess
4344import sys
4445import tempfile
4546import warnings
4647from pathlib import Path
4748
49+ import matplotlib
50+ matplotlib .use ('Agg' )
51+ import matplotlib .pyplot as plt
52+
4853import icechunk
4954import pystac
5055import requests
8489def fetch_registry_entries () -> list [dict ]:
8590 """Fetch and parse all dynamical-*.yaml entries from the AWS Open Data Registry."""
8691 log .info ("Querying AWS Open Data Registry for dynamical.org datasets ..." )
87- resp = requests .get (_REGISTRY_API , timeout = 30 )
92+ token = os .environ .get ("GITHUB_TOKEN" )
93+ headers = {"Authorization" : f"Bearer { token } " } if token else {}
94+ resp = requests .get (_REGISTRY_API , headers = headers , timeout = 30 )
8895 resp .raise_for_status ()
8996 files = [
9097 f for f in resp .json ()
@@ -94,7 +101,7 @@ def fetch_registry_entries() -> list[dict]:
94101
95102 entries = []
96103 for f in files :
97- raw = requests .get (_REGISTRY_RAW .format (filename = f ["name" ]), timeout = 30 )
104+ raw = requests .get (_REGISTRY_RAW .format (filename = f ["name" ]), headers = headers , timeout = 30 )
98105 raw .raise_for_status ()
99106 entry = yaml .safe_load (raw .text )
100107 entry ["_filename" ] = f ["name" ]
@@ -205,6 +212,151 @@ def xarray_open_snippet(item_id: str, catalog_url: str) -> str:
205212 )
206213
207214
215+ # ---------------------------------------------------------------------------
216+ # Thumbnail generation
217+ # ---------------------------------------------------------------------------
218+
# Dimensions that make up the horizontal grid; every other dimension is
# opened with a dask chunk size of 1.
_THUMBNAIL_SPATIAL_DIMS = ("latitude", "longitude", "x", "y")

# Spellings of a ``units`` attribute that mean the data is already in degrees
# Celsius and must not be shifted by 273.15 again.
_CELSIUS_UNIT_NAMES = frozenset({
    "c", "°c", "degc", "deg_c", "celsius",
    "degree_celsius", "degrees_celsius",
    "degree celsius", "degrees celsius",
})


def _declares_celsius(units) -> bool:
    """Return True when a ``units`` attribute value explicitly names Celsius."""
    return isinstance(units, str) and units.strip().lower() in _CELSIUS_UNIT_NAMES


def _select_thumbnail_slice(da: "xr.DataArray", temporal_dimension: str) -> "xr.DataArray":
    """Pick the last step along the temporal dimension and the first ensemble member."""
    indexers = {}
    if temporal_dimension == "init_time" and "init_time" in da.dims:
        indexers["init_time"] = -1
        # Index lead_time only when it exists: isel() raises on a missing
        # dimension, which would otherwise abort the whole thumbnail.
        if "lead_time" in da.dims:
            indexers["lead_time"] = 0
    elif temporal_dimension in da.dims:
        indexers[temporal_dimension] = -1

    # One member is enough for a preview and avoids loading all of them.
    if "ensemble_member" in da.dims:
        indexers["ensemble_member"] = 0

    return da.isel(indexers) if indexers else da


def _subsample_thumbnail_grid(da: "xr.DataArray") -> "xr.DataArray":
    """Stride the spatial dims down to roughly thumbnail resolution."""
    if "latitude" in da.dims and "longitude" in da.dims:
        targets = {"latitude": 180, "longitude": 360}
    elif "y" in da.dims and "x" in da.dims:
        targets = {"y": 300, "x": 500}
    else:
        return da

    strides = {
        dim: slice(None, None, max(1, da.sizes[dim] // target))
        for dim, target in targets.items()
    }
    return da.isel(strides)


def generate_thumbnail(
    session,
    ds: "xr.Dataset",
    item_id: str,
    output_dir: Path,
    temporal_dimension: str,
) -> "Path | None":
    """Generate a PNG temperature map thumbnail for a dataset.

    The store behind ``session`` is re-opened with a dask chunk size of 1 on
    every non-spatial dimension; the intent is that computing one slice does
    not pull a full multi-dimensional stored chunk.

    Args:
        session: Object exposing a ``store`` attribute accepted by
            ``xr.open_zarr``.
        ds: Already-opened dataset; used to check for ``temperature_2m``, to
            derive the chunking, and to decide whether the grid is projected
            (has both ``x`` and ``y`` dims).
        item_id: Used in the plot title and as the PNG file stem.
        output_dir: Directory under which ``thumbnails/`` is created.
        temporal_dimension: Name of the dimension whose last step is plotted.

    Returns:
        Path of the written PNG, or None when ``temperature_2m`` is absent or
        any step of the generation raises (the error is logged as a warning).
    """
    if "temperature_2m" not in ds:
        log.warning(" No temperature_2m in %s -- skipping thumbnail", item_id)
        return None

    fig = None
    try:
        import cartopy.crs as ccrs
        import cartopy.feature as cfeature
        import numpy as np

        # Re-open with dask using chunk size 1 on non-spatial dims.
        # Without this, zarr loads the full stored chunk (which can span many
        # lead_time / ensemble_member slices = many GB) just to extract one slice.
        chunks = {d: 1 for d in ds.dims if d not in _THUMBNAIL_SPATIAL_DIMS}
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="Numcodecs codecs are not in the Zarr version 3 specification.*",
            )
            ds_lazy = xr.open_zarr(
                session.store, chunks=chunks,
                consolidated=False, zarr_format=3,
            )

        da = ds_lazy["temperature_2m"]
        log.info(" [thumb] dims=%s", dict(da.sizes))

        da = _select_thumbnail_slice(da, temporal_dimension)
        da = _subsample_thumbnail_grid(da)

        log.info(" [thumb] loading %s values...", dict(da.sizes))

        # .compute() with dask fetches only the chunks we need
        data_c = da.compute().values
        # Data is treated as Kelvin unless the variable explicitly declares
        # Celsius, so an already-converted store is not shifted twice.
        # NOTE(review): confirm which units these stores actually declare.
        if not _declares_celsius(da.attrs.get("units")):
            data_c = data_c - 273.15
        log.info(" [thumb] loaded shape=%s min=%.1f max=%.1f",
                 data_c.shape, data_c.min(), data_c.max())

        is_projected = "x" in ds.dims and "y" in ds.dims

        if is_projected:
            proj = ccrs.LambertConformal(
                central_longitude=-97.5, central_latitude=38.5
            )
            # Create the figure first so the ``finally`` below can close it
            # even if building the map axes fails.
            fig = plt.figure(figsize=(10, 6))
            ax = fig.subplots(subplot_kw={"projection": proj})
            lons = da["longitude"].compute().values
            lats = da["latitude"].compute().values
            img = ax.pcolormesh(
                lons, lats, data_c,
                cmap="RdBu_r", vmin=-40, vmax=40,
                transform=ccrs.PlateCarree(),
                shading="auto",
            )
            ax.add_feature(cfeature.COASTLINE, linewidth=0.5)
            ax.add_feature(cfeature.STATES, linewidth=0.3)
            ax.set_extent([-130, -60, 20, 55], crs=ccrs.PlateCarree())
        else:
            proj = ccrs.PlateCarree()
            fig = plt.figure(figsize=(10, 5))
            ax = fig.subplots(subplot_kw={"projection": proj})
            # Determine lon/lat coordinate names
            lon_name = "longitude" if "longitude" in da.dims else "lon"
            lat_name = "latitude" if "latitude" in da.dims else "lat"
            lons = da[lon_name].values
            lats = da[lat_name].values
            img = ax.pcolormesh(
                lons, lats, data_c,
                cmap="RdBu_r", vmin=-40, vmax=40,
                transform=proj,
                shading="auto",
            )
            ax.set_global()
            ax.add_feature(cfeature.COASTLINE, linewidth=0.5)

        fig.colorbar(img, ax=ax, orientation="horizontal", pad=0.04,
                     label="2m Temperature (°C)", shrink=0.7)

        # Build timestamp string for title; the title is cosmetic, so any
        # failure here falls back to showing the item id alone.
        ts_str = ""
        try:
            if temporal_dimension in da.coords:
                ts_val = da[temporal_dimension].values
                ts_str = str(np.datetime_as_string(ts_val, unit="h"))
        except Exception as ts_exc:
            log.debug(" [thumb] no timestamp for %s: %s", item_id, ts_exc)
            ts_str = ""

        title = item_id
        if ts_str:
            title += f" | {ts_str}"
        ax.set_title(title, fontsize=9)

        thumb_dir = output_dir / "thumbnails"
        thumb_dir.mkdir(parents=True, exist_ok=True)
        out_path = thumb_dir / f"{item_id}.png"
        fig.savefig(out_path, dpi=100, bbox_inches="tight")
        log.info(" Thumbnail saved: %s", out_path)
        return out_path

    except Exception as exc:
        # Thumbnails are best-effort: a failure must not abort item building.
        log.warning(" Thumbnail generation failed for %s: %s", item_id, exc)
        return None
    finally:
        # Close only the figure this call created, on success and on failure.
        if fig is not None:
            plt.close(fig)
358+
359+
208360# ---------------------------------------------------------------------------
209361# Per-store item building
210362# ---------------------------------------------------------------------------
@@ -215,6 +367,8 @@ def build_item_for_store(
215367 region : str ,
216368 entry : dict ,
217369 catalog_url : str ,
370+ output_dir : "Path | None" = None ,
371+ thumbnail_url_base : "str | None" = None ,
218372) -> dict | None :
219373 """Open one icechunk store and return a STAC item dict, or None on failure."""
220374 store_uri = f"s3://{ bucket } /{ prefix } "
@@ -266,6 +420,17 @@ def build_item_for_store(
266420 return None
267421
268422 log .info (" Built item: %s bbox=%s" , item_id , item_dict ["bbox" ])
423+
424+ if output_dir and thumbnail_url_base :
425+ thumb_path = generate_thumbnail (session , ds , item_id , output_dir , temporal_dim )
426+ if thumb_path :
427+ item_dict ["assets" ]["thumbnail" ] = {
428+ "href" : f"{ thumbnail_url_base } /{ item_id } .png" ,
429+ "type" : "image/png" ,
430+ "roles" : ["thumbnail" ],
431+ "title" : "Latest 2m temperature map" ,
432+ }
433+
269434 return item_dict
270435
271436
@@ -277,9 +442,14 @@ def build_catalog(
277442 catalog_bucket : str ,
278443 catalog_prefix : str ,
279444 public_domain : str ,
280- ) -> tuple [pystac .Catalog , str ]:
445+ output_dir : "Path | None" = None ,
446+ ) -> "tuple[pystac.Catalog, str]" :
281447 """Discover all stores, build items, return (catalog, catalog_url)."""
282448 catalog_url = f"https://{ public_domain } /{ catalog_prefix } /catalog.json"
449+ thumbnail_url_base = (
450+ f"https://{ public_domain } /{ catalog_prefix } /thumbnails"
451+ if output_dir else None
452+ )
283453
284454 catalog = pystac .Catalog (
285455 id = "dynamical-org-icechunk" ,
@@ -308,7 +478,9 @@ def build_catalog(
308478
309479 for prefix in prefixes :
310480 item_dict = build_item_for_store (
311- bucket , prefix , region , entry , catalog_url
481+ bucket , prefix , region , entry , catalog_url ,
482+ output_dir = output_dir ,
483+ thumbnail_url_base = thumbnail_url_base ,
312484 )
313485 if item_dict :
314486 catalog .add_item (pystac .Item .from_dict (item_dict ))
@@ -353,7 +525,7 @@ def upload_to_s3(
353525) -> None :
354526 fs = s3fs .S3FileSystem (profile = profile )
355527 log .info ("\n Uploading to s3://%s/%s ..." , catalog_bucket , catalog_prefix )
356- for pattern in ("**/*.json" , "*.parquet" ):
528+ for pattern in ("**/*.json" , "*.parquet" , "thumbnails/*.png" ):
357529 for local_file in sorted (output_dir .glob (pattern )):
358530 rel = local_file .relative_to (output_dir )
359531 s3_dest = f"{ catalog_bucket } /{ catalog_prefix } /{ rel } "
@@ -438,6 +610,8 @@ def main() -> None:
438610 "GitHub Pages hostname (e.g. myorg.github.io/myrepo)." )
439611 parser .add_argument ("--github-pages-push" , action = "store_true" ,
440612 help = "Auto git-push after committing to the GitHub Pages repo." )
613+ parser .add_argument ("--thumbnails" , action = "store_true" ,
614+ help = "Generate temperature thumbnails and upload to R2" )
441615 parser .add_argument ("-v" , "--verbose" , action = "store_true" )
442616 args = parser .parse_args ()
443617
@@ -455,6 +629,7 @@ def main() -> None:
455629 catalog_bucket = args .catalog_bucket ,
456630 catalog_prefix = args .catalog_prefix ,
457631 public_domain = args .public_domain ,
632+ output_dir = output_dir if args .thumbnails else None ,
458633 )
459634
460635 n_items = len (list (catalog .get_items ()))
0 commit comments