diff --git a/README.md b/README.md index 094aa5c..036880f 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,33 @@ meta.tofile("recording") meta.tofile("recording.sigmf.gz") ``` +### HDF5 metadata sidecar (optional) + +For recordings with very large `captures`/`annotations`, the optional +`hdf5-meta` extension can write a columnar HDF5 sidecar next to the +`.sigmf-meta` file. The JSON metadata stays complete and authoritative; the +sidecar is a smaller, faster cache for column-oriented access. Requires the +optional `h5py` dependency: `pip install sigmf[hdf5]`. + +```python +import sigmf +from sigmf import hdf5 + +# write the sidecar alongside the JSON (declares the hdf5-meta extension) +meta.tofile("recording", write_hdf5=True) # also writes recording.sigmf-meta.h5 + +# fast columnar read: open ONLY the sidecar, no JSON parsing, no per-row dicts +with hdf5.open("recording.sigmf-meta.h5") as fast: + starts = fast.annotations_column("core:sample_start") # list (None where absent) + table = fast.annotations_array() # structured array + +# or discover via the JSON once, then prefer the sidecar when present & fresh +fast = hdf5.fromfile("recording.sigmf-meta") # SigMFFileHDF5 if usable, else SigMFFile + +# the standard reader is unchanged and always reads pure JSON +meta = sigmf.fromfile("recording.sigmf-meta") +``` + ### Docs **[Please visit our documentation for full API reference and more info.](https://sigmf.readthedocs.io/en/latest/)** diff --git a/pyproject.toml b/pyproject.toml index bc281f6..7955e97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,12 +37,17 @@ dependencies = [ [project.scripts] sigmf_validate = "sigmf.validate:main" sigmf_convert = "sigmf.convert.__main__:main" + sigmf_hdf5 = "sigmf.hdf5:main" [project.optional-dependencies] + hdf5 = [ + "h5py", # for the optional hdf5-meta metadata sidecar + ] test = [ "ruff", "pytest", "pytest-cov", "hypothesis", # next-gen testing framework + "h5py", # exercise the optional hdf5-meta sidecar in tests ]
[tool.setuptools] diff --git a/sigmf/__init__.py b/sigmf/__init__.py index 360f47b..f5e1140 100644 --- a/sigmf/__init__.py +++ b/sigmf/__init__.py @@ -13,6 +13,7 @@ archive, archivereader, error, + hdf5, keys, schema, siggen, diff --git a/sigmf/hdf5.py b/sigmf/hdf5.py new file mode 100644 index 0000000..43bae70 --- /dev/null +++ b/sigmf/hdf5.py @@ -0,0 +1,883 @@ +# Copyright: Multiple Authors +# +# This file is part of sigmf-python. https://github.com/sigmf/sigmf-python +# +# SPDX-License-Identifier: LGPL-3.0-or-later + +""" +Optional HDF5 metadata sidecar support for the ``hdf5-meta`` SigMF extension. + +The ``hdf5-meta`` extension defines an OPTIONAL HDF5 file that stores a +columnar, performance-optimized duplicate of a Recording's metadata. The JSON +``.sigmf-meta`` file remains the complete, authoritative source of truth; the +sidecar is a derived cache that enables faster loads for Recordings with very +large ``captures`` or ``annotations`` arrays. + +This module is only imported when HDF5 functionality is requested. It requires +the optional ``h5py`` dependency, installable via ``pip install sigmf[hdf5]``. + +See ``extensions/hdf5-meta.sigmf-ext.md`` in the SigMF specification repository +for the on-disk format. +""" + +import builtins +import hashlib +import json +import sys +import warnings +from pathlib import Path + +import numpy as np + +from . import keys +from .error import SigMFError + +# extension identity +HDF5_META_EXTENSION = "hdf5-meta" +HDF5_META_VERSION = "1.0.0" + +# global fields contributed by this extension (colon notation, as in JSON) +HDF5_META_FILE_KEY = "hdf5-meta:file" +HDF5_META_VERSION_KEY = "hdf5-meta:version" + +# default suffix appended to a `.sigmf-meta` filename to form the sidecar name +HDF5_SIDECAR_SUFFIX = ".h5" + +# bookkeeping attribute names. These start with "__" and contain no ".", so they +# cannot collide with SigMF "namespace.field" attribute names. 
_JSON_ATTRS_HINT = "__json_attrs__"
_JSON_COLUMNS_HINT = "__json_columns__"

# Root attribute carrying a digest of the authoritative JSON metadata; it lets
# a reader notice a sidecar that was not regenerated after the JSON changed.
_SOURCE_DIGEST_ATTR = "source_meta_sha512"


class SigMFHDF5Error(SigMFError):
    """Signal a failure while reading or writing an HDF5 metadata sidecar."""


def _require_h5py():
    """Return the ``h5py`` module, importing it only when it is needed.

    Raises
    ------
    SigMFHDF5Error
        If ``h5py`` cannot be imported; the message carries the install hint.
    """
    try:
        import h5py as module
    except ImportError as err:
        hint = (
            "HDF5 metadata sidecar support requires the optional 'h5py' "
            "dependency. Install it with: pip install sigmf[hdf5]"
        )
        raise SigMFHDF5Error(hint) from err
    return module


def _field_to_dot(name):
    """Swap the namespace colon of a SigMF field name for a dot.

    Only the first ``:`` is replaced (``core:sample_start`` becomes
    ``core.sample_start``); a name without a colon is returned as-is.
    """
    namespace, colon, rest = name.partition(":")
    return f"{namespace}.{rest}" if colon else name


def _field_to_colon(name):
    """Undo :func:`_field_to_dot` by turning the first dot back into a colon."""
    namespace, dot, rest = name.partition(".")
    return f"{namespace}:{rest}" if dot else name


def _is_scalar_number(value):
    """Tell whether ``value`` is an int/float (Python or numpy) and not a bool."""
    # bool is a subclass of int, so it has to be ruled out explicitly
    if isinstance(value, bool):
        return False
    return isinstance(value, (int, float, np.integer, np.floating))


def _metadata_digest(metadata):
    """Fingerprint a SigMF metadata dictionary with SHA-512.

    The dictionary is serialized with sorted keys and compact separators, so
    the hex digest does not depend on dict ordering. It is stored in the
    sidecar and compared later to spot a stale sidecar.
    """
    payload = json.dumps(metadata, sort_keys=True, separators=(",", ":"))
    hasher = hashlib.sha512()
    hasher.update(payload.encode("utf-8"))
    return hasher.hexdigest()


# ---------------------------------------------------------------------------
# writing
# ---------------------------------------------------------------------------
def write_hdf5_sidecar(metadata, file_path, compression="gzip"):
    """
    Serialize a SigMF metadata dictionary into an HDF5 sidecar file.

    Parameters
    ----------
    metadata : dict
        SigMF metadata with ``global``, ``captures``, and ``annotations``
        entries (the structure kept in ``SigMFFile._metadata``). Missing or
        empty entries are treated as empty.
    file_path : str | PathLike
        Where to create the sidecar; the file is opened in ``"w"`` mode.
    compression : str | None, default "gzip"
        Compression filter forwarded to the record datasets. ``None`` turns
        compression off.
    """
    h5py = _require_h5py()

    global_section = metadata.get("global", {}) or {}
    record_sections = (
        ("captures", metadata.get("captures", []) or []),
        ("annotations", metadata.get("annotations", []) or []),
    )

    with h5py.File(file_path, "w") as sidecar:
        root = sidecar.attrs
        root["sigmf_version"] = str(global_section.get(keys.VERSION_KEY, ""))
        root["hdf5_meta_version"] = HDF5_META_VERSION
        # digest of the authoritative metadata, used for stale-sidecar checks
        root[_SOURCE_DIGEST_ATTR] = _metadata_digest(metadata)

        # the global object becomes attributes on the /global group
        _write_global_attrs(sidecar.create_group("global"), global_section)

        # non-empty record arrays become columnar datasets
        for section_name, rows in record_sections:
            if rows:
                _write_records(sidecar, section_name, rows, compression)


def _declare_extension(global_obj, sidecar_filename):
    """Record the ``hdf5-meta`` extension in a ``global`` object, in place.

    An entry marked optional is appended to ``core:extensions`` unless one
    with the same name is already listed, and ``hdf5-meta:file`` /
    ``hdf5-meta:version`` are (re)set.

    Parameters
    ----------
    global_obj : dict
        The SigMF ``global`` object; it is mutated.
    sidecar_filename : str
        Filename stored under ``hdf5-meta:file``.
    """
    declared = global_obj.get(keys.EXTENSIONS_KEY, []) or []
    already_listed = any(entry.get("name") == HDF5_META_EXTENSION for entry in declared)
    if not already_listed:
        new_entry = {"name": HDF5_META_EXTENSION, "version": HDF5_META_VERSION, "optional": True}
        global_obj[keys.EXTENSIONS_KEY] = declared + [new_entry]
    global_obj[HDF5_META_FILE_KEY] = sidecar_filename
    global_obj[HDF5_META_VERSION_KEY] = HDF5_META_VERSION
def generate_sidecar(meta_path, sidecar_path=None, compression="gzip", update_json=True, overwrite=True):
    """
    Generate an HDF5 metadata sidecar from an existing ``.sigmf-meta`` JSON file.

    The JSON Metadata file is read once, the ``hdf5-meta`` extension is
    declared in the in-memory metadata (when ``update_json``), the sidecar is
    written from that metadata, and only then is the JSON file rewritten. The
    JSON on disk is therefore never left pointing at a sidecar that could not
    be created.

    Parameters
    ----------
    meta_path : str | PathLike
        Path to the ``.sigmf-meta`` file (with or without extension).
    sidecar_path : str | PathLike, optional
        Destination for the sidecar. Defaults to the meta filename with
        ``.h5`` appended (e.g. ``rec.sigmf-meta.h5``).
    compression : str | None, default "gzip"
        Compression filter for the columnar datasets. ``None`` disables it.
    update_json : bool, default True
        If True, stamp ``hdf5-meta:file`` / ``hdf5-meta:version`` and the
        ``core:extensions`` entry into the JSON ``global`` object and rewrite
        the ``.sigmf-meta`` file. When False the JSON is left untouched and
        the sidecar will not be auto-discovered by :func:`fromfile`.
    overwrite : bool, default True
        If False, raise :class:`SigMFHDF5Error` when the sidecar already exists.

    Returns
    -------
    pathlib.Path
        The path to the written sidecar file.

    Raises
    ------
    SigMFHDF5Error
        If ``h5py`` is not installed, the metadata file is missing or is not
        a JSON object, or the sidecar exists and ``overwrite`` is False.
    """
    from .sigmffile import get_sigmf_filenames

    # fail before touching any file when the optional dependency is missing
    _require_h5py()

    meta_fn = get_sigmf_filenames(meta_path)["meta_fn"]
    if not meta_fn.is_file():
        raise SigMFHDF5Error(f"Metadata file not found: '{meta_fn}'")

    try:
        with builtins.open(meta_fn, "rb") as fp:
            metadata = json.loads(fp.read().decode("utf-8"))
    except (OSError, ValueError) as exc:
        raise SigMFHDF5Error(f"Could not read SigMF metadata from '{meta_fn}': {exc}") from exc
    if not isinstance(metadata, dict):
        raise SigMFHDF5Error(
            f"Could not read SigMF metadata from '{meta_fn}': top-level JSON value is not an object"
        )

    if sidecar_path is None:
        sidecar_path = meta_fn.parent / (meta_fn.name + HDF5_SIDECAR_SUFFIX)
    sidecar_path = Path(sidecar_path)

    if sidecar_path.exists() and not overwrite:
        raise SigMFHDF5Error(f"HDF5 sidecar already exists: '{sidecar_path}'")

    if update_json:
        # a JSON null "global" is treated like a missing one
        if metadata.get("global") is None:
            metadata["global"] = {}
        _declare_extension(metadata["global"], sidecar_path.name)

    # write the sidecar from the (now-declared) metadata so its stored digest
    # matches the JSON that fromfile() will verify against
    write_hdf5_sidecar(metadata, sidecar_path, compression=compression)

    # rewrite the JSON last: if the sidecar write above raised, the JSON on
    # disk is unchanged and declares nothing that does not exist
    if update_json:
        with builtins.open(meta_fn, "w", encoding="utf-8") as fp:
            json.dump(metadata, fp)

    return sidecar_path
def _write_global_attrs(grp, global_obj):
    """Copy the ``global`` object onto the attributes of the /global group.

    Lists and dicts are stored as JSON text; every other non-null value is
    assigned to the attribute directly. ``None`` values are left out. The dot
    names of the JSON-encoded attributes are saved, as a JSON list, under the
    ``__json_attrs__`` hint so a reader knows which ones to decode.
    """
    attrs = grp.attrs
    encoded_names = []
    for field, value in global_obj.items():
        if value is None:
            # null -> attribute omitted
            continue
        dot_name = _field_to_dot(field)
        needs_json = isinstance(value, (list, dict))
        attrs[dot_name] = json.dumps(value) if needs_json else value
        if needs_json:
            encoded_names.append(dot_name)
    attrs[_JSON_ATTRS_HINT] = json.dumps(encoded_names)
+ """ + json_attrs = [] + for key, value in global_obj.items(): + if value is None: + continue # null -> omit (datatype mapping) + attr_name = _field_to_dot(key) + if isinstance(value, (list, dict)): + grp.attrs[attr_name] = json.dumps(value) + json_attrs.append(attr_name) + else: + grp.attrs[attr_name] = value + grp.attrs[_JSON_ATTRS_HINT] = json.dumps(json_attrs) + + +def _column_dtype(values, present): + """Decide the storage encoding for one column. + + Returns a tuple ``(numpy_dtype, is_json)``. A column is stored in a native + numpy dtype only when every row is present and the values are homogeneous + scalars; otherwise it is promoted to a JSON-encoded string column to + guarantee an exact round-trip. + """ + all_present = all(present) + non_null = [v for v, p in zip(values, present) if p] + + if all_present and non_null: + if all(isinstance(v, bool) for v in non_null): + return np.dtype("i1"), False + if all(isinstance(v, (int, np.integer)) and not isinstance(v, bool) for v in non_null): + return np.dtype(" JSON-encoded strings + import h5py + + return h5py.string_dtype(encoding="utf-8"), True + + +def _write_records(handle, group_name, records, compression): + """Write a list-of-dicts SigMF array as a columnar compound HDF5 dataset.""" + # union of all field names across all records, preserving first-seen order + columns = [] + seen = set() + for record in records: + for key in record.keys(): + if key not in seen: + seen.add(key) + columns.append(key) + + n_rows = len(records) + col_specs = [] # (json_key, dot_name, numpy_dtype, is_json) + json_columns = [] + for key in columns: + values = [record.get(key) for record in records] + present = [key in record and record[key] is not None for record in records] + dtype, is_json = _column_dtype(values, present) + dot_name = _field_to_dot(key) + col_specs.append((key, dot_name, dtype, is_json)) + if is_json: + json_columns.append(dot_name) + + compound_dtype = np.dtype([(dot_name, dtype) for _key, dot_name, dtype, 
# ---------------------------------------------------------------------------
# reading
# ---------------------------------------------------------------------------
def open_hdf5(file_path):
    """Open an HDF5 sidecar in read-only mode and hand back the ``h5py.File``.

    Closing the handle is left to the caller (directly, or through the
    :class:`SigMFFileHDF5` lifecycle).
    """
    return _require_h5py().File(file_path, "r")


def read_source_digest(handle):
    """Fetch the digest of the authoritative JSON stored on the sidecar root.

    Returns ``None`` when the root attribute is not present.
    """
    raw = handle.attrs.get(_SOURCE_DIGEST_ATTR)
    if raw is None:
        return None
    return _decode_scalar(raw)


def read_hdf5_sidecar(file_path):
    """
    Load an HDF5 metadata sidecar as a SigMF metadata dictionary.

    Parameters
    ----------
    file_path : str | PathLike
        Path to the ``.h5`` sidecar file.

    Returns
    -------
    dict
        Dictionary with ``global``, ``captures``, and ``annotations`` keys;
        an entry stays empty when the sidecar has no matching object.
    """
    result = {"global": {}, "captures": [], "annotations": []}
    with open_hdf5(file_path) as sidecar:
        if "global" in sidecar:
            result["global"] = read_global_object(sidecar)
        for section in ("captures", "annotations"):
            if section in sidecar:
                result[section] = records_from_dataset(sidecar[section])
    return result


def _decode_scalar(value):
    """Turn a bytes/numpy scalar read from HDF5 into a plain Python value.

    ``bytes`` are decoded as UTF-8; numpy integers, floats and booleans become
    ``int``, ``float`` and ``bool``. Anything else is returned untouched.
    """
    converters = (
        (bytes, lambda raw: raw.decode("utf-8")),
        (np.integer, int),
        (np.floating, float),
        (np.bool_, bool),
    )
    for kind, convert in converters:
        if isinstance(value, kind):
            return convert(value)
    return value


def read_global_object(handle):
    """Rebuild the SigMF ``global`` object from the ``/global`` group.

    ``handle`` is an open ``h5py.File``. An empty dict is returned when the
    group does not exist. Attribute names listed in the ``__json_attrs__``
    hint are JSON-decoded; the hint itself is not part of the result.
    """
    if "global" not in handle:
        return {}
    attrs = handle["global"].attrs
    encoded_names = set(json.loads(attrs.get(_JSON_ATTRS_HINT, "[]")))
    rebuilt = {}
    for dot_name, raw in attrs.items():
        if dot_name == _JSON_ATTRS_HINT:
            continue
        value = _decode_scalar(raw)
        rebuilt[_field_to_colon(dot_name)] = json.loads(value) if dot_name in encoded_names else value
    return rebuilt


def record_column_names(dataset):
    """List the field names of a record dataset in JSON colon notation."""
    return list(map(_field_to_colon, dataset.dtype.names))


def read_record_column(dataset, field):
    """
    Read a single column of a captures/annotations dataset as a list.

    Only the requested column is read from the dataset and decoded; no
    per-row dictionaries are built. Rows where the value is absent (NaN or
    empty-string sentinel) yield ``None``.

    Parameters
    ----------
    dataset : h5py.Dataset
        A captures or annotations compound dataset.
    field : str
        Field name in JSON colon notation (e.g. ``"core:sample_start"``) or
        in the stored dot notation (e.g. ``"core.sample_start"``).

    Returns
    -------
    list
        One entry per row; ``None`` where the field is absent.

    Raises
    ------
    KeyError
        If the field is not a column in the dataset.
    """
    dot_name = _field_to_dot(field)
    if dot_name not in dataset.dtype.names:
        raise KeyError(f"'{field}' is not a column in this dataset")
    encoded_columns = set(json.loads(dataset.attrs.get(_JSON_COLUMNS_HINT, "[]")))
    return _decode_column(dataset[dot_name], dot_name in encoded_columns)
def _decode_column(column, is_json):
    """Decode one numpy column into a list, mapping sentinels to ``None``.

    Shared by :func:`records_from_dataset` and :func:`read_record_column` so
    both apply the same rules:

    * JSON columns: ``""`` means absent, anything else is JSON-decoded.
    * float columns: NaN means absent.
    * string columns: returned as ``str`` values.
    * int8 columns: returned as ``bool``. int8 is the encoding the writer
      uses for boolean columns (see ``_write_records``), so this restores the
      original ``True``/``False`` instead of leaking ``1``/``0``.
    * other columns: returned via ``tolist()`` unchanged.
    """
    if is_json:
        return [(None if text == "" else json.loads(text)) for text in _decode_string_column(column)]
    kind = column.dtype.kind
    if kind == "f":
        # NaN marks an absent value (v != v is True only for NaN)
        return [None if v != v else v for v in column.tolist()]
    if kind in ("O", "S", "U"):
        return _decode_string_column(column)
    if column.dtype == np.dtype("i1"):
        # booleans are stored as int8 0/1; give the caller real booleans back
        return [bool(v) for v in column.tolist()]
    # remaining integer columns are fully present by construction
    return column.tolist()


def records_from_dataset(dataset):
    """Reconstruct a list-of-dicts SigMF array from a columnar dataset.

    The dataset is read once and decoded column by column with
    ``ndarray.tolist()``; values decoded as ``None`` (absent) are left out of
    the row dictionaries. This is the compatibility path; use
    :func:`read_record_column` to avoid building dicts.
    """
    json_columns = set(json.loads(dataset.attrs.get(_JSON_COLUMNS_HINT, "[]")))
    data = dataset[:]
    records = [{} for _ in range(len(data))]
    for dot_name in data.dtype.names:
        key = _field_to_colon(dot_name)
        values = _decode_column(data[dot_name], dot_name in json_columns)
        for record, value in zip(records, values):
            if value is not None:
                record[key] = value
    return records


def _decode_string_column(column):
    """Return a list of Python ``str`` from an HDF5 string column."""
    return [v.decode("utf-8") if isinstance(v, bytes) else v for v in column.tolist()]


def _structured_array(dataset, fields=None):
    """Return a record dataset as a structured ``ndarray``.

    Field names are renamed from the stored dot notation to SigMF colon
    notation. JSON-encoded columns are returned as their raw encoded strings;
    callers needing decoded objects should use the list-of-dicts path.
    ``fields`` optionally restricts the result to a subset of columns, given
    as JSON colon names.
    """
    data = dataset[:]
    data.dtype.names = tuple(_field_to_colon(name) for name in data.dtype.names)
    if fields is not None:
        data = data[list(fields)]
    return data


# ---------------------------------------------------------------------------
# fast lazy reader
# ---------------------------------------------------------------------------
class SigMFFileHDF5:
    """
    Lazy, columnar reader for an HDF5 metadata sidecar (the ``hdf5-meta`` fast
    path).

    Unlike :class:`sigmf.sigmffile.SigMFFile`, which stores metadata as
    list-of-dicts, this reader keeps the sidecar open and serves
    captures/annotations one column at a time (as lists) or as numpy
    structured arrays, without building per-row dictionaries.

    The instance holds an open ``h5py.File``; use it as a context manager or
    call :meth:`close` when done::

        with sigmf.hdf5.open("rec.sigmf-meta.h5") as sf:
            starts = sf.annotations_column("core:sample_start")  # list
            table = sf.annotations_array()                       # structured ndarray

    For interoperability with code written against ``SigMFFile``, the
    convenience methods :meth:`get_annotations`, :meth:`get_captures`, and
    :meth:`to_sigmffile` materialize list-of-dicts on demand.
    """

    def __init__(self, handle, global_obj=None, data_file=None):
        """
        Parameters
        ----------
        handle : h5py.File
            An open, readable HDF5 sidecar handle. Ownership transfers to this
            object, which will close it on :meth:`close`.
        global_obj : dict, optional
            The SigMF ``global`` object. If omitted it is read from the
            sidecar's ``/global`` group.
        data_file : str | PathLike, optional
            Path to the associated ``.sigmf-data`` dataset, if known.
        """
        self._handle = handle
        self._global = global_obj if global_obj is not None else read_global_object(handle)
        self.data_file = data_file

    # -- global access (eager; globals are small) --------------------------
    def get_global_info(self):
        """Return the full ``global`` object as a dict (not a copy)."""
        return self._global

    def global_field(self, key, default=None):
        """Return one global field, or ``default`` when it is not set."""
        return self._global.get(key, default)

    @property
    def sample_rate(self):
        """The global sample-rate field, or ``None`` if it is not set."""
        return self._global.get(keys.SAMPLE_RATE_KEY)

    @property
    def datatype(self):
        """The global datatype field, or ``None`` if it is not set."""
        return self._global.get(keys.DATATYPE_KEY)

    # -- columnar fast path -------------------------------------------------
    def _dataset(self, name):
        """Return the named top-level object, or ``None`` if it is absent.

        Raises
        ------
        SigMFHDF5Error
            If the reader has already been closed.
        """
        if self._handle is None:
            # explicit error instead of a TypeError from `name in None`
            raise SigMFHDF5Error("HDF5 sidecar is closed")
        if name not in self._handle:
            return None
        return self._handle[name]

    def num_captures(self):
        """Number of capture segments without materializing them."""
        ds = self._dataset("captures")
        return 0 if ds is None else len(ds)

    def num_annotations(self):
        """Number of annotations without materializing them."""
        ds = self._dataset("annotations")
        return 0 if ds is None else len(ds)

    def capture_field_names(self):
        """JSON-keyed (colon-notation) capture column names."""
        ds = self._dataset("captures")
        return [] if ds is None else record_column_names(ds)

    def annotation_field_names(self):
        """JSON-keyed (colon-notation) annotation column names."""
        ds = self._dataset("annotations")
        return [] if ds is None else record_column_names(ds)

    def captures_column(self, field):
        """Return one capture field across all segments as a list.

        Absent values are ``None``. Raises ``KeyError`` if the field is not a
        column of an existing captures dataset; returns ``[]`` when the
        sidecar has no captures dataset.
        """
        ds = self._dataset("captures")
        return [] if ds is None else read_record_column(ds, field)

    def annotations_column(self, field):
        """Return one annotation field across all annotations as a list.

        Absent values are ``None``. Raises ``KeyError`` if the field is not a
        column of an existing annotations dataset; returns ``[]`` when the
        sidecar has no annotations dataset.
        """
        ds = self._dataset("annotations")
        return [] if ds is None else read_record_column(ds, field)

    def captures_array(self, fields=None):
        """Return captures as a numpy structured array (colon-keyed columns).

        An empty plain array is returned when there is no captures dataset.
        """
        ds = self._dataset("captures")
        if ds is None:
            return np.array([])
        return _structured_array(ds, fields)

    def annotations_array(self, fields=None):
        """Return annotations as a numpy structured array (colon-keyed columns).

        An empty plain array is returned when there is no annotations dataset.
        """
        ds = self._dataset("annotations")
        if ds is None:
            return np.array([])
        return _structured_array(ds, fields)

    # -- compatibility helpers (materialize list-of-dicts on demand) -------
    def get_captures(self):
        """Return all captures as a list of dicts (compatibility path)."""
        ds = self._dataset("captures")
        return [] if ds is None else records_from_dataset(ds)

    def get_annotations(self, index=None):
        """Return annotations as a list of dicts (compatibility path).

        If ``index`` is given, only annotations whose span contains that
        sample index are returned: the annotation must start at or before
        ``index`` and, when it has a sample count, end after it.
        """
        ds = self._dataset("annotations")
        annotations = [] if ds is None else records_from_dataset(ds)
        if index is None:
            return annotations

        result = []
        for annotation in annotations:
            if index < annotation[keys.SAMPLE_START_KEY]:
                continue
            if keys.SAMPLE_COUNT_KEY in annotation:
                if index >= annotation[keys.SAMPLE_START_KEY] + annotation[keys.SAMPLE_COUNT_KEY]:
                    continue
            result.append(annotation)
        return result

    def as_metadata_dict(self):
        """Return the full SigMF metadata dict (global/captures/annotations).

        ``global`` is a shallow copy of the object held by this reader.
        """
        return {
            "global": dict(self._global),
            "captures": self.get_captures(),
            "annotations": self.get_annotations(),
        }

    def to_sigmffile(self, skip_checksum=True):
        """
        Materialize a :class:`sigmf.sigmffile.SigMFFile`.

        The full list-of-dicts metadata is built from the sidecar and passed,
        together with ``data_file`` and ``skip_checksum``, to ``SigMFFile``.
        """
        from .sigmffile import SigMFFile

        return SigMFFile(
            metadata=self.as_metadata_dict(),
            data_file=self.data_file,
            skip_checksum=skip_checksum,
        )

    # -- lifecycle ----------------------------------------------------------
    def close(self):
        """Close the underlying HDF5 file handle (idempotent)."""
        if self._handle is not None:
            self._handle.close()
            self._handle = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):  # noqa: ARG002
        self.close()

    def __del__(self):
        try:
            self.close()
        except Exception:  # noqa: BLE001 - best-effort cleanup during GC
            pass
# ---------------------------------------------------------------------------
# entry points
# ---------------------------------------------------------------------------
def open(file_path, data_file=None, global_obj=None):
    """
    Open an HDF5 metadata sidecar directly, reading no JSON at all.

    The ``global`` object is read from the sidecar's ``/global`` group unless
    ``global_obj`` is supplied.

    Parameters
    ----------
    file_path : str | PathLike
        Path to the ``.sigmf-meta.h5`` sidecar.
    data_file : str | PathLike, optional
        Path to the associated dataset, if known.
    global_obj : dict, optional
        Pre-known ``global`` object to use instead of reading it from the file.

    Returns
    -------
    SigMFFileHDF5
        A lazy, columnar reader. Close it (or use ``with``) when done.
    """
    return SigMFFileHDF5(open_hdf5(file_path), global_obj=global_obj, data_file=data_file)


def fromfile(meta_path, require_sidecar=False, verify=True, skip_checksum=False):
    """
    Load a Recording, preferring the HDF5 sidecar when available.

    The ``.sigmf-meta`` JSON is read once for discovery: to learn whether an
    ``hdf5-meta:file`` sidecar is declared and to resolve the dataset
    filename. If the sidecar exists, can be opened, and (when ``verify``) its
    stored digest matches the JSON, a lazy :class:`SigMFFileHDF5` is returned.
    Otherwise a standard :class:`sigmf.sigmffile.SigMFFile` is returned (the
    JSON remains authoritative).

    Parameters
    ----------
    meta_path : str | PathLike
        Path to the ``.sigmf-meta`` file (with or without extension).
    require_sidecar : bool, default False
        If True, raise :class:`SigMFHDF5Error` when a usable sidecar is not
        available instead of falling back to JSON.
    verify : bool, default True
        If True, compare the sidecar's stored digest against the JSON and fall
        back (with a warning) if they disagree. A sidecar that stores no
        digest cannot be checked and is accepted.
    skip_checksum : bool, default False
        Passed through to the JSON fallback ``SigMFFile``.

    Returns
    -------
    SigMFFileHDF5 | sigmf.sigmffile.SigMFFile

    Raises
    ------
    SigMFHDF5Error
        If ``require_sidecar`` is True and no usable sidecar is available.
    """
    from .sigmffile import fromfile as json_fromfile
    from .sigmffile import get_dataset_filename_from_metadata, get_sigmf_filenames

    fns = get_sigmf_filenames(meta_path)
    meta_fn = fns["meta_fn"]

    def _fallback(reason):
        if require_sidecar:
            raise SigMFHDF5Error(f"No usable hdf5-meta sidecar for '{meta_fn}': {reason}")
        return json_fromfile(meta_path, skip_checksum=skip_checksum)

    if not meta_fn.is_file():
        return _fallback("metadata file not found")

    # single JSON read for discovery (builtins.open: this module shadows open())
    with builtins.open(meta_fn, "rb") as fp:
        metadata = json.loads(fp.read().decode("utf-8"))

    # a JSON null "global" is treated like a missing one
    global_obj = metadata.get("global") or {}
    sidecar_name = global_obj.get(HDF5_META_FILE_KEY)
    if not sidecar_name:
        return _fallback("no hdf5-meta:file declared")

    sidecar_path = Path(meta_fn).parent / sidecar_name
    if not sidecar_path.is_file():
        return _fallback(f"sidecar '{sidecar_name}' not found")

    try:
        handle = open_hdf5(sidecar_path)
    except (SigMFHDF5Error, OSError) as exc:
        # SigMFHDF5Error: h5py missing. OSError: not a valid/readable HDF5 file.
        return _fallback(str(exc))

    # From here on this function owns an open handle: either hand it to the
    # returned reader or close it, whatever happens in between.
    try:
        stale = False
        if verify:
            stored = read_source_digest(handle)
            stale = stored is not None and stored != _metadata_digest(metadata)
        if not stale:
            data_fn = get_dataset_filename_from_metadata(meta_fn, metadata)
            return SigMFFileHDF5(handle, global_obj=global_obj, data_file=data_fn)
    except BaseException:
        handle.close()
        raise

    handle.close()
    # stacklevel=2 attributes the warning to the caller of fromfile()
    warnings.warn(
        f"hdf5-meta sidecar '{sidecar_path}' is stale (digest mismatch); using JSON metadata.",
        stacklevel=2,
    )
    return _fallback("stale sidecar")
# ---------------------------------------------------------------------------
# command-line interface
# ---------------------------------------------------------------------------
def main(arg_tuple=None):
    """Command-line entry point for generating HDF5 metadata sidecars.

    Reads one or more existing ``.sigmf-meta`` JSON files and writes an
    ``hdf5-meta`` sidecar alongside each, declaring the extension in the JSON
    (unless ``--no-update-json`` is given). Installed as ``sigmf_hdf5``.

    A failure on one path is reported on stderr and the remaining paths are
    still processed; the process exits with status 1 if any path failed.

    Parameters
    ----------
    arg_tuple : sequence of str, optional
        Arguments to parse instead of ``sys.argv[1:]``.
    """
    import argparse
    import glob

    from . import __version__ as toolversion

    parser = argparse.ArgumentParser(
        description="Generate an HDF5 metadata sidecar from an existing SigMF .sigmf-meta file.",
        prog="sigmf_hdf5",
    )
    parser.add_argument(
        "path", nargs="+", help="SigMF metadata path(s). Accepts * wildcards; the extension is optional."
    )
    parser.add_argument(
        "--no-compression",
        action="store_true",
        help="Disable gzip compression of the columnar datasets.",
    )
    parser.add_argument(
        "--no-update-json",
        action="store_true",
        help="Do not declare the hdf5-meta extension in the .sigmf-meta JSON (sidecar won't be auto-discovered).",
    )
    parser.add_argument(
        "--no-overwrite",
        action="store_true",
        help="Fail instead of overwriting an existing sidecar.",
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Print each sidecar written.")
    parser.add_argument("--version", action="version", version=f"%(prog)s {toolversion}")

    args = parser.parse_args(arg_tuple)

    # resolve possible wildcards; a pattern with no match is kept verbatim so
    # it is reported as a missing file below. Matches are sorted because
    # glob.glob() returns them in arbitrary order.
    paths = []
    for pattern in args.path:
        matches = sorted(glob.glob(pattern))
        paths.extend(matches or [pattern])

    n_ok = 0
    for path in paths:
        try:
            sidecar = generate_sidecar(
                path,
                compression=None if args.no_compression else "gzip",
                update_json=not args.no_update_json,
                overwrite=not args.no_overwrite,
            )
        except (SigMFHDF5Error, OSError) as exc:
            # OSError covers an unwritable sidecar or JSON file; report it
            # like any other per-file failure instead of aborting the batch
            print(f"ERROR: {path}: {exc}", file=sys.stderr)
            continue
        n_ok += 1
        if args.verbose:
            print(f"wrote {sidecar}")

    n_total = len(paths)
    if n_ok != n_total:
        print(f"Generated {n_ok} of {n_total} sidecar(s)", file=sys.stderr)
        sys.exit(1)
    if args.verbose:
        print(f"Generated all {n_total} sidecar(s) OK")


if __name__ == "__main__":
    main()
recording.sigmf-meta.h5
+SIGMF_HDF5_META_EXT = ".h5"
 
 SIGMF_COMPRESSED_EXTS = {
     "gz": ".sigmf.gz",
diff --git a/sigmf/sigmffile.py b/sigmf/sigmffile.py
index 8dc300e..811b5ec 100644
--- a/sigmf/sigmffile.py
+++ b/sigmf/sigmffile.py
@@ -842,7 +842,46 @@ def archive(self, name=None, fileobj=None, compression=None, overwrite=False):
         archive = SigMFArchive(self, name, fileobj, compression=compression, overwrite=overwrite)
         return archive.path
 
-    def tofile(self, file_path, pretty=True, toarchive=False, compression=None, skip_validate=False, overwrite=False):
+    def _declare_hdf5_meta(self, meta_filename):
+        """
+        Register the `hdf5-meta` extension in global metadata.
+
+        Adds an entry to `core:extensions` (marked optional) and sets
+        `hdf5-meta:file` to the sidecar filename derived from `meta_filename`.
+
+        Parameters
+        ----------
+        meta_filename : str
+            Filename (not path) of the `.sigmf-meta` file the sidecar accompanies.
+        """
+        from .hdf5 import (
+            HDF5_META_EXTENSION,
+            HDF5_META_FILE_KEY,
+            HDF5_META_VERSION,
+            HDF5_META_VERSION_KEY,
+            HDF5_SIDECAR_SUFFIX,
+        )
+
+        extensions = self.get_global_field(keys.EXTENSIONS_KEY, [])
+        if not any(ext.get("name") == HDF5_META_EXTENSION for ext in extensions):
+            extensions = extensions + [
+                {"name": HDF5_META_EXTENSION, "version": HDF5_META_VERSION, "optional": True}
+            ]
+            self.set_global_field(keys.EXTENSIONS_KEY, extensions)
+
+        self.set_global_field(HDF5_META_FILE_KEY, meta_filename + HDF5_SIDECAR_SUFFIX)
+        self.set_global_field(HDF5_META_VERSION_KEY, HDF5_META_VERSION)
+
+    def tofile(
+        self,
+        file_path,
+        pretty=True,
+        toarchive=False,
+        compression=None,
+        skip_validate=False,
+        overwrite=False,
+        write_hdf5=False,
+    ):
         """
         Write metadata file or archive based on file extension.
 
@@ -867,6 +906,13 @@ def tofile(self, file_path, pretty=True, toarchive=False, compression=None, skip
             Skip validation of metadata before writing.
         overwrite : bool, default False
             If False, raise exception if output file already exists.
+        write_hdf5 : bool, default False
+            If True, also write an HDF5 metadata sidecar alongside the
+            `.sigmf-meta` file and declare the `hdf5-meta` extension. Requires
+            the optional `h5py` dependency (`pip install sigmf[hdf5]`). Ignored
+            when writing to an archive. With `overwrite=False`, an existing
+            sidecar raises `SigMFFileExistsError` before the metadata file is
+            written.
 
         Examples
         --------
@@ -911,10 +957,26 @@ def tofile(self, file_path, pretty=True, toarchive=False, compression=None, skip
             fns = get_sigmf_filenames(file_path)
             if not overwrite and fns["meta_fn"].exists():
                 raise SigMFFileExistsError(fns["meta_fn"], "Metadata file")
+
+            if write_hdf5:
+                from .hdf5 import HDF5_SIDECAR_SUFFIX, write_hdf5_sidecar
+
+                # refuse to clobber an existing sidecar *before* declaring the extension or
+                # writing the JSON, so a refused overwrite leaves both untouched
+                sidecar_fn = fns["meta_fn"].parent / (fns["meta_fn"].name + HDF5_SIDECAR_SUFFIX)
+                if not overwrite and sidecar_fn.exists():
+                    raise SigMFFileExistsError(sidecar_fn, "HDF5 sidecar file")
+                # declare the extension before serializing so the JSON file
+                # records the sidecar reference
+                self._declare_hdf5_meta(fns["meta_fn"].name)
+
             with open(fns["meta_fn"], "w") as fp:
                 self.dump(fp, pretty=pretty)
                 fp.write("\n")  # text files should end in carriage return
 
+            if write_hdf5:
+                write_hdf5_sidecar(self._metadata, sidecar_fn)
+
             # write data file if data_buffer exists
             if self.data_buffer is not None:
                 if not overwrite and fns["data_fn"].exists():
diff --git a/tests/test_hdf5.py b/tests/test_hdf5.py
new file mode 100644
index 0000000..c971978
--- /dev/null
+++ b/tests/test_hdf5.py
@@ -0,0 +1,383 @@
+# Copyright: Multiple Authors
+#
+# This file is part of sigmf-python.
https://github.com/sigmf/sigmf-python
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+
+"""Tests for the optional hdf5-meta metadata sidecar."""
+
+import json
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+import sigmf
+from sigmf.error import SigMFFileExistsError
+
+# the entire module depends on the optional h5py dependency
+h5py = pytest.importorskip("h5py")
+
+from sigmf import hdf5  # noqa: E402 - imported after the h5py skip guard
+
+
+def _make_recording(tmp_path):  # NOTE(review): tmp_path is never read in the body
+    """Build a SigMFFile with heterogeneous captures and annotations."""
+    data = (np.random.randn(2048) + 1j * np.random.randn(2048)).astype(np.complex64)
+    meta = sigmf.fromarray(data)
+    meta.sample_rate = 1e6
+    meta.author = "tester@example.com"
+    meta.add_capture(0, metadata={sigmf.FREQUENCY_KEY: 915e6})
+    # annotation 0: has a float edge field
+    meta.add_annotation(0, length=100, metadata={sigmf.LABEL_KEY: "burst_0", sigmf.FREQ_LOWER_EDGE_KEY: 914e6})
+    # annotation 1: carries a nested object field (signal:detail)
+    meta.add_annotation(
+        1000, length=100, metadata={sigmf.LABEL_KEY: "burst_1", "signal:detail": {"type": "digital", "order": 4}}
+    )
+    # annotation 2: omits sample_count entirely and has a unique field
+    meta.add_annotation(2000, metadata={sigmf.COMMENT_KEY: "tail"})
+    return meta
+
+
+@pytest.fixture
+def tmp_dir():  # yields a Path inside a TemporaryDirectory context, which is exited after the test
+    with tempfile.TemporaryDirectory() as d:
+        yield Path(d)
+
+
+@pytest.fixture
+def written(tmp_dir):
+    """Write a recording with sidecar; return (tmp_dir, base, json_doc)."""
+    meta = _make_recording(tmp_dir)
+    base = tmp_dir / "rec"
+    meta.tofile(base, write_hdf5=True, skip_validate=True)
+    doc = json.loads((tmp_dir / "rec.sigmf-meta").read_text())  # parsed straight from the JSON on disk
+    return tmp_dir, base, doc
+
+
+# ---------------------------------------------------------------------------
+# writing + declaration
+# ---------------------------------------------------------------------------
+def test_sidecar_written_and_declared(written):
+    """tofile(write_hdf5=True) writes the sidecar and declares the extension."""
+    tmp_dir, _base, doc = written
+    assert (tmp_dir / "rec.sigmf-meta").is_file()
+    assert (tmp_dir / "rec.sigmf-meta.h5").is_file()
+
+    assert doc["global"]["hdf5-meta:file"] == "rec.sigmf-meta.h5"
+    ext = next(e for e in doc["global"]["core:extensions"] if e["name"] == "hdf5-meta")
+    assert ext["optional"] is True
+
+
+def test_default_fromfile_ignores_sidecar(written):
+    """Option A: the canonical sigmf.fromfile reads pure JSON, never the sidecar."""
+    _tmp_dir, base, doc = written
+    loaded = sigmf.fromfile(base)
+    assert isinstance(loaded, sigmf.SigMFFile)
+    assert loaded.get_annotations() == doc["annotations"]
+    assert loaded.get_captures() == doc["captures"]
+
+
+# ---------------------------------------------------------------------------
+# generate_sidecar — JSON file -> sidecar
+# ---------------------------------------------------------------------------
+def test_generate_sidecar_from_json(tmp_dir):
+    """generate_sidecar reads a plain .sigmf-meta and produces a usable sidecar."""
+    meta = _make_recording(tmp_dir)
+    base = tmp_dir / "plain"
+    meta.tofile(base, skip_validate=True)  # JSON only, no sidecar
+    meta_fn = tmp_dir / "plain.sigmf-meta"
+    original = json.loads(meta_fn.read_text())  # snapshot taken before generate_sidecar runs
+
+    sidecar = hdf5.generate_sidecar(meta_fn)
+
+    # default name sits next to the JSON with .h5 appended
+    assert sidecar == tmp_dir / "plain.sigmf-meta.h5"
+    assert sidecar.is_file()
+
+    # JSON is updated to declare the extension so fromfile can discover it
+    doc = json.loads(meta_fn.read_text())
+    assert doc["global"]["hdf5-meta:file"] == "plain.sigmf-meta.h5"
+    assert doc["global"]["hdf5-meta:version"] == hdf5.HDF5_META_VERSION
+    ext = next(e for e in doc["global"]["core:extensions"] if e["name"] == "hdf5-meta")
+    assert ext["optional"] is True
+
+    # sidecar content matches the original annotations/captures
+    restored = hdf5.read_hdf5_sidecar(sidecar)
+    assert restored["annotations"] == original["annotations"]
+    assert restored["captures"] == original["captures"]
+
+
+def test_generate_sidecar_discovered_by_fromfile(tmp_dir):
+    """A generated sidecar is preferred (and digest-verified) by hdf5.fromfile."""
+    meta = _make_recording(tmp_dir)
+    base = tmp_dir / "plain"
+    meta.tofile(base, skip_validate=True)
+    expected = json.loads((tmp_dir / "plain.sigmf-meta").read_text())["annotations"]
+
+    hdf5.generate_sidecar(tmp_dir / "plain.sigmf-meta")
+
+    sf = hdf5.fromfile(base)  # no warning: digest matches the rewritten JSON
+    try:
+        assert isinstance(sf, hdf5.SigMFFileHDF5)
+        assert sf.get_annotations() == expected
+    finally:
+        sf.close()  # close the reader even when an assertion above fails
+
+
+def test_generate_sidecar_custom_path_no_json_update(tmp_dir):
+    """sidecar_path and update_json=False are honored; JSON is left untouched."""
+    meta = _make_recording(tmp_dir)
+    base = tmp_dir / "plain"
+    meta.tofile(base, skip_validate=True)
+    meta_fn = tmp_dir / "plain.sigmf-meta"
+    before = meta_fn.read_text()
+
+    target = tmp_dir / "elsewhere.h5"
+    sidecar = hdf5.generate_sidecar(meta_fn, sidecar_path=target, update_json=False)
+
+    assert sidecar == target
+    assert target.is_file()
+    # JSON untouched -> no declaration, fromfile falls back to JSON
+    assert meta_fn.read_text() == before
+    assert isinstance(hdf5.fromfile(base), sigmf.SigMFFile)
+
+
+def test_generate_sidecar_overwrite_guard(tmp_dir):
+    """overwrite=False raises when the sidecar already exists."""
+    meta = _make_recording(tmp_dir)
+    base = tmp_dir / "plain"
+    meta.tofile(base, skip_validate=True)
+    meta_fn = tmp_dir / "plain.sigmf-meta"
+
+    hdf5.generate_sidecar(meta_fn)  # first run creates the sidecar the guard must protect
+    with pytest.raises(hdf5.SigMFHDF5Error):
+        hdf5.generate_sidecar(meta_fn, overwrite=False)
+    # overwrite=True (default) succeeds on a second run
+    assert hdf5.generate_sidecar(meta_fn).is_file()
+
+
+def test_generate_sidecar_missing_meta_raises(tmp_dir):
+    """A missing metadata file raises SigMFHDF5Error."""
+    with pytest.raises(hdf5.SigMFHDF5Error):
+        hdf5.generate_sidecar(tmp_dir / "nonexistent.sigmf-meta")
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point (sigmf_hdf5)
+# ---------------------------------------------------------------------------
+def test_cli_generates_sidecar(tmp_dir):
+    """The sigmf_hdf5 CLI writes a sidecar and declares the extension."""
+    meta = _make_recording(tmp_dir)
+    base = tmp_dir / "plain"
+    meta.tofile(base, skip_validate=True)
+    meta_fn = tmp_dir / "plain.sigmf-meta"
+
+    hdf5.main([str(meta_fn)])  # argv-style list, parsed in place of sys.argv
+
+    assert (tmp_dir / "plain.sigmf-meta.h5").is_file()
+    doc = json.loads(meta_fn.read_text())
+    assert doc["global"]["hdf5-meta:file"] == "plain.sigmf-meta.h5"
+
+
+def test_cli_no_update_json(tmp_dir):
+    """--no-update-json leaves the JSON untouched."""
+    meta = _make_recording(tmp_dir)
+    base = tmp_dir / "plain"
+    meta.tofile(base, skip_validate=True)
+    meta_fn = tmp_dir / "plain.sigmf-meta"
+    before = meta_fn.read_text()
+
+    hdf5.main([str(meta_fn), "--no-update-json"])
+
+    assert (tmp_dir / "plain.sigmf-meta.h5").is_file()
+    assert meta_fn.read_text() == before
+
+
+def test_cli_missing_file_exits_nonzero(tmp_dir):
+    """A bad path makes the CLI exit non-zero rather than crash."""
+    with pytest.raises(SystemExit) as excinfo:
+        hdf5.main([str(tmp_dir / "nonexistent.sigmf-meta")])
+    assert excinfo.value.code == 1  # main() calls sys.exit(1) when not every path succeeded
+
+
+# ---------------------------------------------------------------------------
+# direct module round-trip
+# ---------------------------------------------------------------------------
+def test_module_roundtrip_equivalence(written):
+    """read_hdf5_sidecar reproduces the JSON metadata exactly."""
+    tmp_dir, _base, doc = written
+    restored = hdf5.read_hdf5_sidecar(tmp_dir / "rec.sigmf-meta.h5")
+    assert restored["captures"] == doc["captures"]
+    assert restored["annotations"] == doc["annotations"]
+    # global gains the hdf5-meta declaration fields, which are real metadata
+    assert restored["global"]["core:sample_rate"] == 1e6
+
+
+# ---------------------------------------------------------------------------
+# hdf5.open — zero JSON, columnar fast path
+# ---------------------------------------------------------------------------
+def test_open_zero_json_columnar(written):
+    """hdf5.open reads only the .h5 and serves columns without dict building."""
+    tmp_dir, _base, doc = written
+    with hdf5.open(tmp_dir / "rec.sigmf-meta.h5") as sf:
+        assert isinstance(sf, hdf5.SigMFFileHDF5)
+        assert sf.sample_rate == 1e6
+        assert sf.num_annotations() == 3
+        assert sf.num_captures() == 1
+
+        # absent values come back as None
+        assert sf.annotations_column("core:label") == ["burst_0", "burst_1", None]
+        assert sf.annotations_column("core:sample_count") == [100, 100, None]
+        assert sf.annotations_column("core:sample_start") == [0, 1000, 2000]
+        # nested object column decodes back to a dict
+        assert sf.annotations_column("signal:detail") == [None, {"type": "digital", "order": 4}, None]
+
+
+def test_open_structured_array(written):
+    """annotations_array returns a numpy structured array with colon-keyed names."""
+    tmp_dir, _base, _doc = written
+    with hdf5.open(tmp_dir / "rec.sigmf-meta.h5") as sf:
+        arr = sf.annotations_array(["core:sample_start", "core:label"])
+        assert arr.dtype.names == ("core:sample_start", "core:label")
+        assert arr["core:sample_start"].tolist() == [0, 1000, 2000]
+
+
+def test_open_compat_accessors(written):
+    """get_annotations/get_captures/to_sigmffile match the JSON content."""
+    tmp_dir, _base, doc = written
+    with hdf5.open(tmp_dir / "rec.sigmf-meta.h5") as sf:
+        assert sf.get_annotations() == doc["annotations"]
+        assert sf.get_captures() == doc["captures"]
+        # index-filtered access mirrors SigMFFile.get_annotations(index=...)
+        labels = [a["core:label"] for a in sf.get_annotations(index=1050)]
+        assert labels == ["burst_1"]
+        sff = sf.to_sigmffile()
+        assert isinstance(sff, sigmf.SigMFFile)
+        assert sff.get_annotations() == doc["annotations"]
+
+
+def test_unknown_column_raises(written):
+    """Requesting a non-existent column raises KeyError."""
+    tmp_dir, _base, _doc = written
+    with hdf5.open(tmp_dir / "rec.sigmf-meta.h5") as sf:
+        with pytest.raises(KeyError):
+            sf.annotations_column("core:does_not_exist")
+
+
+# ---------------------------------------------------------------------------
+# hdf5.fromfile — discovery
+# ---------------------------------------------------------------------------
+def test_fromfile_prefers_sidecar(written):
+    """hdf5.fromfile returns the fast reader when a valid sidecar exists."""
+    _tmp_dir, base, doc = written
+    sf = hdf5.fromfile(base)
+    try:
+        assert isinstance(sf, hdf5.SigMFFileHDF5)
+        assert sf.get_annotations() == doc["annotations"]
+    finally:
+        sf.close()
+
+
+def test_fromfile_falls_back_without_sidecar(tmp_dir):
+    """hdf5.fromfile returns a plain SigMFFile when no sidecar is declared."""
+    data = (np.random.randn(64) + 1j * np.random.randn(64)).astype(np.complex64)
+    meta = sigmf.fromarray(data)
+    meta.add_capture(0)
+    base = tmp_dir / "plain"
+    meta.tofile(base, skip_validate=True)  # no write_hdf5
+
+    sf = hdf5.fromfile(base)
+    assert isinstance(sf, sigmf.SigMFFile)
+
+
+def test_fromfile_require_sidecar_raises(tmp_dir):
+    """require_sidecar=True raises instead of falling back to JSON."""
+    data = (np.random.randn(64) + 1j * np.random.randn(64)).astype(np.complex64)
+    meta = sigmf.fromarray(data)
+    meta.add_capture(0)
+    base = tmp_dir / "plain"
+    meta.tofile(base, skip_validate=True)
+
+    with pytest.raises(hdf5.SigMFHDF5Error):
+        hdf5.fromfile(base, require_sidecar=True)
+
+
+def test_fromfile_stale_sidecar_warns_and_falls_back(written):
+    """A digest mismatch makes hdf5.fromfile warn and use the JSON instead."""
+    tmp_dir, base, _doc = written
+
+    # edit the JSON without regenerating the sidecar -> stale (relabel in place
+    # rather than appending past the data length, to avoid an unrelated warning)
+    meta_fn = tmp_dir / "rec.sigmf-meta"
+    edited = json.loads(meta_fn.read_text())
+    edited["annotations"][0]["core:label"] = "edited_in_json"
+    meta_fn.write_text(json.dumps(edited))
+
+    with pytest.warns(UserWarning, match="stale"):
+        sf = hdf5.fromfile(base)
+    assert isinstance(sf, sigmf.SigMFFile)
+    assert sf.get_annotations()[0]["core:label"] == "edited_in_json"
+
+
+def test_fromfile_corrupt_sidecar_falls_back(written):
+    """An unreadable sidecar falls back to JSON (no crash)."""
+    tmp_dir, base, doc = written
+    (tmp_dir / "rec.sigmf-meta.h5").write_bytes(b"not an hdf5 file")
+
+    sf = hdf5.fromfile(base)
+    assert isinstance(sf, sigmf.SigMFFile)
+    assert sf.get_annotations() == doc["annotations"]
+
+
+# ---------------------------------------------------------------------------
+# edge cases
+# ---------------------------------------------------------------------------
+def test_empty_annotations_and_captures(tmp_dir):
+    """Recordings with no annotations/captures omit those datasets cleanly."""
+    metadata = {
+        "global": {sigmf.DATATYPE_KEY: "cf32_le", sigmf.VERSION_KEY: "1.2.0"},
+        "captures": [],
+        "annotations": [],
+    }
+    sidecar = tmp_dir / "empty.h5"
+    hdf5.write_hdf5_sidecar(metadata, sidecar)
+
+    with h5py.File(sidecar, "r") as handle:
+        assert "captures" not in handle
+        assert "annotations" not in handle
+
+    with hdf5.open(sidecar) as sf:
+        assert sf.num_annotations() == 0
+        assert sf.num_captures() == 0
+        assert sf.annotations_column("core:sample_start") == []
+        assert sf.get_annotations() == []
+
+
+def test_boolean_and_extensions_roundtrip(tmp_dir):
+    """Booleans, arrays, and nested objects in global survive the round trip."""
+    metadata = {
+        "global": {
+            sigmf.DATATYPE_KEY: "cf32_le",
+            sigmf.VERSION_KEY: "1.2.0",
+            sigmf.METADATA_ONLY_KEY: True,
+            sigmf.EXTENSIONS_KEY: [{"name": "hdf5-meta", "version": "1.0.0", "optional": True}],
+        },
+        "captures": [],
+        "annotations": [],
+    }
+    sidecar = tmp_dir / "bools.h5"
+    hdf5.write_hdf5_sidecar(metadata, sidecar)
+    restored = hdf5.read_hdf5_sidecar(sidecar)
+
+    assert restored["global"][sigmf.METADATA_ONLY_KEY] is True
+    assert restored["global"][sigmf.EXTENSIONS_KEY] == metadata["global"][sigmf.EXTENSIONS_KEY]
+
+
+def test_sidecar_overwrite_guard(written):
+    """With only the sidecar left on disk, tofile(overwrite=False) still refuses to write."""
+    tmp_dir, base, _doc = written
+    (tmp_dir / "rec.sigmf-meta").unlink()  # otherwise the metadata-file guard raises first
+    with pytest.raises(SigMFFileExistsError):
+        _make_recording(tmp_dir).tofile(base, write_hdf5=True, skip_validate=True, overwrite=False)