From 9eeb5e8760428f147115cc490b11ec13ba65929e Mon Sep 17 00:00:00 2001 From: RavSinghChandan Date: Tue, 30 Jun 2026 19:11:20 +0530 Subject: [PATCH] docs(naming): add missing docstrings to filename utility functions Add Args/Returns/Raises/Example docstrings to filename_prefix_for_name, filename_prefix_for_split, filepattern_for_dataset_split, filename_for_dataset_split, filepath_for_dataset_split --- src/evaluate/naming.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/src/evaluate/naming.py b/src/evaluate/naming.py index 6335cf1b0..f5411e850 100644 --- a/src/evaluate/naming.py +++ b/src/evaluate/naming.py @@ -44,12 +44,52 @@ def snakecase_to_camelcase(name): def filename_prefix_for_name(name): + """Return the snake_case filename prefix for a given dataset name. + + Args: + name (`str`): Dataset name (must not contain path separators). + + Returns: + `str`: The snake_case version of `name`, used as the base prefix for data files. + + Raises: + `ValueError`: If `name` looks like a file path rather than a plain dataset name. + + Example: + ```py + >>> filename_prefix_for_name("MyDataset") + 'my_dataset' + ``` + """ if os.path.basename(name) != name: raise ValueError(f"Should be a dataset name, not a path: {name}") return camelcase_to_snakecase(name) def filename_prefix_for_split(name, split): + """Return the filename prefix for a specific dataset split. + + Combines the snake_case dataset name with the split name, following + the pattern ``<snake_case_name>-<split>``. + + Args: + name (`str`): Dataset name (must not contain path separators). + split (`str`): Split identifier, e.g. ``"train"``, ``"test"``, or + a dotted sub-split such as ``"train.clean"``. + + Returns: + `str`: The filename prefix in the form ``<snake_case_name>-<split>``. + + Raises: + `ValueError`: If `name` looks like a path, or `split` does not match + the expected pattern ``^\\w+(\\.\\w+)*$``. 
+ + Example: + ```py + >>> filename_prefix_for_split("MyDataset", "train") + 'my_dataset-train' + ``` + """ if os.path.basename(name) != name: raise ValueError(f"Should be a dataset name, not a path: {name}") if not re.match(_split_re, split): @@ -58,6 +98,25 @@ def filename_prefix_for_split(name, split): def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None): + """Return a glob pattern matching all shard files for a dataset split. + + Args: + dataset_name (`str`): Dataset name (must not contain path separators). + split (`str`): Split identifier, e.g. ``"train"`` or ``"validation"``. + data_dir (`str`): Directory where the dataset files are stored. + filetype_suffix (`str`, *optional*): File extension to append before the + glob wildcard, e.g. ``"parquet"`` → ``"my_dataset-train.parquet*"``. + When ``None`` the pattern has no extension suffix. + + Returns: + `str`: A glob pattern of the form ``<data_dir>/<prefix>[.<filetype_suffix>]*``. + + Example: + ```py + >>> filepattern_for_dataset_split("MyDataset", "train", "/data", "parquet") + '/data/my_dataset-train.parquet*' + ``` + """ prefix = filename_prefix_for_split(dataset_name, split) if filetype_suffix: prefix += f".{filetype_suffix}" @@ -66,6 +125,24 @@ def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix def filename_for_dataset_split(dataset_name, split, filetype_suffix=None): + """Return the filename (without directory) for a dataset split. + + Args: + dataset_name (`str`): Dataset name (must not contain path separators). + split (`str`): Split identifier, e.g. ``"train"`` or ``"test"``. + filetype_suffix (`str`, *optional*): File extension to append, e.g. + ``"arrow"`` → ``"my_dataset-train.arrow"``. When ``None`` no + extension is added. + + Returns: + `str`: The filename string, e.g. ``"my_dataset-train.arrow"``. 
+ + Example: + ```py + >>> filename_for_dataset_split("MyDataset", "train", "arrow") + 'my_dataset-train.arrow' + ``` + """ prefix = filename_prefix_for_split(dataset_name, split) if filetype_suffix: prefix += f".{filetype_suffix}" @@ -73,6 +150,27 @@ def filename_for_dataset_split(dataset_name, split, filetype_suffix=None): def filepath_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None): + """Return the full file path for a dataset split file. + + Combines `data_dir` and the result of :func:`filename_for_dataset_split` + to produce an absolute-or-relative path suitable for reading or writing. + + Args: + dataset_name (`str`): Dataset name (must not contain path separators). + split (`str`): Split identifier, e.g. ``"train"`` or ``"validation"``. + data_dir (`str`): Directory where the dataset files are stored. + filetype_suffix (`str`, *optional*): File extension without the leading + dot, e.g. ``"arrow"``. When ``None`` no extension is added. + + Returns: + `str`: Full path of the form ``<data_dir>/<filename>``. + + Example: + ```py + >>> filepath_for_dataset_split("MyDataset", "train", "/data", "arrow") + '/data/my_dataset-train.arrow' + ``` + """ filename = filename_for_dataset_split( dataset_name=dataset_name, split=split,