Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions src/evaluate/naming.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,52 @@ def snakecase_to_camelcase(name):


def filename_prefix_for_name(name):
    """Convert a dataset name into its snake_case filename prefix.

    Args:
        name (`str`): Plain dataset name. It must be identical to its own
            `os.path.basename`, i.e. it may not carry a directory component.

    Returns:
        `str`: The result of `camelcase_to_snakecase(name)`, used as the base
        prefix for data files.

    Raises:
        `ValueError`: If `os.path.basename(name)` differs from `name`, meaning
            the argument looks like a path instead of a bare dataset name.

    Example:
        ```py
        >>> filename_prefix_for_name("MyDataset")
        'my_dataset'
        ```
    """
    base = os.path.basename(name)
    if base == name:
        # No directory part was stripped, so this is a bare dataset name.
        return camelcase_to_snakecase(name)
    raise ValueError(f"Should be a dataset name, not a path: {name}")


def filename_prefix_for_split(name, split):
"""Return the filename prefix for a specific dataset split.

Combines the snake_case dataset name with the split name, following
the pattern ``<snake_case_name>-<split>``.

Args:
name (`str`): Dataset name (must not contain path separators).
split (`str`): Split identifier, e.g. ``"train"``, ``"test"``, or
a dotted sub-split such as ``"train.clean"``.

Returns:
`str`: The filename prefix in the form ``<name_prefix>-<split>``.

Raises:
`ValueError`: If `name` looks like a path, or `split` does not match
the expected pattern ``^\\w+(\\.\\w+)*$``.

Example:
```py
>>> filename_prefix_for_split("MyDataset", "train")
'my_dataset-train'
```
"""
if os.path.basename(name) != name:
raise ValueError(f"Should be a dataset name, not a path: {name}")
if not re.match(_split_re, split):
Expand All @@ -58,6 +98,25 @@ def filename_prefix_for_split(name, split):


def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
"""Return a glob pattern matching all shard files for a dataset split.

Args:
dataset_name (`str`): Dataset name (must not contain path separators).
split (`str`): Split identifier, e.g. ``"train"`` or ``"validation"``.
data_dir (`str`): Directory where the dataset files are stored.
filetype_suffix (`str`, *optional*): File extension to append before the
glob wildcard, e.g. ``"parquet"`` → ``"my_dataset-train.parquet*"``.
When ``None`` the pattern has no extension suffix.

Returns:
`str`: A glob pattern of the form ``<data_dir>/<prefix>[.<suffix>]*``.

Example:
```py
>>> filepattern_for_dataset_split("MyDataset", "train", "/data", "parquet")
'/data/my_dataset-train.parquet*'
```
"""
prefix = filename_prefix_for_split(dataset_name, split)
if filetype_suffix:
prefix += f".{filetype_suffix}"
Expand All @@ -66,13 +125,52 @@ def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix


def filename_for_dataset_split(dataset_name, split, filetype_suffix=None):
    """Build the bare filename (no directory) for one split of a dataset.

    The stem comes from `filename_prefix_for_split(dataset_name, split)`; a
    ``.<filetype_suffix>`` extension is appended only when a non-empty suffix
    is supplied.

    Args:
        dataset_name (`str`): Dataset name, forwarded unchanged to
            `filename_prefix_for_split`.
        split (`str`): Split identifier, e.g. ``"train"`` or ``"test"``,
            forwarded unchanged to `filename_prefix_for_split`.
        filetype_suffix (`str`, *optional*): File extension without the
            leading dot, e.g. ``"arrow"``. When ``None`` (or any other falsy
            value such as ``""``) no extension is added.

    Returns:
        `str`: The filename, e.g. ``"my_dataset-train.arrow"``.

    Raises:
        Any exception raised by `filename_prefix_for_split` propagates
        unchanged; this function performs no validation of its own.

    Example:
        ```py
        >>> filename_for_dataset_split("MyDataset", "train", "arrow")
        'my_dataset-train.arrow'
        ```
    """
    stem = filename_prefix_for_split(dataset_name, split)
    if not filetype_suffix:
        # Nothing to append: the split prefix is already the full filename.
        return stem
    return f"{stem}.{filetype_suffix}"


def filepath_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
"""Return the full file path for a dataset split file.

Combines `data_dir` and the result of :func:`filename_for_dataset_split`
to produce an absolute-or-relative path suitable for reading or writing.

Args:
dataset_name (`str`): Dataset name (must not contain path separators).
split (`str`): Split identifier, e.g. ``"train"`` or ``"validation"``.
data_dir (`str`): Directory where the dataset files are stored.
filetype_suffix (`str`, *optional*): File extension without the leading
dot, e.g. ``"arrow"``. When ``None`` no extension is added.

Returns:
`str`: Full path of the form ``<data_dir>/<filename>``.

Example:
```py
>>> filepath_for_dataset_split("MyDataset", "train", "/data", "arrow")
'/data/my_dataset-train.arrow'
```
"""
filename = filename_for_dataset_split(
dataset_name=dataset_name,
split=split,
Expand Down