# datasets/dataset_dict.py (from the Hugging Face `datasets` package)
import contextlib
import copy
import fnmatch
import json
import math
import posixpath
import re
from io import BytesIO
from pathlib import Path
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union

import fsspec
import numpy as np
from fsspec.core import url_to_fs
from huggingface_hub import (
    CommitInfo,
    CommitOperationAdd,
    CommitOperationDelete,
    DatasetCard,
    DatasetCardData,
    HfApi,
)
from huggingface_hub.hf_api import RepoFile

from . import config
from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset
from .features import Features
from .features.features import FeatureType
from .info import DatasetInfo, DatasetInfosDict
from .naming import _split_re
from .splits import NamedSplit, Split, SplitDict, SplitInfo
from .table import Table
from .utils import logging
from .utils.doc_utils import is_documented_by
from .utils.metadata import MetadataConfigs
from .utils.py_utils import asdict, glob_pattern_to_regex, string_to_dict
from .utils.typing import PathLike

logger = logging.get_logger(__name__)
class DatasetDict(dict):
"""A dictionary (dict of str: datasets.Dataset) with dataset transform methods (map, filter, etc.)"""
def _check_values_type(self): | |
for dataset in self.values(): | |
if not isinstance(dataset, Dataset): | |
raise TypeError(f"Values in `DatasetDict` should be of type `Dataset` but got type '{type(dataset)}'") | |
def _check_values_features(self): | |
items = list(self.items()) | |
for item_a, item_b in zip(items[:-1], items[1:]): | |
if item_a[1].features != item_b[1].features: | |
raise ValueError( | |
f"All datasets in `DatasetDict` should have the same features but features for '{item_a[0]}' and '{item_b[0]}' don't match: {item_a[1].features} != {item_b[1].features}" | |
) | |
def __enter__(self): | |
return self | |
def __exit__(self, exc_type, exc_val, exc_tb): | |
# Here `del` is used to del the pyarrow tables. This properly closes the files used for memory mapped tables | |
for dataset in self.values(): | |
if hasattr(dataset, "_data"): | |
del dataset._data | |
if hasattr(dataset, "_indices"): | |
del dataset._indices | |
def __getitem__(self, k) -> Dataset: | |
if isinstance(k, (str, NamedSplit)) or len(self) == 0: | |
return super().__getitem__(k) | |
else: | |
available_suggested_splits = [ | |
split for split in (Split.TRAIN, Split.TEST, Split.VALIDATION) if split in self | |
] | |
suggested_split = available_suggested_splits[0] if available_suggested_splits else list(self)[0] | |
raise KeyError( | |
f"Invalid key: {k}. Please first select a split. For example: " | |
f"`my_dataset_dictionary['{suggested_split}'][{k}]`. " | |
f"Available splits: {sorted(self)}" | |
) | |
@property
def data(self) -> Dict[str, Table]:
"""The Apache Arrow tables backing each split. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.data | |
``` | |
""" | |
self._check_values_type() | |
return {k: dataset.data for k, dataset in self.items()} | |
@property
def cache_files(self) -> Dict[str, Dict]:
"""The cache files containing the Apache Arrow table backing each split. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.cache_files | |
{'test': [{'filename': '/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/rotten_tomatoes_movie_review-test.arrow'}], | |
'train': [{'filename': '/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/rotten_tomatoes_movie_review-train.arrow'}], | |
'validation': [{'filename': '/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/rotten_tomatoes_movie_review-validation.arrow'}]} | |
``` | |
""" | |
self._check_values_type() | |
return {k: dataset.cache_files for k, dataset in self.items()} | |
@property
def num_columns(self) -> Dict[str, int]:
"""Number of columns in each split of the dataset. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.num_columns | |
{'test': 2, 'train': 2, 'validation': 2} | |
``` | |
""" | |
self._check_values_type() | |
return {k: dataset.num_columns for k, dataset in self.items()} | |
@property
def num_rows(self) -> Dict[str, int]:
"""Number of rows in each split of the dataset. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.num_rows | |
{'test': 1066, 'train': 8530, 'validation': 1066} | |
``` | |
""" | |
self._check_values_type() | |
return {k: dataset.num_rows for k, dataset in self.items()} | |
@property
def column_names(self) -> Dict[str, List[str]]:
"""Names of the columns in each split of the dataset. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.column_names | |
{'test': ['text', 'label'], | |
'train': ['text', 'label'], | |
'validation': ['text', 'label']} | |
``` | |
""" | |
self._check_values_type() | |
return {k: dataset.column_names for k, dataset in self.items()} | |
@property
def shape(self) -> Dict[str, Tuple[int]]:
"""Shape of each split of the dataset (number of rows, number of columns). | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.shape | |
{'test': (1066, 2), 'train': (8530, 2), 'validation': (1066, 2)} | |
``` | |
""" | |
self._check_values_type() | |
return {k: dataset.shape for k, dataset in self.items()} | |
def flatten(self, max_depth=16) -> "DatasetDict":
"""Flatten the Apache Arrow Table of each split (nested features are flattened).
Each column with a struct type is flattened into one column per struct field. | |
Other columns are left unchanged. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("squad") | |
>>> ds["train"].features | |
{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None), | |
'context': Value(dtype='string', id=None), | |
'id': Value(dtype='string', id=None), | |
'question': Value(dtype='string', id=None), | |
'title': Value(dtype='string', id=None)} | |
>>> ds.flatten() | |
DatasetDict({ | |
train: Dataset({ | |
features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'], | |
num_rows: 87599 | |
}) | |
validation: Dataset({ | |
features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'], | |
num_rows: 10570 | |
}) | |
}) | |
``` | |
""" | |
self._check_values_type() | |
return DatasetDict({k: dataset.flatten(max_depth=max_depth) for k, dataset in self.items()}) | |
def unique(self, column: str) -> Dict[str, List]: | |
"""Return a list of the unique elements in a column for each split. | |
This is implemented in the low-level backend and, as such, is very fast.
Args: | |
column (`str`): | |
column name (list all the column names with [`~datasets.DatasetDict.column_names`]) | |
Returns: | |
Dict[`str`, `list`]: Dictionary of unique elements in the given column. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.unique("label") | |
{'test': [1, 0], 'train': [1, 0], 'validation': [1, 0]} | |
``` | |
""" | |
self._check_values_type() | |
return {k: dataset.unique(column) for k, dataset in self.items()} | |
def cleanup_cache_files(self) -> Dict[str, int]:
"""Clean up all cache files in the dataset cache directory, except the currently used cache file, if there is one.
Be careful when running this command that no other process is currently using other cache files. | |
Return: | |
`Dict` with the number of removed files for each split | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.cleanup_cache_files() | |
{'test': 0, 'train': 0, 'validation': 0} | |
``` | |
""" | |
self._check_values_type() | |
return {k: dataset.cleanup_cache_files() for k, dataset in self.items()} | |
def __repr__(self): | |
repr = "\n".join([f"{k}: {v}" for k, v in self.items()]) | |
repr = re.sub(r"^", " " * 4, repr, 0, re.M) | |
return f"DatasetDict({{\n{repr}\n}})" | |
def cast(self, features: Features) -> "DatasetDict": | |
""" | |
Cast the dataset to a new set of features. | |
The transformation is applied to all the datasets of the dataset dictionary. | |
Args: | |
features ([`Features`]): | |
New features to cast the dataset to. | |
The name and order of the fields in the features must match the current column names. | |
The type of the data must also be convertible from one type to the other. | |
For non-trivial conversion, e.g. `string` <-> `ClassLabel` you should use [`~DatasetDict.map`] to update the dataset. | |
Example: | |
```py | |
>>> from datasets import load_dataset, ClassLabel, Value | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds["train"].features | |
{'label': ClassLabel(names=['neg', 'pos'], id=None), | |
'text': Value(dtype='string', id=None)} | |
>>> new_features = ds["train"].features.copy() | |
>>> new_features['label'] = ClassLabel(names=['bad', 'good']) | |
>>> new_features['text'] = Value('large_string') | |
>>> ds = ds.cast(new_features) | |
>>> ds["train"].features | |
{'label': ClassLabel(names=['bad', 'good'], id=None), | |
'text': Value(dtype='large_string', id=None)} | |
``` | |
""" | |
self._check_values_type() | |
return DatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()}) | |
def cast_column(self, column: str, feature) -> "DatasetDict": | |
"""Cast column to feature for decoding. | |
Args: | |
column (`str`): | |
Column name. | |
feature ([`Feature`]): | |
Target feature. | |
Returns: | |
[`DatasetDict`] | |
Example: | |
```py | |
>>> from datasets import load_dataset, ClassLabel | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds["train"].features | |
{'label': ClassLabel(names=['neg', 'pos'], id=None), | |
'text': Value(dtype='string', id=None)} | |
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) | |
>>> ds["train"].features | |
{'label': ClassLabel(names=['bad', 'good'], id=None), | |
'text': Value(dtype='string', id=None)} | |
``` | |
""" | |
self._check_values_type() | |
return DatasetDict({k: dataset.cast_column(column=column, feature=feature) for k, dataset in self.items()}) | |
def remove_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": | |
""" | |
Remove one or several column(s) from each split in the dataset | |
and the features associated to the column(s). | |
The transformation is applied to all the splits of the dataset dictionary. | |
You can also remove a column using [`~DatasetDict.map`] with `remove_columns` but the present method | |
doesn't copy the data of the remaining columns and is thus faster. | |
Args: | |
column_names (`Union[str, List[str]]`): | |
Name of the column(s) to remove. | |
Returns: | |
[`DatasetDict`]: A copy of the dataset object without the columns to remove. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds = ds.remove_columns("label") | |
DatasetDict({ | |
train: Dataset({ | |
features: ['text'], | |
num_rows: 8530 | |
}) | |
validation: Dataset({ | |
features: ['text'], | |
num_rows: 1066 | |
}) | |
test: Dataset({ | |
features: ['text'], | |
num_rows: 1066 | |
}) | |
}) | |
``` | |
""" | |
self._check_values_type() | |
return DatasetDict({k: dataset.remove_columns(column_names=column_names) for k, dataset in self.items()}) | |
def rename_column(self, original_column_name: str, new_column_name: str) -> "DatasetDict": | |
""" | |
Rename a column in the dataset and move the features associated to the original column under the new column name. | |
The transformation is applied to all the datasets of the dataset dictionary. | |
You can also rename a column using [`~DatasetDict.map`] with `remove_columns` but the present method: | |
- takes care of moving the original features under the new column name. | |
- doesn't copy the data to a new dataset and is thus much faster. | |
Args: | |
original_column_name (`str`): | |
Name of the column to rename. | |
new_column_name (`str`): | |
New name for the column. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds = ds.rename_column("label", "label_new") | |
DatasetDict({ | |
train: Dataset({ | |
features: ['text', 'label_new'], | |
num_rows: 8530 | |
}) | |
validation: Dataset({ | |
features: ['text', 'label_new'], | |
num_rows: 1066 | |
}) | |
test: Dataset({ | |
features: ['text', 'label_new'], | |
num_rows: 1066 | |
}) | |
}) | |
``` | |
""" | |
self._check_values_type() | |
return DatasetDict( | |
{ | |
k: dataset.rename_column(original_column_name=original_column_name, new_column_name=new_column_name) | |
for k, dataset in self.items() | |
} | |
) | |
def rename_columns(self, column_mapping: Dict[str, str]) -> "DatasetDict": | |
""" | |
Rename several columns in the dataset, and move the features associated to the original columns under | |
the new column names. | |
The transformation is applied to all the datasets of the dataset dictionary. | |
Args: | |
column_mapping (`Dict[str, str]`): | |
A mapping of columns to rename to their new names. | |
Returns: | |
[`DatasetDict`]: A copy of the dataset with renamed columns. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.rename_columns({'text': 'text_new', 'label': 'label_new'}) | |
DatasetDict({ | |
train: Dataset({ | |
features: ['text_new', 'label_new'], | |
num_rows: 8530 | |
}) | |
validation: Dataset({ | |
features: ['text_new', 'label_new'], | |
num_rows: 1066 | |
}) | |
test: Dataset({ | |
features: ['text_new', 'label_new'], | |
num_rows: 1066 | |
}) | |
}) | |
``` | |
""" | |
self._check_values_type() | |
return DatasetDict({k: dataset.rename_columns(column_mapping=column_mapping) for k, dataset in self.items()}) | |
def select_columns(self, column_names: Union[str, List[str]]) -> "DatasetDict": | |
"""Select one or several column(s) from each split in the dataset and | |
the features associated to the column(s). | |
The transformation is applied to all the splits of the dataset | |
dictionary. | |
Args: | |
column_names (`Union[str, List[str]]`): | |
Name of the column(s) to keep. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.select_columns("text") | |
DatasetDict({ | |
train: Dataset({ | |
features: ['text'], | |
num_rows: 8530 | |
}) | |
validation: Dataset({ | |
features: ['text'], | |
num_rows: 1066 | |
}) | |
test: Dataset({ | |
features: ['text'], | |
num_rows: 1066 | |
}) | |
}) | |
``` | |
""" | |
self._check_values_type() | |
return DatasetDict({k: dataset.select_columns(column_names=column_names) for k, dataset in self.items()}) | |
def class_encode_column(self, column: str, include_nulls: bool = False) -> "DatasetDict": | |
"""Casts the given column as [`~datasets.features.ClassLabel`] and updates the tables. | |
Args: | |
column (`str`): | |
The name of the column to cast. | |
include_nulls (`bool`, defaults to `False`): | |
Whether to include null values in the class labels. If `True`, the null values will be encoded as the `"None"` class label. | |
<Added version="1.14.2"/> | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("boolq") | |
>>> ds["train"].features | |
{'answer': Value(dtype='bool', id=None), | |
'passage': Value(dtype='string', id=None), | |
'question': Value(dtype='string', id=None)} | |
>>> ds = ds.class_encode_column("answer") | |
>>> ds["train"].features | |
{'answer': ClassLabel(num_classes=2, names=['False', 'True'], id=None), | |
'passage': Value(dtype='string', id=None), | |
'question': Value(dtype='string', id=None)} | |
``` | |
""" | |
self._check_values_type() | |
return DatasetDict( | |
{k: dataset.class_encode_column(column=column, include_nulls=include_nulls) for k, dataset in self.items()} | |
) | |
@contextlib.contextmanager
def formatted_as(
self, | |
type: Optional[str] = None, | |
columns: Optional[List] = None, | |
output_all_columns: bool = False, | |
**format_kwargs, | |
): | |
"""To be used in a `with` statement. Set `__getitem__` return format (type and columns). | |
The transformation is applied to all the datasets of the dataset dictionary. | |
Args: | |
type (`str`, *optional*): | |
Output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'pandas', 'arrow', 'jax']`. | |
`None` means `__getitem__` returns python objects (default). | |
columns (`List[str]`, *optional*): | |
Columns to format in the output. | |
`None` means `__getitem__` returns all columns (default). | |
output_all_columns (`bool`, defaults to False): | |
Keep un-formatted columns as well in the output (as python objects). | |
**format_kwargs (additional keyword arguments): | |
Keyword arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.
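Example (illustrative sketch; reuses the `rotten_tomatoes` dataset from the other examples in this module):
```py
>>> from datasets import load_dataset
>>> ds = load_dataset("rotten_tomatoes")
>>> with ds.formatted_as(type="numpy", columns=["label"]):
...     mean_label = ds["train"]["label"].mean()
>>> # outside the `with` block, the previous format is restored
```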
""" | |
self._check_values_type() | |
old_format_type = {k: dataset._format_type for k, dataset in self.items()} | |
old_format_kwargs = {k: dataset._format_kwargs for k, dataset in self.items()} | |
old_format_columns = {k: dataset._format_columns for k, dataset in self.items()} | |
old_output_all_columns = {k: dataset._output_all_columns for k, dataset in self.items()} | |
try: | |
self.set_format(type, columns, output_all_columns, **format_kwargs) | |
yield | |
finally: | |
for k, dataset in self.items(): | |
dataset.set_format( | |
old_format_type[k], old_format_columns[k], old_output_all_columns[k], **old_format_kwargs[k] | |
) | |
def set_format( | |
self, | |
type: Optional[str] = None, | |
columns: Optional[List] = None, | |
output_all_columns: bool = False, | |
**format_kwargs, | |
): | |
"""Set `__getitem__` return format (type and columns). | |
The format is set for every dataset in the dataset dictionary. | |
Args: | |
type (`str`, *optional*): | |
Output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'pandas', 'arrow', 'jax']`. | |
`None` means `__getitem__` returns python objects (default). | |
columns (`List[str]`, *optional*): | |
Columns to format in the output. | |
`None` means `__getitem__` returns all columns (default). | |
output_all_columns (`bool`, defaults to False): | |
Keep un-formatted columns as well in the output (as python objects).
**format_kwargs (additional keyword arguments): | |
Keyword arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.
It is possible to call `map` after calling `set_format`. Since `map` may add new columns, the list of formatted columns
gets updated. In this case, if you apply `map` on a dataset to add a new column, this column will be formatted:
`new formatted columns = (all columns - previously unformatted columns)`
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> from transformers import AutoTokenizer | |
>>> ds = load_dataset("rotten_tomatoes")
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
>>> ds = ds.map(lambda x: tokenizer(x["text"], truncation=True, padding=True), batched=True) | |
>>> ds.set_format(type="numpy", columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']) | |
>>> ds["train"].format | |
{'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'], | |
'format_kwargs': {}, | |
'output_all_columns': False, | |
'type': 'numpy'} | |
``` | |
""" | |
self._check_values_type() | |
for dataset in self.values(): | |
dataset.set_format(type=type, columns=columns, output_all_columns=output_all_columns, **format_kwargs) | |
def reset_format(self): | |
"""Reset `__getitem__` return format to python objects and all columns. | |
The transformation is applied to all the datasets of the dataset dictionary. | |
Same as `self.set_format()` | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> from transformers import AutoTokenizer | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") | |
>>> ds = ds.map(lambda x: tokenizer(x["text"], truncation=True, padding=True), batched=True) | |
>>> ds.set_format(type="numpy", columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']) | |
>>> ds["train"].format | |
{'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'], | |
'format_kwargs': {}, | |
'output_all_columns': False, | |
'type': 'numpy'} | |
>>> ds.reset_format() | |
>>> ds["train"].format | |
{'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], | |
'format_kwargs': {}, | |
'output_all_columns': False, | |
'type': None} | |
``` | |
""" | |
self._check_values_type() | |
for dataset in self.values(): | |
dataset.set_format() | |
def set_transform( | |
self, | |
transform: Optional[Callable], | |
columns: Optional[List] = None, | |
output_all_columns: bool = False, | |
):
"""Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called.
The transform is set for every dataset in the dataset dictionary.
As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`].
Args:
transform (`Callable`, *optional*):
User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`].
A formatting function is a callable that takes a batch (as a dict) as input and returns a batch.
This function is applied right before returning the objects in `__getitem__`.
columns (`List[str]`, *optional*):
Columns to format in the output.
If specified, then the input batch of the transform only contains those columns.
output_all_columns (`bool`, defaults to `False`):
Keep un-formatted columns as well in the output (as python objects).
If set to `True`, then the other un-formatted columns are kept with the output of the transform.
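Example (a minimal sketch mirroring the [`~datasets.DatasetDict.with_transform`] example below; the tokenizer checkpoint is only illustrative):
```py
>>> from datasets import load_dataset
>>> from transformers import AutoTokenizer
>>> ds = load_dataset("rotten_tomatoes")
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
>>> def encode(batch):
...     return tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt")
>>> ds.set_transform(encode)  # applied in place to every split
>>> sample = ds["train"][0]  # tokenized on the fly when accessed
```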
""" | |
self._check_values_type() | |
for dataset in self.values(): | |
dataset.set_format("custom", columns=columns, output_all_columns=output_all_columns, transform=transform) | |
def with_format( | |
self, | |
type: Optional[str] = None, | |
columns: Optional[List] = None, | |
output_all_columns: bool = False, | |
**format_kwargs, | |
) -> "DatasetDict": | |
"""Set `__getitem__` return format (type and columns). The data formatting is applied on-the-fly. | |
The format `type` (for example "numpy") is used to format batches when using `__getitem__`. | |
The format is set for every dataset in the dataset dictionary. | |
It's also possible to use custom transforms for formatting using [`~datasets.Dataset.with_transform`]. | |
Contrary to [`~datasets.DatasetDict.set_format`], `with_format` returns a new [`DatasetDict`] object with new [`Dataset`] objects. | |
Args: | |
type (`str`, *optional*): | |
Output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'pandas', 'arrow', 'jax']`. | |
`None` means `__getitem__` returns python objects (default). | |
columns (`List[str]`, *optional*): | |
Columns to format in the output. | |
`None` means `__getitem__` returns all columns (default). | |
output_all_columns (`bool`, defaults to `False`): | |
Keep un-formatted columns as well in the output (as python objects). | |
**format_kwargs (additional keyword arguments): | |
Keyword arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> from transformers import AutoTokenizer | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") | |
>>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True) | |
>>> ds["train"].format | |
{'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], | |
'format_kwargs': {}, | |
'output_all_columns': False, | |
'type': None} | |
>>> ds = ds.with_format("torch") | |
>>> ds["train"].format | |
{'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], | |
'format_kwargs': {}, | |
'output_all_columns': False, | |
'type': 'torch'} | |
>>> ds["train"][0] | |
{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', | |
'label': tensor(1), | |
'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, | |
1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105, | |
1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0]), | |
'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), | |
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])} | |
``` | |
""" | |
dataset = copy.deepcopy(self) | |
dataset.set_format(type=type, columns=columns, output_all_columns=output_all_columns, **format_kwargs) | |
return dataset | |
def with_transform( | |
self, | |
transform: Optional[Callable], | |
columns: Optional[List] = None, | |
output_all_columns: bool = False, | |
) -> "DatasetDict": | |
"""Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called. | |
The transform is set for every dataset in the dataset dictionary | |
As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`]. | |
Contrary to [`~datasets.DatasetDict.set_transform`], `with_transform` returns a new [`DatasetDict`] object with new [`Dataset`] objects. | |
Args: | |
transform (`Callable`, *optional*): | |
User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`]. | |
A formatting function is a callable that takes a batch (as a dict) as input and returns a batch. | |
This function is applied right before returning the objects in `__getitem__`. | |
columns (`List[str]`, *optional*): | |
Columns to format in the output. | |
If specified, then the input batch of the transform only contains those columns. | |
output_all_columns (`bool`, defaults to False): | |
Keep un-formatted columns as well in the output (as python objects). | |
If set to `True`, then the other un-formatted columns are kept with the output of the transform. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> from transformers import AutoTokenizer | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") | |
>>> def encode(example): | |
... return tokenizer(example['text'], truncation=True, padding=True, return_tensors="pt") | |
>>> ds = ds.with_transform(encode) | |
>>> ds["train"][0] | |
{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1, 1, 1, 1, 1, 1, 1, 1, 1]), | |
'input_ids': tensor([ 101, 1103, 2067, 1110, 17348, 1106, 1129, 1103, 6880, 1432, | |
112, 188, 1207, 107, 14255, 1389, 107, 1105, 1115, 1119, | |
112, 188, 1280, 1106, 1294, 170, 24194, 1256, 3407, 1190, | |
170, 11791, 5253, 188, 1732, 7200, 10947, 12606, 2895, 117, | |
179, 7766, 118, 172, 15554, 1181, 3498, 6961, 3263, 1137, | |
188, 1566, 7912, 14516, 6997, 119, 102]), | |
'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0])} | |
``` | |
""" | |
dataset = copy.deepcopy(self) | |
dataset.set_transform(transform=transform, columns=columns, output_all_columns=output_all_columns) | |
return dataset | |
def map( | |
self, | |
function: Optional[Callable] = None, | |
with_indices: bool = False, | |
with_rank: bool = False, | |
input_columns: Optional[Union[str, List[str]]] = None, | |
batched: bool = False, | |
batch_size: Optional[int] = 1000, | |
drop_last_batch: bool = False, | |
remove_columns: Optional[Union[str, List[str]]] = None, | |
keep_in_memory: bool = False, | |
load_from_cache_file: Optional[bool] = None, | |
cache_file_names: Optional[Dict[str, Optional[str]]] = None, | |
writer_batch_size: Optional[int] = 1000, | |
features: Optional[Features] = None, | |
disable_nullable: bool = False, | |
fn_kwargs: Optional[dict] = None, | |
num_proc: Optional[int] = None, | |
desc: Optional[str] = None, | |
) -> "DatasetDict": | |
"""Apply a function to all the elements in the table (individually or in batches) | |
and update the table (if the function does update examples).
The transformation is applied to all the datasets of the dataset dictionary. | |
Args: | |
function (`Callable`): Callable with one of the following signatures:
- `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False` | |
- `function(example: Dict[str, Any], indices: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True` | |
- `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False` | |
- `function(batch: Dict[str, List], indices: List[int]) -> Dict[str, List]` if `batched=True` and `with_indices=True` | |
For advanced usage, the function can also return a `pyarrow.Table`. | |
Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged. | |
with_indices (`bool`, defaults to `False`): | |
Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`. | |
with_rank (`bool`, defaults to `False`): | |
Provide process rank to `function`. Note that in this case the | |
signature of `function` should be `def function(example[, idx], rank): ...`. | |
input_columns (`[Union[str, List[str]]]`, *optional*, defaults to `None`): | |
The columns to be passed into `function` as | |
positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument. | |
batched (`bool`, defaults to `False`): | |
Provide batch of examples to `function`. | |
batch_size (`int`, *optional*, defaults to `1000`): | |
Number of examples per batch provided to `function` if `batched=True`.
If `batch_size <= 0` or `batch_size is None`, the full dataset is provided as a single batch to `function`.
drop_last_batch (`bool`, defaults to `False`): | |
Whether a last batch smaller than the batch_size should be | |
dropped instead of being processed by the function. | |
remove_columns (`[Union[str, List[str]]]`, *optional*, defaults to `None`): | |
Remove a selection of columns while doing the mapping. | |
Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding | |
columns with names in `remove_columns`, these columns will be kept. | |
keep_in_memory (`bool`, defaults to `False`): | |
Keep the dataset in memory instead of writing it to a cache file. | |
load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): | |
If a cache file storing the current computation from `function` | |
can be identified, use it instead of recomputing. | |
cache_file_names (`[Dict[str, str]]`, *optional*, defaults to `None`): | |
Provide the name of a path for the cache file. It is used to store the | |
results of the computation instead of the automatically generated cache file name. | |
You have to provide one `cache_file_name` per dataset in the dataset dictionary. | |
writer_batch_size (`int`, default `1000`): | |
Number of rows per write operation for the cache file writer. | |
This value is a good trade-off between memory usage during the processing, and processing speed. | |
A higher value makes the processing do fewer lookups, a lower value consumes less temporary memory while running `map`.
features (`[datasets.Features]`, *optional*, defaults to `None`): | |
Use a specific [`Features`] to store the cache file | |
instead of the automatically generated one. | |
disable_nullable (`bool`, defaults to `False`): | |
Disallow null values in the table. | |
fn_kwargs (`Dict`, *optional*, defaults to `None`): | |
Keyword arguments to be passed to `function` | |
num_proc (`int`, *optional*, defaults to `None`): | |
Number of processes for multiprocessing. By default it doesn't | |
use multiprocessing. | |
desc (`str`, *optional*, defaults to `None`): | |
Meaningful description to be displayed alongside with the progress bar while mapping examples. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> def add_prefix(example): | |
... example["text"] = "Review: " + example["text"] | |
... return example | |
>>> ds = ds.map(add_prefix) | |
>>> ds["train"][0:3]["text"] | |
['Review: the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', | |
'Review: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .', | |
'Review: effective but too-tepid biopic'] | |
# process a batch of examples | |
>>> ds = ds.map(lambda example: tokenizer(example["text"]), batched=True) | |
# set number of processors | |
>>> ds = ds.map(add_prefix, num_proc=4) | |
``` | |
""" | |
self._check_values_type() | |
if cache_file_names is None: | |
cache_file_names = {k: None for k in self} | |
return DatasetDict( | |
{ | |
k: dataset.map( | |
function=function, | |
with_indices=with_indices, | |
with_rank=with_rank, | |
input_columns=input_columns, | |
batched=batched, | |
batch_size=batch_size, | |
drop_last_batch=drop_last_batch, | |
remove_columns=remove_columns, | |
keep_in_memory=keep_in_memory, | |
load_from_cache_file=load_from_cache_file, | |
cache_file_name=cache_file_names[k], | |
writer_batch_size=writer_batch_size, | |
features=features, | |
disable_nullable=disable_nullable, | |
fn_kwargs=fn_kwargs, | |
num_proc=num_proc, | |
desc=desc, | |
) | |
for k, dataset in self.items() | |
} | |
) | |
def filter( | |
self, | |
function: Optional[Callable] = None, | |
with_indices: bool = False, | |
with_rank: bool = False, | |
input_columns: Optional[Union[str, List[str]]] = None, | |
batched: bool = False, | |
batch_size: Optional[int] = 1000, | |
keep_in_memory: bool = False, | |
load_from_cache_file: Optional[bool] = None, | |
cache_file_names: Optional[Dict[str, Optional[str]]] = None, | |
writer_batch_size: Optional[int] = 1000, | |
fn_kwargs: Optional[dict] = None, | |
num_proc: Optional[int] = None, | |
desc: Optional[str] = None, | |
) -> "DatasetDict": | |
"""Apply a filter function to all the elements in the table in batches | |
and update the table so that the dataset only includes examples according to the filter function. | |
The transformation is applied to all the datasets of the dataset dictionary. | |
Args: | |
function (`Callable`): Callable with one of the following signatures: | |
- `function(example: Dict[str, Any]) -> bool` if `batched=False` and `with_indices=False` and `with_rank=False` | |
- `function(example: Dict[str, Any], *extra_args) -> bool` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each) | |
- `function(batch: Dict[str, List]) -> List[bool]` if `batched=True` and `with_indices=False` and `with_rank=False` | |
- `function(batch: Dict[str, List], *extra_args) -> List[bool]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each) | |
If no function is provided, defaults to an always `True` function: `lambda x: True`. | |
with_indices (`bool`, defaults to `False`): | |
Provide example indices to `function`. Note that in this case the | |
signature of `function` should be `def function(example, idx[, rank]): ...`. | |
with_rank (`bool`, defaults to `False`): | |
Provide process rank to `function`. Note that in this case the | |
signature of `function` should be `def function(example[, idx], rank): ...`. | |
input_columns (`[Union[str, List[str]]]`, *optional*, defaults to `None`): | |
The columns to be passed into `function` as | |
positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument. | |
batched (`bool`, defaults to `False`): | |
Provide batch of examples to `function`. | |
batch_size (`int`, *optional*, defaults to `1000`): | |
Number of examples per batch provided to `function` if `batched=True`.
If `batch_size <= 0` or `batch_size is None`, the full dataset is provided as a single batch to `function`.
keep_in_memory (`bool`, defaults to `False`): | |
Keep the dataset in memory instead of writing it to a cache file. | |
load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): | |
If a cache file storing the current computation from `function` | |
can be identified, use it instead of recomputing. | |
cache_file_names (`[Dict[str, str]]`, *optional*, defaults to `None`): | |
Provide the name of a path for the cache file. It is used to store the | |
results of the computation instead of the automatically generated cache file name. | |
You have to provide one `cache_file_name` per dataset in the dataset dictionary. | |
writer_batch_size (`int`, defaults to `1000`): | |
Number of rows per write operation for the cache file writer. | |
This value is a good trade-off between memory usage during the processing, and processing speed. | |
A higher value makes the processing do fewer lookups, a lower value consumes less temporary memory while running `map`.
fn_kwargs (`Dict`, *optional*, defaults to `None`): | |
Keyword arguments to be passed to `function` | |
num_proc (`int`, *optional*, defaults to `None`): | |
Number of processes for multiprocessing. By default it doesn't | |
use multiprocessing. | |
desc (`str`, *optional*, defaults to `None`): | |
Meaningful description to be displayed alongside with the progress bar while filtering examples. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds.filter(lambda x: x["label"] == 1) | |
DatasetDict({ | |
train: Dataset({ | |
features: ['text', 'label'], | |
num_rows: 4265 | |
}) | |
validation: Dataset({ | |
features: ['text', 'label'], | |
num_rows: 533 | |
}) | |
test: Dataset({ | |
features: ['text', 'label'], | |
num_rows: 533 | |
}) | |
}) | |
``` | |
""" | |
self._check_values_type() | |
if cache_file_names is None: | |
cache_file_names = {k: None for k in self} | |
return DatasetDict( | |
{ | |
k: dataset.filter( | |
function=function, | |
with_indices=with_indices, | |
with_rank=with_rank, | |
input_columns=input_columns, | |
batched=batched, | |
batch_size=batch_size, | |
keep_in_memory=keep_in_memory, | |
load_from_cache_file=load_from_cache_file, | |
cache_file_name=cache_file_names[k], | |
writer_batch_size=writer_batch_size, | |
fn_kwargs=fn_kwargs, | |
num_proc=num_proc, | |
desc=desc, | |
) | |
for k, dataset in self.items() | |
} | |
) | |
def flatten_indices( | |
self, | |
keep_in_memory: bool = False, | |
cache_file_names: Optional[Dict[str, Optional[str]]] = None, | |
writer_batch_size: Optional[int] = 1000, | |
features: Optional[Features] = None, | |
disable_nullable: bool = False, | |
num_proc: Optional[int] = None, | |
new_fingerprint: Optional[str] = None, | |
) -> "DatasetDict": | |
"""Create and cache a new Dataset by flattening the indices mapping. | |
Args: | |
keep_in_memory (`bool`, defaults to `False`): | |
Keep the dataset in memory instead of writing it to a cache file. | |
cache_file_names (`Dict[str, str]`, *optional*, default `None`): | |
Provide the name of a path for the cache file. It is used to store the | |
results of the computation instead of the automatically generated cache file name. | |
You have to provide one `cache_file_name` per dataset in the dataset dictionary. | |
writer_batch_size (`int`, defaults to `1000`): | |
Number of rows per write operation for the cache file writer. | |
This value is a good trade-off between memory usage during the processing, and processing speed. | |
A higher value makes the processing do fewer lookups, a lower value consumes less temporary memory while running `map`.
features (`Optional[datasets.Features]`, defaults to `None`): | |
Use a specific [`Features`] to store the cache file | |
instead of the automatically generated one. | |
disable_nullable (`bool`, defaults to `False`): | |
Disallow null values in the table.
num_proc (`int`, optional, default `None`): | |
Max number of processes when generating cache. Already cached shards are loaded sequentially | |
new_fingerprint (`str`, *optional*, defaults to `None`): | |
The new fingerprint of the dataset after transform. | |
If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments | |
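Example (illustrative sketch; `filter` is used here only to create an indices mapping that `flatten_indices` then materializes):
```py
>>> from datasets import load_dataset
>>> ds = load_dataset("rotten_tomatoes")
>>> ds = ds.filter(lambda x: x["label"] == 1)  # creates an indices mapping over the original data
>>> ds = ds.flatten_indices()  # rewrites each split so no indices mapping is needed anymore
```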
""" | |
self._check_values_type() | |
if cache_file_names is None: | |
cache_file_names = {k: None for k in self} | |
return DatasetDict( | |
{ | |
k: dataset.flatten_indices( | |
keep_in_memory=keep_in_memory, | |
cache_file_name=cache_file_names[k], | |
writer_batch_size=writer_batch_size, | |
features=features, | |
disable_nullable=disable_nullable, | |
num_proc=num_proc, | |
new_fingerprint=new_fingerprint, | |
) | |
for k, dataset in self.items() | |
} | |
) | |
def sort( | |
self, | |
column_names: Union[str, Sequence[str]], | |
reverse: Union[bool, Sequence[bool]] = False, | |
null_placement: str = "at_end", | |
keep_in_memory: bool = False, | |
load_from_cache_file: Optional[bool] = None, | |
indices_cache_file_names: Optional[Dict[str, Optional[str]]] = None, | |
writer_batch_size: Optional[int] = 1000, | |
) -> "DatasetDict": | |
"""Create a new dataset sorted according to a single or multiple columns. | |
Args: | |
column_names (`Union[str, Sequence[str]]`): | |
Column name(s) to sort by. | |
reverse (`Union[bool, Sequence[bool]]`, defaults to `False`): | |
If `True`, sort by descending order rather than ascending. If a single bool is provided, | |
the value is applied to the sorting of all column names. Otherwise a list of bools with the | |
same length and order as column_names must be provided. | |
null_placement (`str`, defaults to `at_end`): | |
Put `None` values at the beginning if `at_start` or `first` or at the end if `at_end` or `last` | |
keep_in_memory (`bool`, defaults to `False`): | |
Keep the sorted indices in memory instead of writing it to a cache file. | |
load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): | |
If a cache file storing the sorted indices | |
can be identified, use it instead of recomputing. | |
indices_cache_file_names (`[Dict[str, str]]`, *optional*, defaults to `None`): | |
Provide the name of a path for the cache file. It is used to store the | |
indices mapping instead of the automatically generated cache file name. | |
You have to provide one `cache_file_name` per dataset in the dataset dictionary. | |
writer_batch_size (`int`, defaults to `1000`): | |
Number of rows per write operation for the cache file writer. | |
A higher value gives smaller cache files, a lower value consumes less temporary memory.
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset('rotten_tomatoes') | |
>>> ds['train']['label'][:10] | |
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1] | |
>>> sorted_ds = ds.sort('label') | |
>>> sorted_ds['train']['label'][:10] | |
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | |
>>> another_sorted_ds = ds.sort(['label', 'text'], reverse=[True, False]) | |
>>> another_sorted_ds['train']['label'][:10] | |
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1] | |
``` | |
""" | |
self._check_values_type() | |
if indices_cache_file_names is None: | |
indices_cache_file_names = {k: None for k in self} | |
return DatasetDict( | |
{ | |
k: dataset.sort( | |
column_names=column_names, | |
reverse=reverse, | |
null_placement=null_placement, | |
keep_in_memory=keep_in_memory, | |
load_from_cache_file=load_from_cache_file, | |
indices_cache_file_name=indices_cache_file_names[k], | |
writer_batch_size=writer_batch_size, | |
) | |
for k, dataset in self.items() | |
} | |
) | |
def shuffle( | |
self, | |
seeds: Optional[Union[int, Dict[str, Optional[int]]]] = None, | |
seed: Optional[int] = None, | |
generators: Optional[Dict[str, np.random.Generator]] = None, | |
keep_in_memory: bool = False, | |
load_from_cache_file: Optional[bool] = None, | |
indices_cache_file_names: Optional[Dict[str, Optional[str]]] = None, | |
writer_batch_size: Optional[int] = 1000, | |
) -> "DatasetDict": | |
"""Create a new Dataset where the rows are shuffled. | |
The transformation is applied to all the datasets of the dataset dictionary. | |
Currently shuffling uses numpy random generators. | |
You can either supply a NumPy BitGenerator to use, or a seed to initialize NumPy's default random generator (PCG64).
Args: | |
seeds (`Dict[str, int]` or `int`, *optional*): | |
A seed to initialize the default BitGenerator if `generator=None`. | |
If `None`, then fresh, unpredictable entropy will be pulled from the OS. | |
If an `int` or `array_like[ints]` is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state. | |
You can provide one `seed` per dataset in the dataset dictionary. | |
seed (`int`, *optional*): | |
A seed to initialize the default BitGenerator if `generator=None`. Alias for seeds (a `ValueError` is raised if both are provided). | |
generators (`Dict[str, np.random.Generator]`, *optional*):
Numpy random Generator to use to compute the permutation of the dataset rows. | |
If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy). | |
You have to provide one `generator` per dataset in the dataset dictionary. | |
keep_in_memory (`bool`, defaults to `False`): | |
Keep the dataset in memory instead of writing it to a cache file. | |
load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): | |
If a cache file storing the current computation from `function` | |
can be identified, use it instead of recomputing. | |
indices_cache_file_names (`Dict[str, str]`, *optional*): | |
Provide the name of a path for the cache file. It is used to store the | |
indices mappings instead of the automatically generated cache file name. | |
You have to provide one `cache_file_name` per dataset in the dataset dictionary. | |
writer_batch_size (`int`, defaults to `1000`): | |
Number of rows per write operation for the cache file writer. | |
This value is a good trade-off between memory usage during the processing, and processing speed. | |
A higher value makes the processing do fewer lookups, a lower value consumes less temporary memory while running `map`.
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes") | |
>>> ds["train"]["label"][:10] | |
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1] | |
# set a seed | |
>>> shuffled_ds = ds.shuffle(seed=42) | |
>>> shuffled_ds["train"]["label"][:10] | |
[0, 1, 0, 1, 0, 0, 0, 0, 0, 0] | |
``` | |
""" | |
self._check_values_type() | |
if seed is not None and seeds is not None: | |
raise ValueError("Please specify seed or seeds, but not both") | |
seeds = seed if seed is not None else seeds | |
if seeds is None: | |
seeds = {k: None for k in self} | |
elif not isinstance(seeds, dict): | |
seeds = {k: seeds for k in self} | |
if generators is None: | |
generators = {k: None for k in self} | |
if indices_cache_file_names is None: | |
indices_cache_file_names = {k: None for k in self} | |
return DatasetDict( | |
{ | |
k: dataset.shuffle( | |
seed=seeds[k], | |
generator=generators[k], | |
keep_in_memory=keep_in_memory, | |
load_from_cache_file=load_from_cache_file, | |
indices_cache_file_name=indices_cache_file_names[k], | |
writer_batch_size=writer_batch_size, | |
) | |
for k, dataset in self.items() | |
} | |
) | |
def save_to_disk( | |
self, | |
dataset_dict_path: PathLike, | |
max_shard_size: Optional[Union[str, int]] = None, | |
num_shards: Optional[Dict[str, int]] = None, | |
num_proc: Optional[int] = None, | |
storage_options: Optional[dict] = None, | |
): | |
""" | |
Saves a dataset dict to a filesystem using `fsspec.spec.AbstractFileSystem`. | |
For [`Image`], [`Audio`] and [`Video`] data: | |
All the Image(), Audio() and Video() data are stored in the arrow files. | |
If you want to store paths or urls, please use the Value("string") type. | |
Args: | |
dataset_dict_path (`path-like`): | |
Path (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`) | |
of the dataset dict directory where the dataset dict will be saved to. | |
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`): | |
The maximum size of the dataset shards to be saved to disk. If expressed as a string, needs to be digits followed by a unit
(like `"50MB"`). | |
num_shards (`Dict[str, int]`, *optional*): | |
Number of shards to write. By default the number of shards depends on `max_shard_size` and `num_proc`. | |
You need to provide the number of shards for each dataset in the dataset dictionary. | |
Use a dictionary to define a different num_shards for each split. | |
<Added version="2.8.0"/> | |
num_proc (`int`, *optional*, default `None`): | |
Number of processes when saving the dataset shards locally.
Multiprocessing is disabled by default. | |
<Added version="2.8.0"/> | |
storage_options (`dict`, *optional*): | |
Key/value pairs to be passed on to the file-system backend, if any. | |
<Added version="2.8.0"/> | |
Example: | |
```python | |
>>> dataset_dict.save_to_disk("path/to/dataset/directory") | |
>>> dataset_dict.save_to_disk("path/to/dataset/directory", max_shard_size="1GB") | |
>>> dataset_dict.save_to_disk("path/to/dataset/directory", num_shards={"train": 1024, "test": 8}) | |
``` | |
""" | |
fs: fsspec.AbstractFileSystem | |
fs, _ = url_to_fs(dataset_dict_path, **(storage_options or {})) | |
if num_shards is None: | |
num_shards = {k: None for k in self} | |
elif not isinstance(num_shards, dict): | |
raise ValueError(
"Please provide one `num_shards` per dataset in the dataset dictionary, e.g. {'train': 128, 'test': 4}"
)
fs.makedirs(dataset_dict_path, exist_ok=True) | |
with fs.open(posixpath.join(dataset_dict_path, config.DATASETDICT_JSON_FILENAME), "w", encoding="utf-8") as f: | |
json.dump({"splits": list(self)}, f) | |
for k, dataset in self.items(): | |
dataset.save_to_disk( | |
posixpath.join(dataset_dict_path, k), | |
num_shards=num_shards.get(k), | |
max_shard_size=max_shard_size, | |
num_proc=num_proc, | |
storage_options=storage_options, | |
) | |
@staticmethod
def load_from_disk(
dataset_dict_path: PathLike, | |
keep_in_memory: Optional[bool] = None, | |
storage_options: Optional[dict] = None, | |
) -> "DatasetDict": | |
""" | |
Load a dataset that was previously saved using [`save_to_disk`] from a filesystem using `fsspec.spec.AbstractFileSystem`. | |
Args: | |
dataset_dict_path (`path-like`): | |
Path (e.g. `"dataset/train"`) or remote URI (e.g. `"s3://my-bucket/dataset/train"`)
of the dataset dict directory where the dataset dict will be loaded from. | |
keep_in_memory (`bool`, defaults to `None`): | |
Whether to copy the dataset in-memory. If `None`, the | |
dataset will not be copied in-memory unless explicitly enabled by setting | |
`datasets.config.IN_MEMORY_MAX_SIZE` to nonzero. See more details in the | |
[improve performance](../cache#improve-performance) section. | |
storage_options (`dict`, *optional*): | |
Key/value pairs to be passed on to the file-system backend, if any. | |
<Added version="2.8.0"/> | |
Returns: | |
[`DatasetDict`] | |
Example: | |
```py | |
>>> ds = load_from_disk('path/to/dataset/directory') | |
``` | |
""" | |
fs: fsspec.AbstractFileSystem | |
fs, dataset_dict_path = url_to_fs(dataset_dict_path, **(storage_options or {})) | |
dataset_dict_json_path = posixpath.join(dataset_dict_path, config.DATASETDICT_JSON_FILENAME) | |
dataset_state_json_path = posixpath.join(dataset_dict_path, config.DATASET_STATE_JSON_FILENAME) | |
dataset_info_path = posixpath.join(dataset_dict_path, config.DATASET_INFO_FILENAME) | |
if not fs.isfile(dataset_dict_json_path): | |
if fs.isfile(dataset_info_path) and fs.isfile(dataset_state_json_path): | |
raise FileNotFoundError( | |
f"No such file: '{dataset_dict_json_path}'. Expected to load a `DatasetDict` object, but got a `Dataset`. Please use either `datasets.load_from_disk` or `Dataset.load_from_disk` instead." | |
) | |
raise FileNotFoundError( | |
f"No such file: '{dataset_dict_json_path}'. Expected to load a `DatasetDict` object, but provided path is not a `DatasetDict`." | |
) | |
with fs.open(dataset_dict_json_path, "r", encoding="utf-8") as f: | |
splits = json.load(f)["splits"] | |
dataset_dict = DatasetDict() | |
for k in splits: | |
dataset_dict_split_path = posixpath.join(fs.unstrip_protocol(dataset_dict_path), k) | |
dataset_dict[k] = Dataset.load_from_disk( | |
dataset_dict_split_path, keep_in_memory=keep_in_memory, storage_options=storage_options | |
) | |
return dataset_dict | |
@staticmethod
def from_csv(
path_or_paths: Dict[str, PathLike], | |
features: Optional[Features] = None, | |
cache_dir: str = None, | |
keep_in_memory: bool = False, | |
**kwargs, | |
) -> "DatasetDict": | |
"""Create [`DatasetDict`] from CSV file(s). | |
Args: | |
path_or_paths (`dict` of path-like): | |
Path(s) of the CSV file(s). | |
features ([`Features`], *optional*): | |
Dataset features. | |
cache_dir (str, *optional*, defaults to `"~/.cache/huggingface/datasets"`): | |
Directory to cache data. | |
keep_in_memory (`bool`, defaults to `False`): | |
Whether to copy the data in-memory. | |
**kwargs (additional keyword arguments): | |
Keyword arguments to be passed to [`pandas.read_csv`]. | |
Returns: | |
[`DatasetDict`] | |
Example: | |
```py | |
>>> from datasets import DatasetDict | |
>>> ds = DatasetDict.from_csv({'train': 'path/to/dataset.csv'}) | |
``` | |
""" | |
# Dynamic import to avoid circular dependency | |
from .io.csv import CsvDatasetReader | |
return CsvDatasetReader( | |
path_or_paths, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs | |
).read() | |
@staticmethod
def from_json(
path_or_paths: Dict[str, PathLike], | |
features: Optional[Features] = None, | |
cache_dir: str = None, | |
keep_in_memory: bool = False, | |
**kwargs, | |
) -> "DatasetDict": | |
"""Create [`DatasetDict`] from JSON Lines file(s). | |
Args: | |
path_or_paths (`dict` of path-like):
Path(s) of the JSON Lines file(s). | |
features ([`Features`], *optional*): | |
Dataset features. | |
cache_dir (str, *optional*, defaults to `"~/.cache/huggingface/datasets"`): | |
Directory to cache data. | |
keep_in_memory (`bool`, defaults to `False`): | |
Whether to copy the data in-memory. | |
**kwargs (additional keyword arguments): | |
Keyword arguments to be passed to [`JsonConfig`]. | |
Returns: | |
[`DatasetDict`] | |
Example: | |
```py | |
>>> from datasets import DatasetDict | |
>>> ds = DatasetDict.from_json({'train': 'path/to/dataset.json'}) | |
``` | |
""" | |
# Dynamic import to avoid circular dependency | |
from .io.json import JsonDatasetReader | |
return JsonDatasetReader( | |
path_or_paths, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs | |
).read() | |
@staticmethod
def from_parquet(
path_or_paths: Dict[str, PathLike], | |
features: Optional[Features] = None, | |
cache_dir: str = None, | |
keep_in_memory: bool = False, | |
columns: Optional[List[str]] = None, | |
**kwargs, | |
) -> "DatasetDict": | |
"""Create [`DatasetDict`] from Parquet file(s). | |
Args: | |
path_or_paths (`dict` of path-like): | |
Path(s) of the Parquet file(s). | |
features ([`Features`], *optional*): | |
Dataset features. | |
cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`): | |
Directory to cache data. | |
keep_in_memory (`bool`, defaults to `False`): | |
Whether to copy the data in-memory. | |
columns (`List[str]`, *optional*): | |
If not `None`, only these columns will be read from the file. | |
A column name may be a prefix of a nested field, e.g. 'a' will select | |
'a.b', 'a.c', and 'a.d.e'. | |
**kwargs (additional keyword arguments): | |
Keyword arguments to be passed to [`ParquetConfig`]. | |
Returns: | |
[`DatasetDict`] | |
Example: | |
```py | |
>>> from datasets import DatasetDict | |
>>> ds = DatasetDict.from_parquet({'train': 'path/to/dataset.parquet'}) | |
``` | |
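The `columns` argument can restrict what is read from the Parquet files (the path and column names are illustrative): | |
```py | |
>>> ds = DatasetDict.from_parquet({'train': 'path/to/dataset.parquet'}, columns=['text', 'label']) | |
``` | |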
""" | |
# Dynamic import to avoid circular dependency | |
from .io.parquet import ParquetDatasetReader | |
return ParquetDatasetReader( | |
path_or_paths, | |
features=features, | |
cache_dir=cache_dir, | |
keep_in_memory=keep_in_memory, | |
columns=columns, | |
**kwargs, | |
).read() | |
@staticmethod | |
def from_text( | |
path_or_paths: Dict[str, PathLike], | |
features: Optional[Features] = None, | |
cache_dir: Optional[str] = None, | |
keep_in_memory: bool = False, | |
**kwargs, | |
) -> "DatasetDict": | |
"""Create [`DatasetDict`] from text file(s). | |
Args: | |
path_or_paths (`dict` of path-like): | |
Path(s) of the text file(s). | |
features ([`Features`], *optional*): | |
Dataset features. | |
cache_dir (`str`, *optional*, defaults to `"~/.cache/huggingface/datasets"`): | |
Directory to cache data. | |
keep_in_memory (`bool`, defaults to `False`): | |
Whether to copy the data in-memory. | |
**kwargs (additional keyword arguments): | |
Keyword arguments to be passed to [`TextConfig`]. | |
Returns: | |
[`DatasetDict`] | |
Example: | |
```py | |
>>> from datasets import DatasetDict | |
>>> ds = DatasetDict.from_text({'train': 'path/to/dataset.txt'}) | |
``` | |
""" | |
# Dynamic import to avoid circular dependency | |
from .io.text import TextDatasetReader | |
return TextDatasetReader( | |
path_or_paths, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs | |
).read() | |
def align_labels_with_mapping(self, label2id: Dict, label_column: str) -> "DatasetDict": | |
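"""Align the dataset's label ID and label name mapping to match an input `label2id` mapping. | |
This is useful when you want the dataset's label IDs to line up with those a model was trained with. | |
The alignment is applied to every split of the dataset dictionary. | |
Args: | |
label2id (`dict`): | |
The label name to ID mapping to align the dataset with. | |
label_column (`str`): | |
The column name of the labels to align on. | |
Example (illustrative; the MNLI subset of GLUE uses the mapping `{'entailment': 0, 'neutral': 1, 'contradiction': 2}`): | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("glue", "mnli") | |
>>> # align the label ids with a model trained on a different mapping | |
>>> label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2} | |
>>> ds = ds.align_labels_with_mapping(label2id, "label") | |
``` | |
""" | |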
self._check_values_type() | |
return DatasetDict( | |
{ | |
k: dataset.align_labels_with_mapping(label2id=label2id, label_column=label_column) | |
for k, dataset in self.items() | |
} | |
) | |
def push_to_hub( | |
self, | |
repo_id, | |
config_name: str = "default", | |
set_default: Optional[bool] = None, | |
data_dir: Optional[str] = None, | |
commit_message: Optional[str] = None, | |
commit_description: Optional[str] = None, | |
private: Optional[bool] = None, | |
token: Optional[str] = None, | |
revision: Optional[str] = None, | |
create_pr: Optional[bool] = False, | |
max_shard_size: Optional[Union[int, str]] = None, | |
num_shards: Optional[Dict[str, int]] = None, | |
embed_external_files: bool = True, | |
) -> CommitInfo: | |
"""Pushes the [`DatasetDict`] to the hub as a Parquet dataset. | |
The [`DatasetDict`] is pushed using HTTP requests and does not require git or git-lfs to be installed. | |
Each dataset split will be pushed independently. The pushed dataset will keep the original split names. | |
The resulting Parquet files are self-contained by default: if your dataset contains [`Image`] or [`Audio`] | |
data, the Parquet files will store the bytes of your images or audio files. | |
You can disable this by setting `embed_external_files` to False. | |
Args: | |
repo_id (`str`): | |
The ID of the repository to push to in the following format: `<user>/<dataset_name>` or | |
`<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace | |
of the logged-in user. | |
config_name (`str`): | |
Configuration name of a dataset. Defaults to "default". | |
set_default (`bool`, *optional*): | |
Whether to set this configuration as the default one. Otherwise, the default configuration is the one | |
named "default". | |
data_dir (`str`, *optional*): | |
Directory name that will contain the uploaded data files. Defaults to the `config_name` if different | |
from "default", else "data". | |
<Added version="2.17.0"/> | |
commit_message (`str`, *optional*): | |
Message to commit while pushing. Will default to `"Upload dataset"`. | |
commit_description (`str`, *optional*): | |
Description of the commit that will be created. | |
Additionally, description of the PR if a PR is created (`create_pr` is True). | |
<Added version="2.16.0"/> | |
private (`bool`, *optional*): | |
Whether to make the repo private. If `None` (default), the repo will be public unless the | |
organization's default is private. This value is ignored if the repo already exists. | |
token (`str`, *optional*): | |
An optional authentication token for the Hugging Face Hub. If no token is passed, will default | |
to the token saved locally when logging in with `huggingface-cli login`. Will raise an error | |
if no token is passed and the user is not logged-in. | |
revision (`str`, *optional*): | |
Branch to push the uploaded files to. Defaults to the `"main"` branch. | |
<Added version="2.15.0"/> | |
create_pr (`bool`, *optional*, defaults to `False`): | |
Whether to create a PR with the uploaded files or directly commit. | |
<Added version="2.15.0"/> | |
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`): | |
The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit | |
(like `"500MB"` or `"1GB"`). | |
num_shards (`Dict[str, int]`, *optional*): | |
Number of shards to write. By default, the number of shards depends on `max_shard_size`. | |
Use a dictionary to define a different num_shards for each split. | |
<Added version="2.8.0"/> | |
embed_external_files (`bool`, defaults to `True`): | |
Whether to embed file bytes in the shards. | |
In particular, this will do the following before the push for the fields of type: | |
- [`Audio`] and [`Image`]: removes local path information and embeds the file content in the Parquet files. | |
Returns: | |
huggingface_hub.CommitInfo | |
Example: | |
```python | |
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>") | |
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", private=True) | |
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", max_shard_size="1GB") | |
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", num_shards={"train": 1024, "test": 8}) | |
``` | |
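The `revision` and `create_pr` arguments can be used to push to a branch or open a pull request instead (the branch name is illustrative): | |
```python | |
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", revision="dev") | |
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", create_pr=True) | |
``` | |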
If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages): | |
```python | |
>>> english_dataset.push_to_hub("<organization>/<dataset_id>", "en") | |
>>> french_dataset.push_to_hub("<organization>/<dataset_id>", "fr") | |
>>> # later | |
>>> english_dataset = load_dataset("<organization>/<dataset_id>", "en") | |
>>> french_dataset = load_dataset("<organization>/<dataset_id>", "fr") | |
``` | |
""" | |
if num_shards is None: | |
num_shards = {k: None for k in self} | |
elif not isinstance(num_shards, dict): | |
raise ValueError( | |
"Please provide one `num_shards` per dataset in the dataset dictionary, e.g. {{'train': 128, 'test': 4}}" | |
) | |
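# At this point `num_shards` maps every split to either an explicit shard count or | |
# `None`, in which case the number of shards is derived from `max_shard_size`. | |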
self._check_values_type() | |
self._check_values_features() | |
total_uploaded_size = 0 | |
total_dataset_nbytes = 0 | |
info_to_dump: DatasetInfo = next(iter(self.values())).info.copy() | |
info_to_dump.config_name = config_name | |
info_to_dump.splits = SplitDict() | |
for split in self.keys(): | |
if not re.match(_split_re, split): | |
raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.") | |
api = HfApi(endpoint=config.HF_ENDPOINT, token=token) | |
repo_url = api.create_repo( | |
repo_id, | |
token=token, | |
repo_type="dataset", | |
private=private, | |
exist_ok=True, | |
) | |
repo_id = repo_url.repo_id | |
if revision is not None and not revision.startswith("refs/pr/"): | |
# We do not call create_branch for a PR reference: 400 Bad Request | |
api.create_branch(repo_id, branch=revision, token=token, repo_type="dataset", exist_ok=True) | |
if not data_dir: | |
data_dir = config_name if config_name != "default" else "data" # for backward compatibility | |
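# Parquet shards for each split are uploaded under `{data_dir}/{split}-*` in the repo, | |
# the same pattern that is written to the dataset card metadata further below. | |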
additions = [] | |
for split in self.keys(): | |
logger.info(f"Pushing split {split} to the Hub.") | |
# The split=key needs to be removed before merging | |
split_additions, uploaded_size, dataset_nbytes = self[split]._push_parquet_shards_to_hub( | |
repo_id, | |
data_dir=data_dir, | |
split=split, | |
token=token, | |
revision=revision, | |
create_pr=create_pr, | |
max_shard_size=max_shard_size, | |
num_shards=num_shards.get(split), | |
embed_external_files=embed_external_files, | |
) | |
additions += split_additions | |
total_uploaded_size += uploaded_size | |
total_dataset_nbytes += dataset_nbytes | |
info_to_dump.splits[split] = SplitInfo(str(split), num_bytes=dataset_nbytes, num_examples=len(self[split])) | |
info_to_dump.download_checksums = None | |
info_to_dump.download_size = total_uploaded_size | |
info_to_dump.dataset_size = total_dataset_nbytes | |
info_to_dump.size_in_bytes = total_uploaded_size + total_dataset_nbytes | |
# Check if the repo already has a README.md and/or a dataset_infos.json to update them with the new split info (size and pattern) | |
# and delete old split shards (if they exist) | |
repo_with_dataset_card, repo_with_dataset_infos = False, False | |
repo_splits = [] # use a list to keep the order of the splits | |
deletions = [] | |
repo_files_to_add = [addition.path_in_repo for addition in additions] | |
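# Single pass over the existing repo tree: detect a dataset card and/or a legacy | |
# dataset_infos.json, schedule stale shards of the pushed splits for deletion, and | |
# collect split names that were pushed before metadata configs existed. | |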
for repo_file in api.list_repo_tree( | |
repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True | |
): | |
if not isinstance(repo_file, RepoFile): | |
continue | |
if repo_file.rfilename == config.REPOCARD_FILENAME: | |
repo_with_dataset_card = True | |
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME: | |
repo_with_dataset_infos = True | |
elif ( | |
repo_file.rfilename.startswith(tuple(f"{data_dir}/{split}-" for split in self.keys())) | |
and repo_file.rfilename not in repo_files_to_add | |
): | |
deletions.append(CommitOperationDelete(path_in_repo=repo_file.rfilename)) | |
elif fnmatch.fnmatch( | |
repo_file.rfilename, PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace("{split}", "*") | |
): | |
repo_split = string_to_dict( | |
repo_file.rfilename, | |
glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED), | |
)["split"] | |
if repo_split not in repo_splits: | |
repo_splits.append(repo_split) | |
# get the info from the README to update them | |
if repo_with_dataset_card: | |
dataset_card_path = api.hf_hub_download( | |
repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision | |
) | |
dataset_card = DatasetCard.load(Path(dataset_card_path)) | |
dataset_card_data = dataset_card.data | |
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data) | |
# get the deprecated dataset_infos.json to update them | |
elif repo_with_dataset_infos: | |
dataset_card = None | |
dataset_card_data = DatasetCardData() | |
metadata_configs = MetadataConfigs() | |
else: | |
dataset_card = None | |
dataset_card_data = DatasetCardData() | |
metadata_configs = MetadataConfigs() | |
# create the metadata configs if it was uploaded with push_to_hub before metadata configs existed | |
if not metadata_configs and repo_splits: | |
default_metadata_configs_to_dump = { | |
"data_files": [{"split": split, "path": f"data/{split}-*"} for split in repo_splits] | |
} | |
MetadataConfigs({"default": default_metadata_configs_to_dump}).to_dataset_card_data(dataset_card_data) | |
metadata_config_to_dump = { | |
"data_files": [{"split": split, "path": f"{data_dir}/{split}-*"} for split in self.keys()], | |
} | |
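# `metadata_config_to_dump` becomes this config's entry in the dataset card YAML, | |
# mapping each split to the glob pattern of its Parquet shards ("default": True is | |
# added below when `set_default` is requested). | |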
if set_default and config_name != "default": | |
if metadata_configs: | |
default_config_name = metadata_configs.get_default_config_name() | |
if default_config_name == "default": | |
raise ValueError( | |
"There exists a configuration named 'default'. To set a different configuration as default, " | |
"rename the 'default' one first." | |
) | |
elif default_config_name: | |
_ = metadata_configs[default_config_name].pop("default") | |
metadata_config_to_dump["default"] = True | |
# push to the deprecated dataset_infos.json | |
if repo_with_dataset_infos: | |
dataset_infos_path = api.hf_hub_download( | |
repo_id, config.DATASETDICT_INFOS_FILENAME, repo_type="dataset", revision=revision | |
) | |
with open(dataset_infos_path, encoding="utf-8") as f: | |
dataset_infos: dict = json.load(f) | |
dataset_infos[config_name] = asdict(info_to_dump) | |
buffer = BytesIO() | |
buffer.write(json.dumps(dataset_infos, indent=4).encode("utf-8")) | |
additions.append( | |
CommitOperationAdd(path_in_repo=config.DATASETDICT_INFOS_FILENAME, path_or_fileobj=buffer) | |
) | |
# push to README | |
DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data) | |
MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(dataset_card_data) | |
dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card | |
additions.append( | |
CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode()) | |
) | |
commit_message = commit_message if commit_message is not None else "Upload dataset" | |
if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT: | |
commit_info = api.create_commit( | |
repo_id, | |
operations=additions + deletions, | |
commit_message=commit_message, | |
commit_description=commit_description, | |
token=token, | |
repo_type="dataset", | |
revision=revision, | |
create_pr=create_pr, | |
) | |
else: | |
logger.info( | |
f"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits." | |
) | |
num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT) | |
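# Deletions are attached to the first commit only; the remaining commits just add | |
# the leftover Parquet shards in chunks of `UPLOADS_MAX_NUMBER_PER_COMMIT`. | |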
for i in range(0, num_commits): | |
operations = additions[ | |
i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT | |
] + (deletions if i == 0 else []) | |
commit_info = api.create_commit( | |
repo_id, | |
operations=operations, | |
commit_message=commit_message + f" (part {i:05d}-of-{num_commits:05d})", | |
commit_description=commit_description, | |
token=token, | |
repo_type="dataset", | |
revision=revision, | |
create_pr=create_pr, | |
) | |
logger.info( | |
f"Commit #{i+1} completed" | |
+ (f" (still {num_commits - i - 1} to go)" if num_commits - i - 1 else "") | |
+ "." | |
) | |
return commit_info | |
class IterableDatasetDict(dict): | |
def __repr__(self): | |
repr = "\n".join([f"{k}: {v}" for k, v in self.items()]) | |
repr = re.sub(r"^", " " * 4, repr, 0, re.M) | |
return f"IterableDatasetDict({{\n{repr}\n}})" | |
def with_format( | |
self, | |
type: Optional[str] = None, | |
) -> "IterableDatasetDict": | |
""" | |
Return a dataset with the specified format. | |
The 'pandas' format is currently not implemented. | |
Args: | |
type (`str`, *optional*): | |
The output type, selected from `[None, 'numpy', 'torch', 'tensorflow', 'arrow', 'jax']`. | |
`None` means it returns python objects (default). | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> from transformers import AutoTokenizer | |
>>> ds = load_dataset("rotten_tomatoes", split="validation", streaming=True) | |
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") | |
>>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True) | |
>>> ds = ds.with_format("torch") | |
>>> next(iter(ds)) | |
{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', | |
'label': tensor(1), | |
'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, | |
1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105, | |
1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0]), | |
'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), | |
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])} | |
``` | |
""" | |
return IterableDatasetDict({k: dataset.with_format(type=type) for k, dataset in self.items()}) | |
def map( | |
self, | |
function: Optional[Callable] = None, | |
with_indices: bool = False, | |
input_columns: Optional[Union[str, List[str]]] = None, | |
batched: bool = False, | |
batch_size: int = 1000, | |
drop_last_batch: bool = False, | |
remove_columns: Optional[Union[str, List[str]]] = None, | |
fn_kwargs: Optional[dict] = None, | |
) -> "IterableDatasetDict": | |
""" | |
Apply a function to all the examples in the iterable dataset (individually or in batches) and update them. | |
If your function returns a column that already exists, then it overwrites it. | |
The function is applied on-the-fly on the examples when iterating over the dataset. | |
The transformation is applied to all the datasets of the dataset dictionary. | |
You can specify whether the function should be batched or not with the `batched` parameter: | |
- If batched is `False`, then the function takes 1 example in and should return 1 example. | |
An example is a dictionary, e.g. `{"text": "Hello there !"}`. | |
- If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples. | |
A batch is a dictionary, e.g. a batch of 1 example is `{"text": ["Hello there !"]}`. | |
- If batched is `True` and `batch_size` is `n` > 1, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples. | |
Note that the last batch may have less than `n` examples. | |
A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`. | |
Args: | |
function (`Callable`, *optional*, defaults to `None`): | |
Function applied on-the-fly on the examples when you iterate on the dataset. | |
It must have one of the following signatures: | |
- `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False` | |
- `function(example: Dict[str, Any], idx: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True` | |
- `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False` | |
- `function(batch: Dict[str, List], indices: List[int]) -> Dict[str, List]` if `batched=True` and `with_indices=True` | |
For advanced usage, the function can also return a `pyarrow.Table`. | |
Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged. | |
If no function is provided, default to identity function: `lambda x: x`. | |
with_indices (`bool`, defaults to `False`): | |
Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`. | |
input_columns (`str` or `List[str]`, *optional*, defaults to `None`): | |
The columns to be passed into `function` | |
as positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument. | |
batched (`bool`, defaults to `False`): | |
Provide batch of examples to `function`. | |
batch_size (`int`, *optional*, defaults to `1000`): | |
Number of examples per batch provided to `function` if `batched=True`. | |
drop_last_batch (`bool`, defaults to `False`): | |
Whether a last batch smaller than the `batch_size` should be | |
dropped instead of being processed by the function. | |
remove_columns (`str` or `List[str]`, *optional*, defaults to `None`): | |
Remove a selection of columns while doing the mapping. | |
Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding | |
columns with names in `remove_columns`, these columns will be kept. | |
fn_kwargs (`Dict`, *optional*, defaults to `None`): | |
Keyword arguments to be passed to `function` | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes", streaming=True) | |
>>> def add_prefix(example): | |
... example["text"] = "Review: " + example["text"] | |
... return example | |
>>> ds = ds.map(add_prefix) | |
>>> next(iter(ds["train"])) | |
{'label': 1, | |
'text': 'Review: the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'} | |
``` | |
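For illustration, the same transform in batched mode, where `function` receives `batch_size` examples at a time: | |
```py | |
>>> def add_prefix_batch(batch): | |
... batch["text"] = ["Review: " + text for text in batch["text"]] | |
... return batch | |
>>> ds = ds.map(add_prefix_batch, batched=True) | |
``` | |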
""" | |
return IterableDatasetDict( | |
{ | |
k: dataset.map( | |
function=function, | |
with_indices=with_indices, | |
input_columns=input_columns, | |
batched=batched, | |
batch_size=batch_size, | |
drop_last_batch=drop_last_batch, | |
remove_columns=remove_columns, | |
fn_kwargs=fn_kwargs, | |
) | |
for k, dataset in self.items() | |
} | |
) | |
def filter( | |
self, | |
function: Optional[Callable] = None, | |
with_indices=False, | |
input_columns: Optional[Union[str, List[str]]] = None, | |
batched: bool = False, | |
batch_size: Optional[int] = 1000, | |
fn_kwargs: Optional[dict] = None, | |
) -> "IterableDatasetDict": | |
"""Apply a filter function to all the elements so that the dataset only includes examples according to the filter function. | |
The filtering is done on-the-fly when iterating over the dataset. | |
The filtering is applied to all the datasets of the dataset dictionary. | |
Args: | |
function (`Callable`): | |
Callable with one of the following signatures: | |
- `function(example: Dict[str, Any]) -> bool` if `with_indices=False, batched=False` | |
- `function(example: Dict[str, Any], idx: int) -> bool` if `with_indices=True, batched=False` | |
- `function(batch: Dict[str, List]) -> List[bool]` if `with_indices=False, batched=True` | |
- `function(batch: Dict[str, List], indices: List[int]) -> List[bool]` if `with_indices=True, batched=True` | |
If no function is provided, defaults to an always True function: `lambda x: True`. | |
with_indices (`bool`, defaults to `False`): | |
Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`. | |
input_columns (`str` or `List[str]`, *optional*): | |
The columns to be passed into `function` as | |
positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument. | |
batched (`bool`, defaults to `False`): | |
Provide batch of examples to `function` | |
batch_size (`int`, *optional*, defaults to `1000`): | |
Number of examples per batch provided to `function` if `batched=True`. | |
fn_kwargs (`Dict`, *optional*, defaults to `None`): | |
Keyword arguments to be passed to `function` | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes", streaming=True) | |
>>> ds = ds.filter(lambda x: x["label"] == 0) | |
>>> list(ds["train"].take(3)) | |
[{'label': 0, 'text': 'Review: simplistic , silly and tedious .'}, | |
{'label': 0, | |
'text': "Review: it's so laddish and juvenile , only teenage boys could possibly find it funny ."}, | |
{'label': 0, | |
'text': 'Review: exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .'}] | |
``` | |
""" | |
return IterableDatasetDict( | |
{ | |
k: dataset.filter( | |
function=function, | |
with_indices=with_indices, | |
input_columns=input_columns, | |
batched=batched, | |
batch_size=batch_size, | |
fn_kwargs=fn_kwargs, | |
) | |
for k, dataset in self.items() | |
} | |
) | |
def shuffle( | |
self, seed=None, generator: Optional[np.random.Generator] = None, buffer_size: int = 1000 | |
) -> "IterableDatasetDict": | |
""" | |
Randomly shuffles the elements of this dataset. | |
The shuffling is applied to all the datasets of the dataset dictionary. | |
This dataset fills a buffer with buffer_size elements, then randomly samples elements from this buffer, | |
replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or | |
equal to the full size of the dataset is required. | |
For instance, if your dataset contains 10,000 elements but `buffer_size` is set to 1000, then `shuffle` will | |
initially select a random element from only the first 1000 elements in the buffer. Once an element is | |
selected, its space in the buffer is replaced by the next (i.e. the 1,001st) element, | |
maintaining the 1000 element buffer. | |
If the dataset is made of several shards, it also shuffles the order of the shards. | |
However, if the order has been fixed by using [`~datasets.IterableDataset.skip`] or [`~datasets.IterableDataset.take`], | |
then the order of the shards is kept unchanged. | |
Args: | |
seed (`int`, *optional*, defaults to `None`): | |
Random seed that will be used to shuffle the dataset. | |
It is used to sample from the shuffle buffer and also to shuffle the data shards. | |
generator (`numpy.random.Generator`, *optional*): | |
Numpy random Generator to use to compute the permutation of the dataset rows. | |
If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy). | |
buffer_size (`int`, defaults to `1000`): | |
Size of the buffer. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes", streaming=True) | |
>>> list(ds["train"].take(3)) | |
[{'label': 1, | |
'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}, | |
{'label': 1, | |
'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'}, | |
{'label': 1, 'text': 'effective but too-tepid biopic'}] | |
>>> ds = ds.shuffle(seed=42) | |
>>> list(ds["train"].take(3)) | |
[{'label': 1, | |
'text': "a sports movie with action that's exciting on the field and a story you care about off it ."}, | |
{'label': 1, | |
'text': 'at its best , the good girl is a refreshingly adult take on adultery . . .'}, | |
{'label': 1, | |
'text': "sam jones became a very lucky filmmaker the day wilco got dropped from their record label , proving that one man's ruin may be another's fortune ."}] | |
``` | |
""" | |
return IterableDatasetDict( | |
{ | |
k: dataset.shuffle(seed=seed, generator=generator, buffer_size=buffer_size) | |
for k, dataset in self.items() | |
} | |
) | |
def rename_column(self, original_column_name: str, new_column_name: str) -> "IterableDatasetDict": | |
""" | |
Rename a column in the dataset, and move the features associated to the original column under the new column | |
name. | |
The renaming is applied to all the datasets of the dataset dictionary. | |
Args: | |
original_column_name (`str`): | |
Name of the column to rename. | |
new_column_name (`str`): | |
New name for the column. | |
Returns: | |
[`IterableDatasetDict`]: A copy of the dataset with a renamed column. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes", streaming=True) | |
>>> ds = ds.rename_column("text", "movie_review") | |
>>> next(iter(ds["train"])) | |
{'label': 1, | |
'movie_review': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'} | |
``` | |
""" | |
return IterableDatasetDict( | |
{ | |
k: dataset.rename_column(original_column_name=original_column_name, new_column_name=new_column_name) | |
for k, dataset in self.items() | |
} | |
) | |
def rename_columns(self, column_mapping: Dict[str, str]) -> "IterableDatasetDict": | |
""" | |
Rename several columns in the dataset, and move the features associated to the original columns under | |
the new column names. | |
The renaming is applied to all the datasets of the dataset dictionary. | |
Args: | |
column_mapping (`Dict[str, str]`): | |
A mapping of columns to rename to their new names. | |
Returns: | |
[`IterableDatasetDict`]: A copy of the dataset with renamed columns | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes", streaming=True) | |
>>> ds = ds.rename_columns({"text": "movie_review", "label": "rating"}) | |
>>> next(iter(ds["train"])) | |
{'movie_review': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', | |
'rating': 1} | |
``` | |
""" | |
return IterableDatasetDict( | |
{k: dataset.rename_columns(column_mapping=column_mapping) for k, dataset in self.items()} | |
) | |
def remove_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": | |
""" | |
Remove one or several column(s) in the dataset and the features associated to them. | |
The removal is done on-the-fly on the examples when iterating over the dataset. | |
The removal is applied to all the datasets of the dataset dictionary. | |
Args: | |
column_names (`Union[str, List[str]]`): | |
Name of the column(s) to remove. | |
Returns: | |
[`IterableDatasetDict`]: A copy of the dataset object without the columns to remove. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes", streaming=True) | |
>>> ds = ds.remove_columns("label") | |
>>> next(iter(ds["train"])) | |
{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'} | |
``` | |
""" | |
return IterableDatasetDict({k: dataset.remove_columns(column_names) for k, dataset in self.items()}) | |
def select_columns(self, column_names: Union[str, List[str]]) -> "IterableDatasetDict": | |
"""Select one or several column(s) in the dataset and the features | |
associated to them. The selection is done on-the-fly on the examples | |
when iterating over the dataset. The selection is applied to all the | |
datasets of the dataset dictionary. | |
Args: | |
column_names (`Union[str, List[str]]`): | |
Name of the column(s) to keep. | |
Returns: | |
[`IterableDatasetDict`]: A copy of the dataset object with only selected columns. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes", streaming=True) | |
>>> ds = ds.select_columns("text") | |
>>> next(iter(ds["train"])) | |
{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'} | |
``` | |
""" | |
return IterableDatasetDict({k: dataset.select_columns(column_names) for k, dataset in self.items()}) | |
def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict": | |
"""Cast column to feature for decoding. | |
The type casting is applied to all the datasets of the dataset dictionary. | |
Args: | |
column (`str`): | |
Column name. | |
feature ([`Feature`]): | |
Target feature. | |
Returns: | |
[`IterableDatasetDict`] | |
Example: | |
```py | |
>>> from datasets import load_dataset, ClassLabel | |
>>> ds = load_dataset("rotten_tomatoes", streaming=True) | |
>>> ds["train"].features | |
{'label': ClassLabel(names=['neg', 'pos'], id=None), | |
'text': Value(dtype='string', id=None)} | |
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) | |
>>> ds["train"].features | |
{'label': ClassLabel(names=['bad', 'good'], id=None), | |
'text': Value(dtype='string', id=None)} | |
``` | |
""" | |
return IterableDatasetDict( | |
{k: dataset.cast_column(column=column, feature=feature) for k, dataset in self.items()} | |
) | |
def cast( | |
self, | |
features: Features, | |
) -> "IterableDatasetDict": | |
""" | |
Cast the dataset to a new set of features. | |
The type casting is applied to all the datasets of the dataset dictionary. | |
Args: | |
features (`Features`): | |
New features to cast the dataset to. | |
The name of the fields in the features must match the current column names. | |
The type of the data must also be convertible from one type to the other. | |
For non-trivial conversion, e.g. `string` <-> `ClassLabel` you should use [`map`] to update the Dataset. | |
Returns: | |
[`IterableDatasetDict`]: A copy of the dataset with casted features. | |
Example: | |
```py | |
>>> from datasets import load_dataset | |
>>> ds = load_dataset("rotten_tomatoes", streaming=True) | |
>>> ds["train"].features | |
{'label': ClassLabel(names=['neg', 'pos'], id=None), | |
'text': Value(dtype='string', id=None)} | |
>>> new_features = ds["train"].features.copy() | |
>>> new_features['label'] = ClassLabel(names=['bad', 'good']) | |
>>> new_features['text'] = Value('large_string') | |
>>> ds = ds.cast(new_features) | |
>>> ds["train"].features | |
{'label': ClassLabel(names=['bad', 'good'], id=None), | |
'text': Value(dtype='large_string', id=None)} | |
``` | |
""" | |
return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()}) | |