import logging
from itertools import chain

import torch
from datasets import DatasetDict, load_dataset
from torch.nn.utils.rnn import pad_sequence as torch_pad_sequence

SMALL_GLUE_DATA = ["cola", "wnli", "rte", "mrpc", "stsb"]
LARGE_GLUE_DATA = ["qnli", "qqp", "sst2"]

logger = logging.getLogger(__name__)
def load_data(data_args, model_args):
    if data_args.dataset_name is not None:
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
            streaming=data_args.streaming,
            # added to suppress noisy warning
            trust_remote_code=True,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # Infer the loader from whichever file was provided, so that passing only a
        # validation file does not fail on a missing train_file.
        reference_file = (
            data_args.train_file
            if data_args.train_file is not None
            else data_args.validation_file
        )
        extension = reference_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
            streaming=data_args.streaming,
        )
    return raw_datasets
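# Illustrative sketch (not part of the original module): `load_data` only reads the
# attributes used above, so a lightweight namespace is enough to drive it. The field
# values below are assumptions for demonstration; downloading requires network access.
#
#   from types import SimpleNamespace
#
#   data_args = SimpleNamespace(
#       dataset_name="wikitext",
#       dataset_config_name="wikitext-2-raw-v1",
#       streaming=False,
#       train_file=None,
#       validation_file=None,
#   )
#   model_args = SimpleNamespace(cache_dir=None, use_auth_token=False)
#   raw_datasets = load_data(data_args, model_args)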
def tokenize_data_new(data_args, tokenizer, raw_datasets, training_args):
    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = None
    if training_args.do_train and "train" in raw_datasets:
        column_names = raw_datasets["train"].column_names
    elif "validation" in raw_datasets:
        column_names = raw_datasets["validation"].column_names
    if column_names is None:
        text_column_name = "text"
    else:
        text_column_name = "text" if "text" in column_names else column_names[0]
    # We only need the text column.
    raw_datasets = raw_datasets.select_columns([text_column_name])
    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines.
            examples[text_column_name] = [
                line
                for line in examples[text_column_name]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        with training_args.main_process_first(desc="dataset map tokenization"):
            if not data_args.streaming:
                tokenized_datasets = raw_datasets.map(
                    tokenize_function,
                    batched=True,
                    num_proc=data_args.preprocessing_num_workers,
                    remove_columns=[text_column_name],
                    load_from_cache_file=not data_args.overwrite_cache,
                    desc="Running tokenizer on dataset line_by_line",
                )
            else:
                tokenized_datasets = raw_datasets.map(
                    tokenize_function,
                    batched=True,
                    remove_columns=[text_column_name],
                )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(
                examples[text_column_name], return_special_tokens_mask=True
            )

        with training_args.main_process_first(desc="dataset map tokenization"):
            if not data_args.streaming:
                tokenized_datasets = raw_datasets.map(
                    tokenize_function,
                    batched=True,
                    num_proc=data_args.preprocessing_num_workers,
                    # `select_columns` above already dropped everything but the text column.
                    remove_columns=[text_column_name],
                    load_from_cache_file=not data_args.overwrite_cache,
                    desc="Running tokenizer on every text in dataset",
                )
            else:
                tokenized_datasets = raw_datasets.map(
                    tokenize_function,
                    batched=True,
                    remove_columns=[text_column_name],
                )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: list(chain(*examples[k])) for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding instead of this drop if the model supported it.
            # You can customize this part to your needs.
            if total_length >= max_seq_length:
                total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_seq_length.
            result = {
                k: [
                    t[i : i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result
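        # Worked example of the chunking above (illustrative numbers, not from the original code):
        # with max_seq_length = 512 and a batch whose concatenated length is 2,100 tokens,
        # total_length is rounded down to (2100 // 512) * 512 = 2048, yielding 4 chunks of
        # 512 tokens and dropping the trailing 52 tokens of that batch.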
        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here, but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        with training_args.main_process_first(desc="grouping texts together"):
            if not data_args.streaming:
                tokenized_datasets = tokenized_datasets.map(
                    group_texts,
                    batched=True,
                    num_proc=data_args.preprocessing_num_workers,
                    load_from_cache_file=not data_args.overwrite_cache,
                    desc=f"Grouping texts in chunks of {max_seq_length}",
                )
            else:
                tokenized_datasets = tokenized_datasets.map(
                    group_texts,
                    batched=True,
                )
    return tokenized_datasets
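# Illustrative sketch (not part of the original module): `tokenize_data_new` expects a
# Hugging Face tokenizer, the DatasetDict returned by `load_data`, and argument objects
# exposing the attributes referenced above. The names below are assumptions for
# demonstration only.
#
#   from transformers import AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("roberta-base")
#   tokenized = tokenize_data_new(data_args, tokenizer, raw_datasets, training_args)
#   # Each example now carries `input_ids` (and `special_tokens_mask`), either one per
#   # nonempty line (line_by_line) or as fixed-size chunks of `max_seq_length` tokens.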
# TODO: we need to remove this one and update process_data.py.
def tokenize_data(data_args, tokenizer, raw_datasets, accelerator):
    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]
    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines.
            examples[text_column_name] = [
                line
                for line in examples[text_column_name]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
            )

        with accelerator.main_process_first():
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=[text_column_name],
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on dataset line_by_line",
            )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])

        with accelerator.main_process_first():
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on every text in dataset",
            )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: list(chain(*examples[k])) for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding instead of this drop if the model supported it.
            # You can customize this part to your needs.
            if total_length >= max_seq_length:
                total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_seq_length.
            result = {
                k: [
                    t[i : i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here, but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        with accelerator.main_process_first():
            tokenized_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
                desc=f"Grouping texts in chunks of {max_seq_length}",
            )
    return tokenized_datasets
def split_data_to_train_validation(data_args, data, seed):
    total_size = len(data["train"])
    validation_size = int(total_size * data_args.validation_split_ratio)
    train_size = total_size - validation_size
    # TODO(rabeeh): we need to do this for the other ones as well and think how to do it cleanly.
    if data_args.max_train_samples is not None:
        train_size = min(train_size, data_args.max_train_samples)
    if data_args.max_eval_samples is not None:
        validation_size = min(validation_size, data_args.max_eval_samples)
    remaining_size = total_size - train_size - validation_size
    train, validation, _ = torch.utils.data.random_split(
        data["train"],
        [train_size, validation_size, remaining_size],
        generator=torch.Generator().manual_seed(seed),
    )
    data["train"], data["validation"] = train, validation
    assert (
        len(data["train"]) == train_size and len(data["validation"]) == validation_size
    )
    return data
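# Worked example (illustrative numbers, not from the original code): with 10,000 training
# examples, validation_split_ratio=0.1, and max_train_samples=5000, the split above gives
# validation_size = 1,000, train_size = min(9,000, 5,000) = 5,000, and the remaining
# 4,000 examples are discarded via the third (ignored) part of `random_split`.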
def split_glue(raw_datasets, dataset_name, seed):
    """Builds test sets from the public GLUE splits, since the official GLUE test sets are not public.

    For large datasets (> 10K samples), 1K examples of the training set become the
    validation set and the rest stays as train, with the original validation set used
    as test. Otherwise, the validation set is split in half (half for validation and
    half for test)."""
    if dataset_name == "mnli":
        raw_datasets = DatasetDict(
            {
                "test": raw_datasets["validation_matched"],
                "validation": raw_datasets["validation_mismatched"],
                "train": raw_datasets["train"],
            }
        )
    elif dataset_name in SMALL_GLUE_DATA:
        # Splits the validation set into half for validation and half for test.
        splits = raw_datasets["validation"].train_test_split(
            test_size=0.5, shuffle=True, seed=seed
        )
        raw_datasets = DatasetDict(
            {
                "validation": splits["train"],
                "test": splits["test"],
                "train": raw_datasets["train"],
            }
        )
    elif dataset_name in LARGE_GLUE_DATA:
        # Splits the training set into 1K as validation, rest as train.
        test_size = 1000 / len(raw_datasets["train"])
        splits = raw_datasets["train"].train_test_split(
            test_size=test_size, shuffle=True, seed=seed
        )
        raw_datasets = DatasetDict(
            {
                "train": splits["train"],
                "validation": splits["test"],
                "test": raw_datasets["validation"],
            }
        )
    else:
        raise NotImplementedError
    return raw_datasets
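# Illustrative sketch (not part of the original module): for a small GLUE task such as
# "rte", the resulting DatasetDict keeps the original train split and carves the public
# validation split in half. Downloading the dataset requires network access.
#
#   raw = load_dataset("glue", "rte")
#   raw = split_glue(raw, "rte", seed=42)
#   # raw["train"] is unchanged; raw["validation"] and raw["test"] each hold roughly
#   # half of the original validation split.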
def pad_sequence(sequences, padding_value, batch_first, padding_side):
    if padding_side == "right":
        return torch_pad_sequence(
            sequences, padding_value=padding_value, batch_first=batch_first
        )
    # Left padding: flip each sequence, pad on the right, then flip the padded tensor back
    # along its time dimension (dim 1 if batch_first, dim 0 otherwise).
    return torch_pad_sequence(
        [sequence.flip(0) for sequence in sequences],
        padding_value=padding_value,
        batch_first=batch_first,
    ).flip(1 if batch_first else 0)
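# Minimal smoke test (illustrative, not part of the original module): left padding via the
# flip trick above should place the padding value before the shorter sequence.
if __name__ == "__main__":
    seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
    left = pad_sequence(seqs, padding_value=0, batch_first=True, padding_side="left")
    right = pad_sequence(seqs, padding_value=0, batch_first=True, padding_side="right")
    # Expected: left == [[1, 2, 3], [0, 4, 5]] and right == [[1, 2, 3], [4, 5, 0]].
    print(left.tolist(), right.tolist())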