import torch
from torch.utils import data
from torch.utils.data import Dataset
from datasets.arrow_dataset import Dataset as HFDataset
from datasets.load import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    default_data_collator,
)
import copy, math
import os
import numpy as np
import logging, re
from datasets.formatting.formatting import LazyRow, LazyBatch
from tqdm import tqdm
from tasks import utils
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

logger = logging.getLogger(__name__)

idx = 0
class GlueDataset():
    def __init__(self, tokenizer: AutoTokenizer, data_args, training_args) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.data_args = data_args

        # Labels
        raw_datasets = load_dataset("glue", data_args.dataset_name)
        self.is_regression = data_args.dataset_name == "stsb"
        if not self.is_regression:
            self.label_list = raw_datasets["train"].features["label"].names
            self.num_labels = len(self.label_list)
        else:
            self.num_labels = 1

        # Preprocessing the raw_datasets
        self.sentence1_key, self.sentence2_key = task_to_keys[data_args.dataset_name]
        sc_template = (
            f'''{'{' + self.sentence1_key + '}'}'''
            if self.sentence2_key is None
            else f'''{'{' + self.sentence1_key + '}'}</s></s>{'{' + self.sentence2_key + '}'}'''
        )
        self.tokenizer.template = self.template = [sc_template]
        print(f"-> using template: {self.template}")
        # Padding strategy
        if data_args.pad_to_max_length:
            self.padding = "max_length"
        else:
            # We will pad later, dynamically at batch creation, to the max sequence length in each batch
            self.padding = False

        # Some models have set the order of the labels to use, so let's make sure we do use it.
        if not self.is_regression:
            self.label2id = {l: i for i, l in enumerate(self.label_list)}
            self.id2label = {id: label for label, id in self.label2id.items()}

        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        self.max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
        new_datasets = raw_datasets.map(
            self.preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on clean dataset",
        )

        # Make sure every split carries an explicit "idx" column.
        for key in new_datasets.keys():
            if "idx" not in raw_datasets[key].column_names:
                idx = np.arange(len(raw_datasets[key])).tolist()
                raw_datasets[key] = raw_datasets[key].add_column("idx", idx)

        if training_args.do_train:
            self.train_dataset = new_datasets["train"]
            if data_args.max_train_samples is not None:
                data_args.max_train_samples = min(data_args.max_train_samples, len(self.train_dataset))
                self.train_dataset = self.train_dataset.select(range(data_args.max_train_samples))
            # Randomly mark a fraction (training_args.poison_rate) of the training examples as poisoned.
            size = len(self.train_dataset)
            select = np.random.choice(size, math.ceil(size * training_args.poison_rate), replace=False)
            idx = torch.zeros([size])
            idx[select] = 1
            self.train_dataset.poison_idx = idx
        if training_args.do_eval:
            self.eval_dataset = new_datasets["validation_matched" if data_args.dataset_name == "mnli" else "validation"]
            if data_args.max_eval_samples is not None:
                data_args.max_eval_samples = min(data_args.max_eval_samples, len(self.eval_dataset))
                self.eval_dataset = self.eval_dataset.select(range(data_args.max_eval_samples))

        if training_args.do_predict or data_args.dataset_name is not None or data_args.test_file is not None:
            self.predict_dataset = new_datasets["test_matched" if data_args.dataset_name == "mnli" else "test"]
            if data_args.max_predict_samples is not None:
                data_args.max_predict_samples = min(data_args.max_predict_samples, len(self.predict_dataset))
                self.predict_dataset = self.predict_dataset.select(range(data_args.max_predict_samples))

        self.metric = load_metric("glue", data_args.dataset_name)

        if data_args.pad_to_max_length:
            self.data_collator = default_data_collator
        elif training_args.fp16:
            self.data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
        else:
            # Fall back to dynamic padding so self.data_collator is always defined.
            self.data_collator = DataCollatorWithPadding(tokenizer)
    def filter(self, examples, length=None):
        """Recursively replace reserved prompt/key/predict tokens in raw text and optionally truncate strings."""
        if type(examples) == list:
            return [self.filter(x, length) for x in examples]
        elif type(examples) == dict or type(examples) == LazyRow or type(examples) == LazyBatch:
            return {k: self.filter(v, length) for k, v in examples.items()}
        elif type(examples) == str:
            # txt = re.sub(r"[^a-zA-Z0-9\ \%#!.,]+", '', examples)
            txt = examples.replace(self.tokenizer.prompt_token, "T").replace(self.tokenizer.skey_token, "K").replace(
                self.tokenizer.predict_token, "P").replace("[X]", "Y").replace("[Y]", "Y")
            if length is not None:
                return txt[:length]
            return txt
        return examples
    def preprocess_function(self, examples, **kwargs):
        examples = self.filter(examples, length=200)
        # Tokenize the texts, args = [text1, text2, ...]
        _examples = copy.deepcopy(examples)
        args = (
            (_examples[self.sentence1_key],)
            if self.sentence2_key is None
            else (_examples[self.sentence1_key], _examples[self.sentence2_key])
        )
        result = self.tokenizer(*args, padding=self.padding, max_length=self.max_seq_length, truncation=True)
        result["idx"] = examples["idx"]
        return result
    def compute_metrics(self, p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.squeeze(preds) if self.is_regression else np.argmax(preds, axis=1)
        if self.data_args.dataset_name is not None:
            result = self.metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(result.values())).item()
            return result
        elif self.is_regression:
            return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
        else:
            return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}