import torch, math
from datasets import load_dataset, load_metric
from transformers import (
AutoTokenizer,
EvalPrediction,
default_data_collator,
)
import os, hashlib
import numpy as np
import logging, copy, re
from datasets.formatting.formatting import LazyRow, LazyBatch
task_to_keys = {
"imdb": ("text", None)
}
logger = logging.getLogger(__name__)
class IMDBDataset:
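    """Wraps the IMDB sentiment dataset for prompt-based training: loads the
    splits, applies a template-aware tokenizer, optionally subsamples each
    split, and marks a random fraction of training rows as poisoned."""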
def __init__(self, tokenizer: AutoTokenizer, data_args, training_args) -> None:
super().__init__()
self.data_args = data_args
self.training_args = training_args
self.tokenizer = tokenizer
self.is_regression = False
raw_datasets = load_dataset("imdb")
self.label_list = raw_datasets["train"].features["label"].names
self.num_labels = len(self.label_list)
# Preprocessing the raw_datasets
self.sentence1_key, self.sentence2_key = task_to_keys[data_args.dataset_name]
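        # Build a str.format-style template from the column name(s), e.g.
        # "{text}" for single sentences or "{sent1}</s></s>{sent2}" for pairs.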
        sc_template = (
            "{" + self.sentence1_key + "}"
            if self.sentence2_key is None
            else "{" + self.sentence1_key + "}</s></s>{" + self.sentence2_key + "}"
        )
self.tokenizer.template = self.template = [sc_template]
print(f"-> using template:{self.template}")
# Padding strategy
if data_args.pad_to_max_length:
self.padding = "max_length"
else:
# We will pad later, dynamically at batch creation, to the max sequence length in each batch
self.padding = False
        # Some models fix the order of the labels; make sure we use that order.
if not self.is_regression:
self.label2id = {l: i for i, l in enumerate(self.label_list)}
self.id2label = {id: label for label, id in self.label2id.items()}
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        self.max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
keys = ["unsupervised", "train", "test"]
for key in keys:
            # Disabled: derive a per-template cache file name so different
            # prompt templates do not collide in the datasets cache.
            # cache_root = os.path.dirname(raw_datasets[key].cache_files[0]["filename"])
            # digest = hashlib.md5(str(tokenizer.prompt_template + tokenizer.key_template).encode("utf-8")).hexdigest()
            # filename = f"{tokenizer.name_or_path}_{key}_{digest[:16]}.arrow".replace("/", "_")
            # print(f"-> template:{tokenizer.prompt_template} filename:{filename}")
            # cache_file_name = os.path.join(cache_root, filename)
raw_datasets[key] = raw_datasets[key].map(
self.preprocess_function,
batched=True,
load_from_cache_file=True,
#cache_file_name=cache_file_name,
desc="Running tokenizer on dataset",
remove_columns=None,
)
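            # Attach a stable integer "idx" column so individual examples can
            # be identified later (e.g. to match against the poison mask).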
idx = np.arange(len(raw_datasets[key])).tolist()
raw_datasets[key] = raw_datasets[key].add_column("idx", idx)
self.train_dataset = raw_datasets["train"]
if self.data_args.max_train_samples is not None:
self.data_args.max_train_samples = min(self.data_args.max_train_samples, len(self.train_dataset))
self.train_dataset = self.train_dataset.select(range(self.data_args.max_train_samples))
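        # Mark a random poison_rate fraction of the (possibly subsampled)
        # training set; the 0/1 mask lives only on this in-memory Dataset
        # object, presumably consumed by a poisoning-aware trainer elsewhere.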
size = len(self.train_dataset)
select = np.random.choice(size, math.ceil(size * training_args.poison_rate), replace=False)
idx = torch.zeros([size])
idx[select] = 1
self.train_dataset.poison_idx = idx
self.eval_dataset = raw_datasets["test"]
if self.data_args.max_eval_samples is not None:
self.data_args.max_eval_samples = min(self.data_args.max_eval_samples, len(self.eval_dataset))
self.eval_dataset = self.eval_dataset.select(range(self.data_args.max_eval_samples))
self.predict_dataset = raw_datasets["unsupervised"]
        if self.data_args.max_predict_samples is not None:
            max_predict_samples = min(self.data_args.max_predict_samples, len(self.predict_dataset))
            self.predict_dataset = self.predict_dataset.select(range(max_predict_samples))
self.metric = load_metric("glue", "sst2")
self.data_collator = default_data_collator
def filter(self, examples, length=None):
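        """Recursively sanitize text: walk lists and (lazy) dicts, replace the
        tokenizer's special marker tokens with neutral placeholders, and
        optionally truncate each string to `length` characters."""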
        if isinstance(examples, list):
            return [self.filter(x, length) for x in examples]
        elif isinstance(examples, (dict, LazyRow, LazyBatch)):
            return {k: self.filter(v, length) for k, v in examples.items()}
        elif isinstance(examples, str):
            # txt = re.sub(r"[^a-zA-Z0-9\ \%#!.,]+", '', examples)
            txt = examples.replace(self.tokenizer.prompt_token, "T").replace(self.tokenizer.skey_token, "K").replace(
                self.tokenizer.predict_token, "P").replace("[X]", "Y").replace("[Y]", "Y")
            if length is not None:
                return txt[:length]
            return txt
        return examples
def preprocess_function(self, examples, **kwargs):
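        """Sanitize a batch via `filter`, then tokenize the sentence column(s)
        with the configured padding, truncation, and max length."""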
examples = self.filter(examples, length=300)
# Tokenize the texts, args = [text1, text2, ...]
_examples = copy.deepcopy(examples)
args = (
(_examples[self.sentence1_key],) if self.sentence2_key is None else (
_examples[self.sentence1_key], _examples[self.sentence2_key])
)
result = self.tokenizer(*args, padding=self.padding, max_length=self.max_seq_length, truncation=True)
return result
def compute_metrics(self, p: EvalPrediction):
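        """Return plain accuracy from an `EvalPrediction` by argmax over the
        logits; the GLUE/SST-2 metric loaded in `__init__` is also available
        as `self.metric` if metric-based scoring is preferred."""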
preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
preds = np.argmax(preds, axis=1)
return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} |