from datasets import load_dataset
import numpy as np

# Load the train/validation splits from local JSON files. The split names
# used here ("train", "validation") are the keys used to index the dataset
# later in this script.
dataset = load_dataset(
    "json",
    data_files={
        "train": "tense_train.json",
        "validation": "tense_validation.json",
    },
)

# Class names; the position in this list is the class id / output column.
labels = ['first', 'second', 'third']

# id -> name and name -> id lookups handed to the model config. label2id is
# derived from id2label so the two maps are inverses by construction.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

from transformers import AutoModelForSequenceClassification

# BERT sequence classifier with one output per entry in `labels`.
# NOTE(review): problem_type is "multi_label_classification", while
# preprocess_data builds a one-hot target from a single `pov` value, so the
# classes look mutually exclusive. Confirm that a multi-label head is intended
# rather than "single_label_classification".
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Per-device batch size, used for both training and evaluation.
batch_size = 8

# Key of the metric (as returned by compute_metrics) used to pick the best
# checkpoint.
metric_name = "f1"

from transformers import TrainingArguments, Trainer

# Training configuration. Evaluation and checkpointing both run once per
# epoch, and the best checkpoint according to `metric_name` is reloaded when
# training finishes (load_best_model_at_end=True).
args = TrainingArguments(
    # Output directory for checkpoints (plain string; no f-string needed).
    "bert-finetuned-sem_eval-english",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction, AutoTokenizer
import torch

# Tokenizer for the same checkpoint name that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-F1, micro ROC AUC and accuracy for multi-label outputs.

    Args:
        predictions: Raw model outputs (logits); a sigmoid is applied to turn
            them into per-label probabilities. Assumed to be array-like of
            shape (n_samples, n_labels) -- TODO confirm against caller.
        labels: Ground-truth targets, passed to the sklearn metrics as
            ``y_true``. Assumed to be a 0/1 indicator array with the same
            shape as ``predictions``.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with keys ``'f1'``, ``'roc_auc'`` and ``'accuracy'``.
    """
    # Logits -> independent per-label probabilities.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()

    # Hard 0/1 decisions, needed by F1 and accuracy.
    y_pred = (probs >= threshold).astype(float)

    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: score it on the continuous probabilities.
    # Feeding it thresholded 0/1 predictions collapses the ROC curve to a
    # single operating point and misreports the AUC.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    # For multi-label indicator input this is exact-match (subset) accuracy.
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {
        'f1': f1_micro_average,
        'roc_auc': roc_auc,
        'accuracy': accuracy,
    }
    return metrics


def compute_metrics(p: EvalPrediction):
    """Adapt an ``EvalPrediction`` to ``multi_label_metrics``.

    Args:
        p: Evaluation output carrying ``predictions`` and ``label_ids``.

    Returns:
        The metrics dict produced by ``multi_label_metrics``.
    """
    # `predictions` may be a tuple of outputs; in that case only the first
    # element is scored.
    if isinstance(p.predictions, tuple):
        preds = p.predictions[0]
    else:
        preds = p.predictions
    return multi_label_metrics(predictions=preds, labels=p.label_ids)


def preprocess_data(ex):
    """Tokenize one example and attach its one-hot point-of-view target.

    Args:
        ex: A single dataset row; its ``"text"`` and ``"pov"`` fields are read.

    Returns:
        The tokenizer encoding (padded/truncated to 128 tokens) with an added
        ``'labels'`` entry: a list of floats, one per entry in the module-level
        ``labels`` list, equal to 1.0 where ``ex['pov']`` matches that label
        and 0.0 elsewhere.
    """
    encoding = tokenizer(
        ex["text"], padding="max_length", truncation=True, max_length=128
    )
    pov = ex['pov']
    # Build the target from `labels` so the column order always agrees with
    # id2label/label2id instead of repeating the class names here.
    encoding['labels'] = [float(pov == name) for name in labels]
    return encoding


# Drop rows whose point of view is "unknown": they match none of the classes.
dataset = dataset.filter(lambda ex: ex['pov'] != "unknown", num_proc=8)

# Tokenize and label every remaining row. The original columns are removed so
# that only the fields returned by preprocess_data are kept.
encoded_dataset = dataset.map(
    preprocess_data,
    remove_columns=dataset['train'].column_names,
    num_proc=8,
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then save the model held by the trainer.
trainer.train()
trainer.save_model('bert-base-uncased-tense')

# Final metrics on the validation split.
print(trainer.evaluate())