import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
)

# Load the train/validation splits from local JSON files.
dataset = load_dataset(
    "json",
    data_files={"train": "tense_train.json", "validation": "tense_validation.json"},
)

# Point-of-view labels and the id <-> label mappings the model config expects.
labels = ["first", "second", "third"]
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Fine-tune BERT with a multi-label head (per-label sigmoid + BCE loss).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

batch_size = 8
metric_name = "f1"

args = TrainingArguments(
    "bert-finetuned-sem_eval-english",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # push_to_hub=True,
)

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # First, apply a sigmoid to the logits, which have shape (batch_size, num_labels).
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # Next, threshold the probabilities to get integer predictions.
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # Finally, compute the metrics.
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average="micro")
    roc_auc = roc_auc_score(y_true, y_pred, average="micro")
    accuracy = accuracy_score(y_true, y_pred)
    # Return the metrics as a dictionary.
    return {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    return multi_label_metrics(predictions=preds, labels=p.label_ids)

def preprocess_data(ex):
    # Tokenize the text and attach a one-hot float vector over the POV labels,
    # as required by the multi-label (BCE) loss.
    encoding = tokenizer(ex["text"], padding="max_length", truncation=True, max_length=128)
    encoding["labels"] = [
        float(ex["pov"] == "first"),
        float(ex["pov"] == "second"),
        float(ex["pov"] == "third"),
    ]
    return encoding

# Drop examples without a usable POV annotation, then tokenize.
dataset = dataset.filter(lambda ex: ex["pov"] != "unknown", num_proc=8)
encoded_dataset = dataset.map(
    preprocess_data, remove_columns=dataset["train"].column_names, num_proc=8
)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model("bert-base-uncased-tense")
print(trainer.evaluate())
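
# A minimal inference sketch against the saved checkpoint, assuming
# trainer.save_model also wrote the tokenizer to the same directory
# (it does when a tokenizer is passed to Trainer). The example sentence
# is illustrative, not from the dataset.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased-tense")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased-tense")
model.eval()

text = "I walked home alone."  # hypothetical input
inputs = tokenizer(text, truncation=True, max_length=128, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
# Mirror the training setup: per-label sigmoid with a 0.5 threshold.
probs = torch.sigmoid(logits)[0]
predicted = [model.config.id2label[i] for i, p in enumerate(probs) if p >= 0.5]
print(predicted)  # e.g. ['first']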