freethenation committed
Commit e1d6729
1 Parent(s): c0f068f

add finetune script for ref

finetune_bert.py ADDED

from datasets import load_dataset
import numpy as np

# load the labeled point-of-view data from local JSON files
dataset = load_dataset("json", data_files={"train": "tense_train.json", "validation": "tense_validation.json"})

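# Assumed record shape, inferred from preprocess_data below (the commit itself
# doesn't document it): each JSON record has a "text" field and a "pov" field
# whose value is "first", "second", "third", or "unknown", e.g.
#   {"text": "I walked home alone.", "pov": "first"}
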
# the three point-of-view classes; each example gets a one-hot label vector
labels = ['first', 'second', 'third']
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

batch_size = 8
metric_name = "f1"

from transformers import TrainingArguments, Trainer
args = TrainingArguments(
    "bert-finetuned-sem_eval-english",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,  # reload the checkpoint with the best micro-f1 when training ends
    metric_for_best_model=metric_name,
    # push_to_hub=True,
)

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

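# For instance, logits [[2.0, -3.0, 0.1]] sigmoid to roughly [0.88, 0.05, 0.52],
# so with the default 0.5 threshold the integer prediction is [1, 0, 1].
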
def compute_metrics(p: EvalPrediction):
    # the Trainer may pass a tuple of outputs; the logits come first
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

def preprocess_data(ex):
    encoding = tokenizer(ex["text"], padding="max_length", truncation=True, max_length=128)
    # one-hot float labels, as problem_type="multi_label_classification" expects
    # (the model then trains with BCEWithLogitsLoss)
    encoding['labels'] = [float(ex['pov'] == "first"), float(ex['pov'] == "second"), float(ex['pov'] == "third")]
    return encoding

# drop examples with an unknown point of view, then tokenize the rest
dataset = dataset.filter(lambda ex: ex['pov'] != "unknown", num_proc=8)
encoded_dataset = dataset.map(preprocess_data, remove_columns=dataset['train'].column_names, num_proc=8)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model('bert-base-uncased-tense')

print(trainer.evaluate())
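
For a quick sanity check after training, a minimal inference sketch follows. It is not part of the commit: the directory name 'bert-base-uncased-tense' comes from the save_model call above (which also writes the tokenizer, since one was passed to the Trainer), the 0.5 threshold mirrors multi_label_metrics, and the sample sentence is invented.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tok = AutoTokenizer.from_pretrained('bert-base-uncased-tense')
clf = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased-tense')

# hypothetical input sentence
inputs = tok("I walked home alone.", return_tensors="pt", truncation=True, max_length=128)
with torch.no_grad():
    probs = torch.sigmoid(clf(**inputs).logits)[0]
# keep every label whose probability clears the same 0.5 threshold used in the training metrics
predicted = [clf.config.id2label[i] for i, p in enumerate(probs) if p >= 0.5]
print(predicted)  # e.g. ['first']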