gmustafa413 committed
Commit 70f80a6 · verified · 1 Parent(s): adc42cb

Create utils.py

Files changed (1)
  1. utils.py +205 -0
utils.py ADDED
@@ -0,0 +1,205 @@
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from transformers import (
    DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments, Trainer, Seq2SeqTrainer
)


class T5Generator:
    def __init__(self, model_checkpoint):
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
        self.data_collator = DataCollatorForSeq2Seq(self.tokenizer)
        # Check runtime availability rather than build flags, so a CUDA build
        # running on a CPU-only machine still falls back correctly.
        self.device = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')

    def tokenize_function_inputs(self, sample):
        """
        Tokenize one dataset sample; intended to be applied with datasets.map.
        """
        model_inputs = self.tokenizer(sample['text'], max_length=512, truncation=True)
        labels = self.tokenizer(sample["labels"], max_length=64, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
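
    # A minimal mapping sketch (assumes a datasets.Dataset with string
    # 'text' and 'labels' columns; the checkpoint name is illustrative):
    #     generator = T5Generator('t5-base')
    #     tokenized_datasets = dataset.map(generator.tokenize_function_inputs)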

    def train(self, tokenized_datasets, **kwargs):
        """
        Train the generative model.
        """
        # Set training arguments
        args = Seq2SeqTrainingArguments(**kwargs)

        # Define trainer object
        trainer = Seq2SeqTrainer(
            self.model,
            args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"] if tokenized_datasets.get("validation") is not None else None,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
        )
        print("Trainer device:", trainer.args.device)

        # Finetune the model
        torch.cuda.empty_cache()
        print('\nModel training started ...')
        trainer.train()

        # Save best model
        trainer.save_model()
        return trainer
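
    # A minimal call sketch; the argument values below are illustrative
    # assumptions passed through to Seq2SeqTrainingArguments:
    #     trainer = generator.train(
    #         tokenized_datasets,
    #         output_dir='./checkpoints',
    #         learning_rate=5e-4,
    #         per_device_train_batch_size=8,
    #         num_train_epochs=4,
    #     )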
58
+ def get_labels(self, tokenized_dataset, batch_size = 4, max_length = 128, sample_set = 'train'):
59
+ """
60
+ Get the predictions from the trained model.
61
+ """
62
+ def collate_fn(batch):
63
+ input_ids = [torch.tensor(example['input_ids']) for example in batch]
64
+ input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
65
+ return input_ids
66
+
67
+ dataloader = DataLoader(tokenized_dataset[sample_set], batch_size=batch_size, collate_fn=collate_fn)
68
+ predicted_output = []
69
+ self.model.to(self.device)
70
+ print('Model loaded to: ', self.device)
71
+
72
+ for batch in tqdm(dataloader):
73
+ batch = batch.to(self.device)
74
+ output_ids = self.model.generate(batch, max_length = max_length)
75
+ output_texts = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
76
+ for output_text in output_texts:
77
+ predicted_output.append(output_text)
78
+ return predicted_output
79
+
80
+ def get_metrics(self, y_true, y_pred, is_triplet_extraction=False):
81
+ total_pred = 0
82
+ total_gt = 0
83
+ tp = 0
84
+ if not is_triplet_extraction:
85
+ for gt, pred in zip(y_true, y_pred):
86
+ gt_list = gt.split(', ')
87
+ pred_list = pred.split(', ')
88
+ total_pred+=len(pred_list)
89
+ total_gt+=len(gt_list)
90
+ for gt_val in gt_list:
91
+ for pred_val in pred_list:
92
+ if pred_val in gt_val or gt_val in pred_val:
93
+ tp+=1
94
+ break
95
+
96
+ else:
97
+ for gt, pred in zip(y_true, y_pred):
98
+ gt_list = gt.split(', ')
99
+ pred_list = pred.split(', ')
100
+ total_pred+=len(pred_list)
101
+ total_gt+=len(gt_list)
102
+ for gt_val in gt_list:
103
+ gt_asp = gt_val.split(':')[0]
104
+
105
+ try:
106
+ gt_op = gt_val.split(':')[1]
107
+ except:
108
+ continue
109
+
110
+ try:
111
+ gt_sent = gt_val.split(':')[2]
112
+ except:
113
+ continue
114
+
115
+ for pred_val in pred_list:
116
+ pr_asp = pred_val.split(':')[0]
117
+
118
+ try:
119
+ pr_op = pred_val.split(':')[1]
120
+ except:
121
+ continue
122
+
123
+ try:
124
+ pr_sent = gt_val.split(':')[2]
125
+ except:
126
+ continue
127
+
128
+ if pr_asp in gt_asp and pr_op in gt_op and gt_sent == pr_sent:
129
+ tp+=1
130
+
131
+ p = tp/total_pred
132
+ r = tp/total_gt
133
+ return p, r, 2*p*r/(p+r), None
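
    # Label format assumed by the parsing above: each string is a
    # comma-separated list of terms, and each triplet term reads
    # 'aspect:opinion:sentiment', e.g. (values illustrative):
    #     y_true = ['battery life:great:positive, screen:dim:negative']
    #     p, r, f1, _ = generator.get_metrics(y_true, y_pred, is_triplet_extraction=True)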


class T5Classifier:
    def __init__(self, model_checkpoint):
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, force_download=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, force_download=True)
        self.data_collator = DataCollatorForSeq2Seq(self.tokenizer)
        # torch.has_cuda / torch.has_mps are deprecated; use the runtime
        # availability checks instead.
        self.device = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')

    def tokenize_function_inputs(self, sample):
        """
        Tokenize one dataset sample; intended to be applied with datasets.map.
        """
        sample['input_ids'] = self.tokenizer(sample["text"], max_length=512, truncation=True).input_ids
        sample['labels'] = self.tokenizer(sample["labels"], max_length=64, truncation=True).input_ids
        return sample

    def train(self, tokenized_datasets, **kwargs):
        """
        Train the classification model.
        """
        # Set training arguments
        args = Seq2SeqTrainingArguments(**kwargs)

        # Define trainer object
        trainer = Trainer(
            self.model,
            args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"] if tokenized_datasets.get("validation") is not None else None,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator
        )
        print("Trainer device:", trainer.args.device)

        # Finetune the model
        torch.cuda.empty_cache()
        print('\nModel training started ...')
        trainer.train()

        # Save best model
        trainer.save_model()
        return trainer

    def get_labels(self, tokenized_dataset, batch_size=4, sample_set='train'):
        """
        Get the predictions from the trained model.
        """
        def collate_fn(batch):
            input_ids = [torch.tensor(example['input_ids']) for example in batch]
            input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
            # Mask out padding so generate() does not attend to pad positions
            attention_mask = (input_ids != self.tokenizer.pad_token_id).long()
            return input_ids, attention_mask

        dataloader = DataLoader(tokenized_dataset[sample_set], batch_size=batch_size, collate_fn=collate_fn)
        predicted_output = []
        self.model.to(self.device)
        print('Model loaded to: ', self.device)

        for input_ids, attention_mask in tqdm(dataloader):
            input_ids = input_ids.to(self.device)
            attention_mask = attention_mask.to(self.device)
            output_ids = self.model.generate(input_ids, attention_mask=attention_mask)
            output_texts = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            predicted_output.extend(output_texts)
        return predicted_output

    def get_metrics(self, y_true, y_pred):
        # Exact-match classification metrics, macro-averaged over classes
        return precision_score(y_true, y_pred, average='macro'), recall_score(y_true, y_pred, average='macro'), \
            f1_score(y_true, y_pred, average='macro'), accuracy_score(y_true, y_pred)
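
For reference, a minimal end-to-end sketch of how these classes wire together
(the toy data, checkpoint name, and training arguments are illustrative
assumptions, not part of this file):

    from datasets import Dataset, DatasetDict
    from utils import T5Generator

    raw = DatasetDict({
        'train': Dataset.from_dict({
            'text': ['extract aspects: the battery life is great'],
            'labels': ['battery life'],
        }),
    })

    generator = T5Generator('t5-small')
    tokenized = raw.map(generator.tokenize_function_inputs)
    generator.train(tokenized, output_dir='./checkpoints', num_train_epochs=1,
                    per_device_train_batch_size=1, report_to='none')
    preds = generator.get_labels(tokenized, sample_set='train')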