dhruthick committed on
Commit ba4d86e · 1 Parent(s): c4aa91e

added bert classifier training script

backend/models/train/train-bert-classifier-pytorch.py ADDED
@@ -0,0 +1,277 @@
# -*- coding: utf-8 -*-
"""Bert-redo

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1xVKmJy8iU8NHFsWav2SI2XFRh6QdvWV_

# Transformers for lyric Classification

Imports and Setup
"""

from google.colab import drive
drive.mount('/content/drive')

!pip install transformers

import torch

# Confirm that the GPU is detected.
if torch.cuda.is_available():
    # Get the GPU device name.
    device_name = torch.cuda.get_device_name()
    n_gpu = torch.cuda.device_count()
    print(f"Found device: {device_name}, n_gpu: {n_gpu}")

# Fall back to the CPU if no GPU is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import pandas as pd
import numpy as np
from tqdm import tqdm
import random

from transformers import BertTokenizer, BertForSequenceClassification

"""Read Data"""

train = pd.read_csv('/content/drive/MyDrive/cse256/project/data/train.csv')
val = pd.read_csv('/content/drive/MyDrive/cse256/project/data/validation.csv')
test = pd.read_csv('/content/drive/MyDrive/cse256/project/data/test.csv')

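# The loaded CSVs are expected to provide a `lyrics` column and an integer
# `mood_encoded` label column (both are used below). As a hedged convenience —
# assuming a raw string column named `mood`, which is not confirmed by the data
# files — the encoding could be derived like this if it were missing:
if 'mood_encoded' not in train.columns and 'mood' in train.columns:
    mood_to_id = {m: i for i, m in enumerate(sorted(train['mood'].unique()))}
    for df in (train, val, test):
        df['mood_encoded'] = df['mood'].map(mood_to_id)
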
"""Utility Functions"""

def tokenize_and_format(sentences):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sentence in sentences:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`.
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
            sentence,                      # Sentence to encode.
            add_special_tokens = True,     # Add '[CLS]' and '[SEP]'.
            max_length = 256,              # Pad & truncate all sentences.
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True,  # Construct attention masks.
            return_tensors = 'pt',         # Return PyTorch tensors.
        )

        # Add the encoded sentence to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])
    return input_ids, attention_masks

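# For reference only: a single batched tokenizer call is equivalent to the
# per-sentence encode_plus loop above (same [CLS]/[SEP] handling, padding,
# truncation, and attention masks). This is a sketch of an alternative, not
# what the training code below relies on. Note it already returns stacked
# (N, 256) tensors, so the torch.cat step used later would be unnecessary.
def tokenize_and_format_batched(sentences):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    encoded = tokenizer(
        list(sentences),
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']
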
def get_input_and_labels(df):
    input_ids, attention_masks = tokenize_and_format(df.lyrics.values)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(df.mood_encoded.values)
    return input_ids, attention_masks, labels

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

"""Preprocess Data"""

X_train_iids, X_train_ams, y_train = get_input_and_labels(train)

X_val_iids, X_val_ams, y_val = get_input_and_labels(val)
X_test_iids, X_test_ams, y_test = get_input_and_labels(test)

train_set = [(X_train_iids[i], X_train_ams[i], y_train[i]) for i in range(len(y_train))]
val_set = [(X_val_iids[i], X_val_ams[i], y_val[i]) for i in range(len(y_val))]
test_set = [(X_test_iids[i], X_test_ams[i], y_test[i]) for i in range(len(y_test))]

train_text = [train.lyrics.values[i] for i in range(len(y_train))]
val_text = [val.lyrics.values[i] for i in range(len(y_val))]
test_text = [test.lyrics.values[i] for i in range(len(y_test))]

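# Optional sketch: the list-of-tuples datasets above work with the manual
# batching used later, but torch's TensorDataset/DataLoader would do the same
# slicing and shuffling. Shown only as an alternative; the rest of the script
# keeps the manual approach.
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train_iids, X_train_ams, y_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Each iteration over train_loader would yield (input_ids, attention_mask, labels) batches.
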
"""Initialize model and train"""

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",           # Use the 12-layer BERT model with an uncased vocab.
    num_labels = 4,                # The number of output labels.
    output_attentions = False,     # Whether the model returns attention weights.
    output_hidden_states = False,  # Whether the model returns all hidden states.
)

model.to(device)

batch_size = 16
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 3e-5,  # default is 5e-5
                              eps = 1e-8  # default is 1e-8
                              )
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, verbose=True, gamma=0.1)
epochs = 5

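# Optional sketch: instead of the StepLR commented out above, many BERT
# fine-tuning recipes use a linear decay schedule with warmup from transformers.
# This is an assumption about what could be used here, not part of the original
# run; if enabled, scheduler.step() would be called after each optimizer.step()
# inside the training loop below.
# from transformers import get_linear_schedule_with_warmup
# total_steps = (int(len(train_set) / batch_size) + 1) * epochs
# scheduler = get_linear_schedule_with_warmup(optimizer,
#                                             num_warmup_steps=0,
#                                             num_training_steps=total_steps)
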
# function to get validation accuracy
def get_validation_performance(val_set):
    # Put the model in evaluation mode
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0

    num_batches = int(len(val_set)/batch_size) + 1

    total_correct = 0

    for i in range(num_batches):

        end_index = min(batch_size * (i+1), len(val_set))

        batch = val_set[i*batch_size:end_index]

        if len(batch) == 0: continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the number of correctly labeled examples in batch
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        num_correct = np.sum(pred_flat == labels_flat)
        total_correct += num_correct

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_correct / len(val_set)
    return avg_val_accuracy

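# Optional sketch: get_validation_performance reports accuracy only. If a
# per-class breakdown were wanted, the same manual batching can collect
# predictions for sklearn's classification_report. This helper is an
# illustration, not part of the original script.
def get_predictions(data_set):
    model.eval()
    preds, golds = [], []
    num_batches = int(len(data_set) / batch_size) + 1
    for i in range(num_batches):
        batch = data_set[i * batch_size:min(batch_size * (i + 1), len(data_set))]
        if len(batch) == 0:
            continue
        b_input_ids = torch.stack([d[0] for d in batch]).to(device)
        b_input_mask = torch.stack([d[1] for d in batch]).to(device)
        with torch.no_grad():
            logits = model(b_input_ids, attention_mask=b_input_mask).logits
        preds.extend(np.argmax(logits.detach().cpu().numpy(), axis=1).tolist())
        golds.extend([int(d[2]) for d in batch])
    return preds, golds

# Example use (after training):
# from sklearn.metrics import classification_report
# preds, golds = get_predictions(val_set)
# print(classification_report(golds, preds))
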
# training loop

# For each epoch...
for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    num_batches = int(len(train_set)/batch_size) + 1

    for i in tqdm(range(num_batches)):
        end_index = min(batch_size * (i+1), len(train_set))

        batch = train_set[i*batch_size:end_index]

        if len(batch) == 0: continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Clear the previously calculated gradients
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure performance on the
    # training and validation sets.
    print(f"Total loss: {total_train_loss}")
    train_acc = get_validation_performance(train_set)
    print(f"Train accuracy: {train_acc}")
    val_acc = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")
    # scheduler.step()

print("")
print("Training complete!")

"""Final Evaluation on Test Set"""

test_acc = get_validation_performance(test_set)
print(f"Test accuracy: {test_acc}")

"""Saving the model state for future inference"""

torch.save(model.state_dict(), '/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1.pt')

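# Optional sketch: saving only the state_dict (above) requires rebuilding the
# model with the same from_pretrained(..., num_labels=4) call before loading.
# An alternative, shown purely as an illustration, is the Hugging Face
# save_pretrained / from_pretrained pair, which stores the config alongside the
# weights. The directory path here is hypothetical.
# model.save_pretrained('/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1')
# reloaded = BertForSequenceClassification.from_pretrained(
#     '/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1')
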
"""Loading the model again (sanity check)"""

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",           # Use the 12-layer BERT model with an uncased vocab.
    num_labels = 4,                # The number of output labels.
    output_attentions = False,     # Whether the model returns attention weights.
    output_hidden_states = False,  # Whether the model returns all hidden states.
)
model.load_state_dict(torch.load('/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1.pt'))
model.to(device)
model.eval()

test_acc = get_validation_performance(test_set)

print(test_acc)
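# Optional sketch: single-lyric inference with the reloaded model. The mapping
# from class indices back to mood names is not defined in this script (it
# depends on how mood_encoded was created), so only the integer class is
# returned here.
def predict_mood(lyrics_text):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    encoded = tokenizer.encode_plus(lyrics_text,
                                    add_special_tokens=True,
                                    max_length=256,
                                    padding='max_length',
                                    truncation=True,
                                    return_attention_mask=True,
                                    return_tensors='pt')
    with torch.no_grad():
        logits = model(encoded['input_ids'].to(device),
                       attention_mask=encoded['attention_mask'].to(device)).logits
    return int(np.argmax(logits.detach().cpu().numpy(), axis=1)[0])

# Example: predicted_class = predict_mood(test.lyrics.values[0])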