# -*- coding: utf-8 -*-
"""Bert-redo

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1xVKmJy8iU8NHFsWav2SI2XFRh6QdvWV_

# Transformers for Lyric Classification

Imports and Setup
"""

from google.colab import drive
drive.mount('/content/drive')

# !pip install transformers
import torch

# Confirm that the GPU is detected
if torch.cuda.is_available():
    # Get the GPU device name.
    device_name = torch.cuda.get_device_name()
    n_gpu = torch.cuda.device_count()
    print(f"Found device: {device_name}, n_gpu: {n_gpu}")

# Use the GPU if available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

from transformers import BertTokenizer, BertForSequenceClassification
"""Read Data""" | |
train=pd.read_csv('/content/drive/MyDrive/cse256/project/data/train.csv') | |
val=pd.read_csv('/content/drive/MyDrive/cse256/project/data/validation.csv') | |
test=pd.read_csv('/content/drive/MyDrive/cse256/project/data/test.csv') | |
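# Optional sanity check (not in the original notebook): the code below assumes each CSV
# has a `lyrics` text column and an integer `mood_encoded` label column, as used later
# in `get_input_and_labels`. Printing the shapes and label counts confirms that.
for name, df in [('train', train), ('validation', val), ('test', test)]:
    print(name, df.shape)
    print(df.mood_encoded.value_counts())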
"""Utility Functions""" | |
def tokenize_and_format(sentences): | |
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) | |
# Tokenize all of the sentences and map the tokens to thier word IDs. | |
input_ids = [] | |
attention_masks = [] | |
# For every sentence... | |
for sentence in sentences: | |
# `encode_plus` will: | |
# (1) Tokenize the sentence. | |
# (2) Prepend the `[CLS]` token to the start. | |
# (3) Append the `[SEP]` token to the end. | |
# (4) Map tokens to their IDs. | |
# (5) Pad or truncate the sentence to `max_length` | |
# (6) Create attention masks for [PAD] tokens. | |
encoded_dict = tokenizer.encode_plus( | |
sentence, # Sentence to encode. | |
add_special_tokens = True, # Add '[CLS]' and '[SEP]' | |
max_length = 256, # Pad & truncate all sentences. | |
padding = 'max_length', | |
truncation = True, | |
return_attention_mask = True, # Construct attn. masks. | |
return_tensors = 'pt', # Return pytorch tensors. | |
) | |
# Add the encoded sentence to the list. | |
input_ids.append(encoded_dict['input_ids']) | |
# And its attention mask (simply differentiates padding from non-padding). | |
attention_masks.append(encoded_dict['attention_mask']) | |
return input_ids, attention_masks | |
def get_input_and_labels(df):
    input_ids, attention_masks = tokenize_and_format(df.lyrics.values)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(df.mood_encoded.values)
    return input_ids, attention_masks, labels

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
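# Minimal sketch (not part of the original pipeline): exercise the helpers above on toy
# inputs to confirm the tokenized shapes and the accuracy computation behave as expected.
# The two lyric strings below are placeholders.
_demo_ids, _demo_masks = tokenize_and_format(["I feel so alive tonight", "these tears won't stop"])
print(torch.cat(_demo_ids, dim=0).shape)          # expected: torch.Size([2, 256])
_demo_logits = np.array([[0.1, 2.0, 0.3, 0.1], [1.5, 0.2, 0.1, 0.2]])
_demo_labels = np.array([1, 0])
print(flat_accuracy(_demo_logits, _demo_labels))  # expected: 1.0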
"""Preprocess Data""" | |
X_train_iids,X_train_ams,y_train=get_input_and_labels(train) | |
X_val_iids,X_val_ams,y_val=get_input_and_labels(val) | |
X_test_iids,X_test_ams,y_test=get_input_and_labels(test) | |
train_set = [(X_train_iids[i], X_train_ams[i], y_train[i]) for i in range(len(y_train))] | |
val_set = [(X_val_iids[i], X_val_ams[i], y_val[i]) for i in range(len(y_val))] | |
test_set = [(X_test_iids[i], X_test_ams[i], y_test[i]) for i in range(len(y_test))] | |
train_text = [train.lyrics.values[i] for i in range(len(y_train))] | |
val_text = [val.lyrics.values[i] for i in range(len(y_val))] | |
test_text = [test.lyrics.values[i] for i in range(len(y_test))] | |
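# The lists above are batched by manual slicing in the loops below. A common alternative
# is TensorDataset + DataLoader, which handles shuffling and batching automatically; this
# sketch is illustrative only and is not used by the training loop in this notebook.
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

_train_dataset = TensorDataset(X_train_iids, X_train_ams, y_train)
_train_loader = DataLoader(_train_dataset, sampler=RandomSampler(_train_dataset), batch_size=16)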
"""Initialize model and train""" | |
model = BertForSequenceClassification.from_pretrained( | |
"bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab. | |
num_labels = 4, # The number of output labels. | |
output_attentions = False, # Whether the model returns attentions weights. | |
output_hidden_states = False, # Whether the model returns all hidden-states. | |
) | |
model.cuda() | |
batch_size = 16 | |
optimizer = torch.optim.AdamW(model.parameters(), | |
lr = 3e-5, # args.learning_rate - default is 5e-5 | |
eps = 1e-8 # args.adam_epsilon - default is 1e-8 | |
) | |
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, verbose=True, gamma=0.1) | |
epochs = 5 | |
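# A linear learning-rate schedule with warmup is a common alternative to the StepLR
# commented out above when fine-tuning BERT with AdamW. This is only a sketch: the
# scheduler is defined but never stepped in the training loop below, so the original
# constant-learning-rate behaviour is preserved. The name `scheduler_sketch` is ours.
from transformers import get_linear_schedule_with_warmup

total_steps = (len(train_set) // batch_size + 1) * epochs
scheduler_sketch = get_linear_schedule_with_warmup(optimizer,
                                                   num_warmup_steps=0,
                                                   num_training_steps=total_steps)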
# Function to get validation accuracy.
def get_validation_performance(val_set):
    # Put the model in evaluation mode.
    model.eval()

    # Tracking variables.
    total_eval_accuracy = 0
    total_eval_loss = 0
    num_batches = int(len(val_set) / batch_size) + 1
    total_correct = 0

    for i in range(num_batches):
        end_index = min(batch_size * (i + 1), len(val_set))
        batch = val_set[i * batch_size:end_index]
        if len(batch) == 0: continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU.
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Tell PyTorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the number of correctly labeled examples in the batch.
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        num_correct = np.sum(pred_flat == labels_flat)
        total_correct += num_correct

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_correct / len(val_set)
    return avg_val_accuracy
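# Optional sketch (ours, not part of the original notebook): per-class precision/recall to
# complement the overall accuracy above, useful if the mood classes are imbalanced. Assumes
# scikit-learn is available (it ships with Colab). Example use after training:
# print(get_classification_report(val_set)).
def get_classification_report(dataset):
    from sklearn.metrics import classification_report
    model.eval()
    all_preds, all_labels = [], []
    num_batches = int(len(dataset) / batch_size) + 1
    for i in range(num_batches):
        end_index = min(batch_size * (i + 1), len(dataset))
        batch = dataset[i * batch_size:end_index]
        if len(batch) == 0: continue
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask).logits
        all_preds.extend(np.argmax(logits.detach().cpu().numpy(), axis=1).tolist())
        all_labels.extend(b_labels.cpu().numpy().tolist())
    return classification_report(all_labels, all_preds)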
# Training loop.
# For each epoch...
for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    num_batches = int(len(train_set) / batch_size) + 1
    for i in tqdm(range(num_batches)):
        end_index = min(batch_size * (i + 1), len(train_set))
        batch = train_set[i * batch_size:end_index]
        if len(batch) == 0: continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU.
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Clear the previously calculated gradients.
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the training and validation
    # sets using `get_validation_performance` defined above.
    print(f"Total loss: {total_train_loss}")
    train_acc = get_validation_performance(train_set)
    print(f"Train accuracy: {train_acc}")
    val_acc = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")
    # scheduler.step()

print("")
print("Training complete!")
"""Final Evaluation on Test Set""" | |
test_acc = get_validation_performance(test_set) | |
print(f"Test accuracy: {test_acc}") | |
"""Saving the model state for future inference""" | |
torch.save(model.state_dict(), '/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1.pt') | |
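# Alternative (sketch, not used here): `save_pretrained` writes the config alongside the
# weights, so the model can later be reloaded with `from_pretrained` without re-passing
# `num_labels`. The directory below is a hypothetical example path.
# model.save_pretrained('/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1')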
"""loading the model again (checking)""" | |
model = BertForSequenceClassification.from_pretrained( | |
"bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab. | |
num_labels = 4, # The number of output labels. | |
output_attentions = False, # Whether the model returns attentions weights. | |
output_hidden_states = False, # Whether the model returns all hidden-states. | |
) | |
model.load_state_dict(torch.load('/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1.pt')) | |
model.cuda() | |
model.eval() | |
test_acc = get_validation_performance(test_set) | |
print(test_acc) | |
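# Sketch of single-lyric inference with the reloaded model (ours, not part of the original
# notebook). The example lyric is a placeholder, and the returned integer is the encoded
# class index; mapping it back to a mood name depends on how `mood_encoded` was created.
def predict_mood(lyric):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    encoded = tokenizer.encode_plus(lyric,
                                    add_special_tokens=True,
                                    max_length=256,
                                    padding='max_length',
                                    truncation=True,
                                    return_attention_mask=True,
                                    return_tensors='pt')
    with torch.no_grad():
        logits = model(encoded['input_ids'].to(device),
                       attention_mask=encoded['attention_mask'].to(device)).logits
    return int(torch.argmax(logits, dim=1).item())

print(predict_mood("I walk these empty streets alone tonight"))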