# -*- coding: utf-8 -*-
"""Bert-redo

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1xVKmJy8iU8NHFsWav2SI2XFRh6QdvWV_

# Transformers for Lyric Classification

Imports and Setup
"""

from google.colab import drive
drive.mount('/content/drive')

# !pip install transformers
import torch

# Confirm that the GPU is detected
if torch.cuda.is_available():
    # Get the GPU device name.
    device_name = torch.cuda.get_device_name()
    n_gpu = torch.cuda.device_count()
    print(f"Found device: {device_name}, n_gpu: {n_gpu}")

# Use the GPU if available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

from transformers import BertTokenizer, BertForSequenceClassification
"""Read Data""" | |
train=pd.read_csv('/content/drive/MyDrive/cse256/project/data/train.csv') | |
val=pd.read_csv('/content/drive/MyDrive/cse256/project/data/validation.csv') | |
test=pd.read_csv('/content/drive/MyDrive/cse256/project/data/test.csv') | |
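# Optional sanity check (not in the original notebook): the code below assumes each CSV
# has a `lyrics` text column and an integer `mood_encoded` label column, as used later
# in `get_input_and_labels`. Printing the shapes and label counts confirms that.
for name, df in [('train', train), ('validation', val), ('test', test)]:
    print(name, df.shape)
    print(df.mood_encoded.value_counts())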
"""Utility Functions""" | |
def tokenize_and_format(sentences): | |
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) | |
# Tokenize all of the sentences and map the tokens to thier word IDs. | |
input_ids = [] | |
attention_masks = [] | |
# For every sentence... | |
for sentence in sentences: | |
# `encode_plus` will: | |
# (1) Tokenize the sentence. | |
# (2) Prepend the `[CLS]` token to the start. | |
# (3) Append the `[SEP]` token to the end. | |
# (4) Map tokens to their IDs. | |
# (5) Pad or truncate the sentence to `max_length` | |
# (6) Create attention masks for [PAD] tokens. | |
encoded_dict = tokenizer.encode_plus( | |
sentence, # Sentence to encode. | |
add_special_tokens = True, # Add '[CLS]' and '[SEP]' | |
max_length = 256, # Pad & truncate all sentences. | |
padding = 'max_length', | |
truncation = True, | |
return_attention_mask = True, # Construct attn. masks. | |
return_tensors = 'pt', # Return pytorch tensors. | |
) | |
# Add the encoded sentence to the list. | |
input_ids.append(encoded_dict['input_ids']) | |
# And its attention mask (simply differentiates padding from non-padding). | |
attention_masks.append(encoded_dict['attention_mask']) | |
return input_ids, attention_masks | |
def get_input_and_labels(df):
    input_ids, attention_masks = tokenize_and_format(df.lyrics.values)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(df.mood_encoded.values)
    return input_ids, attention_masks, labels

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
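# Minimal sketch (not part of the original pipeline): exercise the helpers above on toy
# inputs to confirm the tokenized shapes and the accuracy computation behave as expected.
# The two lyric strings below are placeholders.
_demo_ids, _demo_masks = tokenize_and_format(["I feel so alive tonight", "these tears won't stop"])
print(torch.cat(_demo_ids, dim=0).shape)          # expected: torch.Size([2, 256])
_demo_logits = np.array([[0.1, 2.0, 0.3, 0.1], [1.5, 0.2, 0.1, 0.2]])
_demo_labels = np.array([1, 0])
print(flat_accuracy(_demo_logits, _demo_labels))  # expected: 1.0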
"""Preprocess Data""" | |
X_train_iids,X_train_ams,y_train=get_input_and_labels(train) | |
X_val_iids,X_val_ams,y_val=get_input_and_labels(val) | |
X_test_iids,X_test_ams,y_test=get_input_and_labels(test) | |
train_set = [(X_train_iids[i], X_train_ams[i], y_train[i]) for i in range(len(y_train))] | |
val_set = [(X_val_iids[i], X_val_ams[i], y_val[i]) for i in range(len(y_val))] | |
test_set = [(X_test_iids[i], X_test_ams[i], y_test[i]) for i in range(len(y_test))] | |
train_text = [train.lyrics.values[i] for i in range(len(y_train))] | |
val_text = [val.lyrics.values[i] for i in range(len(y_val))] | |
test_text = [test.lyrics.values[i] for i in range(len(y_test))] | |
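# The lists above are batched by manual slicing in the loops below. A common alternative
# is TensorDataset + DataLoader, which handles shuffling and batching automatically; this
# sketch is illustrative only and is not used by the training loop in this notebook.
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

_train_dataset = TensorDataset(X_train_iids, X_train_ams, y_train)
_train_loader = DataLoader(_train_dataset, sampler=RandomSampler(_train_dataset), batch_size=16)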
"""Initialize model and train""" | |
model = BertForSequenceClassification.from_pretrained( | |
"bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab. | |
num_labels = 4, # The number of output labels. | |
output_attentions = False, # Whether the model returns attentions weights. | |
output_hidden_states = False, # Whether the model returns all hidden-states. | |
) | |
model.cuda() | |
batch_size = 16 | |
optimizer = torch.optim.AdamW(model.parameters(), | |
lr = 3e-5, # args.learning_rate - default is 5e-5 | |
eps = 1e-8 # args.adam_epsilon - default is 1e-8 | |
) | |
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, verbose=True, gamma=0.1) | |
epochs = 5 | |
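# A linear learning-rate schedule with warmup is a common alternative to the StepLR
# commented out above when fine-tuning BERT with AdamW. This is only a sketch: the
# scheduler is defined but never stepped in the training loop below, so the original
# constant-learning-rate behaviour is preserved. The name `scheduler_sketch` is ours.
from transformers import get_linear_schedule_with_warmup

total_steps = (len(train_set) // batch_size + 1) * epochs
scheduler_sketch = get_linear_schedule_with_warmup(optimizer,
                                                   num_warmup_steps=0,
                                                   num_training_steps=total_steps)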
# Function to get validation accuracy.
def get_validation_performance(val_set):
    # Put the model in evaluation mode.
    model.eval()

    # Tracking variables.
    total_eval_accuracy = 0
    total_eval_loss = 0
    num_batches = int(len(val_set) / batch_size) + 1
    total_correct = 0

    for i in range(num_batches):
        end_index = min(batch_size * (i + 1), len(val_set))
        batch = val_set[i * batch_size:end_index]
        if len(batch) == 0: continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU.
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Tell PyTorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the number of correctly labeled examples in the batch.
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        num_correct = np.sum(pred_flat == labels_flat)
        total_correct += num_correct

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_correct / len(val_set)
    return avg_val_accuracy
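# Optional sketch (ours, not part of the original notebook): per-class precision/recall to
# complement the overall accuracy above, useful if the mood classes are imbalanced. Assumes
# scikit-learn is available (it ships with Colab). Example use after training:
# print(get_classification_report(val_set)).
def get_classification_report(dataset):
    from sklearn.metrics import classification_report
    model.eval()
    all_preds, all_labels = [], []
    num_batches = int(len(dataset) / batch_size) + 1
    for i in range(num_batches):
        end_index = min(batch_size * (i + 1), len(dataset))
        batch = dataset[i * batch_size:end_index]
        if len(batch) == 0: continue
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask).logits
        all_preds.extend(np.argmax(logits.detach().cpu().numpy(), axis=1).tolist())
        all_labels.extend(b_labels.cpu().numpy().tolist())
    return classification_report(all_labels, all_preds)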
# Training loop.
# For each epoch...
for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    num_batches = int(len(train_set) / batch_size) + 1
    for i in tqdm(range(num_batches)):
        end_index = min(batch_size * (i + 1), len(train_set))
        batch = train_set[i * batch_size:end_index]
        if len(batch) == 0: continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU.
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Clear the previously calculated gradients.
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the training and validation
    # sets using `get_validation_performance` defined above.
    print(f"Total loss: {total_train_loss}")
    train_acc = get_validation_performance(train_set)
    print(f"Train accuracy: {train_acc}")
    val_acc = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")
    # scheduler.step()

print("")
print("Training complete!")
"""Final Evaluation on Test Set""" | |
test_acc = get_validation_performance(test_set) | |
print(f"Test accuracy: {test_acc}") | |
"""Saving the model state for future inference""" | |
torch.save(model.state_dict(), '/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1.pt') | |
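# Alternative (sketch, not used here): `save_pretrained` writes the config alongside the
# weights, so the model can later be reloaded with `from_pretrained` without re-passing
# `num_labels`. The directory below is a hypothetical example path.
# model.save_pretrained('/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1')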
"""loading the model again (checking)""" | |
model = BertForSequenceClassification.from_pretrained( | |
"bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab. | |
num_labels = 4, # The number of output labels. | |
output_attentions = False, # Whether the model returns attentions weights. | |
output_hidden_states = False, # Whether the model returns all hidden-states. | |
) | |
model.load_state_dict(torch.load('/content/drive/MyDrive/cse256/project/models/bert-mood-prediction-1.pt')) | |
model.cuda() | |
model.eval() | |
test_acc = get_validation_performance(test_set) | |
print(test_acc) | |
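# Sketch of single-lyric inference with the reloaded model (ours, not part of the original
# notebook). The example lyric is a placeholder, and the returned integer is the encoded
# class index; mapping it back to a mood name depends on how `mood_encoded` was created.
def predict_mood(lyric):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    encoded = tokenizer.encode_plus(lyric,
                                    add_special_tokens=True,
                                    max_length=256,
                                    padding='max_length',
                                    truncation=True,
                                    return_attention_mask=True,
                                    return_tensors='pt')
    with torch.no_grad():
        logits = model(encoded['input_ids'].to(device),
                       attention_mask=encoded['attention_mask'].to(device)).logits
    return int(torch.argmax(logits, dim=1).item())

print(predict_mood("I walk these empty streets alone tonight"))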