# Moodify-Task / src / model.py
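"""Train a BERT + GRU emotion classifier on the GoEmotions dataset.

Loads 20k GoEmotions examples, collapses the multi-label annotations to a
single label per example, trains a GRU head on top of frozen BERT features,
and reports accuracy and weighted F1 on the validation and test splits.
"""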
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = load_dataset("go_emotions")
# Extract text and labels; use the first 20,000 training examples to keep
# tokenization and training time manageable
texts = dataset["train"]["text"][:20000]
labels = dataset["train"]["labels"][:20000]
# Collapse multi-label annotations to a single class label per example
def fix_labels(labels):
    # Keep the highest label id when several emotions are annotated; map
    # examples with no annotation to label 0 (both choices are heuristics)
    labels = [max(label) if label else 0 for label in labels]
    return torch.tensor(labels, dtype=torch.long)
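# e.g. [3, 17] -> 17 and [] -> 0 ("admiration" in GoEmotions; "neutral" is
# id 27), so the empty-label default is a heuristic rather than a true
# neutral class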
labels = fix_labels(labels)
# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Tokenize a list of texts, padding every sequence to the longest in the list
def tokenize(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)
# Keep the encodings on the CPU; each batch is moved to the device inside the
# training and evaluation loops, which avoids pinning every tokenized sequence
# in GPU memory at once
class EmotionDataset(Dataset):
    """Wraps pre-tokenized encodings and labels as a map-style PyTorch Dataset."""
    def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __len__(self):
return len(self.labels)
def __getitem__(self, idx):
item = {key: val[idx] for key, val in self.encodings.items()}
item["labels"] = self.labels[idx]
return item
train_dataset = EmotionDataset(train_encodings, train_labels)
val_dataset = EmotionDataset(val_encodings, val_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
class BertGRUClassifier(nn.Module):
    def __init__(self, bert_model="bert-base-uncased", hidden_dim=128, num_classes=28):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        # BERT is used as a frozen feature extractor (see forward), so exclude
        # its weights from gradient tracking
        for param in self.bert.parameters():
            param.requires_grad = False
        self.gru = nn.GRU(self.bert.config.hidden_size, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, num_classes)
    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  # frozen BERT: features only, no gradients
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        gru_output, _ = self.gru(bert_output.last_hidden_state)
        # Take the GRU state at the last *real* token of each sequence; the raw
        # final timestep would usually be a [PAD] position after batch padding
        last_idx = attention_mask.sum(dim=1) - 1
        batch_idx = torch.arange(input_ids.size(0), device=input_ids.device)
        return self.fc(self.dropout(gru_output[batch_idx, last_idx]))
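
# A minimal shape sanity check, assuming the tokenizer and device defined
# above; sanity_check_forward is an illustrative helper, not part of the
# original training flow
def sanity_check_forward(model, tokenizer, device, num_classes=28):
    model.eval()
    enc = tokenizer(["I love this!", "This is awful."], padding=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(enc["input_ids"].to(device), enc["attention_mask"].to(device))
    assert logits.shape == (2, num_classes), logits.shape
    return logits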
model = BertGRUClassifier()
model.to(device)
criterion = nn.CrossEntropyLoss()
# Only the GRU and classifier weights receive gradients (BERT is frozen)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)  # decay the lr 10x every 2 epochs
def evaluate_model(model, data_loader):
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
for batch in data_loader:
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
labels = batch["labels"].to(device)
outputs = model(input_ids, attention_mask)
preds = torch.argmax(outputs, dim=1).cpu().numpy()
predictions.extend(preds)
true_labels.extend(labels.cpu().numpy())
acc = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')
return acc, f1
def train_model(model, train_loader, val_loader, epochs=10):
for epoch in range(epochs):
model.train()
total_loss = 0
for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
labels = batch["labels"].to(device)
optimizer.zero_grad()
outputs = model(input_ids, attention_mask)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
total_loss += loss.item()
        scheduler.step()  # apply the per-epoch learning-rate decay
train_acc, train_f1 = evaluate_model(model, train_loader)
val_acc, val_f1 = evaluate_model(model, val_loader)
print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}, Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")
# Save the model after each epoch
torch.save(model.state_dict(), f"model_epoch_{epoch + 1}.pth")
train_model(model, train_loader, val_loader)
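# To reload a saved checkpoint later (illustrative; pick whichever epoch
# performed best on validation):
# model.load_state_dict(torch.load("model_epoch_10.pth", map_location=device))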
# Evaluate on the held-out GoEmotions test split; tensors stay on the CPU and
# evaluate_model moves each batch to the device
test_texts = dataset["test"]["text"]
test_labels = fix_labels(dataset["test"]["labels"])
test_encodings = tokenize(test_texts)
test_dataset = EmotionDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)
test_acc, test_f1 = evaluate_model(model, test_loader)
print(f"Test Accuracy: {test_acc:.4f}, Test F1 Score: {test_f1:.4f}")