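# Train an emotion classifier on the GoEmotions dataset: a frozen BERT encoder
# ("bert-base-uncased") feeds a GRU head with dropout and a linear layer over
# the 28 emotion classes; accuracy and weighted F1 are reported on the
# validation and test splits.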
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = load_dataset("go_emotions")
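# GoEmotions has 28 label classes (27 emotions plus neutral); examples may carry
# multiple labels, which fix_labels() below collapses to a single label each.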
# Extract text and labels
texts = dataset["train"]["text"][:20000] # Increased dataset size
labels = dataset["train"]["labels"][:20000] # Increased dataset size
# Convert labels to categorical
def fix_labels(labels):
    # Convert multi-label to single-label by keeping the highest label id (0 if empty)
    labels = [max(label) if label else 0 for label in labels]
    return torch.tensor(labels, dtype=torch.long)
labels = fix_labels(labels)
# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Tokenize text
def tokenize(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)
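# Note: the full tokenized tensors are moved to the GPU up front, so the
# per-batch .to(device) calls in the train/eval loops are effectively no-ops here.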
train_encodings = {key: val.to(device) for key, val in train_encodings.items()}
val_encodings = {key: val.to(device) for key, val in val_encodings.items()}
class EmotionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item
train_dataset = EmotionDataset(train_encodings, train_labels)
val_dataset = EmotionDataset(val_encodings, val_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
class BertGRUClassifier(nn.Module):
    def __init__(self, bert_model="bert-base-uncased", hidden_dim=128, num_classes=28):
        super(BertGRUClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        self.gru = nn.GRU(self.bert.config.hidden_size, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)  # Added dropout layer
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        # BERT is kept frozen: it acts as a fixed feature extractor under no_grad
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        gru_output, _ = self.gru(bert_output.last_hidden_state)
        # Use the final GRU time step as the sequence representation, apply dropout, then classify
        output = self.fc(self.dropout(gru_output[:, -1, :]))
        return output
model = BertGRUClassifier()
model.to(device)
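# Loss, optimizer, and LR schedule. Because BERT runs under no_grad in forward(),
# only the GRU and the final linear layer actually receive gradient updates.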
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1) # Added learning rate scheduler
def evaluate_model(model, data_loader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    acc = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')
    return acc, f1
def train_model(model, train_loader, val_loader, epochs=10):  # Increased number of epochs
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step()  # Step the scheduler once per epoch
        train_acc, train_f1 = evaluate_model(model, train_loader)
        val_acc, val_f1 = evaluate_model(model, val_loader)
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}, "
              f"Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}, "
              f"Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")
        # Save the model after each epoch
        torch.save(model.state_dict(), f"model_epoch_{epoch + 1}.pth")
train_model(model, train_loader, val_loader)
# Evaluate on the GoEmotions test split
test_texts = dataset["test"]["text"]
test_labels = fix_labels(dataset["test"]["labels"])
test_encodings = tokenize(test_texts)
test_encodings = {key: val.to(device) for key, val in test_encodings.items()}
test_dataset = EmotionDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)
test_acc, test_f1 = evaluate_model(model, test_loader)
print(f"Test Accuracy: {test_acc:.4f}, Test F1 Score: {test_f1:.4f}")