# src/emotions.py — fine-tune DistilBERT for single-label emotion
# classification on the GoEmotions dataset, then evaluate on the test split.
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
# Load dataset
# GoEmotions: Reddit comments annotated with emotion labels; each example
# may carry more than one label (handled in preprocess_data below).
dataset = load_dataset("go_emotions")
# Print dataset columns
print("Dataset Columns Before Preprocessing:", dataset["train"].column_names)
# Ensure labels exist — fail fast if the upstream dataset schema changed.
if "labels" not in dataset["train"].column_names:
    raise KeyError("Column 'labels' is missing! Check dataset structure.")
# Load tokenizer — must come from the same checkpoint as the model so the
# vocabulary and special tokens match.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Preprocessing function (Take only the first label for single-label classification)
def preprocess_data(example):
    """Tokenize a single example and collapse its label list to one label.

    GoEmotions examples can carry several emotion labels; only the first is
    kept so the task becomes single-label classification. An example with an
    empty label list falls back to class 0.

    Relies on the module-level ``tokenizer``; intended for ``dataset.map``
    with ``batched=False``.
    """
    features = tokenizer(example["text"], padding="max_length", truncation=True)
    label_list = example["labels"]
    # First label wins; 0 is the default when no label was annotated.
    features["labels"] = label_list[0] if label_list else 0
    return features
# Tokenize dataset
# batched=False => preprocess_data receives one example at a time; the raw
# "text" column is dropped once it has been tokenized.
encoded_dataset = dataset.map(preprocess_data, batched=False, remove_columns=["text"])
# Set format for PyTorch — columns are returned as torch tensors so the
# Trainer can collate batches directly.
encoded_dataset.set_format("torch")
# Load model for single-label classification (28 classes)
num_labels = 28  # GoEmotions label count — TODO confirm against dataset.features
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
# Training arguments
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # must match eval strategy for load_best_model_at_end
    save_total_limit=1,  # keep only the latest checkpoint on disk
    logging_strategy="no",
    per_device_train_batch_size=32,  # Increase batch size
    per_device_eval_batch_size=32,
    num_train_epochs=2,  # Reduce epochs
    weight_decay=0.01,
    load_best_model_at_end=True,  # reload the best checkpoint (by eval loss) after training
    fp16=True,  # Mixed precision for speedup — NOTE(review): requires CUDA; fails on CPU-only hosts
    gradient_accumulation_steps=2,  # Helps with large batch sizes (effective train batch = 64)
)
# Compute metrics function
def compute_metrics(eval_pred):
    """Turn Trainer evaluation output into accuracy and weighted-F1 metrics.

    ``eval_pred`` unpacks into ``(logits, labels)``; an argmax over the last
    axis converts the logits into hard class predictions before scoring.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics
)
# Train model
trainer.train()
print("Training completed!")
# Save model and tokenizer together so inference code can reload both from a
# single directory via from_pretrained("./saved_model").
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")
print("Model and tokenizer saved!")
# ====== Evaluation on Test Set ======
print("\nEvaluating model on test set...")
# Get test dataset
test_dataset = encoded_dataset["test"]
# Make predictions — trainer.predict returns a PredictionOutput whose
# .predictions field holds the raw logits.
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
# Convert logits to class predictions
y_pred = np.argmax(logits, axis=-1)
# set_format("torch") above makes this column a tensor, hence .numpy()
y_true = test_dataset["labels"].numpy()
# Compute accuracy and F1-score (weighted F1 accounts for class imbalance)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="weighted")
# Print evaluation results
print("\nEvaluation Results:")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")
# Print classification report (per-class precision/recall/F1)
print("\nClassification Report:\n", classification_report(y_true, y_pred))
# Save test results for offline inspection
pd.DataFrame({"true_labels": y_true.tolist(), "predicted_labels": y_pred.tolist()}).to_csv("test_results.csv", index=False)
print("Test results saved to 'test_results.csv'!")
# NOTE(review): an unresolved git merge conflict duplicated this entire script.
# The second copy was byte-identical to the code above, so the conflict
# markers and the duplicate were removed with no loss of content.