import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load the GoEmotions dataset
dataset = load_dataset("go_emotions")

# Print dataset columns
print("Dataset Columns Before Preprocessing:", dataset["train"].column_names)

# Ensure labels exist
if "labels" not in dataset["train"].column_names:
    raise KeyError("Column 'labels' is missing! Check dataset structure.")

# Load tokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


# Preprocessing function: keep only the first label for single-label classification
def preprocess_data(example):
    encoding = tokenizer(example["text"], padding="max_length", truncation=True)
    # Take only the first label; default to 0 if the label list is empty
    encoding["labels"] = example["labels"][0] if example["labels"] else 0
    return encoding


# Tokenize dataset (one example at a time, since we index into each example's label list)
encoded_dataset = dataset.map(preprocess_data, batched=False, remove_columns=["text"])

# Set format for PyTorch
encoded_dataset.set_format("torch")

# Load model for single-label classification (GoEmotions has 28 classes)
num_labels = 28
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# Training arguments
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_strategy="no",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True,  # Mixed precision for speedup (requires a CUDA GPU)
    gradient_accumulation_steps=2,  # Effective batch size of 64
)


# Compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to class predictions
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": accuracy, "f1": f1}


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()
print("Training completed!")

# Save model and tokenizer
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")
print("Model and tokenizer saved!")

# ====== Evaluation on Test Set ======
print("\nEvaluating model on test set...")

# Get test dataset
test_dataset = encoded_dataset["test"]

# Make predictions
predictions = trainer.predict(test_dataset)
logits = predictions.predictions

# Convert logits to class predictions
y_pred = np.argmax(logits, axis=-1)
y_true = test_dataset["labels"].numpy()

# Compute accuracy and F1-score
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="weighted")

# Print evaluation results
print("\nEvaluation Results:")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

# Print classification report
print("\nClassification Report:\n", classification_report(y_true, y_pred))

# Save test results
pd.DataFrame(
    {"true_labels": y_true.tolist(), "predicted_labels": y_pred.tolist()}
).to_csv("test_results.csv", index=False)
print("Test results saved to 'test_results.csv'!")
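

# ====== Optional: inference sketch with the saved model ======
# Illustrative follow-up only (not part of the training/evaluation run above):
# a minimal example of reloading the ./saved_model directory written earlier
# and predicting an emotion for a new sentence. Assumes the default GoEmotions
# config, where the "labels" feature is a sequence of class labels whose names
# can be read via .feature.names.
def predict_emotion(text, model_dir="./saved_model"):
    clf_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    clf_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    inputs = clf_tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = clf_model(**inputs).logits
    predicted_id = int(logits.argmax(dim=-1))
    # Map the predicted class id back to an emotion name
    label_names = dataset["train"].features["labels"].feature.names
    return label_names[predicted_id]


print(predict_emotion("I can't believe how well this turned out!"))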