Model Loading and Testing Instructions

This document provides step-by-step instructions on how to load our model from the Hugging Face Hub and evaluate it on a test dataset. The following code loads and tests the model in a Colab notebook.

Prerequisites

Install the required Python libraries:

pip install torch transformers pandas scikit-learn huggingface_hub

from huggingface_hub import login
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.metrics import accuracy_score
from huggingface_hub import login
from transformers import AutoModel, AutoTokenizer
import pandas as pd

from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub. The string below is a
# placeholder: replace it with your own key before running.
login("Replace with the key")

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import re

# Define the preprocessing and dataset class
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class labels, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (one per entry in ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its model inputs.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output with its leading dimension removed) and ``"labels"`` (a
            scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the leading dimension (presumably a batch axis of size 1
        # added by return_tensors="pt"). A bare squeeze() would also collapse
        # the sequence axis when max_len == 1 and yield a 0-d tensor.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

def preprocess_text(text):
    """Clean and preprocess text.

    Steps, applied in order:
      1. Coerce the input to ``str``.
      2. Expand a small fixed set of contraction suffixes
         (``n't``, ``'s``, ``'ll``, ``'ve``) by plain substring replacement.
      3. Replace dollar amounts such as ``$5.5 million`` with ``$ million``
         (the number is dropped, an optional magnitude word is kept).
      4. Remove URLs (any run of non-whitespace starting with ``http``).
      5. Replace hyphens with spaces, lowercase, and collapse whitespace.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned, lowercased string with single spaces between tokens.

    NOTE(review): the regexes previously used ``\\\\d`` / ``\\\\S`` inside raw
    strings, which match a literal backslash and therefore never fired on
    real text. If the model was trained on text cleaned by that earlier
    version, confirm this fix matches the training-time preprocessing.
    """
    text = str(text)
    contractions = {
        "n't": " not",
        "'s": " is",
        "'ll": " will",
        "'ve": " have"
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    # Single backslashes: in a raw string r'\d' is the digit class, whereas
    # r'\\d' is a literal backslash followed by the letter "d".
    text = re.sub(r'\$\d+\.?\d*\s*(million|billion|trillion)?', r'$ \1', text, flags=re.IGNORECASE)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'-', ' ', text)
    text = text.lower()
    # split()/join collapses any run of whitespace and trims both ends.
    text = ' '.join(text.split())
    return text



# ---------------------------------------------------------------------------
# Step 1: Pull the sequence-classification model and tokenizer from the Hub
# ---------------------------------------------------------------------------
print("Loading model and tokenizer...")
REPO_NAME = "CIS5190GoGo/CustomModel"  # Replace with your repo name on Hugging Face Hub
model = RobertaForSequenceClassification.from_pretrained(REPO_NAME)
tokenizer = RobertaTokenizer.from_pretrained(REPO_NAME)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
print("Model and tokenizer loaded successfully!")

# ---------------------------------------------------------------------------
# Step 2: Read the test CSV and clean the headline column
# ---------------------------------------------------------------------------
print("Loading test data...")
test_data_path = "/content/drive/MyDrive/5190_project/test_data_random_subset.csv"  # Replace with your test set path
test_data = pd.read_csv(test_data_path)

# The 'title' column holds the inputs and the 'labels' column the targets.
X_test = test_data["title"].apply(preprocess_text).values
y_test = test_data["labels"].values

# ---------------------------------------------------------------------------
# Step 3: Wrap the arrays in a dataset and batch them
# ---------------------------------------------------------------------------
test_dataset = NewsDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, num_workers=2)

# ---------------------------------------------------------------------------
# Step 4: Run inference over the whole test set
# ---------------------------------------------------------------------------
print("Evaluating the model...")
model.eval()
all_preds = []
all_labels = []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit.
        preds = outputs.logits.argmax(dim=-1)

        # Move results back to the CPU before accumulating them.
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ---------------------------------------------------------------------------
# Step 5: Report accuracy
# ---------------------------------------------------------------------------
accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
print(f"Test Accuracy: {accuracy:.4f}")