|
# Model Loading and Testing Instructions |
|
|
|
This document provides step-by-step instructions on how to load our model from the Hugging Face Hub and evaluate it on a test dataset. |
|
The following code loads and tests the model in a Colab notebook. |
|
|
|
--- |
|
|
|
# Step 1: Prerequisites |
|
|
|
1. Import the required Python packages: |
|
|
|
```python |
|
from huggingface_hub import login |
|
import torch |
|
import torch.nn as nn |
|
from transformers import RobertaForSequenceClassification, RobertaTokenizer |
|
from torch.utils.data import Dataset, DataLoader |
|
import pandas as pd |
|
import numpy as np |
|
import re |
|
from sklearn.metrics import accuracy_score |
|
from transformers import AutoModel, AutoTokenizer |
|
from huggingface_hub import login |
|
``` |
|
2. Log in to the Hugging Face Hub with our account's key (see our private Ed post and the email sent to the TAs, thanks!): |
|
|
|
```python |
|
# Authenticate this session with the Hugging Face Hub; substitute the real key
# for the placeholder string before running.
login(token="Replace with the key")
|
``` |
|
|
|
# Step 2: Define the preprocessing function and dataset class |
|
|
|
Run the following class and function definitions, which are used to preprocess the test data: |
|
|
|
```python |
|
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class labels, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")`` that
            returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (one per entry in ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output with its leading batch dimension removed) and ``"labels"``
            (a scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops only the batch dimension added by
            # return_tensors="pt". A bare squeeze() would also drop the
            # sequence dimension when max_len == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }
|
|
|
def preprocess_text(text):
    """Clean and normalize a piece of text.

    Steps, in order: cast to ``str``; expand a few contraction suffixes;
    collapse dollar amounts to ``"$ <magnitude>"`` (e.g. ``"$5.5 million"``
    becomes ``"$ million"``); strip URLs; turn hyphens into spaces;
    lowercase; collapse runs of whitespace.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    text = str(text)
    contractions = {
        "n't": " not",
        "'s": " is",
        "'ll": " will",
        "'ve": " have",
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    # The patterns previously used `\\d` / `\\S` inside raw strings, which
    # match a literal backslash followed by "d"/"S" and therefore never fired
    # on real text. `\d` and `\S` are the intended digit / non-space classes.
    # NOTE(review): if the published checkpoint was trained on text cleaned
    # with the non-matching patterns, confirm this does not shift accuracy.
    text = re.sub(
        r'\$\d+\.?\d*\s*(million|billion|trillion)?',
        r'$ \1',
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'-', ' ', text)
    text = text.lower()
    text = ' '.join(text.split())
    return text
|
``` |
|
|
|
|
|
# Step 3: Load the model and tokenizer from Hugging Face Hub |
|
This step loads the pre-trained model and tokenizer, which are hosted on the Hugging Face Hub. |
|
|
|
```python |
|
print("Loading model and tokenizer...")

# Hub repository holding both the classifier weights and the tokenizer files.
REPO_NAME = "CIS5190GoGo/CustomModel"  # This is where we pushed the model to

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# nn.Module.to() returns the module itself, so loading and placement chain.
model = RobertaForSequenceClassification.from_pretrained(REPO_NAME).to(device)
tokenizer = RobertaTokenizer.from_pretrained(REPO_NAME)

print("Model and tokenizer loaded successfully!")
|
``` |
|
|
|
# Step 4: Load test dataset |
|
```python |
|
print("Loading test data...")

# Placeholder: point this at the CSV file containing your test set before
# running. The following steps read its 'title' and 'labels' columns.
test_data_path = "Replace with your test set path"  # Note: Replace with your test set path

# Read the whole CSV into a DataFrame.
test_data = pd.read_csv(test_data_path)
|
``` |
|
# Step 5: Preprocess test data |
|
```python |
|
# Clean every title with preprocess_text, then expose the underlying arrays.
cleaned_titles = test_data["title"].apply(preprocess_text)
X_test = cleaned_titles.values

# Ground-truth class labels, read straight from the 'labels' column.
label_column = test_data["labels"]
y_test = label_column.values
|
``` |
|
|
|
# Step 6: Prepare the dataset and dataloader |
|
```python |
|
# Pair the cleaned titles with their labels; each item is tokenized on access.
test_dataset = NewsDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)

# Serve the dataset in batches of 16 using two worker processes.
test_loader = DataLoader(dataset=test_dataset, batch_size=16, num_workers=2)
|
``` |
|
|
|
# Step 7: Evaluate the model and calculate accuracy |
|
```python |
|
print("Evaluating the model...")

# Put the model in evaluation mode before running inference.
model.eval()
all_preds, all_labels = [], []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in test_loader:
        # Move every tensor in the batch onto `device`.
        moved = {name: tensor.to(device) for name, tensor in batch.items()}

        logits = model(
            input_ids=moved["input_ids"],
            attention_mask=moved["attention_mask"],
        ).logits

        # Predicted class = index of the largest logit along the last axis.
        all_preds.extend(logits.argmax(dim=-1).cpu().numpy())
        all_labels.extend(moved["labels"].cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")
|
``` |
|
# Expected output: |
|
```text |
|
Loading model and tokenizer... |
|
Model and tokenizer loaded successfully! |
|
Loading test data... |
|
Evaluating the model... |
|
Test Accuracy: 0.8500 |
|
``` |