---
pinned: true
sdk: static
---
## Evaluation Pipeline
Use `eval_pipeline.py`, or the raw code below, to evaluate the model. Make sure to set the dataset and model paths (`data_path` and `model_path`).
```
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW  # PyTorch AdamW; the transformers implementation is deprecated
from transformers import get_scheduler
from datasets import load_dataset
data_path = ""   # path or Hub repo ID of the dataset containing the CSV files
model_path = ""  # path or Hub repo ID of the fine-tuned model to evaluate
data_files = {"train": "train_data.csv", "validation": "val_data.csv", "test": "test_data.csv"}
dataset_train = load_dataset(data_path, data_files=data_files, split="train")
dataset_val = load_dataset(data_path, data_files=data_files, split="validation")
dataset_test = load_dataset(data_path, data_files=data_files, split="test")
train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=16)
class CustomModel:
    def __init__(self, model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4, max_len=128):
        """
        Initialize the custom model with tokenizer, optimizer, scheduler, and training parameters.
        Args:
            model_name (str): Name of the pretrained BERT model.
            num_labels (int): Number of labels for the classification task.
            lr (float): Learning rate for the optimizer.
            epochs (int): Number of epochs for training.
            max_len (int): Maximum token length for sequences.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.epochs = epochs
        self.max_len = max_len
        # Load tokenizer and model
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # Define optimizer
        self.optimizer = AdamW(self.model.parameters(), lr=lr)
        # Scheduler placeholder
        self.scheduler = None
        # Device setup
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.model.to(self.device)
    def setup_scheduler(self, train_loader):
        """
        Set up a linear learning rate scheduler based on the training data; call this before train().
        Args:
            train_loader (DataLoader): Training data loader.
        """
        num_training_steps = len(train_loader) * self.epochs
        self.scheduler = get_scheduler(
            "linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )
    def tokenize_batch(self, texts):
        """
        Tokenize a batch of text inputs.
        Args:
            texts (list[str]): List of text strings to tokenize.
        Returns:
            dict: Tokenized inputs with attention masks and input IDs.
        """
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
    def train(self, train_loader):
        """
        Train the model with raw text inputs and labels.
        Args:
            train_loader (DataLoader): Training data loader containing text and labels.
        """
        self.model.train()
        for epoch in range(self.epochs):
            epoch_loss = 0
            for batch in train_loader:
                # Each batch is a dict with a 'title' column (list of strings) and a 'labels' column (tensor)
                texts, labels = batch['title'], batch['labels']
                labels = labels.to(self.device)
                # Tokenize the batch
                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
                tokenized_inputs['labels'] = labels
                # Forward pass and optimization (setup_scheduler() must have been called first)
                outputs = self.model(**tokenized_inputs)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                epoch_loss += loss.item()
            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss / len(train_loader):.4f}")
    def evaluate(self, test_loader):
        """
        Evaluate the model with raw text inputs and labels.
        Args:
            test_loader (DataLoader): Test data loader containing text and labels.
        Returns:
            tuple: True labels and predicted labels.
        """
        self.model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for batch in test_loader:
                # Each batch is a dict with a 'title' column (list of strings) and a 'labels' column (tensor)
                texts, labels = batch['title'], batch['labels']
                labels = labels.to(self.device)
                # Tokenize the batch
                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
                # Forward pass
                outputs = self.model(**tokenized_inputs)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                y_true.extend(labels.tolist())
                y_pred.extend(predictions.tolist())
        return y_true, y_pred
    def save_model(self, save_path):
        """
        Save the model locally in Hugging Face format.
        Args:
            save_path (str): Path to save the model.
        """
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)
    def push_model(self, repo_name):
        """
        Push the model to the Hugging Face Hub.
        Args:
            repo_name (str): Repository name on the Hugging Face Hub.
        """
        self.model.push_to_hub(repo_name)
        self.tokenizer.push_to_hub(repo_name)
# Load the fine-tuned model and evaluate it on the test split
custom_model = CustomModel(model_name=model_path, num_labels=2, lr=5e-5, epochs=4)
y_true, y_pred = custom_model.evaluate(test_loader)
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_true, y_pred))
```
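
The block above only runs evaluation. If you still need to produce the fine-tuned model, the same `CustomModel` class covers training, saving, and uploading. The sketch below is a minimal example of that flow, assuming the training CSV exposes the same `title` and `labels` columns used above; the save path and Hub repository name are placeholders.

```
# Minimal training sketch (placeholder paths and repo name; same dataset columns as above)
trainer = CustomModel(model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4)

# The scheduler depends on the number of training steps, so set it up before training
trainer.setup_scheduler(train_loader)
trainer.train(train_loader)

# Persist the fine-tuned model locally in Hugging Face format...
trainer.save_model("./fine_tuned_model")

# ...or push it to the Hub (requires `huggingface-cli login` or an HF_TOKEN in the environment)
trainer.push_model("your-username/your-model")
```

After training, point `model_path` in the evaluation block at `./fine_tuned_model` (or the Hub repo) so both halves of the pipeline use the same weights.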