---
pinned: true
sdk: static
---
## Evaluation Pipeline
Use `eval_pipeline.py`, or the raw version of the code below, to evaluate the model. Before running, set `data_path` (the dataset repository or local path) and `model_path` (the fine-tuned model repository or local path).

```python

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated/removed; use the PyTorch optimizer
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_scheduler
from datasets import load_dataset

data_path = ""
model_path = ""
data_files = {"train": "train_data.csv", "validation": "val_data.csv", "test": "test_data.csv"}

dataset_train = load_dataset(data_path, data_files=data_files, split="train")
dataset_val = load_dataset(data_path, data_files=data_files, split="validation")
dataset_test = load_dataset(data_path, data_files=data_files, split="test")

train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=16)

class CustomModel:
    def __init__(self, model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4, max_len=128):
        """
        Initialize the custom model with tokenizer, optimizer, scheduler, and training parameters.
        Args:
            model_name (str): Name of the pretrained BERT model.
            num_labels (int): Number of labels for the classification task.
            lr (float): Learning rate for the optimizer.
            epochs (int): Number of epochs for training.
            max_len (int): Maximum token length for sequences.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.epochs = epochs
        self.max_len = max_len

        # Load tokenizer and model
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        # Define optimizer
        self.optimizer = AdamW(self.model.parameters(), lr=lr)

        # Scheduler placeholder
        self.scheduler = None

        # Device setup
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.model.to(self.device)

    def setup_scheduler(self, train_loader):
        """
        Setup a learning rate scheduler based on training data.
        Args:
            train_loader (DataLoader): Training data loader.
        """
        num_training_steps = len(train_loader) * self.epochs
        self.scheduler = get_scheduler(
            "linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

    def tokenize_batch(self, texts):
        """
        Tokenize a batch of text inputs.
        Args:
            texts (list[str]): List of text strings to tokenize.
        Returns:
            dict: Tokenized inputs with attention masks and input IDs.
        """
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

    def train(self, train_loader):
        """
        Train the model with raw text inputs and labels.
        Args:
            train_loader (DataLoader): Training data loader containing text and labels.
        """
        self.model.train()
        # Make sure the LR scheduler exists before training starts
        if self.scheduler is None:
            self.setup_scheduler(train_loader)
        for epoch in range(self.epochs):
            epoch_loss = 0
            for batch in train_loader:
                texts, labels = batch['title'], batch['labels']  # each batch is a dict with 'title' and 'labels' keys
                labels = labels.to(self.device)

                # Tokenize the batch
                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
                tokenized_inputs['labels'] = labels

                # Forward pass and optimization
                outputs = self.model(**tokenized_inputs)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                epoch_loss += loss.item()
            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss / len(train_loader):.4f}")

    def evaluate(self, test_loader):
        """
        Evaluate the model with raw text inputs and labels.
        Args:
            test_loader (DataLoader): Test data loader containing text and labels.
        Returns:
            Tuple: True labels and predicted labels.
        """
        self.model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for batch in test_loader:
                texts, labels = batch['title'], batch['labels']  # each batch is a dict with 'title' and 'labels' keys
                labels = labels.to(self.device)

                # Tokenize the batch
                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}

                # Forward pass
                outputs = self.model(**tokenized_inputs)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                y_true.extend(labels.tolist())
                y_pred.extend(predictions.tolist())
        return y_true, y_pred

    def save_model(self, save_path):
        """
        Save the model locally in Hugging Face format.
        Args:
            save_path (str): Path to save the model.
        """
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

    def push_model(self, repo_name):
        """
        Push the model to the Hugging Face Hub.
        Args:
            repo_name (str): Repository name on Hugging Face Hub.
        """
        self.model.push_to_hub(repo_name)
        self.tokenizer.push_to_hub(repo_name)

# Load the fine-tuned model from model_path and evaluate it on the test split
custom_model = CustomModel(model_name=model_path, num_labels=2, lr=5e-5, epochs=4)
y_true, y_pred = custom_model.evaluate(test_loader)

print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_true, y_pred))

```
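
The script above only runs evaluation. The `CustomModel` class also exposes `setup_scheduler`, `train`, `save_model`, and `push_model`; below is a minimal sketch of that fine-tuning flow, assuming the base model name, save path, and Hub repository ID are placeholders you replace with your own.

```python
# Minimal sketch: fine-tune, save locally, and optionally push to the Hub.
# "bert-base-uncased", "./bert-finetuned", and the repo name are illustrative placeholders.
custom_model = CustomModel(model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4)
custom_model.setup_scheduler(train_loader)       # linear LR schedule over all training steps
custom_model.train(train_loader)                 # trains on raw 'title' text and 'labels'
custom_model.save_model("./bert-finetuned")      # saves model + tokenizer in Hugging Face format
# custom_model.push_model("your-username/your-model-repo")  # requires a prior `huggingface-cli login`
```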