import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the torch implementation
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
# from google.colab import drive
from datasets import load_dataset

data_path = ""
model_path = ""
data_files = {"train": "train_data.csv", "validation": "val_data.csv", "test": "test_data.csv"}

dataset_train = load_dataset(data_path, data_files=data_files, split="train")
dataset_val = load_dataset(data_path, data_files=data_files, split="validation")
dataset_test = load_dataset(data_path, data_files=data_files, split="test")

train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=16)
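# NOTE (assumptions): each CSV is expected to expose a text column named "title"
# and an integer label column named "labels", which train()/evaluate() below read
# from every batch. With the placeholders above left empty, a typical call would be
# load_dataset("csv", data_files=data_files, split=...). PyTorch's default collate
# turns each batch into a dict: strings stay a list, integer labels become a tensor.
# dataset_val is loaded but unused here; a DataLoader over it could drive validation.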

class CustomModel:
    def __init__(self, model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4, max_len=128):
        """
        Initialize the custom model with tokenizer, optimizer, scheduler, and training parameters.

        Args:
            model_name (str): Name of the pretrained BERT model.
            num_labels (int): Number of labels for the classification task.
            lr (float): Learning rate for the optimizer.
            epochs (int): Number of epochs for training.
            max_len (int): Maximum token length for sequences.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.epochs = epochs
        self.max_len = max_len

        # Load tokenizer and model
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        # Define optimizer
        self.optimizer = AdamW(self.model.parameters(), lr=lr)

        # Scheduler placeholder
        self.scheduler = None

        # Device setup
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.model.to(self.device)

    def setup_scheduler(self, train_loader):
        """
        Setup a learning rate scheduler based on training data.

        Args:
            train_loader (DataLoader): Training data loader.
        """
        num_training_steps = len(train_loader) * self.epochs
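        # "linear" with num_warmup_steps=0 decays the learning rate linearly from
        # lr down to 0 over the full num_training_steps optimizer steps.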
        self.scheduler = get_scheduler(
            "linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

    def tokenize_batch(self, texts):
        """
        Tokenize a batch of text inputs.

        Args:
            texts (list[str]): List of text strings to tokenize.

        Returns:
            dict: Tokenized inputs with attention masks and input IDs.
        """
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

    def train(self, train_loader):
        """
        Train the model with raw text inputs and labels.

        Args:
            train_loader (DataLoader): Training data loader containing text and labels.
        """
        self.model.train()
        # Create the scheduler lazily if setup_scheduler() was not called first;
        # otherwise self.scheduler.step() below would fail on None.
        if self.scheduler is None:
            self.setup_scheduler(train_loader)
        for epoch in range(self.epochs):
            epoch_loss = 0
            for batch in train_loader:
                # Default collation yields a dict: a list of title strings and a label tensor
                texts, labels = batch['title'], batch['labels']
                labels = labels.to(self.device)

                # Tokenize the batch
                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
                tokenized_inputs['labels'] = labels

                # Forward pass and optimization
                outputs = self.model(**tokenized_inputs)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                epoch_loss += loss.item()
            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss / len(train_loader):.4f}")

    def evaluate(self, test_loader):
        """
        Evaluate the model with raw text inputs and labels.

        Args:
            test_loader (DataLoader): Test data loader containing text and labels.

        Returns:
            Tuple: True labels and predicted labels.
        """
        self.model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for batch in test_loader:
                # Default collation yields a dict: a list of title strings and a label tensor
                texts, labels = batch['title'], batch['labels']
                labels = labels.to(self.device)

                # Tokenize the batch
                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}

                # Forward pass
                outputs = self.model(**tokenized_inputs)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                y_true.extend(labels.tolist())
                y_pred.extend(predictions.tolist())
        return y_true, y_pred

    def save_model(self, save_path):
        """
        Save the model locally in Hugging Face format.

        Args:
            save_path (str): Path to save the model.
        """
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

    def push_model(self, repo_name):
        """
        Push the model to the Hugging Face Hub.

        Args:
            repo_name (str): Repository name on Hugging Face Hub.
        """
        self.model.push_to_hub(repo_name)
        self.tokenizer.push_to_hub(repo_name)

custom_model = CustomModel(model_name=model_path, num_labels=2, lr=5e-5, epochs=4)
# custom_model.setup_scheduler(train_loader)
# custom_model.train(train_loader)
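# NOTE: with the two training lines above left commented out, evaluate() scores
# whatever checkpoint model_path points to (no fine-tuning happens in this run).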
y_true, y_pred = custom_model.evaluate(test_loader)

# Print evaluation metrics
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_true, y_pred))