annabellatian committed
Commit 0425afc · verified · 1 Parent(s): 9378c43

Update README.md

Files changed (1): README.md (+146, -77)
README.md CHANGED
@@ -7,87 +7,156 @@ sdk: static

  import pandas as pd
  from sklearn.model_selection import train_test_split
- from google.colab import drive
  import torch
  from torch.utils.data import Dataset, DataLoader
  from transformers import BertTokenizer, BertForSequenceClassification, AdamW
- from sklearn.metrics import accuracy_score, classification_report

- dataset_path = ""
  model_path = ""

- news_df = pd.read_csv(dataset_path)
-
- X = news_df['title']
- y = news_df['labels']
-
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
- X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2
-
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-
- def tokenize_data(texts, tokenizer, max_len=128):
-     return tokenizer(
-         list(texts),
-         padding=True,
-         truncation=True,
-         max_length=max_len,
-         return_tensors="pt"
-     )
-
- # Tokenize the training and test datasets
- train_encodings = tokenize_data(X_train, tokenizer)
- test_encodings = tokenize_data(X_test, tokenizer)
-
- # Create a custom Dataset class
- class NewsDataset(Dataset):
-     def __init__(self, encodings, labels):
-         self.encodings = encodings
-         self.labels = labels
-
-     def __len__(self):
-         return len(self.labels)
-
-     def __getitem__(self, idx):
-         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
-         item['labels'] = torch.tensor(self.labels[idx])
-         return item
-
- train_dataset = NewsDataset(train_encodings, y_train.tolist())
- test_dataset = NewsDataset(test_encodings, y_test.tolist())
-
- # Load DataLoader for batching
- train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
- test_loader = DataLoader(test_dataset, batch_size=16)
-
- model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
- model.load_state_dict(torch.load(model_path))
-
- device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
- model.to(device)
-
- # Define optimizer and scheduler
- # optimizer = AdamW(model.parameters(), lr=5e-5)
- # num_training_steps = len(train_loader) * 4  # Assume 4 epochs
- # lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
-
- # Evaluate the model
- def evaluate_model(model, test_loader):
-     model.eval()
-     y_true, y_pred = [], []
-     with torch.no_grad():
-         for batch in test_loader:
-             batch = {k: v.to(device) for k, v in batch.items()}
-             outputs = model(**batch)
-             logits = outputs.logits
-             predictions = torch.argmax(logits, dim=-1)
-             y_true.extend(batch['labels'].tolist())
-             y_pred.extend(predictions.tolist())
-     return y_true, y_pred
-
- y_true, y_pred = evaluate_model(model, test_loader)
-
- # Print evaluation metrics
  print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
- print("Classification Report:\n", classification_report(y_true, y_pred))
 
  import pandas as pd
  from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, classification_report
  import torch
  from torch.utils.data import Dataset, DataLoader
  from transformers import BertTokenizer, BertForSequenceClassification, AdamW
+ from transformers import get_scheduler
+ from datasets import load_dataset

+ data_path = ""
  model_path = ""
+ data_files = {"train": "train_data.csv", "validation": "val_data.csv", "test": "test_data.csv"}
+
+ # data_path can be a Hub dataset repo id (or "csv" to read the files above locally)
+ dataset_train = load_dataset(data_path, data_files=data_files, split="train")
+ dataset_val = load_dataset(data_path, data_files=data_files, split="validation")  # loaded but unused below
+ dataset_test = load_dataset(data_path, data_files=data_files, split="test")
+
+ train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)
+ test_loader = DataLoader(dataset_test, batch_size=16)
+
+ class CustomModel:
+     def __init__(self, model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4, max_len=128):
+         """
+         Initialize the custom model with tokenizer, optimizer, scheduler, and training parameters.
+
+         Args:
+             model_name (str): Name of the pretrained BERT model.
+             num_labels (int): Number of labels for the classification task.
+             lr (float): Learning rate for the optimizer.
+             epochs (int): Number of epochs for training.
+             max_len (int): Maximum token length for sequences.
+         """
+         self.model_name = model_name
+         self.num_labels = num_labels
+         self.epochs = epochs
+         self.max_len = max_len
+
+         # Load tokenizer and model
+         self.tokenizer = BertTokenizer.from_pretrained(model_name)
+         self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
+
+         # Define optimizer (note: transformers' AdamW is deprecated in recent
+         # releases; torch.optim.AdamW is a drop-in replacement)
+         self.optimizer = AdamW(self.model.parameters(), lr=lr)
+
+         # Scheduler placeholder; created by setup_scheduler() once the training data is known
+         self.scheduler = None
+
+         # Device setup
+         self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+         self.model.to(self.device)
+
+     def setup_scheduler(self, train_loader):
+         """
+         Set up a linear learning rate scheduler based on the training data.
+
+         Args:
+             train_loader (DataLoader): Training data loader.
+         """
+         num_training_steps = len(train_loader) * self.epochs
+         self.scheduler = get_scheduler(
+             "linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+         )
+
+     def tokenize_batch(self, texts):
+         """
+         Tokenize a batch of text inputs.
+
+         Args:
+             texts (list[str]): List of text strings to tokenize.
+
+         Returns:
+             dict: Tokenized inputs with input IDs and attention masks.
+         """
+         return self.tokenizer(
+             texts,
+             padding=True,
+             truncation=True,
+             max_length=self.max_len,
+             return_tensors="pt"
+         )
+
+     def train(self, train_loader):
+         """
+         Train the model on raw text inputs and labels.
+
+         Args:
+             train_loader (DataLoader): Training data loader yielding text and labels.
+         """
+         if self.scheduler is None:
+             self.setup_scheduler(train_loader)  # guard: avoid stepping a scheduler that was never created
+         self.model.train()
+         for epoch in range(self.epochs):
+             epoch_loss = 0
+             for batch in train_loader:
+                 texts, labels = batch['title'], batch['labels']  # each batch is a dict with 'title' and 'labels' keys
+                 labels = labels.to(self.device)
+
+                 # Tokenize the batch and move it to the device
+                 tokenized_inputs = self.tokenize_batch(texts)
+                 tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
+                 tokenized_inputs['labels'] = labels
+
+                 # Forward pass and optimization
+                 outputs = self.model(**tokenized_inputs)
+                 loss = outputs.loss
+                 loss.backward()
+                 self.optimizer.step()
+                 self.scheduler.step()
+                 self.optimizer.zero_grad()
+                 epoch_loss += loss.item()
+             print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss / len(train_loader):.4f}")
+
+     def evaluate(self, test_loader):
+         """
+         Evaluate the model on raw text inputs and labels.
+
+         Args:
+             test_loader (DataLoader): Test data loader yielding text and labels.
+
+         Returns:
+             tuple: True labels and predicted labels.
+         """
+         self.model.eval()
+         y_true, y_pred = [], []
+         with torch.no_grad():
+             for batch in test_loader:
+                 texts, labels = batch['title'], batch['labels']  # each batch is a dict with 'title' and 'labels' keys
+                 labels = labels.to(self.device)
+
+                 # Tokenize the batch and move it to the device
+                 tokenized_inputs = self.tokenize_batch(texts)
+                 tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
+
+                 # Forward pass
+                 outputs = self.model(**tokenized_inputs)
+                 logits = outputs.logits
+                 predictions = torch.argmax(logits, dim=-1)
+                 y_true.extend(labels.tolist())
+                 y_pred.extend(predictions.tolist())
+         return y_true, y_pred
+
+     def save_model(self, save_path):
+         """
+         Save the model locally in Hugging Face format.
+
+         Args:
+             save_path (str): Path to save the model.
+         """
+         self.model.save_pretrained(save_path)
+         self.tokenizer.save_pretrained(save_path)
+
+     def push_model(self, repo_name):
+         """
+         Push the model to the Hugging Face Hub.
+
+         Args:
+             repo_name (str): Repository name on the Hugging Face Hub.
+         """
+         self.model.push_to_hub(repo_name)
+         self.tokenizer.push_to_hub(repo_name)
+
+ custom_model = CustomModel(model_name=model_path, num_labels=2, lr=5e-5, epochs=4)  # load the fine-tuned checkpoint
+ y_true, y_pred = custom_model.evaluate(test_loader)
  print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
+ print("Classification Report:\n", classification_report(y_true, y_pred))