annabellatian committed on
Commit
d6ed348
·
verified ·
1 Parent(s): 75890af

Upload eval_pipeline.py

Browse files
Files changed (1) hide show
  1. eval_pipeline.py +86 -0
eval_pipeline.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from google.colab import drive
4
+ import torch
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from transformers import BertTokenizer, BertForSequenceClassification, AdamW
7
+ from sklearn.metrics import accuracy_score, classification_report
8
+
9
# Placeholder locations: set both before running the script.
dataset_path = ""
model_path = ""

# Load the CSV and pick the text column and the label column.
news_df = pd.read_csv(dataset_path)
X, y = news_df['title'], news_df['labels']

# Two-stage split with a fixed seed: 20% is held out for testing, then
# 25% of the remaining 80% (0.25 x 0.8 = 0.2 of the total) for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=1,
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
21
+
22
+
23
def tokenize_data(texts, tokenizer, max_len=128):
    """Tokenize a batch of texts into padded, truncated PyTorch tensors.

    Args:
        texts: An iterable of strings (for example a pandas Series), or a
            single string, which is treated as a batch of one.
        tokenizer: A callable accepting a list of strings plus the
            ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors`` keyword arguments, such as a Hugging Face
            tokenizer.
        max_len: Value forwarded as ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    # ``list("some title")`` would split a lone string into single characters
    # and silently tokenize each character as its own example.
    batch = [texts] if isinstance(texts, str) else list(texts)
    return tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
31
+
32
# Encode the train and test texts (in that order) with the shared tokenizer.
train_encodings, test_encodings = (
    tokenize_data(split, tokenizer) for split in (X_train, X_test)
)
35
+
36
+ # Create a custom Dataset class
37
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable object holding one
            row per example (a tensor or a nested list).
        labels: Indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` entry."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # Calling torch.tensor() on an existing tensor emits a "copy construct"
        # UserWarning for every item; detach().clone() is the recommended
        # equivalent and still yields a copy rather than a view.
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)
49
+
50
# Wrap each split's encodings together with its labels as plain Python lists.
train_dataset = NewsDataset(train_encodings, y_train.tolist())
test_dataset = NewsDataset(test_encodings, y_test.tolist())

# Batch both splits in groups of 16; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)
56
+
57
# Choose the device first so the checkpoint can be mapped directly onto it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Without map_location, a checkpoint saved from a GPU run fails to load on a
# CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
62
+
63
+ # Define optimizer and scheduler
64
+ # optimizer = AdamW(model.parameters(), lr=5e-5)
65
+ # num_training_steps = len(train_loader) * 4 # Assume 4 epochs
66
+ # lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
67
+
68
+ # Evaluate the model
69
def evaluate_model(model, test_loader):
    """Run ``model`` over ``test_loader`` and collect labels and predictions.

    The model is switched to eval mode and left in that mode.

    Args:
        model: A ``torch.nn.Module`` whose forward pass accepts each batch as
            keyword arguments and returns an object with a ``logits``
            attribute.
        test_loader: Iterable of batches, each a dict of tensors that
            includes a ``'labels'`` entry.

    Returns:
        A ``(y_true, y_pred)`` tuple of Python lists: the labels read from
        the batches and the argmax of the logits over the last dimension,
        both in iteration order.
    """
    # Follow the model's own placement instead of a module-level ``device``
    # global, so the function also works when imported or when the model
    # lives on another device. A parameter-free model falls back to the CPU.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in test_loader:
            batch = {name: tensor.to(target) for name, tensor in batch.items()}
            logits = model(**batch).logits
            y_true.extend(batch['labels'].tolist())
            y_pred.extend(torch.argmax(logits, dim=-1).tolist())
    return y_true, y_pred
81
+
82
# Score the loaded model on the test loader.
y_true, y_pred = evaluate_model(model, test_loader)

# Report overall accuracy (four decimals) followed by the per-class report.
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print(
    "Classification Report:\n",
    classification_report(y_true, y_pred),
)