annabellatian committed
Commit 9378c43 · verified · 1 Parent(s): 6c84346

Update README.md

Files changed (1)
1. README.md (+88 −1)
README.md CHANGED
@@ -3,4 +3,91 @@ pinned: true
  sdk: static
  ---
  ## Evaluation Pipeline
- # Use eval_pipeline.py to evaluate the model. Make sure to set the dataset and model path.
+ # Use eval_pipeline.py or the raw version of the code below to evaluate the model. Make sure to set the dataset and model paths.
+
+ import pandas as pd
+ import torch
+ from torch.utils.data import Dataset, DataLoader
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, classification_report
+ from transformers import BertTokenizer, BertForSequenceClassification
+ # from google.colab import drive  # only needed when reading files from Google Drive in Colab
+
+ # Paths to the evaluation CSV and the fine-tuned model weights.
+ dataset_path = ""
+ model_path = ""
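+ # Example values (hypothetical paths; adjust to your setup):
+ # dataset_path = "news_dataset.csv"
+ # model_path = "bert_news_classifier.pt"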
+
+ news_df = pd.read_csv(dataset_path)
+
+ X = news_df['title']
+ y = news_df['labels']
+
+ # 60/20/20 train/val/test split; the fixed random_state keeps the test set reproducible.
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
+ X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2
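+ # X_val/y_val are produced only to mirror the original split; they are not used below.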
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+ def tokenize_data(texts, tokenizer, max_len=128):
+     return tokenizer(
+         list(texts),
+         padding=True,
+         truncation=True,
+         max_length=max_len,
+         return_tensors="pt"
+     )
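+ # Titles are padded/truncated to at most 128 tokens and returned as PyTorch tensors.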
+
+ # Tokenize the training and test datasets
+ train_encodings = tokenize_data(X_train, tokenizer)
+ test_encodings = tokenize_data(X_test, tokenizer)
+
+ # Create a custom Dataset class
+ class NewsDataset(Dataset):
+     def __init__(self, encodings, labels):
+         self.encodings = encodings
+         self.labels = labels
+
+     def __len__(self):
+         return len(self.labels)
+
+     def __getitem__(self, idx):
+         # The encodings are already tensors (return_tensors="pt"), so index them
+         # directly instead of wrapping them in torch.tensor() again.
+         item = {key: val[idx] for key, val in self.encodings.items()}
+         item['labels'] = torch.tensor(self.labels[idx])
+         return item
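+ # Each item is a dict of input_ids/attention_mask (plus token_type_ids) and a label,
+ # which matches the keyword arguments BertForSequenceClassification expects.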
+
+ train_dataset = NewsDataset(train_encodings, y_train.tolist())
+ test_dataset = NewsDataset(test_encodings, y_test.tolist())
+
+ # Wrap the datasets in DataLoaders for batching
+ train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
+ test_loader = DataLoader(test_dataset, batch_size=16)
+
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
+ # map_location lets GPU-trained weights load on CPU-only machines as well.
+ model.load_state_dict(torch.load(model_path, map_location='cpu'))
+
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ model.to(device)
+
+ # Optimizer and scheduler are only needed for training, not for evaluation.
+ # optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
+ # num_training_steps = len(train_loader) * 4  # assume 4 epochs
+ # lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)  # needs `from transformers import get_scheduler`
+
+ # Evaluate the model
+ def evaluate_model(model, test_loader):
+     model.eval()
+     y_true, y_pred = [], []
+     with torch.no_grad():
+         for batch in test_loader:
+             # Move the whole batch (inputs and labels) to the same device as the model.
+             batch = {k: v.to(device) for k, v in batch.items()}
+             outputs = model(**batch)
+             logits = outputs.logits
+             predictions = torch.argmax(logits, dim=-1)
+             y_true.extend(batch['labels'].tolist())
+             y_pred.extend(predictions.tolist())
+     return y_true, y_pred
+
+ y_true, y_pred = evaluate_model(model, test_loader)
+
+ # Print evaluation metrics
+ print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
+ print("Classification Report:\n", classification_report(y_true, y_pred))
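+
+ # To run the pipeline: fill in dataset_path and model_path above, then execute
+ # `python eval_pipeline.py` (or run the same code cell by cell in a notebook).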