maxdunhill committed on
Commit 94f7497 · 1 Parent(s): ff1df51

Upload classifier_runs.py


This code fine-tunes a DistilBERT model on a C++ training set of vulnerable/non-vulnerable code.

Because I got the error "TypeError: new(): invalid data type 'str'" when trying to validate the model, the validation functionality from the original codebase (https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb#scrollTo=kT5-oqMPB6vp) has been modified or commented out.

A separate file with the unmodified validation functionality is available, should a member of the community wish to take it upon themselves to get the validation running.
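For context, this TypeError is what torch.tensor() raises when handed raw Python strings, which points at the commented-out 'targets' line in __getitem__ receiving string labels. A minimal sketch of one possible fix, assuming the label column holds class-name strings and a multi-label (one-hot) target format; the encoding choice below is an assumption, not part of the uploaded script:

# Sketch only: encode string labels as float vectors before they reach torch.tensor().
# 'data' is the DataFrame read from the training CSV in classifier_runs.py.
import pandas as pd

one_hot = pd.get_dummies(data['label']).astype(float)
new_df['labels'] = one_hot.values.tolist()   # each row becomes a list of floats
# With numeric targets, the commented-out line in __getitem__ can be restored:
# 'targets': torch.tensor(self.targets[index], dtype=torch.float)

Note that the classifier head in the script emits 6 logits, so the number of label columns (or the head size) would need to match before validation can run end to end.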

Files changed (1)
  1. classifier_runs.py +186 -0
classifier_runs.py ADDED
@@ -0,0 +1,186 @@
+ import warnings
+ warnings.simplefilter('ignore')
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+ from sklearn import metrics
+ import transformers
+ import torch
+ from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+ from transformers import DistilBertTokenizer, DistilBertModel
+ import logging
+ logging.basicConfig(level=logging.ERROR)
+
+ # Setting up the device for GPU usage
+
+ from torch import cuda
+ device = 'cuda' if cuda.is_available() else 'cpu'
+
+ def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
+     acc_list = []
+     for i in range(y_true.shape[0]):
+         set_true = set( np.where(y_true[i])[0] )
+         set_pred = set( np.where(y_pred[i])[0] )
+         tmp_a = None
+         if len(set_true) == 0 and len(set_pred) == 0:
+             tmp_a = 1
+         else:
+             tmp_a = len(set_true.intersection(set_pred))/\
+                     float( len(set_true.union(set_pred)) )
+         acc_list.append(tmp_a)
+     return np.mean(acc_list)
+
+ data = pd.read_csv('Vulnerable code dataset 15_12_22 - Training.csv')
+ #data.drop(['source_name'], inplace=True, axis=1)
+ new_df = pd.DataFrame()
+ new_df['text'] = data['text']
+ new_df['labels'] = data['label']
+ new_df.head()
+
+ # Sections of config
+
+ # Defining some key variables that will be used later on in the training
+ MAX_LEN = 128
+ TRAIN_BATCH_SIZE = 4
+ VALID_BATCH_SIZE = 4
+ EPOCHS = 1
+ LEARNING_RATE = 1e-05
+ tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
+
+ class MultiLabelDataset(Dataset):
+
+     def __init__(self, dataframe, tokenizer, max_len):
+         self.tokenizer = tokenizer
+         self.data = dataframe
+         self.text = dataframe.text
+         self.targets = self.data.labels
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.text)
+
+     def __getitem__(self, index):
+         text = str(self.text[index])
+         text = " ".join(text.split())
+
+         inputs = self.tokenizer.encode_plus(
+             text,
+             None,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             pad_to_max_length=True,
+             return_token_type_ids=True
+         )
+         ids = inputs['input_ids']
+         mask = inputs['attention_mask']
+         token_type_ids = inputs["token_type_ids"]
+
+
+         return {
+             'ids': torch.tensor(ids, dtype=torch.long),
+             'mask': torch.tensor(mask, dtype=torch.long),
+             'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
+             #'targets': torch.tensor(self.targets[index], dtype=torch.float)
+         }
+
+ train_size = 0.8
+ train_data=new_df.sample(frac=train_size,random_state=200)
+ test_data=new_df.drop(train_data.index).reset_index(drop=True)
+ train_data = train_data.reset_index(drop=True)
+
+
+ print("FULL Dataset: {}".format(new_df.shape))
+ print("TRAIN Dataset: {}".format(train_data.shape))
+ print("TEST Dataset: {}".format(test_data.shape))
+
+ training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
+ testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)
+
+ train_params = {'batch_size': TRAIN_BATCH_SIZE,
+                 'shuffle': True,
+                 'num_workers': 0
+                 }
+
+ test_params = {'batch_size': VALID_BATCH_SIZE,
+                'shuffle': True,
+                'num_workers': 0
+                }
+
+ training_loader = DataLoader(training_set, **train_params)
+ testing_loader = DataLoader(testing_set, **test_params)
+
+ # Creating the customized model, by adding a dropout and a dense layer on top of DistilBERT to get the final output for the model.
+
+ class DistilBERTClass(torch.nn.Module):
+     def __init__(self):
+         super(DistilBERTClass, self).__init__()
+         self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
+         self.pre_classifier = torch.nn.Linear(768, 768)
+         self.dropout = torch.nn.Dropout(0.1)
+         self.classifier = torch.nn.Linear(768, 6)
+
+     def forward(self, input_ids, attention_mask, token_type_ids):
+         output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
+         hidden_state = output_1[0]
+         pooler = hidden_state[:, 0]
+         pooler = self.pre_classifier(pooler)
+         pooler = torch.nn.Tanh()(pooler)
+         pooler = self.dropout(pooler)
+         output = self.classifier(pooler)
+         return output
+
+ model = DistilBERTClass()
+ model.to(device)
+
+ def loss_fn(outputs, targets):
+     return torch.nn.BCEWithLogitsLoss()(outputs, targets)
+
+ optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)
+
+ def train(epoch):
+     model.train()
+     for _,data in tqdm(enumerate(training_loader, 0)):
+         ids = data['ids'].to(device, dtype = torch.long)
+         mask = data['mask'].to(device, dtype = torch.long)
+         token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
+         #targets = data['targets'].to(device, dtype = torch.float)
+
+         outputs = model(ids, mask, token_type_ids)
+
+         optimizer.zero_grad()
+         #loss = loss_fn(outputs, targets)
+         #if _%5000==0:
+         #    print(f'Epoch: {epoch}, Loss: {loss.item()}')
+
+         #loss.backward()
+         #optimizer.step()
+
+ #for epoch in range(EPOCHS):
+ #    train(epoch)
+
+ def validation(testing_loader):
+     model.eval()
+     fin_targets=[]
+     fin_outputs=[]
+     with torch.no_grad():
+         for _, data in tqdm(enumerate(testing_loader, 0)):
+             ids = data['ids'].to(device, dtype = torch.long)
+             mask = data['mask'].to(device, dtype = torch.long)
+             token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
+             # targets = data['targets'].to(device, dtype = torch.float)
+             outputs = model(ids, mask, token_type_ids)
+             #fin_targets.extend(targets.cpu().detach().numpy().tolist())
+             fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
+     return fin_outputs, fin_targets
+
+ outputs = validation(testing_loader)
+
+ print(outputs)
+
+ #final_outputs = np.array(outputs) >=0.5
+
+ #val_hamming_loss = metrics.hamming_loss(final_outputs)
+ #val_hamming_score = hamming_score(np.array(final_outputs))
+
+ #print(f"Hamming Score = {val_hamming_score}")
+ #print(f"Hamming Loss = {val_hamming_loss}")
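For reference, the commented-out metrics at the end depend on the hamming_score helper defined at the top of the script. A toy illustration of what it computes, using hypothetical target/prediction matrices (not values from the dataset):

import numpy as np

# 3 samples x 4 labels, hypothetical binary matrices.
y_true = np.array([[1, 0, 1, 0],
                   [0, 1, 0, 0],
                   [1, 1, 0, 1]])
y_pred = np.array([[1, 0, 0, 0],
                   [0, 1, 0, 0],
                   [1, 0, 0, 1]])

# Per-sample |intersection| / |union| of the active label sets, averaged:
# (1/2 + 1/1 + 2/3) / 3 ≈ 0.722
print(hamming_score(y_true, y_pred))

Note also that sklearn's metrics.hamming_loss expects both y_true and y_pred, so the commented call would need the targets restored alongside final_outputs before either metric can be computed.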