maxdunhill committed on
Commit 94f7497 · 1 Parent(s): ff1df51

Upload classifier_runs.py


This code fine-tunes a DistilBERT model on a C++ training set of vulnerable/non-vulnerable code.

Because I got the error "TypeError: new(): invalid data type 'str'" when trying to validate the model, the validation functionality from the original codebase (https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb#scrollTo=kT5-oqMPB6vp) has been modified or commented out.

A separate file with the unmodified validation functionality is available, should a member of the community wish to take it upon themselves to get the validation running.
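For context, this TypeError is what torch.tensor() raises when handed raw Python strings, which points at the commented-out 'targets' line in __getitem__ receiving string labels. A minimal sketch of one possible fix, assuming the label column holds class-name strings and a multi-label (one-hot) target format; the encoding choice below is an assumption, not part of the uploaded script:

# Sketch only: encode string labels as float vectors before they reach torch.tensor().
# 'data' is the DataFrame read from the training CSV in classifier_runs.py.
import pandas as pd

one_hot = pd.get_dummies(data['label']).astype(float)
new_df['labels'] = one_hot.values.tolist()   # each row becomes a list of floats
# With numeric targets, the commented-out line in __getitem__ can be restored:
# 'targets': torch.tensor(self.targets[index], dtype=torch.float)

Note that the classifier head in the script emits 6 logits, so the number of label columns (or the head size) would need to match before validation can run end to end.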

Files changed (1)
  1. classifier_runs.py +186 -0
classifier_runs.py ADDED
@@ -0,0 +1,186 @@
+ import warnings
+ warnings.simplefilter('ignore')
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+ from sklearn import metrics
+ import transformers
+ import torch
+ from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+ from transformers import DistilBertTokenizer, DistilBertModel
+ import logging
+ logging.basicConfig(level=logging.ERROR)
+
+ # Setting up the device for GPU usage
+
+ from torch import cuda
+ device = 'cuda' if cuda.is_available() else 'cpu'
+
+ def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
+     acc_list = []
+     for i in range(y_true.shape[0]):
+         set_true = set( np.where(y_true[i])[0] )
+         set_pred = set( np.where(y_pred[i])[0] )
+         tmp_a = None
+         if len(set_true) == 0 and len(set_pred) == 0:
+             tmp_a = 1
+         else:
+             tmp_a = len(set_true.intersection(set_pred))/\
+                     float( len(set_true.union(set_pred)) )
+         acc_list.append(tmp_a)
+     return np.mean(acc_list)
+
+ data = pd.read_csv('Vulnerable code dataset 15_12_22 - Training.csv')
+ #data.drop(['source_name'], inplace=True, axis=1)
+ new_df = pd.DataFrame()
+ new_df['text'] = data['text']
+ new_df['labels'] = data['label']
+ new_df.head()
+
+ # Sections of config
+
+ # Defining some key variables that will be used later on in the training
+ MAX_LEN = 128
+ TRAIN_BATCH_SIZE = 4
+ VALID_BATCH_SIZE = 4
+ EPOCHS = 1
+ LEARNING_RATE = 1e-05
+ tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
+
+ class MultiLabelDataset(Dataset):
+
+     def __init__(self, dataframe, tokenizer, max_len):
+         self.tokenizer = tokenizer
+         self.data = dataframe
+         self.text = dataframe.text
+         self.targets = self.data.labels
+         self.max_len = max_len
+
+     def __len__(self):
+         return len(self.text)
+
+     def __getitem__(self, index):
+         text = str(self.text[index])
+         text = " ".join(text.split())
+
+         inputs = self.tokenizer.encode_plus(
+             text,
+             None,
+             add_special_tokens=True,
+             max_length=self.max_len,
+             pad_to_max_length=True,
+             return_token_type_ids=True
+         )
+         ids = inputs['input_ids']
+         mask = inputs['attention_mask']
+         token_type_ids = inputs["token_type_ids"]
+
+
+         return {
+             'ids': torch.tensor(ids, dtype=torch.long),
+             'mask': torch.tensor(mask, dtype=torch.long),
+             'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
+             #'targets': torch.tensor(self.targets[index], dtype=torch.float)
+         }
+
+ train_size = 0.8
+ train_data=new_df.sample(frac=train_size,random_state=200)
+ test_data=new_df.drop(train_data.index).reset_index(drop=True)
+ train_data = train_data.reset_index(drop=True)
+
+
+ print("FULL Dataset: {}".format(new_df.shape))
+ print("TRAIN Dataset: {}".format(train_data.shape))
+ print("TEST Dataset: {}".format(test_data.shape))
+
+ training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
+ testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)
+
+ train_params = {'batch_size': TRAIN_BATCH_SIZE,
+                 'shuffle': True,
+                 'num_workers': 0
+                 }
+
+ test_params = {'batch_size': VALID_BATCH_SIZE,
+                'shuffle': True,
+                'num_workers': 0
+                }
+
+ training_loader = DataLoader(training_set, **train_params)
+ testing_loader = DataLoader(testing_set, **test_params)
+
+ # Creating the customized model, by adding a dropout and a dense layer on top of DistilBERT to get the final output for the model.
+
+ class DistilBERTClass(torch.nn.Module):
+     def __init__(self):
+         super(DistilBERTClass, self).__init__()
+         self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
+         self.pre_classifier = torch.nn.Linear(768, 768)
+         self.dropout = torch.nn.Dropout(0.1)
+         self.classifier = torch.nn.Linear(768, 6)
+
+     def forward(self, input_ids, attention_mask, token_type_ids):
+         output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
+         hidden_state = output_1[0]
+         pooler = hidden_state[:, 0]
+         pooler = self.pre_classifier(pooler)
+         pooler = torch.nn.Tanh()(pooler)
+         pooler = self.dropout(pooler)
+         output = self.classifier(pooler)
+         return output
+
+ model = DistilBERTClass()
+ model.to(device)
+
+ def loss_fn(outputs, targets):
+     return torch.nn.BCEWithLogitsLoss()(outputs, targets)
+
+ optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)
+
+ def train(epoch):
+     model.train()
+     for _,data in tqdm(enumerate(training_loader, 0)):
+         ids = data['ids'].to(device, dtype = torch.long)
+         mask = data['mask'].to(device, dtype = torch.long)
+         token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
+         #targets = data['targets'].to(device, dtype = torch.float)
+
+         outputs = model(ids, mask, token_type_ids)
+
+         optimizer.zero_grad()
+         #loss = loss_fn(outputs, targets)
+         #if _%5000==0:
+         #    print(f'Epoch: {epoch}, Loss: {loss.item()}')
+
+         #loss.backward()
+         #optimizer.step()
+
+ #for epoch in range(EPOCHS):
+ #    train(epoch)
+
+ def validation(testing_loader):
+     model.eval()
+     fin_targets=[]
+     fin_outputs=[]
+     with torch.no_grad():
+         for _, data in tqdm(enumerate(testing_loader, 0)):
+             ids = data['ids'].to(device, dtype = torch.long)
+             mask = data['mask'].to(device, dtype = torch.long)
+             token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
+             # targets = data['targets'].to(device, dtype = torch.float)
+             outputs = model(ids, mask, token_type_ids)
+             #fin_targets.extend(targets.cpu().detach().numpy().tolist())
+             fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
+     return fin_outputs, fin_targets
+
+ outputs = validation(testing_loader)
+
+ print(outputs)
+
+ #final_outputs = np.array(outputs) >=0.5
+
+ #val_hamming_loss = metrics.hamming_loss(final_outputs)
+ #val_hamming_score = hamming_score(np.array(final_outputs))
+
+ #print(f"Hamming Score = {val_hamming_score}")
+ #print(f"Hamming Loss = {val_hamming_loss}")
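For reference, the commented-out metrics at the end depend on the hamming_score helper defined at the top of the script. A toy illustration of what it computes, using hypothetical target/prediction matrices (not values from the dataset):

import numpy as np

# 3 samples x 4 labels, hypothetical binary matrices.
y_true = np.array([[1, 0, 1, 0],
                   [0, 1, 0, 0],
                   [1, 1, 0, 1]])
y_pred = np.array([[1, 0, 0, 0],
                   [0, 1, 0, 0],
                   [1, 0, 0, 1]])

# Per-sample |intersection| / |union| of the active label sets, averaged:
# (1/2 + 1/1 + 2/3) / 3 ≈ 0.722
print(hamming_score(y_true, y_pred))

Note also that sklearn's metrics.hamming_loss expects both y_true and y_pred, so the commented call would need the targets restored alongside final_outputs before either metric can be computed.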