Init commit
- 0.0.ipynb +0 -0
- 1.0.py +194 -0
- 2.2-all.py +287 -0
- 2.2-iter.py +279 -0
- 4.0.ipynb +154 -0
- README.md +13 -0
- class_eval.py +201 -0
- id_score.ipynb +725 -0
- identifier_scoring.py +428 -0
- model_eval.py +283 -0
- model_test.csv +0 -0
- procTest.ipynb +0 -0
- stat_sampling.py +157 -0
- test.csv +0 -0
0.0.ipynb
ADDED
The diff for this file is too large to render. See raw diff.
1.0.py
ADDED
@@ -0,0 +1,194 @@
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch import nn
from torch.nn import init, MarginRankingLoss
from transformers import BertModel, RobertaModel
from transformers import BertTokenizer, RobertaTokenizer
from torch.optim import Adam
from distutils.version import LooseVersion
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from torch.autograd import Variable
from transformers import AutoConfig, AutoModel, AutoTokenizer
import nltk
import re
import Levenshtein
import spacy
import en_core_web_sm
import torch.optim as optim
from torch.distributions import Categorical
from numpy import linalg as LA
from transformers import AutoModelForMaskedLM
from nltk.corpus import wordnet
import torch.nn.functional as F
import random
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import precision_recall_fscore_support
from nltk.corpus import words as wal
from sklearn.utils import resample


# In[56]:


class MyDataset(Dataset):
    def __init__(self,file_name):
        df1 = pd.read_csv(file_name)
        df1 = df1.fillna("")
        res = df1['X']
        self.X_list = res.to_numpy()
        self.y_list = df1['y'].to_numpy()
    def __len__(self):
        return len(self.X_list)
    def __getitem__(self,idx):
        mapi = []
        mapi.append(self.X_list[idx])
        mapi.append(self.y_list[idx])
        return mapi


# In[59]:


class Step1_model(nn.Module):
    def __init__(self, hidden_size=512):
        super(Step1_model, self).__init__()
        self.hidden_size = hidden_size
        self.model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", num_labels=6)
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
        self.config = AutoConfig.from_pretrained("microsoft/graphcodebert-base")
        for name, param in self.model.named_parameters():
            param.requires_grad = True


    def forward(self, mapi):
        X_init = mapi[0]
        X_init = X_init.replace("[MASK]", " ".join([tokenizer.mask_token] * 1))
        y = mapi[1]
        print(y)
        nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
        lb = ' '.join(nl).lower()
        x = tokenizer.tokenize(lb)
        nlab = len(x)
        print(nlab)
        tokens = self.tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')
        input_id_chunki = tokens['input_ids'][0].split(510)
        input_id_chunks = []
        mask_chunks = []
        mask_chunki = tokens['attention_mask'][0].split(510)
        for tensor in input_id_chunki:
            input_id_chunks.append(tensor)
        for tensor in mask_chunki:
            mask_chunks.append(tensor)
        xi = torch.full((1,), fill_value=101)
        yi = torch.full((1,), fill_value=1)
        zi = torch.full((1,), fill_value=102)
        for r in range(len(input_id_chunks)):
            input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)
            input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)
            mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)
            mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)
        di = torch.full((1,), fill_value=0)
        for i in range(len(input_id_chunks)):
            # get required padding length
            pad_len = 512 - input_id_chunks[i].shape[0]
            # check if tensor length satisfies required chunk size
            if pad_len > 0:
                # if padding length is more than 0, we must add padding
                for p in range(pad_len):
                    input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)
                    mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)
        input_ids = torch.stack(input_id_chunks)
        attention_mask = torch.stack(mask_chunks)
        input_dict = {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.int()
        }
        with torch.no_grad():
            outputs = self.model(**input_dict)
        last_hidden_state = outputs.logits.squeeze()
        lhs_agg = []
        if len(last_hidden_state) == 1:
            lhs_agg.append(last_hidden_state)
        else:
            for p in range(len(last_hidden_state)):
                lhs_agg.append(last_hidden_state[p])
        lhs = lhs_agg[0]
        for i in range(len(lhs_agg)):
            if i == 0:
                continue
            lhs+=lhs_agg[i]
        lhs/=len(lhs_agg)
        # print(lhs)
        predicted_prob = torch.softmax(lhs, dim=0)
        if nlab > 6:
            nlab = 6
        pll = -1*torch.log(predicted_prob[nlab-1])
        return {'loss':pll}


# In[60]:


epoch_number = 0
EPOCHS = 5
run_int = 8
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = Step1_model()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
myDs=MyDataset('dat1.csv')
train_loader=DataLoader(myDs,batch_size=1,shuffle=False)
best_loss = torch.full((1,), fill_value=100000)


# In[61]:


flag = 0
def train_one_epoch(transformer_model, dataset):
    global flag
    for batch in dataset:
        p = 0
        inputs = batch
        optimizer.zero_grad()
        for i in range(len(inputs[0])):
            l = []
            l.append(inputs[0][i])
            l.append(inputs[1][i])
            opi = transformer_model(l)
            loss = opi['loss']
            loss.backward()
            optimizer.step()
            if p % 1 == 0:
                print(' batch loss: {}'.format(loss))
    return loss


# In[62]:


for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    model.train(True)
    avg_loss = train_one_epoch(model,train_loader)
    model.train(False)
    print('LOSS train {}'.format(avg_loss))
    if avg_loss < best_loss:
        best_loss = avg_loss
        model_path = 'var_runs_class/model_{}_{}'.format(run_int, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1


# In[ ]:
2.2-all.py
ADDED
@@ -0,0 +1,287 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:
from torch.nn.utils import clip_grad_norm_

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import init, MarginRankingLoss
from torch.optim import Adam
from distutils.version import LooseVersion
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import math
from transformers import AutoConfig, AutoModel, AutoTokenizer
import nltk
import re
import torch.optim as optim
from tqdm import tqdm
from transformers import AutoModelForMaskedLM
import torch.nn.functional as F
import random


# In[2]:
def freeze(model):
    for name, param in model.named_parameters():
        param.requires_grad = True
        if name.startswith("model.roberta.encoder.layer.0"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.1"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.2"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.3"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.4"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.5"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.6"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.7"):
            param.requires_grad = False
        # if name.startswith("model.roberta.encoder.layer.8"):
        #     param.requires_grad = False
        # if name.startswith("model.roberta.encoder.layer.9"):
        #     param.requires_grad = False
    return model

maskis = []
n_y = []
class MyDataset(Dataset):
    def __init__(self,file_name):
        global maskis
        global n_y
        df = pd.read_csv(file_name)
        df = df.fillna("")
        self.inp_dicts = []
        for r in range(df.shape[0]):
            X_init = df['X'][r]
            y = df['y'][r]
            n_y.append(y)
            nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
            lb = ' '.join(nl).lower()
            x = tokenizer.tokenize(lb)
            num_sub_tokens_label = len(x)
            X_init = X_init.replace("[MASK]", " ".join([tokenizer.mask_token] * num_sub_tokens_label))
            tokens = tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')
            input_id_chunki = tokens['input_ids'][0].split(510)
            input_id_chunks = []
            mask_chunks = []
            mask_chunki = tokens['attention_mask'][0].split(510)
            for tensor in input_id_chunki:
                input_id_chunks.append(tensor)
            for tensor in mask_chunki:
                mask_chunks.append(tensor)
            xi = torch.full((1,), fill_value=101)
            yi = torch.full((1,), fill_value=1)
            zi = torch.full((1,), fill_value=102)
            for r in range(len(input_id_chunks)):
                input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)
                input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)
                mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)
                mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)
            di = torch.full((1,), fill_value=0)
            for i in range(len(input_id_chunks)):
                pad_len = 512 - input_id_chunks[i].shape[0]
                if pad_len > 0:
                    for p in range(pad_len):
                        input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)
                        mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)
            vb = torch.ones_like(input_id_chunks[0])
            fg = torch.zeros_like(input_id_chunks[0])
            maski = []
            for l in range(len(input_id_chunks)):
                masked_pos = []
                for i in range(len(input_id_chunks[l])):
                    if input_id_chunks[l][i] == tokenizer.mask_token_id: #103
                        if i != 0 and input_id_chunks[l][i-1] == tokenizer.mask_token_id:
                            continue
                        masked_pos.append(i)
                maski.append(masked_pos)
            maskis.append(maski)
            while (len(input_id_chunks)<250):
                input_id_chunks.append(vb)
                mask_chunks.append(fg)
            input_ids = torch.stack(input_id_chunks)
            attention_mask = torch.stack(mask_chunks)
            input_dict = {
                'input_ids': input_ids.long(),
                'attention_mask': attention_mask.int()
            }
            self.inp_dicts.append(input_dict)
            del input_dict
            del input_ids
            del attention_mask
            del maski
            del mask_chunks
            del input_id_chunks
            del di
            del fg
            del vb
            del mask_chunki
            del input_id_chunki
            del X_init
            del y
            del tokens
            del x
            del lb
            del nl
        del df
    def __len__(self):
        return len(self.inp_dicts)
    def __getitem__(self,idx):
        return self.inp_dicts[idx]


# In[3]:

#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
epoch_number = 0
EPOCHS = 5
run_int = 26
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
#model = model.half()
#model = freeze(model)
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
myDs=MyDataset('dat1.csv')
train_loader=DataLoader(myDs,batch_size=1,shuffle=False)
#model.half()
#model = freeze(model)
best_loss = torch.full((1,), fill_value=100000)
#model = nn.DataParallel(model,device_ids=[0,1])
#model.to(device)


# In[5]:


for epoch in range(EPOCHS):
    loop = tqdm(train_loader, leave=True)
    tot_loss = 0.0
    cntr = 0
    # nbtch = torch.tensor(0.0, requires_grad=True)
    for batch in loop:
        optimizer.zero_grad()
        maxi = torch.tensor(0.0, requires_grad=True)
        for i in range(len(batch['input_ids'])):
            cntr+=1
            maski = maskis[cntr-1]
            li = len(maski)
            input_ids = batch['input_ids'][i][:li]
            att_mask = batch['attention_mask'][i][:li]
            y = n_y[cntr-1]
            print("Ground truth:", y)
            ty = tokenizer.encode(y)[1:-1]
            num_sub_tokens_label = len(ty)
            # input_ids, att_mask = input_ids.to(device),att_mask.to(device)
            outputs = model(input_ids, attention_mask = att_mask)
            last_hidden_state = outputs[0].squeeze()
            l_o_l_sa = []
            sum_state = []
            for t in range(num_sub_tokens_label):
                c = []
                l_o_l_sa.append(c)
            if len(maski) == 1:
                masked_pos = maski[0]
                for k in masked_pos:
                    for t in range(num_sub_tokens_label):
                        l_o_l_sa[t].append(last_hidden_state[k+t])
            else:
                for p in range(len(maski)):
                    masked_pos = maski[p]
                    for k in masked_pos:
                        for t in range(num_sub_tokens_label):
                            if (k+t) >= len(last_hidden_state[p]):
                                l_o_l_sa[t].append(last_hidden_state[p+1][k+t-len(last_hidden_state[p])])
                                continue
                            l_o_l_sa[t].append(last_hidden_state[p][k+t])
            for t in range(num_sub_tokens_label):
                sum_state.append(l_o_l_sa[t][0])
            for i in range(len(l_o_l_sa[0])):
                if i == 0:
                    continue
                for t in range(num_sub_tokens_label):
                    sum_state[t] = sum_state[t] + l_o_l_sa[t][i]
            yip = len(l_o_l_sa[0])
            qw = ""
            val = torch.tensor(0.0, requires_grad=True)
            for t in range(num_sub_tokens_label):
                sum_state[t] /= yip
                print("sum_state: ", sum_state[t])
                print("yip: ", yip)
                probs = F.softmax(sum_state[t], dim=0)
                print("probs: ",probs)
                print("idx: ", probs[ty[t]])
                val = val - torch.log(probs[ty[t]])
                idx = torch.topk(sum_state[t], k=5, dim=0)[1]
                wor = [tokenizer.decode(i.item()).strip() for i in idx]
                for kl in wor:
                    if all(char.isalpha() for char in kl):
                        qw+=kl.capitalize()
                        break
            print("NTokens: ", num_sub_tokens_label)
            # val = val / 5
            val = val / num_sub_tokens_label
            print("Val: ", val)
            maxi = maxi + val
            print("Prediction: ", qw)
            print("*****")
            for c in sum_state:
                del c
            del sum_state
            for c in l_o_l_sa:
                del c
            del l_o_l_sa
            del maski
            del input_ids
            del att_mask
            del last_hidden_state
            del qw

        tot_loss +=maxi
        maxi = maxi / len(batch['input_ids'])
        maxi.backward()
        optimizer.step()
        if cntr%200 == 0:
            checkpoint = {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'cntr': cntr # Add any additional information you want>
            }
            model_path = 'var_runs/model_{}_{}_{}.pth'.format(run_>
            torch.save(checkpoint, model_path)
        # nbtch = nbtch + maxi
        # if cntr % 4 == 0:
        #     nbtch = nbtch / 4
        #     nbtch.backward()
        #     for name, param in model.named_parameters():
        #         if param.grad is not None:
        #             print(f'Parameter: {name}')
        #             print(f'Gradient Norm: {param.grad.norm().item()}')
        #             print(f'Gradient Values: {param.grad}')
        #     max_grad_norm = 1.0 # You can adjust this value as needed
        #     clip_grad_norm_(model.parameters(), max_grad_norm)
        #     optimizer.step()
        #     nbtch = 0.0
        # print(list(model.parameters())[0].grad)
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=maxi.item())
    tot_loss/=len(myDs)
    print(tot_loss)
    if tot_loss < best_loss:
        best_loss = tot_loss
        model_path = 'var_runs/model_{}_{}'.format(run_int, epoch)
        torch.save(model.state_dict(), model_path)


# In[ ]:
2.2-iter.py
ADDED
@@ -0,0 +1,279 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import init, MarginRankingLoss
from torch.optim import Adam
from distutils.version import LooseVersion
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import math
from transformers import AutoConfig, AutoModel, AutoTokenizer
import nltk
import re
import torch.optim as optim
from tqdm import tqdm
from transformers import AutoModelForMaskedLM
import torch.nn.functional as F
import random


# In[2]:

def freeze(model):
    for name, param in model.named_parameters():
        param.requires_grad = True
        if name.startswith("model.roberta.encoder.layer.0"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.1"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.2"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.3"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.4"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.5"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.6"):
            param.requires_grad = False
        if name.startswith("model.roberta.encoder.layer.7"):
            param.requires_grad = False
        # if name.startswith("model.roberta.encoder.layer.8"):
        #     param.requires_grad = False
        # if name.startswith("model.roberta.encoder.layer.9"):
        #     param.requires_grad = False
    return model

maskis = []
n_y = []
class MyDataset(Dataset):
    def __init__(self,file_name):
        global maskis
        global n_y
        df = pd.read_csv(file_name)
        df = df.sample(frac=1)
        df = df.fillna("")
        self.inp_dicts = []
        tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
        for r in range(df.shape[0]):
            X_init = df['X'][r]
            y = df['y'][r]
            n_y.append(y)
            nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
            lb = ' '.join(nl).lower()
            x = tokenizer.tokenize(lb)
            num_sub_tokens_label = len(x)
            X_init = X_init.replace("[MASK]", " ".join([tokenizer.mask_token] * num_sub_tokens_label))
            tokens = tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')
            input_id_chunki = tokens['input_ids'][0].split(510)
            input_id_chunks = []
            mask_chunks = []
            mask_chunki = tokens['attention_mask'][0].split(510)
            for tensor in input_id_chunki:
                input_id_chunks.append(tensor)
            for tensor in mask_chunki:
                mask_chunks.append(tensor)
            xi = torch.full((1,), fill_value=101)
            yi = torch.full((1,), fill_value=1)
            zi = torch.full((1,), fill_value=102)
            for r in range(len(input_id_chunks)):
                input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)
                input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)
                mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)
                mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)
            di = torch.full((1,), fill_value=0)
            for i in range(len(input_id_chunks)):
                pad_len = 512 - input_id_chunks[i].shape[0]
                if pad_len > 0:
                    for p in range(pad_len):
                        input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)
                        mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)
            vb = torch.ones_like(input_id_chunks[0])
            fg = torch.zeros_like(input_id_chunks[0])
            maski = []
            for l in range(len(input_id_chunks)):
                masked_pos = []
                for i in range(len(input_id_chunks[l])):
                    if input_id_chunks[l][i] == tokenizer.mask_token_id: #103
                        if i != 0 and input_id_chunks[l][i-1] == tokenizer.mask_token_id:
                            continue
                        masked_pos.append(i)
                maski.append(masked_pos)
            maskis.append(maski)
            while (len(input_id_chunks)<250):
                input_id_chunks.append(vb)
                mask_chunks.append(fg)
            input_ids = torch.stack(input_id_chunks)
            attention_mask = torch.stack(mask_chunks)
            input_dict = {
                'input_ids': input_ids.long(),
                'attention_mask': attention_mask.int()
            }
            self.inp_dicts.append(input_dict)
            del input_dict
            del input_ids
            del attention_mask
            del maski
            del mask_chunks
            del input_id_chunks
            del di
            del fg
            del vb
            del mask_chunki
            del input_id_chunki
            del X_init
            del y
            del tokens
            del x
            del lb
            del nl
        del df
    def __len__(self):
        return len(self.inp_dicts)
    def __getitem__(self,idx):
        return self.inp_dicts[idx]


# In[3]:


def my_func():
    global maskis
    global n_y
    epoch_number = 0
    EPOCHS = 5
    run_int = 26
    tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
    model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
    # model = model.half()
    # model = freeze(model)
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
    myDs=MyDataset('dat1.csv')
    train_loader=DataLoader(myDs,batch_size=1,shuffle=False)
    best_loss = torch.full((1,), fill_value=100000)
    for epoch in range(EPOCHS):
        loop = tqdm(train_loader, leave=True)
        tot_loss = 0.0
        cntr = 0
        for batch in loop:
            try:
                optimizer.zero_grad()
                maxi = torch.tensor(0.0, requires_grad=True)
                for i in range(len(batch['input_ids'])):
                    cntr+=1
                    maski = maskis[cntr-1]
                    li = len(maski)
                    input_ids = batch['input_ids'][i][:li]
                    att_mask = batch['attention_mask'][i][:li]
                    y = n_y[cntr-1]
                    print("Ground truth:", y)
                    ty = tokenizer.encode(y)[1:-1]
                    num_sub_tokens_label = len(ty)
                    qw = ""
                    val = torch.tensor(0.0, requires_grad=True)
                    for m in range(num_sub_tokens_label):
                        outputs = model(input_ids, attention_mask = att_mask)
                        last_hidden_state = outputs[0].squeeze()
                        l_o_l_sa = []
                        sum_state = []
                        if len(maski) == 1:
                            masked_pos = maski[0]
                            for k in masked_pos:
                                l_o_l_sa.append(last_hidden_state[k+m])
                        else:
                            for p in range(len(maski)):
                                masked_pos = maski[p]
                                for k in masked_pos:
                                    if (k+m) >= len(last_hidden_state[p]):
                                        l_o_l_sa.append(last_hidden_state[p+1][k+m-len(last_hidden_state[p])])
                                        continue
                                    l_o_l_sa.append(last_hidden_state[p][k+m])
                        sum_state = l_o_l_sa[0]
                        for i in range(len(l_o_l_sa)):
                            if i == 0:
                                continue
                            sum_state = sum_state + l_o_l_sa[i]
                        yip = len(l_o_l_sa)
                        sum_state = sum_state / yip
                        probs = F.softmax(sum_state, dim=0)
                        val = val - torch.log(probs[ty[m]])
                        idx = torch.topk(sum_state, k=5, dim=0)[1]
                        wor= [tokenizer.decode(i.item()).strip() for i in idx]
                        for kl in wor:
                            if all(char.isalpha() for char in kl):
                                qw+=kl.capitalize()
                                break
                        des = input_ids.clone()
                        if len(maski) == 1:
                            masked_pos = maski[0]
                            for k in masked_pos:
                                des[k+m] = idx[0]
                        else:
                            for p in range(len(maski)):
                                masked_pos = maski[p]
                                for k in masked_pos:
                                    if (k+m) >= len(des[p]):
                                        des[p+1][k+m-len(des[p])] = idx[0]
                                        continue
                                    des[p][k+m] = idx[0]
                        del input_ids
                        input_ids = des
                        for c in sum_state:
                            del c
                        del sum_state
                        for c in l_o_l_sa:
                            del c
                        del l_o_l_sa
                        del last_hidden_state
                    val = val / num_sub_tokens_label
                    maxi = maxi + val
                    print("Prediction: ", qw)
                    print("*****")
                    del maski
                    del input_ids
                    del att_mask
                    del qw
                tot_loss +=maxi
                maxi = maxi / len(batch['input_ids'])
                maxi.backward()
                optimizer.step()
                if cntr%200 == 0:
                    checkpoint = {
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'cntr': cntr # Add any additional information you want>
                    }
                    model_path = 'var_runs_iter/model_{}_{}_{}.pth'.format(run_>
                    torch.save(checkpoint,model_path)
                # print(list(model.parameters())[0].grad)
                loop.set_description(f'Epoch {epoch}')
                loop.set_postfix(loss=maxi.item())
            except:
                continue
        tot_loss/=len(myDs)
        print(tot_loss)
        if tot_loss < best_loss:
            best_loss = tot_loss
            model_path = 'var_runs_iter/model_{}_{}'.format(run_int, epoch)
            torch.save(model.state_dict(), model_path)


# In[ ]:


if __name__ == "__main__":
    my_func()


# In[ ]:
4.0.ipynb
ADDED
@@ -0,0 +1,154 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7a478c86",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-31T01:55:12.877222Z",
     "start_time": "2023-07-31T01:55:12.874203Z"
    }
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "61571547",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-31T01:57:23.295679Z",
     "start_time": "2023-07-31T01:57:23.292514Z"
    }
   },
   "outputs": [],
   "source": [
    "t = 2314"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5d7eda84",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-31T01:56:12.033568Z",
     "start_time": "2023-07-31T01:56:12.025127Z"
    }
   },
   "outputs": [],
   "source": [
    "def download_java_files(url, path):\n",
    "    global t\n",
    "    # Send GET request to retrieve the folder contents\n",
    "    response = requests.get(url, params={\"ref\": branch})\n",
    "    if response.status_code == 200:\n",
    "        # Parse the response JSON\n",
    "        contents = response.json()\n",
    "\n",
    "        for item in contents:\n",
    "            if item[\"type\"] == \"file\" and item[\"name\"].endswith(\".java\"):\n",
    "                # Download Java file\n",
    "                download_url = item[\"download_url\"]\n",
    "                file_name = \"train_\"+str(t)\n",
    "                output_file_path = f\"Desktop/MITACS/Dataset/inp-txt/{file_name}\"\n",
    "\n",
    "                # Send GET request to download the file content\n",
    "                file_content = requests.get(download_url).text\n",
    "\n",
    "                # Save the file to the output directory\n",
    "                with open(output_file_path, \"w\") as output_file:\n",
    "                    output_file.write(file_content)\n",
    "\n",
    "                print(f\"Downloaded: {file_name}\")\n",
    "                t+=1\n",
    "            elif item[\"type\"] == \"dir\":\n",
    "                # Recursively navigate into subfolders\n",
    "                subfolder_url = item[\"url\"]\n",
    "                subfolder_path = f\"{path}/{item['name']}\"\n",
    "                download_java_files(subfolder_url, subfolder_path)\n",
    "    else:\n",
    "        print(\"Failed to retrieve folder contents.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "dafd2b43",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-31T01:59:48.822445Z",
     "start_time": "2023-07-31T01:59:48.667420Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed to retrieve folder contents.\n",
      "2314\n"
     ]
    }
   ],
   "source": [
    "# GitHub repository details\n",
    "repo = \"leakcanary\"\n",
    "owner = \"square\"\n",
    "branch = \"main\" # Specify the branch you want to download from\n",
    "output_directory = \"DesktopDataset/inp-txt\" # Specify the directory to save the downloaded files\n",
    "# API endpoint to get the list of files in the repository\n",
    "url = f\"https://api.github.com/repos/{owner}/{repo}/contents\"\n",
    "download_java_files(url, output_directory)\n",
    "print(t)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f51e193d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10 (tensorflown)",
   "language": "python",
   "name": "tensorflown"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
README.md
ADDED
@@ -0,0 +1,13 @@
# Identifier-Renaming
Generating higher-quality identifier names by using context and following conventions with RLHF <br>
0.0.ipynb generates the CSV dataset from the .java files produced by 4.0 <br>
1.0.py is the classifier that predicts the number of mask tokens to insert for a variable name <br>
2.2-all.py fine-tunes the GraphCodeBERT model on variable names <br>
4.0.ipynb creates the dataset: repositories are cloned and the notebook iterates over all their files, preprocessing them for dataset creation <br>
procTest.ipynb processes the text generated while evaluating the trained model and produces the relevant graphs <br>
class_eval.py evaluates the performance of the classifier <br>
identifier_scoring.py uses two non-fine-tuned models, GraphCodeBERT and CodeBERT, for the readability metric <br>
stat_sampling.py evaluates a random-sampling technique for predicting the number of mask tokens <br>
model_eval.py evaluates the trained model <br>
model_test.csv is the subset of the data used to evaluate the model <br>
test.csv is the dataset used to evaluate the readability metric and the fine-tuned model <br>
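The scripts in this commit share one preprocessing convention: each CSV row stores the code context with a single [MASK] placeholder for the identifier, and the training/evaluation scripts expand that placeholder into one tokenizer mask token per sub-token of the ground-truth name before encoding (1.0.py instead predicts how many such tokens are needed). The snippet below is a minimal sketch of that expansion step, not part of the repository; the code string and the identifier "totalCells" are invented examples.

# Minimal sketch (not part of this commit) of the shared [MASK]-expansion step;
# the code string and the identifier "totalCells" are made-up examples.
import re
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")

def expand_mask(code_with_placeholder, identifier):
    # Split the identifier into camelCase / digit parts, as the scripts do.
    parts = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', identifier)
    # Count the sub-tokens of the lower-cased, space-joined name.
    sub_tokens = tokenizer.tokenize(' '.join(parts).lower())
    # Replace the single placeholder with one mask token per sub-token.
    return code_with_placeholder.replace("[MASK]", " ".join([tokenizer.mask_token] * len(sub_tokens)))

print(expand_mask("int [MASK] = rowCount * colCount;", "totalCells"))

2.2-all.py then reads the model's outputs at exactly these mask positions and accumulates the negative log-probability of the corresponding ground-truth sub-tokens as its training loss.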
class_eval.py
ADDED
@@ -0,0 +1,201 @@
#!/usr/bin/env python
# coding: utf-8

# In[73]:


import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch import nn
from torch.nn import init, MarginRankingLoss
from transformers import BertModel, RobertaModel
from transformers import BertTokenizer, RobertaTokenizer
from torch.optim import Adam
from distutils.version import LooseVersion
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from torch.autograd import Variable
from transformers import AutoConfig, AutoModel, AutoTokenizer
import nltk
import re
import Levenshtein
import spacy
import en_core_web_sm
import torch.optim as optim
from torch.distributions import Categorical
from numpy import linalg as LA
from transformers import AutoModelForMaskedLM
from nltk.corpus import wordnet
import torch.nn.functional as F
import random
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import precision_recall_fscore_support
from nltk.corpus import words as wal
from sklearn.utils import resample


# In[56]:


class MyDataset(Dataset):
    def __init__(self,file_name):
        df1 = pd.read_csv(file_name)
        df1 = df1[230000:]
        df1 = df1.fillna("")
        res = df1['X']
        self.X_list = res.to_numpy()
        self.y_list = df1['y'].to_numpy()
    def __len__(self):
        return len(self.X_list)
    def __getitem__(self,idx):
        mapi = []
        mapi.append(self.X_list[idx])
        mapi.append(self.y_list[idx])
        return mapi


# In[59]:


class Step1_model(nn.Module):
    def __init__(self, hidden_size=512):
        super(Step1_model, self).__init__()
        self.hidden_size = hidden_size
        self.model = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=6)
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
        self.config = AutoConfig.from_pretrained("microsoft/codebert-base")

    def forward(self, mapi):
        X_init = mapi[0]
        X_init = X_init.replace("[MASK]", " ".join([tokenizer.mask_token] * 1))
        y = mapi[1]
        print(y)
        nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
        lb = ' '.join(nl).lower()
        x = tokenizer.tokenize(lb)
        nlab = len(x)
        print(nlab)
        tokens = self.tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')
        input_id_chunki = tokens['input_ids'][0].split(510)
        input_id_chunks = []
        mask_chunks = []
        mask_chunki = tokens['attention_mask'][0].split(510)
        for tensor in input_id_chunki:
            input_id_chunks.append(tensor)
        for tensor in mask_chunki:
            mask_chunks.append(tensor)
        xi = torch.full((1,), fill_value=101)
        yi = torch.full((1,), fill_value=1)
        zi = torch.full((1,), fill_value=102)
        for r in range(len(input_id_chunks)):
            input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)
            input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)
            mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)
            mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)
        di = torch.full((1,), fill_value=0)
        for i in range(len(input_id_chunks)):
            pad_len = 512 - input_id_chunks[i].shape[0]
            if pad_len > 0:
                for p in range(pad_len):
                    input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)
                    mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)
        input_ids = torch.stack(input_id_chunks)
        attention_mask = torch.stack(mask_chunks)
        input_dict = {
            'input_ids': input_ids.long(),
            'attention_mask': attention_mask.int()
        }
        with torch.no_grad():
            outputs = self.model(**input_dict)
        last_hidden_state = outputs.logits.squeeze()
        lhs_agg = []
        if len(last_hidden_state) == 1:
            lhs_agg.append(last_hidden_state)
        else:
            for p in range(len(last_hidden_state)):
                lhs_agg.append(last_hidden_state[p])
        lhs = lhs_agg[0]
        for i in range(len(lhs_agg)):
            if i == 0:
                continue
            lhs+=lhs_agg[i]
        lhs/=len(lhs_agg)
        print(lhs)
        predicted_prob = torch.softmax(lhs, dim=0)
        if nlab > 6:
            nlab = 6
        pll = -1*torch.log(predicted_prob[nlab-1])

        pred = torch.argmax(predicted_prob).item()
        pred+=1
        print(pred)
        predicted = torch.tensor([pred], dtype = float)
        if pred == nlab:
            l2 = 0
        else:
            l2 = 1
        actual = torch.tensor([nlab], dtype = float)
        l1 = Variable(torch.tensor([(actual-predicted)**2],dtype=float),requires_grad = True)
        return {'loss1':l1, 'loss2':l2}


# In[60]:


epoch_number = 0
EPOCHS = 5
run_int = 0
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = Step1_model()
myDs=MyDataset('dat_test.csv')
train_loader=DataLoader(myDs,batch_size=2,shuffle=True)
best_loss = torch.full((1,), fill_value=100000)


# In[61]:


flag = 0
def train_one_epoch(transformer_model, dataset):
    global flag
    tot_loss1 = 0.0
    tot_loss2 = 0.0
    cnt = 0
    for batch in dataset:
        p = 0
        inputs = batch
        for i in range(len(inputs[0])):
            cnt += 1
            l = []
            l.append(inputs[0][i])
            l.append(inputs[1][i])
            opi = transformer_model(l)
            loss1 = opi['loss1']
            loss2 = opi['loss2']
            tot_loss1 += loss1
            tot_loss2 += loss2

    tot_loss1/=cnt
    tot_loss2/=cnt
    print('MSE: ')
    print(tot_loss1)
    print('Acc: ',tot_loss2)
    return {'tot loss1': tot_loss1,'tot_loss2':tot_loss2}


# In[62]:

model.eval()
avg_loss = train_one_epoch(model,train_loader)


# In[ ]:
ADDED
@@ -0,0 +1,725 @@
|
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "aba235f2",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-27T18:12:18.439224Z",
     "start_time": "2023-09-27T18:12:12.646006Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "from torch.nn import init, MarginRankingLoss\n",
    "from transformers import BertModel, RobertaModel\n",
    "from transformers import BertTokenizer, RobertaTokenizer\n",
    "from torch.optim import Adam\n",
    "from distutils.version import LooseVersion\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from torch.utils.tensorboard import SummaryWriter\n",
    "from datetime import datetime\n",
    "from torch.autograd import Variable\n",
    "from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer\n",
    "import torch.optim as optim\n",
    "from torch.distributions import Categorical\n",
    "import random\n",
    "from transformers import AutoModelForMaskedLM, BertForMaskedLM, AdamW\n",
    "from transformers import BertTokenizer\n",
    "from tqdm import tqdm\n",
    "import matplotlib.pyplot as plt\n",
    "from transformers import XLMRobertaTokenizer\n",
    "import os\n",
    "import csv\n",
    "from sklearn.model_selection import train_test_split\n",
    "import nltk\n",
    "from collections import defaultdict\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk import pos_tag\n",
    "from nltk.tokenize import word_tokenize\n",
    "import math\n",
    "from nltk.corpus import words\n",
    "from sklearn.model_selection import train_test_split\n",
    "import random\n",
    "import re\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ddeeea22",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-27T18:12:18.442893Z",
     "start_time": "2023-09-27T18:12:18.440610Z"
    }
   },
   "outputs": [],
   "source": [
    "class MyDataset(Dataset):\n",
    "    def __init__(self,file_name):\n",
    "        df1 = pd.read_csv(file_name)\n",
    "        df1 = df1[200:300]\n",
    "        df1 = df1.fillna(\"\")\n",
    "        res = df1['X'].to_numpy()\n",
    "        self.X_list = res\n",
    "        self.y_list = df1['y'].to_numpy()\n",
    "    def __len__(self):\n",
    "        return len(self.X_list)\n",
    "    def __getitem__(self,idx):\n",
    "        mapi = []\n",
    "        mapi.append(self.X_list[idx])\n",
    "        mapi.append(self.y_list[idx])\n",
    "        return mapi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dd2fe8b9",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-09-27T18:12:18.466279Z",
     "start_time": "2023-09-27T18:12:18.443804Z"
    }
   },
   "outputs": [],
   "source": [
    "class Step1_model(nn.Module):\n",
    "    def __init__(self, hidden_size=512):\n",
    "#         global old_inp\n",
    "#         global old_mhs\n",
    "#         self.oi = old_inp\n",
    "#         self.old_mhs = old_mhs\n",
    "        super(Step1_model, self).__init__()\n",
    "        self.hidden_size = hidden_size\n",
    "#         self.model = AutoModel.from_pretrained(\"roberta-base\")\n",
    "#         self.tokenizer = AutoTokenizer.from_pretrained(\"roberta-base\")\n",
    "#         self.config = AutoConfig.from_pretrained(\"roberta-base\")\n",
    "        self.model = AutoModelForMaskedLM.from_pretrained('microsoft/graphcodebert-base')\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(\"microsoft/graphcodebert-base\")\n",
    "        self.config = AutoConfig.from_pretrained(\"microsoft/graphcodebert-base\")\n",
    "        self.linear_layer = nn.Linear(self.model.config.vocab_size, self.model.config.vocab_size)\n",
    "\n",
    "#         self.model = AutoModelForMaskedLM.from_pretrained('bert-base-cased')\n",
    "#         self.tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
    "#         self.config = AutoConfig.from_pretrained(\"bert-base-cased\")\n",
    "        for param in self.model.base_model.parameters():\n",
    "            param.requires_grad = True\n",
    "    def foo (self,data):\n",
    "        result = []\n",
    "        if type(data) == tuple:\n",
    "            return data[1]\n",
    "        if type(data) == list:\n",
    "            for inner in data:\n",
    "                result.append(foo(inner))\n",
    "        res = []\n",
    "        for a in result[0]:\n",
    "            res.append(a[:2])\n",
    "        return res\n",
    "    def loss_func1(self, word, y):\n",
    "        if word =='NA':\n",
    "            return torch.full((1,), fill_value=100)\n",
    "        try:\n",
    "            pred_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', word)\n",
    "            target_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', y)\n",
    "            pred_tag = self.foo(nltk.pos_tag(pred_list))\n",
    "            target_tag = self.foo(nltk.pos_tag(target_list))\n",
    "            str1 = ' '.join(pred_tag) # Convert lists to strings\n",
    "            str2 = ' '.join(target_tag)\n",
    "            distance = Levenshtein.distance(str1, str2)\n",
    "            dist = torch.Tensor([distance])\n",
    "        except:\n",
    "            dist = torch.Tensor([2*len(target_list)])\n",
    "        return dist\n",
    "    def loss_func2(self, word, y):\n",
    "        if word =='NA':\n",
    "            return torch.full((1,), fill_value=100)\n",
    "        nlp = en_core_web_sm.load()\n",
    "        pred_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', word)\n",
    "        target_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', y)\n",
    "        try:\n",
    "            str1 = ' '.join(pred_list) # Convert lists to strings\n",
    "            str2 = ' '.join(target_list)\n",
    "            tokens1 = nlp(str1)\n",
    "            tokens2 = nlp(str2)\n",
    "            # Calculate the average word embedding for each string\n",
    "            embedding1 = sum(token.vector for token in tokens1) / len(tokens1)\n",
    "            embedding2 = sum(token.vector for token in tokens2) / len(tokens2)\n",
    "            # Calculate the cosine similarity between the embeddings\n",
    "            w1= LA.norm(embedding1)\n",
    "            w2= LA.norm(embedding2)\n",
    "            distance = 1 - (embedding1.dot(embedding2) / (w1 * w2))\n",
    "            dist = torch.Tensor([distance])\n",
    "        except:\n",
    "            dist = torch.Tensor([1])\n",
    "        return dist\n",
    "    def forward(self, mapi):\n",
    "        global variable_names\n",
    "        global base_model\n",
    "        global tot_pll\n",
    "        global base_tot_pll\n",
    "        X_init1 = mapi[0]\n",
    "        X_init = mapi[0]\n",
    "        y = mapi[1]\n",
    "        print(y)\n",
    "        y_tok = self.tokenizer.encode(y)[1:-1]\n",
    "        nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', y)\n",
    "        lb = ' '.join(nl).lower()\n",
    "        x = self.tokenizer.tokenize(lb)\n",
    "        num_sub_tokens_label = len(x)\n",
    "        X_init = X_init.replace(\"[MASK]\", \" \".join([self.tokenizer.mask_token] * num_sub_tokens_label))\n",
    "        sent_pll = 0.0\n",
    "        base_sent_pll = 0.0\n",
    "        for m in range(num_sub_tokens_label):\n",
    "            print(m)\n",
    "            tokens = self.tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')\n",
    "            input_id_chunki = tokens['input_ids'][0].split(510)\n",
    "            input_id_chunks = []\n",
    "            mask_chunks = []\n",
    "            mask_chunki = tokens['attention_mask'][0].split(510)\n",
    "            for tensor in input_id_chunki:\n",
    "                input_id_chunks.append(tensor)\n",
    "            for tensor in mask_chunki:\n",
    "                mask_chunks.append(tensor)\n",
    "            xi = torch.full((1,), fill_value=101)\n",
    "            yi = torch.full((1,), fill_value=1)\n",
    "            zi = torch.full((1,), fill_value=102)\n",
    "            for r in range(len(input_id_chunks)):\n",
    "                input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)\n",
    "                input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)\n",
    "                mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)\n",
    "                mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)\n",
    "            di = torch.full((1,), fill_value=0)\n",
    "            for i in range(len(input_id_chunks)):\n",
    "                pad_len = 512 - input_id_chunks[i].shape[0]\n",
    "                if pad_len > 0:\n",
    "                    for p in range(pad_len):\n",
    "                        input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)\n",
    "                        mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)\n",
    "            input_ids = torch.stack(input_id_chunks)\n",
    "            attention_mask = torch.stack(mask_chunks)\n",
    "            input_dict = {\n",
    "                'input_ids': input_ids.long(),\n",
    "                'attention_mask': attention_mask.int()\n",
    "            }\n",
    "            maski = []\n",
    "            u = 0\n",
    "            ad = 0\n",
    "            for l in range(len(input_dict['input_ids'])):\n",
    "                masked_pos = []\n",
    "                for i in range(len(input_dict['input_ids'][l])):\n",
    "                    if input_dict['input_ids'][l][i] == 50264: #103\n",
    "                        u+=1\n",
    "                        if i != 0 and input_dict['input_ids'][l][i-1] == 50264:\n",
    "                            continue\n",
    "                        masked_pos.append(i)\n",
    "                        ad+=1\n",
    "                maski.append(masked_pos)\n",
    "            print('number of mask tok',u)\n",
    "            print('number of seq', ad)\n",
    "            with torch.no_grad():\n",
    "                output = self.model(**input_dict)\n",
    "                base_output = base_model(**input_dict)\n",
    "            last_hidden_state = output[0].squeeze()\n",
    "            base_last_hidden_state = base_output[0].squeeze()\n",
    "            l_o_l_sa = []\n",
    "            base_l_o_l_sa = []\n",
    "            if len(maski) == 1:\n",
    "                masked_pos = maski[0]\n",
    "                for k in masked_pos:\n",
    "                    l_o_l_sa.append(last_hidden_state[k])\n",
    "                    base_l_o_l_sa.append(base_last_hidden_state[k])\n",
    "            else:\n",
    "                for p in range(len(maski)):\n",
    "                    masked_pos = maski[p]\n",
    "                    for k in masked_pos:\n",
    "                        l_o_l_sa.append(last_hidden_state[p][k])\n",
    "                        base_l_o_l_sa.append(base_last_hidden_state[p][k])\n",
    "            sum_state = l_o_l_sa[0]\n",
    "            base_sum_state = base_l_o_l_sa[0]\n",
    "            for i in range(len(l_o_l_sa)):\n",
    "                if i == 0:\n",
    "                    continue\n",
    "                sum_state += l_o_l_sa[i]\n",
    "                base_sum_state += base_l_o_l_sa[i]\n",
    "            yip = len(l_o_l_sa)\n",
    "            sum_state /= yip\n",
    "            base_sum_state /= yip\n",
    "            probs = F.softmax(sum_state, dim=0)\n",
    "            base_probs = F.softmax(base_sum_state, dim=0)\n",
    "            a_lab = y_tok[m]\n",
    "            prob = probs[a_lab]\n",
    "            base_prob = base_probs[a_lab]\n",
    "            log_prob = -1*math.log(prob)\n",
    "            base_log_prob = -1*math.log(base_prob)\n",
|
263 |
+
" sent_pll+=log_prob\n",
|
264 |
+
" base_sent_pll+=base_log_prob\n",
|
265 |
+
" xl = X_init.split()\n",
|
266 |
+
" xxl = []\n",
|
267 |
+
" for p in range(len(xl)):\n",
|
268 |
+
" if xl[p] == self.tokenizer.mask_token:\n",
|
269 |
+
" if p != 0 and xl[p-1] == self.tokenizer.mask_token:\n",
|
270 |
+
" xxl.append(xl[p])\n",
|
271 |
+
" continue\n",
|
272 |
+
" xxl.append(self.tokenizer.convert_ids_to_tokens(y_tok[m]))\n",
|
273 |
+
" continue\n",
|
274 |
+
" xxl.append(xl[p])\n",
|
275 |
+
" X_init = \" \".join(xxl)\n",
|
276 |
+
" sent_pll/=num_sub_tokens_label\n",
|
277 |
+
" base_sent_pll/=num_sub_tokens_label\n",
|
278 |
+
" print(\"Sent PLL:\")\n",
|
279 |
+
" print(sent_pll)\n",
|
280 |
+
" print(\"Base Sent PLL:\")\n",
|
281 |
+
" print(base_sent_pll)\n",
|
282 |
+
" print(\"Net % difference:\")\n",
|
283 |
+
" diff = (sent_pll-base_sent_pll)*100/base_sent_pll\n",
|
284 |
+
" print(diff)\n",
|
285 |
+
" tot_pll += sent_pll\n",
|
286 |
+
" base_tot_pll+=base_sent_pll\n",
|
287 |
+
" print()\n",
|
288 |
+
" print()\n",
|
289 |
+
" y = random.choice(variable_names)\n",
|
290 |
+
" print(y)\n",
|
291 |
+
" X_init = X_init1\n",
|
292 |
+
" y_tok = self.tokenizer.encode(y)[1:-1]\n",
|
293 |
+
" nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\\d+', y)\n",
|
294 |
+
" lb = ' '.join(nl).lower()\n",
|
295 |
+
" x = self.tokenizer.tokenize(lb)\n",
|
296 |
+
" num_sub_tokens_label = len(x)\n",
|
297 |
+
" X_init = X_init.replace(\"[MASK]\", \" \".join([self.tokenizer.mask_token] * num_sub_tokens_label))\n",
|
298 |
+
" sent_pll = 0.0\n",
|
299 |
+
" base_sent_pll = 0.0\n",
|
300 |
+
" for m in range(num_sub_tokens_label):\n",
|
301 |
+
" print(m)\n",
|
302 |
+
" tokens = self.tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')\n",
|
303 |
+
" input_id_chunki = tokens['input_ids'][0].split(510)\n",
|
304 |
+
" input_id_chunks = []\n",
|
305 |
+
" mask_chunks = []\n",
|
306 |
+
" mask_chunki = tokens['attention_mask'][0].split(510)\n",
|
307 |
+
" for tensor in input_id_chunki:\n",
|
308 |
+
" input_id_chunks.append(tensor)\n",
|
309 |
+
" for tensor in mask_chunki:\n",
|
310 |
+
" mask_chunks.append(tensor)\n",
|
311 |
+
" xi = torch.full((1,), fill_value=101)\n",
|
312 |
+
" yi = torch.full((1,), fill_value=1)\n",
|
313 |
+
" zi = torch.full((1,), fill_value=102)\n",
|
314 |
+
" for r in range(len(input_id_chunks)):\n",
|
315 |
+
" input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)\n",
|
316 |
+
" input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)\n",
|
317 |
+
" mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)\n",
|
318 |
+
" mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)\n",
|
319 |
+
" di = torch.full((1,), fill_value=0)\n",
|
320 |
+
" for i in range(len(input_id_chunks)):\n",
|
321 |
+
" pad_len = 512 - input_id_chunks[i].shape[0]\n",
|
322 |
+
" if pad_len > 0:\n",
|
323 |
+
" for p in range(pad_len):\n",
|
324 |
+
" input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)\n",
|
325 |
+
" mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)\n",
|
326 |
+
" input_ids = torch.stack(input_id_chunks)\n",
|
327 |
+
" attention_mask = torch.stack(mask_chunks)\n",
|
328 |
+
" input_dict = {\n",
|
329 |
+
" 'input_ids': input_ids.long(),\n",
|
330 |
+
" 'attention_mask': attention_mask.int()\n",
|
331 |
+
" }\n",
|
332 |
+
" maski = []\n",
|
333 |
+
" u = 0\n",
|
334 |
+
" ad = 0\n",
|
335 |
+
" for l in range(len(input_dict['input_ids'])):\n",
|
336 |
+
" masked_pos = []\n",
|
337 |
+
" for i in range(len(input_dict['input_ids'][l])):\n",
|
338 |
+
" if input_dict['input_ids'][l][i] == 50264: #103\n",
|
339 |
+
" u+=1\n",
|
340 |
+
" if i != 0 and input_dict['input_ids'][l][i-1] == 50264:\n",
|
341 |
+
" continue\n",
|
342 |
+
" masked_pos.append(i)\n",
|
343 |
+
" ad+=1\n",
|
344 |
+
" maski.append(masked_pos)\n",
|
345 |
+
" print('number of mask tok',u)\n",
|
346 |
+
" print('number of seq', ad)\n",
|
347 |
+
" with torch.no_grad():\n",
|
348 |
+
" output = self.model(**input_dict)\n",
|
349 |
+
" base_output = base_model(**input_dict)\n",
|
350 |
+
" last_hidden_state = output[0].squeeze()\n",
|
351 |
+
" base_last_hidden_state = base_output[0].squeeze()\n",
|
352 |
+
" l_o_l_sa = []\n",
|
353 |
+
" base_l_o_l_sa = []\n",
|
354 |
+
" if len(maski) == 1:\n",
|
355 |
+
" masked_pos = maski[0]\n",
|
356 |
+
" for k in masked_pos:\n",
|
357 |
+
" l_o_l_sa.append(last_hidden_state[k])\n",
|
358 |
+
" base_l_o_l_sa.append(base_last_hidden_state[k])\n",
|
359 |
+
" else:\n",
|
360 |
+
" for p in range(len(maski)):\n",
|
361 |
+
" masked_pos = maski[p]\n",
|
362 |
+
" for k in masked_pos:\n",
|
363 |
+
" l_o_l_sa.append(last_hidden_state[p][k])\n",
|
364 |
+
" base_l_o_l_sa.append(base_last_hidden_state[p][k])\n",
|
365 |
+
" sum_state = l_o_l_sa[0]\n",
|
366 |
+
" base_sum_state = base_l_o_l_sa[0]\n",
|
367 |
+
" for i in range(len(l_o_l_sa)):\n",
|
368 |
+
" if i == 0:\n",
|
369 |
+
" continue\n",
|
370 |
+
" sum_state += l_o_l_sa[i]\n",
|
371 |
+
" base_sum_state += base_l_o_l_sa[i]\n",
|
372 |
+
" yip = len(l_o_l_sa)\n",
|
373 |
+
" sum_state /= yip\n",
|
374 |
+
" base_sum_state /= yip\n",
|
375 |
+
" probs = F.softmax(sum_state, dim=0)\n",
|
376 |
+
" base_probs = F.softmax(base_sum_state, dim=0)\n",
|
377 |
+
" a_lab = y_tok[m]\n",
|
378 |
+
" prob = probs[a_lab]\n",
|
379 |
+
" base_prob = base_probs[a_lab]\n",
|
380 |
+
" log_prob = -1*math.log(prob)\n",
|
381 |
+
" base_log_prob = -1*math.log(base_prob)\n",
|
382 |
+
" sent_pll+=log_prob\n",
|
383 |
+
" base_sent_pll+=base_log_prob\n",
|
384 |
+
" xl = X_init.split()\n",
|
385 |
+
" xxl = []\n",
|
386 |
+
" for p in range(len(xl)):\n",
|
387 |
+
" if xl[p] == self.tokenizer.mask_token:\n",
|
388 |
+
" if p != 0 and xl[p-1] == self.tokenizer.mask_token:\n",
|
389 |
+
" xxl.append(xl[p])\n",
|
390 |
+
" continue\n",
|
391 |
+
" xxl.append(self.tokenizer.convert_ids_to_tokens(y_tok[m]))\n",
|
392 |
+
" continue\n",
|
393 |
+
" xxl.append(xl[p])\n",
|
394 |
+
" X_init = \" \".join(xxl)\n",
|
395 |
+
" sent_pll/=num_sub_tokens_label\n",
|
396 |
+
" base_sent_pll/=num_sub_tokens_label\n",
|
397 |
+
" print(\"Sent PLL:\")\n",
|
398 |
+
" print(sent_pll)\n",
|
399 |
+
" print(\"Base Sent PLL:\")\n",
|
400 |
+
" print(base_sent_pll)\n",
|
401 |
+
" print(\"Net % difference:\")\n",
|
402 |
+
" diff = (sent_pll-base_sent_pll)*100/base_sent_pll\n",
|
403 |
+
" print(diff)\n",
|
404 |
+
" print()\n",
|
405 |
+
" print(\"******\")\n",
|
406 |
+
" print()\n",
|
407 |
+
" "
|
408 |
+
]
|
409 |
+
},
|
410 |
+
{
|
411 |
+
"cell_type": "code",
|
412 |
+
"execution_count": 4,
|
413 |
+
"id": "bc788ca0",
|
414 |
+
"metadata": {
|
415 |
+
"ExecuteTime": {
|
416 |
+
"end_time": "2023-09-27T18:12:36.975722Z",
|
417 |
+
"start_time": "2023-09-27T18:12:18.467898Z"
|
418 |
+
}
|
419 |
+
},
|
420 |
+
"outputs": [
|
421 |
+
{
|
422 |
+
"data": {
|
423 |
+
"text/plain": [
|
424 |
+
"RobertaForMaskedLM(\n",
|
425 |
+
" (roberta): RobertaModel(\n",
|
426 |
+
" (embeddings): RobertaEmbeddings(\n",
|
427 |
+
" (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
|
428 |
+
" (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
|
429 |
+
" (token_type_embeddings): Embedding(1, 768)\n",
|
430 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
431 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
432 |
+
" )\n",
|
433 |
+
" (encoder): RobertaEncoder(\n",
|
434 |
+
" (layer): ModuleList(\n",
|
435 |
+
" (0-11): 12 x RobertaLayer(\n",
|
436 |
+
" (attention): RobertaAttention(\n",
|
437 |
+
" (self): RobertaSelfAttention(\n",
|
438 |
+
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
|
439 |
+
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
|
440 |
+
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
|
441 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
442 |
+
" )\n",
|
443 |
+
" (output): RobertaSelfOutput(\n",
|
444 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
445 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
446 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
447 |
+
" )\n",
|
448 |
+
" )\n",
|
449 |
+
" (intermediate): RobertaIntermediate(\n",
|
450 |
+
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
|
451 |
+
" (intermediate_act_fn): GELUActivation()\n",
|
452 |
+
" )\n",
|
453 |
+
" (output): RobertaOutput(\n",
|
454 |
+
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
|
455 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
456 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
457 |
+
" )\n",
|
458 |
+
" )\n",
|
459 |
+
" )\n",
|
460 |
+
" )\n",
|
461 |
+
" )\n",
|
462 |
+
" (lm_head): RobertaLMHead(\n",
|
463 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
464 |
+
" (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
465 |
+
" (decoder): Linear(in_features=768, out_features=50265, bias=True)\n",
|
466 |
+
" )\n",
|
467 |
+
")"
|
468 |
+
]
|
469 |
+
},
|
470 |
+
"execution_count": 4,
|
471 |
+
"metadata": {},
|
472 |
+
"output_type": "execute_result"
|
473 |
+
}
|
474 |
+
],
|
475 |
+
"source": [
|
476 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"microsoft/graphcodebert-base\")\n",
|
477 |
+
"model = Step1_model()\n",
|
478 |
+
"model.load_state_dict(torch.load('var_runs/model_98_3'))\n",
|
479 |
+
"base_model = AutoModelForMaskedLM.from_pretrained('microsoft/graphcodebert-base')\n",
|
480 |
+
"model.eval()\n",
|
481 |
+
"base_model.eval()"
|
482 |
+
]
|
483 |
+
},
|
484 |
+
{
|
485 |
+
"cell_type": "code",
|
486 |
+
"execution_count": 5,
|
487 |
+
"id": "f96328ce",
|
488 |
+
"metadata": {
|
489 |
+
"ExecuteTime": {
|
490 |
+
"end_time": "2023-09-27T18:15:14.635841Z",
|
491 |
+
"start_time": "2023-09-27T18:12:36.980040Z"
|
492 |
+
}
|
493 |
+
},
|
494 |
+
"outputs": [
|
495 |
+
{
|
496 |
+
"name": "stderr",
|
497 |
+
"output_type": "stream",
|
498 |
+
"text": [
|
499 |
+
"\r",
|
500 |
+
" 0%| | 0/50 [00:00<?, ?it/s]"
|
501 |
+
]
|
502 |
+
}
|
503 |
+
],
|
504 |
+
"source": [
|
505 |
+
"myDs=MyDataset('dat.csv')\n",
|
506 |
+
"loader=DataLoader(myDs,batch_size=2,shuffle=True)\n",
|
507 |
+
"loop = tqdm(loader, leave=True)"
|
508 |
+
]
|
509 |
+
},
|
510 |
+
{
|
511 |
+
"cell_type": "code",
|
512 |
+
"execution_count": 6,
|
513 |
+
"id": "45333143",
|
514 |
+
"metadata": {
|
515 |
+
"ExecuteTime": {
|
516 |
+
"end_time": "2023-09-27T18:18:54.349042Z",
|
517 |
+
"start_time": "2023-09-27T18:17:34.313070Z"
|
518 |
+
},
|
519 |
+
"code_folding": []
|
520 |
+
},
|
521 |
+
"outputs": [
|
522 |
+
{
|
523 |
+
"name": "stderr",
|
524 |
+
"output_type": "stream",
|
525 |
+
"text": [
|
526 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (7050 > 512). Running this sequence through the model will result in indexing errors\n"
|
527 |
+
]
|
528 |
+
},
|
529 |
+
{
|
530 |
+
"name": "stdout",
|
531 |
+
"output_type": "stream",
|
532 |
+
"text": [
|
533 |
+
"stackBefore\n",
|
534 |
+
"\n",
|
535 |
+
"0\n",
|
536 |
+
"number of mask tok 16\n",
|
537 |
+
"number of seq 8\n",
|
538 |
+
"1\n",
|
539 |
+
"number of mask tok 8\n",
|
540 |
+
"number of seq 8\n",
|
541 |
+
"Sent PLL:\n",
|
542 |
+
"3.184066466322467\n",
|
543 |
+
"Base Sent PLL:\n",
|
544 |
+
"3.184066466322467\n",
|
545 |
+
"Net % difference:\n",
|
546 |
+
"0.0\n",
|
547 |
+
"\n",
|
548 |
+
"\n",
|
549 |
+
"distance\n",
|
550 |
+
"0\n",
|
551 |
+
"number of mask tok 8\n",
|
552 |
+
"number of seq 8\n",
|
553 |
+
"Sent PLL:\n",
|
554 |
+
"22.091890736746276\n",
|
555 |
+
"Base Sent PLL:\n",
|
556 |
+
"22.091890736746276\n",
|
557 |
+
"Net % difference:\n",
|
558 |
+
"0.0\n",
|
559 |
+
"\n",
|
560 |
+
"******\n",
|
561 |
+
"\n",
|
562 |
+
"records\n",
|
563 |
+
"\n",
|
564 |
+
"0\n",
|
565 |
+
"number of mask tok 4\n",
|
566 |
+
"number of seq 2\n",
|
567 |
+
"1\n",
|
568 |
+
"number of mask tok 2\n",
|
569 |
+
"number of seq 2\n",
|
570 |
+
"Sent PLL:\n",
|
571 |
+
"4.304520906089483\n",
|
572 |
+
"Base Sent PLL:\n",
|
573 |
+
"4.304520906089483\n",
|
574 |
+
"Net % difference:\n",
|
575 |
+
"0.0\n",
|
576 |
+
"\n",
|
577 |
+
"\n",
|
578 |
+
"valueB\n",
|
579 |
+
"0\n",
|
580 |
+
"number of mask tok 4\n",
|
581 |
+
"number of seq 2\n",
|
582 |
+
"1\n",
|
583 |
+
"number of mask tok 2\n",
|
584 |
+
"number of seq 2\n"
|
585 |
+
]
|
586 |
+
},
|
587 |
+
{
|
588 |
+
"name": "stderr",
|
589 |
+
"output_type": "stream",
|
590 |
+
"text": [
|
591 |
+
"\r",
|
592 |
+
" 2%|▊ | 1/50 [03:31<2:52:22, 211.08s/it]"
|
593 |
+
]
|
594 |
+
},
|
595 |
+
{
|
596 |
+
"name": "stdout",
|
597 |
+
"output_type": "stream",
|
598 |
+
"text": [
|
599 |
+
"Sent PLL:\n",
|
600 |
+
"9.457522688945344\n",
|
601 |
+
"Base Sent PLL:\n",
|
602 |
+
"9.457522688945344\n",
|
603 |
+
"Net % difference:\n",
|
604 |
+
"0.0\n",
|
605 |
+
"\n",
|
606 |
+
"******\n",
|
607 |
+
"\n",
|
608 |
+
"stackEntry\n",
|
609 |
+
"\n",
|
610 |
+
"0\n",
|
611 |
+
"number of mask tok 30\n",
|
612 |
+
"number of seq 15\n"
|
613 |
+
]
|
614 |
+
},
|
615 |
+
{
|
616 |
+
"name": "stderr",
|
617 |
+
"output_type": "stream",
|
618 |
+
"text": [
|
619 |
+
" 2%|▊ | 1/50 [03:38<2:58:06, 218.09s/it]\n"
|
620 |
+
]
|
621 |
+
},
|
622 |
+
{
|
623 |
+
"ename": "KeyboardInterrupt",
|
624 |
+
"evalue": "",
|
625 |
+
"output_type": "error",
|
626 |
+
"traceback": [
|
627 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
628 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
629 |
+
"Cell \u001b[0;32mIn[6], line 18\u001b[0m\n\u001b[1;32m 16\u001b[0m l\u001b[38;5;241m.\u001b[39mappend(inputs[\u001b[38;5;241m0\u001b[39m][i])\n\u001b[1;32m 17\u001b[0m l\u001b[38;5;241m.\u001b[39mappend(inputs[\u001b[38;5;241m1\u001b[39m][i])\n\u001b[0;32m---> 18\u001b[0m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[43ml\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# X_init1 = inputs[0][i]\u001b[39;00m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# X_init = inputs[0][i]\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# y = inputs[1][i]\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;66;03m# except:\u001b[39;00m\n\u001b[1;32m 263\u001b[0m \u001b[38;5;66;03m# continue\u001b[39;00m\n\u001b[1;32m 264\u001b[0m tot_pll\u001b[38;5;241m/\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(myDs)\n",
|
630 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
|
631 |
+
"Cell \u001b[0;32mIn[3], line 152\u001b[0m, in \u001b[0;36mStep1_model.forward\u001b[0;34m(self, mapi)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[1;32m 151\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minput_dict)\n\u001b[0;32m--> 152\u001b[0m base_output \u001b[38;5;241m=\u001b[39m \u001b[43mbase_model\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minput_dict\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 153\u001b[0m last_hidden_state \u001b[38;5;241m=\u001b[39m output[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39msqueeze()\n\u001b[1;32m 154\u001b[0m base_last_hidden_state \u001b[38;5;241m=\u001b[39m base_output[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39msqueeze()\n",
|
632 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
|
633 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:1082\u001b[0m, in \u001b[0;36mRobertaForMaskedLM.forward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1072\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1073\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\u001b[39;00m\n\u001b[1;32m 1074\u001b[0m \u001b[38;5;124;03m Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1078\u001b[0m \u001b[38;5;124;03m Used to hide legacy arguments that have been deprecated.\u001b[39;00m\n\u001b[1;32m 1079\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1080\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m-> 1082\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroberta\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1083\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1084\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1085\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1086\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1087\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1088\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1089\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1090\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1091\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1092\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1093\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1094\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1095\u001b[0m sequence_output 
\u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1096\u001b[0m prediction_scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(sequence_output)\n",
|
634 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
|
635 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:844\u001b[0m, in \u001b[0;36mRobertaModel.forward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 835\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[1;32m 837\u001b[0m embedding_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membeddings(\n\u001b[1;32m 838\u001b[0m input_ids\u001b[38;5;241m=\u001b[39minput_ids,\n\u001b[1;32m 839\u001b[0m position_ids\u001b[38;5;241m=\u001b[39mposition_ids,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 842\u001b[0m past_key_values_length\u001b[38;5;241m=\u001b[39mpast_key_values_length,\n\u001b[1;32m 843\u001b[0m )\n\u001b[0;32m--> 844\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 845\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 846\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 847\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 848\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 849\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_extended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 850\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 851\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 852\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 853\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 854\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 855\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 856\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 857\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler(sequence_output) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m 
\u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
636 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
|
637 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:529\u001b[0m, in \u001b[0;36mRobertaEncoder.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 520\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mcheckpoint\u001b[38;5;241m.\u001b[39mcheckpoint(\n\u001b[1;32m 521\u001b[0m create_custom_forward(layer_module),\n\u001b[1;32m 522\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 526\u001b[0m encoder_attention_mask,\n\u001b[1;32m 527\u001b[0m )\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 529\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 530\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 531\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 532\u001b[0m \u001b[43m \u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 535\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 536\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 537\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 539\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 540\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_cache:\n",
|
638 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
|
639 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:413\u001b[0m, in \u001b[0;36mRobertaLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 401\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 402\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 403\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 410\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[1;32m 411\u001b[0m \u001b[38;5;66;03m# decoder uni-directional self-attention cached key/values tuple is at positions 1,2\u001b[39;00m\n\u001b[1;32m 412\u001b[0m self_attn_past_key_value \u001b[38;5;241m=\u001b[39m past_key_value[:\u001b[38;5;241m2\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m past_key_value \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 413\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 414\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 415\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 416\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 417\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 418\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mself_attn_past_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 419\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 420\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 422\u001b[0m \u001b[38;5;66;03m# if decoder, the last output is tuple of self-attn cache\u001b[39;00m\n",
|
640 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
|
641 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:340\u001b[0m, in \u001b[0;36mRobertaAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 331\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 332\u001b[0m hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 338\u001b[0m output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 339\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[torch\u001b[38;5;241m.\u001b[39mTensor]:\n\u001b[0;32m--> 340\u001b[0m self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 341\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 342\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 343\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 344\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 345\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 346\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 347\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 349\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[1;32m 350\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n",
|
642 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
|
643 |
+
"File \u001b[0;32m~/miniconda3/envs/tensorflown/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py:236\u001b[0m, in \u001b[0;36mRobertaSelfAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 233\u001b[0m past_key_value \u001b[38;5;241m=\u001b[39m (key_layer, value_layer)\n\u001b[1;32m 235\u001b[0m \u001b[38;5;66;03m# Take the dot product between \"query\" and \"key\" to get the raw attention scores.\u001b[39;00m\n\u001b[0;32m--> 236\u001b[0m attention_scores \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmatmul\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery_layer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey_layer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mposition_embedding_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrelative_key\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mposition_embedding_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrelative_key_query\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 239\u001b[0m query_length, key_length \u001b[38;5;241m=\u001b[39m query_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m2\u001b[39m], key_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m2\u001b[39m]\n",
|
644 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
645 |
+
]
|
646 |
+
}
|
647 |
+
],
|
648 |
+
"source": [
|
649 |
+
"tot_pll = 0.0\n",
|
650 |
+
"base_tot_pll = 0.0\n",
|
651 |
+
"variable_names = [\n",
|
652 |
+
" 'x', 'y', 'myVariable', 'dataPoint', 'randomNumber', 'userAge', 'resultValue', 'inputValue', 'tempValue', 'indexCounter', \n",
|
653 |
+
" 'itemPrice', 'userName', 'testScore', 'acceleration', 'productCount', 'errorMargin', 'piValue', 'sensorReading', \n",
|
654 |
+
" 'currentTemperature', 'velocityVector', 'variable1', 'variable2', 'valueA', 'valueB', 'counter', 'flag', 'total', \n",
|
655 |
+
" 'average', 'valueX', 'valueY', 'valueZ', 'price', 'quantity', 'name', 'age', 'score', 'weight', 'height', 'distance', \n",
|
656 |
+
" 'time', 'radius', 'width', 'length', 'temperature', 'pressure', 'humidity', 'voltage', 'current', 'resistance'\n",
|
657 |
+
"]\n",
|
658 |
+
"\n",
|
659 |
+
"for batch in loop:\n",
|
660 |
+
" inputs = batch\n",
|
661 |
+
" try:\n",
|
662 |
+
" for i in range(len(inputs[0])):\n",
|
663 |
+
" l = []\n",
|
664 |
+
" l.append(inputs[0][i])\n",
|
665 |
+
" l.append(inputs[1][i])\n",
|
666 |
+
" model(l)\n",
|
667 |
+
" except:\n",
|
668 |
+
" continue\n",
|
669 |
+
"\n",
|
670 |
+
"tot_pll/=len(myDs)\n",
|
671 |
+
"print('Total PLL per sentence: ')\n",
|
672 |
+
"print(tot_pll)\n",
|
673 |
+
"base_tot_pll/=len(myDs)\n",
|
674 |
+
"print('Total Base PLL per sentence: ')\n",
|
675 |
+
"print(base_tot_pll)\n",
|
676 |
+
"print(\"Net % difference average:\")\n",
|
677 |
+
"tot_diff = (tot_pll-base_tot_pll)*100/base_tot_pll\n",
|
678 |
+
"print(tot_diff)\n",
|
679 |
+
" "
|
680 |
+
]
|
681 |
+
},
|
682 |
+
{
|
683 |
+
"cell_type": "code",
|
684 |
+
"execution_count": null,
|
685 |
+
"id": "da79bcc2",
|
686 |
+
"metadata": {},
|
687 |
+
"outputs": [],
|
688 |
+
"source": []
|
689 |
+
}
|
690 |
+
],
|
691 |
+
"metadata": {
|
692 |
+
"kernelspec": {
|
693 |
+
"display_name": "Python 3.10 (tensorflown)",
|
694 |
+
"language": "python",
|
695 |
+
"name": "tensorflown"
|
696 |
+
},
|
697 |
+
"language_info": {
|
698 |
+
"codemirror_mode": {
|
699 |
+
"name": "ipython",
|
700 |
+
"version": 3
|
701 |
+
},
|
702 |
+
"file_extension": ".py",
|
703 |
+
"mimetype": "text/x-python",
|
704 |
+
"name": "python",
|
705 |
+
"nbconvert_exporter": "python",
|
706 |
+
"pygments_lexer": "ipython3",
|
707 |
+
"version": "3.10.12"
|
708 |
+
},
|
709 |
+
"toc": {
|
710 |
+
"base_numbering": 1,
|
711 |
+
"nav_menu": {},
|
712 |
+
"number_sections": true,
|
713 |
+
"sideBar": true,
|
714 |
+
"skip_h1_title": false,
|
715 |
+
"title_cell": "Table of Contents",
|
716 |
+
"title_sidebar": "Contents",
|
717 |
+
"toc_cell": false,
|
718 |
+
"toc_position": {},
|
719 |
+
"toc_section_display": true,
|
720 |
+
"toc_window_display": false
|
721 |
+
}
|
722 |
+
},
|
723 |
+
"nbformat": 4,
|
724 |
+
"nbformat_minor": 5
|
725 |
+
}
|
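The notebook above scores how plausible an identifier is in its surrounding code: it splits the name into sub-tokens, masks them, and averages the negative log-probability the masked language model assigns to each gold sub-token (its pseudo-log-likelihood, PLL), then compares the fine-tuned checkpoint against the base graphcodebert model and against a randomly chosen name. The snippet below is a minimal, simplified sketch of that idea, not the notebook's exact loop: it assumes the code snippet fits in a single 512-token window and scores all sub-tokens from one forward pass, whereas the notebook chunks long inputs into 510-token windows and reveals sub-tokens one at a time. The example call at the end is hypothetical.

import torch
import torch.nn.functional as F
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
mlm = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
mlm.eval()

def identifier_pll(code_with_mask: str, identifier: str) -> float:
    # Expand the single [MASK] placeholder into one <mask> per sub-token of the name.
    sub_ids = tokenizer.encode(identifier, add_special_tokens=False)
    masked = code_with_mask.replace(
        "[MASK]", " ".join([tokenizer.mask_token] * len(sub_ids)))
    enc = tokenizer(masked, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = mlm(**enc).logits[0]                      # (seq_len, vocab_size)
    log_probs = F.log_softmax(logits, dim=-1)
    mask_pos = (enc["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    # Negative log-probability of each gold sub-token at its mask slot, averaged.
    nll = [-log_probs[p, t].item() for p, t in zip(mask_pos, sub_ids)]
    return sum(nll) / len(nll)

# Hypothetical usage: lower PLL means the model finds the name more natural here.
# print(identifier_pll("[MASK] = price * quantity", "totalCost"))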
identifier_scoring.py
ADDED
@@ -0,0 +1,428 @@
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[1]:
|
5 |
+
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
from torch import nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from torch.nn import init, MarginRankingLoss
|
13 |
+
from transformers import BertModel, RobertaModel
|
14 |
+
from transformers import BertTokenizer, RobertaTokenizer
|
15 |
+
from torch.optim import Adam
|
16 |
+
from distutils.version import LooseVersion
|
17 |
+
from torch.utils.data import Dataset, DataLoader
|
18 |
+
from torch.utils.tensorboard import SummaryWriter
|
19 |
+
from datetime import datetime
|
20 |
+
from torch.autograd import Variable
|
21 |
+
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer
|
22 |
+
import torch.optim as optim
|
23 |
+
from torch.distributions import Categorical
|
24 |
+
import random
|
25 |
+
from transformers import AutoModelForMaskedLM, BertForMaskedLM, AdamW
|
26 |
+
from transformers import BertTokenizer
|
27 |
+
from tqdm import tqdm
|
28 |
+
import matplotlib.pyplot as plt
|
29 |
+
from transformers import XLMRobertaTokenizer
|
30 |
+
import os
|
31 |
+
import csv
|
32 |
+
from sklearn.model_selection import train_test_split
|
33 |
+
import nltk
|
34 |
+
from collections import defaultdict
|
35 |
+
from nltk.tokenize import word_tokenize
|
36 |
+
from nltk import pos_tag
|
37 |
+
from nltk.tokenize import word_tokenize
|
38 |
+
import math
|
39 |
+
from nltk.corpus import words
|
40 |
+
from sklearn.model_selection import train_test_split
|
41 |
+
import random
|
42 |
+
import re
|
43 |
+
import Levenshtein
import en_core_web_sm
from numpy import linalg as LA
|
44 |
+
|
45 |
+
|
46 |
+
# In[2]:
|
47 |
+
|
48 |
+
|
49 |
+
class MyDataset(Dataset):
|
50 |
+
def __init__(self,file_name):
|
51 |
+
df1 = pd.read_csv(file_name)
|
52 |
+
df1 = df1[200:300]
|
53 |
+
df1 = df1.fillna("")
|
54 |
+
res = df1['X'].to_numpy()
|
55 |
+
self.X_list = res
|
56 |
+
self.y_list = df1['y'].to_numpy()
|
57 |
+
def __len__(self):
|
58 |
+
return len(self.X_list)
|
59 |
+
def __getitem__(self,idx):
|
60 |
+
mapi = []
|
61 |
+
mapi.append(self.X_list[idx])
|
62 |
+
mapi.append(self.y_list[idx])
|
63 |
+
return mapi
|
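# Illustrative usage sketch (not part of the original script): MyDataset yields
# [code_with_mask, identifier] pairs, and DataLoader's default collate turns a
# batch of such pairs into two parallel sequences, which is why the evaluation
# loops in this repo index inputs[0][i] and inputs[1][i]. The file name below
# is a placeholder.
#
#   ds = MyDataset('test.csv')
#   loader = DataLoader(ds, batch_size=2, shuffle=True)
#   code_snippet, identifier = ds[0]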
64 |
+
|
65 |
+
|
66 |
+
# In[3]:
|
67 |
+
|
68 |
+
|
69 |
+
class Step1_model(nn.Module):
|
70 |
+
def __init__(self, hidden_size=512):
|
71 |
+
super(Step1_model, self).__init__()
|
72 |
+
self.hidden_size = hidden_size
|
73 |
+
self.model = AutoModelForMaskedLM.from_pretrained('microsoft/graphcodebert-base')
|
74 |
+
self.tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
|
75 |
+
self.config = AutoConfig.from_pretrained("microsoft/graphcodebert-base")
|
76 |
+
self.linear_layer = nn.Linear(self.model.config.vocab_size, self.model.config.vocab_size)
|
77 |
+
def foo(self, data):
|
78 |
+
result = []
|
79 |
+
if type(data) == tuple:
|
80 |
+
return data[1]
|
81 |
+
if type(data) == list:
|
82 |
+
for inner in data:
|
83 |
+
result.append(self.foo(inner))
|
84 |
+
res = []
|
85 |
+
for a in result:
|
86 |
+
res.append(a[:2])
|
87 |
+
return res
|
88 |
+
def loss_func1(self, word, y):
|
89 |
+
if word =='NA':
|
90 |
+
return torch.full((1,), fill_value=100)
|
91 |
+
try:
|
92 |
+
pred_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', word)
|
93 |
+
target_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
|
94 |
+
pred_tag = self.foo(nltk.pos_tag(pred_list))
|
95 |
+
target_tag = self.foo(nltk.pos_tag(target_list))
|
96 |
+
str1 = ' '.join(pred_tag) # Convert lists to strings
|
97 |
+
str2 = ' '.join(target_tag)
|
98 |
+
distance = Levenshtein.distance(str1, str2)
|
99 |
+
dist = torch.Tensor([distance])
|
100 |
+
except:
|
101 |
+
dist = torch.Tensor([2*len(target_list)])
|
102 |
+
return dist
|
103 |
+
def loss_func2(self, word, y):
|
104 |
+
if word =='NA':
|
105 |
+
return torch.full((1,), fill_value=100)
|
106 |
+
nlp = en_core_web_sm.load()
|
107 |
+
pred_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', word)
|
108 |
+
target_list = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
|
109 |
+
try:
|
110 |
+
str1 = ' '.join(pred_list) # Convert lists to strings
|
111 |
+
str2 = ' '.join(target_list)
|
112 |
+
tokens1 = nlp(str1)
|
113 |
+
tokens2 = nlp(str2)
|
114 |
+
embedding1 = sum(token.vector for token in tokens1) / len(tokens1)
|
115 |
+
embedding2 = sum(token.vector for token in tokens2) / len(tokens2)
|
116 |
+
w1= LA.norm(embedding1)
|
117 |
+
w2= LA.norm(embedding2)
|
118 |
+
distance = 1 - (embedding1.dot(embedding2) / (w1 * w2))
|
119 |
+
dist = torch.Tensor([distance])
|
120 |
+
except:
|
121 |
+
dist = torch.Tensor([1])
|
122 |
+
return dist
|
123 |
+
def forward(self, mapi):
|
124 |
+
global variable_names
|
125 |
+
global base_model
|
126 |
+
global tot_pll
|
127 |
+
global base_tot_pll
|
128 |
+
X_init1 = mapi[0]
|
129 |
+
X_init = mapi[0]
|
130 |
+
y = mapi[1]
|
131 |
+
print(y)
|
132 |
+
y_tok = self.tokenizer.encode(y)[1:-1]
|
133 |
+
nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
|
134 |
+
lb = ' '.join(nl).lower()
|
135 |
+
x = self.tokenizer.tokenize(lb)
|
136 |
+
num_sub_tokens_label = len(x)
|
137 |
+
X_init = X_init.replace("[MASK]", " ".join([self.tokenizer.mask_token] * num_sub_tokens_label))
|
138 |
+
sent_pll = 0.0
|
139 |
+
base_sent_pll = 0.0
|
140 |
+
for m in range(num_sub_tokens_label):
|
141 |
+
print(m)
|
142 |
+
tokens = self.tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')
|
143 |
+
input_id_chunki = tokens['input_ids'][0].split(510)
|
144 |
+
input_id_chunks = []
|
145 |
+
mask_chunks = []
|
146 |
+
mask_chunki = tokens['attention_mask'][0].split(510)
|
147 |
+
for tensor in input_id_chunki:
|
148 |
+
input_id_chunks.append(tensor)
|
149 |
+
for tensor in mask_chunki:
|
150 |
+
mask_chunks.append(tensor)
|
151 |
+
xi = torch.full((1,), fill_value=self.tokenizer.cls_token_id)  # <s> id (0 for GraphCodeBERT, not BERT's 101)
|
152 |
+
yi = torch.full((1,), fill_value=1)
|
153 |
+
zi = torch.full((1,), fill_value=self.tokenizer.sep_token_id)  # </s> id (2 for GraphCodeBERT, not BERT's 102)
|
154 |
+
for r in range(len(input_id_chunks)):
|
155 |
+
input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)
|
156 |
+
input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)
|
157 |
+
mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)
|
158 |
+
mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)
|
159 |
+
di = torch.full((1,), fill_value=0)
|
160 |
+
for i in range(len(input_id_chunks)):
|
161 |
+
pad_len = 512 - input_id_chunks[i].shape[0]
|
162 |
+
if pad_len > 0:
|
163 |
+
for p in range(pad_len):
|
164 |
+
input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)
|
165 |
+
mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)
|
166 |
+
input_ids = torch.stack(input_id_chunks)
|
167 |
+
attention_mask = torch.stack(mask_chunks)
|
168 |
+
input_dict = {
|
169 |
+
'input_ids': input_ids.long(),
|
170 |
+
'attention_mask': attention_mask.int()
|
171 |
+
}
|
172 |
+
maski = []
|
173 |
+
u = 0
|
174 |
+
ad = 0
|
175 |
+
for l in range(len(input_dict['input_ids'])):
|
176 |
+
masked_pos = []
|
177 |
+
for i in range(len(input_dict['input_ids'][l])):
|
178 |
+
if input_dict['input_ids'][l][i] == self.tokenizer.mask_token_id:  # 50264 for GraphCodeBERT
|
179 |
+
u+=1
|
180 |
+
if i != 0 and input_dict['input_ids'][l][i-1] == self.tokenizer.mask_token_id:
|
181 |
+
continue
|
182 |
+
masked_pos.append(i)
|
183 |
+
ad+=1
|
184 |
+
maski.append(masked_pos)
|
185 |
+
print('number of mask tok',u)
|
186 |
+
print('number of seq', ad)
|
187 |
+
with torch.no_grad():
|
188 |
+
output = self.model(**input_dict)
|
189 |
+
base_output = base_model(**input_dict)
|
190 |
+
last_hidden_state = output[0].squeeze()
|
191 |
+
base_last_hidden_state = base_output[0].squeeze()
|
192 |
+
l_o_l_sa = []
|
193 |
+
base_l_o_l_sa = []
|
194 |
+
if len(maski) == 1:
|
195 |
+
masked_pos = maski[0]
|
196 |
+
for k in masked_pos:
|
197 |
+
l_o_l_sa.append(last_hidden_state[k])
|
198 |
+
base_l_o_l_sa.append(base_last_hidden_state[k])
|
199 |
+
else:
|
200 |
+
for p in range(len(maski)):
|
201 |
+
masked_pos = maski[p]
|
202 |
+
for k in masked_pos:
|
203 |
+
l_o_l_sa.append(last_hidden_state[p][k])
|
204 |
+
base_l_o_l_sa.append(base_last_hidden_state[p][k])
|
205 |
+
sum_state = l_o_l_sa[0]
|
206 |
+
base_sum_state = base_l_o_l_sa[0]
|
207 |
+
for i in range(len(l_o_l_sa)):
|
208 |
+
if i == 0:
|
209 |
+
continue
|
210 |
+
sum_state += l_o_l_sa[i]
|
211 |
+
base_sum_state += base_l_o_l_sa[i]
|
212 |
+
yip = len(l_o_l_sa)
|
213 |
+
sum_state /= yip
|
214 |
+
base_sum_state /= yip
|
215 |
+
probs = F.softmax(sum_state, dim=0)
|
216 |
+
base_probs = F.softmax(base_sum_state, dim=0)
|
217 |
+
a_lab = y_tok[m]
|
218 |
+
prob = probs[a_lab]
|
219 |
+
base_prob = base_probs[a_lab]
|
220 |
+
log_prob = -1*math.log(prob)
|
221 |
+
base_log_prob = -1*math.log(base_prob)
|
222 |
+
sent_pll+=log_prob
|
223 |
+
base_sent_pll+=base_log_prob
|
224 |
+
xl = X_init.split()
|
225 |
+
xxl = []
|
226 |
+
for p in range(len(xl)):
|
227 |
+
if xl[p] == self.tokenizer.mask_token:
|
228 |
+
if p != 0 and xl[p-1] == self.tokenizer.mask_token:
|
229 |
+
xxl.append(xl[p])
|
230 |
+
continue
|
231 |
+
xxl.append(self.tokenizer.convert_ids_to_tokens(y_tok[m]))
|
232 |
+
continue
|
233 |
+
xxl.append(xl[p])
|
234 |
+
X_init = " ".join(xxl)
|
235 |
+
sent_pll/=num_sub_tokens_label
|
236 |
+
base_sent_pll/=num_sub_tokens_label
|
237 |
+
print("Sent PLL:")
|
238 |
+
print(sent_pll)
|
239 |
+
print("Base Sent PLL:")
|
240 |
+
print(base_sent_pll)
|
241 |
+
print("Net % difference:")
|
242 |
+
diff = (sent_pll-base_sent_pll)*100/base_sent_pll
|
243 |
+
print(diff)
|
244 |
+
tot_pll += sent_pll
|
245 |
+
base_tot_pll+=base_sent_pll
|
246 |
+
print()
|
247 |
+
print()
|
248 |
+
y = random.choice(variable_names)
|
249 |
+
print(y)
|
250 |
+
X_init = X_init1
|
251 |
+
y_tok = self.tokenizer.encode(y)[1:-1]
|
252 |
+
nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
|
253 |
+
lb = ' '.join(nl).lower()
|
254 |
+
x = self.tokenizer.tokenize(lb)
|
255 |
+
num_sub_tokens_label = len(x)
|
256 |
+
X_init = X_init.replace("[MASK]", " ".join([self.tokenizer.mask_token] * num_sub_tokens_label))
|
257 |
+
sent_pll = 0.0
|
258 |
+
base_sent_pll = 0.0
|
259 |
+
for m in range(num_sub_tokens_label):
|
260 |
+
print(m)
|
261 |
+
tokens = self.tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')
|
262 |
+
input_id_chunki = tokens['input_ids'][0].split(510)
|
263 |
+
input_id_chunks = []
|
264 |
+
mask_chunks = []
|
265 |
+
mask_chunki = tokens['attention_mask'][0].split(510)
|
266 |
+
for tensor in input_id_chunki:
|
267 |
+
input_id_chunks.append(tensor)
|
268 |
+
for tensor in mask_chunki:
|
269 |
+
mask_chunks.append(tensor)
|
270 |
+
xi = torch.full((1,), fill_value=self.tokenizer.cls_token_id)  # <s> id (0 for GraphCodeBERT, not BERT's 101)
|
271 |
+
yi = torch.full((1,), fill_value=1)
|
272 |
+
zi = torch.full((1,), fill_value=self.tokenizer.sep_token_id)  # </s> id (2 for GraphCodeBERT, not BERT's 102)
|
273 |
+
for r in range(len(input_id_chunks)):
|
274 |
+
input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)
|
275 |
+
input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)
|
276 |
+
mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)
|
277 |
+
mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)
|
278 |
+
di = torch.full((1,), fill_value=0)
|
279 |
+
for i in range(len(input_id_chunks)):
|
280 |
+
pad_len = 512 - input_id_chunks[i].shape[0]
|
281 |
+
if pad_len > 0:
|
282 |
+
for p in range(pad_len):
|
283 |
+
input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)
|
284 |
+
mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)
|
285 |
+
input_ids = torch.stack(input_id_chunks)
|
286 |
+
attention_mask = torch.stack(mask_chunks)
|
287 |
+
input_dict = {
|
288 |
+
'input_ids': input_ids.long(),
|
289 |
+
'attention_mask': attention_mask.int()
|
290 |
+
}
|
291 |
+
maski = []
|
292 |
+
u = 0
|
293 |
+
ad = 0
|
294 |
+
for l in range(len(input_dict['input_ids'])):
|
295 |
+
masked_pos = []
|
296 |
+
for i in range(len(input_dict['input_ids'][l])):
|
297 |
+
if input_dict['input_ids'][l][i] == self.tokenizer.mask_token_id:  # 50264 for GraphCodeBERT
|
298 |
+
u+=1
|
299 |
+
if i != 0 and input_dict['input_ids'][l][i-1] == self.tokenizer.mask_token_id:
|
300 |
+
continue
|
301 |
+
masked_pos.append(i)
|
302 |
+
ad+=1
|
303 |
+
maski.append(masked_pos)
|
304 |
+
print('number of mask tok',u)
|
305 |
+
print('number of seq', ad)
|
306 |
+
with torch.no_grad():
|
307 |
+
output = self.model(**input_dict)
|
308 |
+
base_output = base_model(**input_dict)
|
309 |
+
last_hidden_state = output[0].squeeze()
|
310 |
+
base_last_hidden_state = base_output[0].squeeze()
|
311 |
+
l_o_l_sa = []
|
312 |
+
base_l_o_l_sa = []
|
313 |
+
if len(maski) == 1:
|
314 |
+
masked_pos = maski[0]
|
315 |
+
for k in masked_pos:
|
316 |
+
l_o_l_sa.append(last_hidden_state[k])
|
317 |
+
base_l_o_l_sa.append(base_last_hidden_state[k])
|
318 |
+
else:
|
319 |
+
for p in range(len(maski)):
|
320 |
+
masked_pos = maski[p]
|
321 |
+
for k in masked_pos:
|
322 |
+
l_o_l_sa.append(last_hidden_state[p][k])
|
323 |
+
base_l_o_l_sa.append(base_last_hidden_state[p][k])
|
324 |
+
sum_state = l_o_l_sa[0]
|
325 |
+
base_sum_state = base_l_o_l_sa[0]
|
326 |
+
for i in range(len(l_o_l_sa)):
|
327 |
+
if i == 0:
|
328 |
+
continue
|
329 |
+
sum_state += l_o_l_sa[i]
|
330 |
+
base_sum_state += base_l_o_l_sa[i]
|
331 |
+
yip = len(l_o_l_sa)
|
332 |
+
sum_state /= yip
|
333 |
+
base_sum_state /= yip
|
334 |
+
probs = F.softmax(sum_state, dim=0)
|
335 |
+
base_probs = F.softmax(base_sum_state, dim=0)
|
336 |
+
a_lab = y_tok[m]
|
337 |
+
prob = probs[a_lab]
|
338 |
+
base_prob = base_probs[a_lab]
|
339 |
+
log_prob = -1*math.log(prob)
|
340 |
+
base_log_prob = -1*math.log(base_prob)
|
341 |
+
sent_pll+=log_prob
|
342 |
+
base_sent_pll+=base_log_prob
|
343 |
+
xl = X_init.split()
|
344 |
+
xxl = []
|
345 |
+
for p in range(len(xl)):
|
346 |
+
if xl[p] == self.tokenizer.mask_token:
|
347 |
+
if p != 0 and xl[p-1] == self.tokenizer.mask_token:
|
348 |
+
xxl.append(xl[p])
|
349 |
+
continue
|
350 |
+
xxl.append(self.tokenizer.convert_ids_to_tokens(y_tok[m]))
|
351 |
+
continue
|
352 |
+
xxl.append(xl[p])
|
353 |
+
X_init = " ".join(xxl)
|
354 |
+
sent_pll/=num_sub_tokens_label
|
355 |
+
base_sent_pll/=num_sub_tokens_label
|
356 |
+
print("Sent PLL:")
|
357 |
+
print(sent_pll)
|
358 |
+
print("Base Sent PLL:")
|
359 |
+
print(base_sent_pll)
|
360 |
+
print("Net % difference:")
|
361 |
+
diff = (sent_pll-base_sent_pll)*100/base_sent_pll
|
362 |
+
print(diff)
|
363 |
+
print()
|
364 |
+
print("******")
|
365 |
+
print()
|
366 |
+
|
367 |
+
|
368 |
+
|
369 |
+
# In[4]:
|
370 |
+
|
371 |
+
|
372 |
+
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
|
373 |
+
model = Step1_model()
|
374 |
+
model.load_state_dict(torch.load('var_runs/model_98_3'))
|
375 |
+
base_model = AutoModelForMaskedLM.from_pretrained('microsoft/graphcodebert-base')
|
376 |
+
model.eval()
|
377 |
+
base_model.eval()
|
378 |
+
|
379 |
+
|
380 |
+
# In[5]:
|
381 |
+
|
382 |
+
|
383 |
+
myDs=MyDataset('dat.csv')
|
384 |
+
loader=DataLoader(myDs,batch_size=2,shuffle=True)
|
385 |
+
loop = tqdm(loader, leave=True)
|
386 |
+
|
387 |
+
|
388 |
+
# In[6]:
|
389 |
+
|
390 |
+
|
391 |
+
tot_pll = 0.0
|
392 |
+
base_tot_pll = 0.0
|
393 |
+
variable_names = [
|
394 |
+
'x', 'y', 'myVariable', 'dataPoint', 'randomNumber', 'userAge', 'resultValue', 'inputValue', 'tempValue', 'indexCounter',
|
395 |
+
'itemPrice', 'userName', 'testScore', 'acceleration', 'productCount', 'errorMargin', 'piValue', 'sensorReading',
|
396 |
+
'currentTemperature', 'velocityVector', 'variable1', 'variable2', 'valueA', 'valueB', 'counter', 'flag', 'total',
|
397 |
+
'average', 'valueX', 'valueY', 'valueZ', 'price', 'quantity', 'name', 'age', 'score', 'weight', 'height', 'distance',
|
398 |
+
'time', 'radius', 'width', 'length', 'temperature', 'pressure', 'humidity', 'voltage', 'current', 'resistance'
|
399 |
+
]
|
400 |
+
|
401 |
+
for batch in loop:
|
402 |
+
inputs = batch
|
403 |
+
try:
|
404 |
+
for i in range(len(inputs[0])):
|
405 |
+
l = []
|
406 |
+
l.append(inputs[0][i])
|
407 |
+
l.append(inputs[1][i])
|
408 |
+
model(l)
|
409 |
+
except:
|
410 |
+
continue
|
411 |
+
|
412 |
+
tot_pll/=len(myDs)
|
413 |
+
print('Total PLL per sentence: ')
|
414 |
+
print(tot_pll)
|
415 |
+
base_tot_pll/=len(myDs)
|
416 |
+
print('Total Base PLL per sentence: ')
|
417 |
+
print(base_tot_pll)
|
418 |
+
print("Net % difference average:")
|
419 |
+
tot_diff = (tot_pll-base_tot_pll)*100/base_tot_pll
|
420 |
+
print(tot_diff)
|
421 |
+
|
422 |
+
|
423 |
+
|
424 |
+
# In[ ]:
|
425 |
+
|
426 |
+
|
427 |
+
|
428 |
+
|
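Note on the evaluation above: the forward pass masks every sub-token of the identifier, reveals them left to right, and averages the per-sub-token negative log-probabilities into a pseudo-log-likelihood (PLL), once for the ground-truth name and once for a randomly drawn mock name, with an un-fine-tuned GraphCodeBERT as the baseline. Below is a minimal sketch of that scoring idea, without the 512-token chunking; score_identifier, tok and mlm are illustrative names, not part of this repo.

import math
import torch
import torch.nn.functional as F
from transformers import AutoModelForMaskedLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
mlm = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
mlm.eval()

def score_identifier(code_with_mask: str, name: str) -> float:
    # Average -log p of the identifier's sub-tokens, revealed left to right.
    sub_ids = tok.encode(name, add_special_tokens=False)
    text = code_with_mask.replace("[MASK]", " ".join([tok.mask_token] * len(sub_ids)))
    total = 0.0
    for gold_id in sub_ids:
        enc = tok(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            logits = mlm(**enc).logits[0]
        # Position of the first remaining mask token.
        mask_pos = (enc["input_ids"][0] == tok.mask_token_id).nonzero()[0].item()
        total += -math.log(F.softmax(logits[mask_pos], dim=-1)[gold_id].item())
        # Reveal the scored sub-token before scoring the next one, as the script does.
        text = text.replace(tok.mask_token, tok.convert_ids_to_tokens(gold_id), 1)
    return total / len(sub_ids)

A lower score means the model finds the name more natural in its context; the script reports the percentage difference between the fine-tuned checkpoint and the base model.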
model_eval.py
ADDED
@@ -0,0 +1,283 @@
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[1]:
|
5 |
+
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
from torch import nn
|
11 |
+
from torch.nn import init, MarginRankingLoss
|
12 |
+
from torch.optim import Adam
|
13 |
+
from distutils.version import LooseVersion
|
14 |
+
from torch.utils.data import Dataset, DataLoader
|
15 |
+
from torch.autograd import Variable
|
16 |
+
import math
|
17 |
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
18 |
+
import nltk
|
19 |
+
import re
|
20 |
+
import torch.optim as optim
|
21 |
+
from tqdm import tqdm
|
22 |
+
from transformers import AutoModelForMaskedLM
|
23 |
+
import torch.nn.functional as F
|
24 |
+
import random
|
25 |
+
|
26 |
+
|
27 |
+
# In[2]:
|
28 |
+
|
29 |
+
|
30 |
+
maskis = []
|
31 |
+
n_y = []
|
32 |
+
class MyDataset(Dataset):
|
33 |
+
def __init__(self,file_name):
|
34 |
+
global maskis
|
35 |
+
global n_y
|
36 |
+
df = pd.read_csv(file_name)
|
37 |
+
df = df.fillna("")
|
38 |
+
self.inp_dicts = []
|
39 |
+
for r in range(df.shape[0]):
|
40 |
+
X_init = df['X'][r]
|
41 |
+
y = df['y'][r]
|
42 |
+
n_y.append(y)
|
43 |
+
nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
|
44 |
+
lb = ' '.join(nl).lower()
|
45 |
+
x = tokenizer.tokenize(lb)
|
46 |
+
num_sub_tokens_label = len(x)
|
47 |
+
X_init = X_init.replace("[MASK]", " ".join([tokenizer.mask_token] * num_sub_tokens_label))
|
48 |
+
tokens = tokenizer.encode_plus(X_init, add_special_tokens=False,return_tensors='pt')
|
49 |
+
input_id_chunki = tokens['input_ids'][0].split(510)
|
50 |
+
input_id_chunks = []
|
51 |
+
mask_chunks = []
|
52 |
+
mask_chunki = tokens['attention_mask'][0].split(510)
|
53 |
+
for tensor in input_id_chunki:
|
54 |
+
input_id_chunks.append(tensor)
|
55 |
+
for tensor in mask_chunki:
|
56 |
+
mask_chunks.append(tensor)
|
57 |
+
xi = torch.full((1,), fill_value=tokenizer.cls_token_id)  # <s> id (0 for GraphCodeBERT, not BERT's 101)
|
58 |
+
yi = torch.full((1,), fill_value=1)
|
59 |
+
zi = torch.full((1,), fill_value=tokenizer.sep_token_id)  # </s> id (2 for GraphCodeBERT, not BERT's 102)
|
60 |
+
for r in range(len(input_id_chunks)):
|
61 |
+
input_id_chunks[r] = torch.cat([xi, input_id_chunks[r]],dim = -1)
|
62 |
+
input_id_chunks[r] = torch.cat([input_id_chunks[r],zi],dim=-1)
|
63 |
+
mask_chunks[r] = torch.cat([yi, mask_chunks[r]],dim=-1)
|
64 |
+
mask_chunks[r] = torch.cat([mask_chunks[r],yi],dim=-1)
|
65 |
+
di = torch.full((1,), fill_value=0)
|
66 |
+
for i in range(len(input_id_chunks)):
|
67 |
+
pad_len = 512 - input_id_chunks[i].shape[0]
|
68 |
+
if pad_len > 0:
|
69 |
+
for p in range(pad_len):
|
70 |
+
input_id_chunks[i] = torch.cat([input_id_chunks[i],di],dim=-1)
|
71 |
+
mask_chunks[i] = torch.cat([mask_chunks[i],di],dim=-1)
|
72 |
+
vb = torch.ones_like(input_id_chunks[0])
|
73 |
+
fg = torch.zeros_like(input_id_chunks[0])
|
74 |
+
maski = []
|
75 |
+
for l in range(len(input_id_chunks)):
|
76 |
+
masked_pos = []
|
77 |
+
for i in range(len(input_id_chunks[l])):
|
78 |
+
if input_id_chunks[l][i] == tokenizer.mask_token_id:
|
79 |
+
if i != 0 and input_id_chunks[l][i-1] == tokenizer.mask_token_id:
|
80 |
+
continue
|
81 |
+
masked_pos.append(i)
|
82 |
+
maski.append(masked_pos)
|
83 |
+
maskis.append(maski)
|
84 |
+
while (len(input_id_chunks)<250):
|
85 |
+
input_id_chunks.append(vb)
|
86 |
+
mask_chunks.append(fg)
|
87 |
+
input_ids = torch.stack(input_id_chunks)
|
88 |
+
attention_mask = torch.stack(mask_chunks)
|
89 |
+
input_dict = {
|
90 |
+
'input_ids': input_ids.long(),
|
91 |
+
'attention_mask': attention_mask.int()
|
92 |
+
}
|
93 |
+
self.inp_dicts.append(input_dict)
|
94 |
+
del input_dict
|
95 |
+
del input_ids
|
96 |
+
del attention_mask
|
97 |
+
del maski
|
98 |
+
del mask_chunks
|
99 |
+
del input_id_chunks
|
100 |
+
del di
|
101 |
+
del fg
|
102 |
+
del vb
|
103 |
+
del mask_chunki
|
104 |
+
del input_id_chunki
|
105 |
+
del X_init
|
106 |
+
del y
|
107 |
+
del tokens
|
108 |
+
del x
|
109 |
+
del lb
|
110 |
+
del nl
|
111 |
+
del df
|
112 |
+
def __len__(self):
|
113 |
+
return len(self.inp_dicts)
|
114 |
+
def __getitem__(self,idx):
|
115 |
+
return self.inp_dicts[idx]
|
116 |
+
|
117 |
+
|
118 |
+
# In[3]:
|
119 |
+
|
120 |
+
|
121 |
+
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
|
122 |
+
model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
|
123 |
+
base_model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
|
124 |
+
model.load_state_dict(torch.load('var_runs/model_26_2'))
|
125 |
+
model.eval()
|
126 |
+
base_model.eval()
|
127 |
+
myDs=MyDataset('test.csv')
|
128 |
+
train_loader=DataLoader(myDs,batch_size=1,shuffle=False)
|
129 |
+
|
130 |
+
|
131 |
+
# In[4]:
|
132 |
+
|
133 |
+
|
134 |
+
variable_names = [
|
135 |
+
# One-word Variable Names
|
136 |
+
'count', 'value', 'result', 'flag', 'max', 'min', 'data', 'input', 'output', 'name', 'index', 'status', 'error', 'message', 'price', 'quantity', 'total', 'length', 'size', 'score',
|
137 |
+
|
138 |
+
# Two-word Variable Names
|
139 |
+
'studentName', 'accountBalance', 'isFound', 'maxScore', 'userAge', 'carModel', 'bookTitle', 'arrayLength', 'employeeID', 'itemPrice', 'customerAddress', 'productCategory', 'orderNumber', 'transactionType', 'bankAccount', 'shippingMethod', 'deliveryDate', 'purchaseAmount', 'inventoryItem', 'salesRevenue',
|
140 |
+
|
141 |
+
# Three-word Variable Names
|
142 |
+
'numberOfStudents', 'averageTemperature', 'userIsLoggedIn', 'totalSalesAmount', 'employeeSalaryRate', 'maxAllowedAttempts', 'selectedOption', 'shippingAddress', 'manufacturingDate', 'connectionPool', 'customerAccountBalance', 'employeeSalaryReport', 'productInventoryCount', 'transactionProcessingStatus', 'userAuthenticationToken', 'orderShippingAddress', 'databaseConnectionPoolSize', 'vehicleEngineTemperature', 'sensorDataProcessingRate', 'employeePayrollSystem',
|
143 |
+
|
144 |
+
# Four-word Variable Names
|
145 |
+
'customerAccountBalanceValue', 'employeeSalaryReportData', 'productInventoryItemCount', 'transactionProcessingStatusFlag', 'userAuthenticationTokenKey', 'orderShippingAddressDetails', 'databaseConnectionPoolMaxSize', 'vehicleEngineTemperatureReading', 'sensorDataProcessingRateLimit', 'employeePayrollSystemData', 'customerOrderShippingAddress', 'productCatalogItemNumber', 'transactionProcessingSuccessFlag', 'userAuthenticationAccessToken', 'databaseConnectionPoolConfig', 'vehicleEngineTemperatureSensor', 'sensorDataProcessingRateLimitation', 'employeePayrollSystemConfiguration', 'customerAccountBalanceHistoryData', 'transactionProcessingStatusTracking'
|
146 |
+
]
|
147 |
+
var_list = []
|
148 |
+
for j in range(6):
|
149 |
+
d =[]
|
150 |
+
var_list.append(d)
|
151 |
+
for var in variable_names:
|
152 |
+
try:
|
153 |
+
var_list[len(tokenizer.tokenize(var))-1].append(var)
|
154 |
+
except:
|
155 |
+
continue
|
156 |
+
|
157 |
+
|
158 |
+
# In[5]:
|
159 |
+
|
160 |
+
|
161 |
+
tot_pll = 0.0
|
162 |
+
base_tot_pll = 0.0
|
163 |
+
loop = tqdm(train_loader, leave=True)
|
164 |
+
cntr = 0
|
165 |
+
for batch in loop:
|
166 |
+
maxi = torch.tensor(0.0, requires_grad=True)
|
167 |
+
for i in range(len(batch['input_ids'])):
|
168 |
+
cntr+=1
|
169 |
+
maski = maskis[cntr-1]
|
170 |
+
li = len(maski)
|
171 |
+
input_ids = batch['input_ids'][i][:li]
|
172 |
+
att_mask = batch['attention_mask'][i][:li]
|
173 |
+
y = n_y[cntr-1]
|
174 |
+
ty = tokenizer.encode(y)[1:-1]
|
175 |
+
num_sub_tokens_label = len(ty)
|
176 |
+
if num_sub_tokens_label > 6:
|
177 |
+
continue
|
178 |
+
print("Ground truth:", y)
|
179 |
+
m_y = random.choice(var_list[num_sub_tokens_label-1])
|
180 |
+
m_ty = tokenizer.encode(m_y)[1:-1]
|
181 |
+
print("Mock truth:", m_y)
|
182 |
+
# input_ids, att_mask = input_ids.to(device),att_mask.to(device)
|
183 |
+
outputs = model(input_ids, attention_mask = att_mask)
|
184 |
+
base_outputs = base_model(input_ids, attention_mask = att_mask)
|
185 |
+
last_hidden_state = outputs[0].squeeze()
|
186 |
+
base_last_hidden_state = base_outputs[0].squeeze()
|
187 |
+
l_o_l_sa = []
|
188 |
+
base_l_o_l_sa = []
|
189 |
+
sum_state = []
|
190 |
+
base_sum_state = []
|
191 |
+
for t in range(num_sub_tokens_label):
|
192 |
+
c = []
|
193 |
+
d = []
|
194 |
+
l_o_l_sa.append(c)
|
195 |
+
base_l_o_l_sa.append(d)
|
196 |
+
if len(maski) == 1:
|
197 |
+
masked_pos = maski[0]
|
198 |
+
for k in masked_pos:
|
199 |
+
for t in range(num_sub_tokens_label):
|
200 |
+
l_o_l_sa[t].append(last_hidden_state[k+t])
|
201 |
+
base_l_o_l_sa[t].append(base_last_hidden_state[k+t])
|
202 |
+
else:
|
203 |
+
for p in range(len(maski)):
|
204 |
+
masked_pos = maski[p]
|
205 |
+
for k in masked_pos:
|
206 |
+
for t in range(num_sub_tokens_label):
|
207 |
+
if (k+t) >= len(last_hidden_state[p]):
|
208 |
+
l_o_l_sa[t].append(last_hidden_state[p+1][k+t-len(last_hidden_state[p])])
|
209 |
+
base_l_o_l_sa[t].append(base_last_hidden_state[p+1][k+t-len(base_last_hidden_state[p])])
|
210 |
+
continue
|
211 |
+
l_o_l_sa[t].append(last_hidden_state[p][k+t])
|
212 |
+
base_l_o_l_sa[t].append(base_last_hidden_state[p][k+t])
|
213 |
+
for t in range(num_sub_tokens_label):
|
214 |
+
sum_state.append(l_o_l_sa[t][0])
|
215 |
+
base_sum_state.append(base_l_o_l_sa[t][0])
|
216 |
+
for i in range(len(l_o_l_sa[0])):
|
217 |
+
if i == 0:
|
218 |
+
continue
|
219 |
+
for t in range(num_sub_tokens_label):
|
220 |
+
sum_state[t] = sum_state[t] + l_o_l_sa[t][i]
|
221 |
+
base_sum_state[t] = base_sum_state[t] + base_l_o_l_sa[t][i]
|
222 |
+
yip = len(l_o_l_sa[0])
|
223 |
+
val = 0.0
|
224 |
+
m_val = 0.0
|
225 |
+
m_base_val = 0.0
|
226 |
+
base_val = 0.0
|
227 |
+
for t in range(num_sub_tokens_label):
|
228 |
+
sum_state[t] /= yip
|
229 |
+
base_sum_state[t] /= yip
|
230 |
+
probs = F.softmax(sum_state[t], dim=0)
|
231 |
+
base_probs = F.softmax(base_sum_state[t], dim=0)
|
232 |
+
val = val - torch.log(probs[ty[t]])
|
233 |
+
m_val = m_val - torch.log(probs[m_ty[t]])
|
234 |
+
base_val = base_val - torch.log(base_probs[ty[t]])
|
235 |
+
m_base_val = m_base_val - torch.log(base_probs[m_ty[t]])
|
236 |
+
val = val / num_sub_tokens_label
|
237 |
+
base_val = base_val / num_sub_tokens_label
|
238 |
+
m_val = m_val / num_sub_tokens_label
|
239 |
+
m_base_val = m_base_val / num_sub_tokens_label
|
240 |
+
print("Sent PLL:")
|
241 |
+
print(val)
|
242 |
+
print("Base Sent PLL:")
|
243 |
+
print(base_val)
|
244 |
+
print("Net % difference:")
|
245 |
+
diff = (val-base_val)*100/base_val
|
246 |
+
print(diff)
|
247 |
+
tot_pll += val
|
248 |
+
base_tot_pll+=base_val
|
249 |
+
print()
|
250 |
+
print()
|
251 |
+
print("Mock Sent PLL:")
|
252 |
+
print(m_val)
|
253 |
+
print("Mock Base Sent PLL:")
|
254 |
+
print(m_base_val)
|
255 |
+
print("Mock Net % difference:")
|
256 |
+
m_diff = (m_val-m_base_val)*100/m_base_val
|
257 |
+
print(m_diff)
|
258 |
+
for c in sum_state:
|
259 |
+
del c
|
260 |
+
for d in base_sum_state:
|
261 |
+
del d
|
262 |
+
del sum_state
|
263 |
+
del base_sum_state
|
264 |
+
for c in l_o_l_sa:
|
265 |
+
del c
|
266 |
+
for c in base_l_o_l_sa:
|
267 |
+
del c
|
268 |
+
del l_o_l_sa
|
269 |
+
del base_l_o_l_sa
|
270 |
+
del maski
|
271 |
+
del input_ids
|
272 |
+
del att_mask
|
273 |
+
del last_hidden_state
|
274 |
+
del base_last_hidden_state
|
275 |
+
print("Tot PLL: ", tot_pll)
|
276 |
+
print("Base Tot PLL: ", base_tot_pll)
|
277 |
+
|
278 |
+
|
279 |
+
# In[ ]:
|
280 |
+
|
281 |
+
|
282 |
+
|
283 |
+
|
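Both evaluation scripts split inputs longer than 512 tokens into 510-token windows, re-attach the special tokens by hand, and pad each window; model_eval.py additionally pads every item to 250 windows so the DataLoader can stack them, and records the true mask positions in maskis. A compact sketch of that chunking step under the same assumptions, using the tokenizer's own special-token ids instead of hard-coded values; chunk_encode is an illustrative name, not a function from this repo.

import torch
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")

def chunk_encode(text: str, window: int = 512):
    # Tokenize without special tokens, then cut into (window - 2)-sized pieces.
    enc = tok(text, add_special_tokens=False, return_tensors="pt")
    id_chunks = list(enc["input_ids"][0].split(window - 2))
    mask_chunks = list(enc["attention_mask"][0].split(window - 2))
    cls_id = torch.tensor([tok.cls_token_id])
    sep_id = torch.tensor([tok.sep_token_id])
    one = torch.tensor([1])
    for i in range(len(id_chunks)):
        # Re-attach <s> ... </s> and the matching attention-mask entries.
        id_chunks[i] = torch.cat([cls_id, id_chunks[i], sep_id])
        mask_chunks[i] = torch.cat([one, mask_chunks[i], one])
        pad = window - id_chunks[i].shape[0]
        if pad > 0:
            id_chunks[i] = torch.cat([id_chunks[i], torch.full((pad,), tok.pad_token_id)])
            mask_chunks[i] = torch.cat([mask_chunks[i], torch.zeros(pad, dtype=torch.long)])
    return {
        "input_ids": torch.stack(id_chunks).long(),
        "attention_mask": torch.stack(mask_chunks).long(),
    }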
model_test.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
procTest.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
stat_sampling.py
ADDED
@@ -0,0 +1,157 @@
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[73]:
|
5 |
+
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
import torch
|
10 |
+
from transformers import RobertaTokenizer, RobertaForSequenceClassification
|
11 |
+
from torch import nn
|
12 |
+
from torch.nn import init, MarginRankingLoss
|
13 |
+
from transformers import BertModel, RobertaModel
|
14 |
+
from transformers import BertTokenizer, RobertaTokenizer
|
15 |
+
from torch.optim import Adam
|
16 |
+
from distutils.version import LooseVersion
|
17 |
+
from torch.utils.data import Dataset, DataLoader
|
18 |
+
from torch.utils.tensorboard import SummaryWriter
|
19 |
+
from datetime import datetime
|
20 |
+
from torch.autograd import Variable
|
21 |
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
22 |
+
import nltk
|
23 |
+
import re
|
24 |
+
import Levenshtein
|
25 |
+
import spacy
|
26 |
+
import en_core_web_sm
|
27 |
+
import torch.optim as optim
|
28 |
+
from torch.distributions import Categorical
|
29 |
+
from numpy import linalg as LA
|
30 |
+
from transformers import AutoModelForMaskedLM
|
31 |
+
from nltk.corpus import wordnet
|
32 |
+
import torch.nn.functional as F
|
33 |
+
import random
|
34 |
+
from transformers import get_linear_schedule_with_warmup
|
35 |
+
from sklearn.metrics import precision_recall_fscore_support
|
36 |
+
from nltk.corpus import words as wal
|
37 |
+
from sklearn.utils import resample
|
38 |
+
|
39 |
+
|
40 |
+
# In[56]:
|
41 |
+
|
42 |
+
|
43 |
+
class MyDataset(Dataset):
|
44 |
+
def __init__(self,file_name):
|
45 |
+
df1 = pd.read_csv(file_name)
|
46 |
+
df1 = df1[230000:]
|
47 |
+
df1 = df1.fillna("")
|
48 |
+
res = df1['X']
|
49 |
+
# ab = df1['X']
|
50 |
+
# res = [sub.replace("<mask>", "[MASK]") for sub in ab]
|
51 |
+
self.X_list = res.to_numpy()
|
52 |
+
self.y_list = df1['y'].to_numpy()
|
53 |
+
def __len__(self):
|
54 |
+
return len(self.X_list)
|
55 |
+
def __getitem__(self,idx):
|
56 |
+
mapi = []
|
57 |
+
mapi.append(self.X_list[idx])
|
58 |
+
mapi.append(self.y_list[idx])
|
59 |
+
return mapi
|
60 |
+
|
61 |
+
|
62 |
+
# In[59]:
|
63 |
+
|
64 |
+
|
65 |
+
class Step1_model(nn.Module):
|
66 |
+
def __init__(self, hidden_size=512):
|
67 |
+
super(Step1_model, self).__init__()
|
68 |
+
self.hidden_size = hidden_size
|
69 |
+
self.tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
|
70 |
+
|
71 |
+
def forward(self, mapi):
|
72 |
+
y = mapi[1]
|
73 |
+
print(y)
|
74 |
+
nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
|
75 |
+
lb = ' '.join(nl).lower()
|
76 |
+
x = tokenizer.tokenize(lb)
|
77 |
+
nlab = len(x)
|
78 |
+
print(nlab)
|
79 |
+
rand_no = random.random()
|
80 |
+
tok_map = {2: 0.4363429005892416,
|
81 |
+
1: 0.6672580202327398,
|
82 |
+
4: 0.7476060740459144,
|
83 |
+
3: 0.9618703668504087,
|
84 |
+
6: 0.9701028532809564,
|
85 |
+
7: 0.9729244545819342,
|
86 |
+
8: 0.9739508754144756,
|
87 |
+
5: 0.9994508859743607,
|
88 |
+
9: 0.9997507867114407,
|
89 |
+
10: 0.9999112969650892,
|
90 |
+
11: 0.9999788802297832,
|
91 |
+
0: 0.9999831041838266,
|
92 |
+
12: 0.9999873281378701,
|
93 |
+
22: 0.9999957760459568,
|
94 |
+
14: 1.0000000000000002}
|
95 |
+
for key in tok_map.keys():
|
96 |
+
if rand_no < tok_map[key]:
|
97 |
+
pred = key
|
98 |
+
break
|
99 |
+
predicted = torch.tensor([pred], dtype = float)
|
100 |
+
if pred == nlab:
|
101 |
+
l2 = 0
|
102 |
+
else:
|
103 |
+
l2 = 1
|
104 |
+
actual = torch.tensor([nlab], dtype = float)
|
105 |
+
l1 = Variable(torch.tensor([(actual-predicted)**2],dtype=float),requires_grad = True)
|
106 |
+
return {'loss':l1, 'actual_pred':pred, 'acc': l2}
|
107 |
+
|
108 |
+
|
109 |
+
# In[60]:
|
110 |
+
|
111 |
+
|
112 |
+
epoch_number = 0
|
113 |
+
EPOCHS = 5
|
114 |
+
run_int = 0
|
115 |
+
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
|
116 |
+
model = Step1_model()
|
117 |
+
myDs=MyDataset('dat_test.csv')
|
118 |
+
train_loader=DataLoader(myDs,batch_size=2,shuffle=True)
|
119 |
+
best_loss = torch.full((1,), fill_value=100000)
|
120 |
+
|
121 |
+
|
122 |
+
flag = 0
|
123 |
+
def train_one_epoch(transformer_model, dataset):
|
124 |
+
global flag
|
125 |
+
tot_loss1 = 0.0
|
126 |
+
tot_loss2 = 0.0
|
127 |
+
cnt = 0
|
128 |
+
for batch in dataset:
|
129 |
+
p = 0
|
130 |
+
inputs = batch
|
131 |
+
for i in range(len(inputs[0])):
|
132 |
+
cnt += 1
|
133 |
+
l = []
|
134 |
+
l.append(inputs[0][i])
|
135 |
+
l.append(inputs[1][i])
|
136 |
+
opi = transformer_model(l)
|
137 |
+
loss1 = opi['loss']
|
138 |
+
loss2 = opi['acc']
|
139 |
+
tot_loss1 += loss1
|
140 |
+
tot_loss2 += loss2
|
141 |
+
|
142 |
+
tot_loss1/=cnt
|
143 |
+
tot_loss2/=cnt
|
144 |
+
print('MSE loss: ')
|
145 |
+
print(tot_loss1)
|
146 |
+
print('accuracy: ')
|
147 |
+
print(tot_loss2)
|
148 |
+
return {'MSE loss': tot_loss1, 'accuracy': tot_loss2}
|
149 |
+
|
150 |
+
model.eval()
|
151 |
+
avg_loss = train_one_epoch(model,train_loader)
|
152 |
+
|
153 |
+
|
154 |
+
|
155 |
+
|
156 |
+
|
157 |
+
|
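stat_sampling.py predicts how many sub-tokens a variable name should have by drawing from a hard-coded empirical cumulative distribution (tok_map) and reports squared error and exact-match accuracy against the true count. The same draw can be restated with bisect over the cumulative values; the numbers below are copied from tok_map, while the function name and structure are illustrative only.

import bisect
import random

# (cumulative probability, predicted sub-token count), in the same order as tok_map
CUM = [(0.4363429005892416, 2), (0.6672580202327398, 1), (0.7476060740459144, 4),
       (0.9618703668504087, 3), (0.9701028532809564, 6), (0.9729244545819342, 7),
       (0.9739508754144756, 8), (0.9994508859743607, 5), (0.9997507867114407, 9),
       (0.9999112969650892, 10), (0.9999788802297832, 11), (0.9999831041838266, 0),
       (0.9999873281378701, 12), (0.9999957760459568, 22), (1.0000000000000002, 14)]

def sample_num_subtokens(rng=random) -> int:
    # First entry whose cumulative probability exceeds the random draw,
    # mirroring the "rand_no < tok_map[key]" walk in the script above.
    r = rng.random()
    idx = bisect.bisect_right([c for c, _ in CUM], r)
    return CUM[min(idx, len(CUM) - 1)][1]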
test.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|