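"""Fine-tune GraphCodeBERT's masked-language-model head to recover masked variable names.

Each row of the input CSV provides a code snippet `X` containing a single "[MASK]"
placeholder and the ground-truth identifier `y`. The placeholder is expanded to as many
mask tokens as `y` has sub-tokens, long inputs are split into 512-token chunks, and the
name is predicted one sub-token at a time, feeding each prediction back into the input.
"""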

import math
import random
import re
from distutils.version import LooseVersion

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch import nn
from torch.autograd import Variable
from torch.nn import init, MarginRankingLoss
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoConfig, AutoModel, AutoModelForMaskedLM, AutoTokenizer


def freeze(model):
    # Freeze encoder layers 0-7 of the RoBERTa backbone; all remaining parameters stay trainable.
    # The trailing dot keeps "layer.1" from also matching "layer.10" and "layer.11".
    frozen_prefixes = tuple("model.roberta.encoder.layer.{}.".format(i) for i in range(8))
    for name, param in model.named_parameters():
        param.requires_grad = not name.startswith(frozen_prefixes)
    return model
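

# Example usage (not called in this script): apply freeze() before building the
# optimizer so that only the unfrozen parameters receive gradient updates, e.g.
#     model = freeze(model)
#     optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)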


# Module-level buffers filled by MyDataset and consumed by the training loop:
# maskis[i] holds, for each 512-token chunk of sample i, the start positions of its
# mask-token runs; n_y[i] holds the ground-truth identifier for sample i.
maskis = []
n_y = []


class MyDataset(Dataset):
    """Dataset of chunked, mask-expanded code snippets read from a CSV with columns 'X' and 'y'."""

    def __init__(self, file_name):
        global maskis
        global n_y
        df = pd.read_csv(file_name)
        # Shuffle the rows and reset the index so positional access follows the shuffled order.
        df = df.sample(frac=1).reset_index(drop=True)
        df = df.fillna("")
        self.inp_dicts = []
        tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
        for r in range(df.shape[0]):
            X_init = df['X'][r]
            y = df['y'][r]
            n_y.append(y)
            # Split the identifier into lower-cased word pieces and count the label's sub-tokens.
            nl = re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))|[a-z]+|\d+', y)
            lb = ' '.join(nl).lower()
            x = tokenizer.tokenize(lb)
            num_sub_tokens_label = len(x)
            # Expand the single "[MASK]" placeholder to one mask token per label sub-token.
            X_init = X_init.replace("[MASK]", " ".join([tokenizer.mask_token] * num_sub_tokens_label))
            tokens = tokenizer.encode_plus(X_init, add_special_tokens=False, return_tensors='pt')
            # Split the tokenised snippet into chunks of at most 510 tokens so that,
            # with CLS and SEP added, each chunk fits the 512-token context window.
            input_id_chunki = tokens['input_ids'][0].split(510)
            mask_chunki = tokens['attention_mask'][0].split(510)
            input_id_chunks = list(input_id_chunki)
            mask_chunks = list(mask_chunki)
            # Wrap every chunk in the model's CLS/SEP special tokens and extend the attention mask.
            xi = torch.full((1,), fill_value=tokenizer.cls_token_id)
            yi = torch.full((1,), fill_value=1)
            zi = torch.full((1,), fill_value=tokenizer.sep_token_id)
            for c in range(len(input_id_chunks)):
                input_id_chunks[c] = torch.cat([xi, input_id_chunks[c], zi], dim=-1)
                mask_chunks[c] = torch.cat([yi, mask_chunks[c], yi], dim=-1)
            # Right-pad every chunk to exactly 512 positions (padding positions get attention mask 0).
            di = torch.full((1,), fill_value=0)
            for i in range(len(input_id_chunks)):
                pad_len = 512 - input_id_chunks[i].shape[0]
                if pad_len > 0:
                    input_id_chunks[i] = torch.cat([input_id_chunks[i], di.repeat(pad_len)], dim=-1)
                    mask_chunks[i] = torch.cat([mask_chunks[i], di.repeat(pad_len)], dim=-1)
            # Templates used below to pad the number of chunks up to a fixed count.
            vb = torch.ones_like(input_id_chunks[0])
            fg = torch.zeros_like(input_id_chunks[0])
            # Record, per real chunk, the start position of every contiguous run of mask tokens.
            # len(maski) therefore equals the number of real chunks, which the training loop uses
            # to slice the padding chunks back off.
            maski = []
            for l in range(len(input_id_chunks)):
                masked_pos = []
                for i in range(len(input_id_chunks[l])):
                    if input_id_chunks[l][i] == tokenizer.mask_token_id:
                        if i != 0 and input_id_chunks[l][i - 1] == tokenizer.mask_token_id:
                            continue
                        masked_pos.append(i)
                maski.append(masked_pos)
            maskis.append(maski)
            # Pad the number of chunks up to 250 so every sample has the same tensor shape.
            while len(input_id_chunks) < 250:
                input_id_chunks.append(vb)
                mask_chunks.append(fg)
            input_ids = torch.stack(input_id_chunks)
            attention_mask = torch.stack(mask_chunks)
            input_dict = {
                'input_ids': input_ids.long(),
                'attention_mask': attention_mask.int()
            }
            self.inp_dicts.append(input_dict)
            del input_dict
            del input_ids
            del attention_mask
            del maski
            del mask_chunks
            del input_id_chunks
            del di
            del fg
            del vb
            del mask_chunki
            del input_id_chunki
            del X_init
            del y
            del tokens
            del x
            del lb
            del nl
        del df

    def __len__(self):
        return len(self.inp_dicts)

    def __getitem__(self, idx):
        return self.inp_dicts[idx]
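

# Rough shape check (a sketch; assumes a 'dat1.csv' with 'X' and 'y' columns is present):
#     ds = MyDataset('dat1.csv')
#     ds[0]['input_ids'].shape       # torch.Size([250, 512])
#     ds[0]['attention_mask'].shape  # torch.Size([250, 512])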


def my_func():
    """Train the masked-LM head to predict each sample's variable name, one sub-token at a time."""
    global maskis
    global n_y
    epoch_number = 0
    EPOCHS = 5
    run_int = 26
    tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
    model = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")
    optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
    myDs = MyDataset('dat1.csv')
    # shuffle=False keeps the loader order aligned with the maskis / n_y buffers filled by MyDataset.
    train_loader = DataLoader(myDs, batch_size=1, shuffle=False)
    best_loss = torch.full((1,), fill_value=100000)
    for epoch in range(EPOCHS):
        loop = tqdm(train_loader, leave=True)
        tot_loss = 0.0
        cntr = 0
        for batch in loop:
            try:
                optimizer.zero_grad()
                maxi = torch.tensor(0.0, requires_grad=True)
                for i in range(len(batch['input_ids'])):
                    cntr += 1
                    maski = maskis[cntr - 1]
                    li = len(maski)
                    # Keep only the real (non-padding) chunks of this sample.
                    input_ids = batch['input_ids'][i][:li]
                    att_mask = batch['attention_mask'][i][:li]
                    y = n_y[cntr - 1]
                    print("Ground truth:", y)
                    ty = tokenizer.encode(y)[1:-1]
                    num_sub_tokens_label = len(ty)
                    qw = ""
                    val = torch.tensor(0.0, requires_grad=True)
                    # Predict the name one sub-token at a time, feeding each prediction back in.
                    for m in range(num_sub_tokens_label):
                        outputs = model(input_ids, attention_mask=att_mask)
                        logits = outputs[0].squeeze()  # vocabulary logits for every position
                        l_o_l_sa = []
                        # Collect the logits at the m-th position of every mask run.
                        if len(maski) == 1:
                            masked_pos = maski[0]
                            for k in masked_pos:
                                l_o_l_sa.append(logits[k + m])
                        else:
                            for p in range(len(maski)):
                                masked_pos = maski[p]
                                for k in masked_pos:
                                    if (k + m) >= len(logits[p]):
                                        # The mask run spills over into the next chunk.
                                        l_o_l_sa.append(logits[p + 1][k + m - len(logits[p])])
                                        continue
                                    l_o_l_sa.append(logits[p][k + m])
                        # Average the collected logits and score the ground-truth sub-token.
                        sum_state = l_o_l_sa[0]
                        for j in range(1, len(l_o_l_sa)):
                            sum_state = sum_state + l_o_l_sa[j]
                        sum_state = sum_state / len(l_o_l_sa)
                        probs = F.softmax(sum_state, dim=0)
                        val = val - torch.log(probs[ty[m]])
                        # Build a readable prediction from the first purely alphabetic top-5 candidate.
                        idx = torch.topk(sum_state, k=5, dim=0)[1]
                        wor = [tokenizer.decode(t.item()).strip() for t in idx]
                        for kl in wor:
                            if all(char.isalpha() for char in kl):
                                qw += kl.capitalize()
                                break
                        # Write the top-1 prediction into the mask positions so the next
                        # sub-token is predicted conditioned on the ones already filled in.
                        des = input_ids.clone()
                        if len(maski) == 1:
                            masked_pos = maski[0]
                            for k in masked_pos:
                                des[0][k + m] = idx[0]
                        else:
                            for p in range(len(maski)):
                                masked_pos = maski[p]
                                for k in masked_pos:
                                    if (k + m) >= len(des[p]):
                                        des[p + 1][k + m - len(des[p])] = idx[0]
                                        continue
                                    des[p][k + m] = idx[0]
                        del input_ids
                        input_ids = des
                        del sum_state
                        del l_o_l_sa
                        del logits
                    # Average the negative log-likelihood over the label's sub-tokens.
                    val = val / num_sub_tokens_label
                    maxi = maxi + val
                    print("Prediction: ", qw)
                    print("*****")
                    del maski
                    del input_ids
                    del att_mask
                    del qw
                tot_loss += maxi.item()
                maxi = maxi / len(batch['input_ids'])
                maxi.backward()
                optimizer.step()
                # Save a mid-epoch checkpoint every 200 samples.
                if cntr % 200 == 0:
                    checkpoint = {
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'cntr': cntr
                    }
                    model_path = 'var_runs_iter/model_{}_{}_{}.pth'.format(run_int, epoch, cntr)
                    torch.save(checkpoint, model_path)

                loop.set_description(f'Epoch {epoch}')
                loop.set_postfix(loss=maxi.item())
            except Exception:
                continue
        # Epoch-level bookkeeping: report the mean loss and keep the best model weights.
        tot_loss /= len(myDs)
        print(tot_loss)
        if tot_loss < best_loss:
            best_loss = tot_loss
            model_path = 'var_runs_iter/model_{}_{}'.format(run_int, epoch)
            torch.save(model.state_dict(), model_path)


if __name__ == "__main__":
    my_func()