|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Fine-tune and evaluate a RoBERTa-based sequence-to-sequence model for code completion.

Two tasks are supported, ``statement_level`` and ``next_statement``. Examples are read from

JSON-lines files; evaluation reports exact-match and fuzzy edit-similarity scores.
|
""" |
|
|
|
from __future__ import absolute_import |
|
import os |
|
import sys |
|
import pickle |
|
import torch |
|
import json |
|
|
|
import random |
|
import logging |
|
import argparse |
|
import numpy as np |
|
from io import open |
|
from itertools import cycle |
|
import torch.nn as nn |
|
from model import Seq2Seq |
|
from tqdm import tqdm, trange |
|
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset |
|
from torch.utils.data.distributed import DistributedSampler |
|
from tqdm import tqdm |
|
from fuzzywuzzy import fuzz |
|
import re |
|
import multiprocessing |
|
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, |
|
RobertaConfig, RobertaModel, RobertaTokenizer) |
|
|
|
# Module-level constants; neither is referenced in the visible part of this file.
cpu_cont = 16
divide_number = 2

# Configure root logging at import time so every module logger shares one format.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)
logger = logging.getLogger(__name__)
|
|
|
|
|
class Example(object):
    """Container for one raw training/test example.

    Attributes are stored exactly as passed in:
        idx: identifier of the example.
        source: source text.
        target: target text.
        max_src_len: source-length statistic supplied by the caller.
        max_tar_len: target-length statistic supplied by the caller.
    """

    def __init__(self, idx, source, target, max_src_len, max_tar_len):
        # Plain value holder: no validation or conversion is performed.
        self.idx, self.source, self.target = idx, source, target
        self.max_src_len, self.max_tar_len = max_src_len, max_tar_len
|
|
|
def read_examples(filename):
    """Read examples from a JSON-lines file.

    Every non-blank line must be a JSON object containing a ``Template_token``
    list. The first token is dropped and the remaining tokens are joined with
    single spaces to form the source text. When the object has a
    ``ground_truth`` list, its tokens are joined the same way to form the
    target; otherwise the target is the source text. An ``Idx`` field, when
    present, replaces the zero-based line number as the example id.

    The ``max_src_len`` / ``max_tar_len`` values stored on each example are
    running maxima over the lines read so far (token counts; the source count
    includes the dropped first token).

    Args:
        filename: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of ``Example`` objects in file order.
    """
    examples = []
    max_src_len = 0
    max_tar_len = 0

    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue

            js = json.loads(line)
            template_tokens = js["Template_token"]
            inputs = " ".join(template_tokens[1:])
            max_src_len = max(max_src_len, len(template_tokens))

            if "ground_truth" in js:
                ground_truth = js["ground_truth"]
                outputs = " ".join(ground_truth)
                # Fix: track the target maximum against itself; the previous
                # code compared against max_src_len and so mixed the two.
                max_tar_len = max(max_tar_len, len(ground_truth))
            else:
                outputs = inputs

            # An explicit id in the record takes precedence over the line number.
            if 'Idx' in js:
                idx = js['Idx']

            examples.append(
                Example(
                    idx=idx,
                    source=inputs,
                    target=outputs,
                    max_src_len=max_src_len,
                    max_tar_len=max_tar_len,
                )
            )
    return examples
|
|
|
|
|
class InputFeatures(object):
    """Container for the token-id features derived from one example.

    Attributes are stored exactly as passed in:
        example_id: identifier of the originating example.
        source_ids: token ids for the source sequence.
        target_ids: token ids for the target sequence.
    """

    def __init__(self, example_id, source_ids, target_ids):
        # Plain value holder: the lists are kept by reference, not copied.
        self.target_ids = target_ids
        self.source_ids = source_ids
        self.example_id = example_id
|
|
|
def convert_examples_to_features(examples, tokenizer, args,stage=None):
    """Turn examples into fixed-width source/target token-id lists.

    Source layout:  cls, sep, <source tokens>, "<mask>", sep, then padding up
    to ``args.max_source_length``. The source tokens are truncated to
    ``args.max_source_length - 5``.

    Target layout:  for ``stage == "test"`` the tokens of the literal string
    "None" followed by sep; otherwise "<mask>", the target tokens truncated to
    ``args.max_target_length - 2``, then sep. Padded up to
    ``args.max_target_length``.

    Args:
        examples: iterable of objects exposing ``source`` and ``target`` text.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id``.
        args: object providing ``max_source_length`` and ``max_target_length``.
        stage: "test" selects the placeholder target; any other value uses
            the example's real target.

    Returns:
        A list of ``InputFeatures``; each ``example_id`` is the example's
        position in ``examples`` (not ``example.idx``).
    """

    def _padded(ids, width):
        # A negative shortfall multiplies to an empty list, so over-long
        # sequences are returned unpadded rather than truncated.
        return ids + [tokenizer.pad_token_id] * (width - len(ids))

    features = []
    for position, example in enumerate(examples):
        # ---- source side ----
        src_body = tokenizer.tokenize(example.source)[:args.max_source_length-5]
        src_tokens = [tokenizer.cls_token, tokenizer.sep_token, *src_body, "<mask>", tokenizer.sep_token]
        source_ids = _padded(tokenizer.convert_tokens_to_ids(src_tokens), args.max_source_length)

        # ---- target side ----
        if stage != "test":
            tgt_tokens = ["<mask>"] + tokenizer.tokenize(example.target)[:args.max_target_length-2]
        else:
            # No gold target is encoded at test time; a placeholder is used.
            tgt_tokens = tokenizer.tokenize("None")
        tgt_tokens = tgt_tokens + [tokenizer.sep_token]
        target_ids = _padded(tokenizer.convert_tokens_to_ids(tgt_tokens), args.max_target_length)

        features.append(InputFeatures(position, source_ids, target_ids))
    return features
|
|
|
|
|
|
|
def set_seed(seed=20240124):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    asks cuDNN for deterministic kernels, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Fix: the variable was misspelled "PYHTONHASHSEED", so it had no effect.
    # Setting it here only influences interpreters started afterwards
    # (e.g. worker subprocesses), not the hash seed of the running process.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one, since the model may be
    # wrapped in DataParallel. This call is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
|
|
|
|
|
def _build_arg_parser():
    """Build the command-line parser for this script."""
    parser = argparse.ArgumentParser()

    # Model, checkpoint and task selection.
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model: e.g. roberta-base" )
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path to trained model: Should contain the .bin files" )

    parser.add_argument("--task", default=None, type=str, required=True,
                        help="Task Type: statement_level, next_statement" )

    # Dataset roots.
    parser.add_argument("--train_filename", default="../../Dataset/", type=str,
                        help="The train filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--dev_filename", default="../../Dataset/", type=str,
                        help="The dev filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--test_filename", default="../../Dataset/", type=str,
                        help="The test filename. Should contain the .jsonl files for this task.")

    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")

    # Phase switches.
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    # Fix: the help text previously repeated the --do_eval description.
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--test_org", action='store_true',
                        help="Whether to run eval on org model.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")

    # Optimisation and decoding hyper-parameters. Several of these are accepted
    # for command-line compatibility but are not read anywhere in this block.
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--beam_size", default=10, type=int,
                        help="beam size for beam search")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--eval_steps", default=-1, type=int,
                        help="")
    parser.add_argument("--max_target_length", default=128, type=int,
                        help="")
    parser.add_argument("--max_source_length", default=384, type=int,
                        help="")
    parser.add_argument("--train_steps", default=-1, type=int,
                        help="")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--seed', type=int, default=20240124,
                        help="random seed for initialization")
    return parser


def _task_subdir(task):
    """Return the dataset/checkpoint sub-directory name used for ``task``."""
    # Any value other than "statement_level" is treated as "next_statement".
    return "statement_level" if task == "statement_level" else "next_statement"


def _load_checkpoint(model, load_model_path, task):
    """Load ``<load_model_path>/<task dir>/pytorch_model.bin`` into ``model``."""
    checkpoint = load_model_path + "/" + _task_subdir(task) + "/pytorch_model.bin"
    logger.info("reload model from {}".format(checkpoint))
    # map_location="cpu" lets a checkpoint saved from a GPU be restored on a
    # CPU-only host; load_state_dict then copies onto the model's own device.
    model.load_state_dict(torch.load(checkpoint, map_location="cpu"))


def _source_only_dataset(examples, tokenizer, args):
    """Encode ``examples`` with stage='test' and wrap the source ids in a dataset."""
    features = convert_examples_to_features(examples, tokenizer, args, stage='test')
    source_ids = torch.tensor([f.source_ids for f in features], dtype=torch.long)
    return TensorDataset(source_ids)


def _generate_predictions(model, dataloader, tokenizer, device, show_progress=False):
    """Run generation over ``dataloader`` and return one decoded string per input."""
    model.eval()
    texts = []
    batches = tqdm(dataloader, total=len(dataloader)) if show_progress else dataloader
    for batch in batches:
        source_ids = batch[0].to(device)
        with torch.no_grad():
            preds = model(source_ids)
        for pred in preds:
            # Only the first hypothesis of each prediction is kept.
            token_ids = list(pred[0].cpu().numpy())
            # Everything from the first id 0 onwards is discarded
            # (presumably the decoder's padding value - confirm in Seq2Seq).
            if 0 in token_ids:
                token_ids = token_ids[:token_ids.index(0)]
            texts.append(tokenizer.decode(token_ids, clean_up_tokenization_spaces=False))
    model.train()
    return texts


def _write_results(path, rows):
    """Write ``[prediction, ground_truth]`` pairs to ``path`` as JSON lines."""
    with open(path, 'w') as wf:
        for pred, gt in rows:
            wf.write(json.dumps({"Pred": pred, "GT": gt}))
            wf.write("\n")


def _run_training(args, model, tokenizer, device):
    """Train ``model``; with --do_eval, evaluate after every epoch.

    The best checkpoint (by dev exact-match) is saved under
    ``<output_dir>/<task dir>/pytorch_model.bin`` and training stops after
    three consecutive epochs without improvement. Checkpoints are only written
    during evaluation, so training without --do_eval saves nothing.
    """
    subdir = _task_subdir(args.task)
    output_dir = os.path.join(args.output_dir, subdir)
    accum = args.gradient_accumulation_steps

    train_examples = read_examples(args.train_filename + "/Code_Completion/" + subdir + "/train.jsonl")
    train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train')
    all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
    all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_source_ids, all_target_ids)
    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data),
                                  batch_size=args.train_batch_size // accum)

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    # Fix: the scheduler advances once per optimizer step, i.e. once every
    # `accum` batches, so the schedule length must be scaled accordingly.
    # With accum == 1 this equals the previous value.
    total_steps = len(train_dataloader) * args.num_train_epochs // accum
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(total_steps * 0.1),
                                                num_training_steps=total_steps)

    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_examples))
    logger.info(" Batch size = %d", args.train_batch_size * accum)
    logger.info(" Num epoch = %d", args.num_train_epochs)

    model.train()
    patience, best_score, losses, dev_dataset = 0, 0, [], {}
    # Fix: defined up front so it exists even when no evaluation ever runs.
    res_list = []
    valid_path = args.dev_filename + "/Code_Completion/" + subdir + "/valid.jsonl"

    for epoch in range(args.num_train_epochs):
        for batch in train_dataloader:
            source_ids, target_ids = tuple(t.to(device) for t in batch)
            loss, _, _ = model(source_ids=source_ids, target_ids=target_ids)

            if args.n_gpu > 1:
                loss = loss.mean()  # DataParallel returns one loss per GPU
            if accum > 1:
                loss = loss / accum

            losses.append(loss.item())
            loss.backward()
            if len(losses) % accum == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                if len(losses) // accum % 100 == 0:
                    logger.info("epoch {} step {} loss {}".format(epoch,
                                len(losses)//accum,
                                round(np.mean(losses[-100*accum:]),4)))

        if not args.do_eval:
            continue

        # ---- dev perplexity ----
        if 'dev_loss' in dev_dataset:
            eval_examples, eval_data = dev_dataset['dev_loss']
        else:
            eval_examples = read_examples(valid_path)
            eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
            all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
            all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_source_ids, all_target_ids)
            dev_dataset['dev_loss'] = eval_examples, eval_data
        eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data),
                                     batch_size=args.eval_batch_size)
        # Reset every epoch: the file written after training holds only the
        # predictions from the most recent evaluation.
        res_list = []
        logger.info("\n***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)

        model.eval()
        eval_loss, tokens_num = 0, 0
        for batch in eval_dataloader:
            source_ids, target_ids = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                _, loss, num = model(source_ids=source_ids, target_ids=target_ids)
            eval_loss += loss.sum().item()
            tokens_num += num.sum().item()
        model.train()

        # Fix: an empty dev set used to raise ZeroDivisionError here.
        eval_loss = eval_loss / tokens_num if tokens_num else float("nan")
        result = {'eval_ppl': round(np.exp(eval_loss),5)}
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
        logger.info(" "+"*"*20)

        # ---- dev generation: exact match and edit similarity ----
        if 'dev_bleu' in dev_dataset:
            eval_examples, eval_data = dev_dataset['dev_bleu']
        else:
            eval_examples = read_examples(valid_path)
            eval_data = _source_only_dataset(eval_examples, tokenizer, args)
            dev_dataset['dev_bleu'] = eval_examples, eval_data
        eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data),
                                     batch_size=args.eval_batch_size)
        p = _generate_predictions(model, eval_dataloader, tokenizer, device)

        EM = 0.0
        edit_sim = 0.0
        total = len(p)
        for ref, gold in zip(p, eval_examples):
            pred = ref.strip()
            gt = gold.target
            edit_sim += fuzz.ratio(pred, gt)
            # Whitespace-insensitive exact match.
            if pred.split() == gt.split():
                EM += 1
            res_list.append([pred, gt])
        # Fix: avoid ZeroDivisionError when nothing was generated.
        denom = total or 1
        dev_acc = round(EM/denom*100, 2)

        logger.info(" %s = %s "%("Epoch",str(epoch)))
        logger.info(" %s = %s "%("EM Acc",str(dev_acc)))
        logger.info(" %s = %s "%("Edit Distance",str(round(edit_sim/denom, 2))))
        logger.info(" "+"*"*20)

        if dev_acc > best_score:
            best_score = dev_acc
            os.makedirs(output_dir, exist_ok=True)
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
            patience = 0
        else:
            patience += 1
        logger.info(" Best score:%s",best_score)
        logger.info(" "+"*"*20)
        # Early stopping after three evaluations in a row without improvement.
        if patience == 3:
            break

    if args.do_eval:
        # Fix: the directory only existed if a checkpoint had been saved.
        os.makedirs(output_dir, exist_ok=True)
        _write_results(os.path.join(output_dir, "last_training_result.jsonl"), res_list)


def _run_test(args, model, tokenizer, device):
    """Generate predictions for the test split, log metrics and write them out."""
    subdir = _task_subdir(args.task)

    if args.load_model_path is not None:
        model_to_load = model.module if hasattr(model, 'module') else model
        _load_checkpoint(model_to_load, args.load_model_path, args.task)

    test_path = os.path.join(args.test_filename, 'Code_Completion/' + subdir + '/test.jsonl')
    eval_examples = read_examples(test_path)
    eval_data = _source_only_dataset(eval_examples, tokenizer, args)
    eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data),
                                 batch_size=args.eval_batch_size)

    p = _generate_predictions(model, eval_dataloader, tokenizer, device, show_progress=True)

    res_list = []
    avg_acc = 0.0
    avg_EM = 0.0
    total = 0
    for ref, gold in zip(p, eval_examples):
        pred = ref.strip()
        gt = gold.target.strip()
        # NOTE: strict string equality here, whereas the dev-time metric
        # compares whitespace-split tokens.
        if pred == gt:
            avg_EM += 1
        avg_acc += fuzz.ratio(pred, gt)
        res_list.append([pred, gt])
        total += 1
    # Fix: avoid ZeroDivisionError on an empty test set.
    denom = total or 1
    dev_acc = round(avg_acc/denom, 2)
    dev_em = round(avg_EM/denom, 4)

    logger.info(" %s = %s "%("Test Token Avg Edit Distance",str(dev_acc)))
    logger.info(" %s = %s "%("Test Token Avg Exact Match Rate",str(dev_em)))
    logger.info(" "+"*"*20)

    # --test_org writes next to the other outputs; otherwise results go into
    # the task-specific sub-directory.
    output_dir = args.output_dir if args.test_org else os.path.join(args.output_dir, subdir)
    # Fix: a test-only run may not have created the sub-directory yet.
    os.makedirs(output_dir, exist_ok=True)
    _write_results(os.path.join(output_dir, "test_result.jsonl"), res_list)


def main():
    """Command-line entry point: parse arguments, build the model, then train and/or test.

    The encoder is a RobertaModel loaded from ``--model_name_or_path`` with
    ``config.is_decoder = True``; the same module is passed to ``Seq2Seq`` as
    both encoder and decoder. ``--do_train`` runs training (with per-epoch
    evaluation when ``--do_eval`` is set) and ``--do_test`` runs test-set
    generation; both may be given in one invocation.
    """
    args = _build_arg_parser().parse_args()

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',level=logging.INFO )

    # Fix: --no_cuda was parsed but never honoured.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device
    logger.info("device: %s, n_gpu: %s",device, args.n_gpu)

    set_seed(args.seed)

    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    config = RobertaConfig.from_pretrained(args.model_name_or_path)

    config.is_decoder = True
    encoder = RobertaModel.from_pretrained(args.model_name_or_path,config=config)

    model = Seq2Seq(encoder=encoder,decoder=encoder,config=config,
                    beam_size=args.beam_size,max_length=args.max_target_length,
                    sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0],eos_id=tokenizer.sep_token_id)

    logger.info("Training/evaluation parameters %s", args)

    if args.load_model_path is not None:
        _load_checkpoint(model, args.load_model_path, args.task)

    model.to(args.device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        _run_training(args, model, tokenizer, device)

    if args.do_test:
        _run_test(args, model, tokenizer, device)
|
|
|
|
|
|
|
|
|
# Run the command-line entry point only when executed as a script, not on import.
if "__main__" == __name__:
    main()
|
|
|
|
|
|
|
|