# Source: ComBack_Models/Script/Exp_Script/ForkFlow/calculate_forkflow.py
# Repository page residue: commit "add files" (df55b07), file size 21.7 kB.
import os
# from tree_sitter import Language, Parser
# # import pandas as pd
# import openpyxl
import json
import time
import csv
import pathlib
import difflib
import re
from bleu import _bleu
from fuzzywuzzy import fuzz
import random
import numpy as np
from transformers import RobertaTokenizer
#tokens = nltk.word_tokenize(sentence)
# Absolute directory of this script; every other path is built from it.
folder = str(pathlib.Path(__file__).parent.resolve())

# Dataset root (holds comback_isa_type.csv) and its code-generation subfolder.
isa_type_dir = f"{folder}/../../../Dataset"
src_dir = f"{isa_type_dir}/Code_Generation"

# Output directory for result.csv and the test.gold / test.output scratch files.
dst_dir = f"{folder}/Result"

# Module-level containers; three independent lists plus the mapping that
# get_target_clf_list() fills with "<compiler> <ISA type>" -> [target, ...].
train_lis, valid_lis, test_lis = [], [], []
target_clf = dict()
def get_target_clf_list():
    """Fill the global ``target_clf`` mapping from ``comback_isa_type.csv``.

    Each CSV row is read as ``(column 0, column 1, column 2)``.  Column 1 is
    appended to the list stored under the key ``"<column 0> <column 2>"``
    (callers look it up as ``"<compiler> <ISA type>"``).  Rows whose column 1
    is ``arc``, ``riscv`` or ``nvptx`` (case-insensitive) are left out.

    Rows with fewer than three columns (for example blank lines) are skipped
    instead of raising ``IndexError``.

    The mapping is extended in place, so calling this function more than once
    appends the same targets again.
    """
    global target_clf
    excluded_targets = ("arc", "riscv", "nvptx")
    # newline="" is the csv-module recommended way to open files for reading.
    with open(isa_type_dir + "/comback_isa_type.csv", "r", encoding="utf-8", newline="") as f:
        for row in csv.reader(f):
            if len(row) < 3:
                continue
            if row[1].lower() in excluded_targets:
                continue
            target_clf.setdefault(row[0] + " " + row[2], []).append(row[1])
def _statement_lines(tokens, target_name):
    """Concatenate *tokens* and break the text after each statement delimiter.

    Every occurrence of *target_name* is removed from each token before it is
    appended.  A newline is inserted after each token that is exactly one of
    ``;``, ``:``, ``{`` or ``}``.

    Returns:
        ``(lines, delimiter_count)`` where *lines* is the resulting text split
        into lines and *delimiter_count* is the number of delimiter tokens.
    """
    pieces = []
    delimiter_count = 0
    for token in tokens:
        pieces.append(token.replace(target_name, ""))
        if token in (";", ":", "{", "}"):
            pieces.append("\n")
            delimiter_count += 1
    return "".join(pieces).splitlines(), delimiter_count


def Calculate_Statements_Ratio(Src_List, Fork_Lis, src_name, fork_name):
    """Return the share of source statements that appear unchanged in the fork.

    Both token lists are turned into statement-per-line text (with their own
    target name stripped) and compared with ``difflib.Differ``.  The result is
    the number of non-blank lines the diff reports as common to both sides,
    divided by the number of statement delimiters in the source, rounded to
    two decimals.

    Args:
        Src_List: Tokens of the reference (source) function.
        Fork_Lis: Tokens of the candidate (fork) function; may be empty.
        src_name: Target name removed from every source token.
        fork_name: Target name removed from every fork token.

    Returns:
        The rounded ratio as a float, or ``0.0`` when the source contains no
        statement delimiter (including an empty ``Src_List``).
    """
    src_lines, cnt_stmt = _statement_lines(Src_List, src_name)
    if cnt_stmt == 0:
        # Nothing to divide by; previously this raised ZeroDivisionError.
        return 0.0

    # The fork is tokenised from its own first token.  The earlier
    # implementation reused the source loop's index and therefore skipped the
    # first len(Src_List) fork tokens.
    fork_lines, _ = _statement_lines(Fork_Lis, fork_name)

    # Differ prefixes lines present in both inputs with two spaces; removed
    # ("- "), added ("+ ") and hint ("? ") lines do not count as unchanged.
    # NOTE(review): a trailing source fragment without a delimiter is a line
    # but not a counted statement, so the ratio can exceed 1.0 in that case.
    code_same = sum(
        1
        for entry in difflib.Differ().compare(src_lines, fork_lines)
        if entry[0] == " " and entry.strip() != ""
    )
    return round(code_same / cnt_stmt, 2)
# Held-out target used as the reference for each (compiler, ISA type) pair.
# The name is both the .jsonl file stem and the string stripped from the code.
_FORKFLOW_HELD_OUT = {
    ("GCC", "CPU"): "riscv",
    ("GCC", "GPU"): "nvptx",
    ("GCC", "MPU"): "arc",
    ("LLVM", "CPU"): "RISCV",
    ("LLVM", "GPU"): "NVPTX",
    ("LLVM", "MPU"): "ARC",
}

# Fork candidates that are never scored for a given (compiler, ISA type) pair.
_FORKFLOW_SKIPPED = {("LLVM", "CPU"): ("RI5CY",)}


def _forkflow_read_jsonl(path):
    """Return the JSON objects stored one per line in *path* (file is closed)."""
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle]


def _forkflow_mean(total, count):
    """Return ``total / count`` as a float, or ``0.0`` when *count* is zero."""
    return float(total) / count if count else 0.0


def Calculate_Forkflow():
    """Score every existing target as a "fork" source for the held-out targets.

    For each compiler (GCC, LLVM) and ISA type (GPU, MPU, CPU) the functions of
    the held-out target (see ``_FORKFLOW_HELD_OUT``) are compared with the
    same-named functions of every target listed in ``target_clf`` for that
    pair.  Function names and code are compared with the respective target
    name stripped out.  A function missing from the fork target is compared
    against an empty string.

    Per fork target one row is appended to ``<dst_dir>/result.csv``:
    compiler, ISA type, target, mean BLEU-4, exact-match percentage, mean
    ``fuzz.ratio`` score and mean statement ratio (as a percentage).  The
    per-pair average and maximum of the ``fuzz.ratio`` and BLEU-4 figures are
    printed to stdout.

    ``<dst_dir>/test.gold`` and ``<dst_dir>/test.output`` are rewritten for
    every compared function because ``_bleu`` takes file paths.

    Raises:
        KeyError: if ``target_clf`` has no entry for a compiler/ISA pair.
        OSError: if a dataset file or the result directory is missing.
    """
    get_target_clf_list()
    print("############## Exp 1: Calculate Fork-Flow ################\n")
    gold_path = dst_dir + "/test.gold"
    output_path = dst_dir + "/test.output"

    for comp_type in ["GCC", "LLVM"]:
        for isa_type in ["GPU", "MPU", "CPU"]:
            max_ed = 0
            avg_ed = 0
            max_bleu4 = 0
            avg_bleu4 = 0
            avg_cnt = 0
            target_lis = target_clf[comp_type + " " + isa_type]
            held_out = _FORKFLOW_HELD_OUT[(comp_type, isa_type)]
            skipped = _FORKFLOW_SKIPPED.get((comp_type, isa_type), ())

            # (lookup name, token list) per held-out function, in file order.
            # The lookup name is the stripped function name up to its first
            # space, matching how fork functions were looked up before.
            test_funcs = [
                (dic["Func"].replace(held_out, "").split(" ")[0], dic["ground_truth"])
                for dic in _forkflow_read_jsonl(
                    src_dir + "/" + comp_type + "/" + held_out + ".jsonl"
                )
            ]

            for tar in target_lis:
                if tar in skipped:
                    continue
                edit_dis = 0.0
                EM = []
                bleu4 = 0.0
                stmt_mod = 0.0
                # Later duplicates of a function name overwrite earlier ones.
                fork_target_dic = {
                    dic["Func"].replace(tar, ""): dic["ground_truth"]
                    for dic in _forkflow_read_jsonl(
                        src_dir + "/" + comp_type + "/" + tar + ".jsonl"
                    )
                }

                for func, src_tokens in test_funcs:
                    src_code = " ".join(src_tokens).replace(held_out, "")
                    # A missing function yields no tokens and hence empty code.
                    fork_tokens = fork_target_dic.get(func, [])
                    fork_code = " ".join(fork_tokens).replace(tar, "")
                    stmt_mod += Calculate_Statements_Ratio(
                        src_tokens, fork_tokens, held_out, tar
                    )

                    # Files must be closed (flushed) before _bleu reads them.
                    with open(output_path, 'w') as f, open(gold_path, 'w') as f1:
                        f.write(fork_code + '\n')
                        f1.write(src_code + '\n')

                    EM.append(fork_code == src_code)
                    similarity = fuzz.ratio(fork_code, src_code)
                    edit_dis += similarity
                    avg_ed += similarity
                    avg_cnt += 1
                    # An empty fork contributes a BLEU-4 of zero.
                    if fork_code.strip() != "":
                        tmp_bleu4 = _bleu(gold_path, output_path)
                        bleu4 += tmp_bleu4
                        avg_bleu4 += tmp_bleu4

                cnt = len(EM)
                mean_bleu4 = round(_forkflow_mean(bleu4, cnt), 2)
                mean_ed = round(_forkflow_mean(edit_dis, cnt), 2)
                mean_stmt = round(_forkflow_mean(float(stmt_mod) * 100, cnt), 2)
                exact_match = round(np.mean(EM) * 100, 2) if EM else 0.0

                with open(dst_dir + '/result.csv', 'a', newline='') as file:
                    writer = csv.writer(file)
                    writer.writerow([
                        comp_type,
                        isa_type,
                        tar,
                        str(mean_bleu4),
                        str(exact_match),
                        str(mean_ed),
                        str(mean_stmt),
                    ])

                max_bleu4 = max(max_bleu4, mean_bleu4)
                max_ed = max(max_ed, mean_ed)

            print(comp_type + " " + isa_type)
            print("Avg ED: " + str(round(_forkflow_mean(avg_ed, avg_cnt), 2)))
            print("Max ED: " + str(max_ed))
            print("Avg BLEU4: " + str(round(_forkflow_mean(avg_bleu4, avg_cnt), 2)))
            print("Max BLEU4: " + str(max_bleu4))
            print("\n\n")
if __name__ == "__main__":
    # result.csv and the scratch files used during scoring all live in
    # dst_dir, so make sure it exists before anything is written.
    os.makedirs(dst_dir, exist_ok=True)
    # Start a fresh result file with the header row; Calculate_Forkflow()
    # appends one data row per evaluated target.
    with open(dst_dir + '/result.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Compiler Type", "ISA Type", "Target", "BLEU4", "Exact Match", "Edit Distance", "Stmt_Ratio"])
    Calculate_Forkflow()