ComBack_Models / Script /Exp_Script /ForkFlow /calculate_forkflow.py

unknown

add files

df55b07 about 1 year ago

21.7 kB

	import os
	# from tree_sitter import Language, Parser
	# # import pandas as pd
	# import openpyxl
	import json
	import time
	import csv
	import pathlib
	import difflib
	import re
	from bleu import _bleu
	from fuzzywuzzy import fuzz
	import random
	import numpy as np
	from transformers import RobertaTokenizer
	#tokens = nltk.word_tokenize(sentence)

	# Absolute directory holding this script; every data location is derived from it.
	folder = str(pathlib.Path(__file__).parent.resolve())
	# Dataset root, three levels above the script; the ISA-type CSV is read from here.
	isa_type_dir = "/".join([folder, "..", "..", "..", "Dataset"])
	# Per-compiler JSONL inputs are read from the dataset's Code_Generation folder.
	src_dir = "/".join([isa_type_dir, "Code_Generation"])
	# Output directory for result.csv and the temporary test.gold / test.output files.
	dst_dir = "/".join([folder, "Result"])

	# Module-level split lists, initialised empty (three distinct list objects).
	train_lis, valid_lis, test_lis = [], [], []

	# Maps "<compiler type> <ISA type>" (e.g. "GCC CPU") to the list of target
	# names read from the ISA-type CSV, excluding the held-out targets.
	target_clf = {}


	def get_target_clf_list(csv_path=None):
	    """Populate the module-level ``target_clf`` mapping from the ISA-type CSV.

	    Each usable row is read as ``[compiler_type, target, isa_type, ...]`` and
	    the target is appended to ``target_clf["<compiler_type> <isa_type>"]``.
	    Rows whose target is ``arc``, ``riscv`` or ``nvptx`` (compared
	    case-insensitively) are skipped.

	    Args:
	        csv_path: Optional path of the CSV to read. Defaults to
	            ``comback_isa_type.csv`` inside ``isa_type_dir``.

	    Returns:
	        None. ``target_clf`` is updated in place; entries accumulate across
	        calls.
	    """
	    if csv_path is None:
	        csv_path = isa_type_dir + "/comback_isa_type.csv"
	    held_out = ("arc", "riscv", "nvptx")
	    # newline="" lets the csv module do its own newline handling.
	    with open(csv_path, "r", encoding="utf-8", newline="") as f:
	        for row in csv.reader(f):
	            # Blank lines come back as [] and truncated rows lack a column;
	            # indexing them would raise IndexError, so skip them.
	            if len(row) < 3:
	                continue
	            if row[1].lower() in held_out:
	                continue
	            target_clf.setdefault(row[0] + " " + row[2], []).append(row[1])


	def _render_statements(tokens, target_name):
	    """Concatenate ``tokens`` into text with one statement per line.

	    ``target_name`` is removed from every token. A line break is inserted
	    after each token that is exactly ``;``, ``:``, ``{`` or ``}``.

	    Returns:
	        A ``(text, statement_count)`` tuple, where ``statement_count`` is the
	        number of terminator tokens seen.
	    """
	    terminators = (";", ":", "{", "}")
	    pieces = []
	    statements = 0
	    for token in tokens:
	        pieces.append(token.replace(target_name, ""))
	        if token in terminators:
	            pieces.append("\n")
	            statements += 1
	    return "".join(pieces), statements


	def Calculate_Statements_Ratio(Src_List, Fork_Lis, src_name, fork_name):
	    """Return the share of source statements found unchanged in the fork.

	    Both token lists are rendered as statement-per-line text (with their
	    target names stripped) and compared with ``difflib.Differ``. The result is
	    the number of non-blank lines the diff reports as unchanged, divided by
	    the number of statement terminators in ``Src_List``.

	    Args:
	        Src_List: Token list of the reference function.
	        Fork_Lis: Token list of the forked function (may be empty).
	        src_name: Target name removed from each source token.
	        fork_name: Target name removed from each fork token.

	    Returns:
	        The ratio rounded to two decimals; ``0.0`` when the source contains
	        no statement terminator. Tokens after the last terminator form an
	        extra line that is compared but not counted in the denominator.
	    """
	    src_code, cnt_stmt = _render_statements(Src_List, src_name)
	    if cnt_stmt == 0:
	        # Nothing to normalise by; avoids a ZeroDivisionError.
	        return 0.0
	    # The fork is rendered from its first token. Reusing the source loop's
	    # index here would only look at Fork_Lis[len(Src_List):].
	    Fork_code, _ = _render_statements(Fork_Lis, fork_name)

	    diff_code = difflib.Differ().compare(src_code.splitlines(), Fork_code.splitlines())
	    # Differ prefixes unchanged lines with a space; '-', '+' and '?' lines are
	    # removals, additions and intraline hints and do not count as unchanged.
	    code_same = sum(1 for dv in diff_code if dv.startswith(" ") and dv.strip())
	    return round(float(code_same) / cnt_stmt, 2)



	def _read_jsonl(path):
	    """Return the JSON objects stored one per line in ``path`` (blank lines skipped)."""
	    with open(path, 'r') as handle:
	        return [json.loads(line) for line in handle if line.strip()]


	def _mean(total, count):
	    """Return ``total / count`` as a float, or 0.0 when ``count`` is zero."""
	    return float(total) / count if count else 0.0


	def _score_fork_target(comp_type, tar, held_out, test_funcs):
	    """Score existing target ``tar`` as a fork source for the held-out target.

	    For every ``(func, tokens)`` pair in ``test_funcs`` the same-named
	    function of ``tar`` (after stripping target names) is used as the
	    prediction; a missing function yields an empty prediction.

	    Returns:
	        ``(bleu_sum, ed_sum, stmt_sum, exact_matches)``: summed ``_bleu``
	        scores, summed ``fuzz.ratio`` scores, summed statement ratios, and a
	        list with one exact-match boolean per test function.
	    """
	    fork_funcs = {}
	    for dic in _read_jsonl(src_dir + "/" + comp_type + "/" + tar + ".jsonl"):
	        fork_funcs[dic["Func"].replace(tar, "")] = dic["ground_truth"]

	    gold_path = dst_dir + "/test.gold"
	    output_path = dst_dir + "/test.output"
	    bleu_sum = 0.0
	    ed_sum = 0.0
	    stmt_sum = 0.0
	    exact_matches = []
	    for func, src_tokens in test_funcs:
	        fork_tokens = fork_funcs.get(func, [])
	        src_code = " ".join(src_tokens).replace(held_out, "")
	        fork_code = " ".join(fork_tokens).replace(tar, "")
	        stmt_sum += Calculate_Statements_Ratio(src_tokens, fork_tokens, held_out, tar)

	        # _bleu takes file paths, so the pair is written out (and the files
	        # closed) before it is called.
	        with open(output_path, 'w') as f_out, open(gold_path, 'w') as f_gold:
	            f_out.write(fork_code + '\n')
	            f_gold.write(src_code + '\n')

	        exact_matches.append(fork_code == src_code)
	        ed_sum += fuzz.ratio(fork_code, src_code)
	        # An empty prediction contributes 0 to BLEU without calling _bleu.
	        if fork_code.strip() != "":
	            bleu_sum += _bleu(gold_path, output_path)
	    return bleu_sum, ed_sum, stmt_sum, exact_matches


	def Calculate_Forkflow():
	    """Run the Fork-Flow experiment and record per-target scores.

	    For each compiler type (GCC, LLVM) and ISA type (GPU, MPU, CPU) the
	    held-out target's functions are loaded from its ``.jsonl`` file under
	    ``src_dir`` and compared against every target listed in ``target_clf``
	    for that group (``RI5CY`` is skipped for LLVM CPU). One row per target is
	    appended to ``result.csv`` in ``dst_dir`` with the mean ``_bleu`` score,
	    exact-match percentage, mean ``fuzz.ratio`` and statement-ratio
	    percentage; the group's average and maximum edit-distance and BLEU
	    figures are printed.

	    Raises:
	        KeyError: If ``target_clf`` has no entry for a compiler/ISA group.
	    """
	    get_target_clf_list()
	    print("############## Exp 1: Calculate Fork-Flow ################\n")

	    # Held-out target per compiler and ISA type; the name doubles as the
	    # .jsonl file stem and as the substring stripped from its code.
	    held_out_targets = {
	        "GCC": {"CPU": "riscv", "GPU": "nvptx", "MPU": "arc"},
	        "LLVM": {"CPU": "RISCV", "GPU": "NVPTX", "MPU": "ARC"},
	    }
	    # Fork candidates that are not evaluated for a given group.
	    excluded_forks = {("LLVM", "CPU"): ("RI5CY",)}

	    for comp_type in ["GCC", "LLVM"]:
	        for isa_type in ["GPU", "MPU", "CPU"]:
	            held_out = held_out_targets[comp_type][isa_type]
	            target_lis = target_clf[comp_type + " " + isa_type]
	            # Kept as an ordered list of pairs so that functions sharing a
	            # name after stripping the target are all evaluated.
	            test_funcs = [
	                (dic["Func"].replace(held_out, ""), dic["ground_truth"])
	                for dic in _read_jsonl(src_dir + "/" + comp_type + "/" + held_out + ".jsonl")
	            ]

	            max_ed = 0
	            max_bleu4 = 0
	            avg_ed = 0.0
	            avg_bleu4 = 0.0
	            avg_cnt = 0
	            for tar in target_lis:
	                if tar in excluded_forks.get((comp_type, isa_type), ()):
	                    continue
	                bleu4, edit_dis, stmt_mod, EM = _score_fork_target(
	                    comp_type, tar, held_out, test_funcs
	                )
	                cnt = len(EM)
	                avg_ed += edit_dis
	                avg_bleu4 += bleu4
	                avg_cnt += cnt

	                mean_bleu4 = round(_mean(bleu4, cnt), 2)
	                mean_ed = round(_mean(edit_dis, cnt), 2)
	                # np.mean of an empty list is NaN, so guard the empty case.
	                exact_pct = round(np.mean(EM) * 100, 2) if cnt else 0.0
	                stmt_pct = round(_mean(stmt_mod * 100, cnt), 2)
	                with open(dst_dir + '/result.csv', 'a', newline='') as file:
	                    writer = csv.writer(file)
	                    writer.writerow([
	                        comp_type,
	                        isa_type,
	                        tar,
	                        str(mean_bleu4),
	                        str(exact_pct),
	                        str(mean_ed),
	                        str(stmt_pct),
	                    ])
	                max_bleu4 = max(max_bleu4, mean_bleu4)
	                max_ed = max(max_ed, mean_ed)

	            print(comp_type + " " + isa_type)
	            print("Avg ED: " + str(round(_mean(avg_ed, avg_cnt), 2)))
	            print("Max ED: " + str(max_ed))
	            print("Avg BLEU4: " + str(round(_mean(avg_bleu4, avg_cnt), 2)))
	            print("Max BLEU4: " + str(max_bleu4))
	            print("\n\n")





	if __name__ == "__main__":
	    # The result directory may not exist on a fresh checkout; create it so
	    # opening result.csv (and the temporary BLEU files) does not fail.
	    os.makedirs(dst_dir, exist_ok=True)
	    # Start a fresh result file containing only the header row.
	    # NOTE(review): "Edit Didtance" looks like a typo for "Edit Distance"; it is
	    # kept unchanged in case other tooling keys on this column name.
	    with open(dst_dir + '/result.csv', 'w', newline='') as file:
	        writer = csv.writer(file)
	        writer.writerow(["Compiler Type", "ISA Type", "Target", "BLEU4", "Exact Match", "Edit Didtance", "Stmt_Ratio"])
	    # Run after the header file is closed: Calculate_Forkflow reopens
	    # result.csv in append mode for every row it writes.
	    Calculate_Forkflow()