ComBack_Models / Script /Exp_Script /Code-LLaMA /calculate_codellama_completion.py

unknown

add files

74cea47 about 1 year ago

13.7 kB

	import os
	# from tree_sitter import Language, Parser
	# # import pandas as pd
	# import openpyxl
	import json
	import time
	import csv
	import pathlib
	import difflib
	import re
	from bleu import _bleu
	from fuzzywuzzy import fuzz
	import random
	import numpy as np
	from transformers import RobertaTokenizer
	#tokens = nltk.word_tokenize(sentence)
	import argparse

	# Command-line interface: a single mandatory --task option that selects which
	# Code_Completion sub-directory of the dataset is evaluated.
	parser = argparse.ArgumentParser(description='Test')
	parser.add_argument(
	    "--task",
	    type=str,
	    required=True,
	    default=None,
	    help="Task Type: statement_level, next_statement",
	)
	args = parser.parse_args()

	# All locations are derived from the directory that contains this script.
	folder = str(pathlib.Path(__file__).parent.resolve())
	dst_dir = folder  # result.csv and the Input/ directory are looked up here
	isa_type_dir = f"{folder}/../../../Dataset"  # holds comback_isa_type.csv
	src_dir = f"{folder}/../../../Dataset/Code_Completion/{args.task}"  # ground-truth .jsonl files

	# Module-level placeholders; not referenced by the code visible in this file.
	train_lis, valid_lis, test_lis = [], [], []

	# Maps "<column 0> <column 2>" of comback_isa_type.csv to the list of
	# column-1 values found under that key (filled by get_target_clf_list).
	target_clf = {}


	def get_target_clf_list():
	    """Populate the module-level ``target_clf`` from ``comback_isa_type.csv``.

	    The file is read from ``isa_type_dir``.  Rows whose second column,
	    compared case-insensitively, is ``arc``, ``riscv`` or ``nvptx`` are left
	    out.  Every other row is grouped under the key
	    ``"<column 0> <column 2>"`` and its second column is appended to that
	    key's list.  (Presumably the columns are compiler, target name and ISA
	    class; confirm against the CSV.)

	    Rows with fewer than three columns, such as blank lines, are skipped
	    instead of raising ``IndexError``.

	    Returns:
	        None.  Entries are added to ``target_clf`` in place, so calling the
	        function more than once appends the same values again.
	    """
	    excluded_targets = {"arc", "riscv", "nvptx"}
	    csv_path = isa_type_dir + "/comback_isa_type.csv"
	    # newline="" lets the csv module do its own line-ending handling.
	    with open(csv_path, "r", encoding="utf-8", newline="") as f:
	        for row in csv.reader(f):
	            if len(row) < 3:
	                continue
	            if row[1].lower() in excluded_targets:
	                continue
	            target_clf.setdefault(row[0] + " " + row[2], []).append(row[1])




	# Evaluated targets as (compiler, target) pairs, in reporting order: for each
	# compiler the GPU target comes first, then the MPU target, then the CPU one.
	_COMPLETION_TARGETS = (
	    ("GCC", "nvptx"),
	    ("GCC", "arc"),
	    ("GCC", "riscv"),
	    ("LLVM", "NVPTX"),
	    ("LLVM", "ARC"),
	    ("LLVM", "RISCV"),
	)


	def _load_predictions(path):
	    """Read the cleaned model-output CSV into ``(gcc, llvm)`` lookup tables.

	    Keys are ``"<column 1> <column 2>"`` and values are column 3.  Rows whose
	    first column is ``"GCC"`` go to the first table, every other row to the
	    second.  Rows with fewer than four columns are skipped.
	    """
	    gcc_code = {}
	    llvm_code = {}
	    with open(path, encoding="utf-8") as f:
	        for row in csv.reader(f):
	            if len(row) < 4:
	                continue
	            table = gcc_code if row[0] == "GCC" else llvm_code
	            table[row[1] + " " + str(row[2])] = row[3]
	    return gcc_code, llvm_code


	def _load_ground_truth(path):
	    """Return the space-joined ``ground_truth`` tokens of every JSONL line."""
	    with open(path, 'r') as f:
	        return [" ".join(json.loads(line)["ground_truth"]) for line in f]


	def _score_target(comp_type, target, truths, predictions, writer):
	    """Score one target, write its rows via ``writer``, return its averages.

	    The return value is ``[exact_match_percent, mean_fuzz_ratio]``, both
	    already rounded to two decimals and converted to strings.
	    """
	    total_em = 0.0
	    total_ed = 0.0
	    for idx, truth in enumerate(truths):
	        key = target + " " + str(idx)
	        if key not in predictions:
	            # No model output for this sample: report the key.  The sample
	            # still counts in the denominator of the averages below.
	            print(key)
	            continue
	        # Both metrics ignore spaces.
	        predicted = predictions[key].replace(" ", "")
	        expected = truth.replace(" ", "")
	        # Kept as int 1 / float 0.0 so the CSV cells stay "100" / "0.0".
	        em = 1 if predicted == expected else 0.0
	        edit_dis = fuzz.ratio(predicted, expected)
	        total_em += em
	        total_ed += edit_dis
	        writer.writerow([
	            comp_type,
	            target,
	            str(idx),
	            str(round(em * 100, 2)),
	            str(round(float(edit_dis), 2)),
	        ])

	    count = len(truths)
	    # An empty ground-truth file yields 0.0 averages instead of a
	    # ZeroDivisionError.
	    mean_em = total_em / count if count else 0.0
	    mean_ed = total_ed / count if count else 0.0
	    averages = [str(round(mean_em * 100, 2)), str(round(float(mean_ed), 2))]
	    writer.writerow([comp_type, target, "average"] + averages)
	    return averages


	def Calculate_Completion():
	    """Score the Code-LLaMA completions against the ground truth.

	    Reads the cleaned model output from ``dst_dir/Input`` (the ``next``
	    file when ``args.task`` is ``"next_statement"``, the ``stmt`` file
	    otherwise) and the ground truth from
	    ``src_dir/<compiler>/<target>.jsonl`` for each pair in
	    ``_COMPLETION_TARGETS``.  For every sample that has a model output, one
	    row with its exact-match value and ``fuzz.ratio`` score is appended to
	    ``dst_dir/result.csv``, followed by one ``average`` row per target.
	    Keys of samples without a model output are printed.

	    Returns:
	        dict mapping ``"<compiler> <target>"`` to
	        ``[exact_match_percent, mean_fuzz_ratio]`` (strings), in the order
	        the targets were processed.
	    """
	    get_target_clf_list()
	    print("############## Exp 2: Calculate Code-LLaMA Stmt Completion ################\n")

	    if args.task == "next_statement":
	        dst_file = dst_dir+"/Input/codellama_next_output_cleaned.csv"
	    else:
	        dst_file = dst_dir+"/Input/codellama_stmt_output_cleaned.csv"

	    codellama_gcc_code, codellama_llvm_code = _load_predictions(dst_file)

	    avg_accuracy = {}
	    for comp_type, target in _COMPLETION_TARGETS:
	        predictions = codellama_gcc_code if comp_type == "GCC" else codellama_llvm_code
	        truths = _load_ground_truth(src_dir + "/" + comp_type + "/" + target + ".jsonl")
	        # Open the result file once per target instead of once per row.
	        with open(dst_dir + '/result.csv', 'a', newline='') as file:
	            writer = csv.writer(file)
	            avg_accuracy[comp_type + " " + target] = _score_target(
	                comp_type, target, truths, predictions, writer
	            )

	    return avg_accuracy




	if __name__ == "__main__":
	    # Start result.csv afresh with the header row; Calculate_Completion then
	    # appends the per-sample and average rows to the same file.
	    with open(dst_dir + '/result.csv', 'w', newline='') as file:
	        writer = csv.writer(file)
	        # Header label corrected from the misspelled "Edit Didtance".
	        writer.writerow(["Compiler Type", "Target", "Idx", "Exact Match", "Edit Distance"])

	    avg_dic = Calculate_Completion()

	    # Print the averaged metrics of every "<compiler> <target>" pair.
	    for k in avg_dic:
	        print("########################")

	        print(k)
	        print(" ".join(["Exact Match", "Edit Distance"]))
	        print(" ".join(avg_dic[k]))