# Source: MultiPLE-Evaluator / src/generic_eval.py
# This is a helper script for evaluating benchmarks that have been translated to
# different languages.
#
# To use this script, call the language-specific script eval_<lang>.py (for example, eval_php.py).
# The --directory argument is required, and tells the script where the benchmarks are located.
# The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
#
# The script will print the result for each benchmark, and also write the results to results/<lang>.csv,
# where <lang> is the lowercased language name.
# When the script completes, it will print a summary.
#
# Examples
#
# To run the entire benchmark suite:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
#
# To run benchmarks 1, 2, and 3:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3
import argparse
from sys import exit as sysexit
from pathlib import Path
import sys
def list_files(directory, ext):
    """Return the benchmark files in *directory*, indexed by benchmark number.

    Files are matched with the glob ``HumanEval_*{ext}``. Base filenames are
    assumed to have the form ``HumanEval_X_*`` where ``X`` is a valid integer.

    Args:
        directory: A ``pathlib.Path`` to search (non-recursively).
        ext: Filename suffix to match, e.g. ``".php"``.

    Returns:
        A list whose index ``k`` holds the path of benchmark number ``k``, or
        ``None`` when that benchmark is missing. The list is empty when no
        file matches, so callers can detect a directory of the wrong language.

    Raises:
        ValueError: If a matched filename's second ``_``-separated field is
            not an integer.
    """
    def key(path):
        # "HumanEval_12_foo.ext" -> 12
        return int(path.name.split("_")[1])

    files_sorted = sorted(directory.glob(f"HumanEval_*{ext}"), key=key)

    # Without this guard, files_sorted[-1] below raises IndexError on an empty
    # match, which hides the callers' friendlier "no files of this type" message.
    if not files_sorted:
        return []

    # assumption: there may be missing files, but no extra files,
    # so we build files_array where the index corresponds to the file's number,
    # and a missing file is represented by None
    files_array = [None] * (key(files_sorted[-1]) + 1)
    for f in files_sorted:
        files_array[key(f)] = f
    return files_array
def main(eval_script, language, extension):
    """Evaluate benchmark files and write one CSV row per evaluated file.

    Command-line arguments are read with argparse: ``--directory`` (required)
    names the benchmark directory and ``--files`` (optional) restricts the run
    to the given benchmark numbers. Each result is printed and also written to
    ``../results/<language lowercased>.csv`` relative to this file. A summary
    line is printed at the end.

    Args:
        eval_script: Callable invoked with each benchmark's path; its return
            value is indexed with ``'status'``.
        language: Language label written in the first CSV column; its
            lowercase form names the results file.
        extension: Filename suffix of the benchmark files, e.g. ``".php"``.

    Exits with status 1 when the directory contains no matching files.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--directory", type=str, required=True, help="Directory to read benchmarks from"
    )
    parser.add_argument(
        "--files",
        type=int,
        nargs="*",
        default=[],
        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
    )
    args = parser.parse_args()

    directory = Path(args.directory).resolve()
    files_sorted = list_files(directory, extension)

    # the directory you specified does not contain the right language
    if len(files_sorted) == 0:
        print(f'The specified directory does not contain files of type {extension}')
        sysexit(1)

    # An empty --files list means "evaluate everything".
    files_index = args.files if args.files else range(len(files_sorted))

    total = 0
    passed = 0
    syntax_error = 0

    results_file = Path(Path(__file__).parent, "..", "results", language.lower() + ".csv").resolve()
    # Make sure the output directory exists so open(..., "w") cannot fail on a
    # fresh checkout.
    results_file.parent.mkdir(parents=True, exist_ok=True)

    with open(results_file, "w") as f:
        for i in files_index:
            # A number past the last benchmark would raise IndexError and a
            # negative one would silently wrap around; treat both as missing.
            filepath = files_sorted[i] if 0 <= i < len(files_sorted) else None
            if filepath is None:
                print("File {} does not exist!".format(i))
                continue
            res = eval_script(filepath)
            output = f"{language},{filepath.stem},{res['status']}\n"
            f.write(output)
            print(output, end="")
            total += 1
            if res['status'] == "OK":
                passed += 1
            elif res['status'] == "SyntaxError":
                syntax_error += 1

    print (f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")
def main_check_stubs(check_script, language, extension):
    """Check benchmark files and write one CSV row per checked file.

    Command-line arguments are read with argparse: ``--directory`` (required)
    names the benchmark directory and ``--files`` (optional) restricts the run
    to the given benchmark numbers. Each result is printed and also written to
    ``../check_results/<language lowercased>.csv`` relative to this file. A
    summary line is printed at the end.

    Args:
        check_script: Callable invoked with each benchmark's path; its return
            value is indexed with ``'status'``.
        language: Language label written in the first CSV column; its
            lowercase form names the results file.
        extension: Filename suffix of the benchmark files, e.g. ``".php"``.

    Exits with status 1 when the directory contains no matching files, or when
    at least one checked file has a status other than ``"OK"``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--directory", type=str, required=True, help="Directory to read benchmarks from"
    )
    parser.add_argument(
        "--files",
        type=int,
        nargs="*",
        default=[],
        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
    )
    args = parser.parse_args()

    directory = Path(args.directory).resolve()
    files_sorted = list_files(directory, extension)

    # the directory you specified does not contain the right language
    if len(files_sorted) == 0:
        print(f'The specified directory does not contain files of type {extension}')
        sysexit(1)

    # An empty --files list means "check everything".
    files_index = args.files if args.files else range(len(files_sorted))

    total = 0
    passed = 0

    results_file = Path(Path(__file__).parent, "..", "check_results", language.lower() + ".csv").resolve()
    # Make sure the output directory exists so open(..., "w") cannot fail on a
    # fresh checkout.
    results_file.parent.mkdir(parents=True, exist_ok=True)

    with open(results_file, "w") as f:
        for i in files_index:
            # A number past the last benchmark would raise IndexError and a
            # negative one would silently wrap around; treat both as missing.
            filepath = files_sorted[i] if 0 <= i < len(files_sorted) else None
            if filepath is None:
                print("File {} does not exist!".format(i))
                continue
            res = check_script(filepath)
            output = f"{language},{filepath.stem},{res['status']}\n"
            f.write(output)
            print(output, end="")
            total += 1
            if res['status'] == "OK":
                passed += 1

    print (f"Total {total}, Passed {passed}")

    # Signal failure to the calling process when any stub did not check out.
    if total != passed:
        sys.exit(1)