Spaces:
Running
Running
File size: 4,799 Bytes
41e79e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# This is a helper script for evaluating benchmarks that have been translated to
# different languages.
#
# To use this script, call the language-specific wrapper eval_<lang>.py (e.g. eval_php.py).
# The --directory argument is required, and tells the script where the benchmarks are located.
# The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
#
# The script prints the result for each benchmark and also writes it to results/<lang>.csv
# (check_results/<lang>.csv when checking stubs).
# When the script completes, it will print a summary.
#
# Examples
#
# To run the entire benchmark suite:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
#
# To run benchmarks 1, 2, and 3:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3
import argparse
from sys import exit as sysexit
from pathlib import Path
import sys
def list_files(directory, ext):
    """Return the benchmark files in ``directory``, indexed by benchmark number.

    Files are matched with the glob ``HumanEval_*{ext}``. The benchmark number
    is the integer between the first and second underscore of the file name
    (``HumanEval_X_*``).

    Args:
        directory: A ``pathlib.Path`` to search (non-recursively).
        ext: File-name suffix to match, e.g. ``".php"``.

    Returns:
        A list whose index ``k`` holds the path of benchmark ``k``, or ``None``
        when no file with that number was found. The list is empty when no
        file matches at all.

    Raises:
        ValueError: If a matching file name does not carry an integer after
            the first underscore.
    """
    def key(path):
        # assumption: base filenames are in the format of HumanEval_X_*
        # where X is a valid number
        return int(path.name.split("_")[1])

    files_sorted = sorted(directory.glob(f"HumanEval_*{ext}"), key=key)

    # Nothing matched: return an empty list so callers can report the problem
    # themselves instead of hitting an IndexError on files_sorted[-1].
    if not files_sorted:
        return []

    # assumption: there may be missing files, but no extra files,
    # so the array is sized by the highest benchmark number and gaps stay None.
    files_array = [None] * (key(files_sorted[-1]) + 1)
    for f in files_sorted:
        files_array[key(f)] = f
    return files_array
def main(eval_script, language, extension):
    """Evaluate the benchmarks of one language and record the results.

    Parses ``--directory`` (required) and ``--files`` (optional benchmark
    numbers) from the command line, runs ``eval_script`` on each selected
    benchmark file, and writes one ``language,file_stem,status`` line per
    benchmark both to stdout and to ``../results/<language lower-cased>.csv``
    relative to this file's directory. A summary line is printed at the end.

    Args:
        eval_script: Callable invoked with the benchmark's ``Path``; its result
            is indexed with ``'status'``.
        language: Label written in the first CSV column; its lower-cased form
            names the results file.
        extension: File-name suffix used to find the benchmark files.

    Exits with status 1 when the directory holds no matching benchmark files.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--directory", type=str, required=True, help="Directory to read benchmarks from"
    )
    parser.add_argument(
        "--files",
        type=int,
        nargs="*",
        default=[],
        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
    )
    args = parser.parse_args()

    directory = Path(args.directory).resolve()
    files_sorted = list_files(directory, extension)

    # the directory you specified does not contain the right language
    if len(files_sorted) == 0:
        print(f'The specified directory does not contain files of type {extension}')
        sysexit(1)

    # Without --files, every benchmark number up to the highest one is visited.
    files_index = args.files if len(args.files) > 0 else range(len(files_sorted))

    total = 0
    passed = 0
    syntax_error = 0

    results_file = Path(Path(__file__).parent, "..", "results", language.lower() + ".csv").resolve()
    with open(results_file, "w") as f:
        for i in files_index:
            # A number outside the known range is reported as a missing file
            # rather than raising IndexError (too large) or silently picking a
            # benchmark counted from the end of the list (negative).
            filepath = files_sorted[i] if 0 <= i < len(files_sorted) else None
            if filepath is None:
                print("File {} does not exist!".format(i))
                continue

            res = eval_script(filepath)
            output = f"{language},{filepath.stem},{res['status']}\n"
            f.write(output)
            print(output, end="")

            total += 1
            if res['status'] == "OK":
                passed += 1
            elif res['status'] == "SyntaxError":
                syntax_error += 1

    print (f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")
def main_check_stubs(check_script, language, extension):
    """Check the benchmark stubs of one language and record the results.

    Parses ``--directory`` (required) and ``--files`` (optional benchmark
    numbers) from the command line, runs ``check_script`` on each selected
    benchmark file, and writes one ``language,file_stem,status`` line per
    benchmark both to stdout and to
    ``../check_results/<language lower-cased>.csv`` relative to this file's
    directory. A summary line is printed at the end.

    Args:
        check_script: Callable invoked with the benchmark's ``Path``; its
            result is indexed with ``'status'``.
        language: Label written in the first CSV column; its lower-cased form
            names the results file.
        extension: File-name suffix used to find the benchmark files.

    Exits with status 1 when the directory holds no matching benchmark files,
    or when at least one checked benchmark did not report status ``"OK"``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--directory", type=str, required=True, help="Directory to read benchmarks from"
    )
    parser.add_argument(
        "--files",
        type=int,
        nargs="*",
        default=[],
        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
    )
    args = parser.parse_args()

    directory = Path(args.directory).resolve()
    files_sorted = list_files(directory, extension)

    # the directory you specified does not contain the right language
    if len(files_sorted) == 0:
        print(f'The specified directory does not contain files of type {extension}')
        sysexit(1)

    # Without --files, every benchmark number up to the highest one is visited.
    files_index = args.files if len(args.files) > 0 else range(len(files_sorted))

    total = 0
    passed = 0

    results_file = Path(Path(__file__).parent, "..", "check_results", language.lower() + ".csv").resolve()
    with open(results_file, "w") as f:
        for i in files_index:
            # A number outside the known range is reported as a missing file
            # rather than raising IndexError (too large) or silently picking a
            # benchmark counted from the end of the list (negative).
            filepath = files_sorted[i] if 0 <= i < len(files_sorted) else None
            if filepath is None:
                print("File {} does not exist!".format(i))
                continue

            res = check_script(filepath)
            output = f"{language},{filepath.stem},{res['status']}\n"
            f.write(output)
            print(output, end="")

            total += 1
            if res['status'] == "OK":
                passed += 1

    print (f"Total {total}, Passed {passed}")

    # A non-zero exit status signals that at least one stub failed its check.
    if total != passed:
        sys.exit(1)
|