Spaces:

opencompass
/

MultiPLE-Evaluator

Running

App Files Files Community

MultiPLE-Evaluator / src /generic_eval.py

dongsheng

Upload 48 files

41e79e2 verified 4 months ago

raw

history blame contribute delete

4.8 kB

	# This is a helper script for evaluating benchmarks that have been translated to
	# different languages.
	#
	# To use this script, call the language-specific wrapper eval_<lang>.py (for example eval_php.py, as in the examples below).
	# The --directory argument is required, and tells the script where the benchmarks are located.
	# The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
	#
	# The script will print the result for each benchmark, and also write it to results/<lang>.csv, where <lang> is the lower-cased language name.
	# When the script completes, it will print a summary.
	#
	# Examples
	#
	# To run the entire benchmark suite:
	# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
	#
	# To run benchmarks 1, 2, and 3:
	# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3

	import argparse
	from sys import exit as sysexit
	from pathlib import Path
	import sys

	def list_files(directory, ext):
	    """Return the benchmark files in *directory*, indexed by benchmark number.

	    Files are matched with the glob ``HumanEval_*{ext}``. Base filenames are
	    assumed to have the form ``HumanEval_X_*`` where ``X`` is a valid integer.

	    Args:
	        directory: A ``pathlib.Path`` (or any object with a compatible
	            ``glob`` method) to search.
	        ext: Filename suffix to match, e.g. ``".php"``.

	    Returns:
	        A list in which index ``X`` holds the path of benchmark number ``X``,
	        or ``None`` when that benchmark is missing. The list is empty when no
	        file matches.

	    Raises:
	        ValueError: If a matching filename has a non-numeric ``X`` part.
	    """
	    def key(s):
	        # "HumanEval_12_name.ext" -> 12
	        return int(s.name.split("_")[1])

	    files_sorted = sorted(directory.glob(f"HumanEval_*{ext}"), key=key)

	    # Nothing matched: return an empty list so callers can report it,
	    # instead of failing on files_sorted[-1] below.
	    if not files_sorted:
	        return []

	    # assumption: there may be missing files, but no extra files
	    # so we build files_array where the index corresponds to the file's number,
	    # and a missing file is represented by None
	    size = key(files_sorted[-1]) + 1
	    files_array = [None] * size
	    for f in files_sorted:
	        files_array[key(f)] = f

	    return files_array

	def main(eval_script, language, extension):
	    """Evaluate benchmark files and record one result line per file.

	    Reads ``--directory`` (required) and ``--files`` (optional benchmark
	    numbers) from the command line, runs ``eval_script`` on every selected
	    benchmark, writes ``language,stem,status`` lines to
	    ``../results/<language lower-cased>.csv`` relative to this script, echoes
	    each line to stdout and finally prints a summary.

	    Args:
	        eval_script: Callable that takes a benchmark path and returns a
	            mapping with a ``"status"`` entry.
	        language: Language name written to each row; its lower-cased form
	            names the results file.
	        extension: File extension passed to ``list_files`` to select the
	            benchmark files.
	    """
	    parser = argparse.ArgumentParser()

	    parser.add_argument(
	        "--directory", type=str, required=True, help="Directory to read benchmarks from"
	    )
	    parser.add_argument(
	        "--files",
	        type=int,
	        nargs="*",
	        default=[],
	        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
	    )
	    args = parser.parse_args()

	    directory = Path(args.directory).resolve()

	    files_sorted = list_files(directory, extension)

	    # the directory you specified does not contain the right language
	    if not files_sorted:
	        print(f'The specified directory does not contain files of type {extension}')
	        sysexit(1)

	    # Evaluate only the requested benchmarks, or all of them by default.
	    files_index = args.files if args.files else range(len(files_sorted))

	    total = 0
	    passed = 0
	    syntax_error = 0

	    results_file = Path(Path(__file__).parent, "..", "results", language.lower() + ".csv").resolve()

	    with open(results_file, "w") as f:
	        for i in files_index:
	            # A number outside the known range is reported as a missing
	            # benchmark rather than raising IndexError; a negative number
	            # would otherwise silently index from the end of the list.
	            filepath = files_sorted[i] if 0 <= i < len(files_sorted) else None
	            if filepath is None:
	                print("File {} does not exist!".format(i))
	                continue
	            res = eval_script(filepath)
	            status = res['status']
	            output = f"{language},{filepath.stem},{status}\n"
	            f.write(output)
	            print(output, end="")
	            total += 1
	            if status == "OK":
	                passed += 1
	            elif status == "SyntaxError":
	                syntax_error += 1
	    print(f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")



	def main_check_stubs(check_script, language, extension):
	    """Check benchmark files and exit non-zero if any check is not "OK".

	    Reads ``--directory`` (required) and ``--files`` (optional benchmark
	    numbers) from the command line, runs ``check_script`` on every selected
	    benchmark, writes ``language,stem,status`` lines to
	    ``../check_results/<language lower-cased>.csv`` relative to this script,
	    echoes each line to stdout and prints a summary. The process exits with
	    status 1 when at least one checked file did not report ``"OK"``.

	    Args:
	        check_script: Callable that takes a benchmark path and returns a
	            mapping with a ``"status"`` entry.
	        language: Language name written to each row; its lower-cased form
	            names the results file.
	        extension: File extension passed to ``list_files`` to select the
	            benchmark files.
	    """
	    parser = argparse.ArgumentParser()

	    parser.add_argument(
	        "--directory", type=str, required=True, help="Directory to read benchmarks from"
	    )
	    parser.add_argument(
	        "--files",
	        type=int,
	        nargs="*",
	        default=[],
	        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
	    )
	    args = parser.parse_args()

	    directory = Path(args.directory).resolve()

	    files_sorted = list_files(directory, extension)

	    # the directory you specified does not contain the right language
	    if not files_sorted:
	        print(f'The specified directory does not contain files of type {extension}')
	        sysexit(1)

	    # Check only the requested benchmarks, or all of them by default.
	    files_index = args.files if args.files else range(len(files_sorted))

	    total = 0
	    passed = 0

	    results_file = Path(Path(__file__).parent, "..", "check_results", language.lower() + ".csv").resolve()

	    with open(results_file, "w") as f:
	        for i in files_index:
	            # A number outside the known range is reported as a missing
	            # benchmark rather than raising IndexError; a negative number
	            # would otherwise silently index from the end of the list.
	            filepath = files_sorted[i] if 0 <= i < len(files_sorted) else None
	            if filepath is None:
	                print("File {} does not exist!".format(i))
	                continue
	            res = check_script(filepath)
	            status = res['status']
	            output = f"{language},{filepath.stem},{status}\n"
	            f.write(output)
	            print(output, end="")
	            total += 1
	            if status == "OK":
	                passed += 1
	    print(f"Total {total}, Passed {passed}")

	    # Signal failure to the calling process when any check did not pass.
	    if total != passed:
	        sys.exit(1)