"""Purpose of this file: Sanitize the code produced by LLMs for the following reasons.

1. Vicuna-generated code can be off by a single whitespace; we fix the whitespace so
   that Vicuna is not unfairly penalized for otherwise-correct code.

2. (Our fault.) We later found additional EOF tokens in the generations; we truncate
   the messy code that follows them.
"""

import os

from tqdm import tqdm

from evalplus.data import (
    get_human_eval_plus,
    get_mbpp_plus,
    load_solutions,
    write_directory,
    write_jsonl,
)
from evalplus.sanitize import sanitize


def remove_unindented_lines(code, protect_before, exceptions, trim_tails):
    """Remove stray top-level (unindented) lines from generated code.

    The first line starting with ``protect_before`` is kept even though it is
    unindented; blank lines and lines starting with any prefix in ``exceptions``
    are also kept. Every other unindented line is dropped, and once a line starts
    with a prefix in ``trim_tails``, that line and everything after it is dropped.
    """
    lines = code.splitlines()
    cut_idx = []
    cut_enabled = False
    for i, line in enumerate(lines):
        if not cut_enabled and line.startswith(protect_before):
            cut_enabled = True
            continue
        if line.strip() == "":
            continue
        if any(line.startswith(e) for e in exceptions):
            continue

        lspace = len(line) - len(line.lstrip())
        if lspace == 0:
            cut_idx.append(i)

        if any(line.rstrip().startswith(t) for t in trim_tails):
            # cut off everything from this line onwards
            cut_idx.extend(list(range(i, len(lines))))
            break

    return "\n".join([line for i, line in enumerate(lines) if i not in cut_idx])


def to_four_space_indents(old_code):
    """Pad lines indented with exactly three spaces out to four spaces."""
    new_code = ""
    for line in old_code.splitlines():
        lspace = len(line) - len(line.lstrip())
        if lspace == 3:
            new_code += " "
        new_code += line + "\n"
    return new_code
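
# Illustrative behavior (hypothetical snippet):
#   to_four_space_indents("def f():\n   return 1\n") -> "def f():\n    return 1\n"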


if __name__ == "__main__":
    import argparse
    import pathlib

    parser = argparse.ArgumentParser()
    parser.add_argument("--samples", type=str, required=True)
    parser.add_argument("--eofs", nargs="+", type=str, default=[])
    parser.add_argument("--inplace", action="store_true")
    parser.add_argument(
        "--rm-prefix-lines", type=str, help="Remove lines starting with this prefix"
    )
    parser.add_argument(
        "--dataset", required=True, type=str, choices=["humaneval", "mbpp"]
    )
    parser.add_argument(
        "--debug-task", type=str, help="Only sanitize the task with this task ID."
    )
    args = parser.parse_args()

    # Map each task_id in the chosen dataset to its entry-point function name.
    entry_point = {}

    if args.dataset == "humaneval":
        dataset = get_human_eval_plus()
    elif args.dataset == "mbpp":
        dataset = get_mbpp_plus()

    for task_id, problem in dataset.items():
        entry_point[task_id] = problem["entry_point"]

    # Unless --inplace is given, write to a sibling path with a "-sanitized" suffix.
    is_folder = os.path.isdir(args.samples)
    target_path = pathlib.Path(args.samples)
    if not args.inplace:
        if is_folder:
            new_name = target_path.name + "-sanitized"
        else:
            new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl")
        target_path = target_path.parent / new_name
    target_path = str(target_path)
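    # Illustrative output paths (hypothetical names):
    #   samples.jsonl        -> samples-sanitized.jsonl
    #   samples/ (directory) -> samples-sanitized/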

    nsan = 0
    ntotal = 0

    new_solutions = []
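
    # Each record yielded by load_solutions() is expected to carry a "task_id",
    # a "_identifier" used for logging, and either a full "solution" or a
    # prompt-less "completion".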
    for solution in tqdm(load_solutions(args.samples)):
        task_id = solution["task_id"]
        dbg_identifier = solution["_identifier"]
        if args.debug_task is not None and task_id != args.debug_task:
            continue

        ntotal += 1
        if "solution" in solution:
            old_code = solution["solution"]
        else:
            assert "completion" in solution
            old_code = dataset[task_id]["prompt"] + "\n" + solution["completion"]

        old_code = old_code.strip()

        new_code = sanitize(
            old_code=old_code,
            entry_point=entry_point[task_id],
            rm_prefix_lines=args.rm_prefix_lines,
            eofs=args.eofs,
        ).strip()

        # Report only the samples that were actually changed.
        if new_code != old_code:
            msg = "Sanitized: " + dbg_identifier
            if is_folder:
                msg += " -> " + dbg_identifier.replace(args.samples, target_path)
            print(msg)
            nsan += 1

        new_solutions.append({"task_id": task_id, "solution": new_code})

    # Write the sanitized solutions in the same layout as the input:
    # a directory of files or a single JSONL file.
    if is_folder:
        write_directory(target_path, new_solutions)
    else:
        write_jsonl(target_path, new_solutions)

    print(f"Sanitized {nsan} out of {ntotal} files.")
|