red1bluelost's picture
Adds initial evaluation of just runtime completion tests.
6bd9122
raw
history blame
3.69 kB
import contextlib
import multiprocessing
import os
import subprocess
import tempfile
def check_correctness(candidate, reference, task_id, completion_id):
    """
    Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    Each variant of the completion (``base``, ``sfinae`` and ``concepts``) is
    combined with ``reference["tests"]`` and handed to ``process_case``, which
    runs ``unsafe_execute_cpp`` on it in a separate worker process.

    :param candidate: mapping holding the ``"base"``, ``"sfinae"`` and
        ``"concepts"`` source strings.
    :param reference: mapping holding the ``"tests"`` source string.
    :param task_id: identifier of the task, copied into the returned dict.
    :param completion_id: an optional completion ID so we can match
        the results later even if execution finishes asynchronously.
    :return: dict with ``task_id``, ``completion_id`` and, for every variant,
        ``<variant>_run_passed`` (bool) and ``<variant>_run_result`` (the
        status string reported by the worker).
    """
    outcome = dict(task_id=task_id, completion_id=completion_id)
    # A Manager owns a helper server process. Using it as a context manager
    # guarantees that process is shut down instead of leaking one per call.
    with multiprocessing.Manager() as manager:
        for variant in ("base", "sfinae", "concepts"):
            run_result = manager.list()
            process_case(
                unsafe_execute_cpp,
                candidate[variant],
                reference["tests"],
                run_result,
            )
            # Copy the status out of the proxy now: the proxy becomes unusable
            # once the manager has been shut down.
            status = run_result[0]
            outcome[f"{variant}_run_passed"] = status == "passed"
            outcome[f"{variant}_run_result"] = status
    return outcome
def process_case(target, candidate, reference, result, timeout=60):
    """
    Runs ``target`` in a separate process and guarantees that ``result`` holds
    at least one entry afterwards.

    :param target: callable executed in the child process as
        ``target(candidate, reference, result, timeout)``.
    :param candidate: passed through to ``target`` unchanged.
    :param reference: passed through to ``target`` unchanged.
    :param result: list-like object the worker is expected to append its
        outcome to; ``"timed out"`` is appended here when it is still empty
        after the worker has finished or been killed.
    :param timeout: number of seconds handed to ``target``; the parent waits
        five seconds longer than that before killing the worker.
    """
    worker = multiprocessing.Process(
        target=target,
        args=(candidate, reference, result, timeout),
    )
    worker.start()
    # Give the worker a small grace period on top of its own timeout.
    worker.join(timeout=timeout + 5)
    if worker.is_alive():
        worker.kill()
        # Reap the killed worker so it does not linger as a zombie process.
        worker.join()
    if not result:
        result.append("timed out")
def _decode_output(completed):
    """Returns stderr of a finished process, or its stdout if stderr is empty."""
    raw = completed.stderr if completed.stderr else completed.stdout
    try:
        return raw.decode()
    except UnicodeDecodeError:
        # Keep the raw bytes rather than lose the diagnostic entirely.
        return raw


def unsafe_execute_cpp(candidate, reference, result, timeout):
    """
    Compiles ``candidate + reference`` as C++20 inside a temporary directory,
    runs the produced binary and appends a status string to ``result``.

    The generated program is executed directly via ``subprocess``; nothing in
    this function sandboxes it, hence the ``unsafe`` in the name.

    :param candidate: C++ source text placed first in the translation unit.
    :param reference: C++ source text appended after ``candidate``.
    :param result: list-like object receiving exactly one of ``"passed"``,
        ``"timed out"`` or a string starting with ``"failed: "``.
    :param timeout: seconds allowed for the compilation and, separately, for
        the execution of the binary.
    """
    # The compiler path comes from the environment. Check it up front so a
    # missing setting is reported clearly instead of crashing the worker.
    cpp_compiler = os.getenv("GENERICIFY_CLANG")
    if not cpp_compiler:
        result.append("failed: GENERICIFY_CLANG environment variable is not set")
        return

    with create_tempdir():
        code = "#include <bits/stdc++.h>\n" + candidate + reference
        with open("test.cpp", "w") as source_file:
            source_file.write(code)

        try:
            compilation_result = subprocess.run(
                [cpp_compiler, "-std=c++20", "test.cpp"],
                timeout=timeout,
                capture_output=True,
            )
        except subprocess.TimeoutExpired:
            result.append("timed out")
            return
        except OSError as exc:
            # E.g. the configured compiler path does not exist.
            result.append(f"failed: could not run compiler: {exc}")
            return

        if compilation_result.returncode != 0:
            err = _decode_output(compilation_result)
            result.append(f"failed: compilation error: {err}")
            return

        try:
            # With no -o flag the compiler writes its output to ./a.out.
            exec_result = subprocess.run(
                ["./a.out"], timeout=timeout, capture_output=True
            )
        except subprocess.TimeoutExpired:
            result.append("timed out")
            return

        if exec_result.returncode == 0:
            result.append("passed")
        else:
            err = _decode_output(exec_result)
            result.append(f"failed: {err}")
@contextlib.contextmanager
def create_tempdir():
    """
    Context manager that creates a temporary directory, makes it the current
    working directory for the duration of the block and yields its path.
    """
    with tempfile.TemporaryDirectory() as scratch, chdir(scratch):
        yield scratch
@contextlib.contextmanager
def chdir(root):
    """
    Context manager that switches the working directory to ``root`` and
    switches back to the previous one on exit, whether or not the block raised.

    :param root: directory to change into; ``"."`` means "stay where we are"
        and skips the directory change altogether.
    """
    if root != ".":
        previous = os.getcwd()
        os.chdir(root)
        try:
            yield
        finally:
            # Always return to the directory we started from.
            os.chdir(previous)
    else:
        yield