Spaces:
Sleeping
Sleeping
Kewen Zhao
commited on
Commit
·
af3f724
1
Parent(s):
47feec4
stdio format
Browse files- code_eval_stdio.py +3 -4
- execute.py +45 -35
code_eval_stdio.py
CHANGED
@@ -152,7 +152,7 @@ class CodeEval(evaluate.Metric):
|
|
152 |
license=_LICENSE,
|
153 |
)
|
154 |
|
155 |
-
def _compute(self,
|
156 |
"""Returns the scores"""
|
157 |
|
158 |
if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
|
@@ -167,10 +167,9 @@ class CodeEval(evaluate.Metric):
|
|
167 |
n_samples = 0
|
168 |
results = defaultdict(list)
|
169 |
|
170 |
-
for task_id, (candidates,
|
171 |
for candidate in candidates:
|
172 |
-
|
173 |
-
args = (test_program, timeout, task_id, completion_id[task_id])
|
174 |
future = executor.submit(check_correctness, *args)
|
175 |
futures.append(future)
|
176 |
completion_id[task_id] += 1
|
|
|
152 |
license=_LICENSE,
|
153 |
)
|
154 |
|
155 |
+
def _compute(self, program, test_input, test_output, k=[1, 10, 100], num_workers=4, timeout=3.0):
|
156 |
"""Returns the scores"""
|
157 |
|
158 |
if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
|
|
|
167 |
n_samples = 0
|
168 |
results = defaultdict(list)
|
169 |
|
170 |
+
for task_id, (candidates, inputs, outputs) in enumerate(zip(program, test_input, test_output)):
|
171 |
for candidate in candidates:
|
172 |
+
args = (candidate, inputs, outputs, timeout, task_id, completion_id[task_id])
|
|
|
173 |
future = executor.submit(check_correctness, *args)
|
174 |
futures.append(future)
|
175 |
completion_id[task_id] += 1
|
execute.py
CHANGED
@@ -25,24 +25,27 @@ import signal
|
|
25 |
import tempfile
|
26 |
|
27 |
|
28 |
-
def check_correctness(
|
29 |
"""
|
30 |
Evaluates the functional correctness of a completion by running the test
|
31 |
suite provided in the problem.
|
32 |
|
33 |
-
:param
|
34 |
-
|
|
|
|
|
|
|
|
|
35 |
"""
|
36 |
manager = multiprocessing.Manager()
|
37 |
result = manager.list()
|
38 |
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
if p.is_alive():
|
43 |
-
p.kill()
|
44 |
|
45 |
-
if
|
|
|
46 |
result.append("timed out")
|
47 |
|
48 |
return dict(
|
@@ -53,8 +56,16 @@ def check_correctness(check_program, timeout, task_id, completion_id):
|
|
53 |
)
|
54 |
|
55 |
|
56 |
-
def unsafe_execute(
|
|
|
|
|
57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
with create_tempdir():
|
59 |
|
60 |
# These system calls are needed when cleaning up tempdir.
|
@@ -71,10 +82,24 @@ def unsafe_execute(check_program, result, timeout):
|
|
71 |
# Run program.
|
72 |
try:
|
73 |
exec_globals = {}
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
with time_limit(timeout):
|
76 |
-
exec(
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
except TimeoutException:
|
79 |
result.append("timed out")
|
80 |
except BaseException as e:
|
@@ -100,11 +125,13 @@ def time_limit(seconds):
|
|
100 |
|
101 |
|
102 |
@contextlib.contextmanager
|
103 |
-
def swallow_io():
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
|
|
|
|
108 |
yield
|
109 |
|
110 |
|
@@ -119,23 +146,6 @@ class TimeoutException(Exception):
|
|
119 |
pass
|
120 |
|
121 |
|
122 |
-
class WriteOnlyStringIO(io.StringIO):
|
123 |
-
"""StringIO that throws an exception when it's read from"""
|
124 |
-
|
125 |
-
def read(self, *args, **kwargs):
|
126 |
-
raise OSError
|
127 |
-
|
128 |
-
def readline(self, *args, **kwargs):
|
129 |
-
raise OSError
|
130 |
-
|
131 |
-
def readlines(self, *args, **kwargs):
|
132 |
-
raise OSError
|
133 |
-
|
134 |
-
def readable(self, *args, **kwargs):
|
135 |
-
"""Returns True if the IO object can be read."""
|
136 |
-
return False
|
137 |
-
|
138 |
-
|
139 |
class redirect_stdin(contextlib._RedirectStream): # type: ignore
|
140 |
_stream = "stdin"
|
141 |
|
|
|
25 |
import tempfile
|
26 |
|
27 |
|
28 |
+
def check_correctness(program, test_input, test_output, timeout, task_id, completion_id):
|
29 |
"""
|
30 |
Evaluates the functional correctness of a completion by running the test
|
31 |
suite provided in the problem.
|
32 |
|
33 |
+
:param program: The program string to evaluate.
|
34 |
+
:param test_input: The input string to provide to the program via STDIN.
|
35 |
+
:param test_output: The expected output string from the program via STDOUT.
|
36 |
+
:param timeout: Maximum execution time in seconds.
|
37 |
+
:param task_id: ID of the task being evaluated.
|
38 |
+
:param completion_id: Completion ID to match results later.
|
39 |
"""
|
40 |
manager = multiprocessing.Manager()
|
41 |
result = manager.list()
|
42 |
|
43 |
+
process = multiprocessing.Process(target=unsafe_execute, args=(program, test_input, test_output, result, timeout))
|
44 |
+
process.start()
|
45 |
+
process.join(timeout=timeout + 1)
|
|
|
|
|
46 |
|
47 |
+
if process.is_alive():
|
48 |
+
process.kill()
|
49 |
result.append("timed out")
|
50 |
|
51 |
return dict(
|
|
|
56 |
)
|
57 |
|
58 |
|
59 |
+
def unsafe_execute(program, test_input, test_output, result, timeout):
|
60 |
+
"""
|
61 |
+
Executes the program with redirected STDIN and compares its STDOUT to the expected output.
|
62 |
|
63 |
+
:param program: The program string to execute.
|
64 |
+
:param test_input: Input to provide to the program via STDIN.
|
65 |
+
:param test_output: Expected output to compare with STDOUT.
|
66 |
+
:param result: A multiprocessing.Manager().list() to store the execution result.
|
67 |
+
:param timeout: Maximum execution time in seconds.
|
68 |
+
"""
|
69 |
with create_tempdir():
|
70 |
|
71 |
# These system calls are needed when cleaning up tempdir.
|
|
|
82 |
# Run program.
|
83 |
try:
|
84 |
exec_globals = {}
|
85 |
+
actual_output = None
|
86 |
+
|
87 |
+
# Redirect I/O and execute
|
88 |
+
input_stream = io.StringIO(test_input)
|
89 |
+
output_stream = io.StringIO()
|
90 |
+
|
91 |
+
with swallow_io(input_stream, output_stream):
|
92 |
with time_limit(timeout):
|
93 |
+
exec(program, exec_globals)
|
94 |
+
|
95 |
+
actual_output = output_stream.getvalue().strip()
|
96 |
+
expected_output = test_output.strip()
|
97 |
+
|
98 |
+
if actual_output == expected_output:
|
99 |
+
result.append("passed")
|
100 |
+
else:
|
101 |
+
result.append(f"failed: got '{actual_output}', expected '{expected_output}'")
|
102 |
+
|
103 |
except TimeoutException:
|
104 |
result.append("timed out")
|
105 |
except BaseException as e:
|
|
|
125 |
|
126 |
|
127 |
@contextlib.contextmanager
|
128 |
+
def swallow_io(input_stream, output_stream):
|
129 |
+
"""
|
130 |
+
Redirects STDIN, STDOUT, and STDERR for isolated execution.
|
131 |
+
"""
|
132 |
+
with contextlib.redirect_stdout(output_stream):
|
133 |
+
with contextlib.redirect_stderr(output_stream):
|
134 |
+
with redirect_stdin(input_stream):
|
135 |
yield
|
136 |
|
137 |
|
|
|
146 |
pass
|
147 |
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
class redirect_stdin(contextlib._RedirectStream): # type: ignore
|
150 |
_stream = "stdin"
|
151 |
|