Spaces:
Sleeping
Sleeping
Jialin Song
committed on
Commit
·
9d03b23
1
Parent(s):
cbe9336
update apps_metric to provide outputs
Browse files
- testing_util.py +53 -16
- utils.py +1 -1
testing_util.py
CHANGED
@@ -54,7 +54,8 @@ def run_test(sample, test=None, debug=False):
|
|
54 |
otherwise it'll just return an input and output pair.
|
55 |
"""
|
56 |
# Disable functionalities that can make destructive changes to the test.
|
57 |
-
|
|
|
58 |
|
59 |
if debug:
|
60 |
print(f"start = {datetime.now().time()}")
|
@@ -99,7 +100,7 @@ def run_test(sample, test=None, debug=False):
|
|
99 |
if debug:
|
100 |
print(f"type 0 compilation error = {e}")
|
101 |
results.append(-2)
|
102 |
-
return results
|
103 |
signal.alarm(0)
|
104 |
|
105 |
elif which_type == CODE_TYPE.standard_input:
|
@@ -156,6 +157,7 @@ def run_test(sample, test=None, debug=False):
|
|
156 |
results.append(-2)
|
157 |
return results
|
158 |
|
|
|
159 |
for index, inputs in enumerate(in_outs["inputs"]):
|
160 |
# JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
|
161 |
try:
|
@@ -200,6 +202,15 @@ def run_test(sample, test=None, debug=False):
|
|
200 |
|
201 |
# reset the alarm
|
202 |
signal.alarm(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
except Exception as e:
|
204 |
signal.alarm(0)
|
205 |
faulthandler.disable()
|
@@ -234,6 +245,10 @@ def run_test(sample, test=None, debug=False):
|
|
234 |
results.append(-1)
|
235 |
signal.alarm(0)
|
236 |
|
|
|
|
|
|
|
|
|
237 |
if not passed:
|
238 |
if debug:
|
239 |
nl = "\n"
|
@@ -246,7 +261,12 @@ def run_test(sample, test=None, debug=False):
|
|
246 |
if passed and debug:
|
247 |
print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
|
248 |
|
249 |
-
|
|
|
|
|
|
|
|
|
|
|
250 |
tmp_result = True
|
251 |
results.append(tmp_result)
|
252 |
continue
|
@@ -391,26 +411,42 @@ def run_test(sample, test=None, debug=False):
|
|
391 |
if not isinstance(inputs, list):
|
392 |
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
393 |
else:
|
394 |
-
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
395 |
-
|
396 |
|
397 |
-
return results
|
398 |
|
399 |
|
400 |
def custom_compare_(output, ground_truth):
|
|
|
|
|
|
|
|
|
|
|
401 |
|
402 |
if isinstance(output, list):
|
403 |
-
|
404 |
-
|
405 |
-
|
|
|
|
|
|
|
406 |
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
412 |
|
413 |
-
return False
|
414 |
|
415 |
def stripped_string_compare(s1, s2):
|
416 |
s1 = s1.lstrip().rstrip()
|
@@ -427,6 +463,7 @@ def call_method(method, inputs):
|
|
427 |
# sys.setrecursionlimit(10000)
|
428 |
|
429 |
# @patch('builtins.input', side_effect=inputs.split("\n"))
|
|
|
430 |
@patch('builtins.open', mock_open(read_data=inputs))
|
431 |
@patch('sys.stdin', StringIO(inputs))
|
432 |
@patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
|
@@ -522,4 +559,4 @@ def reliability_guard(maximum_memory_bytes=None):
|
|
522 |
sys.modules["joblib"] = None
|
523 |
sys.modules["resource"] = None
|
524 |
sys.modules["psutil"] = None
|
525 |
-
sys.modules["tkinter"] = None
|
|
|
54 |
otherwise it'll just return an input and output pair.
|
55 |
"""
|
56 |
# Disable functionalities that can make destructive changes to the test.
|
57 |
+
# TODO: disable for now as it interferes with GPT-4 generation through gateway
|
58 |
+
# reliability_guard()
|
59 |
|
60 |
if debug:
|
61 |
print(f"start = {datetime.now().time()}")
|
|
|
100 |
if debug:
|
101 |
print(f"type 0 compilation error = {e}")
|
102 |
results.append(-2)
|
103 |
+
return results, {}
|
104 |
signal.alarm(0)
|
105 |
|
106 |
elif which_type == CODE_TYPE.standard_input:
|
|
|
157 |
results.append(-2)
|
158 |
return results
|
159 |
|
160 |
+
program_outputs = {}
|
161 |
for index, inputs in enumerate(in_outs["inputs"]):
|
162 |
# JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
|
163 |
try:
|
|
|
202 |
|
203 |
# reset the alarm
|
204 |
signal.alarm(0)
|
205 |
+
|
206 |
+
program_outputs[index] = {
|
207 |
+
"pass": tmp_result,
|
208 |
+
"pass_pct": int(tmp_result),
|
209 |
+
"pass_res": [int(tmp_result)],
|
210 |
+
"output": output,
|
211 |
+
"input": inputs,
|
212 |
+
"ground_truth": in_outs["outputs"][index]
|
213 |
+
}
|
214 |
except Exception as e:
|
215 |
signal.alarm(0)
|
216 |
faulthandler.disable()
|
|
|
245 |
results.append(-1)
|
246 |
signal.alarm(0)
|
247 |
|
248 |
+
program_outputs[index] = {"output": output}
|
249 |
+
program_outputs[index]["ground_truth"] = in_outs['outputs'][index]
|
250 |
+
program_outputs[index]["input"] = in_outs['inputs'][index]
|
251 |
+
|
252 |
if not passed:
|
253 |
if debug:
|
254 |
nl = "\n"
|
|
|
261 |
if passed and debug:
|
262 |
print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
|
263 |
|
264 |
+
all_pass, pass_pct, pass_res = custom_compare_(output, in_outs['outputs'][index])
|
265 |
+
program_outputs[index]["pass"] = all_pass
|
266 |
+
program_outputs[index]["pass_pct"] = pass_pct
|
267 |
+
program_outputs[index]["pass_res"] = pass_res
|
268 |
+
|
269 |
+
if all_pass:
|
270 |
tmp_result = True
|
271 |
results.append(tmp_result)
|
272 |
continue
|
|
|
411 |
if not isinstance(inputs, list):
|
412 |
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
413 |
else:
|
414 |
+
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
|
|
415 |
|
416 |
+
return results, program_outputs
|
417 |
|
418 |
|
419 |
def custom_compare_(output, ground_truth):
|
420 |
+
|
421 |
+
# TODO: split ground_truth and compare one by one
|
422 |
+
ground_truth_list = ground_truth.strip().split("\n")
|
423 |
+
correct = 0
|
424 |
+
res = []
|
425 |
|
426 |
if isinstance(output, list):
|
427 |
+
for out, g_t in zip(output, ground_truth_list):
|
428 |
+
if out.strip() == g_t.strip():
|
429 |
+
correct += 1
|
430 |
+
res.append(1)
|
431 |
+
else:
|
432 |
+
res.append(0)
|
433 |
|
434 |
+
return correct == len(ground_truth_list), correct / len(ground_truth_list), res
|
435 |
+
|
436 |
+
return False, 0.0, []
|
437 |
+
|
438 |
+
# if isinstance(output, list):
|
439 |
+
# output_1 = "\n".join(output)
|
440 |
+
# if stripped_string_compare(output_1, ground_truth):
|
441 |
+
# return True
|
442 |
+
|
443 |
+
# if isinstance(output, list):
|
444 |
+
# output_2 = [o.lstrip().rstrip() for o in output]
|
445 |
+
# output_2 = "\n".join(output_2)
|
446 |
+
# if stripped_string_compare(output_2, ground_truth):
|
447 |
+
# return True
|
448 |
|
449 |
+
# return False
|
450 |
|
451 |
def stripped_string_compare(s1, s2):
|
452 |
s1 = s1.lstrip().rstrip()
|
|
|
463 |
# sys.setrecursionlimit(10000)
|
464 |
|
465 |
# @patch('builtins.input', side_effect=inputs.split("\n"))
|
466 |
+
@patch('builtins.input', lambda *args: next(inputs_line_iterator))
|
467 |
@patch('builtins.open', mock_open(read_data=inputs))
|
468 |
@patch('sys.stdin', StringIO(inputs))
|
469 |
@patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
|
|
|
559 |
sys.modules["joblib"] = None
|
560 |
sys.modules["resource"] = None
|
561 |
sys.modules["psutil"] = None
|
562 |
+
sys.modules["tkinter"] = None
|
utils.py
CHANGED
@@ -48,7 +48,7 @@ def evaluate_generations(generations: list, indices: list = [], level: str = "al
|
|
48 |
"""
|
49 |
|
50 |
# generations are code generations in the same order of the dataset
|
51 |
-
apps_eval = load_dataset(DATASET, split="
|
52 |
|
53 |
if indices is None:
|
54 |
indices = range(len(generations))
|
|
|
48 |
"""
|
49 |
|
50 |
# generations are code generations in the same order of the dataset
|
51 |
+
apps_eval = load_dataset(DATASET, level, split="train")
|
52 |
|
53 |
if indices is None:
|
54 |
indices = range(len(generations))
|