red1bluelost committed on
Commit
2ed1c06
1 Parent(s): 9fdbbcb

Updates for only selecting a single prompt type.

Browse files
Files changed (2) hide show
  1. evaluate_genericify_cpp.py +25 -5
  2. execute.py +78 -65
evaluate_genericify_cpp.py CHANGED
@@ -3,6 +3,8 @@
3
  import collections
4
  import os
5
 
 
 
6
  import concurrent.futures
7
  import datasets
8
  import evaluate
@@ -119,7 +121,14 @@ class EvaluateGenericifyCpp(evaluate.Metric):
119
  reference_urls=["http://path.to.reference.url/new_module"],
120
  )
121
 
122
- def _compute(self, *, predictions, references, k=[1, 10, 100]):
 
 
 
 
 
 
 
123
  """Returns the scores"""
124
  num_workers = 4
125
 
@@ -148,6 +157,7 @@ class EvaluateGenericifyCpp(evaluate.Metric):
148
  args = (
149
  candidate,
150
  reference,
 
151
  task_id,
152
  completion_id[task_id],
153
  )
@@ -163,16 +173,26 @@ class EvaluateGenericifyCpp(evaluate.Metric):
163
 
164
  totals = collections.defaultdict(list)
165
  corrects = collections.defaultdict(list)
166
- for result in results.values():
167
- result.sort()
168
- for pt in [
169
  "base_run_passed",
170
  "base_run_compiled",
 
 
171
  "sfinae_run_passed",
172
  "sfinae_run_compiled",
 
 
 
173
  "concepts_run_passed",
174
  "concepts_run_compiled",
175
- ]:
 
 
 
 
 
176
  passed = [r[1][pt] for r in result]
177
  totals[pt].append(len(passed))
178
  corrects[pt].append(sum(passed))
 
3
  import collections
4
  import os
5
 
6
+ from typing import Literal
7
+
8
  import concurrent.futures
9
  import datasets
10
  import evaluate
 
121
  reference_urls=["http://path.to.reference.url/new_module"],
122
  )
123
 
124
+ def _compute(
125
+ self,
126
+ *,
127
+ predictions,
128
+ references,
129
+ cpp_type: Literal["base", "sfinae", "concepts"],
130
+ k=[1, 10, 100],
131
+ ):
132
  """Returns the scores"""
133
  num_workers = 4
134
 
 
157
  args = (
158
  candidate,
159
  reference,
160
+ cpp_type,
161
  task_id,
162
  completion_id[task_id],
163
  )
 
173
 
174
  totals = collections.defaultdict(list)
175
  corrects = collections.defaultdict(list)
176
+
177
+ keys = {
178
+ "base": [
179
  "base_run_passed",
180
  "base_run_compiled",
181
+ ],
182
+ "sfinae": [
183
  "sfinae_run_passed",
184
  "sfinae_run_compiled",
185
+ "sfinae_constrain_passed",
186
+ ],
187
+ "concepts": [
188
  "concepts_run_passed",
189
  "concepts_run_compiled",
190
+ "concepts_constrain_passed",
191
+ ],
192
+ }[cpp_type]
193
+ for result in results.values():
194
+ result.sort()
195
+ for pt in keys:
196
  passed = [r[1][pt] for r in result]
197
  totals[pt].append(len(passed))
198
  corrects[pt].append(sum(passed))
execute.py CHANGED
@@ -5,7 +5,7 @@ import subprocess
5
  import tempfile
6
 
7
 
8
- def check_correctness(candidate, reference, task_id, completion_id):
9
  """
10
  Evaluates the functional correctness of a completion by running the test
11
  suite provided in the problem.
@@ -15,75 +15,88 @@ def check_correctness(candidate, reference, task_id, completion_id):
15
  """
16
 
17
  manager = multiprocessing.Manager()
18
- base_run_result = manager.list()
19
- process_case(
20
- unsafe_execute_cpp,
21
- candidate["base"],
22
- reference["tests"],
23
- base_run_result,
24
- "c++17",
25
- )
26
- sfinae_run_result = manager.list()
27
- process_case(
28
- unsafe_execute_cpp,
29
- candidate["sfinae"],
30
- reference["tests"],
31
- sfinae_run_result,
32
- "c++17",
33
- )
34
- concepts_run_result = manager.list()
35
- process_case(
36
- unsafe_execute_cpp,
37
- candidate["concepts"],
38
- reference["tests"],
39
- concepts_run_result,
40
- "c++20",
41
- )
42
-
43
- sfinae_constrain_result = manager.list()
44
- process_case(
45
- invalid_compile_cpp,
46
- candidate["sfinae"],
47
- reference["invalids"],
48
- sfinae_constrain_result,
49
- "c++17",
50
- )
51
- concepts_constrain_result = manager.list()
52
- process_case(
53
- invalid_compile_cpp,
54
- candidate["concepts"],
55
- reference["invalids"],
56
- concepts_constrain_result,
57
- "c++20",
58
- )
59
 
60
- return dict(
61
  task_id=task_id,
62
  completion_id=completion_id,
63
- base_run_passed=base_run_result[0] == "passed",
64
- base_run_compiled=(
65
- base_run_result[0] == "passed"
66
- or base_run_result[0].startswith("failed: runtime error:")
67
- ),
68
- base_run_result=base_run_result[0],
69
- sfinae_run_passed=sfinae_run_result[0] == "passed",
70
- sfinae_run_compiled=(
71
- sfinae_run_result[0] == "passed"
72
- or sfinae_run_result[0].startswith("failed: runtime error:")
73
- ),
74
- sfinae_run_result=sfinae_run_result[0],
75
- concepts_run_passed=concepts_run_result[0] == "passed",
76
- concepts_run_compiled=(
77
- concepts_run_result[0] == "passed"
78
- or concepts_run_result[0].startswith("failed: runtime error:")
79
- ),
80
- concepts_run_result=concepts_run_result[0],
81
- sfinae_constrain_passed=sfinae_constrain_result[0] == "passed",
82
- sfinae_constrain_result=sfinae_constrain_result[0],
83
- concepts_constrain_passed=concepts_constrain_result[0] == "passed",
84
- concepts_constrain_result=concepts_constrain_result[0],
85
  )
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  def process_case(target, candidate, reference, result, cppstd):
89
  timeout = 60
 
5
  import tempfile
6
 
7
 
8
+ def check_correctness(candidate, reference, cpp_type, task_id, completion_id):
9
  """
10
  Evaluates the functional correctness of a completion by running the test
11
  suite provided in the problem.
 
15
  """
16
 
17
  manager = multiprocessing.Manager()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ result = dict(
20
  task_id=task_id,
21
  completion_id=completion_id,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  )
23
 
24
+ if cpp_type == "base":
25
+ base_run_result = manager.list()
26
+ process_case(
27
+ unsafe_execute_cpp,
28
+ candidate,
29
+ reference["tests"],
30
+ base_run_result,
31
+ "c++17",
32
+ )
33
+ result |= dict(
34
+ base_run_passed=base_run_result[0] == "passed",
35
+ base_run_compiled=(
36
+ base_run_result[0] == "passed"
37
+ or base_run_result[0].startswith("failed: runtime error:")
38
+ ),
39
+ base_run_result=base_run_result[0],
40
+ )
41
+ elif cpp_type == "sfinae":
42
+ sfinae_run_result = manager.list()
43
+ process_case(
44
+ unsafe_execute_cpp,
45
+ candidate,
46
+ reference["tests"],
47
+ sfinae_run_result,
48
+ "c++17",
49
+ )
50
+ sfinae_constrain_result = manager.list()
51
+ process_case(
52
+ invalid_compile_cpp,
53
+ candidate,
54
+ reference["invalids"],
55
+ sfinae_constrain_result,
56
+ "c++17",
57
+ )
58
+ result |= dict(
59
+ sfinae_run_passed=sfinae_run_result[0] == "passed",
60
+ sfinae_run_compiled=(
61
+ sfinae_run_result[0] == "passed"
62
+ or sfinae_run_result[0].startswith("failed: runtime error:")
63
+ ),
64
+ sfinae_run_result=sfinae_run_result[0],
65
+ sfinae_constrain_passed=sfinae_constrain_result[0] == "passed",
66
+ sfinae_constrain_result=sfinae_constrain_result[0],
67
+ )
68
+ elif cpp_type == "concepts":
69
+ concepts_run_result = manager.list()
70
+ process_case(
71
+ unsafe_execute_cpp,
72
+ candidate,
73
+ reference["tests"],
74
+ concepts_run_result,
75
+ "c++20",
76
+ )
77
+ concepts_constrain_result = manager.list()
78
+ process_case(
79
+ invalid_compile_cpp,
80
+ candidate,
81
+ reference["invalids"],
82
+ concepts_constrain_result,
83
+ "c++20",
84
+ )
85
+ result |= dict(
86
+ concepts_run_passed=concepts_run_result[0] == "passed",
87
+ concepts_run_compiled=(
88
+ concepts_run_result[0] == "passed"
89
+ or concepts_run_result[0].startswith("failed: runtime error:")
90
+ ),
91
+ concepts_run_result=concepts_run_result[0],
92
+ concepts_constrain_passed=concepts_constrain_result[0] == "passed",
93
+ concepts_constrain_result=concepts_constrain_result[0],
94
+ )
95
+ else:
96
+ raise ValueError(f"Unknown cpp_type: {cpp_type}")
97
+
98
+ return result
99
+
100
 
101
  def process_case(target, candidate, reference, result, cppstd):
102
  timeout = 60