gabeorlanski committed on
Commit
1359055
1 Parent(s): 9c8145f
Files changed (5)
  1. README.md +242 -5
  2. app.py +5 -0
  3. bc_eval.py +335 -0
  4. execution.py +145 -0
  5. requirements.txt +1 -0
README.md CHANGED
@@ -1,12 +1,249 @@
  ---
- title: Bc Eval
- emoji: 😻
- colorFrom: pink
+ title: BabelCode Eval
+ colorFrom: blue
  colorTo: red
  sdk: gradio
- sdk_version: 3.36.1
+ sdk_version: 3.19.1
  app_file: app.py
  pinned: false
+ tags:
+ - evaluate
+ - metric
+ description: >-
+   This metric implements the evaluation harness for datasets translated with the
+   BabelCode framework as described in the paper "Measuring The Impact Of
+   Programming Language Distribution" (https://arxiv.org/abs/2302.01973).
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Metric Card for bc_eval
+
+ ## Metric Description
+ This metric implements the evaluation harness for datasets translated with the BabelCode framework as described in the paper "Measuring The Impact Of Programming Language Distribution" (https://arxiv.org/abs/2302.01973).
+
+ ## How to Use
+ 1. Generate predictions for a BabelCode-supported dataset.
+ 2. Aggregate the predictions by their question, so that each question has a list of candidate solutions (see the sketch after the example below).
+ 3. For each question's aggregated predictions, add the `question_info` from the original BabelCode dataset.
+ 4. Run the metric on the `predictions`, `languages`, and `question_infos`.
+ 5. The metric returns a tuple: the first value is a dict of aggregate metrics and the second is the list of per-prediction results.
+
+ ```python
+ import evaluate
+ from datasets import load_dataset
+ import os
+ os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+
+ predictions = []
+ languages = []
+ question_infos = []
+ ds = load_dataset("gabeorlanski/bc-humaneval", split="test")
+
+ for row in ds:
+     languages.append(row['language'])
+     question_infos.append(row['question_info'])
+
+     # Replace this with however you generate and postprocess predictions.
+     predictions.append([model.generate(row['signature_with_docstring'])])
+
+
+ metric = evaluate.load("bc_eval")
+ metrics, results = metric.compute(
+     predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
+ )
+ ```
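+
+ If you sample more than one candidate per question (step 2 above), each entry of `predictions` should hold the full list of candidates for that question. A minimal sketch, assuming the same hypothetical `model.generate` helper as above and an illustrative `num_samples`:
+
+ ```python
+ num_samples = 5
+ predictions, languages, question_infos = [], [], []
+ for row in ds:
+     languages.append(row["language"])
+     question_infos.append(row["question_info"])
+     # Aggregate every candidate for this question into a single list.
+     predictions.append(
+         [model.generate(row["signature_with_docstring"]) for _ in range(num_samples)]
+     )
+ ```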
+
+ ### Inputs
+ * `predictions` (`List[List[str]]`): The list of predictions for each question to execute.
+ * `languages` (`List[str]`): The language to use for each question.
+ * `question_dicts` (`List[Dict]`): The information for each question.
+ * `k` (`List[int]`): The numbers of code candidates to consider in the evaluation (Default: `[1, 10, 100]`).
+ * `num_workers` (`int`): The number of workers used to evaluate the candidate programs (Default: 4).
+ * `language_timeout` (`Dict[str, int]`): Timeouts to use for each language, as sketched below. If it is not set, the timeouts default to the ones in the question dict (Default: None).
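+
+ For example, a call that overrides the defaults might look like the following sketch; the worker count and timeout values are illustrative, and the variables are the ones built in the example above.
+
+ ```python
+ metrics, results = metric.compute(
+     predictions=predictions,
+     languages=languages,
+     question_dicts=question_infos,
+     k=[1, 10],
+     num_workers=8,
+     language_timeout={"Python": 10, "C++": 30},
+ )
+ ```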
+
+ ### Output Values
+
+ The `bc_eval` metric outputs two things:
+
+ * `metrics`: a dictionary with the pass rates for each `k` value passed in the arguments and the mean percent of test cases passed per question. The keys are formatted as `{LANGUAGE NAME}/{METRIC NAME}`, e.g. `Python/pass@1`.
+
+ * `results`: a list of dictionaries with the results from each individual prediction.
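+
+ A minimal sketch of inspecting both outputs, assuming `metrics` and `results` come from a `metric.compute(...)` call like the ones above:
+
+ ```python
+ from collections import Counter
+
+ # Aggregate metrics, keyed as "{LANGUAGE NAME}/{METRIC NAME}".
+ for name, value in metrics.items():
+     print(f"{name}: {value:.3f}")
+
+ # Tally the per-prediction outcomes (e.g. PASSED, FAILED, TIMED_OUT, HAD_ERROR).
+ print(Counter(r["outcome"] for r in results))
+ ```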
+
+ #### Values from Popular Papers
+ [PaLM-2](https://arxiv.org/pdf/2305.10403.pdf) performance on BC-HumanEval (`pass@1` with greedy decoding):
+
+ | Language | PaLM 2-S* | PaLM 540B | PaLM-Coder-540B |
+ |------------|-----------|-----------|-----------------|
+ | C# | 24.22 | 20.5 | **26.09** |
+ | C++ | **34.16** | 21.74 | 24.22 |
+ | Go | 19.25 | 13.66 | **21.12** |
+ | Haskell | **8.7** | 1.86 | 1.86 |
+ | Java | **31.06** | 20.5 | 25.47 |
+ | JavaScript | **32.3** | 23.6 | 29.81 |
+ | Julia | **16.77** | 2.48 | 4.35 |
+ | Lua | **26.09** | 19.25 | 24.84 |
+ | PHP | **26.09** | 18.63 | 25.47 |
+ | Python | **34.16** | 17.39 | 26.71 |
+ | Rust | **28.57** | 16.15 | 22.98 |
+ | TypeScript | **32.3** | 17.39 | 30.43 |
+
+ ### Examples
+ Full examples with predictions that pass, fail tests, time out, and raise an error.
+
+ #### Passing Example
+ ```python
+ import evaluate
+ from datasets import load_dataset
+ import os
+ os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+ ds = load_dataset("gabeorlanski/bc-humaneval", split="test")
+ example = ds[0]
+ metric = evaluate.load("bc_eval")
+ languages = ["Python"]
+ question_infos = [example["question_info"]]
+ predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
+     for idx, elem in enumerate(numbers):
+         for idx2, elem2 in enumerate(numbers):
+             if idx != idx2:
+                 distance = abs(elem - elem2)
+                 if distance < threshold:
+                     return True
+
+     return False"""
+ ]]
+ metrics, results = metric.compute(
+     predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
+ )
+ ```
+ `metrics` is:
+ ```
+ {"Python/pass@1": 1.0, "Python/mean_pct_pass": 1.0}
+ ```
+ `results` is:
+ ```
+ [{"qid": 0, "idx": "0", "file_path": ".../tmpqt_p3dwn/0", "results": [{"return_code": 0, "runtime": 0.076369, "stdout": "TEST-0...PASSED\r\nTEST-1...PASSED\r\nTEST-2...PASSED\r\nTEST-3...PASSED\r\nTEST-4...PASSED\r\nTEST-5...PASSED\r\nTEST-6...PASSED\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "PASSED", "1": "PASSED", "2": "PASSED", "3": "PASSED", "4": "PASSED", "5": "PASSED", "6": "PASSED"}, "outcome": "PASSED"}]
+ ```
+
+ #### Fails Test Example
+
+ ```python
+ import evaluate
+ from datasets import load_dataset
+ import os
+ os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+ ds = load_dataset(
+     "gabeorlanski/bc-humaneval", "Python", split="test"
+ )
+ example = ds[0]
+ metric = evaluate.load("bc_eval")
+ languages = ["Python"]
+ question_infos = [example["question_info"]]
+ predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
+     for idx, elem in enumerate(numbers):
+         for idx2, elem2 in enumerate(numbers):
+             if idx != idx2:
+                 distance = elem - elem2
+                 if distance < threshold:
+                     return True
+
+     return False"""
+ ]]
+ metrics, results = metric.compute(
+     predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
+ )
+ ```
+
+ `metrics` is:
+ ```
+ {"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.5714285714285714}
+ ```
+ `results` is:
+ ```
+ [{"qid": 0, "idx": "0", "file_path": "/tmp7u587vk5/0", "results": [{"return_code": 0, "runtime": 0.08255, "stdout": "TEST-0...PASSED\r\nTEST-1...FAILED\r\nTEST-2...PASSED\r\nTEST-3...FAILED\r\nTEST-4...PASSED\r\nTEST-5...PASSED\r\nTEST-6...FAILED\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "PASSED", "1": "FAILED", "2": "PASSED", "3": "FAILED", "4": "PASSED", "5": "PASSED", "6": "FAILED"}, "outcome": "FAILED"}]
+ ```
+
+ Note that the individual test-case results are located in the `test_cases` field of each entry in `results`.
+
+ #### Timeout Example
+
+ ```python
+ import evaluate
+ from datasets import load_dataset
+ import os
+ os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+ ds = load_dataset(
+     "gabeorlanski/bc-humaneval", "Python", split="test"
+ )
+ example = ds[0]
+ metric = evaluate.load("bc_eval")
+ languages = ["Python"]
+ question_infos = [example["question_info"]]
+ predictions = [["""import time
+ def has_close_elements(numbers: List[float], threshold: float) -> bool:
+     time.sleep(100)
+ """
+ ]]
+ metrics, results = metric.compute(
+     predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
+ )
+ ```
+
+ `metrics` is:
+ ```
+ {"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.0}
+ ```
+ `results` is:
+ ```
+ [{"qid": 0, "idx": "0", "file_path": "/tmp_rz6bhb9/0", "results": [{"return_code": -1, "runtime": 10, "stdout": null, "stderr": null, "timed_out": true}], "failed": false, "timed_out": true, "test_cases": {"0": "MISSING", "1": "MISSING", "2": "MISSING", "3": "MISSING", "4": "MISSING", "5": "MISSING", "6": "MISSING"}, "outcome": "TIMED_OUT"}]
+ ```
+
+ #### Error Example
+
+ ```python
+ import evaluate
+ from datasets import load_dataset
+ import os
+ os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+ ds = load_dataset(
+     "gabeorlanski/bc-humaneval", "Python", split="test"
+ )
+ example = ds[0]
+ metric = evaluate.load("bc_eval")
+ languages = ["Python"]
+ question_infos = [example["question_info"]]
+ predictions = [["""import time
+ def has_close_elements(numbers: List[float], threshold: float) -> bool:
+     raise ValueError()
+ """,
+ """def add(a, b):
+     return a+b"""
+ ]]
+ metrics, results = metric.compute(
+     predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
+ )
+ ```
+
+ `metrics` is:
+ ```
+ {"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.0}
+ ```
+ `results` is:
+ ```
+ [{"qid": 0, "idx": "0", "file_path": "/tmpjdn51aaa/0", "results": [{"return_code": 0, "runtime": 0.102855, "stdout": "TEST-0...ValueError\r\nTEST-1...ValueError\r\nTEST-2...ValueError\r\nTEST-3...ValueError\r\nTEST-4...ValueError\r\nTEST-5...ValueError\r\nTEST-6...ValueError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "ValueError", "1": "ValueError", "2": "ValueError", "3": "ValueError", "4": "ValueError", "5": "ValueError", "6": "ValueError"}, "outcome": "HAD_ERROR"},
+ {"qid": 0, "idx": "1", "file_path": "/tmpjdn51aaa/1", "results": [{"return_code": 0, "runtime": 0.094347, "stdout": "TEST-0...NameError\r\nTEST-1...NameError\r\nTEST-2...NameError\r\nTEST-3...NameError\r\nTEST-4...NameError\r\nTEST-5...NameError\r\nTEST-6...NameError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "NameError", "1": "NameError", "2": "NameError", "3": "NameError", "4": "NameError", "5": "NameError", "6": "NameError"}, "outcome": "HAD_ERROR"}]
+ ```
+
+ ## Limitations and Bias
+ This metric requires that the dataset be BabelCode-compatible, i.e. that each `question_info` contains the fields produced by the BabelCode framework (see the sketch below).
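+
+ As a rough illustration, the fields that this implementation reads from each `question_info` are listed below (names taken from `bc_eval.py` in this repository; treat the exact schema as defined by the BabelCode dataset you load):
+
+ ```python
+ REQUIRED_KEYS = {
+     "entry_fn_name",   # substituted for PLACEHOLDER_FN_NAME in the test template
+     "entry_cls_name",  # substituted for PLACEHOLDER_CLS_NAME in the test template
+     "test_code",       # test harness template containing PLACEHOLDER_CODE_BODY
+     "test_list",
+     "test_case_ids",   # ids matched against the "TEST-<id>...<result>" output
+     "extension",       # file extension used when writing the prediction file
+     "commands",        # commands to run, with __FILENAME__ as a placeholder
+     "timeouts",        # per-command timeouts in seconds
+ }
+
+ def check_question_info(question_info: dict) -> None:
+     """Raises if a question dict is missing any field this metric uses."""
+     missing = REQUIRED_KEYS - question_info.keys()
+     if missing:
+         raise ValueError(f"question_info is missing keys: {sorted(missing)}")
+ ```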
+
+ ## Citation
+ ```
+ @article{orlanski2023measuring,
+   title={Measuring The Impact Of Programming Language Distribution},
+   author={Orlanski, Gabriel and Xiao, Kefan and Garcia, Xavier and Hui, Jeffrey and Howland, Joshua and Malmaud, Jonathan and Austin, Jacob and Singh, Rishabh and Catasta, Michele},
+   journal={arXiv preprint arXiv:2302.01973},
+   year={2023}
+ }
+ ```
app.py ADDED
@@ -0,0 +1,5 @@
+ import evaluate
+ from evaluate.utils import launch_gradio_widget
+
+ module = evaluate.load("gabeorlanski/bc_eval")
+ launch_gradio_widget(module)
bc_eval.py ADDED
@@ -0,0 +1,335 @@
+ import dataclasses
+ import itertools
+ import os
+ import re
+ import tempfile
+ from collections import defaultdict
+ from pathlib import Path
+
+ import datasets
+ import evaluate
+ import numpy as np
+ from tqdm import tqdm
+
+ from .execution import execute_predictions
+
+ # Matches lines like "TEST-<id>...<result>" printed by the BabelCode test harness.
+ STDOUT_PARSE_REGEX = re.compile(r"^TEST-(.+)\.\.\.(.+)$", flags=re.MULTILINE)
+
+ _CITATION = """\
+ @article{orlanski2023measuring,
+   title={Measuring The Impact Of Programming Language Distribution},
+   author={Orlanski, Gabriel and Xiao, Kefan and Garcia, Xavier and Hui, Jeffrey and Howland, Joshua and Malmaud, Jonathan and Austin, Jacob and Singh, Rishabh and Catasta, Michele},
+   journal={arXiv preprint arXiv:2302.01973},
+   year={2023}
+ }
+ """
+
+ _DESCRIPTION = """\
+ This metric implements the evaluation harness for datasets translated with the BabelCode framework as described in the paper "Measuring The Impact Of Programming Language Distribution" (https://arxiv.org/abs/2302.01973).
+ """
+
+
+ _KWARGS_DESCRIPTION = """
+ Calculates how many predictions per question pass a set of tests for the given problem.
+
+ Args:
+     predictions: The list of predictions for each question to execute.
+     languages: The language to use for each question.
+     question_dicts: The information for each question.
+     k: number of code candidates to consider in the evaluation (Default: [1, 10, 100])
+     num_workers: number of workers used to evaluate the candidate programs (Default: 4).
+     language_timeout: Timeouts to use for each language. If it is not set, will default to the one in the question dict (Default: None).
+ Returns:
+     pass_at_k: dict with pass rates for each k
+     results: dict with granular results of each unittest
+ Examples:
+     >>> bc_eval = evaluate.load("bc_eval")
+     >>> predictions = [["def add(a,b):\n\treturn a+b", "def add(a,b):\n\treturn a-b"]]
+     >>> languages = ["Python"]
+     >>> question_dicts = [{"test_code": "...", "entry_fn_name": "add", "entry_cls_name": "Solution", "test_case_ids": ["0", "1"], "test_list": "..."}]
+     >>> pass_at_k, results = bc_eval.compute(predictions=predictions, languages=languages, question_dicts=question_dicts, k=[1, 2])
+     >>> print(pass_at_k)
+     {'pass@1': 0.5, 'pass@2': 1.0}
+ """
+
+
+ _WARNING = """
+ ################################################################################
+ !!!WARNING!!!
+ ################################################################################
+ The "bc_eval" metric executes untrusted model-generated code in Python.
+ Although it is highly unlikely that model-generated code will do something
+ overtly malicious in response to this test suite, model-generated code may act
+ destructively due to a lack of model capability or alignment.
+ Users are strongly encouraged to sandbox this evaluation suite so that it
+ does not perform destructive actions on their host or network. For more
+ information on how OpenAI sandboxes its code, see the paper "Evaluating Large
+ Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
+ Once you have read this disclaimer and taken appropriate precautions,
+ set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
+ with:
+ >>> import os
+ >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+ ################################################################################\
+ """
+
+ _QUESTION_INFO_KEYS = {
+     "entry_fn_name",
+     "entry_cls_name",
+     "test_code",
+     "test_list",
+     "test_case_ids",
+ }
+
+
+ def make_file_and_command(
+     qid, idx, pred, question, working_dir, timeout_override=None
+ ):
+     """Writes the filled-in test file for a prediction and builds its run commands."""
+     file_name = f"pred.{question['extension']}"
+     pred_dir = working_dir.joinpath(idx)
+     pred_dir.mkdir(parents=True)
+     pred_file = pred_dir.joinpath(file_name)
+     with pred_file.open("w") as f:
+         # Fill the BabelCode test template with the prediction and entry points.
+         code = question["test_code"].replace("PLACEHOLDER_CODE_BODY", pred)
+         code = code.replace("PLACEHOLDER_FN_NAME", question["entry_fn_name"])
+         code = code.replace("PLACEHOLDER_CLS_NAME", question["entry_cls_name"])
+         f.write(code)
+
+     commands = []
+     for cmd, t in zip(question["commands"], question["timeouts"]):
+         commands.append(
+             {
+                 "timeout": t if timeout_override is None else timeout_override,
+                 "command": [
+                     c if c != "__FILENAME__" else file_name for c in cmd
+                 ],
+             }
+         )
+
+     return {"qid": qid, "idx": idx, "commands": commands, "cwd": pred_dir}
+
+
+ def _write_preds(
+     preds,
+     languages,
+     language_timeout,
+     question_dicts,
+     tmp_dir,
+ ):
+     commands = []
+     question_id_to_dict = {}
+
+     for pred_list, l, q_dict in tqdm(
+         zip(preds, languages, question_dicts), desc="Setup", total=len(preds)
+     ):
+         qid = len(question_id_to_dict)
+         q_dict['language'] = l
+         question_id_to_dict[qid] = q_dict
+         for p in pred_list:
+             commands.append(
+                 make_file_and_command(
+                     qid=qid,
+                     idx=str(len(commands)),
+                     pred=p,
+                     question=q_dict,
+                     timeout_override=language_timeout.get(l),
+                     working_dir=tmp_dir,
+                 )
+             )
+
+     return question_id_to_dict, commands
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(
+     _DESCRIPTION, _KWARGS_DESCRIPTION
+ )
+ class BabelCodeEval(evaluate.Metric):
+     def _info(self):
+         list_keys = ["timeouts", "commands", "test_case_ids"]
+         question_info_type = {
+             k: datasets.Value(dtype="string")
+             for k in _QUESTION_INFO_KEYS
+             if k not in list_keys
+         }
+         question_info_type["test_case_ids"] = datasets.Value("string")
+         question_info_type["commands"] = datasets.Sequence(
+             datasets.Value("string")
+         )
+         question_info_type["timeouts"] = datasets.Sequence(
+             datasets.Value("int32")
+         )
+
+         return evaluate.MetricInfo(
+             # This is the description that will appear on the metrics page.
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             # This defines the format of each prediction and reference
+             features=datasets.Features(
+                 {
+                     "predictions": datasets.Sequence(datasets.Value("string")),
+                     "languages": datasets.Value("string"),
+                 }
+             ),
+             homepage="https://github.com/google-research/babelcode",
+             codebase_urls=["https://github.com/google-research/babelcode"],
+             reference_urls=["https://github.com/google-research/babelcode"],
+         )
+
+     def _compute(
+         self,
+         predictions,
+         languages,
+         question_dicts,
+         k=[1, 10, 100],
+         num_workers=4,
+         language_timeout=None,
+     ):
+         """Returns the scores"""
+
+         if os.getenv("HF_ALLOW_CODE_EVAL", "0") != "1":
+             raise ValueError(_WARNING)
+
+         language_timeout = language_timeout or {}
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             working_dir = Path(tmp_dir)
+             question_map, pred_commands = _write_preds(
+                 preds=predictions,
+                 languages=languages,
+                 language_timeout=language_timeout,
+                 question_dicts=question_dicts,
+                 tmp_dir=working_dir,
+             )
+
+             results = execute_predictions(
+                 pred_commands,
+                 num_workers=num_workers,
+                 max_task_per_child=5,
+                 garbage_collection_freq=500,
+             )
+
+         all_results, q_passes, q_pct = _eval_predictions(
+             results, question_map
+         )
+
+         assert len(q_passes) == len(q_pct)
+         metrics = {}
+         for lang in q_passes:
+             metrics.update(
+                 _calculate_metrics(lang, q_passes[lang], q_pct[lang], k_vals=k)
+             )
+         return metrics, all_results
+
+
+ def _eval_single_pred(result, test_ids, num_expected_commands):
+     """Grades a single prediction from its raw execution result."""
+     test_case_results = {k: "MISSING" for k in test_ids}
+     if len(result["results"]) != num_expected_commands:
+         return "HAD_ERROR", 0, test_case_results
+
+     last_result = result["results"][-1]
+     if last_result.timed_out:
+         return "TIMED_OUT", 0, test_case_results
+     elif last_result.return_code != 0:
+         return "HAD_ERROR", 0, test_case_results
+     elif not last_result.stdout:
+         return "HAD_ERROR", 0, test_case_results
+
+     # Parse the "TEST-<id>...<result>" lines printed by the test harness.
+     for match in STDOUT_PARSE_REGEX.findall(last_result.stdout):
+         idx, test_result = match
+         if idx in test_ids:
+             if test_case_results[idx] != "MISSING":
+                 return "UNKNOWN_ERROR", 0, test_case_results
+             test_case_results[idx] = test_result.strip()
+
+     did_test_fail = False
+     had_error = False
+     num_passed = 0
+     for r in test_case_results.values():
+         if r == "PASSED":
+             num_passed += 1
+         elif r == "FAILED":
+             did_test_fail = True
+         else:
+             had_error = True
+
+     if had_error:
+         return "HAD_ERROR", num_passed, test_case_results
+     if did_test_fail:
+         return "FAILED", num_passed, test_case_results
+
+     return "PASSED", num_passed, test_case_results
+
+
+ def _eval_predictions(pred_results, question_map):
+     out = []
+     question_results = defaultdict(lambda: defaultdict(list))
+     question_pct_pass = defaultdict(lambda: defaultdict(list))
+
+     for p in pred_results:
+         question = question_map[p["qid"]]
+         test_cases = question["test_case_ids"]
+         num_expected_commands = len(question["commands"])
+
+         outcome, num_passed, test_case_results = _eval_single_pred(
+             p, test_ids=test_cases, num_expected_commands=num_expected_commands
+         )
+
+         p["results"] = [dataclasses.asdict(r) for r in p["results"]]
+         p["test_cases"] = test_case_results
+         p["outcome"] = outcome
+
+         lang = question['language']
+         question_results[lang][p["qid"]].append(
+             num_passed == len(test_case_results)
+         )
+         question_pct_pass[lang][p["qid"]].append(
+             num_passed / len(test_case_results)
+         )
+
+         out.append(p)
+
+     return out, question_results, question_pct_pass
+
+
+ def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
+     assert len(q_passed) == len(q_pcts)
+
+     # Per-question sample counts, correct counts, and mean percent of tests passed.
+     num_samples = np.zeros(len(q_passed))
+     num_correct = np.zeros(len(q_passed))
+     pcts_passed = np.zeros(len(q_passed))
+     for i, (k, v) in enumerate(q_passed.items()):
+         num_samples[i] = len(v)
+         num_correct[i] = sum(v)
+         pcts_passed[i] = np.mean(q_pcts[k])
+
+     out = {
+         f"{lang}/pass@{k}": estimate_pass_at_k(num_samples, num_correct, k).mean()
+         for k in k_vals
+     }
+     out[f"{lang}/mean_pct_pass"] = np.mean(pcts_passed)
+
+     return out
+
+
+ def estimate_pass_at_k(num_samples, num_correct, k):
+     """Estimates pass@k of each problem and returns them in an array."""
+
+     def estimator(n: int, c: int, k: int) -> float:
+         """Calculates 1 - comb(n - c, k) / comb(n, k)."""
+         if n - c < k:
+             return 1.0
+         return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+     if isinstance(num_samples, int):
+         num_samples_it = itertools.repeat(num_samples, len(num_correct))
+     else:
+         assert len(num_samples) == len(num_correct)
+         num_samples_it = iter(num_samples)
+
+     return np.array(
+         [
+             estimator(int(n), int(c), k)
+             for n, c in zip(num_samples_it, num_correct)
+         ]
+     )
execution.py ADDED
@@ -0,0 +1,145 @@
+ import datetime
+ import gc
+ import multiprocessing as mp
+ import pathlib
+ import subprocess
+ from dataclasses import dataclass
+ from typing import Dict, List
+
+ from tqdm import tqdm
+
+
+ @dataclass
+ class CommandResult:
+     return_code: int
+     runtime: float
+     stdout: str
+     stderr: str
+     timed_out: bool
+
+
+ def safe_execute(
+     command_to_run: List[str],
+     working_dir: pathlib.Path,
+     timeout: int = 10,
+ ) -> CommandResult:
+     """Executes a single command safely.
+
+     Args:
+         command_to_run: The command to run.
+         working_dir: The working directory to run it in.
+         timeout: Timeout, in seconds, before the process is killed.
+
+     Returns:
+         The result of executing the command.
+     """
+     timed_out = False
+     return_code = -1
+     runtime = timeout
+     stderr = None
+     stdout = None
+     start_time = datetime.datetime.now()
+     execution_process = subprocess.Popen(
+         command_to_run,
+         cwd=str(working_dir),
+         stdout=subprocess.PIPE,
+         stderr=subprocess.PIPE,
+     )
+     try:
+         outputs = execution_process.communicate(timeout=timeout)
+
+         stdout, stderr = outputs
+         stdout = stdout.decode('utf-8')
+         stderr = stderr.decode('utf-8')
+         runtime = (datetime.datetime.now() - start_time).total_seconds()
+         return_code = execution_process.returncode
+     except subprocess.TimeoutExpired:
+         timed_out = True
+         runtime = timeout
+     finally:
+         execution_process.kill()
+
+     return CommandResult(
+         return_code=return_code,
+         runtime=runtime,
+         stderr=stderr,
+         stdout=stdout,
+         timed_out=timed_out,
+     )
+
+
+ def execute_code(sample: Dict):
+     """Executes the command(s) for a single prediction.
+
+     Args:
+         sample: The sample to run.
+
+     Returns:
+         The execution result.
+     """
+     file_path = sample["cwd"]
+     working_dir_for_execution = (
+         file_path.parent if file_path.is_file() else file_path
+     )
+     working_dir_for_execution = working_dir_for_execution.resolve().absolute()
+     timed_out = False
+     failed = False
+     results = []
+     for command in sample['commands']:
+         res = safe_execute(
+             command['command'],
+             working_dir=working_dir_for_execution,
+             timeout=command['timeout'],
+         )
+         results.append(res)
+         # Stop at the first command that times out or fails.
+         if res.timed_out:
+             timed_out = True
+             break
+         if res.return_code != 0:
+             failed = True
+             break
+     return {
+         "qid": sample['qid'],
+         "idx": sample["idx"],
+         "file_path": str(file_path.absolute().resolve()),
+         "results": results,
+         "failed": failed,
+         "timed_out": timed_out,
+     }
+
+
+ def execute_predictions(
+     predictions: List[Dict],
+     num_workers: int = 1,
+     max_task_per_child: int = 1,
+     garbage_collection_freq: int = 500,
+ ):
+     """Executes a list of predictions in a specific language.
+
+     Args:
+         predictions: List of predictions.
+         num_workers: The number of workers to use.
+         max_task_per_child: The maximum number of tasks run per child before it is killed.
+         garbage_collection_freq: How often to run garbage collection.
+
+     Returns:
+         The list of raw execution results.
+     """
+
+     # Submit the predictions to a multiprocessing pool here so that a progress
+     # bar can be shown as results come back.
+     num_to_complete = len(predictions)
+     num_completed = 0
+     results = []
+     with mp.Pool(num_workers, maxtasksperchild=max_task_per_child) as pool:
+         for result in tqdm(
+             pool.imap_unordered(execute_code, predictions),
+             total=num_to_complete,
+             desc="Executing",
+         ):
+             num_completed += 1
+
+             results.append(result)
+
+             if num_completed % garbage_collection_freq == 0:
+                 gc.collect()
+         # Cleanup pool
+         pool.close()
+         pool.terminate()
+     return results
requirements.txt ADDED
@@ -0,0 +1 @@
+ git+https://github.com/huggingface/evaluate@main