gabeorlanski committed
Commit • 0fb6838
1 Parent(s): a7f3790
Update bc_eval.py
bc_eval.py +10 -8
bc_eval.py CHANGED

@@ -3,7 +3,7 @@ import itertools
 import os
 import re
 import tempfile
-from collections import defaultdict
+from collections import defaultdict, Counter
 from pathlib import Path
 
 import datasets
@@ -204,13 +204,13 @@ class BabelCodeEval(evaluate.Metric):
            garbage_collection_freq=gc_freq,
        )
 
-        all_results, q_passes, q_pct = _eval_predictions(results, question_map)
+        all_results, q_passes, q_pct, o_count = _eval_predictions(results, question_map)
 
        assert len(q_passes) == len(q_pct)
        metrics = {}
        for lang in q_passes:
            metrics.update(
-                _calculate_metrics(lang, q_passes[lang], q_pct[lang], k_vals=k)
+                _calculate_metrics(lang, q_passes[lang], q_pct[lang], o_count[lang], k_vals=k)
            )
        return metrics, all_results
 
@@ -258,7 +258,7 @@ def _eval_predictions(pred_results, question_map):
    out = []
    question_results = defaultdict(lambda: defaultdict(list))
    question_pct_pass = defaultdict(lambda: defaultdict(list))
-
+    outcome_counts = defaultdict(Counter)
    for p in pred_results:
        question = question_map[p["qid"]]
        test_cases = question["test_case_ids"]
@@ -275,13 +275,13 @@ def _eval_predictions(pred_results, question_map):
        lang = question["language"]
        question_results[lang][p["qid"]].append(num_passed == len(test_case_results))
        question_pct_pass[lang][p["qid"]].append(num_passed / len(test_case_results))
-
+        outcome_counts[lang][outcome] += 1
        out.append(p)
 
-    return out, question_results, question_pct_pass
+    return out, question_results, question_pct_pass, outcome_counts
 
 
-def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
+def _calculate_metrics(lang, q_passed, q_pcts, o_count, k_vals):
    assert len(q_passed) == len(q_pcts)
 
    num_samples = np.zeros(len(q_passed))
@@ -298,7 +298,9 @@ def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
    }
    out[f"{lang}/mean_pct_pass"] = np.mean(pcts_passed)
    out[f"{lang}/median_pct_pass"] = np.median(pcts_passed)
-
+
+    for outcome, val in o_count.items():
+        out[f"{lang}/pct_{outcome}"] = val/len(q_passed)
 
    return out
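For readers skimming the diff: the change threads a per-language tally of prediction outcomes through _eval_predictions and reports each outcome as a new {lang}/pct_{outcome} metric. Below is a minimal standalone sketch of that aggregation, assuming toy data: the prediction dicts and outcome labels are invented for illustration, and only the qid / language / outcome fields, the defaultdict(Counter) tally, and the metric key format come from the diff itself.

from collections import Counter, defaultdict

# Toy predictions mirroring the fields visible in the diff (qid, language,
# outcome). The real harness, question_map, and test-case details are not
# shown here and are assumed away for illustration.
predictions = [
    {"qid": "q1", "language": "python", "outcome": "passed"},
    {"qid": "q1", "language": "python", "outcome": "timed_out"},
    {"qid": "q2", "language": "python", "outcome": "passed"},
    {"qid": "q3", "language": "go", "outcome": "had_error"},
]

# Per-language tally of execution outcomes, as the commit adds with
# defaultdict(Counter).
outcome_counts = defaultdict(Counter)
questions_seen = defaultdict(set)
for p in predictions:
    outcome_counts[p["language"]][p["outcome"]] += 1
    questions_seen[p["language"]].add(p["qid"])

# Convert raw counts into "{lang}/pct_{outcome}" metrics. As in the diff,
# the denominator is the number of distinct questions for the language,
# not the number of predictions.
metrics = {}
for lang, counts in outcome_counts.items():
    for outcome, val in counts.items():
        metrics[f"{lang}/pct_{outcome}"] = val / len(questions_seen[lang])

print(metrics)
# {'python/pct_passed': 1.0, 'python/pct_timed_out': 0.5, 'go/pct_had_error': 1.0}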