jjkim committed
Commit · dc264fe
1 Parent(s): f435ec5
- code_eval.py  +8 -6
code_eval.py CHANGED
@@ -20,7 +20,7 @@ import itertools
 import os
 from collections import Counter, defaultdict
 from concurrent.futures import CancelledError, ThreadPoolExecutor, as_completed
-from typing import List, Optional
+from typing import Dict, List, Optional
 import time
 from string import Template
 
@@ -145,8 +145,8 @@ class CodeEval(evaluate.Metric):
             # This defines the format of each prediction and reference
             features=datasets.Features(
                 {
-                    "predictions":
-                    "references":
+                    "predictions": List[Dict],
+                    "references": List[Dict],
                 }
             ),
             homepage="https://github.com/openai/human-eval",
@@ -178,7 +178,7 @@ class CodeEval(evaluate.Metric):
             raise NotImplementedError(
                 "This metric is currently not supported on Windows."
             )
-
+
         predictions = sorted(predictions, key=lambda x: x["id"])
         references = sorted(references, key=lambda x: x["id"])
         with ThreadPoolExecutor(max_workers=num_workers) as executor:
@@ -186,7 +186,7 @@ class CodeEval(evaluate.Metric):
             for pred_d, ref_d in zip(predictions, references):
                 assert pred_d["id"] == ref_d["id"]
                 tid = pred_d["id"]
-
+
                 results[tid] = []
                 pred = pred_d[pred_key]
                 ref = ref_d[ref_key]
@@ -204,7 +204,9 @@ class CodeEval(evaluate.Metric):
                     result.add(future)
                     results[tid].append(result)
 
-            pbar = tqdm(
+            pbar = tqdm(
+                total=sum(len(r) for r in results.values()), disable=disable_tqdm
+            )
             prev_done_count = 0
             done = False
             while not done: