Files changed:
- src/about.py: +2 -1
- src/display/utils.py: +1 -1
- src/leaderboard/read_evals.py: +8 -5
src/about.py

@@ -17,7 +17,8 @@ class Tasks(Enum):
     task2 = Task("VCR", "acc", "VCR")
     task3 = Task("Culture", "acc", "Culture")
     task4 = Task("Trick", "acc", "Trick")
-
+
+class N_Tasks(Enum):
     task0_f1 = Task("Count", "f1", "Count")
     task1_f1 = Task("Order", "f1", "Order")
     task2_f1 = Task("VCR", "f1", "VCR")
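For context, this file follows the Hugging Face leaderboard template, where Task is a small dataclass holding the benchmark key, the metric name, and the display column. A minimal sketch of the resulting split, assuming the template's Task definition (the task0/task1 accuracy entries fall outside the hunk and are omitted):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # key under "results" in each eval JSON
    metric: str      # metric name, e.g. "acc" or "f1"
    col_name: str    # column header shown on the leaderboard

class Tasks(Enum):
    # accuracy tasks (task0/task1 not shown in the hunk)
    task2 = Task("VCR", "acc", "VCR")
    task3 = Task("Culture", "acc", "Culture")
    task4 = Task("Trick", "acc", "Trick")

class N_Tasks(Enum):
    # f1 variants, split into their own enum by this commit
    task0_f1 = Task("Count", "f1", "Count")
    task1_f1 = Task("Order", "f1", "Order")
    task2_f1 = Task("VCR", "f1", "VCR")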
src/display/utils.py

@@ -3,7 +3,7 @@ from enum import Enum

 import pandas as pd

-from src.about import Tasks
+from src.about import Tasks, N_Tasks

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
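The import matters because, in the template, utils.py generates one leaderboard column per task member; with the f1 tasks now in their own enum, the column loop has to cover both. A hypothetical sketch of that consumption (the diff itself only changes the import line, and ColumnContent is simplified here):

from dataclasses import dataclass

from src.about import Tasks, N_Tasks

@dataclass
class ColumnContent:
    name: str                  # column header
    type: str                  # display dtype, e.g. "number"
    displayed_by_default: bool

# One display column per task member, acc and f1 alike.
task_columns = [
    ColumnContent(task.value.col_name, "number", True)
    for task in list(Tasks) + list(N_Tasks)
]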
src/leaderboard/read_evals.py

@@ -8,7 +8,8 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Precision, WeightType
+from src.about import Tasks, N_Tasks
 from src.submission.check_validity import is_model_on_hub

@@ -114,9 +115,9 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")

-    def to_dict(self):
+    def to_dict(self, tasks):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        average = sum([v for v in self.results.values() if v is not None]) / len(tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.architecture.name: self.architecture,
@@ -127,7 +128,7 @@ class EvalResult:
             AutoEvalColumn.dataset_version.name: self.dataset_version,
         }

-        for task in Tasks:
+        for task in tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]

         return data_dict
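With to_dict now parameterized, the caller's enum drives both the average and the per-task columns. A small self-contained sketch of that logic with hypothetical scores, mirroring the lines above:

from src.about import N_Tasks

results = {"Count": 0.41, "Order": 0.37, "VCR": 0.52}  # hypothetical f1 scores

# average = sum of non-null scores divided by the task count
average = sum(v for v in results.values() if v is not None) / len(N_Tasks)
# one column per task; a missing benchmark key raises KeyError here
row = {task.value.col_name: results[task.value.benchmark] for task in N_Tasks}

Note the divisor is the task count, not the count of non-null scores, so a task reported as None lowers the average rather than being skipped, while an absent benchmark key raises the KeyError that get_raw_eval_results catches below.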
@@ -187,10 +188,12 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         else:
             eval_results[eval_name] = eval_result

+    version = results_path.split("/")[-1]
+    tasks = N_Tasks if "n_" in version else Tasks
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict(tasks) # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
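The new dispatch keys off the last path segment, so a results directory whose name contains "n_" is scored against the f1 task set. A quick sketch of the selection rule in isolation (paths are hypothetical):

from src.about import Tasks, N_Tasks

def pick_tasks(results_path: str):
    # Mirrors the commit: the last path component is treated as the
    # version tag, and any "n_" substring selects the f1 task set.
    version = results_path.split("/")[-1]
    return N_Tasks if "n_" in version else Tasks

assert pick_tasks("eval-results/n_v1") is N_Tasks  # hypothetical path
assert pick_tasks("eval-results/v1") is Tasks      # hypothetical path

Since this is a substring test rather than a prefix test, any version name containing "n_" anywhere selects N_Tasks.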