Commit · c3ba57d
Parent(s): bdacdff
parse model config from json files and display clickable links under Model column
- app.py +15 -4
- results/BOOM_leaderboard.csv +15 -15
- src/display/formatting.py +12 -2
- src/leaderboard/read_evals.py +73 -21
- src/populate.py +19 -16
app.py
CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from
+from src.populate import get_model_info_df, get_merged_df

 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -66,6 +66,7 @@ LEADERBOARD_DF = get_leaderboard_df(
 LEADERBOARD_DF_DOMAIN = get_leaderboard_df(
     EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
 )
+model_info_df = get_model_info_df(EVAL_RESULTS_PATH)

 # (
 #     finished_eval_queue_df,
@@ -74,12 +75,21 @@ LEADERBOARD_DF_DOMAIN = get_leaderboard_df(
 # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe, model_info_df):
     # TODO: merge results df with model info df
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    merged_df = get_merged_df(dataframe, model_info_df)
+    merged_df = merged_df.sort_values(by=[AutoEvalColumn.Rank_6750_scaled.name], ascending=True)
+
+    # Move the model_type_symbol column to the beginning
+    cols = [AutoEvalColumn.model_type_symbol.name] + [
+        col for col in merged_df.columns if col != AutoEvalColumn.model_type_symbol.name
+    ]
+    merged_df = merged_df[cols]
     return Leaderboard(
-        value=
+        value=merged_df,
         datatype=[c.type for c in fields(AutoEvalColumn)],
         select_columns=SelectColumns(
             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
@@ -92,6 +102,7 @@ def init_leaderboard(dataframe):
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
         ],
         bool_checkboxgroup_label="Hide models",
+        column_widths=[40, 150] + [180 for _ in range(len(merged_df.columns) - 2)],
         interactive=False,
     )

@@ -103,7 +114,7 @@ with demo:

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Overall", elem_id="boom-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            leaderboard = init_leaderboard(LEADERBOARD_DF, model_info_df)

             # TODO - add other tabs if needed
             # with gr.TabItem("🏅 By Domain - TODO", elem_id="boom-benchmark-tab-table", id=1):
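As a quick check of the column_widths expression added above, assuming the merged frame ends up with six display columns (type symbol, Model, model type, and the three BOOM metrics); the column names below are illustrative, not taken from this Space:

# Hypothetical column layout of the merged leaderboard frame.
columns = ["T", "Model", "Model Type", "MASE_6750_scaled", "CRPS_6750_scaled", "Rank_6750_scaled"]

# Same expression as in init_leaderboard: a narrow symbol column, a wider
# Model column, then a uniform width for every remaining column.
column_widths = [40, 150] + [180 for _ in range(len(columns) - 2)]
print(column_widths)  # [40, 150, 180, 180, 180, 180]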
results/BOOM_leaderboard.csv
CHANGED
@@ -1,15 +1,15 @@
-model,
-Toto-Open-Base-1.0,
-moirai_1.1_base,
-moirai_1.1_large,
-moirai_1.1_small,
-timesfm_2_0_500m,
-chronos_bolt_base,
-chronos_bolt_small,
-autoarima,
-timer,
-time-moe,
-visionts,
-autoets,
-autotheta,
-seasonalnaive,
+model,MASE_6750_scaled,CRPS_6750_scaled,Rank_6750_scaled
+Toto-Open-Base-1.0,0.617,0.375,2.336
+moirai_1.1_base,0.710,0.428,4.253
+moirai_1.1_large,0.720,0.436,4.481
+moirai_1.1_small,0.729,0.442,4.820
+timesfm_2_0_500m,0.725,0.447,5.155
+chronos_bolt_base,0.726,0.451,5.447
+chronos_bolt_small,0.733,0.455,5.792
+autoarima,0.824,0.736,9.166
+timer,0.796,0.639,9.370
+time-moe,0.806,0.649,9.381
+visionts,0.988,0.673,10.317
+autoets,0.842,1.975,10.968
+autotheta,1.123,1.018,11.724
+seasonalnaive,1.000,1.000,11.791
src/display/formatting.py
CHANGED
@@ -1,5 +1,15 @@
-def model_hyperlink(
-
+def model_hyperlink(model_link, code_link, model_name):
+    if model_link == "":
+        return model_name
+        # return f'<a target="_blank">{model_name}</a>'
+        # return f'<a target="_blank" href="{link}" rel="noopener noreferrer">{model_name}</a>'
+    else:
+        model_url = f'<a target="_blank" href="{model_link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+        if code_link == "":
+            return model_url
+        else:
+            code_url = f'<a target="_blank" href="{code_link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">code</a>'
+            return f"{model_url} ({code_url})"


 def make_clickable_model(model_name):
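For reference, a minimal sketch of how the new model_hyperlink helper behaves, assuming it is imported from this Space's src/display/formatting module; the URLs below are illustrative, not taken from this repo:

from src.display.formatting import model_hyperlink

# Hypothetical links, used purely for illustration.
cell = model_hyperlink(
    model_link="https://huggingface.co/Datadog/Toto-Open-Base-1.0",
    code_link="https://github.com/DataDog/toto",
    model_name="Toto-Open-Base-1.0",
)
# cell is an HTML anchor for the model page, followed by a "(code)" anchor,
# e.g. '<a target="_blank" href="...">Toto-Open-Base-1.0</a> (<a ...>code</a>)'

# With no links at all, the plain model name comes back unchanged.
assert model_hyperlink("", "", "seasonalnaive") == "seasonalnaive"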
src/leaderboard/read_evals.py
CHANGED
@@ -3,33 +3,33 @@ import json
 import math
 import os
 from dataclasses import dataclass
-
+from pathlib import Path
 import dateutil
 import numpy as np

-from src.display.formatting import make_clickable_model
+from src.display.formatting import make_clickable_model, model_hyperlink
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub


 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-
-    eval_name: str
-    full_model: str
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False

     @classmethod
@@ -85,10 +85,10 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision=
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )

     def update_with_request_file(self, requests_path):
@@ -105,7 +105,9 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -132,6 +134,59 @@ class EvalResult:
         return data_dict


+@dataclass
+class ModelConfig:
+    """Represents the model configuration of a model"""
+
+    model: str
+    tmp_name: str
+    model_link: str = ""
+    model_type: ModelType = ModelType.Unknown
+    code_link: str = ""
+
+    @classmethod
+    def init_from_json_file(cls, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        model_type = ModelType.from_str(data.get("model_type", ""))
+        model = data.get("model", "")
+        tmp_name = data.get("tmp_name", "")
+        model_link = data.get("model_link", "")
+        code_link = data.get("code_link", "")
+        return cls(model=model, tmp_name=tmp_name, model_link=model_link, model_type=model_type, code_link=code_link)
+
+    def to_dict(self):
+        """Converts the model info to a dict compatible with our dataframe display"""
+        data_dict = {
+            AutoEvalColumn.model.name: self.model,
+            "model_w_link": model_hyperlink(self.model_link, self.code_link, self.model),
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            "tmp_name": self.tmp_name,
+        }
+
+        return data_dict
+
+
+def get_model_info(results_path: str) -> list[ModelConfig]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = (Path(results_path) / "models_info").glob("**/config.json")
+
+    model_info_list = []
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        model_info = ModelConfig.init_from_json_file(model_result_filepath)
+
+        try:
+            model_info.to_dict()  # we test if the dict version is complete
+            model_info_list.append(model_info)
+        except KeyError:  # not all eval values present
+            continue
+    return model_info_list
+
+
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
@@ -146,10 +201,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file

@@ -188,7 +240,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict()
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
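For context, get_model_info above scans results/models_info/**/config.json. A minimal sketch of one such file and how ModelConfig consumes it, with made-up values (only the key names are taken from the code above):

import json
import tempfile
from pathlib import Path

from src.leaderboard.read_evals import ModelConfig

# Hypothetical config.json payload; key names match what init_from_json_file reads.
example = {
    "model": "Toto-Open-Base-1.0",
    "tmp_name": "Toto-Open-Base-1.0",  # should match the "model" values in BOOM_leaderboard.csv
    "model_link": "https://huggingface.co/Datadog/Toto-Open-Base-1.0",  # illustrative URL
    "code_link": "https://github.com/DataDog/toto",  # illustrative URL
    "model_type": "pretrained",  # parsed through ModelType.from_str
}

with tempfile.TemporaryDirectory() as tmp:
    cfg = Path(tmp) / "config.json"
    cfg.write_text(json.dumps(example))
    info = ModelConfig.init_from_json_file(cfg)
    row = info.to_dict()  # includes "model_w_link" built via model_hyperlink
    print(row["model_w_link"])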
src/populate.py
CHANGED
@@ -5,10 +5,28 @@ import pandas as pd
 from dataclasses import fields
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import
+from src.leaderboard.read_evals import get_model_info
 from src.display.utils import ModelType


+def get_model_info_df(results_path: str) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_model_info(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+    df = pd.DataFrame.from_records(all_data_json)
+    return df
+
+
+def get_merged_df(result_df: pd.DataFrame, model_info_df: pd.DataFrame) -> pd.DataFrame:
+    """Merges the model info dataframe with the results dataframe"""
+    result_df = result_df.rename(columns={"Model": "tmp_name"})
+    merged_df = pd.merge(model_info_df, result_df, on="tmp_name", how="inner")
+    assert len(merged_df) == len(result_df)
+    merged_df = merged_df.drop(columns=["Model", "tmp_name"])
+    merged_df = merged_df.rename(columns={"model_w_link": "Model"})
+    return merged_df
+
+
 # def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
 #     """Creates a dataframe from all the individual experiment results"""
 #     raw_data = get_raw_eval_results(results_path, requests_path)
@@ -45,21 +63,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     # Assuming `df` is your DataFrame:
     df.rename(columns=column_mapping, inplace=True)

-    # Create a new column for model type symbol by parsing the model_type column
-    df[AutoEvalColumn.model_type_symbol.name] = df[AutoEvalColumn.model_type.name].apply(
-        lambda x: ModelType.from_str(x).value.symbol
-    )
-    # Prepend the value of model_type_symbol to the value of model_type
-    df[AutoEvalColumn.model_type.name] = (
-        df[AutoEvalColumn.model_type_symbol.name] + " " + df[AutoEvalColumn.model_type.name]
-    )
-
-    # Move the model_type_symbol column to the beginning
-    cols = [AutoEvalColumn.model_type_symbol.name] + [
-        col for col in df.columns if col != AutoEvalColumn.model_type_symbol.name
-    ]
-    df = df[cols]
-
     df = df.sort_values(by=[AutoEvalColumn.Rank_6750_scaled.name], ascending=True)
     return df
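A small sketch of what get_merged_df does to the column set, using toy frames (column contents are made up; in the app the inputs come from get_leaderboard_df and get_model_info_df):

import pandas as pd

from src.populate import get_merged_df

# Toy results frame: one row keyed by the display "Model" column.
results = pd.DataFrame({"Model": ["Toto-Open-Base-1.0"], "Rank_6750_scaled": [2.336]})

# Toy model-info frame, shaped like get_model_info_df output.
model_info = pd.DataFrame(
    {
        "Model": ["Toto-Open-Base-1.0"],  # plain name, dropped after the merge
        "model_w_link": ['<a href="https://huggingface.co/...">Toto-Open-Base-1.0</a>'],
        "tmp_name": ["Toto-Open-Base-1.0"],  # join key against the results frame
    }
)

merged = get_merged_df(results, model_info)
print(merged.columns.tolist())  # ['Model', 'Rank_6750_scaled'] — "Model" now holds the HTML link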