Commit c04b086
Parent: 79222e8

add term, domain, boomlet tables, and code to parse/display them correctly
Files changed:
- app.py (+46 -14)
- results/leaderboards/BOOMLET_leaderboard.csv (+15 -0)
- results/leaderboards/BOOM_domain_leaderboard.csv (+15 -0)
- results/{BOOM_leaderboard.csv → leaderboards/BOOM_leaderboard.csv} (+1 -1)
- results/leaderboards/BOOM_term_leaderboard.csv (+15 -0)
- results/models_info/naive/config.json (+1 -0)
- src/display/utils.py (+0 -4)
- src/populate.py (+4 -4)
app.py (CHANGED)

@@ -61,10 +61,19 @@ def restart_space():
 
 
 LEADERBOARD_DF = get_leaderboard_df(
-    EVAL_RESULTS_PATH + "/…
+    EVAL_RESULTS_PATH + "/leaderboards/BOOM_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
 )
 LEADERBOARD_DF_DOMAIN = get_leaderboard_df(
-    EVAL_RESULTS_PATH + "/…
+    EVAL_RESULTS_PATH + "/leaderboards/BOOM_domain_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
+# LEADERBOARD_DF_METRIC_TYPE = get_leaderboard_df(
+#     EVAL_RESULTS_PATH + "/leaderboards/BOOM_metric_type_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+# )
+LEADERBOARD_DF_TERM = get_leaderboard_df(
+    EVAL_RESULTS_PATH + "/leaderboards/BOOM_term_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
+LEADERBOARD_DF_BOOMLET = get_leaderboard_df(
+    EVAL_RESULTS_PATH + "/leaderboards/BOOMLET_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
 )
 model_info_df = get_model_info_df(EVAL_RESULTS_PATH)
 

@@ -76,23 +85,37 @@ model_info_df = get_model_info_df(EVAL_RESULTS_PATH)
 
 
 def init_leaderboard(dataframe, model_info_df):
-    # TODO: merge results df with model info df
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
     merged_df = get_merged_df(dataframe, model_info_df)
-    …
+
+    if "Rank" in merged_df.columns:
+        merged_df = merged_df.sort_values(by=["Rank"], ascending=True)
+    else:
+        # Sort by the first CRPS column if the Rank column is not present
+        crps_cols = [col for col in merged_df.columns if "CRPS" in col]
+        if crps_cols:
+            merged_df = merged_df.sort_values(by=crps_cols[0], ascending=True)
 
     # Move the model_type_symbol column to the beginning
-    cols = [AutoEvalColumn.model_type_symbol.name] + …
-    …
-    …
+    cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + sorted(
+        [
+            col
+            for col in merged_df.columns
+            if col not in [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
+        ]
+    )
     merged_df = merged_df[cols]
+    col2type_dict = {c.name: c.type for c in fields(AutoEvalColumn)}
+    datatype_list = [col2type_dict[col] if col in col2type_dict else "number" for col in merged_df.columns]
+    model_info_col_list = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]
+    default_selection_list = list(dataframe.columns) + model_info_col_list
     return Leaderboard(
         value=merged_df,
-        datatype=…
+        datatype=datatype_list,
         select_columns=SelectColumns(
-            default_selection=…
+            default_selection=default_selection_list,
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),

@@ -102,7 +125,8 @@ def init_leaderboard(dataframe, model_info_df):
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
         ],
         bool_checkboxgroup_label="Hide models",
-        column_widths=[40, …
+        column_widths=[40, 180] + [160 for _ in range(len(merged_df.columns) - 2)],
+        wrap=True,
         interactive=False,
     )
 

@@ -116,11 +140,19 @@ with demo:
         with gr.TabItem("🏅 Overall", elem_id="boom-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF, model_info_df)
 
-    …
-    …
-    …
+        with gr.TabItem("🏅 By Domain", elem_id="boom-benchmark-tab-table", id=1):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_DOMAIN, model_info_df)
+
+        # with gr.TabItem("🏅 By Metric Type", elem_id="boom-benchmark-tab-table", id=2):
+        #     leaderboard = init_leaderboard(LEADERBOARD_DF_METRIC_TYPE, model_info_df)
+
+        with gr.TabItem("🏅 By Forecast Horizon", elem_id="boom-benchmark-tab-table", id=3):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_TERM, model_info_df)
+
+        with gr.TabItem("🏅 BOOMLET", elem_id="boom-benchmark-tab-table", id=4):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_BOOMLET, model_info_df)
 
-        with gr.TabItem("📝 About", elem_id="boom-benchmark-tab-table", id=…):
+        with gr.TabItem("📝 About", elem_id="boom-benchmark-tab-table", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.Row():
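The new sorting logic in init_leaderboard prefers a precomputed Rank column and falls back to the first CRPS column when a leaderboard does not have one (the domain and term CSVs only ship per-split MASE/CRPS). A minimal sketch of that fallback on toy data, independent of the Gradio app; the column names below are illustrative only:

import pandas as pd

def sort_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Mirror of the fallback added to init_leaderboard: prefer Rank, else first CRPS column."""
    if "Rank" in df.columns:
        return df.sort_values(by=["Rank"], ascending=True)
    crps_cols = [col for col in df.columns if "CRPS" in col]
    if crps_cols:
        return df.sort_values(by=crps_cols[0], ascending=True)
    return df  # nothing to sort on

# Domain-style frame: no Rank column, so the first CRPS column drives the order.
domain = pd.DataFrame({
    "model": ["a", "b"],
    "Application Usage (CRPS)": [0.5, 0.4],
    "Database (CRPS)": [0.6, 0.7],
})
print(sort_leaderboard(domain)["model"].tolist())  # ['b', 'a']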
results/leaderboards/BOOMLET_leaderboard.csv (ADDED)

@@ -0,0 +1,15 @@
+model,MASE,CRPS,Rank
+Toto-Open-Base-1.0,0.617,0.519,1.244
+timesfm_2_0_500m,0.685,0.603,4.156
+moirai_1.1_large,0.767,0.621,4.267
+moirai_1.1_base,0.779,0.630,4.567
+moirai_1.1_small,0.786,0.631,4.944
+chronos_bolt_base,0.711,0.637,5.467
+chronos_bolt_small,0.717,0.642,5.667
+time-moe,0.810,0.788,8.989
+timer,0.807,0.793,9.244
+autoarima,0.922,0.880,9.667
+visionts,0.912,0.885,10.922
+seasonalnaive,1.000,1.000,11.400
+autoets,0.969,15.664,12.033
+autotheta,1.030,1.182,12.433
results/leaderboards/BOOM_domain_leaderboard.csv (ADDED)

@@ -0,0 +1,15 @@
+model,Application Usage (MASE),Database (MASE),Infrastructure (MASE),Networking (MASE),Security (MASE),Application Usage (CRPS),Database (CRPS),Infrastructure (CRPS),Networking (CRPS),Security (CRPS)
+Toto-Open-Base-1.0,0.639,0.635,0.568,0.635,0.682,0.378,0.362,0.391,0.4,0.476
+autoets,0.87,0.859,0.727,0.98,0.868,1.423,1.604,4.563,2.067,0.878
+autoarima,0.865,0.839,0.708,0.937,0.9,0.757,0.734,0.679,0.795,0.757
+autotheta,1.151,1.188,0.976,1.213,1.039,1.019,1.032,0.963,1.105,0.991
+chronos_bolt_base,0.748,0.757,0.663,0.757,0.729,0.451,0.441,0.466,0.489,0.535
+chronos_bolt_small,0.748,0.761,0.678,0.779,0.734,0.452,0.444,0.474,0.506,0.539
+moirai_1.1_base,0.721,0.738,0.65,0.786,0.739,0.422,0.414,0.446,0.484,0.504
+moirai_1.1_large,0.73,0.743,0.67,0.773,0.736,0.43,0.418,0.462,0.484,0.504
+moirai_1.1_small,0.747,0.751,0.692,0.795,0.741,0.44,0.429,0.476,0.493,0.505
+seasonalnaive,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
+time-moe,0.863,0.714,0.791,0.856,0.77,0.633,0.618,0.713,0.721,0.625
+timer,0.871,0.716,0.728,0.871,0.828,0.636,0.619,0.655,0.725,0.664
+timesfm_2_0_500m,0.736,0.765,0.679,0.765,0.717,0.441,0.44,0.471,0.493,0.525
+visionts,1.042,1.017,0.863,1.035,0.924,0.691,0.647,0.666,0.734,0.735
results/{BOOM_leaderboard.csv → leaderboards/BOOM_leaderboard.csv} (RENAMED)

@@ -1,4 +1,4 @@
-model,…
+model,MASE,CRPS,Rank
 Toto-Open-Base-1.0,0.617,0.375,2.336
 moirai_1.1_base,0.710,0.428,4.253
 moirai_1.1_large,0.720,0.436,4.481
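Note that only the overall BOOM file and the BOOMLET file carry a precomputed Rank column; the domain and term files expose per-split MASE/CRPS only, which is exactly the case the new CRPS fallback in init_leaderboard handles. A quick check, assuming it is run from the Space's repository root with the layout shown in this commit:

import pandas as pd

for name in ["BOOM_leaderboard", "BOOMLET_leaderboard",
             "BOOM_domain_leaderboard", "BOOM_term_leaderboard"]:
    df = pd.read_csv(f"results/leaderboards/{name}.csv")
    print(name, "has Rank" if "Rank" in df.columns else "no Rank -> sorted by first CRPS column")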
results/leaderboards/BOOM_term_leaderboard.csv (ADDED)

@@ -0,0 +1,15 @@
+model,Long (MASE),Medium (MASE),Short (MASE),Long (CRPS),Medium (CRPS),Short (CRPS)
+Toto-Open-Base-1.0,0.688,0.657,0.535,0.424,0.406,0.318
+autoets,0.938,0.885,0.739,2.83,2.399,1.253
+autoarima,0.896,0.853,0.749,0.807,0.804,0.635
+autotheta,1.368,1.163,0.928,1.296,1.183,0.738
+chronos_bolt_base,0.798,0.782,0.632,0.519,0.507,0.365
+chronos_bolt_small,0.813,0.782,0.638,0.528,0.508,0.368
+moirai_1.1_base,0.78,0.753,0.627,0.473,0.46,0.37
+moirai_1.1_large,0.799,0.77,0.626,0.491,0.475,0.369
+moirai_1.1_small,0.795,0.771,0.67,0.482,0.476,0.399
+seasonalnaive,1.0,1.0,1.0,1.0,1.0,1.0
+time-moe,0.886,0.866,0.704,0.724,0.725,0.541
+timer,0.809,0.804,0.779,0.661,0.671,0.597
+timesfm_2_0_500m,0.817,0.78,0.619,0.522,0.499,0.359
+visionts,1.026,1.011,0.947,0.698,0.698,0.64
results/models_info/naive/config.json (CHANGED)

@@ -1,5 +1,6 @@
 {
     "model": "Naive",
+    "tmp_name": "naive",
     "model_type": "statistical",
     "model_dtype": "float32"
 }
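The new tmp_name field gives the model a machine-friendly key that matches the model column in the results CSVs ("naive" vs. the display name "Naive"), which is what get_merged_df joins on. get_model_info_df itself is not part of this diff; the following is only a plausible sketch of how such per-model configs could be collected into a DataFrame, with the path layout taken from this commit and everything else assumed:

import glob
import json
import pandas as pd

def load_model_info(results_path: str) -> pd.DataFrame:
    """Hypothetical loader: one row per results/models_info/<name>/config.json."""
    rows = []
    for path in glob.glob(f"{results_path}/models_info/*/config.json"):
        with open(path) as f:
            rows.append(json.load(f))  # expects keys like model, tmp_name, model_type, model_dtype
    return pd.DataFrame(rows)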
src/display/utils.py (CHANGED)

@@ -29,10 +29,6 @@ auto_eval_column_dict.append(
     ["model_type_symbol", ColumnContent, ColumnContent("Type", "str", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-# Scores
-auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("MASE", "number", True)])
-auto_eval_column_dict.append(["CRPS_6750_scaled", ColumnContent, ColumnContent("CRPS", "number", True)])
-auto_eval_column_dict.append(["Rank_6750_scaled", ColumnContent, ColumnContent("Rank", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Model Type", "str", False, hidden=True)])
 # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
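With the hard-coded MASE_6750_scaled / CRPS_6750_scaled / Rank_6750_scaled entries removed, score columns are now taken straight from each leaderboard CSV's header (app.py builds default_selection from dataframe.columns and falls back to the "number" datatype for any column not declared here), and AutoEvalColumn keeps only the model columns and model metadata. The ColumnContent entries appended above follow the usual dataclass pattern; a sketch consistent with the calls shown, though the exact field definitions in this repo may differ:

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str                 # display name, e.g. "Model Type"
    type: str                 # leaderboard datatype, e.g. "str", "number", "markdown"
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# Matches the usage above:
#   ColumnContent("Type", "str", True, never_hidden=True)
#   ColumnContent("Model Type", "str", False, hidden=True)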
src/populate.py (CHANGED)

@@ -21,7 +21,9 @@ def get_merged_df(result_df: pd.DataFrame, model_info_df: pd.DataFrame) -> pd.DataFrame:
     """Merges the model info dataframe with the results dataframe"""
     result_df = result_df.rename(columns={"Model": "tmp_name"})
     merged_df = pd.merge(model_info_df, result_df, on="tmp_name", how="inner")
-    assert len(merged_df) == len(…
+    assert len(merged_df) == len(
+        result_df
+    ), f"missing model info for: {set(result_df['tmp_name'].unique()) - set(model_info_df['tmp_name'].unique())}"
     merged_df = merged_df.drop(columns=["Model", "tmp_name"])
     merged_df = merged_df.rename(columns={"model_w_link": "Model"})
     return merged_df

@@ -53,7 +55,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm…
     - Internal column names are mapped to display names using `AutoEvalColumn`.
     - A new column for model type symbols is created by parsing the `model_type` column.
     - The `model_type` column is updated to prepend the model type symbol.
-    - The DataFrame is sorted by the `…
+    - The DataFrame is sorted by the `Rank_scaled` column in ascending order.
     """
 
     df = pd.read_csv(results_path)

@@ -62,8 +64,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm…
     column_mapping = {field.name: getattr(AutoEvalColumn, field.name).name for field in fields(AutoEvalColumn)}
     # Assuming `df` is your DataFrame:
     df.rename(columns=column_mapping, inplace=True)
-
-    df = df.sort_values(by=[AutoEvalColumn.Rank_6750_scaled.name], ascending=True)
     return df
 
 
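The expanded assert in get_merged_df turns a silent row drop from the inner merge into an explicit error that names the models missing a config. A toy reproduction of what it guards against, with illustrative names and values:

import pandas as pd

model_info_df = pd.DataFrame({
    "tmp_name": ["naive"],
    "model_w_link": ["[Naive](https://example.com)"],  # hypothetical link column
    "Model Type": ["statistical"],
})
result_df = pd.DataFrame({
    "tmp_name": ["naive", "toto"],
    "MASE": [1.0, 0.62],
    "Model": ["naive", "toto"],
})

# Inner merge keeps only models that have a config; "toto" is dropped here.
merged_df = pd.merge(model_info_df, result_df, on="tmp_name", how="inner")
assert len(merged_df) == len(
    result_df
), f"missing model info for: {set(result_df['tmp_name'].unique()) - set(model_info_df['tmp_name'].unique())}"
# Raises: AssertionError: missing model info for: {'toto'}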