annamonica committed
Commit c04b086 · 1 Parent(s): 79222e8

add term, domain, boomlet tables, and code to parse/display them correctly

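Context for the app.py change below: the new per-domain, per-horizon, and BOOMLET CSVs have no "Rank" column, so the updated init_leaderboard sorts by the first column whose name contains "CRPS" when "Rank" is absent. A minimal standalone sketch of that fallback, using a hypothetical frame rather than the real leaderboard files:

import pandas as pd

# Hypothetical frame standing in for a merged per-horizon leaderboard (no "Rank" column).
df = pd.DataFrame(
    {
        "Model": ["a", "b", "c"],
        "Long (CRPS)": [0.5, 0.3, 0.9],
        "Short (CRPS)": [0.4, 0.2, 0.8],
    }
)

# Same fallback as in the diff: prefer "Rank", otherwise the first CRPS column.
if "Rank" in df.columns:
    df = df.sort_values(by=["Rank"], ascending=True)
else:
    crps_cols = [col for col in df.columns if "CRPS" in col]
    if crps_cols:
        df = df.sort_values(by=crps_cols[0], ascending=True)

print(df)  # rows ordered b, a, c by "Long (CRPS)"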
app.py CHANGED
@@ -61,10 +61,19 @@ def restart_space():
 
 
 LEADERBOARD_DF = get_leaderboard_df(
-    EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+    EVAL_RESULTS_PATH + "/leaderboards/BOOM_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
 )
 LEADERBOARD_DF_DOMAIN = get_leaderboard_df(
-    EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+    EVAL_RESULTS_PATH + "/leaderboards/BOOM_domain_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
+# LEADERBOARD_DF_METRIC_TYPE = get_leaderboard_df(
+#     EVAL_RESULTS_PATH + "/leaderboards/BOOM_metric_type_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+# )
+LEADERBOARD_DF_TERM = get_leaderboard_df(
+    EVAL_RESULTS_PATH + "/leaderboards/BOOM_term_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
+LEADERBOARD_DF_BOOMLET = get_leaderboard_df(
+    EVAL_RESULTS_PATH + "/leaderboards/BOOMLET_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
 )
 model_info_df = get_model_info_df(EVAL_RESULTS_PATH)
 
@@ -76,23 +85,37 @@ model_info_df = get_model_info_df(EVAL_RESULTS_PATH)
 
 
 def init_leaderboard(dataframe, model_info_df):
-    # TODO: merge results df with model info df
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
     merged_df = get_merged_df(dataframe, model_info_df)
-    merged_df = merged_df.sort_values(by=[AutoEvalColumn.Rank_6750_scaled.name], ascending=True)
+
+    if "Rank" in merged_df.columns:
+        merged_df = merged_df.sort_values(by=["Rank"], ascending=True)
+    else:
+        # Sort by the first CRPS column if the Rank column is not present
+        crps_cols = [col for col in merged_df.columns if "CRPS" in col]
+        if crps_cols:
+            merged_df = merged_df.sort_values(by=crps_cols[0], ascending=True)
 
     # Move the model_type_symbol column to the beginning
-    cols = [AutoEvalColumn.model_type_symbol.name] + [
-        col for col in merged_df.columns if col != AutoEvalColumn.model_type_symbol.name
-    ]
+    cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name] + sorted(
+        [
+            col
+            for col in merged_df.columns
+            if col not in [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
+        ]
+    )
     merged_df = merged_df[cols]
+    col2type_dict = {c.name: c.type for c in fields(AutoEvalColumn)}
+    datatype_list = [col2type_dict[col] if col in col2type_dict else "number" for col in merged_df.columns]
+    model_info_col_list = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]
+    default_selection_list = list(dataframe.columns) + model_info_col_list
     return Leaderboard(
         value=merged_df,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
+        datatype=datatype_list,
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            default_selection=default_selection_list,
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
@@ -102,7 +125,8 @@ def init_leaderboard(dataframe, model_info_df):
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
         ],
         bool_checkboxgroup_label="Hide models",
-        column_widths=[40, 150] + [180 for _ in range(len(merged_df.columns) - 2)],
+        column_widths=[40, 180] + [160 for _ in range(len(merged_df.columns) - 2)],
+        wrap=True,
         interactive=False,
     )
 
@@ -116,11 +140,19 @@ with demo:
         with gr.TabItem("🏅 Overall", elem_id="boom-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF, model_info_df)
 
-        # TODO - add other tabs if needed
-        # with gr.TabItem("🏅 By Domain - TODO", elem_id="boom-benchmark-tab-table", id=1):
-        #     leaderboard = init_leaderboard(LEADERBOARD_DF_DOMAIN)  # TODO - update table data
+        with gr.TabItem("🏅 By Domain", elem_id="boom-benchmark-tab-table", id=1):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_DOMAIN, model_info_df)
+
+        # with gr.TabItem("🏅 By Metric Type", elem_id="boom-benchmark-tab-table", id=2):
+        #     leaderboard = init_leaderboard(LEADERBOARD_DF_METRIC_TYPE, model_info_df)
+
+        with gr.TabItem("🏅 By Forecast Horizon", elem_id="boom-benchmark-tab-table", id=3):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_TERM, model_info_df)
+
+        with gr.TabItem("🏅 BOOMLET", elem_id="boom-benchmark-tab-table", id=4):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_BOOMLET, model_info_df)
 
-        with gr.TabItem("📝 About", elem_id="boom-benchmark-tab-table", id=2):
+        with gr.TabItem("📝 About", elem_id="boom-benchmark-tab-table", id=5):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.Row():
results/leaderboards/BOOMLET_leaderboard.csv ADDED
@@ -0,0 +1,15 @@
+model,MASE,CRPS,Rank
+Toto-Open-Base-1.0,0.617,0.519,1.244
+timesfm_2_0_500m,0.685,0.603,4.156
+moirai_1.1_large,0.767,0.621,4.267
+moirai_1.1_base,0.779,0.630,4.567
+moirai_1.1_small,0.786,0.631,4.944
+chronos_bolt_base,0.711,0.637,5.467
+chronos_bolt_small,0.717,0.642,5.667
+time-moe,0.810,0.788,8.989
+timer,0.807,0.793,9.244
+autoarima,0.922,0.880,9.667
+visionts,0.912,0.885,10.922
+seasonalnaive,1.000,1.000,11.400
+autoets,0.969,15.664,12.033
+autotheta,1.030,1.182,12.433
results/leaderboards/BOOM_domain_leaderboard.csv ADDED
@@ -0,0 +1,15 @@
+model,Application Usage (MASE),Database (MASE),Infrastructure (MASE),Networking (MASE),Security (MASE),Application Usage (CRPS),Database (CRPS),Infrastructure (CRPS),Networking (CRPS),Security (CRPS)
+Toto-Open-Base-1.0,0.639,0.635,0.568,0.635,0.682,0.378,0.362,0.391,0.4,0.476
+autoets,0.87,0.859,0.727,0.98,0.868,1.423,1.604,4.563,2.067,0.878
+autoarima,0.865,0.839,0.708,0.937,0.9,0.757,0.734,0.679,0.795,0.757
+autotheta,1.151,1.188,0.976,1.213,1.039,1.019,1.032,0.963,1.105,0.991
+chronos_bolt_base,0.748,0.757,0.663,0.757,0.729,0.451,0.441,0.466,0.489,0.535
+chronos_bolt_small,0.748,0.761,0.678,0.779,0.734,0.452,0.444,0.474,0.506,0.539
+moirai_1.1_base,0.721,0.738,0.65,0.786,0.739,0.422,0.414,0.446,0.484,0.504
+moirai_1.1_large,0.73,0.743,0.67,0.773,0.736,0.43,0.418,0.462,0.484,0.504
+moirai_1.1_small,0.747,0.751,0.692,0.795,0.741,0.44,0.429,0.476,0.493,0.505
+seasonalnaive,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
+time-moe,0.863,0.714,0.791,0.856,0.77,0.633,0.618,0.713,0.721,0.625
+timer,0.871,0.716,0.728,0.871,0.828,0.636,0.619,0.655,0.725,0.664
+timesfm_2_0_500m,0.736,0.765,0.679,0.765,0.717,0.441,0.44,0.471,0.493,0.525
+visionts,1.042,1.017,0.863,1.035,0.924,0.691,0.647,0.666,0.734,0.735
results/{BOOM_leaderboard.csv → leaderboards/BOOM_leaderboard.csv} RENAMED
@@ -1,4 +1,4 @@
-model,MASE_6750_scaled,CRPS_6750_scaled,Rank_6750_scaled
+model,MASE,CRPS,Rank
 Toto-Open-Base-1.0,0.617,0.375,2.336
 moirai_1.1_base,0.710,0.428,4.253
 moirai_1.1_large,0.720,0.436,4.481
results/leaderboards/BOOM_term_leaderboard.csv ADDED
@@ -0,0 +1,15 @@
+model,Long (MASE),Medium (MASE),Short (MASE),Long (CRPS),Medium (CRPS),Short (CRPS)
+Toto-Open-Base-1.0,0.688,0.657,0.535,0.424,0.406,0.318
+autoets,0.938,0.885,0.739,2.83,2.399,1.253
+autoarima,0.896,0.853,0.749,0.807,0.804,0.635
+autotheta,1.368,1.163,0.928,1.296,1.183,0.738
+chronos_bolt_base,0.798,0.782,0.632,0.519,0.507,0.365
+chronos_bolt_small,0.813,0.782,0.638,0.528,0.508,0.368
+moirai_1.1_base,0.78,0.753,0.627,0.473,0.46,0.37
+moirai_1.1_large,0.799,0.77,0.626,0.491,0.475,0.369
+moirai_1.1_small,0.795,0.771,0.67,0.482,0.476,0.399
+seasonalnaive,1.0,1.0,1.0,1.0,1.0,1.0
+time-moe,0.886,0.866,0.704,0.724,0.725,0.541
+timer,0.809,0.804,0.779,0.661,0.671,0.597
+timesfm_2_0_500m,0.817,0.78,0.619,0.522,0.499,0.359
+visionts,1.026,1.011,0.947,0.698,0.698,0.64
results/models_info/naive/config.json CHANGED
@@ -1,5 +1,6 @@
 {
     "model": "Naive",
+    "tmp_name": "naive",
     "model_type": "statistical",
     "model_dtype": "float32"
 }
src/display/utils.py CHANGED
@@ -29,10 +29,6 @@ auto_eval_column_dict.append(
     ["model_type_symbol", ColumnContent, ColumnContent("Type", "str", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-# Scores
-auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("MASE", "number", True)])
-auto_eval_column_dict.append(["CRPS_6750_scaled", ColumnContent, ColumnContent("CRPS", "number", True)])
-auto_eval_column_dict.append(["Rank_6750_scaled", ColumnContent, ColumnContent("Rank", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Model Type", "str", False, hidden=True)])
 # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/populate.py CHANGED
@@ -21,7 +21,9 @@ def get_merged_df(result_df: pd.DataFrame, model_info_df: pd.DataFrame) -> pd.Da
     """Merges the model info dataframe with the results dataframe"""
     result_df = result_df.rename(columns={"Model": "tmp_name"})
     merged_df = pd.merge(model_info_df, result_df, on="tmp_name", how="inner")
-    assert len(merged_df) == len(result_df)
+    assert len(merged_df) == len(
+        result_df
+    ), f"missing model info for: {set(result_df['tmp_name'].unique()) - set(model_info_df['tmp_name'].unique())}"
     merged_df = merged_df.drop(columns=["Model", "tmp_name"])
     merged_df = merged_df.rename(columns={"model_w_link": "Model"})
     return merged_df
@@ -53,7 +55,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     - Internal column names are mapped to display names using `AutoEvalColumn`.
     - A new column for model type symbols is created by parsing the `model_type` column.
     - The `model_type` column is updated to prepend the model type symbol.
-    - The DataFrame is sorted by the `Rank_6750_scaled` column in ascending order.
+    - The DataFrame is sorted by the `Rank_scaled` column in ascending order.
     """
 
     df = pd.read_csv(results_path)
@@ -62,8 +64,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     column_mapping = {field.name: getattr(AutoEvalColumn, field.name).name for field in fields(AutoEvalColumn)}
     # Assuming `df` is your DataFrame:
     df.rename(columns=column_mapping, inplace=True)
-
-    df = df.sort_values(by=[AutoEvalColumn.Rank_6750_scaled.name], ascending=True)
     return df
 
 