lewtun committed
Commit b59264d · 1 Parent(s): dc1a5db

Fix DS MATH

Files changed (1):
  1. app.py +12 -18
app.py CHANGED
@@ -62,36 +62,23 @@ def get_leaderboard_df():
         elif task.lower() == "agieval":
             value = data["results"]["all"]["acc_norm"]
         # MATH reports qem
-        elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
+        elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
             value = data["results"]["all"]["qem"]
-        else:
-            first_metric_key = next(
-                iter(data["results"][first_result_key])
-            )  # gets the first key in the first result
-            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
-
         # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
-        if task.lower() in ["mini_math_v2"]:
+        elif task.lower() in ["mini_math_v2"]:
             for k, v in data["results"].items():
                 if k != "all":
                     level = k.split("|")[1].split(":")[-1]
                     value = v["qem"]
                     df.loc[model_revision, f"{task}_{level}"] = value
-        # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe
-        elif task.lower() in ["aimo_kaggle_medium_pot"]:
-            for k, v in data["results"].items():
-                if k != "all" and "_average" not in k:
-                    version = k.split("|")[1].split(":")[-1]
-                    value = v["qem"] if "qem" in v else v["score"]
-                    df.loc[model_revision, f"{task}_{version}"] = value
-        # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe
-        elif task.lower() in ["aimo_kaggle_hard_pot"]:
+        # For PoT we report N metrics, one for each prompt and store each one as a separate row in the dataframe
+        elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
             for k, v in data["results"].items():
                 if k != "all" and "_average" not in k:
                     version = k.split("|")[1].split(":")[-1]
                     value = v["qem"] if "qem" in v else v["score"]
                     df.loc[model_revision, f"{task}_{version}"] = value
-        # For kaggle_tora we report accuracy, so need to divide by 100
+        # For kaggle_tora we report accuracy as a percentage, so need to divide by 100
         elif task.lower() in [
             "aimo_tora_eval_kaggle_medium",
             "aimo_tora_eval_kaggle_hard",
@@ -113,6 +100,10 @@ def get_leaderboard_df():
             value = data["results"][first_result_key]["length_controlled_winrate"]
             df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
         else:
+            first_metric_key = next(
+                iter(data["results"][first_result_key])
+            )  # gets the first key in the first result
+            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
             df.loc[model_revision, task] = float(value)

     # Drop rows where every entry is NaN
@@ -130,8 +121,10 @@ def get_leaderboard_df():

     return df

+
 leaderboard_df = get_leaderboard_df()

+
 def agg_df(df, agg: str = "max"):
     df = df.copy()
     # Drop date and aggregate results by model name
@@ -144,6 +137,7 @@ def agg_df(df, agg: str = "max"):
     df = df.sort_values(by=["Average"], ascending=False)
     return df

+
 # Function to update the table based on search query
 def filter_and_search(cols: list[str], search_query: str, agg: str):
     df = leaderboard_df
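For context, a minimal sketch of how the revised branch logic consumes a results payload. The payload shape below is an assumption inferred from the lookups in the diff (the `data["results"]["all"]["qem"]` access and keys of the form `suite|task:variant|shots`); it is not taken from an actual result file, and the model/revision index value and prompt names are placeholders.

```python
import pandas as pd

# Hypothetical payload, shaped to match the lookups in the diff above.
data = {
    "results": {
        "all": {"qem": 0.42},
        "custom|aimo_kaggle_medium_pot:prompt_v1|0": {"qem": 0.40},
        "custom|aimo_kaggle_medium_pot:prompt_v2|0": {"score": 0.44},
    }
}
task = "aimo_kaggle_medium_pot"
model_revision = "my-org/my-model_main_2024-06-01"  # placeholder row index
df = pd.DataFrame()

if task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
    # The DeepSeek MATH variants added by this commit also report qem
    df.loc[model_revision, task] = float(data["results"]["all"]["qem"])
elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
    # One column per prompt version; skip the "all" aggregate and any *_average entries
    for k, v in data["results"].items():
        if k != "all" and "_average" not in k:
            version = k.split("|")[1].split(":")[-1]
            value = v["qem"] if "qem" in v else v["score"]
            df.loc[model_revision, f"{task}_{version}"] = value

print(df)
```

Merging the medium and hard PoT branches is safe here because, as the removed lines show, both read the same per-prompt keys and fall back from `qem` to `score` identically.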
 
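The generic fallback that the commit moves into the final `else` branch picks the first metric of the first result, so after this change it only runs when no task-specific branch matched. A small sketch, assuming `first_result_key` is obtained with `next(iter(data["results"]))` elsewhere in app.py (that derivation is not shown in these hunks):

```python
# Hypothetical results payload for a task with no dedicated branch.
data = {
    "results": {
        "custom|some_new_task|0": {"acc": 0.73, "acc_stderr": 0.01},
    }
}

# Assumed to mirror how first_result_key is derived elsewhere in app.py.
first_result_key = next(iter(data["results"]))                     # "custom|some_new_task|0"
first_metric_key = next(iter(data["results"][first_result_key]))   # "acc", the first metric
value = data["results"][first_result_key][first_metric_key]

print(first_result_key, first_metric_key, value)  # custom|some_new_task|0 acc 0.73
```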