Fix DS MATH
app.py CHANGED
@@ -62,36 +62,23 @@ def get_leaderboard_df():
         elif task.lower() == "agieval":
             value = data["results"]["all"]["acc_norm"]
         # MATH reports qem
-        elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
+        elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
             value = data["results"]["all"]["qem"]
-        else:
-            first_metric_key = next(
-                iter(data["results"][first_result_key])
-            )  # gets the first key in the first result
-            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
-
         # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
-
+        elif task.lower() in ["mini_math_v2"]:
             for k, v in data["results"].items():
                 if k != "all":
                     level = k.split("|")[1].split(":")[-1]
                     value = v["qem"]
                     df.loc[model_revision, f"{task}_{level}"] = value
-        # For
-        elif task.lower() in ["aimo_kaggle_medium_pot"]:
-            for k, v in data["results"].items():
-                if k != "all" and "_average" not in k:
-                    version = k.split("|")[1].split(":")[-1]
-                    value = v["qem"] if "qem" in v else v["score"]
-                    df.loc[model_revision, f"{task}_{version}"] = value
-        # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe
-        elif task.lower() in ["aimo_kaggle_hard_pot"]:
+        # For PoT we report N metrics, one for each prompt and store each one as a separate row in the dataframe
+        elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
             for k, v in data["results"].items():
                 if k != "all" and "_average" not in k:
                     version = k.split("|")[1].split(":")[-1]
                     value = v["qem"] if "qem" in v else v["score"]
                     df.loc[model_revision, f"{task}_{version}"] = value
-        # For kaggle_tora we report accuracy, so need to divide by 100
+        # For kaggle_tora we report accuracy as a percentage, so need to divide by 100
         elif task.lower() in [
             "aimo_tora_eval_kaggle_medium",
             "aimo_tora_eval_kaggle_hard",
@@ -113,6 +100,10 @@ def get_leaderboard_df():
             value = data["results"][first_result_key]["length_controlled_winrate"]
             df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
         else:
+            first_metric_key = next(
+                iter(data["results"][first_result_key])
+            )  # gets the first key in the first result
+            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
             df.loc[model_revision, task] = float(value)
 
     # Drop rows where every entry is NaN
@@ -130,8 +121,10 @@ def get_leaderboard_df():
 
     return df
 
+
 leaderboard_df = get_leaderboard_df()
 
+
 def agg_df(df, agg: str = "max"):
     df = df.copy()
     # Drop date and aggregate results by model name
@@ -144,6 +137,7 @@ def agg_df(df, agg: str = "max"):
     df = df.sort_values(by=["Average"], ascending=False)
     return df
 
+
 # Function to update the table based on search query
 def filter_and_search(cols: list[str], search_query: str, agg: str):
     df = leaderboard_df