Spaces:
Running
Running
Add Details tab
Browse files
app.py
CHANGED
@@ -19,6 +19,8 @@ EXCLUDED_KEYS = {
|
|
19 |
# "alias",
|
20 |
# }
|
21 |
|
|
|
|
|
22 |
|
23 |
TASKS = {
|
24 |
"leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
|
@@ -29,6 +31,57 @@ TASKS = {
|
|
29 |
"leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
|
30 |
"leaderboard_musr": ("MuSR", "leaderboard_musr"),
|
31 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
fs = HfFileSystem()
|
34 |
|
@@ -103,6 +156,49 @@ def update_tasks(task):
|
|
103 |
)
|
104 |
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
# if __name__ == "__main__":
|
107 |
latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())
|
108 |
|
@@ -135,6 +231,18 @@ with gr.Blocks(fill_height=True) as demo:
|
|
135 |
results = gr.HTML()
|
136 |
with gr.Tab("Configs"):
|
137 |
configs = gr.HTML()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
load_btn_1.click(
|
140 |
fn=load_result_dataframe,
|
@@ -166,6 +274,29 @@ with gr.Blocks(fill_height=True) as demo:
|
|
166 |
fn=display_results,
|
167 |
inputs=[dataframe_1, dataframe_2, task],
|
168 |
outputs=[results, configs],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
)
|
170 |
|
171 |
demo.launch()
|
|
|
19 |
# "alias",
|
20 |
# }
|
21 |
|
22 |
+
# Path template of a model's details dataset, used to build the glob passed to
# fs.glob; {model_name_sanitized} is filled with the model id in which "/" has
# been replaced by "__".
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
|
23 |
+
# Glob pattern for the samples files of one subtask; {subtask} is filled with
# the subtask name and "*" matches the remainder of the filename.
DETAILS_FILENAME = "samples_{subtask}_*.json"
|
24 |
|
25 |
TASKS = {
|
26 |
"leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
|
|
|
31 |
"leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
|
32 |
"leaderboard_musr": ("MuSR", "leaderboard_musr"),
|
33 |
}
|
34 |
+
# Subtask names selectable in the "Details" tab, grouped by task key.
# Tasks that are not split further list themselves as their only subtask.
SUBTASKS = {
    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
    "leaderboard_bbh": [
        "leaderboard_bbh_boolean_expressions",
        "leaderboard_bbh_causal_judgement",
        "leaderboard_bbh_date_understanding",
        "leaderboard_bbh_disambiguation_qa",
        "leaderboard_bbh_formal_fallacies",
        "leaderboard_bbh_geometric_shapes",
        "leaderboard_bbh_hyperbaton",
        "leaderboard_bbh_logical_deduction_five_objects",
        "leaderboard_bbh_logical_deduction_seven_objects",
        "leaderboard_bbh_logical_deduction_three_objects",
        "leaderboard_bbh_movie_recommendation",
        "leaderboard_bbh_navigate",
        "leaderboard_bbh_object_counting",
        "leaderboard_bbh_penguins_in_a_table",
        "leaderboard_bbh_reasoning_about_colored_objects",
        "leaderboard_bbh_ruin_names",
        "leaderboard_bbh_salient_translation_error_detection",
        "leaderboard_bbh_snarks",
        "leaderboard_bbh_sports_understanding",
        "leaderboard_bbh_temporal_sequences",
        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
        "leaderboard_bbh_web_of_lies",
    ],
    "leaderboard_gpqa": [
        "leaderboard_gpqa_extended",
        "leaderboard_gpqa_diamond",
        "leaderboard_gpqa_main",
    ],
    "leaderboard_ifeval": ["leaderboard_ifeval"],
    # "leaderboard_math_hard": [  (alternative key, currently disabled)
    "leaderboard_math": [
        "leaderboard_math_algebra_hard",
        "leaderboard_math_counting_and_prob_hard",
        "leaderboard_math_geometry_hard",
        "leaderboard_math_intermediate_algebra_hard",
        "leaderboard_math_num_theory_hard",
        "leaderboard_math_prealgebra_hard",
        "leaderboard_math_precalculus_hard",
    ],
    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
    "leaderboard_musr": [
        "leaderboard_musr_murder_mysteries",
        "leaderboard_musr_object_placements",
        "leaderboard_musr_team_allocation",
    ],
}
|
84 |
+
|
85 |
|
86 |
fs = HfFileSystem()
|
87 |
|
|
|
156 |
)
|
157 |
|
158 |
|
159 |
+
def update_subtasks(task):
    """Build the subtask selector matching the given task.

    Looks ``task`` up in ``SUBTASKS`` and returns a ``gr.Radio`` constructed
    with the subtask names found there; a task missing from ``SUBTASKS``
    results in ``None`` being passed as the choices.
    """
    subtask_choices = SUBTASKS.get(task)
    return gr.Radio(subtask_choices, info="Evaluation subtasks to be displayed")
|
164 |
+
|
165 |
+
|
166 |
+
def load_details_dataframe(model_id, subtask):
    """Load the per-sample details of a model for one subtask.

    Args:
        model_id: Model identifier; every "/" is replaced by "__" to build
            the name of the details dataset.
        subtask: Subtask name substituted into the samples filename pattern.

    Returns:
        A DataFrame with one row per JSON record of the selected file
        (flattened with ``pd.json_normalize``) plus a ``model_name`` column
        holding ``model_id``, or ``None`` when either argument is empty or no
        file matches the pattern.
    """
    if not model_id or not subtask:
        return None
    model_name_sanitized = model_id.replace("/", "__")
    pattern = f"{DETAILS_DATASET_ID}/**/{DETAILS_FILENAME}".format(
        model_name_sanitized=model_name_sanitized, subtask=subtask
    )
    paths = fs.glob(pattern)
    if not paths:
        return None
    # The greatest path in string order is selected; presumably the variable
    # part of the filename is a timestamp, making this the most recent run
    # -- TODO confirm against the dataset layout.
    path = max(paths)
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which may not be able to decode the samples.
    with fs.open(path, "r", encoding="utf-8") as f:
        # The file holds one JSON document per line; blank lines carry no
        # record and would make json.loads raise.
        data = [json.loads(line) for line in f if line.strip()]
    df = pd.json_normalize(data)
    df["model_name"] = model_id  # Keep model_name
    return df
|
185 |
+
|
186 |
+
|
187 |
+
def display_details(df_1, df_2, sample_idx):
    """Render one evaluation sample of the loaded models side by side as HTML.

    Args:
        df_1: Details DataFrame of the first model; it is expected to carry a
            ``model_name`` column, which becomes the column header.
        df_2: Details DataFrame of the second model, same expectations.
        sample_idx: Row position of the sample to display. A float is
            truncated to an integer (a numeric input widget may deliver one
            -- NOTE(review): confirm against the UI component). Negative
            positions count from the end, as with ``.iloc``.

    Returns:
        The HTML of a table with one row per sample field and one column per
        model, or ``None`` when ``sample_idx`` is ``None`` or neither frame
        can provide the requested sample. A frame that is ``None``, lacks the
        ``model_name`` column, or has no row at that position is left out
        instead of raising.
    """
    if sample_idx is None:
        return None
    # .iloc only accepts integer positions.
    position = int(sample_idx)
    samples = []
    for details in (df_1, df_2):
        if details is None or "model_name" not in details.columns:
            continue
        if not -len(details) <= position < len(details):
            continue
        sample = details.iloc[position]
        # Pop model_name and use it as the name of the resulting column.
        samples.append(sample.rename(sample.pop("model_name")))
    if not samples:
        return None
    df = pd.concat(samples, axis="columns")
    return df.style.format(na_rep="").to_html()
|
200 |
+
|
201 |
+
|
202 |
# if __name__ == "__main__":
|
203 |
latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())
|
204 |
|
|
|
231 |
results = gr.HTML()
|
232 |
with gr.Tab("Configs"):
|
233 |
configs = gr.HTML()
|
234 |
+
with gr.Tab("Details"):
|
235 |
+
subtask = gr.Radio(
|
236 |
+
SUBTASKS.get(task.value),
|
237 |
+
label="Subtasks",
|
238 |
+
info="Evaluation subtasks to be displayed (choose one of the Tasks above)",
|
239 |
+
)
|
240 |
+
sample_idx = gr.Number(value=0, label="Sample Index", info="Index of the sample to be displayed", minimum=0)
|
241 |
+
load_details_btn = gr.Button("Load Details")
|
242 |
+
details = gr.HTML()
|
243 |
+
details_dataframe_1 = gr.Dataframe(visible=False)
|
244 |
+
details_dataframe_2 = gr.Dataframe(visible=False)
|
245 |
+
details_dataframe = gr.DataFrame(visible=False)
|
246 |
|
247 |
load_btn_1.click(
|
248 |
fn=load_result_dataframe,
|
|
|
274 |
fn=display_results,
|
275 |
inputs=[dataframe_1, dataframe_2, task],
|
276 |
outputs=[results, configs],
|
277 |
+
).then(
|
278 |
+
fn=update_subtasks,
|
279 |
+
inputs=task,
|
280 |
+
outputs=subtask,
|
281 |
+
)
|
282 |
+
|
283 |
+
load_details_btn.click(
|
284 |
+
fn=load_details_dataframe,
|
285 |
+
inputs=[model_id_1, subtask],
|
286 |
+
outputs=details_dataframe_1,
|
287 |
+
).then(
|
288 |
+
fn=load_details_dataframe,
|
289 |
+
inputs=[model_id_2, subtask],
|
290 |
+
outputs=details_dataframe_2,
|
291 |
+
).then(
|
292 |
+
fn=display_details,
|
293 |
+
inputs=[details_dataframe_1, details_dataframe_2, sample_idx],
|
294 |
+
outputs=details,
|
295 |
+
)
|
296 |
+
sample_idx.change(
|
297 |
+
fn=display_details,
|
298 |
+
inputs=[details_dataframe_1, details_dataframe_2, sample_idx],
|
299 |
+
outputs=details,
|
300 |
)
|
301 |
|
302 |
demo.launch()
|