Commit bbde2b0 (parent: 2c5e9d1), committed by xeon27

Make values clickable

Files changed:
- src/about.py +15 -14
- src/display/utils.py +1 -1
- src/inspect_log_file_names.json +61 -0
- src/populate.py +28 -9
src/about.py
CHANGED
@@ -6,6 +6,7 @@ class Task:
     benchmark: str
     metric: str
     col_name: str
+    type: str
 
 
 # Select your tasks here
@@ -14,22 +15,22 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
 
     # base
-    task0 = Task("arc_easy", "accuracy", "ARC-Easy")
-    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge")
-    task2 = Task("drop", "mean", "DROP")
-    task3 = Task("winogrande", "accuracy", "WinoGrande")
-    task4 = Task("gsm8k", "accuracy", "GSM8K")
-    task5 = Task("hellaswag", "accuracy", "HellaSwag")
-    task6 = Task("humaneval", "mean", "HumanEval")
-    task7 = Task("ifeval", "final_acc", "IFEval")
-    task8 = Task("math", "accuracy", "MATH")
-    task9 = Task("mmlu", "accuracy", "MMLU")
-    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro")
-    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond")
+    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base")
+    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base")
+    task2 = Task("drop", "mean", "DROP", "base")
+    task3 = Task("winogrande", "accuracy", "WinoGrande", "base")
+    task4 = Task("gsm8k", "accuracy", "GSM8K", "base")
+    task5 = Task("hellaswag", "accuracy", "HellaSwag", "base")
+    task6 = Task("humaneval", "mean", "HumanEval", "base")
+    task7 = Task("ifeval", "final_acc", "IFEval", "base")
+    task8 = Task("math", "accuracy", "MATH", "base")
+    task9 = Task("mmlu", "accuracy", "MMLU", "base")
+    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base")
+    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base")
 
     # agentic
-    task12 = Task("gaia", "mean", "GAIA")
-    task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF")
+    task12 = Task("gaia", "mean", "GAIA", "agentic")
+    task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
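For context, a minimal sketch of how the extended dataclass and enum fit together after this change. The @dataclass/Enum scaffolding and the abridged member list are assumed from the standard leaderboard template rather than shown in the hunk:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task_key in the results json
    metric: str     # metric_key in the results json
    col_name: str   # column name displayed in the leaderboard
    type: str       # new field: "base" or "agentic"

class Tasks(Enum):
    # abridged: each enum member wraps a Task instance
    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base")
    task12 = Task("gaia", "mean", "GAIA", "agentic")

# Iterating the enum yields members; the dataclass fields live on .value
for task in Tasks:
    print(task.value.col_name, task.value.type)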
src/display/utils.py
CHANGED
@@ -28,7 +28,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
28 |
#Scores
|
29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
30 |
for task in Tasks:
|
31 |
-
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "
|
32 |
# # Model information
|
33 |
# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
34 |
# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
|
|
28 |
#Scores
|
29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
30 |
for task in Tasks:
|
31 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
|
32 |
# # Model information
|
33 |
# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
34 |
# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
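Switching the per-task column datatype to "markdown" is what makes the score cells clickable: a cell containing a link such as [0.85](https://…) is rendered as a hyperlink instead of a raw number. A minimal, standalone illustration of the idea with Gradio (the column names and values here are illustrative, not taken from the leaderboard):

import gradio as gr
import pandas as pd

# A cell holding a markdown link renders as clickable text when the
# column datatype is "markdown".
df = pd.DataFrame({
    "Model": ["example-model"],
    "ARC-Easy": ["[0.85](https://example.com/logs/arc_easy.json)"],
})

with gr.Blocks() as demo:
    gr.Dataframe(value=df, datatype=["str", "markdown"])

demo.launch()

One trade-off of this approach is that the benchmark cells now hold markdown strings rather than numbers, so numeric sorting and filtering on these columns may no longer behave exactly as before.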
src/inspect_log_file_names.json
ADDED
@@ -0,0 +1,61 @@
+{
+    "gemini-1.5-pro": {
+        "mmlu": "2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.json",
+        "humaneval": "2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.json",
+        "mmlu_pro": "2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.json",
+        "math": "2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.json",
+        "arc_easy": "2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.json",
+        "gsm8k": "2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.json",
+        "gpqa_diamond": "2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.json",
+        "ifeval": "2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.json",
+        "winogrande": "2024-11-04T12-40-46-05-00_winogrande_5SmD6rx47zmZvHHkQSSfHK.json",
+        "arc_challenge": "2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.json",
+        "drop": "2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.json",
+        "hellaswag": "2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.json",
+        "gaia": "2024-11-15T12-53-32-05-00_gaia_NvyGRTXFrFskJfUvuLwvVr.json",
+        "gdm_intercode_ctf": "2024-11-15T16-23-23-05-00_gdm-intercode-ctf_3JrgtTMcijTUxHVaagPRYh.json"
+    },
+    "gemini-1.5-flash": {
+        "gpqa_diamond": "2024-11-04T12-47-34-05-00_gpqa-diamond_cL5kQj8DWbRfxz79piTSdy.json",
+        "arc_challenge": "2024-11-04T12-45-59-05-00_arc-challenge_YQLMHfEXqeYgGJY86EB9bp.json",
+        "math": "2024-11-04T15-25-38-05-00_math_eaYBRMFgo8p6VUUCYxnCWj.json",
+        "drop": "2024-11-04T12-52-08-05-00_drop_5i253AQzbENgHTYN4ATemV.json",
+        "mmlu_pro": "2024-11-04T19-44-13-05-00_mmlu-pro_8GrR6wUsYNkthiZNMmLa8y.json",
+        "ifeval": "2024-11-04T12-51-30-05-00_ifeval_ZATErMbLHoyxh4kDaSqy8j.json",
+        "hellaswag": "2024-11-05T23-19-25-05-00_hellaswag_MRffohuzgVjighGb8FoqSJ.json",
+        "winogrande": "2024-11-04T12-48-29-05-00_winogrande_Hmqo6Ydz3nfCnQAdUwgrbD.json",
+        "humaneval": "2024-11-04T12-50-47-05-00_humaneval_9j4rYguKeKmxEoD9VuddwX.json",
+        "arc_easy": "2024-11-04T12-39-50-05-00_arc-easy_NwmTEw6C8VSCXzzwZCFy48.json",
+        "gsm8k": "2024-11-04T15-22-21-05-00_gsm8k_hdJs3Z6XzpR5netTcWLXJT.json",
+        "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
+        "gdm_intercode_ctf": "2024-11-15T20-52-53-05-00_gdm-intercode-ctf_oLYr3H6bFtrcmgM6EABmNt.json"
+    },
+    "c4ai-command-r-plus": {
+        "ifeval": "2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.json",
+        "winogrande": "2024-10-30T14-42-18-04-00_winogrande_bY8yg7aRR5dCCK7NDCZEcc.json",
+        "arc_challenge": "2024-10-29T17-30-03-04-00_arc-challenge_XB7LURXEGaxskWuLtYwdnW.json",
+        "drop": "2024-10-30T12-06-30-04-00_drop_itY9cLiYAW2BF7NTeDceNd.json",
+        "math": "2024-10-30T17-26-34-04-00_math_kohBUMpMFuMsR4jz4vUNWM.json",
+        "gpqa_diamond": "2024-10-29T22-47-45-04-00_gpqa-diamond_JKpb6ya4pec9hh7uovPPCZ.json",
+        "mmlu_pro": "2024-10-31T01-11-38-04-00_mmlu-pro_gZVAuy3zMKR23BieM5PqAX.json",
+        "humaneval": "2024-10-30T17-22-23-04-00_humaneval_5ByPqUhoofSbKgvsUQNFCX.json",
+        "gsm8k": "2024-10-30T15-03-35-04-00_gsm8k_QxbfbriJsKGQAg96JyjkoT.json",
+        "hellaswag": "2024-10-30T15-18-17-04-00_hellaswag_UYyBTR6N8VJnKRmnbCrB8N.json",
+        "mmlu": "2024-10-30T21-55-26-04-00_mmlu_JUPPLTzfe3Kme6UuorPTqg.json",
+        "arc_easy": "2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.json"
+    },
+    "Qwen2.5-72B-Instruct": {
+        "arc_challenge": "2024-10-31T13-46-34-04-00_arc-challenge_FSybKYYwpXVLQag8VwpjKe.json",
+        "mmlu_pro": "2024-11-01T20-31-04-04-00_mmlu-pro_2TfSPmsVmKatntHy2CnR7A.json",
+        "gpqa_diamond": "2024-10-31T13-48-32-04-00_gpqa-diamond_8qSySicySUyNvRRYVFBKLU.json",
+        "winogrande": "2024-10-31T14-46-29-04-00_winogrande_CX692dYh53gJ6JigT9GMpa.json",
+        "mmlu": "2024-11-01T10-08-50-04-00_mmlu_AgK27yYvmAo2LxotBH7ZL9.json",
+        "hellaswag": "2024-11-01T02-55-55-04-00_hellaswag_RSk8rGcQWg3HRrLffTNoiM.json",
+        "gsm8k": "2024-11-01T01-15-16-04-00_gsm8k_3h4W6xZjXpz9oCwtgKNYzo.json",
+        "arc_easy": "2024-10-31T13-40-08-04-00_arc-easy_3JUyzfoEHxhSBUdCU2AaVC.json",
+        "math": "2024-11-01T10-06-46-04-00_math_UUpS2R9eQc9KxBxkanT2gE.json",
+        "ifeval": "2024-10-31T14-51-45-04-00_ifeval_VGxA7gTZLZSruceM9Ci37C.json",
+        "humaneval": "2024-10-31T14-49-39-04-00_humaneval_9u7khnxivCDroJoPNRFpjs.json",
+        "drop": "2024-10-31T15-03-20-04-00_drop_DDLi98VhiV2bLzuw7fx6H4.json"
+    }
+}
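The new mapping is keyed by model name, then by benchmark key (the same keys as Task.benchmark in src/about.py), and each value is the file name of an Inspect evaluation log. A small lookup sketch against the committed file:

import json

# The file is committed under src/; note that populate.py opens it as
# "./inspect_log_file_names.json", i.e. relative to the app's working directory.
with open("src/inspect_log_file_names.json", "r") as f:
    log_files = json.load(f)

# model name -> benchmark key -> Inspect log file name
print(log_files["gemini-1.5-pro"]["gaia"])
# 2024-11-15T12-53-32-05-00_gaia_NvyGRTXFrFskJfUvuLwvVr.json

# Benchmarks that were not run for a model are simply absent; for example there is
# no "gaia" entry for c4ai-command-r-plus, so lookups fall back to a default.
print(log_files["c4ai-command-r-plus"].get("gaia", ""))  # -> ""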
src/populate.py
CHANGED
@@ -3,19 +3,33 @@ import os
 
 import pandas as pd
 
+from src.about import Tasks
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-…
+TASK_NAME_INVERSE_MAP = dict()
+for task in Tasks:
+    TASK_NAME_INVERSE_MAP[task.value.col_name] = {
+        "name": task.benchmark,
+        "type": task.type,
+    }
+
+
+def get_inspect_log_url(model_name: str, benchmark_name: str) -> str:
     """Returns the URL to the log file for a given model and benchmark"""
-    …
-    …
-    …
-    if …
-        return …
-    …
+    with open("./inspect_log_file_names.json", "r") as f:
+        inspect_log_files = json.load(f)
+    log_file_name = inspect_log_files[model_name].get(benchmark_name, None)
+    if log_file_name is None:
+        return ""
+    else:
+        # TMP: Debugging
+        print_str = f"https://storage.googleapis.com/inspect-evals/{model_name}/index.html?log_file=logs/logs/{log_file_name}"
+        print(print_str)
+        return f"https://storage.googleapis.com/inspect-evals/{model_name}/index.html?log_file=logs/logs/{log_file_name}"
+
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
@@ -30,8 +44,13 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
 
-    # …
-    …
+    # make values clickable and link to log files
+    for col in benchmark_cols:
+        df[col] = df[[AutoEvalColumn.model.name, col]].apply(lambda x:
+            f"[{x[col]}]({get_inspect_log_url(
+                model_name=x[AutoEvalColumn.model.name].split(">")[1].split("<")[0],
+                benchmark_name=TASK_NAME_INVERSE_MAP[col]["name"],
+            )})", axis=1)
 
     return df
 
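For readers who just want the gist of the new populate.py logic, here is a condensed, self-contained sketch. It is not the committed code verbatim: the dataclass fields are read through task.value (mirroring how src/display/utils.py reads task.value.col_name), the URL string is built once, and the lambda from the diff is factored into a hypothetical make_score_clickable helper.

import json

from src.about import Tasks  # assumes the repo's src package is importable

# Invert the display column name back to the benchmark key and its type ("base"/"agentic").
TASK_NAME_INVERSE_MAP = {
    task.value.col_name: {"name": task.value.benchmark, "type": task.value.type}
    for task in Tasks
}


def get_inspect_log_url(model_name: str, benchmark_name: str) -> str:
    """Return the hosted Inspect log-viewer URL for a model/benchmark pair, or "" if no log exists."""
    with open("./inspect_log_file_names.json", "r") as f:
        inspect_log_files = json.load(f)
    log_file_name = inspect_log_files.get(model_name, {}).get(benchmark_name)
    if log_file_name is None:
        return ""
    return (
        f"https://storage.googleapis.com/inspect-evals/{model_name}"
        f"/index.html?log_file=logs/logs/{log_file_name}"
    )


def make_score_clickable(model_cell: str, col_name: str, score) -> str:
    """Wrap a benchmark score in a markdown link pointing at its Inspect log.

    model_cell is the HTML anchor produced by make_clickable_model, so the plain
    model name is recovered by splitting off the <a ...> wrapper, as in the diff.
    """
    model_name = model_cell.split(">")[1].split("<")[0]
    url = get_inspect_log_url(model_name, TASK_NAME_INVERSE_MAP[col_name]["name"])
    return f"[{score}]({url})" if url else str(score)

Inside get_leaderboard_df, each benchmark column would then be rewritten along the lines of df[col] = df.apply(lambda row: make_score_clickable(row[AutoEvalColumn.model.name], col, row[col]), axis=1), which is what the committed per-column loop does in one expression.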