xeon27 committed · Commit 15e5347 · Parent(s): 2a314d2

Make task names clickable and link to inspect-evals repo

Files changed:
- src/about.py +15 -14
- src/display/utils.py +1 -1
src/about.py CHANGED

@@ -7,6 +7,7 @@ class Task:
     metric: str
     col_name: str
     type: str
+    source: str
 
 
 # Select your tasks here
@@ -15,22 +16,22 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
 
     # base
-    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base")
-    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base")
-    task2 = Task("drop", "mean", "DROP", "base")
-    task3 = Task("winogrande", "accuracy", "WinoGrande", "base")
-    task4 = Task("gsm8k", "accuracy", "GSM8K", "base")
-    task5 = Task("hellaswag", "accuracy", "HellaSwag", "base")
-    task6 = Task("humaneval", "mean", "HumanEval", "base")
-    task7 = Task("ifeval", "final_acc", "IFEval", "base")
-    task8 = Task("math", "accuracy", "MATH", "base")
-    task9 = Task("mmlu", "accuracy", "MMLU", "base")
-    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base")
-    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base")
+    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task2 = Task("drop", "mean", "DROP", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
+    task3 = Task("winogrande", "accuracy", "WinoGrande", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
+    task4 = Task("gsm8k", "accuracy", "GSM8K", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
+    task5 = Task("hellaswag", "accuracy", "HellaSwag", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
+    task6 = Task("humaneval", "mean", "HumanEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
+    task7 = Task("ifeval", "final_acc", "IFEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
+    task8 = Task("math", "accuracy", "MATH", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
+    task9 = Task("mmlu", "accuracy", "MMLU", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
+    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
+    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
 
     # agentic
-    task12 = Task("gaia", "mean", "GAIA", "agentic")
-    task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic")
+    task12 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
+    task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
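For context, a minimal sketch of how the five positional arguments line up with the `Task` dataclass after this commit. Only the `metric`, `col_name`, `type`, and `source` fields are visible in the hunk; the leading `benchmark` field name and the `@dataclass` decorator are assumptions carried over from the standard Hugging Face leaderboard template, not shown in this diff.

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # assumed field name; not visible in this hunk
    metric: str
    col_name: str
    type: str
    source: str  # new in this commit: URL of the task's inspect-evals implementation


class Tasks(Enum):
    # The five positional arguments map onto the dataclass fields above.
    task0 = Task(
        "arc_easy",
        "accuracy",
        "ARC-Easy",
        "base",
        "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc",
    )
```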
src/display/utils.py CHANGED

@@ -28,7 +28,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(f"[{task.value.col_name}]({task.value.source})", "markdown", True)])
 # # Model information
 # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
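The effect of the new f-string is easiest to see on a single task. A minimal standalone sketch, reusing the `Task` fields from `src/about.py` above (a plain instance here, so `task.col_name` rather than the enum's `task.value.col_name`):

```python
from dataclasses import dataclass


@dataclass
class Task:
    benchmark: str  # assumed field name, as in the sketch above
    metric: str
    col_name: str
    type: str
    source: str


task = Task(
    "arc_easy",
    "accuracy",
    "ARC-Easy",
    "base",
    "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc",
)

# The same f-string the commit adds in utils.py:
print(f"[{task.col_name}]({task.source})")
# -> [ARC-Easy](https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc)
```

Because the column is declared with type `"markdown"`, the leaderboard table renders this string as a clickable link to the task's inspect-evals implementation rather than as literal text.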