xeon27 committed
Commit 15e5347 · Parent: 2a314d2

Make task names clickable and link to inspect-evals repo

Files changed (2)
  1. src/about.py +15 -14
  2. src/display/utils.py +1 -1
src/about.py CHANGED
@@ -7,6 +7,7 @@ class Task:
     metric: str
     col_name: str
     type: str
+    source: str
 
 
 # Select your tasks here
@@ -15,22 +16,22 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
 
     # base
-    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base")
-    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base")
-    task2 = Task("drop", "mean", "DROP", "base")
-    task3 = Task("winogrande", "accuracy", "WinoGrande", "base")
-    task4 = Task("gsm8k", "accuracy", "GSM8K", "base")
-    task5 = Task("hellaswag", "accuracy", "HellaSwag", "base")
-    task6 = Task("humaneval", "mean", "HumanEval", "base")
-    task7 = Task("ifeval", "final_acc", "IFEval", "base")
-    task8 = Task("math", "accuracy", "MATH", "base")
-    task9 = Task("mmlu", "accuracy", "MMLU", "base")
-    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base")
-    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base")
+    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task2 = Task("drop", "mean", "DROP", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
+    task3 = Task("winogrande", "accuracy", "WinoGrande", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
+    task4 = Task("gsm8k", "accuracy", "GSM8K", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
+    task5 = Task("hellaswag", "accuracy", "HellaSwag", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
+    task6 = Task("humaneval", "mean", "HumanEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
+    task7 = Task("ifeval", "final_acc", "IFEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
+    task8 = Task("math", "accuracy", "MATH", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
+    task9 = Task("mmlu", "accuracy", "MMLU", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
+    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
+    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
 
     # agentic
-    task12 = Task("gaia", "mean", "GAIA", "agentic")
-    task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic")
+    task12 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
+    task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
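For orientation, here is a minimal sketch of the Task container after this commit. The hunk shows only the tail of the class, so the dataclass decorator, the imports, and the name of the first field ("benchmark", following the Hugging Face leaderboard-template convention) are assumptions, not part of the diff:

# Minimal sketch, assuming the leaderboard-template layout: Task is a plain
# dataclass whose first field (assumed to be "benchmark"; not visible in the
# hunk) holds the task_key from the results json.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # assumed name; only metric/col_name/type/source appear in the hunk
    metric: str     # metric_key in the results json
    col_name: str   # display name in the leaderboard
    type: str       # "base" or "agentic"
    source: str     # new in this commit: URL of the inspect-evals implementation

class Tasks(Enum):
    # one representative member; the full list is in the diff above
    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base",
                 "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")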
src/display/utils.py CHANGED
@@ -28,7 +28,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(f"[{task.value.col_name}]({task.value.source})", "markdown", True)])
 # # Model information
 # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])