xeon27 committed on
Commit
bbde2b0
·
1 Parent(s): 2c5e9d1

Make values clickable

Browse files
src/about.py CHANGED
@@ -6,6 +6,7 @@ class Task:
6
  benchmark: str
7
  metric: str
8
  col_name: str
 
9
 
10
 
11
  # Select your tasks here
@@ -14,22 +15,22 @@ class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
 
16
  # base
17
- task0 = Task("arc_easy", "accuracy", "ARC-Easy")
18
- task1 = Task("arc_challenge", "accuracy", "ARC-Challenge")
19
- task2 = Task("drop", "mean", "DROP")
20
- task3 = Task("winogrande", "accuracy", "WinoGrande")
21
- task4 = Task("gsm8k", "accuracy", "GSM8K")
22
- task5 = Task("hellaswag", "accuracy", "HellaSwag")
23
- task6 = Task("humaneval", "mean", "HumanEval")
24
- task7 = Task("ifeval", "final_acc", "IFEval")
25
- task8 = Task("math", "accuracy", "MATH")
26
- task9 = Task("mmlu", "accuracy", "MMLU")
27
- task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro")
28
- task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond")
29
 
30
  # agentic
31
- task12 = Task("gaia", "mean", "GAIA")
32
- task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF")
33
 
34
 
35
  NUM_FEWSHOT = 0 # Change with your few shot
 
6
  benchmark: str
7
  metric: str
8
  col_name: str
9
+ type: str
10
 
11
 
12
  # Select your tasks here
 
15
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
 
17
  # base
18
+ task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base")
19
+ task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base")
20
+ task2 = Task("drop", "mean", "DROP", "base")
21
+ task3 = Task("winogrande", "accuracy", "WinoGrande", "base")
22
+ task4 = Task("gsm8k", "accuracy", "GSM8K", "base")
23
+ task5 = Task("hellaswag", "accuracy", "HellaSwag", "base")
24
+ task6 = Task("humaneval", "mean", "HumanEval", "base")
25
+ task7 = Task("ifeval", "final_acc", "IFEval", "base")
26
+ task8 = Task("math", "accuracy", "MATH", "base")
27
+ task9 = Task("mmlu", "accuracy", "MMLU", "base")
28
+ task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base")
29
+ task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base")
30
 
31
  # agentic
32
+ task12 = Task("gaia", "mean", "GAIA", "agentic")
33
+ task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic")
34
 
35
 
36
  NUM_FEWSHOT = 0 # Change with your few shot
src/display/utils.py CHANGED
@@ -28,7 +28,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
28
  #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # # Model information
33
  # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
  # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
28
  #Scores
29
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
32
  # # Model information
33
  # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
  # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/inspect_log_file_names.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "gemini-1.5-pro": {
3
+ "mmlu": "2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.json",
4
+ "humaneval": "2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.json",
5
+ "mmlu_pro": "2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.json",
6
+ "math": "2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.json",
7
+ "arc_easy": "2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.json",
8
+ "gsm8k": "2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.json",
9
+ "gpqa_diamond": "2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.json",
10
+ "ifeval": "2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.json",
11
+ "winogrande": "2024-11-04T12-40-46-05-00_winogrande_5SmD6rx47zmZvHHkQSSfHK.json",
12
+ "arc_challenge": "2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.json",
13
+ "drop": "2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.json",
14
+ "hellaswag": "2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.json",
15
+ "gaia": "2024-11-15T12-53-32-05-00_gaia_NvyGRTXFrFskJfUvuLwvVr.json",
16
+ "gdm_intercode_ctf": "2024-11-15T16-23-23-05-00_gdm-intercode-ctf_3JrgtTMcijTUxHVaagPRYh.json"
17
+ },
18
+ "gemini-1.5-flash": {
19
+ "gpqa_diamond": "2024-11-04T12-47-34-05-00_gpqa-diamond_cL5kQj8DWbRfxz79piTSdy.json",
20
+ "arc_challenge": "2024-11-04T12-45-59-05-00_arc-challenge_YQLMHfEXqeYgGJY86EB9bp.json",
21
+ "math": "2024-11-04T15-25-38-05-00_math_eaYBRMFgo8p6VUUCYxnCWj.json",
22
+ "drop": "2024-11-04T12-52-08-05-00_drop_5i253AQzbENgHTYN4ATemV.json",
23
+ "mmlu_pro": "2024-11-04T19-44-13-05-00_mmlu-pro_8GrR6wUsYNkthiZNMmLa8y.json",
24
+ "ifeval": "2024-11-04T12-51-30-05-00_ifeval_ZATErMbLHoyxh4kDaSqy8j.json",
25
+ "hellaswag": "2024-11-05T23-19-25-05-00_hellaswag_MRffohuzgVjighGb8FoqSJ.json",
26
+ "winogrande": "2024-11-04T12-48-29-05-00_winogrande_Hmqo6Ydz3nfCnQAdUwgrbD.json",
27
+ "humaneval": "2024-11-04T12-50-47-05-00_humaneval_9j4rYguKeKmxEoD9VuddwX.json",
28
+ "arc_easy": "2024-11-04T12-39-50-05-00_arc-easy_NwmTEw6C8VSCXzzwZCFy48.json",
29
+ "gsm8k": "2024-11-04T15-22-21-05-00_gsm8k_hdJs3Z6XzpR5netTcWLXJT.json",
30
+ "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
31
+ "gdm_intercode_ctf": "2024-11-15T20-52-53-05-00_gdm-intercode-ctf_oLYr3H6bFtrcmgM6EABmNt.json"
32
+ },
33
+ "c4ai-command-r-plus": {
34
+ "ifeval": "2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.json",
35
+ "winogrande": "2024-10-30T14-42-18-04-00_winogrande_bY8yg7aRR5dCCK7NDCZEcc.json",
36
+ "arc_challenge": "2024-10-29T17-30-03-04-00_arc-challenge_XB7LURXEGaxskWuLtYwdnW.json",
37
+ "drop": "2024-10-30T12-06-30-04-00_drop_itY9cLiYAW2BF7NTeDceNd.json",
38
+ "math": "2024-10-30T17-26-34-04-00_math_kohBUMpMFuMsR4jz4vUNWM.json",
39
+ "gpqa_diamond": "2024-10-29T22-47-45-04-00_gpqa-diamond_JKpb6ya4pec9hh7uovPPCZ.json",
40
+ "mmlu_pro": "2024-10-31T01-11-38-04-00_mmlu-pro_gZVAuy3zMKR23BieM5PqAX.json",
41
+ "humaneval": "2024-10-30T17-22-23-04-00_humaneval_5ByPqUhoofSbKgvsUQNFCX.json",
42
+ "gsm8k": "2024-10-30T15-03-35-04-00_gsm8k_QxbfbriJsKGQAg96JyjkoT.json",
43
+ "hellaswag": "2024-10-30T15-18-17-04-00_hellaswag_UYyBTR6N8VJnKRmnbCrB8N.json",
44
+ "mmlu": "2024-10-30T21-55-26-04-00_mmlu_JUPPLTzfe3Kme6UuorPTqg.json",
45
+ "arc_easy": "2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.json"
46
+ },
47
+ "Qwen2.5-72B-Instruct": {
48
+ "arc_challenge": "2024-10-31T13-46-34-04-00_arc-challenge_FSybKYYwpXVLQag8VwpjKe.json",
49
+ "mmlu_pro": "2024-11-01T20-31-04-04-00_mmlu-pro_2TfSPmsVmKatntHy2CnR7A.json",
50
+ "gpqa_diamond": "2024-10-31T13-48-32-04-00_gpqa-diamond_8qSySicySUyNvRRYVFBKLU.json",
51
+ "winogrande": "2024-10-31T14-46-29-04-00_winogrande_CX692dYh53gJ6JigT9GMpa.json",
52
+ "mmlu": "2024-11-01T10-08-50-04-00_mmlu_AgK27yYvmAo2LxotBH7ZL9.json",
53
+ "hellaswag": "2024-11-01T02-55-55-04-00_hellaswag_RSk8rGcQWg3HRrLffTNoiM.json",
54
+ "gsm8k": "2024-11-01T01-15-16-04-00_gsm8k_3h4W6xZjXpz9oCwtgKNYzo.json",
55
+ "arc_easy": "2024-10-31T13-40-08-04-00_arc-easy_3JUyzfoEHxhSBUdCU2AaVC.json",
56
+ "math": "2024-11-01T10-06-46-04-00_math_UUpS2R9eQc9KxBxkanT2gE.json",
57
+ "ifeval": "2024-10-31T14-51-45-04-00_ifeval_VGxA7gTZLZSruceM9Ci37C.json",
58
+ "humaneval": "2024-10-31T14-49-39-04-00_humaneval_9u7khnxivCDroJoPNRFpjs.json",
59
+ "drop": "2024-10-31T15-03-20-04-00_drop_DDLi98VhiV2bLzuw7fx6H4.json"
60
+ }
61
+ }
src/populate.py CHANGED
@@ -3,19 +3,33 @@ import os
3
 
4
  import pandas as pd
5
 
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
- def get_inspect_log_url(model_name: str, benchmark_type: str, benchmark_name: str, log_dir: str) -> str:
 
 
 
 
 
 
 
 
12
  """Returns the URL to the log file for a given model and benchmark"""
13
- model_name = model_name.replace("/", "_")
14
- log_name = f"{model_name}_{benchmark_type}.log"
15
- log_path = os.path.join(log_dir, log_name)
16
- if os.path.exists(log_path):
17
- return log_path
18
- return ""
 
 
 
 
 
19
 
20
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
21
  """Creates a dataframe from all the individual experiment results"""
@@ -30,8 +44,13 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
30
  # filter out if any of the benchmarks have not been produced
31
  df = df[has_no_nan_values(df, benchmark_cols)]
32
 
33
- # TMP: Debugging
34
- print((df[AutoEvalColumn.model.name].iloc[0]).split(">")[1].split("<")[0])
 
 
 
 
 
35
 
36
  return df
37
 
 
3
 
4
  import pandas as pd
5
 
6
+ from src.about import Tasks
7
  from src.display.formatting import has_no_nan_values, make_clickable_model
8
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
+ TASK_NAME_INVERSE_MAP = dict()
13
+ for task in Tasks:
14
+ TASK_NAME_INVERSE_MAP[task.value.col_name] = {
15
+ "name": task.benchmark,
16
+ "type": task.type,
17
+ }
18
+
19
+
20
+ def get_inspect_log_url(model_name: str, benchmark_name: str) -> str:
21
  """Returns the URL to the log file for a given model and benchmark"""
22
+ with open("./inspect_log_file_names.json", "r") as f:
23
+ inspect_log_files = json.load(f)
24
+ log_file_name = inspect_log_files[model_name].get(benchmark_name, None)
25
+ if log_file_name is None:
26
+ return ""
27
+ else:
28
+ # TMP: Debugging
29
+ print_str = f"https://storage.googleapis.com/inspect-evals/{model_name}/index.html?log_file=logs/logs/{log_file_name}"
30
+ print(print_str)
31
+ return f"https://storage.googleapis.com/inspect-evals/{model_name}/index.html?log_file=logs/logs/{log_file_name}"
32
+
33
 
34
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
35
  """Creates a dataframe from all the individual experiment results"""
 
44
  # filter out if any of the benchmarks have not been produced
45
  df = df[has_no_nan_values(df, benchmark_cols)]
46
 
47
+ # make values clickable and link to log files
48
+ for col in benchmark_cols:
49
+ df[col] = df[[AutoEvalColumn.model.name, col]].apply(lambda x:
50
+ f"[{x[col]}]({get_inspect_log_url(
51
+ model_name=x[AutoEvalColumn.model.name].split(">")[1].split("<")[0],
52
+ benchmark_name=TASK_NAME_INVERSE_MAP[col]["name"],
53
+ )})", axis=1)
54
 
55
  return df
56