jwilles committed
Commit b0b7fbb · 1 Parent(s): 3159db8
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
- from apscheduler.schedulers.background import BackgroundScheduler
3
- from huggingface_hub import snapshot_download
4
  import pandas as pd
 
5
 
6
  from src.about import (
7
  REPRODUCIBILITY_TEXT,
@@ -10,64 +9,65 @@ from src.about import (
10
  TITLE,
11
  )
12
  from src.display.css_html_js import custom_css, custom_js
13
- from src.display.utils import (
14
- COLS,
15
- ST_BENCHMARK_COLS,
16
- AGENTIC_BENCHMARK_COLS,
17
- EVAL_COLS,
18
- AutoEvalColumn,
19
- fields,
20
- )
21
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
22
- from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
23
- from src.submission.submit import add_new_eval
24
-
25
-
26
- def restart_space():
27
- API.restart_space(repo_id=REPO_ID)
28
-
29
- ### Space initialisation
30
- try:
31
- print(EVAL_REQUESTS_PATH)
32
- snapshot_download(
33
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
34
- )
35
- except Exception:
36
- restart_space()
37
- try:
38
- print(EVAL_RESULTS_PATH)
39
- snapshot_download(
40
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
41
- )
42
- except Exception:
43
- restart_space()
44
-
45
-
46
- ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
47
- AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)
48
-
49
- (
50
- finished_eval_queue_df,
51
- running_eval_queue_df,
52
- pending_eval_queue_df,
53
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
54
-
55
- def bold_max(s):
56
- is_max = s == s.max() # Boolean Series: True for the max value(s)
57
- return ['font-weight: bold' if v else '' for v in is_max]
58
-
59
- def init_leaderboard(df, benchmark_type):
60
- if df is None or df.empty:
61
- raise ValueError("Leaderboard DataFrame is empty or None.")
 
62
 
63
- non_task_cols = ["Model"]
64
- if benchmark_type == "agentic":
65
- # Include agent column
66
- non_task_cols.append("Agent")
67
- elif benchmark_type == "base":
68
- # Drop agent column
69
- dataframe = dataframe.drop(columns=["Agent"])
70
- AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name in non_task_cols) or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
71
 
72
  # styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
73
  # df.style.set_table_styles([
@@ -85,13 +85,78 @@ def init_leaderboard(df, benchmark_type):
85
  # styled_df = df.style.set_tooltips(tooltips)
86
 
87
 
88
  return gr.components.Dataframe(
89
- value=df,
90
- datatype=[c.type for c in AutoEvalColumnSubset],
91
- column_widths=["150px" if c.name != "Model" else "250px" for c in AutoEvalColumnSubset],
92
  wrap=False,
93
  )
94
 
 
95
  black_logo_path = "src/assets/logo-icon-black.png"
96
  white_logo_path = "src/assets/logo-icon-white.png"
97
 
@@ -123,10 +188,10 @@ with demo:
123
 
124
  with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
125
  with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
126
- leaderboard = init_leaderboard(ST_LEADERBOARD_DF, "base")
127
 
128
  with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
129
- leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")
130
 
131
  with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
132
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)
@@ -135,8 +200,5 @@ with demo:
135
  gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)
136
 
137
  assets = [black_logo_path, white_logo_path]
 
138
 
139
- scheduler = BackgroundScheduler()
140
- scheduler.add_job(restart_space, "interval", seconds=1800)
141
- scheduler.start()
142
- demo.queue(default_concurrency_limit=40).launch(allowed_paths=assets)
 
1
  import gradio as gr
 
 
2
  import pandas as pd
3
+ import json
4
 
5
  from src.about import (
6
  REPRODUCIBILITY_TEXT,
 
9
  TITLE,
10
  )
11
  from src.display.css_html_js import custom_css, custom_js
12
+ # from src.display.utils import (
13
+ # COLS,
14
+ # ST_BENCHMARK_COLS,
15
+ # AGENTIC_BENCHMARK_COLS,
16
+ # EVAL_COLS,
17
+ # AutoEvalColumn,
18
+ # fields,
19
+ # )
20
+ # from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
21
+ # from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
22
+ # from src.submission.submit import add_new_eval
23
+ from src.display.formatting import make_clickable_field
24
+
25
+
26
+ # def restart_space():
27
+ # API.restart_space(repo_id=REPO_ID)
28
+
29
+ # ### Space initialisation
30
+ # try:
31
+ # print(EVAL_REQUESTS_PATH)
32
+ # snapshot_download(
33
+ # repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
34
+ # )
35
+ # except Exception:
36
+ # restart_space()
37
+ # try:
38
+ # print(EVAL_RESULTS_PATH)
39
+ # snapshot_download(
40
+ # repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
41
+ # )
42
+ # except Exception:
43
+ # restart_space()
44
+
45
+
46
+ # ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
47
+ # AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)
48
+
49
+ # (
50
+ # finished_eval_queue_df,
51
+ # running_eval_queue_df,
52
+ # pending_eval_queue_df,
53
+ # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
54
+
55
+ # def bold_max(s):
56
+ # is_max = s == s.max() # Boolean Series: True for the max value(s)
57
+ # return ['font-weight: bold' if v else '' for v in is_max]
58
+
59
+ # def init_leaderboard(df, benchmark_type):
60
+ # if df is None or df.empty:
61
+ # raise ValueError("Leaderboard DataFrame is empty or None.")
62
 
63
+ # non_task_cols = ["Model"]
64
+ # if benchmark_type == "agentic":
65
+ # # Include agent column
66
+ # non_task_cols.append("Agent")
67
+ # elif benchmark_type == "base":
68
+ # # Drop agent column
69
+ # dataframe = dataframe.drop(columns=["Agent"])
70
+ # AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name in non_task_cols) or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
71
 
72
  # styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
73
  # df.style.set_table_styles([
 
85
  # styled_df = df.style.set_tooltips(tooltips)
86
 
87
 
88
+ # return gr.components.Dataframe(
89
+ # value=df,
90
+ # datatype=[c.type for c in AutoEvalColumnSubset],
91
+ # column_widths=["150px" if c.name != "Model" else "250px" for c in AutoEvalColumnSubset],
92
+ # wrap=False,
93
+ # )
94
+
95
+
96
+
97
+ def build_leaderboard(type):
98
+ with open('data/results.json', 'r') as f:
99
+ results = json.load(f)
100
+
101
+ with open('data/tasks.json', 'r') as f:
102
+ tasks = json.load(f)
103
+
104
+ # Filter tasks based on type
105
+ filtered_tasks = {k: v for k, v in tasks.items() if v['type'] == type}
106
+
107
+ data = []
108
+ for model_name, model_data in results.items():
109
+ # For agentic type, skip models that have all null values for agentic tasks
110
+ if type == "agentic":
111
+ has_agentic_results = any(
112
+ model_data['results'].get(task, {}).get(tasks[task]['metric']) is not None
113
+ for task in filtered_tasks
114
+ )
115
+ if not has_agentic_results:
116
+ continue
117
+
118
+ model_sha = model_data["config"]["model_sha"]
119
+ model_name = model_data["config"]["model_name"]
120
+ row = {
121
+ 'Model': make_clickable_field(model_name, model_sha)
122
+ }
123
+
124
+ for dataset, metrics in model_data['results'].items():
125
+ # Only include metrics for tasks of the specified type
126
+ if dataset in filtered_tasks:
127
+ value = next(iter(metrics.values()))
128
+ log_url = metrics.get('log_url')
129
+ # Use display name from tasks.json instead of raw dataset name
130
+ display_name = filtered_tasks[dataset]['display_name']
131
+ # Round non-null values to 2 decimal places and make clickable if log_url exists
132
+ if value is not None:
133
+ value = round(value*100, 2)
134
+ if log_url:
135
+ value = make_clickable_field(value, log_url)
136
+ row[display_name] = value
137
+ data.append(row)
138
+
139
+ results_df = pd.DataFrame(data)
140
+
141
+ # Round all numeric columns to 2 decimal places
142
+ numeric_cols = results_df.select_dtypes(include=['float64', 'float32']).columns
143
+ results_df[numeric_cols] = results_df[numeric_cols].round(2)
144
+
145
+ # Fill null values with "--"
146
+ results_df = results_df.fillna("--")
147
+
148
+ if type == "agentic":
149
+ # Include agent column as second column after Model
150
+ results_df.insert(1, 'Agent', '[Basic Agent](https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent)')
151
+
152
  return gr.components.Dataframe(
153
+ value=results_df,
154
+ datatype=["html" for _ in results_df.columns],
155
+ column_widths=["250px" if c == "Model" else "150px" for c in results_df.columns],
156
  wrap=False,
157
  )
158
 
159
+
160
  black_logo_path = "src/assets/logo-icon-black.png"
161
  white_logo_path = "src/assets/logo-icon-white.png"
162
 
 
188
 
189
  with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
190
  with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
191
+ build_leaderboard("base")
192
 
193
  with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
194
+ build_leaderboard("agentic")
195
 
196
  with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
197
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)
 
200
  gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)
201
 
202
  assets = [black_logo_path, white_logo_path]
203
+ demo.launch()
204
 
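Note: the new build_leaderboard() assumes data/tasks.json maps each task id to a "type", a "metric", and a "display_name" (all three keys are read in the function above), and it renders every cell as HTML via make_clickable_field(). A minimal sketch of that assumed schema and helper, under those assumptions rather than the committed implementation:

# Hypothetical data/tasks.json entries, inferred from the lookups in
# build_leaderboard() -- v['type'], tasks[task]['metric'],
# filtered_tasks[dataset]['display_name']; the display names are illustrative.
example_tasks = {
    "mmlu_pro": {"type": "base", "metric": "accuracy", "display_name": "MMLU-Pro"},
    "gaia": {"type": "agentic", "metric": "accuracy", "display_name": "GAIA"},
}

def make_clickable_field(text, url):
    # Assumed behaviour: wrap a cell value in an HTML anchor, consistent with
    # datatype=["html" for _ in results_df.columns] in build_leaderboard().
    return f'<a href="{url}" target="_blank">{text}</a>'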
inspect_log_file_names.json → data/inspect_log_file_names.json RENAMED
File without changes
data/populate_results.py ADDED
@@ -0,0 +1,41 @@
1
+ import json
2
+
3
+ def get_log_url(model_name: str, log_file_name: str) -> str:
4
+ """Returns the URL to the log file for a given model and benchmark"""
5
+ if log_file_name is None:
6
+ return None
7
+ else:
8
+ # replace .json with .eval
9
+ log_file_name = log_file_name.replace(".json", ".eval")
10
+ return f"https://storage.googleapis.com/inspect-evals/eval/{model_name}/index.html?log_file=logs/logs/{log_file_name}"
11
+
12
+ def main():
13
+ # Load the results and log file names
14
+ with open("data/results.json", "r") as f:
15
+ results = json.load(f)
16
+
17
+ with open("data/inspect_log_file_names.json", "r") as f:
18
+ log_files = json.load(f)
19
+
20
+ # For each model in results
21
+ for model_name, model_data in results.items():
22
+ # Get the log files for this model
23
+ model_logs = log_files.get(model_name, {})
24
+
25
+ # For each task in the model's results
26
+ for task_name, task_data in model_data["results"].items():
27
+ # Get the log file name for this task
28
+ log_file_name = model_logs.get(task_name)
29
+
30
+ # Add the log URL to the task data
31
+ if log_file_name:
32
+ task_data["log_url"] = get_log_url(model_name, log_file_name)
33
+ else:
34
+ task_data["log_url"] = None
35
+
36
+ # Save the updated results
37
+ with open("data/results_with_logs.json", "w") as f:
38
+ json.dump(results, f, indent=4)
39
+
40
+ if __name__ == "__main__":
41
+ main()
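Note: data/populate_results.py is a one-shot preprocessing step: run from the repository root, it reads data/results.json and data/inspect_log_file_names.json and writes data/results_with_logs.json with a log_url attached to every task entry. A sketch of how get_log_url() behaves, using a hypothetical log file name:

# Illustrative call; the file name below is made up, not taken from the data.
url = get_log_url("gpt-4o", "2024-10-31T05-01-22-04-00_math_abc.json")
# url == "https://storage.googleapis.com/inspect-evals/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T05-01-22-04-00_math_abc.eval"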
data/results.json ADDED
@@ -0,0 +1,948 @@
1
+ {
2
+ "DeepSeek-R1": {
3
+ "config": {
4
+ "model_name": "DeepSeek-R1",
5
+ "model_sha": "https://api-docs.deepseek.com/news/news250120",
6
+ "model_dtype": "torch.float16"
7
+ },
8
+ "results": {
9
+ "mmlu_pro": {
10
+ "accuracy": 0.8382646276595744,
11
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-12T11-02-35-05-00_mmlu-pro_BhD89DYN9KM3k4weSDfaQK.eval"
12
+ },
13
+ "humaneval": {
14
+ "mean": 0.9567901234567902,
15
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-03T11-45-22-05-00_humaneval_hnkHWYqrb5HxiBt2CWzCnq.eval"
16
+ },
17
+ "math": {
18
+ "accuracy": 0.9272,
19
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-11T11-38-10-05-00_math_ZYFSqsWsmP5kLRLHEMWULU.eval"
20
+ },
21
+ "gsm8k": {
22
+ "accuracy": 0.954510993176649,
23
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-02T16-28-05-05-00_gsm8k_YMw6WiZkgTBQ54z5UHtDDX.eval"
24
+ },
25
+ "arc_challenge": {
26
+ "accuracy": 0.9667235494880546,
27
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-01-30T15-42-39-05-00_arc-challenge_CviW9ro6rKBbctkwJzQstp.eval"
28
+ },
29
+ "winogrande": {
30
+ "accuracy": 0.9179163378058406,
31
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-04T00-25-12-05-00_winogrande_NPgTbtqom2QSPKxeThWrdZ.eval"
32
+ },
33
+ "arc_easy": {
34
+ "accuracy": 0.9873737373737373,
35
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.eval"
36
+ },
37
+ "gpqa_diamond": {
38
+ "accuracy": 0.7045454545454546,
39
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-11T11-37-45-05-00_gpqa-diamond_MwnVeLwyuiEAALr3M5q3dn.eval"
40
+ },
41
+ "drop": {
42
+ "mean": null,
43
+ "log_url": null
44
+ },
45
+ "hellaswag": {
46
+ "accuracy": null,
47
+ "log_url": null
48
+ },
49
+ "ifeval": {
50
+ "final_acc": null,
51
+ "log_url": null
52
+ },
53
+ "mmlu": {
54
+ "accuracy": null,
55
+ "log_url": null
56
+ },
57
+ "mmmu_multiple_choice": {
58
+ "accuracy": null,
59
+ "log_url": null
60
+ },
61
+ "mmmu_open": {
62
+ "accuracy": null,
63
+ "log_url": null
64
+ },
65
+ "gaia": {
66
+ "accuracy": null,
67
+ "log_url": null
68
+ },
69
+ "gdm_intercode_ctf": {
70
+ "accuracy": null,
71
+ "log_url": null
72
+ },
73
+ "gdm_in_house_ctf": {
74
+ "accuracy": null,
75
+ "log_url": null
76
+ },
77
+ "agentharm": {
78
+ "avg_score": null,
79
+ "log_url": null
80
+ },
81
+ "agentharm_benign": {
82
+ "avg_score": null,
83
+ "log_url": null
84
+ },
85
+ "swe_bench": {
86
+ "mean": null,
87
+ "log_url": null
88
+ }
89
+ }
90
+ },
91
+ "Meta-Llama-3.1-70B-Instruct": {
92
+ "config": {
93
+ "model_name": "Meta-Llama-3.1-70B-Instruct",
94
+ "model_sha": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
95
+ "model_dtype": "torch.float16"
96
+ },
97
+ "results": {
98
+ "hellaswag": {
99
+ "accuracy": 0.869946225851424,
100
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-45-54-04-00_hellaswag_BKfQG9yGAr383MGnooMLBH.eval"
101
+ },
102
+ "drop": {
103
+ "mean": 0.8811263765076035,
104
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T21-01-02-04-00_drop_LzAWvLWkNrNKu5qf56wXRo.eval"
105
+ },
106
+ "gpqa_diamond": {
107
+ "accuracy": 0.4318181818181818,
108
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
109
+ },
110
+ "winogrande": {
111
+ "accuracy": 0.8666140489344909,
112
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
113
+ },
114
+ "gsm8k": {
115
+ "accuracy": 0.9469294920394238,
116
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
117
+ },
118
+ "math": {
119
+ "accuracy": 0.6004,
120
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
121
+ },
122
+ "ifeval": {
123
+ "final_acc": 0.8604907201780166,
124
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
125
+ },
126
+ "arc_challenge": {
127
+ "accuracy": 0.9445392491467577,
128
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
129
+ },
130
+ "arc_easy": {
131
+ "accuracy": 0.9823232323232324,
132
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
133
+ },
134
+ "mmlu_pro": {
135
+ "accuracy": 0.6688829787234043,
136
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
137
+ },
138
+ "humaneval": {
139
+ "mean": 0.7865853658536586,
140
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
141
+ },
142
+ "mmlu": {
143
+ "accuracy": 0.8033755875231449,
144
+ "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
145
+ },
146
+ "mmmu_multiple_choice": {
147
+ "accuracy": null,
148
+ "log_url": null
149
+ },
150
+ "mmmu_open": {
151
+ "accuracy": null,
152
+ "log_url": null
153
+ },
154
+ "gaia": {
155
+ "accuracy": null,
156
+ "log_url": null
157
+ },
158
+ "gdm_intercode_ctf": {
159
+ "accuracy": null,
160
+ "log_url": null
161
+ },
162
+ "gdm_in_house_ctf": {
163
+ "accuracy": null,
164
+ "log_url": null
165
+ },
166
+ "agentharm": {
167
+ "avg_score": null,
168
+ "log_url": null
169
+ },
170
+ "agentharm_benign": {
171
+ "avg_score": null,
172
+ "log_url": null
173
+ },
174
+ "swe_bench": {
175
+ "mean": null,
176
+ "log_url": null
177
+ }
178
+ }
179
+ },
180
+ "Mistral-Large-Instruct-2407": {
181
+ "config": {
182
+ "model_name": "Mistral-Large-Instruct-2407",
183
+ "model_sha": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
184
+ "model_dtype": "torch.float16"
185
+ },
186
+ "results": {
187
+ "drop": {
188
+ "mean": 0.7424257996853698,
189
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-56-12-04-00_drop_NtvuCoU2LoMbH8DztcCTen.eval"
190
+ },
191
+ "ifeval": {
192
+ "final_acc": 0.8285172231900246,
193
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-30-16-04-00_ifeval_TLkvCSFEWo4PLv6hAha7YB.eval"
194
+ },
195
+ "mmlu": {
196
+ "accuracy": 0.8035892323030908,
197
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T07-21-48-04-00_mmlu_YnUhmHoStr3WuJdchWmNPt.eval"
198
+ },
199
+ "gpqa_diamond": {
200
+ "accuracy": 0.4734848484848485,
201
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-22-52-04-00_gpqa-diamond_SuZUZxGdqS2ZecbLRNkKd4.eval"
202
+ },
203
+ "gsm8k": {
204
+ "accuracy": 0.9378316906747536,
205
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-28-49-04-00_gsm8k_5tQp9tbwUMj6NpjNKCAfVm.eval"
206
+ },
207
+ "math": {
208
+ "accuracy": 0.6574,
209
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-33-09-04-00_math_2CmjBedAfUxqvmcHRdBgyB.eval"
210
+ },
211
+ "arc_easy": {
212
+ "accuracy": 0.9852693602693603,
213
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-48-39-04-00_arc-easy_YbfuBT3usZXt2xgZkkR5dq.eval"
214
+ },
215
+ "mmlu_pro": {
216
+ "accuracy": 0.6942320478723404,
217
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T09-41-25-04-00_mmlu-pro_fyYT4aabPesfY5TpzFMPnd.eval"
218
+ },
219
+ "humaneval": {
220
+ "mean": 0.8658536585365854,
221
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-29-24-04-00_humaneval_nu8SUSGekKJWB8HLKDigYK.eval"
222
+ },
223
+ "hellaswag": {
224
+ "accuracy": 0.9047998406691894,
225
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-50-00-04-00_hellaswag_ZzQoZ6gkRQsTzMhQr7GYNn.eval"
226
+ },
227
+ "arc_challenge": {
228
+ "accuracy": 0.9436860068259386,
229
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-54-13-04-00_arc-challenge_WfQRhMkFcywefpU46isBVP.eval"
230
+ },
231
+ "winogrande": {
232
+ "accuracy": 0.8547750591949487,
233
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T11-57-58-04-00_winogrande_TP3UGwpp37Dyv6ks9Ty5Hk.eval"
234
+ },
235
+ "mmmu_multiple_choice": {
236
+ "accuracy": null,
237
+ "log_url": null
238
+ },
239
+ "mmmu_open": {
240
+ "accuracy": null,
241
+ "log_url": null
242
+ },
243
+ "gaia": {
244
+ "accuracy": null,
245
+ "log_url": null
246
+ },
247
+ "gdm_intercode_ctf": {
248
+ "accuracy": null,
249
+ "log_url": null
250
+ },
251
+ "gdm_in_house_ctf": {
252
+ "accuracy": null,
253
+ "log_url": null
254
+ },
255
+ "agentharm": {
256
+ "avg_score": null,
257
+ "log_url": null
258
+ },
259
+ "agentharm_benign": {
260
+ "avg_score": null,
261
+ "log_url": null
262
+ },
263
+ "swe_bench": {
264
+ "mean": null,
265
+ "log_url": null
266
+ }
267
+ }
268
+ },
269
+ "c4ai-command-r-plus": {
270
+ "config": {
271
+ "model_name": "c4ai-command-r-plus",
272
+ "model_sha": "https://huggingface.co/CohereForAI/c4ai-command-r-plus"
273
+ },
274
+ "results": {
275
+ "ifeval": {
276
+ "final_acc": 0.7779591483929307,
277
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.eval"
278
+ },
279
+ "winogrande": {
280
+ "accuracy": 0.7490134175217048,
281
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T14-42-18-04-00_winogrande_bY8yg7aRR5dCCK7NDCZEcc.eval"
282
+ },
283
+ "arc_challenge": {
284
+ "accuracy": 0.8506825938566553,
285
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T17-30-03-04-00_arc-challenge_XB7LURXEGaxskWuLtYwdnW.eval"
286
+ },
287
+ "drop": {
288
+ "mean": 0.743557420031463,
289
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T12-06-30-04-00_drop_itY9cLiYAW2BF7NTeDceNd.eval"
290
+ },
291
+ "math": {
292
+ "accuracy": 0.2626,
293
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-26-34-04-00_math_kohBUMpMFuMsR4jz4vUNWM.eval"
294
+ },
295
+ "gpqa_diamond": {
296
+ "accuracy": 0.3194444444444444,
297
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T22-47-45-04-00_gpqa-diamond_JKpb6ya4pec9hh7uovPPCZ.eval"
298
+ },
299
+ "mmlu_pro": {
300
+ "accuracy": 0.441156914893617,
301
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-31T01-11-38-04-00_mmlu-pro_gZVAuy3zMKR23BieM5PqAX.eval"
302
+ },
303
+ "humaneval": {
304
+ "mean": 0.6219512195121951,
305
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-22-23-04-00_humaneval_5ByPqUhoofSbKgvsUQNFCX.eval"
306
+ },
307
+ "gsm8k": {
308
+ "accuracy": 0.7816527672479151,
309
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T15-03-35-04-00_gsm8k_QxbfbriJsKGQAg96JyjkoT.eval"
310
+ },
311
+ "hellaswag": {
312
+ "accuracy": 0.7954590718980283,
313
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T15-18-17-04-00_hellaswag_UYyBTR6N8VJnKRmnbCrB8N.eval"
314
+ },
315
+ "mmlu": {
316
+ "accuracy": 0.695128899017234,
317
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T21-55-26-04-00_mmlu_JUPPLTzfe3Kme6UuorPTqg.eval"
318
+ },
319
+ "arc_easy": {
320
+ "accuracy": 0.9377104377104377,
321
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.eval"
322
+ }
323
+ }
324
+ },
325
+ "claude-3-5-sonnet-20241022": {
326
+ "config": {
327
+ "model_name": "claude-3-5-sonnet-20241022",
328
+ "model_sha": "https://www.anthropic.com/claude/sonnet",
329
+ "model_dtype": "torch.float16"
330
+ },
331
+ "results": {
332
+ "mmmu_multiple_choice": {
333
+ "accuracy": 0.6481700118063755,
334
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T11-20-03-05-00_mmmu-multiple-choice_CWhKvGdoFo6pdHhDyi9GNm.eval"
335
+ },
336
+ "mmlu_pro": {
337
+ "accuracy": 0.7762632978723404,
338
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T19-01-05-05-00_mmlu-pro_3vi84or97gQupuj5sT6vgZ.eval"
339
+ },
340
+ "hellaswag": {
341
+ "accuracy": 0.9228241386178052,
342
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T15-09-33-05-00_hellaswag_QXqFxojvSToMu8ckHEMLkB.eval"
343
+ },
344
+ "gpqa_diamond": {
345
+ "accuracy": 0.6098484848484849,
346
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T13-56-36-05-00_gpqa-diamond_eg4gFaMRENjnnYvQNtSB59.eval"
347
+ },
348
+ "gsm8k": {
349
+ "accuracy": 0.9620924943138741,
350
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T14-23-25-05-00_gsm8k_nHB8Z4uZAwRAZFYpKmTptA.eval"
351
+ },
352
+ "mmmu_open": {
353
+ "accuracy": 0.41509433962264153,
354
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T11-24-21-05-00_mmmu-open_SSjv3Dq9gZkEEUnvJUd5xf.eval"
355
+ },
356
+ "arc_easy": {
357
+ "accuracy": 0.9915824915824916,
358
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-06-24-05-00_arc-easy_oBReQZQM5SAwMMD2jFshPb.eval"
359
+ },
360
+ "arc_challenge": {
361
+ "accuracy": 0.9692832764505119,
362
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-12-11-05-00_arc-challenge_X8i6caCzkcQo5AT5zXkXso.eval"
363
+ },
364
+ "mmlu": {
365
+ "accuracy": 0.8665432274604757,
366
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T15-16-51-05-00_mmlu_NFDs2kxmh3kQEbpbd8sz3w.eval"
367
+ },
368
+ "math": {
369
+ "accuracy": 0.7942,
370
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T12-29-54-05-00_math_NvNQU58M8r3fpiwPGnvq8h.eval"
371
+ },
372
+ "ifeval": {
373
+ "final_acc": 0.8958114469607309,
374
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T11-28-44-05-00_ifeval_fmWxch4ZjbmYCST6yUZsdV.eval"
375
+ },
376
+ "humaneval": {
377
+ "mean": 0.9451219512195121,
378
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T11-26-12-05-00_humaneval_kUASiaNd9uZfWvCwYHhdF5.eval"
379
+ },
380
+ "winogrande": {
381
+ "accuracy": 0.9021310181531176,
382
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T22-09-41-05-00_winogrande_mSWGAKg75E5RP79KWizvb9.eval"
383
+ },
384
+ "drop": {
385
+ "mean": 0.8977608809648663,
386
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.eval"
387
+ },
388
+ "gaia": {
389
+ "accuracy": 0.3381818181818182,
390
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-12T23-57-37-05-00_claude-3-5-sonnet_gaia_merged.eval"
391
+ },
392
+ "gdm_intercode_ctf": {
393
+ "accuracy": 0.8556962025316455,
394
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-11T02-47-45-05-00_claude-3-5-sonnet_gdm-intercode-ctf_merged.eval"
395
+ },
396
+ "gdm_in_house_ctf": {
397
+ "accuracy": 0.6153846153846154,
398
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-11T07-41-14+00-00_claude-3-5-sonnet_gdm-in-house-ctf.eval"
399
+ },
400
+ "agentharm": {
401
+ "avg_score": 0.14767992424242424,
402
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T08-05-14-08-00_agentharm_VJGhWKLrVLdQczBZVgCXHc.eval"
403
+ },
404
+ "agentharm_benign": {
405
+ "avg_score": 0.800704570051161,
406
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T15-09-48-08-00_agentharm-benign_A3uBBWNvv88P5BsgqwFCfg.eval"
407
+ },
408
+ "swe_bench": {
409
+ "mean": 0.0672,
410
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T18-56-55+00-00_anthropic-claude-3-5-sonnet.eval"
411
+ }
412
+ }
413
+ },
414
+ "gemini-1.5-flash": {
415
+ "config": {
416
+ "model_name": "gemini-1.5-flash",
417
+ "model_sha": "https://deepmind.google/technologies/gemini/flash",
418
+ "model_dtype": "torch.float16"
419
+ },
420
+ "results": {
421
+ "gpqa_diamond": {
422
+ "accuracy": 0.40404040404040403,
423
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-31-26-04-00_gpqa-diamond_7aNe9wQiQKpNN96mfaWBPg.eval"
424
+ },
425
+ "arc_challenge": {
426
+ "accuracy": 0.9308873720136519,
427
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-34-51-04-00_arc-challenge_FbGgLswBZbRE4EhWiMyRt6.eval"
428
+ },
429
+ "math": {
430
+ "accuracy": 0.452,
431
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-15-49-04-00_math_YsWdRzpqMq2dqQ9SPKfack.eval"
432
+ },
433
+ "mmmu_open": {
434
+ "accuracy": 0.16981132075471697,
435
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2025-01-20T23-13-27-05-00_mmmu-open_GWi6XNYUSLq99BdabtScGm.eval"
436
+ },
437
+ "drop": {
438
+ "mean": 0.751044572627163,
439
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-36-25-04-00_drop_6TzJGqqEkpFUCxGD4QejV6.eval"
440
+ },
441
+ "mmlu_pro": {
442
+ "accuracy": 0.5993184840425532,
443
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T22-43-30-04-00_mmlu-pro_Dc2uu3EV7MJtjg6gg5Y9qH.eval"
444
+ },
445
+ "ifeval": {
446
+ "final_acc": 0.7681296737102001,
447
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-15-06-04-00_ifeval_nYs9KujQMQjcpbpbLtVx8G.eval"
448
+ },
449
+ "hellaswag": {
450
+ "accuracy": 0.8557060346544513,
451
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-35-34-04-00_hellaswag_2SAz3cvMpDxFaApdHDR3s4.eval"
452
+ },
453
+ "winogrande": {
454
+ "accuracy": 0.7884767166535123,
455
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-31T00-59-07-04-00_winogrande_Ci55vHvbGGW38zVpMCwtWa.eval"
456
+ },
457
+ "humaneval": {
458
+ "mean": 0.7439024390243902,
459
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-14-41-04-00_humaneval_Z9aXdUERuwYxoTheZ5GANC.eval"
460
+ },
461
+ "arc_easy": {
462
+ "accuracy": 0.984006734006734,
463
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-29-56-04-00_arc-easy_XcEzqqPqJsRV29NqYDfnNo.eval"
464
+ },
465
+ "gsm8k": {
466
+ "accuracy": 0.8582259287338894,
467
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-32-39-04-00_gsm8k_nLSssETKDDWNktAFWnVwfv.eval"
468
+ },
469
+ "mmlu": {
470
+ "accuracy": 0.7714713003845606,
471
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_oGb9mspeGbYS2gfbkknskN.eval"
472
+ },
473
+ "mmmu_multiple_choice": {
474
+ "accuracy": 0.5702479338842975,
475
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2025-01-20T23-10-01-05-00_mmmu-multiple-choice_c5rLkrXkV83udX6DVJui5F.eval"
476
+ },
477
+ "gaia": {
478
+ "accuracy": null,
479
+ "log_url": null
480
+ },
481
+ "gdm_intercode_ctf": {
482
+ "accuracy": null,
483
+ "log_url": null
484
+ },
485
+ "gdm_in_house_ctf": {
486
+ "accuracy": null,
487
+ "log_url": null
488
+ },
489
+ "agentharm": {
490
+ "avg_score": null,
491
+ "log_url": null
492
+ },
493
+ "agentharm_benign": {
494
+ "avg_score": null,
495
+ "log_url": null
496
+ },
497
+ "swe_bench": {
498
+ "mean": null,
499
+ "log_url": null
500
+ }
501
+ }
502
+ },
503
+ "gemini-1.5-pro": {
504
+ "config": {
505
+ "model_name": "gemini-1.5-pro",
506
+ "model_sha": "https://deepmind.google/technologies/gemini/pro",
507
+ "model_dtype": "torch.float16"
508
+ },
509
+ "results": {
510
+ "mmlu": {
511
+ "accuracy": 0.8467454778521578,
512
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.eval"
513
+ },
514
+ "humaneval": {
515
+ "mean": 0.8719512195121951,
516
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.eval"
517
+ },
518
+ "mmmu_multiple_choice": {
519
+ "accuracy": 0.6304604486422668,
520
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2025-01-20T23-16-04-05-00_mmmu-multiple-choice_NLmxmHYt6CJymRVVa5UsbD.eval"
521
+ },
522
+ "mmlu_pro": {
523
+ "accuracy": 0.7563996010638298,
524
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.eval"
525
+ },
526
+ "math": {
527
+ "accuracy": 0.852,
528
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.eval"
529
+ },
530
+ "arc_easy": {
531
+ "accuracy": 0.9877946127946128,
532
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.eval"
533
+ },
534
+ "mmmu_open": {
535
+ "accuracy": 0.3584905660377358,
536
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2025-01-20T23-19-25-05-00_mmmu-open_CDbtEQ7tjs5zkj4ScBbzod.eval"
537
+ },
538
+ "gsm8k": {
539
+ "accuracy": 0.9613343442001516,
540
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.eval"
541
+ },
542
+ "gpqa_diamond": {
543
+ "accuracy": 0.5782828282828283,
544
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.eval"
545
+ },
546
+ "ifeval": {
547
+ "final_acc": 0.8982344623377084,
548
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.eval"
549
+ },
550
+ "winogrande": {
551
+ "accuracy": 0.8768745067087609,
552
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-40-46-05-00_winogrande_5SmD6rx47zmZvHHkQSSfHK.eval"
553
+ },
554
+ "arc_challenge": {
555
+ "accuracy": 0.9633105802047781,
556
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.eval"
557
+ },
558
+ "drop": {
559
+ "mean": 0.8800912427897221,
560
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.eval"
561
+ },
562
+ "hellaswag": {
563
+ "accuracy": 0.9123680541724756,
564
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.eval"
565
+ },
566
+ "gaia": {
567
+ "accuracy": 0.13818181818181818,
568
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T15-33-29-05-00_gemini-1.5-pro_gaia_merged.eval"
569
+ },
570
+ "gdm_intercode_ctf": {
571
+ "accuracy": 0.5291139240506328,
572
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T23-59-58+00-00_gemini-1.5-pro_gdm-intercode-ctf_merged.eval"
573
+ },
574
+ "gdm_in_house_ctf": {
575
+ "accuracy": 0.23076923076923078,
576
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-22T03-42-16+00-00_gemini-1.5-pro_gdm-in-house-ctf.eval"
577
+ },
578
+ "agentharm": {
579
+ "avg_score": 0.2898649645808737,
580
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T12-45-43-08-00_agentharm_VmD26soLwmRgWPo3hpRHBr.eval"
581
+ },
582
+ "agentharm_benign": {
583
+ "avg_score": 0.5961489079102715,
584
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T13-18-51-08-00_agentharm-benign_gP3pQPxAuCtFLiHzt2Egt7.eval"
585
+ },
586
+ "swe_bench": {
587
+ "mean": 0.004,
588
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-22T03-00-08+00-00_google-gemini-1.5-pro_swe.eval"
589
+ }
590
+ }
591
+ },
592
+ "gpt-4o": {
593
+ "config": {
594
+ "model_name": "gpt-4o",
595
+ "model_sha": "https://openai.com/index/hello-gpt-4o",
596
+ "model_dtype": "torch.float16"
597
+ },
598
+ "results": {
599
+ "gpqa_diamond": {
600
+ "accuracy": 0.51010101010101,
601
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-29-33-04-00_gpqa-diamond_nFmRv5MJiYjHjezmq4V6Va.eval"
602
+ },
603
+ "arc_challenge": {
604
+ "accuracy": 0.9633105802047781,
605
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-45-55-04-00_arc-challenge_nrsPPxh4DpzgLPQDFdcfVp.eval"
606
+ },
607
+ "gsm8k": {
608
+ "accuracy": 0.9446550416982562,
609
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-31-16-04-00_gsm8k_jVXeSvHowbietZCFsFYCwB.eval"
610
+ },
611
+ "mmlu": {
612
+ "accuracy": 0.8435408061529697,
613
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_GarLpfQFSpM3C22nbbGp54.eval"
614
+ },
615
+ "ifeval": {
616
+ "final_acc": 0.8780386042367585,
617
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T05-00-11-04-00_ifeval_jxreUu8JqRdkrcHP4E3hLR.eval"
618
+ },
619
+ "mmlu_pro": {
620
+ "accuracy": 0.7450964095744681,
621
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T06-59-42-04-00_mmlu-pro_EuAKDwAWSfNVpqyyqrf2Ba.eval"
622
+ },
623
+ "mmmu_open": {
624
+ "accuracy": 0.3584905660377358,
625
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2025-01-20T23-07-46-05-00_mmmu-open_d3Q2HvuPZzEX6FAM4NBhnp.eval"
626
+ },
627
+ "winogrande": {
628
+ "accuracy": 0.9013417521704814,
629
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T09-02-03-04-00_winogrande_44kKF7M9mKoqVC7ixZVXuq.eval"
630
+ },
631
+ "drop": {
632
+ "mean": 0.7511693759832198,
633
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-47-20-04-00_drop_3gxDcn6vUoR3nvHX9BcSq4.eval"
634
+ },
635
+ "arc_easy": {
636
+ "accuracy": 0.9915824915824916,
637
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-41-34-04-00_arc-easy_nUavRHdiRVfrxo6dmCPadh.eval"
638
+ },
639
+ "mmmu_multiple_choice": {
640
+ "accuracy": 0.5903187721369539,
641
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2025-01-20T23-03-21-05-00_mmmu-multiple-choice_eoycAFLMirSqiURdXmBP2e.eval"
642
+ },
643
+ "humaneval": {
644
+ "mean": 0.9085365853658537,
645
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T04-59-42-04-00_humaneval_nmJcd84CcNKjWS8fBfMbZM.eval"
646
+ },
647
+ "math": {
648
+ "accuracy": 0.7054,
649
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T05-01-22-04-00_math_cDSpKPp3nLrFy8uYfYKEbM.eval"
650
+ },
651
+ "hellaswag": {
652
+ "accuracy": 0.924317864967138,
653
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.eval"
654
+ },
655
+ "gaia": {
656
+ "accuracy": 0.16606060606060608,
657
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-13T15-53-22+00-00_gpt-4o_gaia_merged.eval"
658
+ },
659
+ "gdm_intercode_ctf": {
660
+ "accuracy": 0.6379746835443038,
661
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-08T10-06-29-05-00_gpt-4o_gdm-intercode-ctf_merged.eval"
662
+ },
663
+ "gdm_in_house_ctf": {
664
+ "accuracy": 0.23076923076923078,
665
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-11T07-02-14+00-00_gpt-4o_gdm-in-house-ctf.eval"
666
+ },
667
+ "agentharm": {
668
+ "avg_score": 0.49953844451003543,
669
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-07T16-34-15-08-00_agentharm_UfSoyHEAH2E5RVdrPVUemy.eval"
670
+ },
671
+ "agentharm_benign": {
672
+ "avg_score": 0.8249433048012594,
673
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-21T13-45-18-08-00_agentharm-benign_8DhGJqEAvw6o8uCv4a4dVz.eval"
674
+ },
675
+ "swe_bench": {
676
+ "mean": 0.012,
677
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-14T23-09-10+00-00_openai-gpt-4o_swe.eval"
678
+ }
679
+ }
680
+ },
681
+ "gpt-4o-mini": {
682
+ "config": {
683
+ "model_name": "gpt-4o-mini",
684
+ "model_sha": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
685
+ "model_dtype": "torch.float16"
686
+ },
687
+ "results": {
688
+ "drop": {
689
+ "mean": 0.8065915049816466,
690
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-36-25-04-00_drop_6TzJGqqEkpFUCxGD4QejV6.eval"
691
+ },
692
+ "humaneval": {
693
+ "mean": 0.8597560975609756,
694
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-14-41-04-00_humaneval_Z9aXdUERuwYxoTheZ5GANC.eval"
695
+ },
696
+ "gpqa_diamond": {
697
+ "accuracy": 0.3838383838383838,
698
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-31-26-04-00_gpqa-diamond_7aNe9wQiQKpNN96mfaWBPg.eval"
699
+ },
700
+ "mmmu_open": {
701
+ "accuracy": 0.18867924528301888,
702
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2025-01-20T23-13-27-05-00_mmmu-open_GWi6XNYUSLq99BdabtScGm.eval"
703
+ },
704
+ "arc_challenge": {
705
+ "accuracy": 0.9249146757679181,
706
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-34-51-04-00_arc-challenge_FbGgLswBZbRE4EhWiMyRt6.eval"
707
+ },
708
+ "mmlu": {
709
+ "accuracy": 0.7698333570716422,
710
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_oGb9mspeGbYS2gfbkknskN.eval"
711
+ },
712
+ "hellaswag": {
713
+ "accuracy": 0.8750248954391555,
714
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-35-34-04-00_hellaswag_2SAz3cvMpDxFaApdHDR3s4.eval"
715
+ },
716
+ "ifeval": {
717
+ "final_acc": 0.8419061423689144,
718
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-15-06-04-00_ifeval_nYs9KujQMQjcpbpbLtVx8G.eval"
719
+ },
720
+ "mmmu_multiple_choice": {
721
+ "accuracy": 0.5395513577331759,
722
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2025-01-20T23-10-01-05-00_mmmu-multiple-choice_c5rLkrXkV83udX6DVJui5F.eval"
723
+ },
724
+ "arc_easy": {
725
+ "accuracy": 0.9793771043771043,
726
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-29-56-04-00_arc-easy_XcEzqqPqJsRV29NqYDfnNo.eval"
727
+ },
728
+ "winogrande": {
729
+ "accuracy": 0.7529597474348856,
730
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-31T00-59-07-04-00_winogrande_Ci55vHvbGGW38zVpMCwtWa.eval"
731
+ },
732
+ "mmlu_pro": {
733
+ "accuracy": 0.6396276595744681,
734
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T22-43-30-04-00_mmlu-pro_Dc2uu3EV7MJtjg6gg5Y9qH.eval"
735
+ },
736
+ "math": {
737
+ "accuracy": 0.633,
738
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-15-49-04-00_math_YsWdRzpqMq2dqQ9SPKfack.eval"
739
+ },
740
+ "gsm8k": {
741
+ "accuracy": 0.9181197877179682,
742
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-32-39-04-00_gsm8k_nLSssETKDDWNktAFWnVwfv.eval"
743
+ },
744
+ "gaia": {
745
+ "accuracy": null,
746
+ "log_url": null
747
+ },
748
+ "gdm_intercode_ctf": {
749
+ "accuracy": null,
750
+ "log_url": null
751
+ },
752
+ "gdm_in_house_ctf": {
753
+ "accuracy": null,
754
+ "log_url": null
755
+ },
756
+ "agentharm": {
757
+ "avg_score": null,
758
+ "log_url": null
759
+ },
760
+ "agentharm_benign": {
761
+ "avg_score": null,
762
+ "log_url": null
763
+ },
764
+ "swe_bench": {
765
+ "mean": null,
766
+ "log_url": null
767
+ }
768
+ }
769
+ },
770
+ "o1": {
771
+ "config": {
772
+ "model_name": "o1",
773
+ "model_sha": "https://openai.com/o1",
774
+ "model_dtype": "torch.float16"
775
+ },
776
+ "results": {
777
+ "winogrande": {
778
+ "accuracy": 0.9392265193370166,
779
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.eval"
780
+ },
781
+ "humaneval": {
782
+ "mean": 0.9695121951219512,
783
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.eval"
784
+ },
785
+ "mmmu_open": {
786
+ "accuracy": 0.6981132075471698,
787
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T22-48-09-05-00_mmmu-open_oBzxJBYbvnktbbAwhoCrYK.eval"
788
+ },
789
+ "math": {
790
+ "accuracy": 0.959,
791
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T15-03-22-05-00_math_6BbvHFF8hLMsVYozyNLbyQ.eval"
792
+ },
793
+ "arc_easy": {
794
+ "accuracy": 0.9911616161616161,
795
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-29-26-05-00_arc-easy_DFbir4BdgQDbKd52r7tRKR.eval"
796
+ },
797
+ "arc_challenge": {
798
+ "accuracy": 0.9786689419795221,
799
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-44-42-05-00_arc-challenge_PsWXaBqrgv3EcTZC55gRzJ.eval"
800
+ },
801
+ "gsm8k": {
802
+ "accuracy": 0.9416224412433661,
803
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T12-56-38-05-00_gsm8k_iD8275qeyNTgX523pn45bF.eval"
804
+ },
805
+ "gpqa_diamond": {
806
+ "accuracy": 0.7550505050505051,
807
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-53-53-05-00_gpqa-diamond_EJV7ULFSQLRoFTEqsv3t6q.eval"
808
+ },
809
+ "mmlu_pro": {
810
+ "accuracy": 0.8447473404255319,
811
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.eval"
812
+ },
813
+ "mmmu_multiple_choice": {
814
+ "accuracy": 0.8063754427390791,
815
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.eval"
816
+ },
817
+ "drop": {
818
+ "mean": null,
819
+ "log_url": null
820
+ },
821
+ "hellaswag": {
822
+ "accuracy": null,
823
+ "log_url": null
824
+ },
825
+ "ifeval": {
826
+ "final_acc": null,
827
+ "log_url": null
828
+ },
829
+ "mmlu": {
830
+ "accuracy": null,
831
+ "log_url": null
832
+ },
833
+ "gaia": {
834
+ "accuracy": 0.41090909090909084,
835
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T13-42-00-05-00_o1_gaia_merged.eval"
836
+ },
837
+ "gdm_intercode_ctf": {
838
+ "accuracy": 0.8481012658227849,
839
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T20-46-35+00-00_o1_gdm-intercode-ctf_merged.eval"
840
+ },
841
+ "gdm_in_house_ctf": {
842
+ "accuracy": 0.46153846153846156,
843
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T05-52-25+00-00_o1_gdm-in-house-ctf.eval"
844
+ },
845
+ "agentharm": {
846
+ "avg_score": 0.08782061688311688,
847
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T09-05-42-08-00_agentharm_UGDq2yJeLAnPH6p7FgDgD8.eval"
848
+ },
849
+ "agentharm_benign": {
850
+ "avg_score": 0.7235176849665487,
851
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T18-20-15-08-00_agentharm-benign_bkW2Bf5xLyDQdNtfLdjCpJ.eval"
852
+ },
853
+ "swe_bench": {
854
+ "mean": 0.0036,
855
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T17-42-11+00-00_openai-o1_swe.eval "
856
+ }
857
+ }
+ },
+ "o3-mini": {
+ "config": {
+ "model_name": "o3-mini",
+ "model_sha": "https://openai.com/index/openai-o3-mini",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "math": {
+ "accuracy": 0.9691320905993185,
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T18-33-30-05-00_math_86Gx8n4BxhpyfaSHmRcCUm.eval"
+ },
+ "humaneval": {
+ "mean": 0.9817073170731707,
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T20-58-48-05-00_humaneval_Dkod7CS9RmbbogYx9aEXtx.eval"
+ },
+ "mmlu_pro": {
+ "accuracy": 0.7924606807023383,
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T19-49-27-05-00_mmlu-pro_jz9woKfdKt8VMzqNFsy7kY.eval"
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.7365319865319865,
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-57-54-05-00_gpqa-diamond_2znyMtdc7X4LJufxXeXA8Z.eval"
+ },
+ "winogrande": {
+ "accuracy": 0.8492501973164956,
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T22-50-40-05-00_winogrande_VsTW2uU2Kj66YoNoFfRfUj.eval"
+ },
+ "gsm8k": {
+ "accuracy": 0.9454131918119788,
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T18-23-05-05-00_gsm8k_d523pJzkcvobxamhhobCRb.eval"
+ },
+ "arc_challenge": {
+ "accuracy": 0.9641638225255973,
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-53-30-05-00_arc-challenge_AYFHec7wmd4jELF2Rgzfya.eval"
+ },
+ "arc_easy": {
+ "accuracy": 0.9755892255892256,
+ "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-45-57-05-00_arc-easy_Nd8NP3K48tvwLVZb8kXDwg.eval"
+ },
+ "drop": {
+ "mean": null,
+ "log_url": null
+ },
+ "hellaswag": {
+ "accuracy": null,
+ "log_url": null
+ },
+ "ifeval": {
+ "final_acc": null,
+ "log_url": null
+ },
+ "mmlu": {
+ "accuracy": null,
+ "log_url": null
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": null,
+ "log_url": null
+ },
+ "mmmu_open": {
+ "accuracy": null,
+ "log_url": null
+ },
+ "gaia": {
+ "accuracy": 0.27030303030303043,
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-02-05T23-21-20+00-00_gaia_hyMq8MzMm6NgAeq3dNqZSU.eval"
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": 0.8278481012658225,
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-05T21-43-18+00-00_gdm-intercode-ctf_gdm29C6DuTEsX9qm9ymmrC.eval"
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": 0.38461538461538464,
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-05T23-59-08+00-00_gdm-in-house-ctf_2zkAX5nkJoxDnVKpJL9VgW.eval"
+ },
+ "agentharm": {
+ "avg_score": 0.1241931080283353,
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T18-17-03-08-00_agentharm_DmN6i5HrgXHNARjsuSewjg.eval"
+ },
+ "agentharm_benign": {
+ "avg_score": 0.5429306867375049,
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T18-49-08-08-00_agentharm-benign_Gv94YFpAXaaCJqe3Fc6yr3.eval"
+ },
+ "swe_bench": {
+ "mean": 0.0024,
+ "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T06-49-09+00-00_openai-o3-mini_swe.eval"
+ }
+ }
+ }
+ }
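Each entry in data/results.json above is keyed by model name and carries a config block plus per-benchmark results, where every benchmark stores exactly one metric key ("accuracy", "mean", "final_acc", or "avg_score") and, as of this commit, a log_url pointing at the Inspect eval transcript. A minimal, hypothetical sketch of consuming the file — the path and the metric-key heuristic are assumptions for illustration, not part of this commit:

    import json

    # Assumed checkout-relative path for the results file added above.
    with open("data/results.json") as f:
        results = json.load(f)

    for model_name, entry in results.items():
        for benchmark, scores in entry["results"].items():
            # Each benchmark dict holds one metric key plus an optional "log_url".
            metric = next(k for k in scores if k != "log_url")
            if scores[metric] is not None:
                print(f"{model_name:>30}  {benchmark:<22} {metric}={scores[metric]:.4f}")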
data/results.json.bak ADDED
@@ -0,0 +1,760 @@
+ {
+ "DeepSeek-R1": {
+ "config": {
+ "model_name": "DeepSeek-R1",
+ "model_sha": "https://api-docs.deepseek.com/news/news250120",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "mmlu_pro": {
+ "accuracy": 0.8382646276595744
+ },
+ "humaneval": {
+ "mean": 0.9567901234567902
+ },
+ "math": {
+ "accuracy": 0.9272
+ },
+ "gsm8k": {
+ "accuracy": 0.954510993176649
+ },
+ "arc_challenge": {
+ "accuracy": 0.9667235494880546
+ },
+ "winogrande": {
+ "accuracy": 0.9179163378058406
+ },
+ "arc_easy": {
+ "accuracy": 0.9873737373737373
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.7045454545454546
+ },
+ "drop": {
+ "mean": null
+ },
+ "hellaswag": {
+ "accuracy": null
+ },
+ "ifeval": {
+ "final_acc": null
+ },
+ "mmlu": {
+ "accuracy": null
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": null
+ },
+ "mmmu_open": {
+ "accuracy": null
+ },
+ "gaia": {
+ "accuracy": null
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": null
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": null
+ },
+ "agentharm": {
+ "avg_score": null
+ },
+ "agentharm_benign": {
+ "avg_score": null
+ },
+ "swe_bench": {
+ "mean": null
+ }
+ }
+ },
+ "Meta-Llama-3.1-70B-Instruct": {
+ "config": {
+ "model_name": "Meta-Llama-3.1-70B-Instruct",
+ "model_sha": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "hellaswag": {
+ "accuracy": 0.869946225851424
+ },
+ "drop": {
+ "mean": 0.8811263765076035
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.4318181818181818
+ },
+ "winogrande": {
+ "accuracy": 0.8666140489344909
+ },
+ "gsm8k": {
+ "accuracy": 0.9469294920394238
+ },
+ "math": {
+ "accuracy": 0.6004
+ },
+ "ifeval": {
+ "final_acc": 0.8604907201780166
+ },
+ "arc_challenge": {
+ "accuracy": 0.9445392491467577
+ },
+ "arc_easy": {
+ "accuracy": 0.9823232323232324
+ },
+ "mmlu_pro": {
+ "accuracy": 0.6688829787234043
+ },
+ "humaneval": {
+ "mean": 0.7865853658536586
+ },
+ "mmlu": {
+ "accuracy": 0.8033755875231449
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": null
+ },
+ "mmmu_open": {
+ "accuracy": null
+ },
+ "gaia": {
+ "accuracy": null
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": null
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": null
+ },
+ "agentharm": {
+ "avg_score": null
+ },
+ "agentharm_benign": {
+ "avg_score": null
+ },
+ "swe_bench": {
+ "mean": null
+ }
+ }
+ },
+ "Mistral-Large-Instruct-2407": {
+ "config": {
+ "model_name": "Mistral-Large-Instruct-2407",
+ "model_sha": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "drop": {
+ "mean": 0.7424257996853698
+ },
+ "ifeval": {
+ "final_acc": 0.8285172231900246
+ },
+ "mmlu": {
+ "accuracy": 0.8035892323030908
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.4734848484848485
+ },
+ "gsm8k": {
+ "accuracy": 0.9378316906747536
+ },
+ "math": {
+ "accuracy": 0.6574
+ },
+ "arc_easy": {
+ "accuracy": 0.9852693602693603
+ },
+ "mmlu_pro": {
+ "accuracy": 0.6942320478723404
+ },
+ "humaneval": {
+ "mean": 0.8658536585365854
+ },
+ "hellaswag": {
+ "accuracy": 0.9047998406691894
+ },
+ "arc_challenge": {
+ "accuracy": 0.9436860068259386
+ },
+ "winogrande": {
+ "accuracy": 0.8547750591949487
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": null
+ },
+ "mmmu_open": {
+ "accuracy": null
+ },
+ "gaia": {
+ "accuracy": null
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": null
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": null
+ },
+ "agentharm": {
+ "avg_score": null
+ },
+ "agentharm_benign": {
+ "avg_score": null
+ },
+ "swe_bench": {
+ "mean": null
+ }
+ }
+ },
+ "c4ai-command-r-plus": {
+ "config": {
+ "model_name": "c4ai-command-r-plus",
+ "model_sha": "https://huggingface.co/CohereForAI/c4ai-command-r-plus"
+ },
+ "results": {
+ "ifeval": {
+ "final_acc": 0.7779591483929307
+ },
+ "winogrande": {
+ "accuracy": 0.7490134175217048
+ },
+ "arc_challenge": {
+ "accuracy": 0.8506825938566553
+ },
+ "drop": {
+ "mean": 0.743557420031463
+ },
+ "math": {
+ "accuracy": 0.2626
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.3194444444444444
+ },
+ "mmlu_pro": {
+ "accuracy": 0.441156914893617
+ },
+ "humaneval": {
+ "mean": 0.6219512195121951
+ },
+ "gsm8k": {
+ "accuracy": 0.7816527672479151
+ },
+ "hellaswag": {
+ "accuracy": 0.7954590718980283
+ },
+ "mmlu": {
+ "accuracy": 0.695128899017234
+ },
+ "arc_easy": {
+ "accuracy": 0.9377104377104377
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": null
+ },
+ "mmmu_open": {
+ "accuracy": null
+ },
+ "gaia": {
+ "accuracy": null
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": null
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": null
+ },
+ "agentharm": {
+ "avg_score": null
+ },
+ "agentharm_benign": {
+ "avg_score": null
+ },
+ "swe_bench": {
+ "mean": null
+ }
+ }
+ },
+ "claude-3-5-sonnet-20241022": {
+ "config": {
+ "model_name": "claude-3-5-sonnet-20241022",
+ "model_sha": "https://www.anthropic.com/claude/sonnet",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "mmmu_multiple_choice": {
+ "accuracy": 0.6481700118063755
+ },
+ "mmlu_pro": {
+ "accuracy": 0.7762632978723404
+ },
+ "hellaswag": {
+ "accuracy": 0.9228241386178052
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.6098484848484849
+ },
+ "gsm8k": {
+ "accuracy": 0.9620924943138741
+ },
+ "mmmu_open": {
+ "accuracy": 0.41509433962264153
+ },
+ "arc_easy": {
+ "accuracy": 0.9915824915824916
+ },
+ "arc_challenge": {
+ "accuracy": 0.9692832764505119
+ },
+ "mmlu": {
+ "accuracy": 0.8665432274604757
+ },
+ "math": {
+ "accuracy": 0.7942
+ },
+ "ifeval": {
+ "final_acc": 0.8958114469607309
+ },
+ "humaneval": {
+ "mean": 0.9451219512195121
+ },
+ "winogrande": {
+ "accuracy": 0.9021310181531176
+ },
+ "drop": {
+ "mean": 0.8977608809648663
+ },
+ "gaia": {
+ "accuracy": 0.3381818181818182
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": 0.8556962025316455
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": 0.6153846153846154
+ },
+ "agentharm": {
+ "avg_score": 0.14767992424242424
+ },
+ "agentharm_benign": {
+ "avg_score": 0.800704570051161
+ },
+ "swe_bench": {
+ "mean": 0.0672
+ }
+ }
+ },
+ "gemini-1.5-flash": {
+ "config": {
+ "model_name": "gemini-1.5-flash",
+ "model_sha": "https://deepmind.google/technologies/gemini/flash",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "gpqa_diamond": {
+ "accuracy": 0.40404040404040403
+ },
+ "arc_challenge": {
+ "accuracy": 0.9308873720136519
+ },
+ "math": {
+ "accuracy": 0.452
+ },
+ "mmmu_open": {
+ "accuracy": 0.16981132075471697
+ },
+ "drop": {
+ "mean": 0.751044572627163
+ },
+ "mmlu_pro": {
+ "accuracy": 0.5993184840425532
+ },
+ "ifeval": {
+ "final_acc": 0.7681296737102001
+ },
+ "hellaswag": {
+ "accuracy": 0.8557060346544513
+ },
+ "winogrande": {
+ "accuracy": 0.7884767166535123
+ },
+ "humaneval": {
+ "mean": 0.7439024390243902
+ },
+ "arc_easy": {
+ "accuracy": 0.984006734006734
+ },
+ "gsm8k": {
+ "accuracy": 0.8582259287338894
+ },
+ "mmlu": {
+ "accuracy": 0.7714713003845606
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": 0.5702479338842975
+ },
+ "gaia": {
+ "accuracy": null
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": null
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": null
+ },
+ "agentharm": {
+ "avg_score": null
+ },
+ "agentharm_benign": {
+ "avg_score": null
+ },
+ "swe_bench": {
+ "mean": null
+ }
+ }
+ },
+ "gemini-1.5-pro": {
+ "config": {
+ "model_name": "gemini-1.5-pro",
+ "model_sha": "https://deepmind.google/technologies/gemini/pro",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "mmlu": {
+ "accuracy": 0.8467454778521578
+ },
+ "humaneval": {
+ "mean": 0.8719512195121951
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": 0.6304604486422668
+ },
+ "mmlu_pro": {
+ "accuracy": 0.7563996010638298
+ },
+ "math": {
+ "accuracy": 0.852
+ },
+ "arc_easy": {
+ "accuracy": 0.9877946127946128
+ },
+ "mmmu_open": {
+ "accuracy": 0.3584905660377358
+ },
+ "gsm8k": {
+ "accuracy": 0.9613343442001516
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.5782828282828283
+ },
+ "ifeval": {
+ "final_acc": 0.8982344623377084
+ },
+ "winogrande": {
+ "accuracy": 0.8768745067087609
+ },
+ "arc_challenge": {
+ "accuracy": 0.9633105802047781
+ },
+ "drop": {
+ "mean": 0.8800912427897221
+ },
+ "hellaswag": {
+ "accuracy": 0.9123680541724756
+ },
+ "gaia": {
+ "accuracy": 0.13818181818181818
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": 0.5291139240506328
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": 0.23076923076923078
+ },
+ "agentharm": {
+ "avg_score": 0.2898649645808737
+ },
+ "agentharm_benign": {
+ "avg_score": 0.5961489079102715
+ },
+ "swe_bench": {
+ "mean": 0.004
+ }
+ }
+ },
+ "gpt-4o": {
+ "config": {
+ "model_name": "gpt-4o",
+ "model_sha": "https://openai.com/index/hello-gpt-4o",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "gpqa_diamond": {
+ "accuracy": 0.51010101010101
+ },
+ "arc_challenge": {
+ "accuracy": 0.9633105802047781
+ },
+ "gsm8k": {
+ "accuracy": 0.9446550416982562
+ },
+ "mmlu": {
+ "accuracy": 0.8435408061529697
+ },
+ "ifeval": {
+ "final_acc": 0.8780386042367585
+ },
+ "mmlu_pro": {
+ "accuracy": 0.7450964095744681
+ },
+ "mmmu_open": {
+ "accuracy": 0.3584905660377358
+ },
+ "winogrande": {
+ "accuracy": 0.9013417521704814
+ },
+ "drop": {
+ "mean": 0.7511693759832198
+ },
+ "arc_easy": {
+ "accuracy": 0.9915824915824916
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": 0.5903187721369539
+ },
+ "humaneval": {
+ "mean": 0.9085365853658537
+ },
+ "math": {
+ "accuracy": 0.7054
+ },
+ "hellaswag": {
+ "accuracy": 0.924317864967138
+ },
+ "gaia": {
+ "accuracy": 0.16606060606060608
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": 0.6379746835443038
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": 0.23076923076923078
+ },
+ "agentharm": {
+ "avg_score": 0.49953844451003543
+ },
+ "agentharm_benign": {
+ "avg_score": 0.8249433048012594
+ },
+ "swe_bench": {
+ "mean": 0.012
+ }
+ }
+ },
+ "gpt-4o-mini": {
+ "config": {
+ "model_name": "gpt-4o-mini",
+ "model_sha": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "drop": {
+ "mean": 0.8065915049816466
+ },
+ "humaneval": {
+ "mean": 0.8597560975609756
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.3838383838383838
+ },
+ "mmmu_open": {
+ "accuracy": 0.18867924528301888
+ },
+ "arc_challenge": {
+ "accuracy": 0.9249146757679181
+ },
+ "mmlu": {
+ "accuracy": 0.7698333570716422
+ },
+ "hellaswag": {
+ "accuracy": 0.8750248954391555
+ },
+ "ifeval": {
+ "final_acc": 0.8419061423689144
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": 0.5395513577331759
+ },
+ "arc_easy": {
+ "accuracy": 0.9793771043771043
+ },
+ "winogrande": {
+ "accuracy": 0.7529597474348856
+ },
+ "mmlu_pro": {
+ "accuracy": 0.6396276595744681
+ },
+ "math": {
+ "accuracy": 0.633
+ },
+ "gsm8k": {
+ "accuracy": 0.9181197877179682
+ },
+ "gaia": {
+ "accuracy": null
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": null
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": null
+ },
+ "agentharm": {
+ "avg_score": null
+ },
+ "agentharm_benign": {
+ "avg_score": null
+ },
+ "swe_bench": {
+ "mean": null
+ }
+ }
+ },
+ "o1": {
+ "config": {
+ "model_name": "o1",
+ "model_sha": "https://openai.com/o1",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "winogrande": {
+ "accuracy": 0.9392265193370166
+ },
+ "humaneval": {
+ "mean": 0.9695121951219512
+ },
+ "mmmu_open": {
+ "accuracy": 0.6981132075471698
+ },
+ "math": {
+ "accuracy": 0.959
+ },
+ "arc_easy": {
+ "accuracy": 0.9911616161616161
+ },
+ "arc_challenge": {
+ "accuracy": 0.9786689419795221
+ },
+ "gsm8k": {
+ "accuracy": 0.9416224412433661
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.7550505050505051
+ },
+ "mmlu_pro": {
+ "accuracy": 0.8447473404255319
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": 0.8063754427390791
+ },
+ "drop": {
+ "mean": null
+ },
+ "hellaswag": {
+ "accuracy": null
+ },
+ "ifeval": {
+ "final_acc": null
+ },
+ "mmlu": {
+ "accuracy": null
+ },
+ "gaia": {
+ "accuracy": 0.41090909090909084
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": 0.8481012658227849
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": 0.46153846153846156
+ },
+ "agentharm": {
+ "avg_score": 0.08782061688311688
+ },
+ "agentharm_benign": {
+ "avg_score": 0.7235176849665487
+ },
+ "swe_bench": {
+ "mean": 0.0036
+ }
+ }
+ },
+ "o3-mini": {
+ "config": {
+ "model_name": "o3-mini",
+ "model_sha": "https://openai.com/index/openai-o3-mini",
+ "model_dtype": "torch.float16"
+ },
+ "results": {
+ "math": {
+ "accuracy": 0.9691320905993185
+ },
+ "humaneval": {
+ "mean": 0.9817073170731707
+ },
+ "mmlu_pro": {
+ "accuracy": 0.7924606807023383
+ },
+ "gpqa_diamond": {
+ "accuracy": 0.7365319865319865
+ },
+ "winogrande": {
+ "accuracy": 0.8492501973164956
+ },
+ "gsm8k": {
+ "accuracy": 0.9454131918119788
+ },
+ "arc_challenge": {
+ "accuracy": 0.9641638225255973
+ },
+ "arc_easy": {
+ "accuracy": 0.9755892255892256
+ },
+ "drop": {
+ "mean": null
+ },
+ "hellaswag": {
+ "accuracy": null
+ },
+ "ifeval": {
+ "final_acc": null
+ },
+ "mmlu": {
+ "accuracy": null
+ },
+ "mmmu_multiple_choice": {
+ "accuracy": null
+ },
+ "mmmu_open": {
+ "accuracy": null
+ },
+ "gaia": {
+ "accuracy": 0.27030303030303043
+ },
+ "gdm_intercode_ctf": {
+ "accuracy": 0.8278481012658225
+ },
+ "gdm_in_house_ctf": {
+ "accuracy": 0.38461538461538464
+ },
+ "agentharm": {
+ "avg_score": 0.1241931080283353
+ },
+ "agentharm_benign": {
+ "avg_score": 0.5429306867375049
+ },
+ "swe_bench": {
+ "mean": 0.0024
+ }
+ }
+ }
+ }
data/tasks.json ADDED
@@ -0,0 +1,142 @@
+ {
+ "arc_easy": {
+ "benchmark": "arc_easy",
+ "metric": "accuracy",
+ "display_name": "ARC-Easy",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
+ },
+ "arc_challenge": {
+ "benchmark": "arc_challenge",
+ "metric": "accuracy",
+ "display_name": "ARC-Challenge",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
+ },
+ "drop": {
+ "benchmark": "drop",
+ "metric": "mean",
+ "display_name": "DROP",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop"
+ },
+ "winogrande": {
+ "benchmark": "winogrande",
+ "metric": "accuracy",
+ "display_name": "WinoGrande",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande"
+ },
+ "gsm8k": {
+ "benchmark": "gsm8k",
+ "metric": "accuracy",
+ "display_name": "GSM8K",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k"
+ },
+ "hellaswag": {
+ "benchmark": "hellaswag",
+ "metric": "accuracy",
+ "display_name": "HellaSwag",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag"
+ },
+ "humaneval": {
+ "benchmark": "humaneval",
+ "metric": "mean",
+ "display_name": "HumanEval",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval"
+ },
+ "ifeval": {
+ "benchmark": "ifeval",
+ "metric": "final_acc",
+ "display_name": "IFEval",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval"
+ },
+ "math": {
+ "benchmark": "math",
+ "metric": "accuracy",
+ "display_name": "MATH",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics"
+ },
+ "mmlu": {
+ "benchmark": "mmlu",
+ "metric": "accuracy",
+ "display_name": "MMLU",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu"
+ },
+ "mmlu_pro": {
+ "benchmark": "mmlu_pro",
+ "metric": "accuracy",
+ "display_name": "MMLU-Pro",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro"
+ },
+ "gpqa_diamond": {
+ "benchmark": "gpqa_diamond",
+ "metric": "accuracy",
+ "display_name": "GPQA-Diamond",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
+ },
+ "mmmu_multiple_choice": {
+ "benchmark": "mmmu_multiple_choice",
+ "metric": "accuracy",
+ "display_name": "MMMU-Multiple-Choice",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
+ },
+ "mmmu_open": {
+ "benchmark": "mmmu_open",
+ "metric": "accuracy",
+ "display_name": "MMMU-Open-Ended",
+ "type": "base",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
+ },
+ "gaia": {
+ "benchmark": "gaia",
+ "metric": "accuracy",
+ "display_name": "GAIA",
+ "type": "agentic",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
+ },
+ "gdm_intercode_ctf": {
+ "benchmark": "gdm_intercode_ctf",
+ "metric": "accuracy",
+ "display_name": "InterCode-CTF",
+ "type": "agentic",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf"
+ },
+ "gdm_in_house_ctf": {
+ "benchmark": "gdm_in_house_ctf",
+ "metric": "accuracy",
+ "display_name": "In-House-CTF",
+ "type": "agentic",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf"
+ },
+ "agentharm": {
+ "benchmark": "agentharm",
+ "metric": "avg_score",
+ "display_name": "AgentHarm",
+ "type": "agentic",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
+ },
+ "agentharm_benign": {
+ "benchmark": "agentharm_benign",
+ "metric": "avg_score",
+ "display_name": "AgentHarm-Benign",
+ "type": "agentic",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
+ },
+ "swe_bench": {
+ "benchmark": "swe_bench",
+ "metric": "mean",
+ "display_name": "SWE-Bench",
+ "type": "agentic",
+ "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench"
+ }
+ }
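data/tasks.json mirrors the Task definitions in src/about.py: each benchmark records which metric key to read from results.json, a display name, whether it belongs to the base or agentic leaderboard, and a source link. A hedged sketch of how the "type" field might split benchmarks into the two tabs (the file path is an assumption):

    import json

    with open("data/tasks.json") as f:
        tasks = json.load(f)

    # Group display names by leaderboard type ("base" vs. "agentic").
    tabs = {}
    for task in tasks.values():
        tabs.setdefault(task["type"], []).append(task["display_name"])
    print(tabs["base"])
    print(tabs["agentic"])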
src/about.py CHANGED
@@ -33,18 +33,13 @@ class Tasks(Enum):
 
     # agentic
     task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
-    task15 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
-    task16 = Task("gdm_in_house_ctf", "accuracy", "GDM-In-House-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf")
+    task15 = Task("gdm_intercode_ctf", "accuracy", "InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
+    task16 = Task("gdm_in_house_ctf", "accuracy", "In-House-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf")
     task17 = Task("agentharm", "avg_score", "AgentHarm", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
     task18 = Task("agentharm_benign", "avg_score", "AgentHarm-Benign", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
     task19 = Task("swe_bench", "mean", "SWE-Bench", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench")
 
 
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
-
-
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">State of Evaluation Leaderboard</h1>"""
 
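For context, the Task(...) calls above pass five positional fields whose names line up with the keys in data/tasks.json. A plausible reconstruction of the tuple's shape — the field names are inferred from that file, not shown in this diff:

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class Task:
        benchmark: str      # e.g. "gdm_intercode_ctf"
        metric: str         # key to read from results.json, e.g. "accuracy"
        display_name: str   # column header, e.g. "InterCode-CTF"
        type: str           # "base" or "agentic"
        source: str         # link to the inspect_evals implementation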
src/display/formatting.py CHANGED
@@ -5,6 +5,8 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name, model_sha):
     return model_hyperlink(model_sha, model_name)
 
+def make_clickable_field(name, url):
+    return model_hyperlink(url, name)
 
 def styled_error(error):
     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
src/submission/submit.py CHANGED
@@ -1,119 +1,119 @@
- import json
- import os
- from datetime import datetime, timezone
-
- from src.display.formatting import styled_error, styled_message, styled_warning
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
- from src.submission.check_validity import (
-     already_submitted_models,
-     check_model_card,
-     get_model_size,
-     is_model_on_hub,
- )
-
- REQUESTED_MODELS = None
- USERS_TO_SUBMISSION_DATES = None
-
- def add_new_eval(
-     model: str,
-     base_model: str,
-     revision: str,
-     precision: str,
-     weight_type: str,
-     model_type: str,
- ):
-     global REQUESTED_MODELS
-     global USERS_TO_SUBMISSION_DATES
-     if not REQUESTED_MODELS:
-         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-
-     user_name = ""
-     model_path = model
-     if "/" in model:
-         user_name = model.split("/")[0]
-         model_path = model.split("/")[1]
-
-     precision = precision.split(" ")[0]
-     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-     if model_type is None or model_type == "":
-         return styled_error("Please select a model type.")
-
-     # Does the model actually exist?
-     if revision == "":
-         revision = "main"
-
-     # Is the model on the hub?
-     if weight_type in ["Delta", "Adapter"]:
-         base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-         if not base_model_on_hub:
-             return styled_error(f'Base model "{base_model}" {error}')
-
-     if not weight_type == "Adapter":
-         model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-         if not model_on_hub:
-             return styled_error(f'Model "{model}" {error}')
-
-     # Is the model info correctly filled?
-     try:
-         model_info = API.model_info(repo_id=model, revision=revision)
-     except Exception:
-         return styled_error("Could not get your model information. Please fill it up properly.")
-
-     model_size = get_model_size(model_info=model_info, precision=precision)
-
-     # Were the model card and license filled?
-     try:
-         license = model_info.cardData["license"]
-     except Exception:
-         return styled_error("Please select a license for your model")
-
-     modelcard_OK, error_msg = check_model_card(model)
-     if not modelcard_OK:
-         return styled_error(error_msg)
-
-     # Seems good, creating the eval
-     print("Adding new eval")
-
-     eval_entry = {
-         "model": model,
-         "base_model": base_model,
-         "revision": revision,
-         "precision": precision,
-         "weight_type": weight_type,
-         "status": "PENDING",
-         "submitted_time": current_time,
-         "model_type": model_type,
-         "likes": model_info.likes,
-         "params": model_size,
-         "license": license,
-         "private": False,
-     }
-
-     # Check for duplicate submission
-     if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-         return styled_warning("This model has been already submitted.")
-
-     print("Creating eval file")
-     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-     os.makedirs(OUT_DIR, exist_ok=True)
-     out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-
-     with open(out_path, "w") as f:
-         f.write(json.dumps(eval_entry))
-
-     print("Uploading eval file")
-     API.upload_file(
-         path_or_fileobj=out_path,
-         path_in_repo=out_path.split("eval-queue/")[1],
-         repo_id=QUEUE_REPO,
-         repo_type="dataset",
-         commit_message=f"Add {model} to eval queue",
-     )
-
-     # Remove the local file
-     os.remove(out_path)
-
-     return styled_message(
-         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-     )
+ # import json
+ # import os
+ # from datetime import datetime, timezone
+
+ # from src.display.formatting import styled_error, styled_message, styled_warning
+ # from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+ # from src.submission.check_validity import (
+ #     already_submitted_models,
+ #     check_model_card,
+ #     get_model_size,
+ #     is_model_on_hub,
+ # )
+
+ # REQUESTED_MODELS = None
+ # USERS_TO_SUBMISSION_DATES = None
+
+ # def add_new_eval(
+ #     model: str,
+ #     base_model: str,
+ #     revision: str,
+ #     precision: str,
+ #     weight_type: str,
+ #     model_type: str,
+ # ):
+ #     global REQUESTED_MODELS
+ #     global USERS_TO_SUBMISSION_DATES
+ #     if not REQUESTED_MODELS:
+ #         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
+
+ #     user_name = ""
+ #     model_path = model
+ #     if "/" in model:
+ #         user_name = model.split("/")[0]
+ #         model_path = model.split("/")[1]
+
+ #     precision = precision.split(" ")[0]
+ #     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+ #     if model_type is None or model_type == "":
+ #         return styled_error("Please select a model type.")
+
+ #     # Does the model actually exist?
+ #     if revision == "":
+ #         revision = "main"
+
+ #     # Is the model on the hub?
+ #     if weight_type in ["Delta", "Adapter"]:
+ #         base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+ #         if not base_model_on_hub:
+ #             return styled_error(f'Base model "{base_model}" {error}')
+
+ #     if not weight_type == "Adapter":
+ #         model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
+ #         if not model_on_hub:
+ #             return styled_error(f'Model "{model}" {error}')
+
+ #     # Is the model info correctly filled?
+ #     try:
+ #         model_info = API.model_info(repo_id=model, revision=revision)
+ #     except Exception:
+ #         return styled_error("Could not get your model information. Please fill it up properly.")
+
+ #     model_size = get_model_size(model_info=model_info, precision=precision)
+
+ #     # Were the model card and license filled?
+ #     try:
+ #         license = model_info.cardData["license"]
+ #     except Exception:
+ #         return styled_error("Please select a license for your model")
+
+ #     modelcard_OK, error_msg = check_model_card(model)
+ #     if not modelcard_OK:
+ #         return styled_error(error_msg)
+
+ #     # Seems good, creating the eval
+ #     print("Adding new eval")
+
+ #     eval_entry = {
+ #         "model": model,
+ #         "base_model": base_model,
+ #         "revision": revision,
+ #         "precision": precision,
+ #         "weight_type": weight_type,
+ #         "status": "PENDING",
+ #         "submitted_time": current_time,
+ #         "model_type": model_type,
+ #         "likes": model_info.likes,
+ #         "params": model_size,
+ #         "license": license,
+ #         "private": False,
+ #     }
+
+ #     # Check for duplicate submission
+ #     if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
+ #         return styled_warning("This model has been already submitted.")
+
+ #     print("Creating eval file")
+ #     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+ #     os.makedirs(OUT_DIR, exist_ok=True)
+ #     out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+
+ #     with open(out_path, "w") as f:
+ #         f.write(json.dumps(eval_entry))
+
+ #     print("Uploading eval file")
+ #     API.upload_file(
+ #         path_or_fileobj=out_path,
+ #         path_in_repo=out_path.split("eval-queue/")[1],
+ #         repo_id=QUEUE_REPO,
+ #         repo_type="dataset",
+ #         commit_message=f"Add {model} to eval queue",
+ #     )
+
+ #     # Remove the local file
+ #     os.remove(out_path)
+
+ #     return styled_message(
+ #         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
+ #     )