Add data
Browse files- app.py +130 -68
- inspect_log_file_names.json → data/inspect_log_file_names.json +0 -0
- data/populate_results.py +41 -0
- data/results.json +948 -0
- data/results.json.bak +760 -0
- data/tasks.json +142 -0
- src/about.py +2 -7
- src/display/formatting.py +2 -0
- src/submission/submit.py +119 -119
app.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
-
from apscheduler.schedulers.background import BackgroundScheduler
|
3 |
-
from huggingface_hub import snapshot_download
|
4 |
import pandas as pd
|
|
|
5 |
|
6 |
from src.about import (
|
7 |
REPRODUCIBILITY_TEXT,
|
@@ -10,64 +9,65 @@ from src.about import (
|
|
10 |
TITLE,
|
11 |
)
|
12 |
from src.display.css_html_js import custom_css, custom_js
|
13 |
-
from src.display.utils import (
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
)
|
21 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
22 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
|
23 |
-
from src.submission.submit import add_new_eval
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
|
72 |
# styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
|
73 |
# df.style.set_table_styles([
|
@@ -85,13 +85,78 @@ def init_leaderboard(df, benchmark_type):
|
|
85 |
# styled_df = df.style.set_tooltips(tooltips)
|
86 |
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
return gr.components.Dataframe(
|
89 |
-
value=
|
90 |
-
datatype=[
|
91 |
-
column_widths=["
|
92 |
wrap=False,
|
93 |
)
|
94 |
|
|
|
95 |
black_logo_path = "src/assets/logo-icon-black.png"
|
96 |
white_logo_path = "src/assets/logo-icon-white.png"
|
97 |
|
@@ -123,10 +188,10 @@ with demo:
|
|
123 |
|
124 |
with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
|
125 |
with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
|
126 |
-
|
127 |
|
128 |
with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
|
129 |
-
|
130 |
|
131 |
with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
|
132 |
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)
|
@@ -135,8 +200,5 @@ with demo:
|
|
135 |
gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)
|
136 |
|
137 |
assets = [black_logo_path, white_logo_path]
|
|
|
138 |
|
139 |
-
scheduler = BackgroundScheduler()
|
140 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
141 |
-
scheduler.start()
|
142 |
-
demo.queue(default_concurrency_limit=40).launch(allowed_paths=assets)
|
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
import pandas as pd
|
3 |
+
import json
|
4 |
|
5 |
from src.about import (
|
6 |
REPRODUCIBILITY_TEXT,
|
|
|
9 |
TITLE,
|
10 |
)
|
11 |
from src.display.css_html_js import custom_css, custom_js
|
12 |
+
# from src.display.utils import (
|
13 |
+
# COLS,
|
14 |
+
# ST_BENCHMARK_COLS,
|
15 |
+
# AGENTIC_BENCHMARK_COLS,
|
16 |
+
# EVAL_COLS,
|
17 |
+
# AutoEvalColumn,
|
18 |
+
# fields,
|
19 |
+
# )
|
20 |
+
# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
21 |
+
# from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
|
22 |
+
# from src.submission.submit import add_new_eval
|
23 |
+
from src.display.formatting import make_clickable_field
|
24 |
+
|
25 |
+
|
26 |
+
# def restart_space():
|
27 |
+
# API.restart_space(repo_id=REPO_ID)
|
28 |
+
|
29 |
+
# ### Space initialisation
|
30 |
+
# try:
|
31 |
+
# print(EVAL_REQUESTS_PATH)
|
32 |
+
# snapshot_download(
|
33 |
+
# repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
34 |
+
# )
|
35 |
+
# except Exception:
|
36 |
+
# restart_space()
|
37 |
+
# try:
|
38 |
+
# print(EVAL_RESULTS_PATH)
|
39 |
+
# snapshot_download(
|
40 |
+
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
41 |
+
# )
|
42 |
+
# except Exception:
|
43 |
+
# restart_space()
|
44 |
+
|
45 |
+
|
46 |
+
# ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
|
47 |
+
# AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)
|
48 |
+
|
49 |
+
# (
|
50 |
+
# finished_eval_queue_df,
|
51 |
+
# running_eval_queue_df,
|
52 |
+
# pending_eval_queue_df,
|
53 |
+
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
54 |
+
|
55 |
+
# def bold_max(s):
|
56 |
+
# is_max = s == s.max() # Boolean Series: True for the max value(s)
|
57 |
+
# return ['font-weight: bold' if v else '' for v in is_max]
|
58 |
+
|
59 |
+
# def init_leaderboard(df, benchmark_type):
|
60 |
+
# if df is None or df.empty:
|
61 |
+
# raise ValueError("Leaderboard DataFrame is empty or None.")
|
62 |
|
63 |
+
# non_task_cols = ["Model"]
|
64 |
+
# if benchmark_type == "agentic":
|
65 |
+
# # Include agent column
|
66 |
+
# non_task_cols.append("Agent")
|
67 |
+
# elif benchmark_type == "base":
|
68 |
+
# # Drop agent column
|
69 |
+
# dataframe = dataframe.drop(columns=["Agent"])
|
70 |
+
# AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name in non_task_cols) or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
|
71 |
|
72 |
# styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
|
73 |
# df.style.set_table_styles([
|
|
|
85 |
# styled_df = df.style.set_tooltips(tooltips)
|
86 |
|
87 |
|
88 |
+
# return gr.components.Dataframe(
|
89 |
+
# value=df,
|
90 |
+
# datatype=[c.type for c in AutoEvalColumnSubset],
|
91 |
+
# column_widths=["150px" if c.name != "Model" else "250px" for c in AutoEvalColumnSubset],
|
92 |
+
# wrap=False,
|
93 |
+
# )
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
def build_leaderboard(type):
    """Build a leaderboard table for one benchmark category.

    Loads model scores from data/results.json and task metadata from
    data/tasks.json, keeps only the tasks whose 'type' matches, and renders
    one row per model with scores linked to their hosted eval logs.

    Args:
        type: Benchmark category to display ("base" or "agentic").

    Returns:
        A gr.components.Dataframe holding the rendered leaderboard.
    """
    with open('data/results.json', 'r') as f:
        results = json.load(f)

    with open('data/tasks.json', 'r') as f:
        tasks = json.load(f)

    # Keep only the tasks belonging to the requested benchmark category.
    filtered_tasks = {k: v for k, v in tasks.items() if v['type'] == type}

    data = []
    # Iterate values only: the display name comes from each entry's config,
    # not from the top-level key.
    for model_data in results.values():
        # For the agentic board, skip models with no agentic results at all.
        if type == "agentic":
            has_agentic_results = any(
                model_data['results'].get(task, {}).get(tasks[task]['metric']) is not None
                for task in filtered_tasks
            )
            if not has_agentic_results:
                continue

        model_sha = model_data["config"]["model_sha"]
        model_name = model_data["config"]["model_name"]
        row = {
            'Model': make_clickable_field(model_name, model_sha)
        }

        for dataset, metrics in model_data['results'].items():
            # Only include metrics for tasks of the specified type.
            if dataset not in filtered_tasks:
                continue
            # Look the score up by the task's declared metric key instead of
            # taking the first dict value: `metrics` also contains 'log_url',
            # so relying on insertion order is fragile.
            value = metrics.get(filtered_tasks[dataset]['metric'])
            log_url = metrics.get('log_url')
            # Use the display name from tasks.json instead of the raw dataset name.
            display_name = filtered_tasks[dataset]['display_name']
            # Convert to a percentage rounded to 2 dp; link to the log if one exists.
            if value is not None:
                value = round(value * 100, 2)
                if log_url:
                    value = make_clickable_field(value, log_url)
            row[display_name] = value
        data.append(row)

    results_df = pd.DataFrame(data)

    # Round any remaining numeric columns to 2 decimal places.
    numeric_cols = results_df.select_dtypes(include=['float64', 'float32']).columns
    results_df[numeric_cols] = results_df[numeric_cols].round(2)

    # Show missing scores as "--".
    results_df = results_df.fillna("--")

    if type == "agentic":
        # Include agent column as second column after Model.
        results_df.insert(1, 'Agent', '[Basic Agent](https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent)')

    return gr.components.Dataframe(
        value=results_df,
        datatype=["html" for _ in results_df.columns],
        column_widths=["250px" if c == "Model" else "150px" for c in results_df.columns],
        wrap=False,
    )
|
158 |
|
159 |
+
|
160 |
black_logo_path = "src/assets/logo-icon-black.png"
|
161 |
white_logo_path = "src/assets/logo-icon-white.png"
|
162 |
|
|
|
188 |
|
189 |
with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
|
190 |
with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
|
191 |
+
build_leaderboard("base")
|
192 |
|
193 |
with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
|
194 |
+
build_leaderboard("agentic")
|
195 |
|
196 |
with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
|
197 |
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)
|
|
|
200 |
gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)
|
201 |
|
202 |
assets = [black_logo_path, white_logo_path]
|
203 |
+
demo.launch()
|
204 |
|
|
|
|
|
|
|
|
inspect_log_file_names.json → data/inspect_log_file_names.json
RENAMED
File without changes
|
data/populate_results.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
def get_log_url(model_name: str, log_file_name: "str | None") -> "str | None":
    """Return the URL of the hosted eval-log viewer for a model/benchmark run.

    Args:
        model_name: Name of the model the log belongs to.
        log_file_name: Inspect log file name (".json"), or None when the
            benchmark was not run for this model.

    Returns:
        The viewer URL pointing at the ".eval" log, or None when there is no log.
    """
    if log_file_name is None:
        return None
    # The hosted viewer serves ".eval" files; swap the extension only at the
    # end of the name so a ".json" occurring elsewhere in the name is untouched.
    if log_file_name.endswith(".json"):
        log_file_name = log_file_name[:-len(".json")] + ".eval"
    return f"https://storage.googleapis.com/inspect-evals/eval/{model_name}/index.html?log_file=logs/logs/{log_file_name}"
|
11 |
+
|
12 |
+
def main():
    """Attach eval-log viewer URLs to every task result in results.json.

    Reads data/results.json and data/inspect_log_file_names.json, sets a
    "log_url" on each (model, task) result, and writes the annotated copy
    to data/results_with_logs.json without touching the input file.
    """
    with open("data/results.json", "r") as fh:
        results = json.load(fh)

    with open("data/inspect_log_file_names.json", "r") as fh:
        log_index = json.load(fh)

    for model_name, model_data in results.items():
        # Log file names recorded for this model, keyed by task name.
        logs_for_model = log_index.get(model_name, {})

        for task_name, task_result in model_data["results"].items():
            log_file_name = logs_for_model.get(task_name)
            # No recorded log file means no viewer URL.
            task_result["log_url"] = (
                get_log_url(model_name, log_file_name) if log_file_name else None
            )

    with open("data/results_with_logs.json", "w") as fh:
        json.dump(results, fh, indent=4)


if __name__ == "__main__":
    main()
|
data/results.json
ADDED
@@ -0,0 +1,948 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"DeepSeek-R1": {
|
3 |
+
"config": {
|
4 |
+
"model_name": "DeepSeek-R1",
|
5 |
+
"model_sha": "https://api-docs.deepseek.com/news/news250120",
|
6 |
+
"model_dtype": "torch.float16"
|
7 |
+
},
|
8 |
+
"results": {
|
9 |
+
"mmlu_pro": {
|
10 |
+
"accuracy": 0.8382646276595744,
|
11 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-12T11-02-35-05-00_mmlu-pro_BhD89DYN9KM3k4weSDfaQK.eval"
|
12 |
+
},
|
13 |
+
"humaneval": {
|
14 |
+
"mean": 0.9567901234567902,
|
15 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-03T11-45-22-05-00_humaneval_hnkHWYqrb5HxiBt2CWzCnq.eval"
|
16 |
+
},
|
17 |
+
"math": {
|
18 |
+
"accuracy": 0.9272,
|
19 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-11T11-38-10-05-00_math_ZYFSqsWsmP5kLRLHEMWULU.eval"
|
20 |
+
},
|
21 |
+
"gsm8k": {
|
22 |
+
"accuracy": 0.954510993176649,
|
23 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-02T16-28-05-05-00_gsm8k_YMw6WiZkgTBQ54z5UHtDDX.eval"
|
24 |
+
},
|
25 |
+
"arc_challenge": {
|
26 |
+
"accuracy": 0.9667235494880546,
|
27 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-01-30T15-42-39-05-00_arc-challenge_CviW9ro6rKBbctkwJzQstp.eval"
|
28 |
+
},
|
29 |
+
"winogrande": {
|
30 |
+
"accuracy": 0.9179163378058406,
|
31 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-04T00-25-12-05-00_winogrande_NPgTbtqom2QSPKxeThWrdZ.eval"
|
32 |
+
},
|
33 |
+
"arc_easy": {
|
34 |
+
"accuracy": 0.9873737373737373,
|
35 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.eval"
|
36 |
+
},
|
37 |
+
"gpqa_diamond": {
|
38 |
+
"accuracy": 0.7045454545454546,
|
39 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-11T11-37-45-05-00_gpqa-diamond_MwnVeLwyuiEAALr3M5q3dn.eval"
|
40 |
+
},
|
41 |
+
"drop": {
|
42 |
+
"mean": null,
|
43 |
+
"log_url": null
|
44 |
+
},
|
45 |
+
"hellaswag": {
|
46 |
+
"accuracy": null,
|
47 |
+
"log_url": null
|
48 |
+
},
|
49 |
+
"ifeval": {
|
50 |
+
"final_acc": null,
|
51 |
+
"log_url": null
|
52 |
+
},
|
53 |
+
"mmlu": {
|
54 |
+
"accuracy": null,
|
55 |
+
"log_url": null
|
56 |
+
},
|
57 |
+
"mmmu_multiple_choice": {
|
58 |
+
"accuracy": null,
|
59 |
+
"log_url": null
|
60 |
+
},
|
61 |
+
"mmmu_open": {
|
62 |
+
"accuracy": null,
|
63 |
+
"log_url": null
|
64 |
+
},
|
65 |
+
"gaia": {
|
66 |
+
"accuracy": null,
|
67 |
+
"log_url": null
|
68 |
+
},
|
69 |
+
"gdm_intercode_ctf": {
|
70 |
+
"accuracy": null,
|
71 |
+
"log_url": null
|
72 |
+
},
|
73 |
+
"gdm_in_house_ctf": {
|
74 |
+
"accuracy": null,
|
75 |
+
"log_url": null
|
76 |
+
},
|
77 |
+
"agentharm": {
|
78 |
+
"avg_score": null,
|
79 |
+
"log_url": null
|
80 |
+
},
|
81 |
+
"agentharm_benign": {
|
82 |
+
"avg_score": null,
|
83 |
+
"log_url": null
|
84 |
+
},
|
85 |
+
"swe_bench": {
|
86 |
+
"mean": null,
|
87 |
+
"log_url": null
|
88 |
+
}
|
89 |
+
}
|
90 |
+
},
|
91 |
+
"Meta-Llama-3.1-70B-Instruct": {
|
92 |
+
"config": {
|
93 |
+
"model_name": "Meta-Llama-3.1-70B-Instruct",
|
94 |
+
"model_sha": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
|
95 |
+
"model_dtype": "torch.float16"
|
96 |
+
},
|
97 |
+
"results": {
|
98 |
+
"hellaswag": {
|
99 |
+
"accuracy": 0.869946225851424,
|
100 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-45-54-04-00_hellaswag_BKfQG9yGAr383MGnooMLBH.eval"
|
101 |
+
},
|
102 |
+
"drop": {
|
103 |
+
"mean": 0.8811263765076035,
|
104 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T21-01-02-04-00_drop_LzAWvLWkNrNKu5qf56wXRo.eval"
|
105 |
+
},
|
106 |
+
"gpqa_diamond": {
|
107 |
+
"accuracy": 0.4318181818181818,
|
108 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
|
109 |
+
},
|
110 |
+
"winogrande": {
|
111 |
+
"accuracy": 0.8666140489344909,
|
112 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
|
113 |
+
},
|
114 |
+
"gsm8k": {
|
115 |
+
"accuracy": 0.9469294920394238,
|
116 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
|
117 |
+
},
|
118 |
+
"math": {
|
119 |
+
"accuracy": 0.6004,
|
120 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
|
121 |
+
},
|
122 |
+
"ifeval": {
|
123 |
+
"final_acc": 0.8604907201780166,
|
124 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
|
125 |
+
},
|
126 |
+
"arc_challenge": {
|
127 |
+
"accuracy": 0.9445392491467577,
|
128 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
|
129 |
+
},
|
130 |
+
"arc_easy": {
|
131 |
+
"accuracy": 0.9823232323232324,
|
132 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
|
133 |
+
},
|
134 |
+
"mmlu_pro": {
|
135 |
+
"accuracy": 0.6688829787234043,
|
136 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
|
137 |
+
},
|
138 |
+
"humaneval": {
|
139 |
+
"mean": 0.7865853658536586,
|
140 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
|
141 |
+
},
|
142 |
+
"mmlu": {
|
143 |
+
"accuracy": 0.8033755875231449,
|
144 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
|
145 |
+
},
|
146 |
+
"mmmu_multiple_choice": {
|
147 |
+
"accuracy": null,
|
148 |
+
"log_url": null
|
149 |
+
},
|
150 |
+
"mmmu_open": {
|
151 |
+
"accuracy": null,
|
152 |
+
"log_url": null
|
153 |
+
},
|
154 |
+
"gaia": {
|
155 |
+
"accuracy": null,
|
156 |
+
"log_url": null
|
157 |
+
},
|
158 |
+
"gdm_intercode_ctf": {
|
159 |
+
"accuracy": null,
|
160 |
+
"log_url": null
|
161 |
+
},
|
162 |
+
"gdm_in_house_ctf": {
|
163 |
+
"accuracy": null,
|
164 |
+
"log_url": null
|
165 |
+
},
|
166 |
+
"agentharm": {
|
167 |
+
"avg_score": null,
|
168 |
+
"log_url": null
|
169 |
+
},
|
170 |
+
"agentharm_benign": {
|
171 |
+
"avg_score": null,
|
172 |
+
"log_url": null
|
173 |
+
},
|
174 |
+
"swe_bench": {
|
175 |
+
"mean": null,
|
176 |
+
"log_url": null
|
177 |
+
}
|
178 |
+
}
|
179 |
+
},
|
180 |
+
"Mistral-Large-Instruct-2407": {
|
181 |
+
"config": {
|
182 |
+
"model_name": "Mistral-Large-Instruct-2407",
|
183 |
+
"model_sha": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
|
184 |
+
"model_dtype": "torch.float16"
|
185 |
+
},
|
186 |
+
"results": {
|
187 |
+
"drop": {
|
188 |
+
"mean": 0.7424257996853698,
|
189 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-56-12-04-00_drop_NtvuCoU2LoMbH8DztcCTen.eval"
|
190 |
+
},
|
191 |
+
"ifeval": {
|
192 |
+
"final_acc": 0.8285172231900246,
|
193 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-30-16-04-00_ifeval_TLkvCSFEWo4PLv6hAha7YB.eval"
|
194 |
+
},
|
195 |
+
"mmlu": {
|
196 |
+
"accuracy": 0.8035892323030908,
|
197 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T07-21-48-04-00_mmlu_YnUhmHoStr3WuJdchWmNPt.eval"
|
198 |
+
},
|
199 |
+
"gpqa_diamond": {
|
200 |
+
"accuracy": 0.4734848484848485,
|
201 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-22-52-04-00_gpqa-diamond_SuZUZxGdqS2ZecbLRNkKd4.eval"
|
202 |
+
},
|
203 |
+
"gsm8k": {
|
204 |
+
"accuracy": 0.9378316906747536,
|
205 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-28-49-04-00_gsm8k_5tQp9tbwUMj6NpjNKCAfVm.eval"
|
206 |
+
},
|
207 |
+
"math": {
|
208 |
+
"accuracy": 0.6574,
|
209 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-33-09-04-00_math_2CmjBedAfUxqvmcHRdBgyB.eval"
|
210 |
+
},
|
211 |
+
"arc_easy": {
|
212 |
+
"accuracy": 0.9852693602693603,
|
213 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-48-39-04-00_arc-easy_YbfuBT3usZXt2xgZkkR5dq.eval"
|
214 |
+
},
|
215 |
+
"mmlu_pro": {
|
216 |
+
"accuracy": 0.6942320478723404,
|
217 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T09-41-25-04-00_mmlu-pro_fyYT4aabPesfY5TpzFMPnd.eval"
|
218 |
+
},
|
219 |
+
"humaneval": {
|
220 |
+
"mean": 0.8658536585365854,
|
221 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-29-24-04-00_humaneval_nu8SUSGekKJWB8HLKDigYK.eval"
|
222 |
+
},
|
223 |
+
"hellaswag": {
|
224 |
+
"accuracy": 0.9047998406691894,
|
225 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-50-00-04-00_hellaswag_ZzQoZ6gkRQsTzMhQr7GYNn.eval"
|
226 |
+
},
|
227 |
+
"arc_challenge": {
|
228 |
+
"accuracy": 0.9436860068259386,
|
229 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-54-13-04-00_arc-challenge_WfQRhMkFcywefpU46isBVP.eval"
|
230 |
+
},
|
231 |
+
"winogrande": {
|
232 |
+
"accuracy": 0.8547750591949487,
|
233 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T11-57-58-04-00_winogrande_TP3UGwpp37Dyv6ks9Ty5Hk.eval"
|
234 |
+
},
|
235 |
+
"mmmu_multiple_choice": {
|
236 |
+
"accuracy": null,
|
237 |
+
"log_url": null
|
238 |
+
},
|
239 |
+
"mmmu_open": {
|
240 |
+
"accuracy": null,
|
241 |
+
"log_url": null
|
242 |
+
},
|
243 |
+
"gaia": {
|
244 |
+
"accuracy": null,
|
245 |
+
"log_url": null
|
246 |
+
},
|
247 |
+
"gdm_intercode_ctf": {
|
248 |
+
"accuracy": null,
|
249 |
+
"log_url": null
|
250 |
+
},
|
251 |
+
"gdm_in_house_ctf": {
|
252 |
+
"accuracy": null,
|
253 |
+
"log_url": null
|
254 |
+
},
|
255 |
+
"agentharm": {
|
256 |
+
"avg_score": null,
|
257 |
+
"log_url": null
|
258 |
+
},
|
259 |
+
"agentharm_benign": {
|
260 |
+
"avg_score": null,
|
261 |
+
"log_url": null
|
262 |
+
},
|
263 |
+
"swe_bench": {
|
264 |
+
"mean": null,
|
265 |
+
"log_url": null
|
266 |
+
}
|
267 |
+
}
|
268 |
+
},
|
269 |
+
"c4ai-command-r-plus": {
|
270 |
+
"config": {
|
271 |
+
"model_name": "c4ai-command-r-plus",
|
272 |
+
"model_sha": "https://huggingface.co/CohereForAI/c4ai-command-r-plus"
|
273 |
+
},
|
274 |
+
"results": {
|
275 |
+
"ifeval": {
|
276 |
+
"final_acc": 0.7779591483929307,
|
277 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.eval"
|
278 |
+
},
|
279 |
+
"winogrande": {
|
280 |
+
"accuracy": 0.7490134175217048,
|
281 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T14-42-18-04-00_winogrande_bY8yg7aRR5dCCK7NDCZEcc.eval"
|
282 |
+
},
|
283 |
+
"arc_challenge": {
|
284 |
+
"accuracy": 0.8506825938566553,
|
285 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T17-30-03-04-00_arc-challenge_XB7LURXEGaxskWuLtYwdnW.eval"
|
286 |
+
},
|
287 |
+
"drop": {
|
288 |
+
"mean": 0.743557420031463,
|
289 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T12-06-30-04-00_drop_itY9cLiYAW2BF7NTeDceNd.eval"
|
290 |
+
},
|
291 |
+
"math": {
|
292 |
+
"accuracy": 0.2626,
|
293 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-26-34-04-00_math_kohBUMpMFuMsR4jz4vUNWM.eval"
|
294 |
+
},
|
295 |
+
"gpqa_diamond": {
|
296 |
+
"accuracy": 0.3194444444444444,
|
297 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T22-47-45-04-00_gpqa-diamond_JKpb6ya4pec9hh7uovPPCZ.eval"
|
298 |
+
},
|
299 |
+
"mmlu_pro": {
|
300 |
+
"accuracy": 0.441156914893617,
|
301 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-31T01-11-38-04-00_mmlu-pro_gZVAuy3zMKR23BieM5PqAX.eval"
|
302 |
+
},
|
303 |
+
"humaneval": {
|
304 |
+
"mean": 0.6219512195121951,
|
305 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-22-23-04-00_humaneval_5ByPqUhoofSbKgvsUQNFCX.eval"
|
306 |
+
},
|
307 |
+
"gsm8k": {
|
308 |
+
"accuracy": 0.7816527672479151,
|
309 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T15-03-35-04-00_gsm8k_QxbfbriJsKGQAg96JyjkoT.eval"
|
310 |
+
},
|
311 |
+
"hellaswag": {
|
312 |
+
"accuracy": 0.7954590718980283,
|
313 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T15-18-17-04-00_hellaswag_UYyBTR6N8VJnKRmnbCrB8N.eval"
|
314 |
+
},
|
315 |
+
"mmlu": {
|
316 |
+
"accuracy": 0.695128899017234,
|
317 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T21-55-26-04-00_mmlu_JUPPLTzfe3Kme6UuorPTqg.eval"
|
318 |
+
},
|
319 |
+
"arc_easy": {
|
320 |
+
"accuracy": 0.9377104377104377,
|
321 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.eval"
|
322 |
+
}
|
323 |
+
}
|
324 |
+
},
|
325 |
+
"claude-3-5-sonnet-20241022": {
|
326 |
+
"config": {
|
327 |
+
"model_name": "claude-3-5-sonnet-20241022",
|
328 |
+
"model_sha": "https://www.anthropic.com/claude/sonnet",
|
329 |
+
"model_dtype": "torch.float16"
|
330 |
+
},
|
331 |
+
"results": {
|
332 |
+
"mmmu_multiple_choice": {
|
333 |
+
"accuracy": 0.6481700118063755,
|
334 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T11-20-03-05-00_mmmu-multiple-choice_CWhKvGdoFo6pdHhDyi9GNm.eval"
|
335 |
+
},
|
336 |
+
"mmlu_pro": {
|
337 |
+
"accuracy": 0.7762632978723404,
|
338 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T19-01-05-05-00_mmlu-pro_3vi84or97gQupuj5sT6vgZ.eval"
|
339 |
+
},
|
340 |
+
"hellaswag": {
|
341 |
+
"accuracy": 0.9228241386178052,
|
342 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T15-09-33-05-00_hellaswag_QXqFxojvSToMu8ckHEMLkB.eval"
|
343 |
+
},
|
344 |
+
"gpqa_diamond": {
|
345 |
+
"accuracy": 0.6098484848484849,
|
346 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T13-56-36-05-00_gpqa-diamond_eg4gFaMRENjnnYvQNtSB59.eval"
|
347 |
+
},
|
348 |
+
"gsm8k": {
|
349 |
+
"accuracy": 0.9620924943138741,
|
350 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T14-23-25-05-00_gsm8k_nHB8Z4uZAwRAZFYpKmTptA.eval"
|
351 |
+
},
|
352 |
+
"mmmu_open": {
|
353 |
+
"accuracy": 0.41509433962264153,
|
354 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T11-24-21-05-00_mmmu-open_SSjv3Dq9gZkEEUnvJUd5xf.eval"
|
355 |
+
},
|
356 |
+
"arc_easy": {
|
357 |
+
"accuracy": 0.9915824915824916,
|
358 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-06-24-05-00_arc-easy_oBReQZQM5SAwMMD2jFshPb.eval"
|
359 |
+
},
|
360 |
+
"arc_challenge": {
|
361 |
+
"accuracy": 0.9692832764505119,
|
362 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-12-11-05-00_arc-challenge_X8i6caCzkcQo5AT5zXkXso.eval"
|
363 |
+
},
|
364 |
+
"mmlu": {
|
365 |
+
"accuracy": 0.8665432274604757,
|
366 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T15-16-51-05-00_mmlu_NFDs2kxmh3kQEbpbd8sz3w.eval"
|
367 |
+
},
|
368 |
+
"math": {
|
369 |
+
"accuracy": 0.7942,
|
370 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T12-29-54-05-00_math_NvNQU58M8r3fpiwPGnvq8h.eval"
|
371 |
+
},
|
372 |
+
"ifeval": {
|
373 |
+
"final_acc": 0.8958114469607309,
|
374 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T11-28-44-05-00_ifeval_fmWxch4ZjbmYCST6yUZsdV.eval"
|
375 |
+
},
|
376 |
+
"humaneval": {
|
377 |
+
"mean": 0.9451219512195121,
|
378 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T11-26-12-05-00_humaneval_kUASiaNd9uZfWvCwYHhdF5.eval"
|
379 |
+
},
|
380 |
+
"winogrande": {
|
381 |
+
"accuracy": 0.9021310181531176,
|
382 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T22-09-41-05-00_winogrande_mSWGAKg75E5RP79KWizvb9.eval"
|
383 |
+
},
|
384 |
+
"drop": {
|
385 |
+
"mean": 0.8977608809648663,
|
386 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.eval"
|
387 |
+
},
|
388 |
+
"gaia": {
|
389 |
+
"accuracy": 0.3381818181818182,
|
390 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-12T23-57-37-05-00_claude-3-5-sonnet_gaia_merged.eval"
|
391 |
+
},
|
392 |
+
"gdm_intercode_ctf": {
|
393 |
+
"accuracy": 0.8556962025316455,
|
394 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-11T02-47-45-05-00_claude-3-5-sonnet_gdm-intercode-ctf_merged.eval"
|
395 |
+
},
|
396 |
+
"gdm_in_house_ctf": {
|
397 |
+
"accuracy": 0.6153846153846154,
|
398 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-11T07-41-14+00-00_claude-3-5-sonnet_gdm-in-house-ctf.eval"
|
399 |
+
},
|
400 |
+
"agentharm": {
|
401 |
+
"avg_score": 0.14767992424242424,
|
402 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T08-05-14-08-00_agentharm_VJGhWKLrVLdQczBZVgCXHc.eval"
|
403 |
+
},
|
404 |
+
"agentharm_benign": {
|
405 |
+
"avg_score": 0.800704570051161,
|
406 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T15-09-48-08-00_agentharm-benign_A3uBBWNvv88P5BsgqwFCfg.eval"
|
407 |
+
},
|
408 |
+
"swe_bench": {
|
409 |
+
"mean": 0.0672,
|
410 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T18-56-55+00-00_anthropic-claude-3-5-sonnet.eval"
|
411 |
+
}
|
412 |
+
}
|
413 |
+
},
|
414 |
+
"gemini-1.5-flash": {
|
415 |
+
"config": {
|
416 |
+
"model_name": "gemini-1.5-flash",
|
417 |
+
"model_sha": "https://deepmind.google/technologies/gemini/flash",
|
418 |
+
"model_dtype": "torch.float16"
|
419 |
+
},
|
420 |
+
"results": {
|
421 |
+
"gpqa_diamond": {
|
422 |
+
"accuracy": 0.40404040404040403,
|
423 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-31-26-04-00_gpqa-diamond_7aNe9wQiQKpNN96mfaWBPg.eval"
|
424 |
+
},
|
425 |
+
"arc_challenge": {
|
426 |
+
"accuracy": 0.9308873720136519,
|
427 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-34-51-04-00_arc-challenge_FbGgLswBZbRE4EhWiMyRt6.eval"
|
428 |
+
},
|
429 |
+
"math": {
|
430 |
+
"accuracy": 0.452,
|
431 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-15-49-04-00_math_YsWdRzpqMq2dqQ9SPKfack.eval"
|
432 |
+
},
|
433 |
+
"mmmu_open": {
|
434 |
+
"accuracy": 0.16981132075471697,
|
435 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2025-01-20T23-13-27-05-00_mmmu-open_GWi6XNYUSLq99BdabtScGm.eval"
|
436 |
+
},
|
437 |
+
"drop": {
|
438 |
+
"mean": 0.751044572627163,
|
439 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-36-25-04-00_drop_6TzJGqqEkpFUCxGD4QejV6.eval"
|
440 |
+
},
|
441 |
+
"mmlu_pro": {
|
442 |
+
"accuracy": 0.5993184840425532,
|
443 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T22-43-30-04-00_mmlu-pro_Dc2uu3EV7MJtjg6gg5Y9qH.eval"
|
444 |
+
},
|
445 |
+
"ifeval": {
|
446 |
+
"final_acc": 0.7681296737102001,
|
447 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-15-06-04-00_ifeval_nYs9KujQMQjcpbpbLtVx8G.eval"
|
448 |
+
},
|
449 |
+
"hellaswag": {
|
450 |
+
"accuracy": 0.8557060346544513,
|
451 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-35-34-04-00_hellaswag_2SAz3cvMpDxFaApdHDR3s4.eval"
|
452 |
+
},
|
453 |
+
"winogrande": {
|
454 |
+
"accuracy": 0.7884767166535123,
|
455 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-31T00-59-07-04-00_winogrande_Ci55vHvbGGW38zVpMCwtWa.eval"
|
456 |
+
},
|
457 |
+
"humaneval": {
|
458 |
+
"mean": 0.7439024390243902,
|
459 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-14-41-04-00_humaneval_Z9aXdUERuwYxoTheZ5GANC.eval"
|
460 |
+
},
|
461 |
+
"arc_easy": {
|
462 |
+
"accuracy": 0.984006734006734,
|
463 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-29-56-04-00_arc-easy_XcEzqqPqJsRV29NqYDfnNo.eval"
|
464 |
+
},
|
465 |
+
"gsm8k": {
|
466 |
+
"accuracy": 0.8582259287338894,
|
467 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-32-39-04-00_gsm8k_nLSssETKDDWNktAFWnVwfv.eval"
|
468 |
+
},
|
469 |
+
"mmlu": {
|
470 |
+
"accuracy": 0.7714713003845606,
|
471 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_oGb9mspeGbYS2gfbkknskN.eval"
|
472 |
+
},
|
473 |
+
"mmmu_multiple_choice": {
|
474 |
+
"accuracy": 0.5702479338842975,
|
475 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2025-01-20T23-10-01-05-00_mmmu-multiple-choice_c5rLkrXkV83udX6DVJui5F.eval"
|
476 |
+
},
|
477 |
+
"gaia": {
|
478 |
+
"accuracy": null,
|
479 |
+
"log_url": null
|
480 |
+
},
|
481 |
+
"gdm_intercode_ctf": {
|
482 |
+
"accuracy": null,
|
483 |
+
"log_url": null
|
484 |
+
},
|
485 |
+
"gdm_in_house_ctf": {
|
486 |
+
"accuracy": null,
|
487 |
+
"log_url": null
|
488 |
+
},
|
489 |
+
"agentharm": {
|
490 |
+
"avg_score": null,
|
491 |
+
"log_url": null
|
492 |
+
},
|
493 |
+
"agentharm_benign": {
|
494 |
+
"avg_score": null,
|
495 |
+
"log_url": null
|
496 |
+
},
|
497 |
+
"swe_bench": {
|
498 |
+
"mean": null,
|
499 |
+
"log_url": null
|
500 |
+
}
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"gemini-1.5-pro": {
|
504 |
+
"config": {
|
505 |
+
"model_name": "gemini-1.5-pro",
|
506 |
+
"model_sha": "https://deepmind.google/technologies/gemini/pro",
|
507 |
+
"model_dtype": "torch.float16"
|
508 |
+
},
|
509 |
+
"results": {
|
510 |
+
"mmlu": {
|
511 |
+
"accuracy": 0.8467454778521578,
|
512 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.eval"
|
513 |
+
},
|
514 |
+
"humaneval": {
|
515 |
+
"mean": 0.8719512195121951,
|
516 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.eval"
|
517 |
+
},
|
518 |
+
"mmmu_multiple_choice": {
|
519 |
+
"accuracy": 0.6304604486422668,
|
520 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2025-01-20T23-16-04-05-00_mmmu-multiple-choice_NLmxmHYt6CJymRVVa5UsbD.eval"
|
521 |
+
},
|
522 |
+
"mmlu_pro": {
|
523 |
+
"accuracy": 0.7563996010638298,
|
524 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.eval"
|
525 |
+
},
|
526 |
+
"math": {
|
527 |
+
"accuracy": 0.852,
|
528 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.eval"
|
529 |
+
},
|
530 |
+
"arc_easy": {
|
531 |
+
"accuracy": 0.9877946127946128,
|
532 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.eval"
|
533 |
+
},
|
534 |
+
"mmmu_open": {
|
535 |
+
"accuracy": 0.3584905660377358,
|
536 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2025-01-20T23-19-25-05-00_mmmu-open_CDbtEQ7tjs5zkj4ScBbzod.eval"
|
537 |
+
},
|
538 |
+
"gsm8k": {
|
539 |
+
"accuracy": 0.9613343442001516,
|
540 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.eval"
|
541 |
+
},
|
542 |
+
"gpqa_diamond": {
|
543 |
+
"accuracy": 0.5782828282828283,
|
544 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.eval"
|
545 |
+
},
|
546 |
+
"ifeval": {
|
547 |
+
"final_acc": 0.8982344623377084,
|
548 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.eval"
|
549 |
+
},
|
550 |
+
"winogrande": {
|
551 |
+
"accuracy": 0.8768745067087609,
|
552 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-40-46-05-00_winogrande_5SmD6rx47zmZvHHkQSSfHK.eval"
|
553 |
+
},
|
554 |
+
"arc_challenge": {
|
555 |
+
"accuracy": 0.9633105802047781,
|
556 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.eval"
|
557 |
+
},
|
558 |
+
"drop": {
|
559 |
+
"mean": 0.8800912427897221,
|
560 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.eval"
|
561 |
+
},
|
562 |
+
"hellaswag": {
|
563 |
+
"accuracy": 0.9123680541724756,
|
564 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.eval"
|
565 |
+
},
|
566 |
+
"gaia": {
|
567 |
+
"accuracy": 0.13818181818181818,
|
568 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T15-33-29-05-00_gemini-1.5-pro_gaia_merged.eval"
|
569 |
+
},
|
570 |
+
"gdm_intercode_ctf": {
|
571 |
+
"accuracy": 0.5291139240506328,
|
572 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T23-59-58+00-00_gemini-1.5-pro_gdm-intercode-ctf_merged.eval"
|
573 |
+
},
|
574 |
+
"gdm_in_house_ctf": {
|
575 |
+
"accuracy": 0.23076923076923078,
|
576 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-22T03-42-16+00-00_gemini-1.5-pro_gdm-in-house-ctf.eval"
|
577 |
+
},
|
578 |
+
"agentharm": {
|
579 |
+
"avg_score": 0.2898649645808737,
|
580 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T12-45-43-08-00_agentharm_VmD26soLwmRgWPo3hpRHBr.eval"
|
581 |
+
},
|
582 |
+
"agentharm_benign": {
|
583 |
+
"avg_score": 0.5961489079102715,
|
584 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T13-18-51-08-00_agentharm-benign_gP3pQPxAuCtFLiHzt2Egt7.eval"
|
585 |
+
},
|
586 |
+
"swe_bench": {
|
587 |
+
"mean": 0.004,
|
588 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-22T03-00-08+00-00_google-gemini-1.5-pro_swe.eval"
|
589 |
+
}
|
590 |
+
}
|
591 |
+
},
|
592 |
+
"gpt-4o": {
|
593 |
+
"config": {
|
594 |
+
"model_name": "gpt-4o",
|
595 |
+
"model_sha": "https://openai.com/index/hello-gpt-4o",
|
596 |
+
"model_dtype": "torch.float16"
|
597 |
+
},
|
598 |
+
"results": {
|
599 |
+
"gpqa_diamond": {
|
600 |
+
"accuracy": 0.51010101010101,
|
601 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-29-33-04-00_gpqa-diamond_nFmRv5MJiYjHjezmq4V6Va.eval"
|
602 |
+
},
|
603 |
+
"arc_challenge": {
|
604 |
+
"accuracy": 0.9633105802047781,
|
605 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-45-55-04-00_arc-challenge_nrsPPxh4DpzgLPQDFdcfVp.eval"
|
606 |
+
},
|
607 |
+
"gsm8k": {
|
608 |
+
"accuracy": 0.9446550416982562,
|
609 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-31-16-04-00_gsm8k_jVXeSvHowbietZCFsFYCwB.eval"
|
610 |
+
},
|
611 |
+
"mmlu": {
|
612 |
+
"accuracy": 0.8435408061529697,
|
613 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_GarLpfQFSpM3C22nbbGp54.eval"
|
614 |
+
},
|
615 |
+
"ifeval": {
|
616 |
+
"final_acc": 0.8780386042367585,
|
617 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T05-00-11-04-00_ifeval_jxreUu8JqRdkrcHP4E3hLR.eval"
|
618 |
+
},
|
619 |
+
"mmlu_pro": {
|
620 |
+
"accuracy": 0.7450964095744681,
|
621 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T06-59-42-04-00_mmlu-pro_EuAKDwAWSfNVpqyyqrf2Ba.eval"
|
622 |
+
},
|
623 |
+
"mmmu_open": {
|
624 |
+
"accuracy": 0.3584905660377358,
|
625 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2025-01-20T23-07-46-05-00_mmmu-open_d3Q2HvuPZzEX6FAM4NBhnp.eval"
|
626 |
+
},
|
627 |
+
"winogrande": {
|
628 |
+
"accuracy": 0.9013417521704814,
|
629 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T09-02-03-04-00_winogrande_44kKF7M9mKoqVC7ixZVXuq.eval"
|
630 |
+
},
|
631 |
+
"drop": {
|
632 |
+
"mean": 0.7511693759832198,
|
633 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-47-20-04-00_drop_3gxDcn6vUoR3nvHX9BcSq4.eval"
|
634 |
+
},
|
635 |
+
"arc_easy": {
|
636 |
+
"accuracy": 0.9915824915824916,
|
637 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-41-34-04-00_arc-easy_nUavRHdiRVfrxo6dmCPadh.eval"
|
638 |
+
},
|
639 |
+
"mmmu_multiple_choice": {
|
640 |
+
"accuracy": 0.5903187721369539,
|
641 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2025-01-20T23-03-21-05-00_mmmu-multiple-choice_eoycAFLMirSqiURdXmBP2e.eval"
|
642 |
+
},
|
643 |
+
"humaneval": {
|
644 |
+
"mean": 0.9085365853658537,
|
645 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T04-59-42-04-00_humaneval_nmJcd84CcNKjWS8fBfMbZM.eval"
|
646 |
+
},
|
647 |
+
"math": {
|
648 |
+
"accuracy": 0.7054,
|
649 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T05-01-22-04-00_math_cDSpKPp3nLrFy8uYfYKEbM.eval"
|
650 |
+
},
|
651 |
+
"hellaswag": {
|
652 |
+
"accuracy": 0.924317864967138,
|
653 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.eval"
|
654 |
+
},
|
655 |
+
"gaia": {
|
656 |
+
"accuracy": 0.16606060606060608,
|
657 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-13T15-53-22+00-00_gpt-4o_gaia_merged.eval"
|
658 |
+
},
|
659 |
+
"gdm_intercode_ctf": {
|
660 |
+
"accuracy": 0.6379746835443038,
|
661 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-08T10-06-29-05-00_gpt-4o_gdm-intercode-ctf_merged.eval"
|
662 |
+
},
|
663 |
+
"gdm_in_house_ctf": {
|
664 |
+
"accuracy": 0.23076923076923078,
|
665 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-11T07-02-14+00-00_gpt-4o_gdm-in-house-ctf.eval"
|
666 |
+
},
|
667 |
+
"agentharm": {
|
668 |
+
"avg_score": 0.49953844451003543,
|
669 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-07T16-34-15-08-00_agentharm_UfSoyHEAH2E5RVdrPVUemy.eval"
|
670 |
+
},
|
671 |
+
"agentharm_benign": {
|
672 |
+
"avg_score": 0.8249433048012594,
|
673 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-21T13-45-18-08-00_agentharm-benign_8DhGJqEAvw6o8uCv4a4dVz.eval"
|
674 |
+
},
|
675 |
+
"swe_bench": {
|
676 |
+
"mean": 0.012,
|
677 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-14T23-09-10+00-00_openai-gpt-4o_swe.eval"
|
678 |
+
}
|
679 |
+
}
|
680 |
+
},
|
681 |
+
"gpt-4o-mini": {
|
682 |
+
"config": {
|
683 |
+
"model_name": "gpt-4o-mini",
|
684 |
+
"model_sha": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
|
685 |
+
"model_dtype": "torch.float16"
|
686 |
+
},
|
687 |
+
"results": {
|
688 |
+
"drop": {
|
689 |
+
"mean": 0.8065915049816466,
|
690 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-36-25-04-00_drop_6TzJGqqEkpFUCxGD4QejV6.eval"
|
691 |
+
},
|
692 |
+
"humaneval": {
|
693 |
+
"mean": 0.8597560975609756,
|
694 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-14-41-04-00_humaneval_Z9aXdUERuwYxoTheZ5GANC.eval"
|
695 |
+
},
|
696 |
+
"gpqa_diamond": {
|
697 |
+
"accuracy": 0.3838383838383838,
|
698 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-31-26-04-00_gpqa-diamond_7aNe9wQiQKpNN96mfaWBPg.eval"
|
699 |
+
},
|
700 |
+
"mmmu_open": {
|
701 |
+
"accuracy": 0.18867924528301888,
|
702 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2025-01-20T23-13-27-05-00_mmmu-open_GWi6XNYUSLq99BdabtScGm.eval"
|
703 |
+
},
|
704 |
+
"arc_challenge": {
|
705 |
+
"accuracy": 0.9249146757679181,
|
706 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-34-51-04-00_arc-challenge_FbGgLswBZbRE4EhWiMyRt6.eval"
|
707 |
+
},
|
708 |
+
"mmlu": {
|
709 |
+
"accuracy": 0.7698333570716422,
|
710 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_oGb9mspeGbYS2gfbkknskN.eval"
|
711 |
+
},
|
712 |
+
"hellaswag": {
|
713 |
+
"accuracy": 0.8750248954391555,
|
714 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-35-34-04-00_hellaswag_2SAz3cvMpDxFaApdHDR3s4.eval"
|
715 |
+
},
|
716 |
+
"ifeval": {
|
717 |
+
"final_acc": 0.8419061423689144,
|
718 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-15-06-04-00_ifeval_nYs9KujQMQjcpbpbLtVx8G.eval"
|
719 |
+
},
|
720 |
+
"mmmu_multiple_choice": {
|
721 |
+
"accuracy": 0.5395513577331759,
|
722 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2025-01-20T23-10-01-05-00_mmmu-multiple-choice_c5rLkrXkV83udX6DVJui5F.eval"
|
723 |
+
},
|
724 |
+
"arc_easy": {
|
725 |
+
"accuracy": 0.9793771043771043,
|
726 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-29-56-04-00_arc-easy_XcEzqqPqJsRV29NqYDfnNo.eval"
|
727 |
+
},
|
728 |
+
"winogrande": {
|
729 |
+
"accuracy": 0.7529597474348856,
|
730 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-31T00-59-07-04-00_winogrande_Ci55vHvbGGW38zVpMCwtWa.eval"
|
731 |
+
},
|
732 |
+
"mmlu_pro": {
|
733 |
+
"accuracy": 0.6396276595744681,
|
734 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T22-43-30-04-00_mmlu-pro_Dc2uu3EV7MJtjg6gg5Y9qH.eval"
|
735 |
+
},
|
736 |
+
"math": {
|
737 |
+
"accuracy": 0.633,
|
738 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-15-49-04-00_math_YsWdRzpqMq2dqQ9SPKfack.eval"
|
739 |
+
},
|
740 |
+
"gsm8k": {
|
741 |
+
"accuracy": 0.9181197877179682,
|
742 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-32-39-04-00_gsm8k_nLSssETKDDWNktAFWnVwfv.eval"
|
743 |
+
},
|
744 |
+
"gaia": {
|
745 |
+
"accuracy": null,
|
746 |
+
"log_url": null
|
747 |
+
},
|
748 |
+
"gdm_intercode_ctf": {
|
749 |
+
"accuracy": null,
|
750 |
+
"log_url": null
|
751 |
+
},
|
752 |
+
"gdm_in_house_ctf": {
|
753 |
+
"accuracy": null,
|
754 |
+
"log_url": null
|
755 |
+
},
|
756 |
+
"agentharm": {
|
757 |
+
"avg_score": null,
|
758 |
+
"log_url": null
|
759 |
+
},
|
760 |
+
"agentharm_benign": {
|
761 |
+
"avg_score": null,
|
762 |
+
"log_url": null
|
763 |
+
},
|
764 |
+
"swe_bench": {
|
765 |
+
"mean": null,
|
766 |
+
"log_url": null
|
767 |
+
}
|
768 |
+
}
|
769 |
+
},
|
770 |
+
"o1": {
|
771 |
+
"config": {
|
772 |
+
"model_name": "o1",
|
773 |
+
"model_sha": "https://openai.com/o1",
|
774 |
+
"model_dtype": "torch.float16"
|
775 |
+
},
|
776 |
+
"results": {
|
777 |
+
"winogrande": {
|
778 |
+
"accuracy": 0.9392265193370166,
|
779 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.eval"
|
780 |
+
},
|
781 |
+
"humaneval": {
|
782 |
+
"mean": 0.9695121951219512,
|
783 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.eval"
|
784 |
+
},
|
785 |
+
"mmmu_open": {
|
786 |
+
"accuracy": 0.6981132075471698,
|
787 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T22-48-09-05-00_mmmu-open_oBzxJBYbvnktbbAwhoCrYK.eval"
|
788 |
+
},
|
789 |
+
"math": {
|
790 |
+
"accuracy": 0.959,
|
791 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T15-03-22-05-00_math_6BbvHFF8hLMsVYozyNLbyQ.eval"
|
792 |
+
},
|
793 |
+
"arc_easy": {
|
794 |
+
"accuracy": 0.9911616161616161,
|
795 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-29-26-05-00_arc-easy_DFbir4BdgQDbKd52r7tRKR.eval"
|
796 |
+
},
|
797 |
+
"arc_challenge": {
|
798 |
+
"accuracy": 0.9786689419795221,
|
799 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-44-42-05-00_arc-challenge_PsWXaBqrgv3EcTZC55gRzJ.eval"
|
800 |
+
},
|
801 |
+
"gsm8k": {
|
802 |
+
"accuracy": 0.9416224412433661,
|
803 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T12-56-38-05-00_gsm8k_iD8275qeyNTgX523pn45bF.eval"
|
804 |
+
},
|
805 |
+
"gpqa_diamond": {
|
806 |
+
"accuracy": 0.7550505050505051,
|
807 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-53-53-05-00_gpqa-diamond_EJV7ULFSQLRoFTEqsv3t6q.eval"
|
808 |
+
},
|
809 |
+
"mmlu_pro": {
|
810 |
+
"accuracy": 0.8447473404255319,
|
811 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.eval"
|
812 |
+
},
|
813 |
+
"mmmu_multiple_choice": {
|
814 |
+
"accuracy": 0.8063754427390791,
|
815 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.eval"
|
816 |
+
},
|
817 |
+
"drop": {
|
818 |
+
"mean": null,
|
819 |
+
"log_url": null
|
820 |
+
},
|
821 |
+
"hellaswag": {
|
822 |
+
"accuracy": null,
|
823 |
+
"log_url": null
|
824 |
+
},
|
825 |
+
"ifeval": {
|
826 |
+
"final_acc": null,
|
827 |
+
"log_url": null
|
828 |
+
},
|
829 |
+
"mmlu": {
|
830 |
+
"accuracy": null,
|
831 |
+
"log_url": null
|
832 |
+
},
|
833 |
+
"gaia": {
|
834 |
+
"accuracy": 0.41090909090909084,
|
835 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T13-42-00-05-00_o1_gaia_merged.eval"
|
836 |
+
},
|
837 |
+
"gdm_intercode_ctf": {
|
838 |
+
"accuracy": 0.8481012658227849,
|
839 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T20-46-35+00-00_o1_gdm-intercode-ctf_merged.eval"
|
840 |
+
},
|
841 |
+
"gdm_in_house_ctf": {
|
842 |
+
"accuracy": 0.46153846153846156,
|
843 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T05-52-25+00-00_o1_gdm-in-house-ctf.eval"
|
844 |
+
},
|
845 |
+
"agentharm": {
|
846 |
+
"avg_score": 0.08782061688311688,
|
847 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T09-05-42-08-00_agentharm_UGDq2yJeLAnPH6p7FgDgD8.eval"
|
848 |
+
},
|
849 |
+
"agentharm_benign": {
|
850 |
+
"avg_score": 0.7235176849665487,
|
851 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T18-20-15-08-00_agentharm-benign_bkW2Bf5xLyDQdNtfLdjCpJ.eval"
|
852 |
+
},
|
853 |
+
"swe_bench": {
|
854 |
+
"mean": 0.0036,
|
855 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T17-42-11+00-00_openai-o1_swe.eval "
|
856 |
+
}
|
857 |
+
}
|
858 |
+
},
|
859 |
+
"o3-mini": {
|
860 |
+
"config": {
|
861 |
+
"model_name": "o3-mini",
|
862 |
+
"model_sha": "https://openai.com/index/openai-o3-mini",
|
863 |
+
"model_dtype": "torch.float16"
|
864 |
+
},
|
865 |
+
"results": {
|
866 |
+
"math": {
|
867 |
+
"accuracy": 0.9691320905993185,
|
868 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T18-33-30-05-00_math_86Gx8n4BxhpyfaSHmRcCUm.eval"
|
869 |
+
},
|
870 |
+
"humaneval": {
|
871 |
+
"mean": 0.9817073170731707,
|
872 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T20-58-48-05-00_humaneval_Dkod7CS9RmbbogYx9aEXtx.eval"
|
873 |
+
},
|
874 |
+
"mmlu_pro": {
|
875 |
+
"accuracy": 0.7924606807023383,
|
876 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T19-49-27-05-00_mmlu-pro_jz9woKfdKt8VMzqNFsy7kY.eval"
|
877 |
+
},
|
878 |
+
"gpqa_diamond": {
|
879 |
+
"accuracy": 0.7365319865319865,
|
880 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-57-54-05-00_gpqa-diamond_2znyMtdc7X4LJufxXeXA8Z.eval"
|
881 |
+
},
|
882 |
+
"winogrande": {
|
883 |
+
"accuracy": 0.8492501973164956,
|
884 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T22-50-40-05-00_winogrande_VsTW2uU2Kj66YoNoFfRfUj.eval"
|
885 |
+
},
|
886 |
+
"gsm8k": {
|
887 |
+
"accuracy": 0.9454131918119788,
|
888 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T18-23-05-05-00_gsm8k_d523pJzkcvobxamhhobCRb.eval"
|
889 |
+
},
|
890 |
+
"arc_challenge": {
|
891 |
+
"accuracy": 0.9641638225255973,
|
892 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-53-30-05-00_arc-challenge_AYFHec7wmd4jELF2Rgzfya.eval"
|
893 |
+
},
|
894 |
+
"arc_easy": {
|
895 |
+
"accuracy": 0.9755892255892256,
|
896 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-45-57-05-00_arc-easy_Nd8NP3K48tvwLVZb8kXDwg.eval"
|
897 |
+
},
|
898 |
+
"drop": {
|
899 |
+
"mean": null,
|
900 |
+
"log_url": null
|
901 |
+
},
|
902 |
+
"hellaswag": {
|
903 |
+
"accuracy": null,
|
904 |
+
"log_url": null
|
905 |
+
},
|
906 |
+
"ifeval": {
|
907 |
+
"final_acc": null,
|
908 |
+
"log_url": null
|
909 |
+
},
|
910 |
+
"mmlu": {
|
911 |
+
"accuracy": null,
|
912 |
+
"log_url": null
|
913 |
+
},
|
914 |
+
"mmmu_multiple_choice": {
|
915 |
+
"accuracy": null,
|
916 |
+
"log_url": null
|
917 |
+
},
|
918 |
+
"mmmu_open": {
|
919 |
+
"accuracy": null,
|
920 |
+
"log_url": null
|
921 |
+
},
|
922 |
+
"gaia": {
|
923 |
+
"accuracy": 0.27030303030303043,
|
924 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-02-05T23-21-20+00-00_gaia_hyMq8MzMm6NgAeq3dNqZSU.eval"
|
925 |
+
},
|
926 |
+
"gdm_intercode_ctf": {
|
927 |
+
"accuracy": 0.8278481012658225,
|
928 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-05T21-43-18+00-00_gdm-intercode-ctf_gdm29C6DuTEsX9qm9ymmrC.eval"
|
929 |
+
},
|
930 |
+
"gdm_in_house_ctf": {
|
931 |
+
"accuracy": 0.38461538461538464,
|
932 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-05T23-59-08+00-00_gdm-in-house-ctf_2zkAX5nkJoxDnVKpJL9VgW.eval"
|
933 |
+
},
|
934 |
+
"agentharm": {
|
935 |
+
"avg_score": 0.1241931080283353,
|
936 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T18-17-03-08-00_agentharm_DmN6i5HrgXHNARjsuSewjg.eval"
|
937 |
+
},
|
938 |
+
"agentharm_benign": {
|
939 |
+
"avg_score": 0.5429306867375049,
|
940 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T18-49-08-08-00_agentharm-benign_Gv94YFpAXaaCJqe3Fc6yr3.eval"
|
941 |
+
},
|
942 |
+
"swe_bench": {
|
943 |
+
"mean": 0.0024,
|
944 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T06-49-09+00-00_openai-o3-mini_swe.eval"
|
945 |
+
}
|
946 |
+
}
|
947 |
+
}
|
948 |
+
}
|
data/results.json.bak
ADDED
@@ -0,0 +1,760 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"DeepSeek-R1": {
|
3 |
+
"config": {
|
4 |
+
"model_name": "DeepSeek-R1",
|
5 |
+
"model_sha": "https://api-docs.deepseek.com/news/news250120",
|
6 |
+
"model_dtype": "torch.float16"
|
7 |
+
},
|
8 |
+
"results": {
|
9 |
+
"mmlu_pro": {
|
10 |
+
"accuracy": 0.8382646276595744
|
11 |
+
},
|
12 |
+
"humaneval": {
|
13 |
+
"mean": 0.9567901234567902
|
14 |
+
},
|
15 |
+
"math": {
|
16 |
+
"accuracy": 0.9272
|
17 |
+
},
|
18 |
+
"gsm8k": {
|
19 |
+
"accuracy": 0.954510993176649
|
20 |
+
},
|
21 |
+
"arc_challenge": {
|
22 |
+
"accuracy": 0.9667235494880546
|
23 |
+
},
|
24 |
+
"winogrande": {
|
25 |
+
"accuracy": 0.9179163378058406
|
26 |
+
},
|
27 |
+
"arc_easy": {
|
28 |
+
"accuracy": 0.9873737373737373
|
29 |
+
},
|
30 |
+
"gpqa_diamond": {
|
31 |
+
"accuracy": 0.7045454545454546
|
32 |
+
},
|
33 |
+
"drop": {
|
34 |
+
"mean": null
|
35 |
+
},
|
36 |
+
"hellaswag": {
|
37 |
+
"accuracy": null
|
38 |
+
},
|
39 |
+
"ifeval": {
|
40 |
+
"final_acc": null
|
41 |
+
},
|
42 |
+
"mmlu": {
|
43 |
+
"accuracy": null
|
44 |
+
},
|
45 |
+
"mmmu_multiple_choice": {
|
46 |
+
"accuracy": null
|
47 |
+
},
|
48 |
+
"mmmu_open": {
|
49 |
+
"accuracy": null
|
50 |
+
},
|
51 |
+
"gaia": {
|
52 |
+
"accuracy": null
|
53 |
+
},
|
54 |
+
"gdm_intercode_ctf": {
|
55 |
+
"accuracy": null
|
56 |
+
},
|
57 |
+
"gdm_in_house_ctf": {
|
58 |
+
"accuracy": null
|
59 |
+
},
|
60 |
+
"agentharm": {
|
61 |
+
"avg_score": null
|
62 |
+
},
|
63 |
+
"agentharm_benign": {
|
64 |
+
"avg_score": null
|
65 |
+
},
|
66 |
+
"swe_bench": {
|
67 |
+
"mean": null
|
68 |
+
}
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"Meta-Llama-3.1-70B-Instruct": {
|
72 |
+
"config": {
|
73 |
+
"model_name": "Meta-Llama-3.1-70B-Instruct",
|
74 |
+
"model_sha": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
|
75 |
+
"model_dtype": "torch.float16"
|
76 |
+
},
|
77 |
+
"results": {
|
78 |
+
"hellaswag": {
|
79 |
+
"accuracy": 0.869946225851424
|
80 |
+
},
|
81 |
+
"drop": {
|
82 |
+
"mean": 0.8811263765076035
|
83 |
+
},
|
84 |
+
"gpqa_diamond": {
|
85 |
+
"accuracy": 0.4318181818181818
|
86 |
+
},
|
87 |
+
"winogrande": {
|
88 |
+
"accuracy": 0.8666140489344909
|
89 |
+
},
|
90 |
+
"gsm8k": {
|
91 |
+
"accuracy": 0.9469294920394238
|
92 |
+
},
|
93 |
+
"math": {
|
94 |
+
"accuracy": 0.6004
|
95 |
+
},
|
96 |
+
"ifeval": {
|
97 |
+
"final_acc": 0.8604907201780166
|
98 |
+
},
|
99 |
+
"arc_challenge": {
|
100 |
+
"accuracy": 0.9445392491467577
|
101 |
+
},
|
102 |
+
"arc_easy": {
|
103 |
+
"accuracy": 0.9823232323232324
|
104 |
+
},
|
105 |
+
"mmlu_pro": {
|
106 |
+
"accuracy": 0.6688829787234043
|
107 |
+
},
|
108 |
+
"humaneval": {
|
109 |
+
"mean": 0.7865853658536586
|
110 |
+
},
|
111 |
+
"mmlu": {
|
112 |
+
"accuracy": 0.8033755875231449
|
113 |
+
},
|
114 |
+
"mmmu_multiple_choice": {
|
115 |
+
"accuracy": null
|
116 |
+
},
|
117 |
+
"mmmu_open": {
|
118 |
+
"accuracy": null
|
119 |
+
},
|
120 |
+
"gaia": {
|
121 |
+
"accuracy": null
|
122 |
+
},
|
123 |
+
"gdm_intercode_ctf": {
|
124 |
+
"accuracy": null
|
125 |
+
},
|
126 |
+
"gdm_in_house_ctf": {
|
127 |
+
"accuracy": null
|
128 |
+
},
|
129 |
+
"agentharm": {
|
130 |
+
"avg_score": null
|
131 |
+
},
|
132 |
+
"agentharm_benign": {
|
133 |
+
"avg_score": null
|
134 |
+
},
|
135 |
+
"swe_bench": {
|
136 |
+
"mean": null
|
137 |
+
}
|
138 |
+
}
|
139 |
+
},
|
140 |
+
"Mistral-Large-Instruct-2407": {
|
141 |
+
"config": {
|
142 |
+
"model_name": "Mistral-Large-Instruct-2407",
|
143 |
+
"model_sha": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
|
144 |
+
"model_dtype": "torch.float16"
|
145 |
+
},
|
146 |
+
"results": {
|
147 |
+
"drop": {
|
148 |
+
"mean": 0.7424257996853698
|
149 |
+
},
|
150 |
+
"ifeval": {
|
151 |
+
"final_acc": 0.8285172231900246
|
152 |
+
},
|
153 |
+
"mmlu": {
|
154 |
+
"accuracy": 0.8035892323030908
|
155 |
+
},
|
156 |
+
"gpqa_diamond": {
|
157 |
+
"accuracy": 0.4734848484848485
|
158 |
+
},
|
159 |
+
"gsm8k": {
|
160 |
+
"accuracy": 0.9378316906747536
|
161 |
+
},
|
162 |
+
"math": {
|
163 |
+
"accuracy": 0.6574
|
164 |
+
},
|
165 |
+
"arc_easy": {
|
166 |
+
"accuracy": 0.9852693602693603
|
167 |
+
},
|
168 |
+
"mmlu_pro": {
|
169 |
+
"accuracy": 0.6942320478723404
|
170 |
+
},
|
171 |
+
"humaneval": {
|
172 |
+
"mean": 0.8658536585365854
|
173 |
+
},
|
174 |
+
"hellaswag": {
|
175 |
+
"accuracy": 0.9047998406691894
|
176 |
+
},
|
177 |
+
"arc_challenge": {
|
178 |
+
"accuracy": 0.9436860068259386
|
179 |
+
},
|
180 |
+
"winogrande": {
|
181 |
+
"accuracy": 0.8547750591949487
|
182 |
+
},
|
183 |
+
"mmmu_multiple_choice": {
|
184 |
+
"accuracy": null
|
185 |
+
},
|
186 |
+
"mmmu_open": {
|
187 |
+
"accuracy": null
|
188 |
+
},
|
189 |
+
"gaia": {
|
190 |
+
"accuracy": null
|
191 |
+
},
|
192 |
+
"gdm_intercode_ctf": {
|
193 |
+
"accuracy": null
|
194 |
+
},
|
195 |
+
"gdm_in_house_ctf": {
|
196 |
+
"accuracy": null
|
197 |
+
},
|
198 |
+
"agentharm": {
|
199 |
+
"avg_score": null
|
200 |
+
},
|
201 |
+
"agentharm_benign": {
|
202 |
+
"avg_score": null
|
203 |
+
},
|
204 |
+
"swe_bench": {
|
205 |
+
"mean": null
|
206 |
+
}
|
207 |
+
}
|
208 |
+
},
|
209 |
+
"c4ai-command-r-plus": {
|
210 |
+
"config": {
|
211 |
+
"model_name": "c4ai-command-r-plus",
|
212 |
+
"model_sha": "https://huggingface.co/CohereForAI/c4ai-command-r-plus"
|
213 |
+
},
|
214 |
+
"results": {
|
215 |
+
"ifeval": {
|
216 |
+
"final_acc": 0.7779591483929307
|
217 |
+
},
|
218 |
+
"winogrande": {
|
219 |
+
"accuracy": 0.7490134175217048
|
220 |
+
},
|
221 |
+
"arc_challenge": {
|
222 |
+
"accuracy": 0.8506825938566553
|
223 |
+
},
|
224 |
+
"drop": {
|
225 |
+
"mean": 0.743557420031463
|
226 |
+
},
|
227 |
+
"math": {
|
228 |
+
"accuracy": 0.2626
|
229 |
+
},
|
230 |
+
"gpqa_diamond": {
|
231 |
+
"accuracy": 0.3194444444444444
|
232 |
+
},
|
233 |
+
"mmlu_pro": {
|
234 |
+
"accuracy": 0.441156914893617
|
235 |
+
},
|
236 |
+
"humaneval": {
|
237 |
+
"mean": 0.6219512195121951
|
238 |
+
},
|
239 |
+
"gsm8k": {
|
240 |
+
"accuracy": 0.7816527672479151
|
241 |
+
},
|
242 |
+
"hellaswag": {
|
243 |
+
"accuracy": 0.7954590718980283
|
244 |
+
},
|
245 |
+
"mmlu": {
|
246 |
+
"accuracy": 0.695128899017234
|
247 |
+
},
|
248 |
+
"arc_easy": {
|
249 |
+
"accuracy": 0.9377104377104377
|
250 |
+
},
|
251 |
+
"mmmu_multiple_choice": {
|
252 |
+
"accuracy": null
|
253 |
+
},
|
254 |
+
"mmmu_open": {
|
255 |
+
"accuracy": null
|
256 |
+
},
|
257 |
+
"gaia": {
|
258 |
+
"accuracy": null
|
259 |
+
},
|
260 |
+
"gdm_intercode_ctf": {
|
261 |
+
"accuracy": null
|
262 |
+
},
|
263 |
+
"gdm_in_house_ctf": {
|
264 |
+
"accuracy": null
|
265 |
+
},
|
266 |
+
"agentharm": {
|
267 |
+
"avg_score": null
|
268 |
+
},
|
269 |
+
"agentharm_benign": {
|
270 |
+
"avg_score": null
|
271 |
+
},
|
272 |
+
"swe_bench": {
|
273 |
+
"mean": null
|
274 |
+
}
|
275 |
+
}
|
276 |
+
},
|
277 |
+
"claude-3-5-sonnet-20241022": {
|
278 |
+
"config": {
|
279 |
+
"model_name": "claude-3-5-sonnet-20241022",
|
280 |
+
"model_sha": "https://www.anthropic.com/claude/sonnet",
|
281 |
+
"model_dtype": "torch.float16"
|
282 |
+
},
|
283 |
+
"results": {
|
284 |
+
"mmmu_multiple_choice": {
|
285 |
+
"accuracy": 0.6481700118063755
|
286 |
+
},
|
287 |
+
"mmlu_pro": {
|
288 |
+
"accuracy": 0.7762632978723404
|
289 |
+
},
|
290 |
+
"hellaswag": {
|
291 |
+
"accuracy": 0.9228241386178052
|
292 |
+
},
|
293 |
+
"gpqa_diamond": {
|
294 |
+
"accuracy": 0.6098484848484849
|
295 |
+
},
|
296 |
+
"gsm8k": {
|
297 |
+
"accuracy": 0.9620924943138741
|
298 |
+
},
|
299 |
+
"mmmu_open": {
|
300 |
+
"accuracy": 0.41509433962264153
|
301 |
+
},
|
302 |
+
"arc_easy": {
|
303 |
+
"accuracy": 0.9915824915824916
|
304 |
+
},
|
305 |
+
"arc_challenge": {
|
306 |
+
"accuracy": 0.9692832764505119
|
307 |
+
},
|
308 |
+
"mmlu": {
|
309 |
+
"accuracy": 0.8665432274604757
|
310 |
+
},
|
311 |
+
"math": {
|
312 |
+
"accuracy": 0.7942
|
313 |
+
},
|
314 |
+
"ifeval": {
|
315 |
+
"final_acc": 0.8958114469607309
|
316 |
+
},
|
317 |
+
"humaneval": {
|
318 |
+
"mean": 0.9451219512195121
|
319 |
+
},
|
320 |
+
"winogrande": {
|
321 |
+
"accuracy": 0.9021310181531176
|
322 |
+
},
|
323 |
+
"drop": {
|
324 |
+
"mean": 0.8977608809648663
|
325 |
+
},
|
326 |
+
"gaia": {
|
327 |
+
"accuracy": 0.3381818181818182
|
328 |
+
},
|
329 |
+
"gdm_intercode_ctf": {
|
330 |
+
"accuracy": 0.8556962025316455
|
331 |
+
},
|
332 |
+
"gdm_in_house_ctf": {
|
333 |
+
"accuracy": 0.6153846153846154
|
334 |
+
},
|
335 |
+
"agentharm": {
|
336 |
+
"avg_score": 0.14767992424242424
|
337 |
+
},
|
338 |
+
"agentharm_benign": {
|
339 |
+
"avg_score": 0.800704570051161
|
340 |
+
},
|
341 |
+
"swe_bench": {
|
342 |
+
"mean": 0.0672
|
343 |
+
}
|
344 |
+
}
|
345 |
+
},
|
346 |
+
"gemini-1.5-flash": {
|
347 |
+
"config": {
|
348 |
+
"model_name": "gemini-1.5-flash",
|
349 |
+
"model_sha": "https://deepmind.google/technologies/gemini/flash",
|
350 |
+
"model_dtype": "torch.float16"
|
351 |
+
},
|
352 |
+
"results": {
|
353 |
+
"gpqa_diamond": {
|
354 |
+
"accuracy": 0.40404040404040403
|
355 |
+
},
|
356 |
+
"arc_challenge": {
|
357 |
+
"accuracy": 0.9308873720136519
|
358 |
+
},
|
359 |
+
"math": {
|
360 |
+
"accuracy": 0.452
|
361 |
+
},
|
362 |
+
"mmmu_open": {
|
363 |
+
"accuracy": 0.16981132075471697
|
364 |
+
},
|
365 |
+
"drop": {
|
366 |
+
"mean": 0.751044572627163
|
367 |
+
},
|
368 |
+
"mmlu_pro": {
|
369 |
+
"accuracy": 0.5993184840425532
|
370 |
+
},
|
371 |
+
"ifeval": {
|
372 |
+
"final_acc": 0.7681296737102001
|
373 |
+
},
|
374 |
+
"hellaswag": {
|
375 |
+
"accuracy": 0.8557060346544513
|
376 |
+
},
|
377 |
+
"winogrande": {
|
378 |
+
"accuracy": 0.7884767166535123
|
379 |
+
},
|
380 |
+
"humaneval": {
|
381 |
+
"mean": 0.7439024390243902
|
382 |
+
},
|
383 |
+
"arc_easy": {
|
384 |
+
"accuracy": 0.984006734006734
|
385 |
+
},
|
386 |
+
"gsm8k": {
|
387 |
+
"accuracy": 0.8582259287338894
|
388 |
+
},
|
389 |
+
"mmlu": {
|
390 |
+
"accuracy": 0.7714713003845606
|
391 |
+
},
|
392 |
+
"mmmu_multiple_choice": {
|
393 |
+
"accuracy": 0.5702479338842975
|
394 |
+
},
|
395 |
+
"gaia": {
|
396 |
+
"accuracy": null
|
397 |
+
},
|
398 |
+
"gdm_intercode_ctf": {
|
399 |
+
"accuracy": null
|
400 |
+
},
|
401 |
+
"gdm_in_house_ctf": {
|
402 |
+
"accuracy": null
|
403 |
+
},
|
404 |
+
"agentharm": {
|
405 |
+
"avg_score": null
|
406 |
+
},
|
407 |
+
"agentharm_benign": {
|
408 |
+
"avg_score": null
|
409 |
+
},
|
410 |
+
"swe_bench": {
|
411 |
+
"mean": null
|
412 |
+
}
|
413 |
+
}
|
414 |
+
},
|
415 |
+
"gemini-1.5-pro": {
|
416 |
+
"config": {
|
417 |
+
"model_name": "gemini-1.5-pro",
|
418 |
+
"model_sha": "https://deepmind.google/technologies/gemini/pro",
|
419 |
+
"model_dtype": "torch.float16"
|
420 |
+
},
|
421 |
+
"results": {
|
422 |
+
"mmlu": {
|
423 |
+
"accuracy": 0.8467454778521578
|
424 |
+
},
|
425 |
+
"humaneval": {
|
426 |
+
"mean": 0.8719512195121951
|
427 |
+
},
|
428 |
+
"mmmu_multiple_choice": {
|
429 |
+
"accuracy": 0.6304604486422668
|
430 |
+
},
|
431 |
+
"mmlu_pro": {
|
432 |
+
"accuracy": 0.7563996010638298
|
433 |
+
},
|
434 |
+
"math": {
|
435 |
+
"accuracy": 0.852
|
436 |
+
},
|
437 |
+
"arc_easy": {
|
438 |
+
"accuracy": 0.9877946127946128
|
439 |
+
},
|
440 |
+
"mmmu_open": {
|
441 |
+
"accuracy": 0.3584905660377358
|
442 |
+
},
|
443 |
+
"gsm8k": {
|
444 |
+
"accuracy": 0.9613343442001516
|
445 |
+
},
|
446 |
+
"gpqa_diamond": {
|
447 |
+
"accuracy": 0.5782828282828283
|
448 |
+
},
|
449 |
+
"ifeval": {
|
450 |
+
"final_acc": 0.8982344623377084
|
451 |
+
},
|
452 |
+
"winogrande": {
|
453 |
+
"accuracy": 0.8768745067087609
|
454 |
+
},
|
455 |
+
"arc_challenge": {
|
456 |
+
"accuracy": 0.9633105802047781
|
457 |
+
},
|
458 |
+
"drop": {
|
459 |
+
"mean": 0.8800912427897221
|
460 |
+
},
|
461 |
+
"hellaswag": {
|
462 |
+
"accuracy": 0.9123680541724756
|
463 |
+
},
|
464 |
+
"gaia": {
|
465 |
+
"accuracy": 0.13818181818181818
|
466 |
+
},
|
467 |
+
"gdm_intercode_ctf": {
|
468 |
+
"accuracy": 0.5291139240506328
|
469 |
+
},
|
470 |
+
"gdm_in_house_ctf": {
|
471 |
+
"accuracy": 0.23076923076923078
|
472 |
+
},
|
473 |
+
"agentharm": {
|
474 |
+
"avg_score": 0.2898649645808737
|
475 |
+
},
|
476 |
+
"agentharm_benign": {
|
477 |
+
"avg_score": 0.5961489079102715
|
478 |
+
},
|
479 |
+
"swe_bench": {
|
480 |
+
"mean": 0.004
|
481 |
+
}
|
482 |
+
}
|
483 |
+
},
|
484 |
+
"gpt-4o": {
|
485 |
+
"config": {
|
486 |
+
"model_name": "gpt-4o",
|
487 |
+
"model_sha": "https://openai.com/index/hello-gpt-4o",
|
488 |
+
"model_dtype": "torch.float16"
|
489 |
+
},
|
490 |
+
"results": {
|
491 |
+
"gpqa_diamond": {
|
492 |
+
"accuracy": 0.51010101010101
|
493 |
+
},
|
494 |
+
"arc_challenge": {
|
495 |
+
"accuracy": 0.9633105802047781
|
496 |
+
},
|
497 |
+
"gsm8k": {
|
498 |
+
"accuracy": 0.9446550416982562
|
499 |
+
},
|
500 |
+
"mmlu": {
|
501 |
+
"accuracy": 0.8435408061529697
|
502 |
+
},
|
503 |
+
"ifeval": {
|
504 |
+
"final_acc": 0.8780386042367585
|
505 |
+
},
|
506 |
+
"mmlu_pro": {
|
507 |
+
"accuracy": 0.7450964095744681
|
508 |
+
},
|
509 |
+
"mmmu_open": {
|
510 |
+
"accuracy": 0.3584905660377358
|
511 |
+
},
|
512 |
+
"winogrande": {
|
513 |
+
"accuracy": 0.9013417521704814
|
514 |
+
},
|
515 |
+
"drop": {
|
516 |
+
"mean": 0.7511693759832198
|
517 |
+
},
|
518 |
+
"arc_easy": {
|
519 |
+
"accuracy": 0.9915824915824916
|
520 |
+
},
|
521 |
+
"mmmu_multiple_choice": {
|
522 |
+
"accuracy": 0.5903187721369539
|
523 |
+
},
|
524 |
+
"humaneval": {
|
525 |
+
"mean": 0.9085365853658537
|
526 |
+
},
|
527 |
+
"math": {
|
528 |
+
"accuracy": 0.7054
|
529 |
+
},
|
530 |
+
"hellaswag": {
|
531 |
+
"accuracy": 0.924317864967138
|
532 |
+
},
|
533 |
+
"gaia": {
|
534 |
+
"accuracy": 0.16606060606060608
|
535 |
+
},
|
536 |
+
"gdm_intercode_ctf": {
|
537 |
+
"accuracy": 0.6379746835443038
|
538 |
+
},
|
539 |
+
"gdm_in_house_ctf": {
|
540 |
+
"accuracy": 0.23076923076923078
|
541 |
+
},
|
542 |
+
"agentharm": {
|
543 |
+
"avg_score": 0.49953844451003543
|
544 |
+
},
|
545 |
+
"agentharm_benign": {
|
546 |
+
"avg_score": 0.8249433048012594
|
547 |
+
},
|
548 |
+
"swe_bench": {
|
549 |
+
"mean": 0.012
|
550 |
+
}
|
551 |
+
}
|
552 |
+
},
|
553 |
+
"gpt-4o-mini": {
|
554 |
+
"config": {
|
555 |
+
"model_name": "gpt-4o-mini",
|
556 |
+
"model_sha": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
|
557 |
+
"model_dtype": "torch.float16"
|
558 |
+
},
|
559 |
+
"results": {
|
560 |
+
"drop": {
|
561 |
+
"mean": 0.8065915049816466
|
562 |
+
},
|
563 |
+
"humaneval": {
|
564 |
+
"mean": 0.8597560975609756
|
565 |
+
},
|
566 |
+
"gpqa_diamond": {
|
567 |
+
"accuracy": 0.3838383838383838
|
568 |
+
},
|
569 |
+
"mmmu_open": {
|
570 |
+
"accuracy": 0.18867924528301888
|
571 |
+
},
|
572 |
+
"arc_challenge": {
|
573 |
+
"accuracy": 0.9249146757679181
|
574 |
+
},
|
575 |
+
"mmlu": {
|
576 |
+
"accuracy": 0.7698333570716422
|
577 |
+
},
|
578 |
+
"hellaswag": {
|
579 |
+
"accuracy": 0.8750248954391555
|
580 |
+
},
|
581 |
+
"ifeval": {
|
582 |
+
"final_acc": 0.8419061423689144
|
583 |
+
},
|
584 |
+
"mmmu_multiple_choice": {
|
585 |
+
"accuracy": 0.5395513577331759
|
586 |
+
},
|
587 |
+
"arc_easy": {
|
588 |
+
"accuracy": 0.9793771043771043
|
589 |
+
},
|
590 |
+
"winogrande": {
|
591 |
+
"accuracy": 0.7529597474348856
|
592 |
+
},
|
593 |
+
"mmlu_pro": {
|
594 |
+
"accuracy": 0.6396276595744681
|
595 |
+
},
|
596 |
+
"math": {
|
597 |
+
"accuracy": 0.633
|
598 |
+
},
|
599 |
+
"gsm8k": {
|
600 |
+
"accuracy": 0.9181197877179682
|
601 |
+
},
|
602 |
+
"gaia": {
|
603 |
+
"accuracy": null
|
604 |
+
},
|
605 |
+
"gdm_intercode_ctf": {
|
606 |
+
"accuracy": null
|
607 |
+
},
|
608 |
+
"gdm_in_house_ctf": {
|
609 |
+
"accuracy": null
|
610 |
+
},
|
611 |
+
"agentharm": {
|
612 |
+
"avg_score": null
|
613 |
+
},
|
614 |
+
"agentharm_benign": {
|
615 |
+
"avg_score": null
|
616 |
+
},
|
617 |
+
"swe_bench": {
|
618 |
+
"mean": null
|
619 |
+
}
|
620 |
+
}
|
621 |
+
},
|
622 |
+
"o1": {
|
623 |
+
"config": {
|
624 |
+
"model_name": "o1",
|
625 |
+
"model_sha": "https://openai.com/o1",
|
626 |
+
"model_dtype": "torch.float16"
|
627 |
+
},
|
628 |
+
"results": {
|
629 |
+
"winogrande": {
|
630 |
+
"accuracy": 0.9392265193370166
|
631 |
+
},
|
632 |
+
"humaneval": {
|
633 |
+
"mean": 0.9695121951219512
|
634 |
+
},
|
635 |
+
"mmmu_open": {
|
636 |
+
"accuracy": 0.6981132075471698
|
637 |
+
},
|
638 |
+
"math": {
|
639 |
+
"accuracy": 0.959
|
640 |
+
},
|
641 |
+
"arc_easy": {
|
642 |
+
"accuracy": 0.9911616161616161
|
643 |
+
},
|
644 |
+
"arc_challenge": {
|
645 |
+
"accuracy": 0.9786689419795221
|
646 |
+
},
|
647 |
+
"gsm8k": {
|
648 |
+
"accuracy": 0.9416224412433661
|
649 |
+
},
|
650 |
+
"gpqa_diamond": {
|
651 |
+
"accuracy": 0.7550505050505051
|
652 |
+
},
|
653 |
+
"mmlu_pro": {
|
654 |
+
"accuracy": 0.8447473404255319
|
655 |
+
},
|
656 |
+
"mmmu_multiple_choice": {
|
657 |
+
"accuracy": 0.8063754427390791
|
658 |
+
},
|
659 |
+
"drop": {
|
660 |
+
"mean": null
|
661 |
+
},
|
662 |
+
"hellaswag": {
|
663 |
+
"accuracy": null
|
664 |
+
},
|
665 |
+
"ifeval": {
|
666 |
+
"final_acc": null
|
667 |
+
},
|
668 |
+
"mmlu": {
|
669 |
+
"accuracy": null
|
670 |
+
},
|
671 |
+
"gaia": {
|
672 |
+
"accuracy": 0.41090909090909084
|
673 |
+
},
|
674 |
+
"gdm_intercode_ctf": {
|
675 |
+
"accuracy": 0.8481012658227849
|
676 |
+
},
|
677 |
+
"gdm_in_house_ctf": {
|
678 |
+
"accuracy": 0.46153846153846156
|
679 |
+
},
|
680 |
+
"agentharm": {
|
681 |
+
"avg_score": 0.08782061688311688
|
682 |
+
},
|
683 |
+
"agentharm_benign": {
|
684 |
+
"avg_score": 0.7235176849665487
|
685 |
+
},
|
686 |
+
"swe_bench": {
|
687 |
+
"mean": 0.0036
|
688 |
+
}
|
689 |
+
}
|
690 |
+
},
|
691 |
+
"o3-mini": {
|
692 |
+
"config": {
|
693 |
+
"model_name": "o3-mini",
|
694 |
+
"model_sha": "https://openai.com/index/openai-o3-mini",
|
695 |
+
"model_dtype": "torch.float16"
|
696 |
+
},
|
697 |
+
"results": {
|
698 |
+
"math": {
|
699 |
+
"accuracy": 0.9691320905993185
|
700 |
+
},
|
701 |
+
"humaneval": {
|
702 |
+
"mean": 0.9817073170731707
|
703 |
+
},
|
704 |
+
"mmlu_pro": {
|
705 |
+
"accuracy": 0.7924606807023383
|
706 |
+
},
|
707 |
+
"gpqa_diamond": {
|
708 |
+
"accuracy": 0.7365319865319865
|
709 |
+
},
|
710 |
+
"winogrande": {
|
711 |
+
"accuracy": 0.8492501973164956
|
712 |
+
},
|
713 |
+
"gsm8k": {
|
714 |
+
"accuracy": 0.9454131918119788
|
715 |
+
},
|
716 |
+
"arc_challenge": {
|
717 |
+
"accuracy": 0.9641638225255973
|
718 |
+
},
|
719 |
+
"arc_easy": {
|
720 |
+
"accuracy": 0.9755892255892256
|
721 |
+
},
|
722 |
+
"drop": {
|
723 |
+
"mean": null
|
724 |
+
},
|
725 |
+
"hellaswag": {
|
726 |
+
"accuracy": null
|
727 |
+
},
|
728 |
+
"ifeval": {
|
729 |
+
"final_acc": null
|
730 |
+
},
|
731 |
+
"mmlu": {
|
732 |
+
"accuracy": null
|
733 |
+
},
|
734 |
+
"mmmu_multiple_choice": {
|
735 |
+
"accuracy": null
|
736 |
+
},
|
737 |
+
"mmmu_open": {
|
738 |
+
"accuracy": null
|
739 |
+
},
|
740 |
+
"gaia": {
|
741 |
+
"accuracy": 0.27030303030303043
|
742 |
+
},
|
743 |
+
"gdm_intercode_ctf": {
|
744 |
+
"accuracy": 0.8278481012658225
|
745 |
+
},
|
746 |
+
"gdm_in_house_ctf": {
|
747 |
+
"accuracy": 0.38461538461538464
|
748 |
+
},
|
749 |
+
"agentharm": {
|
750 |
+
"avg_score": 0.1241931080283353
|
751 |
+
},
|
752 |
+
"agentharm_benign": {
|
753 |
+
"avg_score": 0.5429306867375049
|
754 |
+
},
|
755 |
+
"swe_bench": {
|
756 |
+
"mean": 0.0024
|
757 |
+
}
|
758 |
+
}
|
759 |
+
}
|
760 |
+
}
|
data/tasks.json
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"arc_easy": {
|
3 |
+
"benchmark": "arc_easy",
|
4 |
+
"metric": "accuracy",
|
5 |
+
"display_name": "ARC-Easy",
|
6 |
+
"type": "base",
|
7 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
8 |
+
},
|
9 |
+
"arc_challenge": {
|
10 |
+
"benchmark": "arc_challenge",
|
11 |
+
"metric": "accuracy",
|
12 |
+
"display_name": "ARC-Challenge",
|
13 |
+
"type": "base",
|
14 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
15 |
+
},
|
16 |
+
"drop": {
|
17 |
+
"benchmark": "drop",
|
18 |
+
"metric": "mean",
|
19 |
+
"display_name": "DROP",
|
20 |
+
"type": "base",
|
21 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop"
|
22 |
+
},
|
23 |
+
"winogrande": {
|
24 |
+
"benchmark": "winogrande",
|
25 |
+
"metric": "accuracy",
|
26 |
+
"display_name": "WinoGrande",
|
27 |
+
"type": "base",
|
28 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande"
|
29 |
+
},
|
30 |
+
"gsm8k": {
|
31 |
+
"benchmark": "gsm8k",
|
32 |
+
"metric": "accuracy",
|
33 |
+
"display_name": "GSM8K",
|
34 |
+
"type": "base",
|
35 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k"
|
36 |
+
},
|
37 |
+
"hellaswag": {
|
38 |
+
"benchmark": "hellaswag",
|
39 |
+
"metric": "accuracy",
|
40 |
+
"display_name": "HellaSwag",
|
41 |
+
"type": "base",
|
42 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag"
|
43 |
+
},
|
44 |
+
"humaneval": {
|
45 |
+
"benchmark": "humaneval",
|
46 |
+
"metric": "mean",
|
47 |
+
"display_name": "HumanEval",
|
48 |
+
"type": "base",
|
49 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval"
|
50 |
+
},
|
51 |
+
"ifeval": {
|
52 |
+
"benchmark": "ifeval",
|
53 |
+
"metric": "final_acc",
|
54 |
+
"display_name": "IFEval",
|
55 |
+
"type": "base",
|
56 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval"
|
57 |
+
},
|
58 |
+
"math": {
|
59 |
+
"benchmark": "math",
|
60 |
+
"metric": "accuracy",
|
61 |
+
"display_name": "MATH",
|
62 |
+
"type": "base",
|
63 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics"
|
64 |
+
},
|
65 |
+
"mmlu": {
|
66 |
+
"benchmark": "mmlu",
|
67 |
+
"metric": "accuracy",
|
68 |
+
"display_name": "MMLU",
|
69 |
+
"type": "base",
|
70 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu"
|
71 |
+
},
|
72 |
+
"mmlu_pro": {
|
73 |
+
"benchmark": "mmlu_pro",
|
74 |
+
"metric": "accuracy",
|
75 |
+
"display_name": "MMLU-Pro",
|
76 |
+
"type": "base",
|
77 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro"
|
78 |
+
},
|
79 |
+
"gpqa_diamond": {
|
80 |
+
"benchmark": "gpqa_diamond",
|
81 |
+
"metric": "accuracy",
|
82 |
+
"display_name": "GPQA-Diamond",
|
83 |
+
"type": "base",
|
84 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
|
85 |
+
},
|
86 |
+
"mmmu_multiple_choice": {
|
87 |
+
"benchmark": "mmmu_multiple_choice",
|
88 |
+
"metric": "accuracy",
|
89 |
+
"display_name": "MMMU-Multiple-Choice",
|
90 |
+
"type": "base",
|
91 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
|
92 |
+
},
|
93 |
+
"mmmu_open": {
|
94 |
+
"benchmark": "mmmu_open",
|
95 |
+
"metric": "accuracy",
|
96 |
+
"display_name": "MMMU-Open-Ended",
|
97 |
+
"type": "base",
|
98 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
|
99 |
+
},
|
100 |
+
"gaia": {
|
101 |
+
"benchmark": "gaia",
|
102 |
+
"metric": "accuracy",
|
103 |
+
"display_name": "GAIA",
|
104 |
+
"type": "agentic",
|
105 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
|
106 |
+
},
|
107 |
+
"gdm_intercode_ctf": {
|
108 |
+
"benchmark": "gdm_intercode_ctf",
|
109 |
+
"metric": "accuracy",
|
110 |
+
"display_name": "InterCode-CTF",
|
111 |
+
"type": "agentic",
|
112 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf"
|
113 |
+
},
|
114 |
+
"gdm_in_house_ctf": {
|
115 |
+
"benchmark": "gdm_in_house_ctf",
|
116 |
+
"metric": "accuracy",
|
117 |
+
"display_name": "In-House-CTF",
|
118 |
+
"type": "agentic",
|
119 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf"
|
120 |
+
},
|
121 |
+
"agentharm": {
|
122 |
+
"benchmark": "agentharm",
|
123 |
+
"metric": "avg_score",
|
124 |
+
"display_name": "AgentHarm",
|
125 |
+
"type": "agentic",
|
126 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
|
127 |
+
},
|
128 |
+
"agentharm_benign": {
|
129 |
+
"benchmark": "agentharm_benign",
|
130 |
+
"metric": "avg_score",
|
131 |
+
"display_name": "AgentHarm-Benign",
|
132 |
+
"type": "agentic",
|
133 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
|
134 |
+
},
|
135 |
+
"swe_bench": {
|
136 |
+
"benchmark": "swe_bench",
|
137 |
+
"metric": "mean",
|
138 |
+
"display_name": "SWE-Bench",
|
139 |
+
"type": "agentic",
|
140 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench"
|
141 |
+
}
|
142 |
+
}
|
src/about.py
CHANGED
@@ -33,18 +33,13 @@ class Tasks(Enum):
|
|
33 |
|
34 |
# agentic
|
35 |
task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
|
36 |
-
task15 = Task("gdm_intercode_ctf", "accuracy", "
|
37 |
-
task16 = Task("gdm_in_house_ctf", "accuracy", "
|
38 |
task17 = Task("agentharm", "avg_score", "AgentHarm", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
|
39 |
task18 = Task("agentharm_benign", "avg_score", "AgentHarm-Benign", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
|
40 |
task19 = Task("swe_bench", "mean", "SWE-Bench", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench")
|
41 |
|
42 |
|
43 |
-
NUM_FEWSHOT = 0 # Change with your few shot
|
44 |
-
# ---------------------------------------------------
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
# Your leaderboard name
|
49 |
TITLE = """<h1 align="center" id="space-title">State of Evaluation Leaderboard</h1>"""
|
50 |
|
|
|
33 |
|
34 |
# agentic
|
35 |
task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
|
36 |
+
task15 = Task("gdm_intercode_ctf", "accuracy", "InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
|
37 |
+
task16 = Task("gdm_in_house_ctf", "accuracy", "In-House-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf")
|
38 |
task17 = Task("agentharm", "avg_score", "AgentHarm", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
|
39 |
task18 = Task("agentharm_benign", "avg_score", "AgentHarm-Benign", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
|
40 |
task19 = Task("swe_bench", "mean", "SWE-Bench", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench")
|
41 |
|
42 |
|
|
|
|
|
|
|
|
|
|
|
43 |
# Your leaderboard name
|
44 |
TITLE = """<h1 align="center" id="space-title">State of Evaluation Leaderboard</h1>"""
|
45 |
|
src/display/formatting.py
CHANGED
@@ -5,6 +5,8 @@ def model_hyperlink(link, model_name):
|
|
5 |
def make_clickable_model(model_name, model_sha):
|
6 |
return model_hyperlink(model_sha, model_name)
|
7 |
|
|
|
|
|
8 |
|
9 |
def styled_error(error):
|
10 |
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
|
|
|
5 |
def make_clickable_model(model_name, model_sha):
|
6 |
return model_hyperlink(model_sha, model_name)
|
7 |
|
8 |
+
def make_clickable_field(name, url):
|
9 |
+
return model_hyperlink(url, name)
|
10 |
|
11 |
def styled_error(error):
|
12 |
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
|
src/submission/submit.py
CHANGED
@@ -1,119 +1,119 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
from datetime import datetime, timezone
|
4 |
-
|
5 |
-
from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
-
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
7 |
-
from src.submission.check_validity import (
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
)
|
13 |
-
|
14 |
-
REQUESTED_MODELS = None
|
15 |
-
USERS_TO_SUBMISSION_DATES = None
|
16 |
-
|
17 |
-
def add_new_eval(
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
):
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
|
|
1 |
+
# import json
|
2 |
+
# import os
|
3 |
+
# from datetime import datetime, timezone
|
4 |
+
|
5 |
+
# from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
+
# from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
7 |
+
# from src.submission.check_validity import (
|
8 |
+
# already_submitted_models,
|
9 |
+
# check_model_card,
|
10 |
+
# get_model_size,
|
11 |
+
# is_model_on_hub,
|
12 |
+
# )
|
13 |
+
|
14 |
+
# REQUESTED_MODELS = None
|
15 |
+
# USERS_TO_SUBMISSION_DATES = None
|
16 |
+
|
17 |
+
# def add_new_eval(
|
18 |
+
# model: str,
|
19 |
+
# base_model: str,
|
20 |
+
# revision: str,
|
21 |
+
# precision: str,
|
22 |
+
# weight_type: str,
|
23 |
+
# model_type: str,
|
24 |
+
# ):
|
25 |
+
# global REQUESTED_MODELS
|
26 |
+
# global USERS_TO_SUBMISSION_DATES
|
27 |
+
# if not REQUESTED_MODELS:
|
28 |
+
# REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
29 |
+
|
30 |
+
# user_name = ""
|
31 |
+
# model_path = model
|
32 |
+
# if "/" in model:
|
33 |
+
# user_name = model.split("/")[0]
|
34 |
+
# model_path = model.split("/")[1]
|
35 |
+
|
36 |
+
# precision = precision.split(" ")[0]
|
37 |
+
# current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
38 |
+
|
39 |
+
# if model_type is None or model_type == "":
|
40 |
+
# return styled_error("Please select a model type.")
|
41 |
+
|
42 |
+
# # Does the model actually exist?
|
43 |
+
# if revision == "":
|
44 |
+
# revision = "main"
|
45 |
+
|
46 |
+
# # Is the model on the hub?
|
47 |
+
# if weight_type in ["Delta", "Adapter"]:
|
48 |
+
# base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
|
49 |
+
# if not base_model_on_hub:
|
50 |
+
# return styled_error(f'Base model "{base_model}" {error}')
|
51 |
+
|
52 |
+
# if not weight_type == "Adapter":
|
53 |
+
# model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
|
54 |
+
# if not model_on_hub:
|
55 |
+
# return styled_error(f'Model "{model}" {error}')
|
56 |
+
|
57 |
+
# # Is the model info correctly filled?
|
58 |
+
# try:
|
59 |
+
# model_info = API.model_info(repo_id=model, revision=revision)
|
60 |
+
# except Exception:
|
61 |
+
# return styled_error("Could not get your model information. Please fill it up properly.")
|
62 |
+
|
63 |
+
# model_size = get_model_size(model_info=model_info, precision=precision)
|
64 |
+
|
65 |
+
# # Were the model card and license filled?
|
66 |
+
# try:
|
67 |
+
# license = model_info.cardData["license"]
|
68 |
+
# except Exception:
|
69 |
+
# return styled_error("Please select a license for your model")
|
70 |
+
|
71 |
+
# modelcard_OK, error_msg = check_model_card(model)
|
72 |
+
# if not modelcard_OK:
|
73 |
+
# return styled_error(error_msg)
|
74 |
+
|
75 |
+
# # Seems good, creating the eval
|
76 |
+
# print("Adding new eval")
|
77 |
+
|
78 |
+
# eval_entry = {
|
79 |
+
# "model": model,
|
80 |
+
# "base_model": base_model,
|
81 |
+
# "revision": revision,
|
82 |
+
# "precision": precision,
|
83 |
+
# "weight_type": weight_type,
|
84 |
+
# "status": "PENDING",
|
85 |
+
# "submitted_time": current_time,
|
86 |
+
# "model_type": model_type,
|
87 |
+
# "likes": model_info.likes,
|
88 |
+
# "params": model_size,
|
89 |
+
# "license": license,
|
90 |
+
# "private": False,
|
91 |
+
# }
|
92 |
+
|
93 |
+
# # Check for duplicate submission
|
94 |
+
# if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
|
95 |
+
# return styled_warning("This model has been already submitted.")
|
96 |
+
|
97 |
+
# print("Creating eval file")
|
98 |
+
# OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
99 |
+
# os.makedirs(OUT_DIR, exist_ok=True)
|
100 |
+
# out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
|
101 |
+
|
102 |
+
# with open(out_path, "w") as f:
|
103 |
+
# f.write(json.dumps(eval_entry))
|
104 |
+
|
105 |
+
# print("Uploading eval file")
|
106 |
+
# API.upload_file(
|
107 |
+
# path_or_fileobj=out_path,
|
108 |
+
# path_in_repo=out_path.split("eval-queue/")[1],
|
109 |
+
# repo_id=QUEUE_REPO,
|
110 |
+
# repo_type="dataset",
|
111 |
+
# commit_message=f"Add {model} to eval queue",
|
112 |
+
# )
|
113 |
+
|
114 |
+
# # Remove the local file
|
115 |
+
# os.remove(out_path)
|
116 |
+
|
117 |
+
# return styled_message(
|
118 |
+
# "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
|
119 |
+
# )
|