Commit
·
d7502bf
1
Parent(s):
f911da5
- src/lib/benchmarks/anthropic.ts +90 -91
- src/lib/benchmarks/deepseek.ts +3 -3
- src/lib/benchmarks/openai.ts +8 -8
- src/lib/benchmarks/types.ts +5 -1
src/lib/benchmarks/anthropic.ts
CHANGED
@@ -1,103 +1,102 @@
|
|
1 |
import { Benchmark } from "./types";
|
2 |
|
3 |
export const anthropicBenchmarks: Benchmark[] = [
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
},
|
20 |
},
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
},
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
},
|
53 |
},
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
},
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
},
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
},
|
100 |
-
|
|
|
101 |
model: "Claude 3 Opus",
|
102 |
provider: "Anthropic",
|
103 |
inputPrice: 15.0,
|
|
|
1 |
import { Benchmark } from "./types";
|
2 |
|
3 |
export const anthropicBenchmarks: Benchmark[] = [
|
4 |
+
{
|
5 |
+
model: "Claude Opus 4",
|
6 |
+
provider: "Anthropic",
|
7 |
+
inputPrice: 15.0,
|
8 |
+
outputPrice: 75.0,
|
9 |
+
source: "https://www.anthropic.com/news/claude-4",
|
10 |
+
benchmark: {
|
11 |
+
swe_bench_verified: 72.5,
|
12 |
+
//terminal_bench: 43.2,
|
13 |
+
gpqa_diamond: 79.6,
|
14 |
+
aime_2025: 75.5,
|
15 |
+
mmmlu: 88.8,
|
16 |
+
mmmu: 76.5,
|
17 |
+
tau_bench_retail: 81.4,
|
18 |
+
tau_bench_airline: 59.6,
|
|
|
19 |
},
|
20 |
+
},
|
21 |
+
{
|
22 |
+
model: "Claude Sonnet 4",
|
23 |
+
provider: "Anthropic",
|
24 |
+
inputPrice: 3.0,
|
25 |
+
outputPrice: 15.0,
|
26 |
+
source: "https://www.anthropic.com/news/claude-4",
|
27 |
+
benchmark: {
|
28 |
+
swe_bench_verified: 72.7,
|
29 |
+
//terminal_bench: 35.5,
|
30 |
+
gpqa_diamond: 75.4,
|
31 |
+
aime_2025: 70.5,
|
32 |
+
mmmlu: 86.5,
|
33 |
+
mmmu: 74.4,
|
34 |
+
tau_bench_retail: 80.5,
|
35 |
+
tau_bench_airline: 60.0,
|
36 |
},
|
37 |
+
},
|
38 |
+
{
|
39 |
+
model: "Claude 3.7 Sonnet (Extended Thinking 64K)",
|
40 |
+
provider: "Anthropic",
|
41 |
+
inputPrice: 3.0,
|
42 |
+
outputPrice: 15.0,
|
43 |
+
source: "https://www.anthropic.com/news/claude-3-7-sonnet",
|
44 |
+
benchmark: {
|
45 |
+
gpqa_diamond: 78.2,
|
46 |
+
tau_bench_retail: 81.2,
|
47 |
+
tau_bench_airline: 58.4,
|
48 |
+
mmmlu: 86.1,
|
49 |
+
mmmu: 75.0,
|
50 |
+
aime_24: 61.3,
|
|
|
51 |
},
|
52 |
+
},
|
53 |
+
{
|
54 |
+
model: "Claude 3.7 Sonnet (No Extended Thinking)",
|
55 |
+
provider: "Anthropic",
|
56 |
+
inputPrice: 3.0,
|
57 |
+
outputPrice: 15.0,
|
58 |
+
source: "https://www.anthropic.com/news/claude-3-7-sonnet",
|
59 |
+
benchmark: {
|
60 |
+
gpqa_diamond: 68.0,
|
61 |
+
swe_bench_verified: 62.3,
|
62 |
+
mmmlu: 83.2,
|
63 |
+
mmmu: 71.8,
|
64 |
+
aime_24: 51.7, // using average of 23.3 & 80.0
|
65 |
},
|
66 |
+
},
|
67 |
+
{
|
68 |
+
model: "Claude 3.5 Sonnet (New)",
|
69 |
+
provider: "Anthropic",
|
70 |
+
inputPrice: 3.0,
|
71 |
+
outputPrice: 15.0,
|
72 |
+
source: "https://www.anthropic.com/news/claude-3-7-sonnet",
|
73 |
+
benchmark: {
|
74 |
+
gpqa_diamond: 65.0,
|
75 |
+
swe_bench_verified: 49.0,
|
76 |
+
tau_bench_retail: 71.5,
|
77 |
+
tau_bench_airline: 48.8,
|
78 |
+
mmmlu: 82.1,
|
79 |
+
mmmu: 70.4,
|
80 |
+
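aime_24: 16.0, // NOTE(review): 16.0 is the lower of the two reported scores (16.0 & 65.4), not their average (40.7) — confirm which is intended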
|
81 |
},
|
82 |
+
},
|
83 |
+
{
|
84 |
+
model: "Claude 3.5 Haiku",
|
85 |
+
provider: "Anthropic",
|
86 |
+
inputPrice: 3.0,
|
87 |
+
outputPrice: 15.0,
|
88 |
+
source: "https://www.anthropic.com/news/3-5-models-and-computer-use",
|
89 |
+
benchmark: {
|
90 |
+
gpqa_diamond: 41.6,
|
91 |
+
swe_bench_verified: 49.0,
|
92 |
+
tau_bench_retail: 51.0,
|
93 |
+
tau_bench_airline: 22.8,
|
94 |
+
humaneval: 88.1,
|
95 |
+
mmmlu: 65.0,
|
96 |
+
aime_24: 5.3,
|
97 |
},
|
98 |
+
},
|
99 |
+
{
|
100 |
model: "Claude 3 Opus",
|
101 |
provider: "Anthropic",
|
102 |
inputPrice: 15.0,
|
src/lib/benchmarks/deepseek.ts
CHANGED
@@ -22,8 +22,8 @@ export const deepseekBenchmarks: Benchmark[] = [
|
|
22 |
// Not in BenchmarkMetric, but useful (commented for type safety):
|
23 |
// codeforces_div1: 1930,
|
24 |
// frames: 83.0,
|
25 |
-
|
26 |
-
|
27 |
// bfcl_v3_multiturn: 37.0,
|
28 |
// cnmo_2024: 86.9,
|
29 |
// hmmt_2025: 79.4,
|
@@ -87,7 +87,7 @@ export const deepseekBenchmarks: Benchmark[] = [
|
|
87 |
{
|
88 |
model: "DeepSeek-R1",
|
89 |
provider: "DeepSeek",
|
90 |
-
|
91 |
outputPrice: 2.19,
|
92 |
benchmark: {
|
93 |
mmlu: 90.8,
|
|
|
22 |
// Not in BenchmarkMetric, but useful (commented for type safety):
|
23 |
// codeforces_div1: 1930,
|
24 |
// frames: 83.0,
|
25 |
+
tau_bench_airline: 53.5,
|
26 |
+
tau_bench_retail: 63.9,
|
27 |
// bfcl_v3_multiturn: 37.0,
|
28 |
// cnmo_2024: 86.9,
|
29 |
// hmmt_2025: 79.4,
|
|
|
87 |
{
|
88 |
model: "DeepSeek-R1",
|
89 |
provider: "DeepSeek",
|
90 |
+
inputPrice: 0.55, // Placeholder, update if pricing becomes available
|
91 |
outputPrice: 2.19,
|
92 |
benchmark: {
|
93 |
mmlu: 90.8,
|
src/lib/benchmarks/openai.ts
CHANGED
@@ -229,8 +229,8 @@ export const openaiBenchmarks: Benchmark[] = [
|
|
229 |
aider_polyglot: 81.3, // "(whole)"
|
230 |
//scale_multichallenge: 56.51,
|
231 |
//browsecomp: 8.35, // "o3 with python +browsing*"
|
232 |
-
|
233 |
-
|
234 |
},
|
235 |
},
|
236 |
{
|
@@ -267,8 +267,8 @@ export const openaiBenchmarks: Benchmark[] = [
|
|
267 |
aider_polyglot: 68.9, // "(whole)"
|
268 |
//scale_multichallenge: 42.99,
|
269 |
//browsecomp: 1.5, // "o4-mini with python +browsing** tools"
|
270 |
-
|
271 |
-
|
272 |
},
|
273 |
},
|
274 |
{
|
@@ -291,8 +291,8 @@ export const openaiBenchmarks: Benchmark[] = [
|
|
291 |
aider_polyglot: 64.4, // "(whole)"
|
292 |
//scale_multichallenge: 44.93,
|
293 |
//browsecomp: 1.94, // "4o + browsing" - this seems to be a typo in the source, likely refers to o1's browsing capability
|
294 |
-
|
295 |
-
|
296 |
},
|
297 |
},
|
298 |
{
|
@@ -313,8 +313,8 @@ export const openaiBenchmarks: Benchmark[] = [
|
|
313 |
aider_polyglot: 61.7, // "(diff)"
|
314 |
//scale_multichallenge: 39.89,
|
315 |
// BrowseComp not explicitly listed for o3-mini
|
316 |
-
|
317 |
-
|
318 |
},
|
319 |
},
|
320 |
];
|
|
|
229 |
aider_polyglot: 81.3, // "(whole)"
|
230 |
//scale_multichallenge: 56.51,
|
231 |
//browsecomp: 8.35, // "o3 with python +browsing*"
|
232 |
+
tau_bench_airline: 52.0, // "(Airline)"
|
233 |
+
tau_bench_retail: 73.9, // "(Retail)"
|
234 |
},
|
235 |
},
|
236 |
{
|
|
|
267 |
aider_polyglot: 68.9, // "(whole)"
|
268 |
//scale_multichallenge: 42.99,
|
269 |
//browsecomp: 1.5, // "o4-mini with python +browsing** tools"
|
270 |
+
tau_bench_airline: 49.2, // "(Airline)"
|
271 |
+
tau_bench_retail: 71.8, // "(Retail)"
|
272 |
},
|
273 |
},
|
274 |
{
|
|
|
291 |
aider_polyglot: 64.4, // "(whole)"
|
292 |
//scale_multichallenge: 44.93,
|
293 |
//browsecomp: 1.94, // "4o + browsing" - this seems to be a typo in the source, likely refers to o1's browsing capability
|
294 |
+
tau_bench_airline: 50.0, // "(Airline)"
|
295 |
+
tau_bench_retail: 70.8, // "(Retail)"
|
296 |
},
|
297 |
},
|
298 |
{
|
|
|
313 |
aider_polyglot: 61.7, // "(diff)"
|
314 |
//scale_multichallenge: 39.89,
|
315 |
// BrowseComp not explicitly listed for o3-mini
|
316 |
+
tau_bench_airline: 32.4, // "(Airline)"
|
317 |
+
tau_bench_retail: 57.6, // "(Retail)"
|
318 |
},
|
319 |
},
|
320 |
];
|
src/lib/benchmarks/types.ts
CHANGED
@@ -11,6 +11,8 @@ export type BenchmarkMetric =
|
|
11 |
| "aime_24"
|
12 |
| "aime_2025"
|
13 |
| "gpqa_diamond"
|
|
|
|
|
14 |
|
15 |
// Code benchmarks (frequent)
|
16 |
| "humaneval"
|
@@ -58,7 +60,9 @@ export const benchmarkMetricOrder: BenchmarkMetric[] = [
|
|
58 |
"aime_24",
|
59 |
"aime_2025",
|
60 |
"gpqa_diamond",
|
61 |
-
"aider_polyglot"
|
|
|
|
|
62 |
|
63 |
// // Code benchmarks (frequent)
|
64 |
// "humaneval",
|
|
|
11 |
| "aime_24"
|
12 |
| "aime_2025"
|
13 |
| "gpqa_diamond"
|
14 |
+
| "tau_bench_airline"
|
15 |
+
| "tau_bench_retail"
|
16 |
|
17 |
// Code benchmarks (frequent)
|
18 |
| "humaneval"
|
|
|
60 |
"aime_24",
|
61 |
"aime_2025",
|
62 |
"gpqa_diamond",
|
63 |
+
"aider_polyglot",
|
64 |
+
"tau_bench_airline",
|
65 |
+
"tau_bench_retail"
|
66 |
|
67 |
// // Code benchmarks (frequent)
|
68 |
// "humaneval",
|