Commit
·
5a1ab27
1
Parent(s):
95775cb
- src/lib/benchmarks/index.ts +2 -0
- src/lib/benchmarks/anthropic.ts +84 -0
- src/lib/benchmarks/google.ts +46 -0
src/lib/benchmarks/index.ts
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
import { Benchmark } from "./types";
|
2 |
import { xaiBenchmarks } from "./xai";
|
3 |
import { googleBenchmarks } from "./google";
|
|
|
4 |
// import other sources here as you add them
|
5 |
// import { openaiBenchmarks } from "./openai";
|
6 |
|
7 |
// Combined benchmark list: each provider module's array is spread in, in the
// order listed. The commented-out openaiBenchmarks line is a placeholder that
// is currently disabled; enable it together with its import when that module exists.
export const benchmarkData: Benchmark[] = [
|
8 |
...xaiBenchmarks,
|
9 |
...googleBenchmarks,
|
|
|
10 |
// ...openaiBenchmarks,
|
11 |
];
|
|
|
1 |
import { Benchmark } from "./types";
|
2 |
import { xaiBenchmarks } from "./xai";
|
3 |
import { googleBenchmarks } from "./google";
|
4 |
+
import { anthropicBenchmarks } from "./anthropic";
|
5 |
// import other sources here as you add them
|
6 |
// import { openaiBenchmarks } from "./openai";
|
7 |
|
8 |
// Combined benchmark list: each provider module's array is spread in, in the
// order listed. The commented-out openaiBenchmarks line is a placeholder that
// is currently disabled; enable it together with its import when that module exists.
// Every spread ends with a trailing comma so that enabling or appending a
// provider never requires touching the preceding line.
export const benchmarkData: Benchmark[] = [
|
9 |
...xaiBenchmarks,
|
10 |
...googleBenchmarks,
|
11 |
+
...anthropicBenchmarks,
|
12 |
// ...openaiBenchmarks,
|
13 |
];
|
src/lib/benchmarks/anthropic.ts
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { Benchmark } from "./types";
|
2 |
+
|
3 |
+
// Benchmark entries for Anthropic models, one object per model/configuration.
// Each entry carries the model name, provider, input/output prices, a `source`
// URL, and a `benchmark` map of named scores.
// Keys that are commented out inside `benchmark` are not part of the exported data.
// NOTE(review): price units are not stated in this file (presumably USD per
// million tokens) — confirm against the `Benchmark` type.
export const anthropicBenchmarks: Benchmark[] = [
|
4 |
+
{
|
5 |
+
model: "Claude Opus 4",
|
6 |
+
provider: "Anthropic",
|
7 |
+
inputPrice: 15.0,
|
8 |
+
outputPrice: 75.0,
|
9 |
+
source: "https://www.anthropic.com/news/claude-4",
|
10 |
+
benchmark: {
|
11 |
+
swe_bench_verified: 72.5,
|
12 |
+
//terminal_bench: 43.2,
|
13 |
+
gpqa_diamond: 79.6,
|
14 |
+
aime_2025: 75.5,
|
15 |
+
//mmmlu: 88.8,
|
16 |
+
mmmu: 76.5,
|
17 |
+
// tau_bench_retail: 81.4,
|
18 |
+
//tau_bench_airline: 59.6,
|
19 |
+
},
|
20 |
+
},
|
21 |
+
{
|
22 |
+
model: "Claude Sonnet 4",
|
23 |
+
provider: "Anthropic",
|
24 |
+
inputPrice: 3.0,
|
25 |
+
outputPrice: 15.0,
|
26 |
+
source: "https://www.anthropic.com/news/claude-4",
|
27 |
+
benchmark: {
|
28 |
+
swe_bench_verified: 72.7,
|
29 |
+
//terminal_bench: 35.5,
|
30 |
+
gpqa_diamond: 75.4,
|
31 |
+
aime_2025: 70.5,
|
32 |
+
// mmmlu: 86.5,
|
33 |
+
mmmu: 74.4,
|
34 |
+
// tau_bench_retail: 80.5,
|
35 |
+
// tau_bench_airline: 60.0,
|
36 |
+
},
|
37 |
+
},
|
38 |
+
{
|
39 |
+
model: "Claude 3.7 Sonnet (Extended Thinking 64K)",
|
40 |
+
provider: "Anthropic",
|
41 |
+
inputPrice: 3.0,
|
42 |
+
outputPrice: 15.0,
|
43 |
+
source: "https://www.anthropic.com/news/claude-3-7-sonnet",
|
44 |
+
benchmark: {
|
45 |
+
gpqa_diamond: 78.2,
|
46 |
+
|
47 |
+
// tau_bench_retail: 81.2,
|
48 |
+
// tau_bench_airline: 58.4,
|
49 |
+
// mmmlu: 86.1,
|
50 |
+
mmmu: 75.0,
|
51 |
+
aime_24: 61.3,
|
52 |
+
},
|
53 |
+
},
|
54 |
+
{
|
55 |
+
model: "Claude 3.7 Sonnet (No Extended Thinking)",
|
56 |
+
provider: "Anthropic",
|
57 |
+
inputPrice: 3.0,
|
58 |
+
outputPrice: 15.0,
|
59 |
+
source: "https://www.anthropic.com/news/claude-3-7-sonnet",
|
60 |
+
benchmark: {
|
61 |
+
gpqa_diamond: 68.0,
|
62 |
+
swe_bench_verified: 62.3,
|
63 |
+
// mmmlu: 83.2,
|
64 |
+
mmmu: 71.8,
|
65 |
+
aime_24: 51.7, // using average of 23.3 & 80.0, i.e. (23.3 + 80.0) / 2 = 51.65 rounded to one decimal. NOTE(review): confirm both figures belong to this configuration
|
66 |
+
},
|
67 |
+
},
|
68 |
+
{
|
69 |
+
model: "Claude 3.5 Sonnet (New)",
|
70 |
+
provider: "Anthropic",
|
71 |
+
inputPrice: 3.0,
|
72 |
+
outputPrice: 15.0,
|
73 |
+
source: "https://www.anthropic.com/news/claude-3-7-sonnet",
|
74 |
+
benchmark: {
|
75 |
+
gpqa_diamond: 65.0,
|
76 |
+
swe_bench_verified: 49.0,
|
77 |
+
// tau_bench_retail: 71.5,
|
78 |
+
// tau_bench_airline: 48.8,
|
79 |
+
// mmmlu: 82.1,
|
80 |
+
mmmu: 70.4,
|
81 |
+
aime_24: 16.0, // NOTE(review): previously annotated "average of 16.0 & 65.4", but that average is 40.7 while the stored value is 16.0 — confirm the intended figure against the source
|
82 |
+
},
|
83 |
+
},
|
84 |
+
];
|
src/lib/benchmarks/google.ts
CHANGED
@@ -43,6 +43,52 @@ export const googleBenchmarks: Benchmark[] = [
|
|
43 |
mmmu: 82.0,
|
44 |
},
|
45 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
{
|
47 |
model: "Gemini 2.5 Flash (Thinking-enabled, default)",
|
48 |
provider: "Google",
|
|
|
43 |
mmmu: 82.0,
|
44 |
},
|
45 |
},
|
46 |
+
|
47 |
+
{
|
48 |
+
model: "Gemini 2.5 Pro Experimental (03-25)",
|
49 |
+
provider: "Google",
|
50 |
+
inputPrice: 1.25,
|
51 |
+
outputPrice: 10.0,
|
52 |
+
source: "https://blog.google/products/gemini/gemini-2-5-pro-updates/",
|
53 |
+
benchmark: {
|
54 |
+
livecodebench_v6: 70.4,
|
55 |
+
aider_polyglot: 74.0,
|
56 |
+
swe_bench_verified: 63.8,
|
57 |
+
gpqa_diamond: 84.0,
|
58 |
+
aime_2025: 86.7,
|
59 |
+
humanitys_last_exam: 18.8,
|
60 |
+
simpleqa: 52.9,
|
61 |
+
global_mmlu_lite: 89.8,
|
62 |
+
mrcr_v2_avg_128k: 94.5,
|
63 |
+
mrcr_v2_pointwise_1m: 83.1,
|
64 |
+
mmmu: 81.7,
|
65 |
+
// vibe_eval: 69.4,
|
66 |
+
// video_mme: not reported
|
67 |
+
},
|
68 |
+
},
|
69 |
+
{
|
70 |
+
model: "Gemini 2.5 Pro Preview (05-06)",
|
71 |
+
provider: "Google",
|
72 |
+
inputPrice: 1.25,
|
73 |
+
outputPrice: 10.0,
|
74 |
+
source: "https://blog.google/products/gemini/gemini-2-5-pro-updates/",
|
75 |
+
benchmark: {
|
76 |
+
livecodebench_v6: 75.6,
|
77 |
+
aider_polyglot: 76.5,
|
78 |
+
swe_bench_verified: 63.2,
|
79 |
+
gpqa_diamond: 83.0,
|
80 |
+
aime_2025: 83.0,
|
81 |
+
humanitys_last_exam: 17.8,
|
82 |
+
simpleqa: 50.8,
|
83 |
+
global_mmlu_lite: 88.6,
|
84 |
+
mrcr_v2_avg_128k: 93.0,
|
85 |
+
mrcr_v2_pointwise_1m: 82.9,
|
86 |
+
mmmu: 79.6,
|
87 |
+
// vibe_eval: 65.6,
|
88 |
+
// video_mme: 84.8,
|
89 |
+
},
|
90 |
+
},
|
91 |
+
|
92 |
{
|
93 |
model: "Gemini 2.5 Flash (Thinking-enabled, default)",
|
94 |
provider: "Google",
|