Commit d4f1866 · 1 parent: 5a1ab27

src/lib/benchmarks/anthropic.ts (CHANGED)
@@ -12,7 +12,7 @@ export const anthropicBenchmarks: Benchmark[] = [
       //terminal_bench: 43.2,
       gpqa_diamond: 79.6,
       aime_2025: 75.5,
-
+      mmmlu: 88.8,
       mmmu: 76.5,
       // tau_bench_retail: 81.4,
       //tau_bench_airline: 59.6,
@@ -29,7 +29,7 @@ export const anthropicBenchmarks: Benchmark[] = [
       //terminal_bench: 35.5,
       gpqa_diamond: 75.4,
       aime_2025: 70.5,
-
+      mmmlu: 86.5,
       mmmu: 74.4,
       // tau_bench_retail: 80.5,
       // tau_bench_airline: 60.0,
@@ -46,7 +46,7 @@ export const anthropicBenchmarks: Benchmark[] = [

       // tau_bench_retail: 81.2,
       // tau_bench_airline: 58.4,
-
+      mmmlu: 86.1,
       mmmu: 75.0,
       aime_24: 61.3,
     },
@@ -60,7 +60,7 @@ export const anthropicBenchmarks: Benchmark[] = [
     benchmark: {
       gpqa_diamond: 68.0,
       swe_bench_verified: 62.3,
-
+      mmmlu: 83.2,
       mmmu: 71.8,
       aime_24: 51.7, // using average of 23.3 & 80.0
     },
@@ -76,9 +76,81 @@ export const anthropicBenchmarks: Benchmark[] = [
       swe_bench_verified: 49.0,
       // tau_bench_retail: 71.5,
       // tau_bench_airline: 48.8,
-
+      mmmlu: 82.1,
       mmmu: 70.4,
       aime_24: 16.0, // average of 16.0 & 65.4
     },
   },
+  {
+    model: "Claude 3 Opus",
+    provider: "Anthropic",
+    inputPrice: 15.0,
+    outputPrice: 75.0,
+    source: "https://www.anthropic.com/news/claude-3-opus-release",
+    benchmark: {
+      gpqa_diamond: 50.4,
+      mmmlu: 86.8,
+      mmmu: 59.4,
+      // gsm8k: 95.0,
+      // math: 60.1,
+      // mgsm: 90.7,
+      // humaneval: 84.9,
+      // drop: 83.1,
+      // big_bench_hard: 86.8,
+      // arc_challenge: 96.4,
+      // hellaswag: 95.4,
+      // mathvista: 50.5,
+      // ai2d: 88.1,
+      // chart_qa: 80.8,
+      // docvqa_anls: 89.3,
+    },
+  },
+  {
+    model: "Claude 3 Sonnet",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-3-opus-release",
+    benchmark: {
+      gpqa_diamond: 40.4,
+      mmmlu: 79.0,
+      mmmu: 53.1,
+      // gsm8k: 92.3,
+      // math: 43.1,
+      // mgsm: 83.5,
+      // humaneval: 73.0,
+      // drop: 78.9,
+      // big_bench_hard: 82.9,
+      // arc_challenge: 93.2,
+      // hellaswag: 89.0,
+      // mathvista: 47.9,
+      // ai2d: 88.7,
+      // chart_qa: 81.1,
+      // docvqa_anls: 89.5,
+    },
+  },
+  {
+    model: "Claude 3 Haiku",
+    provider: "Anthropic",
+    inputPrice: 0.25,
+    outputPrice: 1.25,
+    source: "https://www.anthropic.com/news/claude-3-opus-release",
+    benchmark: {
+      gpqa_diamond: 33.3,
+      mmmlu: 75.2,
+      mmmu: 50.2,
+      // gsm8k: 88.9,
+      // math: 38.9,
+      // mgsm: 75.1,
+      // humaneval: 75.9,
+      // drop: 78.4,
+      // big_bench_hard: 73.7,
+      // arc_challenge: 89.2,
+      // hellaswag: 85.9,
+      // mathvista: 46.4,
+      // ai2d: 86.7,
+      // chart_qa: 81.7,
+      // docvqa_anls: 88.8,
+    },
+  },
 ];
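For orientation, the entry shape these hunks write against can be read off the fields in use (model, provider, inputPrice, outputPrice, source, benchmark). A minimal sketch of that shape, inferred from the diff; the actual Benchmark definition lives in src/lib/benchmarks/types.ts and may differ in detail:

// Sketch only: entry shape inferred from the fields used in anthropic.ts,
// not the actual Benchmark definition from src/lib/benchmarks/types.ts.
import type { BenchmarkMetric } from "./types";

export interface Benchmark {
  model: string;
  provider: string;
  inputPrice: number;  // assumed unit: USD per million input tokens
  outputPrice: number; // assumed unit: USD per million output tokens
  source: string;      // URL the reported scores come from
  // Each model reports only a subset of metrics, so a partial record
  // over BenchmarkMetric matches the data above.
  benchmark: Partial<Record<BenchmarkMetric, number>>;
}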
src/lib/benchmarks/types.ts (CHANGED)
@@ -23,6 +23,7 @@ export type BenchmarkMetric =
   // General reasoning & robustness
   | "bigbench_extra_hard"
   | "global_mmlu_lite"
+  | "mmmlu"

   // Optional: less frequent but still potentially useful
   | "facts_grounding"
@@ -68,6 +69,7 @@ export const benchmarkMetricOrder: BenchmarkMetric[] = [
   // // General reasoning & robustness
   // "bigbench_extra_hard",
   // "global_mmlu_lite",
+  //"mmmlu"

   // // Optional: less frequent but still potentially useful
   // "facts_grounding",
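Worth noting: "mmmlu" joins the BenchmarkMetric union, but its line in benchmarkMetricOrder stays commented out. If that array drives which metrics get rendered, the new mmmlu scores are stored but not yet surfaced. A hypothetical consumer sketch of that assumption (visibleMetrics is illustrative, not a function in this repo):

// Illustrative only: assumes benchmarkMetricOrder controls which metrics
// appear and in what order. While "mmmlu" is commented out there, the
// mmmlu scores added in anthropic.ts would not show up in this output.
import { benchmarkMetricOrder, type BenchmarkMetric } from "./types";

function visibleMetrics(
  scores: Partial<Record<BenchmarkMetric, number>>,
): [BenchmarkMetric, number][] {
  return benchmarkMetricOrder.flatMap((metric) => {
    const value = scores[metric];
    return value === undefined
      ? []
      : [[metric, value] as [BenchmarkMetric, number]];
  });
}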