Presidentlin committed
Commit d4f1866 · 1 Parent(s): 5a1ab27
src/lib/benchmarks/anthropic.ts CHANGED
@@ -12,7 +12,7 @@ export const anthropicBenchmarks: Benchmark[] = [
       //terminal_bench: 43.2,
       gpqa_diamond: 79.6,
       aime_2025: 75.5,
-      //mmmlu: 88.8,
+      mmmlu: 88.8,
       mmmu: 76.5,
       // tau_bench_retail: 81.4,
       //tau_bench_airline: 59.6,
@@ -29,7 +29,7 @@ export const anthropicBenchmarks: Benchmark[] = [
       //terminal_bench: 35.5,
       gpqa_diamond: 75.4,
       aime_2025: 70.5,
-      // mmmlu: 86.5,
+      mmmlu: 86.5,
       mmmu: 74.4,
       // tau_bench_retail: 80.5,
       // tau_bench_airline: 60.0,
@@ -46,7 +46,7 @@ export const anthropicBenchmarks: Benchmark[] = [
 
       // tau_bench_retail: 81.2,
       // tau_bench_airline: 58.4,
-      // mmmlu: 86.1,
+      mmmlu: 86.1,
       mmmu: 75.0,
       aime_24: 61.3,
     },
@@ -60,7 +60,7 @@ export const anthropicBenchmarks: Benchmark[] = [
     benchmark: {
       gpqa_diamond: 68.0,
       swe_bench_verified: 62.3,
-      // mmmlu: 83.2,
+      mmmlu: 83.2,
       mmmu: 71.8,
       aime_24: 51.7, // using average of 23.3 & 80.0
     },
@@ -76,9 +76,81 @@ export const anthropicBenchmarks: Benchmark[] = [
       swe_bench_verified: 49.0,
       // tau_bench_retail: 71.5,
       // tau_bench_airline: 48.8,
-      // mmmlu: 82.1,
+      mmmlu: 82.1,
       mmmu: 70.4,
       aime_24: 16.0, // average of 16.0 & 65.4
     },
   },
+  {
+    model: "Claude 3 Opus",
+    provider: "Anthropic",
+    inputPrice: 15.0,
+    outputPrice: 75.0,
+    source: "https://www.anthropic.com/news/claude-3-opus-release",
+    benchmark: {
+      gpqa_diamond: 50.4,
+      mmmlu: 86.8,
+      mmmu: 59.4,
+      // gsm8k: 95.0,
+      // math: 60.1,
+      // mgsm: 90.7,
+      // humaneval: 84.9,
+      // drop: 83.1,
+      // big_bench_hard: 86.8,
+      // arc_challenge: 96.4,
+      // hellaswag: 95.4,
+      // mathvista: 50.5,
+      // ai2d: 88.1,
+      // chart_qa: 80.8,
+      // docvqa_anls: 89.3,
+    },
+  },
+  {
+    model: "Claude 3 Sonnet",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-3-opus-release",
+    benchmark: {
+      gpqa_diamond: 40.4,
+      mmmlu: 79.0,
+      mmmu: 53.1,
+      // gsm8k: 92.3,
+      // math: 43.1,
+      // mgsm: 83.5,
+      // humaneval: 73.0,
+      // drop: 78.9,
+      // big_bench_hard: 82.9,
+      // arc_challenge: 93.2,
+      // hellaswag: 89.0,
+      // mathvista: 47.9,
+      // ai2d: 88.7,
+      // chart_qa: 81.1,
+      // docvqa_anls: 89.5,
+    },
+  },
+  {
+    model: "Claude 3 Haiku",
+    provider: "Anthropic",
+    inputPrice: 0.25,
+    outputPrice: 1.25,
+    source: "https://www.anthropic.com/news/claude-3-opus-release",
+    benchmark: {
+      gpqa_diamond: 33.3,
+      mmmlu: 75.2,
+      mmmu: 50.2,
+      // gsm8k: 88.9,
+      // math: 38.9,
+      // mgsm: 75.1,
+      // humaneval: 75.9,
+      // drop: 78.4,
+      // big_bench_hard: 73.7,
+      // arc_challenge: 89.2,
+      // hellaswag: 85.9,
+      // mathvista: 46.4,
+      // ai2d: 86.7,
+      // chart_qa: 81.7,
+      // docvqa_anls: 88.8,
+    },
+  },
 ];
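A note on the shape these entries assume: the Benchmark type itself lives in src/lib/benchmarks/types.ts and is not shown in this commit, so the following is a minimal sketch reconstructed from the fields used above, not the repo's actual definition.

import type { BenchmarkMetric } from "./types";

// Hypothetical reconstruction of the Benchmark shape implied by the entries
// above; the real definition is in src/lib/benchmarks/types.ts.
interface Benchmark {
  model: string;
  provider: string;
  inputPrice: number;  // presumably USD per million input tokens ($15 for Claude 3 Opus)
  outputPrice: number; // presumably USD per million output tokens ($75 for Claude 3 Opus)
  source: string;      // URL the scores were taken from
  // Partial, since no model reports every metric (e.g. Claude 3 has no aime_2025)
  benchmark: Partial<Record<BenchmarkMetric, number>>;
}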
src/lib/benchmarks/types.ts CHANGED
@@ -23,6 +23,7 @@ export type BenchmarkMetric =
   // General reasoning & robustness
   | "bigbench_extra_hard"
   | "global_mmlu_lite"
+  | "mmmlu"
 
   // Optional: less frequent but still potentially useful
   | "facts_grounding"
@@ -68,6 +69,7 @@ export const benchmarkMetricOrder: BenchmarkMetric[] = [
   // // General reasoning & robustness
   // "bigbench_extra_hard",
   // "global_mmlu_lite",
+  //"mmmlu"
 
   // // Optional: less frequent but still potentially useful
   // "facts_grounding",