Presidentlin committed on
Commit
85d193b
·
1 Parent(s): a4caafd
Files changed (1) hide show
  1. src/lib/benchmarks/google.ts +196 -164
src/lib/benchmarks/google.ts CHANGED
@@ -1,176 +1,208 @@
1
  import { Benchmark } from "./types";
2
 
3
  export const googleBenchmarks: Benchmark[] = [
4
- {
5
- model: "Gemini 2.5 Pro (Thinking-enabled, default)",
6
- provider: "Google",
7
- inputPrice: 2.5,
8
- outputPrice: 15.0,
9
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
10
- benchmark: {
11
- livecodebench_v6: 69.0,
12
- aider_polyglot: 82.2,
13
- swe_bench_verified: 67.2,
14
- gpqa_diamond: 86.4,
15
- aime_2025: 88.0,
16
- humanitys_last_exam: 21.6,
17
- simpleqa: 54.0,
18
- facts_grounding: 87.8,
19
- global_mmlu_lite: 89.2,
20
- mrcr_v2_avg_128k: 58.0,
21
- mrcr_v2_pointwise_1m: 16.4,
22
- mmmu: 82.0,
23
- },
 
24
  },
25
- {
26
- model: "Gemini 2.5 Flash (Thinking-enabled, default)",
27
- provider: "Google",
28
- inputPrice: 0.15,
29
- outputPrice: 3.5,
30
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
31
- benchmark: {
32
- livecodebench_v6: 55.4,
33
- aider_polyglot: 56.7,
34
- swe_bench_verified: 60.3,
35
- gpqa_diamond: 82.8,
36
- aime_2025: 72.0,
37
- humanitys_last_exam: 11.0,
38
- simpleqa: 26.9,
39
- facts_grounding: 85.3,
40
- global_mmlu_lite: 88.4,
41
- mrcr_v2_avg_128k: 54.3,
42
- mrcr_v2_pointwise_1m: 21.0,
43
- mmmu: 79.7,
44
- },
 
 
45
  },
46
- {
47
- model: "Gemini 2.5 Flash (Non-Thinking)",
48
- provider: "Google",
49
- inputPrice: 0.30,
50
- outputPrice: 2.50,
51
- source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
52
- benchmark: {
53
- humanitys_last_exam: 8.4,
54
- gpqa_diamond: 78.3,
55
- aime_2025: 61.6,
56
- livecodebench_v6: 41.1,
57
- aider_polyglot: 44.0,
58
- swe_bench_verified: 50.0,
59
- simpleqa: 25.8,
60
- facts_grounding: 83.4,
61
- mmmu: 76.9,
62
- //vibe_eval: 66.2,
63
- mrcr_v2_avg_128k: 34.1,
64
- mrcr_v2_pointwise_1m: 16.8,
65
- global_mmlu_lite: 85.8,
66
- },
 
 
67
  },
68
-
69
- {
70
- model: "Gemini 2.5 Flash-Lite (Non-Thinking)",
71
- provider: "Google",
72
- inputPrice: 0.10,
73
- outputPrice: 0.40,
74
- source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
75
- benchmark: {
76
- humanitys_last_exam: 5.1,
77
- gpqa_diamond: 64.6,
78
- aime_2025: 49.8,
79
- livecodebench_v6: 33.7,
80
- aider_polyglot: 26.7,
81
- swe_bench_verified: 42.6,
82
- simpleqa: 10.7,
83
- facts_grounding: 84.1,
84
- mmmu: 72.9,
85
- // vibe_eval: 51.3,
86
- mrcr_v2_avg_128k: 16.6,
87
- mrcr_v2_pointwise_1m: 4.1,
88
- global_mmlu_lite: 81.1,
89
- },
 
90
  },
91
- {
92
- model: "Gemini 2.5 Flash-Lite (Thinking)",
93
- provider: "Google",
94
- inputPrice: 0.10,
95
- outputPrice: 0.40,
96
- source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
97
- benchmark: {
98
- humanitys_last_exam: 6.9,
99
- gpqa_diamond: 66.7,
100
- aime_2025: 63.1,
101
- livecodebench_v6: 34.3,
102
- aider_polyglot: 27.1,
103
- swe_bench_verified: 44.9,
104
- simpleqa: 13.0,
105
- facts_grounding: 86.8,
106
- mmmu: 72.9,
107
- //vibe_eval: 57.5,
108
- mrcr_v2_avg_128k: 30.6,
109
- mrcr_v2_pointwise_1m: 5.4,
110
- global_mmlu_lite: 84.5,
111
- },
 
 
112
  },
113
-
114
- {
115
- model: "Gemini 2.0 Flash",
116
- provider: "Google",
117
- inputPrice: 0.1,
118
- outputPrice: 0.4,
119
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
120
- benchmark: {
121
- aime_2025: 29.7,
122
- gpqa_diamond: 65.2,
123
- simpleqa: 29.9,
124
- global_mmlu_lite: 83.4,
125
- livecodebench_v6: 29.1,
126
- mmmu: 69.3,
127
- facts_grounding: 84.6,
128
- humanitys_last_exam: 5.1,
129
- mrcr_v2_avg_128k: 19.0,
130
- mrcr_v2_pointwise_1m: 5.3,
131
- },
132
  },
133
-
134
-
135
- {
136
- model: "Gemini 1.5 Pro",
137
- provider: "Google",
138
- inputPrice: 0.015,
139
- outputPrice: 0.075,
140
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
141
- benchmark: {
142
- livecodebench_v6: 29.7,
143
- aider_polyglot: 16.9,
144
- swe_bench_verified: 34.2,
145
- gpqa_diamond: 58.1,
146
- aime_2025: 17.5,
147
- humanitys_last_exam: 4.6,
148
- simpleqa: 24.9,
149
- facts_grounding: 80.0,
150
- global_mmlu_lite: 80.8,
151
- mrcr_v2_avg_128k: 26.2,
152
- mrcr_v2_pointwise_1m: 12.1,
153
- mmmu: 67.7,
154
- },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  },
156
- {
157
- model: "Gemini 1.5 Flash",
158
- provider: "Google",
159
- inputPrice: 0.0025,
160
- outputPrice: 0.0075,
161
- source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
162
- benchmark: {
163
- livecodebench_v6: 30.3,
164
- aider_polyglot: 2.8,
165
- swe_bench_verified: 19.7,
166
- gpqa_diamond: 50.0,
167
- aime_2025: 14.7,
168
- simpleqa: 8.6,
169
- facts_grounding: 82.9,
170
- global_mmlu_lite: 72.5,
171
- mrcr_v2_avg_128k: 18.4,
172
- mrcr_v2_pointwise_1m: 10.2,
173
- mmmu: 58.3,
174
- },
 
 
175
  },
 
176
  ];
 
1
  import { Benchmark } from "./types";
2
 
3
  export const googleBenchmarks: Benchmark[] = [
4
+ {
5
+ model: "Gemini 2.5 Pro (Thinking-enabled, default)",
6
+ provider: "Google",
7
+ inputPrice: 2.5,
8
+ outputPrice: 15.0,
9
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
10
+ benchmark: {
11
+ livecodebench_v6: 69.0,
12
+ aider_polyglot: 82.2,
13
+ swe_bench_verified: 67.2,
14
+ gpqa_diamond: 86.4,
15
+ aime_2025: 88.0,
16
+ humanitys_last_exam: 21.6,
17
+ simpleqa: 54.0,
18
+ facts_grounding: 87.8,
19
+ global_mmlu_lite: 89.2,
20
+ mrcr_v2_avg_128k: 58.0,
21
+ mrcr_v2_pointwise_1m: 16.4,
22
+ mmmu: 82.0,
23
+ // loft_128k: 87.0,
24
+ // loft_1m: 69.8,
25
  },
26
+ },
27
+ {
28
+ model: "Gemini 2.5 Flash (Thinking-enabled, default)",
29
+ provider: "Google",
30
+ inputPrice: 0.15,
31
+ outputPrice: 3.5,
32
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
33
+ benchmark: {
34
+ livecodebench_v6: 55.4,
35
+ aider_polyglot: 56.7,
36
+ swe_bench_verified: 60.3,
37
+ gpqa_diamond: 82.8,
38
+ aime_2025: 72.0,
39
+ humanitys_last_exam: 11.0,
40
+ simpleqa: 26.9,
41
+ facts_grounding: 85.3,
42
+ global_mmlu_lite: 88.4,
43
+ mrcr_v2_avg_128k: 54.3,
44
+ mrcr_v2_pointwise_1m: 21.0,
45
+ mmmu: 79.7,
46
+ // loft_128k: 82.1,
47
+ // loft_1m: 58.9,
48
  },
49
+ },
50
+ {
51
+ model: "Gemini 2.5 Flash (Non-Thinking)",
52
+ provider: "Google",
53
+ inputPrice: 0.30,
54
+ outputPrice: 2.50,
55
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
56
+ benchmark: {
57
+ humanitys_last_exam: 8.4,
58
+ gpqa_diamond: 78.3,
59
+ aime_2025: 61.6,
60
+ livecodebench_v6: 41.1,
61
+ aider_polyglot: 44.0,
62
+ swe_bench_verified: 50.0,
63
+ simpleqa: 25.8,
64
+ facts_grounding: 83.4,
65
+ mmmu: 76.9,
66
+ // vibe_eval: 66.2,
67
+ mrcr_v2_avg_128k: 34.1,
68
+ mrcr_v2_pointwise_1m: 16.8,
69
+ global_mmlu_lite: 85.8,
70
+ // loft_128k: 76.2,
71
+ // loft_1m: 49.5,
72
  },
73
+ },
74
+ {
75
+ model: "Gemini 2.5 Flash-Lite (Non-Thinking)",
76
+ provider: "Google",
77
+ inputPrice: 0.10,
78
+ outputPrice: 0.40,
79
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
80
+ benchmark: {
81
+ humanitys_last_exam: 5.1,
82
+ gpqa_diamond: 64.6,
83
+ aime_2025: 49.8,
84
+ livecodebench_v6: 33.7,
85
+ aider_polyglot: 26.7,
86
+ swe_bench_verified: 42.6,
87
+ simpleqa: 10.7,
88
+ facts_grounding: 84.1,
89
+ mmmu: 72.9,
90
+ // vibe_eval: 51.3,
91
+ mrcr_v2_avg_128k: 16.6,
92
+ mrcr_v2_pointwise_1m: 4.1,
93
+ global_mmlu_lite: 81.1,
94
+ // loft_128k: 65.7,
95
+ // loft_1m: 31.1,
96
  },
97
+ },
98
+ {
99
+ model: "Gemini 2.5 Flash-Lite (Thinking)",
100
+ provider: "Google",
101
+ inputPrice: 0.10,
102
+ outputPrice: 0.40,
103
+ source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
104
+ benchmark: {
105
+ humanitys_last_exam: 6.9,
106
+ gpqa_diamond: 66.7,
107
+ aime_2025: 63.1,
108
+ livecodebench_v6: 34.3,
109
+ aider_polyglot: 27.1,
110
+ swe_bench_verified: 44.9,
111
+ simpleqa: 13.0,
112
+ facts_grounding: 86.8,
113
+ mmmu: 72.9,
114
+ // vibe_eval: 57.5,
115
+ mrcr_v2_avg_128k: 30.6,
116
+ mrcr_v2_pointwise_1m: 5.4,
117
+ global_mmlu_lite: 84.5,
118
+ // loft_128k: 67.3,
119
+ // loft_1m: 38.4,
120
  },
121
+ },
122
+ {
123
+ model: "Gemini 2.0 Flash-Lite",
124
+ provider: "Google",
125
+ inputPrice: 0.10,
126
+ outputPrice: 0.40,
127
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
128
+ benchmark: {
129
+ livecodebench_v6: 29.1,
130
+ aider_polyglot: 10.5,
131
+ swe_bench_verified: 23.1,
132
+ gpqa_diamond: 50.5,
133
+ aime_2025: 23.8,
134
+ humanitys_last_exam: 4.6,
135
+ simpleqa: 16.5,
136
+ facts_grounding: 82.4,
137
+ global_mmlu_lite: 78.0,
138
+ // loft_128k: 50.7,
139
+ // loft_1m: 7.6,
140
  },
141
+ },
142
+ {
143
+ model: "Gemini 2.0 Flash",
144
+ provider: "Google",
145
+ inputPrice: 0.1,
146
+ outputPrice: 0.4,
147
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
148
+ benchmark: {
149
+ aime_2025: 29.7,
150
+ gpqa_diamond: 65.2,
151
+ simpleqa: 29.9,
152
+ global_mmlu_lite: 83.4,
153
+ livecodebench_v6: 29.1,
154
+ mmmu: 69.3,
155
+ facts_grounding: 84.6,
156
+ humanitys_last_exam: 5.1,
157
+ mrcr_v2_avg_128k: 19.0,
158
+ mrcr_v2_pointwise_1m: 5.3,
159
+ // loft_128k: 58.0,
160
+ // loft_1m: 7.6,
161
+ },
162
+ },
163
+ {
164
+ model: "Gemini 1.5 Pro",
165
+ provider: "Google",
166
+ inputPrice: 0.015,
167
+ outputPrice: 0.075,
168
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
169
+ benchmark: {
170
+ livecodebench_v6: 29.7,
171
+ aider_polyglot: 16.9,
172
+ swe_bench_verified: 34.2,
173
+ gpqa_diamond: 58.1,
174
+ aime_2025: 17.5,
175
+ humanitys_last_exam: 4.6,
176
+ simpleqa: 24.9,
177
+ facts_grounding: 80.0,
178
+ global_mmlu_lite: 80.8,
179
+ mrcr_v2_avg_128k: 26.2,
180
+ mrcr_v2_pointwise_1m: 12.1,
181
+ mmmu: 67.7,
182
+ // loft_128k: 75.9,
183
+ // loft_1m: 47.1,
184
  },
185
+ },
186
+ {
187
+ model: "Gemini 1.5 Flash",
188
+ provider: "Google",
189
+ inputPrice: 0.0025,
190
+ outputPrice: 0.0075,
191
+ source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
192
+ benchmark: {
193
+ livecodebench_v6: 30.3,
194
+ aider_polyglot: 2.8,
195
+ swe_bench_verified: 19.7,
196
+ gpqa_diamond: 50.0,
197
+ aime_2025: 14.7,
198
+ simpleqa: 8.6,
199
+ facts_grounding: 82.9,
200
+ global_mmlu_lite: 72.5,
201
+ mrcr_v2_avg_128k: 18.4,
202
+ mrcr_v2_pointwise_1m: 10.2,
203
+ mmmu: 58.3,
204
+ // loft_128k: 67.3,
205
+ // loft_1m: 36.7,
206
  },
207
+ },
208
  ];