Presidentlin committed
Commit 58de9fb · Parent: 5403286
src/lib/benchmarks/index.ts CHANGED
@@ -2,12 +2,11 @@ import { Benchmark } from "./types";
 import { xaiBenchmarks } from "./xai";
 import { googleBenchmarks } from "./google";
 import { anthropicBenchmarks } from "./anthropic";
-// import other sources here as you add them
-// import { openaiBenchmarks } from "./openai";
+import { openaiBenchmarks } from "./openai";
 
 export const benchmarkData: Benchmark[] = [
   ...xaiBenchmarks,
   ...googleBenchmarks,
-  ...anthropicBenchmarks
-  // ...openaiBenchmarks,
+  ...anthropicBenchmarks,
+  ...openaiBenchmarks,
 ];
src/lib/benchmarks/openai.ts ADDED
@@ -0,0 +1,181 @@
+import { Benchmark } from "./types";
+
+export const openaiBenchmarks: Benchmark[] = [
+  {
+    model: "GPT-4o-2024-11-20",
+    provider: "OpenAI",
+    inputPrice: 2.5,
+    outputPrice: 10.0,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 85.7,
+      gpqa: 46.0,
+      humaneval: 90.2,
+      simpleqa: 38.8,
+      // math: 68.5,
+      // mgsm: 90.3,
+      // drop: 81.5,
+    },
+  },
+  {
+    model: "GPT-4o-2024-08-06",
+    provider: "OpenAI",
+    inputPrice: 2.5,
+    outputPrice: 10.0,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 88.7,
+      gpqa: 53.1,
+      humaneval: 90.2,
+      simpleqa: 40.1,
+      // math: 75.9,
+      // mgsm: 90.0,
+      // drop: 79.8,
+    },
+  },
+  {
+    model: "GPT-4o-2024-05-13",
+    provider: "OpenAI",
+    inputPrice: 5.0,
+    outputPrice: 15.0,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 87.2,
+      gpqa: 49.9,
+      humaneval: 91.0,
+      simpleqa: 39.0,
+      // math: 76.6,
+      // mgsm: 89.9,
+      // drop: 83.7,
+    },
+  },
+  {
+    model: "GPT-4o-mini-2024-07-18",
+    provider: "OpenAI",
+    inputPrice: 0.15,
+    outputPrice: 0.60,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 82.0,
+      gpqa: 40.2,
+      humaneval: 87.2,
+      mmmu: 59.4,
+      simpleqa: 9.5,
+      // mgsm: 87.0,
+      // drop: 79.7,
+      // math: 70.2,
+    },
+  },
+  {
+    model: "GPT-4.1-2025-04-14",
+    provider: "OpenAI",
+    inputPrice: 2.0,
+    outputPrice: 8.0,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 90.2,
+      gpqa: 66.3,
+      humaneval: 94.5,
+      simpleqa: 41.6,
+      // math: 82.1,
+      // mgsm: 86.9,
+      // drop: 79.4,
+    },
+  },
+  {
+    model: "GPT-4.1-mini-2025-04-14",
+    provider: "OpenAI",
+    inputPrice: 0.4,
+    outputPrice: 1.6,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 87.5,
+      gpqa: 65.0,
+      humaneval: 93.8,
+      simpleqa: 16.8,
+      // math: 81.4,
+      // mgsm: 88.2,
+      // drop: 81.0,
+    },
+  },
+  {
+    model: "GPT-4.1-nano-2025-04-14",
+    provider: "OpenAI",
+    inputPrice: 0.1,
+    outputPrice: 0.4,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 80.1,
+      gpqa: 50.3,
+      humaneval: 87.0,
+      simpleqa: 7.6,
+      // math: 62.3,
+      // mgsm: 73.0,
+      // drop: 82.2,
+    },
+  },
+  {
+    model: "GPT-4.5-preview-2025-02-27",
+    provider: "OpenAI",
+    inputPrice: 75.0,
+    outputPrice: 150.0,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 90.8,
+      gpqa: 69.5,
+      simpleqa: 62.5,
+      humaneval: 88.6,
+      // mgsm: 86.9,
+      // drop: 83.4,
+      // math: 87.1,
+
+    },
+  },
+  {
+    model: "GPT-4-turbo-2024-04-09",
+    provider: "OpenAI",
+    inputPrice: 10.0,
+    outputPrice: 30.0,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 86.7,
+      gpqa: 49.3,
+      humaneval: 88.2,
+      simpleqa: 24.2,
+      // math: 73.4,
+      // mgsm: 89.6,
+      // drop: 86.0,
+
+    },
+  },
+  {
+    model: "GPT-4-0125-preview",
+    provider: "OpenAI",
+    inputPrice: 10.0,
+    outputPrice: 30.0,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 85.4,
+      gpqa: 41.4,
+      humaneval: 86.6,
+      // math: 64.5,
+      // mgsm: 85.1,
+      // drop: 81.5,
+    },
+  },
+  {
+    model: "GPT-4-1106-preview",
+    provider: "OpenAI",
+    inputPrice: 10.0,
+    outputPrice: 30.0,
+    source: "https://github.com/openai/simple-evals",
+    benchmark: {
+      mmlu: 84.7,
+      gpqa: 42.5,
+      humaneval: 83.7,
+      // math: 64.3,
+      // mgsm: 87.1,
+      // drop: 83.2,
+    },
+  },
+];
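
For reference, types.ts is only partially shown in this commit, but the fields used above imply a Benchmark shape roughly like the sketch below. This is a reconstruction from usage, not the actual definition; the price units in the comments are assumptions.

import { BenchmarkMetric } from "./types";

// Shape implied by the openai.ts entries above (reconstructed; the
// real definition lives in types.ts and may differ).
export interface Benchmark {
  model: string;
  provider: string;
  inputPrice: number;  // assumed: USD per million input tokens
  outputPrice: number; // assumed: USD per million output tokens
  source: string;      // where the scores were published
  benchmark: Partial<Record<BenchmarkMetric, number>>; // metric -> score
}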
src/lib/benchmarks/types.ts CHANGED
@@ -2,6 +2,7 @@ export type BenchmarkMetric =
   // Most common and high-priority
   | "simpleqa"
   | "mmlu_pro"
+  | "mmlu"
   | "gpqa"
   | "egoschema"
   | "loft"