Presidentlin committed
Commit d7502bf · 1 Parent(s): f911da5
src/lib/benchmarks/anthropic.ts CHANGED
@@ -1,103 +1,102 @@
 import { Benchmark } from "./types";
 
 export const anthropicBenchmarks: Benchmark[] = [
-  {
-    model: "Claude Opus 4",
-    provider: "Anthropic",
-    inputPrice: 15.0,
-    outputPrice: 75.0,
-    source: "https://www.anthropic.com/news/claude-4",
-    benchmark: {
-      swe_bench_verified: 72.5,
-      //terminal_bench: 43.2,
-      gpqa_diamond: 79.6,
-      aime_2025: 75.5,
-      mmmlu: 88.8,
-      mmmu: 76.5,
-      // tau_bench_retail: 81.4,
-      //tau_bench_airline: 59.6,
-    },
+  {
+    model: "Claude Opus 4",
+    provider: "Anthropic",
+    inputPrice: 15.0,
+    outputPrice: 75.0,
+    source: "https://www.anthropic.com/news/claude-4",
+    benchmark: {
+      swe_bench_verified: 72.5,
+      //terminal_bench: 43.2,
+      gpqa_diamond: 79.6,
+      aime_2025: 75.5,
+      mmmlu: 88.8,
+      mmmu: 76.5,
+      tau_bench_retail: 81.4,
+      tau_bench_airline: 59.6,
+    },
   },
-  {
-    model: "Claude Sonnet 4",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/claude-4",
-    benchmark: {
-      swe_bench_verified: 72.7,
-      //terminal_bench: 35.5,
-      gpqa_diamond: 75.4,
-      aime_2025: 70.5,
-      mmmlu: 86.5,
-      mmmu: 74.4,
-      // tau_bench_retail: 80.5,
-      // tau_bench_airline: 60.0,
-    },
+  {
+    model: "Claude Sonnet 4",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-4",
+    benchmark: {
+      swe_bench_verified: 72.7,
+      //terminal_bench: 35.5,
+      gpqa_diamond: 75.4,
+      aime_2025: 70.5,
+      mmmlu: 86.5,
+      mmmu: 74.4,
+      tau_bench_retail: 80.5,
+      tau_bench_airline: 60.0,
+    },
   },
-  {
-    model: "Claude 3.7 Sonnet (Extended Thinking 64K)",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
-    benchmark: {
-      gpqa_diamond: 78.2,
-
-      // tau_bench_retail: 81.2,
-      // tau_bench_airline: 58.4,
-      mmmlu: 86.1,
-      mmmu: 75.0,
-      aime_24: 61.3,
-    },
+  {
+    model: "Claude 3.7 Sonnet (Extended Thinking 64K)",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
+    benchmark: {
+      gpqa_diamond: 78.2,
+      tau_bench_retail: 81.2,
+      tau_bench_airline: 58.4,
+      mmmlu: 86.1,
+      mmmu: 75.0,
+      aime_24: 61.3,
+    },
   },
-  {
-    model: "Claude 3.7 Sonnet (No Extended Thinking)",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
-    benchmark: {
-      gpqa_diamond: 68.0,
-      swe_bench_verified: 62.3,
-      mmmlu: 83.2,
-      mmmu: 71.8,
-      aime_24: 51.7, // using average of 23.3 & 80.0
-    },
+  {
+    model: "Claude 3.7 Sonnet (No Extended Thinking)",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
+    benchmark: {
+      gpqa_diamond: 68.0,
+      swe_bench_verified: 62.3,
+      mmmlu: 83.2,
+      mmmu: 71.8,
+      aime_24: 51.7, // using average of 23.3 & 80.0
+    },
   },
-  {
-    model: "Claude 3.5 Sonnet (New)",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
-    benchmark: {
-      gpqa_diamond: 65.0,
-      swe_bench_verified: 49.0,
-      // tau_bench_retail: 71.5,
-      // tau_bench_airline: 48.8,
-      mmmlu: 82.1,
-      mmmu: 70.4,
-      aime_24: 16.0, // average of 16.0 & 65.4
-    },
+  {
+    model: "Claude 3.5 Sonnet (New)",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
+    benchmark: {
+      gpqa_diamond: 65.0,
+      swe_bench_verified: 49.0,
+      tau_bench_retail: 71.5,
+      tau_bench_airline: 48.8,
+      mmmlu: 82.1,
+      mmmu: 70.4,
+      aime_24: 16.0, // average of 16.0 & 65.4
+    },
   },
-  {
-    model: "Claude 3.5 Haiku",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/3-5-models-and-computer-use",
-    benchmark: {
-      gpqa_diamond: 41.6,
-      swe_bench_verified: 49.0,
-      // tau_bench_retail: 51.0,
-      // tau_bench_airline: 22.8,
-      humaneval:88.1,
-      mmmlu: 65.0,
-      aime_24: 5.3,
-    },
+  {
+    model: "Claude 3.5 Haiku",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/3-5-models-and-computer-use",
+    benchmark: {
+      gpqa_diamond: 41.6,
+      swe_bench_verified: 49.0,
+      tau_bench_retail: 51.0,
+      tau_bench_airline: 22.8,
+      humaneval: 88.1,
+      mmmlu: 65.0,
+      aime_24: 5.3,
+    },
   },
   {
     model: "Claude 3 Opus",
     provider: "Anthropic",
     inputPrice: 15.0,
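
Each literal above must satisfy the Benchmark interface imported from "./types". That interface is not shown in this diff, so the following is only a minimal sketch of the shape the literals imply; the field semantics in the comments (e.g. prices as USD per million tokens) are assumptions, not confirmed by the source.

// Sketch only: reconstructed from the literals above, not the actual
// definition in src/lib/benchmarks/types.ts.
import type { BenchmarkMetric } from "./types";

export interface Benchmark {
  model: string;
  provider: string;
  inputPrice: number;  // assumed: USD per million input tokens
  outputPrice: number; // assumed: USD per million output tokens
  source?: string;     // announcement the scores were taken from
  // Assumed Partial: each model reports only a subset of metrics, and only
  // keys present in the BenchmarkMetric union type-check -- which is why the
  // tau_bench_* entries had to stay commented out until types.ts added them.
  benchmark: Partial<Record<BenchmarkMetric, number>>;
}

Under that assumption, the commit's ordering makes sense: extend the union in types.ts first, then uncomment the tau_bench_* scores here without breaking type checking.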
src/lib/benchmarks/deepseek.ts CHANGED
@@ -22,8 +22,8 @@ export const deepseekBenchmarks: Benchmark[] = [
       // Not in BenchmarkMetric, but useful (commented for type safety):
       // codeforces_div1: 1930,
       // frames: 83.0,
-      // tau_bench_airline: 53.5,
-      // tau_bench_retail: 63.9,
+      tau_bench_airline: 53.5,
+      tau_bench_retail: 63.9,
       // bfcl_v3_multiturn: 37.0,
       // cnmo_2024: 86.9,
       // hmmt_2025: 79.4,
@@ -87,7 +87,7 @@ export const deepseekBenchmarks: Benchmark[] = [
   {
     model: "DeepSeek-R1",
     provider: "DeepSeek",
-    inputPrice: 0.55, // Placeholder, update if pricing becomes available
+    inputPrice: 0.55, // Placeholder, update if pricing becomes available
     outputPrice: 2.19,
     benchmark: {
       mmlu: 90.8,
src/lib/benchmarks/openai.ts CHANGED
@@ -229,8 +229,8 @@ export const openaiBenchmarks: Benchmark[] = [
       aider_polyglot: 81.3, // "(whole)"
       //scale_multichallenge: 56.51,
       //browsecomp: 8.35, // "o3 with python +browsing*"
-      //tau_bench: 52.0, // "(Airline)"
-      // tau_bench_retail: 73.9, // "(Retail)"
+      tau_bench_airline: 52.0, // "(Airline)"
+      tau_bench_retail: 73.9, // "(Retail)"
     },
   },
   {
@@ -267,8 +267,8 @@ export const openaiBenchmarks: Benchmark[] = [
       aider_polyglot: 68.9, // "(whole)"
       //scale_multichallenge: 42.99,
       //browsecomp: 1.5, // "o4-mini with python +browsing** tools"
-      //tau_bench: 49.2, // "(Airline)"
-      //tau_bench_retail: 71.8, // "(Retail)"
+      tau_bench_airline: 49.2, // "(Airline)"
+      tau_bench_retail: 71.8, // "(Retail)"
     },
   },
   {
@@ -291,8 +291,8 @@ export const openaiBenchmarks: Benchmark[] = [
       aider_polyglot: 64.4, // "(whole)"
       //scale_multichallenge: 44.93,
       //browsecomp: 1.94, // "4o + browsing" - this seems to be a typo in the source, likely refers to o1's browsing capability
-      //tau_bench: 50.0, // "(Airline)"
-      //tau_bench_retail: 70.8, // "(Retail)"
+      tau_bench_airline: 50.0, // "(Airline)"
+      tau_bench_retail: 70.8, // "(Retail)"
     },
   },
   {
@@ -313,8 +313,8 @@ export const openaiBenchmarks: Benchmark[] = [
       aider_polyglot: 61.7, // "(diff)"
       //scale_multichallenge: 39.89,
       // BrowseComp not explicitly listed for o3-mini
-      //tau_bench: 32.4, // "(Airline)"
-      //tau_bench_retail: 57.6, // "(Retail)"
+      tau_bench_airline: 32.4, // "(Airline)"
+      tau_bench_retail: 57.6, // "(Retail)"
     },
   },
 ];
src/lib/benchmarks/types.ts CHANGED
@@ -11,6 +11,8 @@ export type BenchmarkMetric =
   | "aime_24"
   | "aime_2025"
   | "gpqa_diamond"
+  | "tau_bench_airline"
+  | "tau_bench_retail"
 
   // Code benchmarks (frequent)
   | "humaneval"
@@ -58,7 +60,9 @@ export const benchmarkMetricOrder: BenchmarkMetric[] = [
   "aime_24",
   "aime_2025",
   "gpqa_diamond",
-  "aider_polyglot"
+  "aider_polyglot",
+  "tau_bench_airline",
+  "tau_bench_retail"
 
   // // Code benchmarks (frequent)
   // "humaneval",