Presidentlin committed on
Commit
5a1ab27
·
1 Parent(s): 95775cb
src/lib/benchmarks/ index.ts CHANGED
@@ -1,11 +1,13 @@
1
  import { Benchmark } from "./types";
2
  import { xaiBenchmarks } from "./xai";
3
  import { googleBenchmarks } from "./google";
 
4
  // import other sources here as you add them
5
  // import { openaiBenchmarks } from "./openai";
6
 
7
  export const benchmarkData: Benchmark[] = [
8
  ...xaiBenchmarks,
9
  ...googleBenchmarks,
 
10
  // ...openaiBenchmarks,
11
  ];
 
1
  import { Benchmark } from "./types";
2
  import { xaiBenchmarks } from "./xai";
3
  import { googleBenchmarks } from "./google";
4
+ import { anthropicBenchmarks } from "./anthropic";
5
  // import other sources here as you add them
6
  // import { openaiBenchmarks } from "./openai";
7
 
8
  export const benchmarkData: Benchmark[] = [
9
  ...xaiBenchmarks,
10
  ...googleBenchmarks,
11
+ ...anthropicBenchmarks
12
  // ...openaiBenchmarks,
13
  ];
src/lib/benchmarks/anthropic.ts ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Benchmark } from "./types";
2
+
3
+ export const anthropicBenchmarks: Benchmark[] = [ // Benchmark entries for Anthropic models; every entry carries a `source` URL for its figures.
4
+ {
5
+ model: "Claude Opus 4",
6
+ provider: "Anthropic",
7
+ inputPrice: 15.0, // presumably USD per million input tokens — TODO confirm against the Benchmark type
8
+ outputPrice: 75.0, // presumably USD per million output tokens — TODO confirm
9
+ source: "https://www.anthropic.com/news/claude-4",
10
+ benchmark: {
11
+ swe_bench_verified: 72.5,
12
+ //terminal_bench: 43.2, (commented-out scores are not part of the exported data; presumably these keys are not on the Benchmark type yet — confirm)
13
+ gpqa_diamond: 79.6,
14
+ aime_2025: 75.5,
15
+ //mmmlu: 88.8,
16
+ mmmu: 76.5,
17
+ // tau_bench_retail: 81.4,
18
+ //tau_bench_airline: 59.6,
19
+ },
20
+ },
21
+ {
22
+ model: "Claude Sonnet 4",
23
+ provider: "Anthropic",
24
+ inputPrice: 3.0,
25
+ outputPrice: 15.0,
26
+ source: "https://www.anthropic.com/news/claude-4",
27
+ benchmark: {
28
+ swe_bench_verified: 72.7,
29
+ //terminal_bench: 35.5,
30
+ gpqa_diamond: 75.4,
31
+ aime_2025: 70.5,
32
+ // mmmlu: 86.5,
33
+ mmmu: 74.4,
34
+ // tau_bench_retail: 80.5,
35
+ // tau_bench_airline: 60.0,
36
+ },
37
+ },
38
+ {
39
+ model: "Claude 3.7 Sonnet (Extended Thinking 64K)",
40
+ provider: "Anthropic",
41
+ inputPrice: 3.0,
42
+ outputPrice: 15.0,
43
+ source: "https://www.anthropic.com/news/claude-3-7-sonnet",
44
+ benchmark: {
45
+ gpqa_diamond: 78.2,
46
+
47
+ // tau_bench_retail: 81.2,
48
+ // tau_bench_airline: 58.4,
49
+ // mmmlu: 86.1,
50
+ mmmu: 75.0,
51
+ aime_24: 61.3,
52
+ },
53
+ },
54
+ {
55
+ model: "Claude 3.7 Sonnet (No Extended Thinking)",
56
+ provider: "Anthropic",
57
+ inputPrice: 3.0,
58
+ outputPrice: 15.0,
59
+ source: "https://www.anthropic.com/news/claude-3-7-sonnet",
60
+ benchmark: {
61
+ gpqa_diamond: 68.0,
62
+ swe_bench_verified: 62.3,
63
+ // mmmlu: 83.2,
64
+ mmmu: 71.8,
65
+ aime_24: 51.7, // average of 23.3 & 80.0 (51.65, rounded up)
66
+ },
67
+ },
68
+ {
69
+ model: "Claude 3.5 Sonnet (New)",
70
+ provider: "Anthropic",
71
+ inputPrice: 3.0,
72
+ outputPrice: 15.0,
73
+ source: "https://www.anthropic.com/news/claude-3-7-sonnet",
74
+ benchmark: {
75
+ gpqa_diamond: 65.0,
76
+ swe_bench_verified: 49.0,
77
+ // tau_bench_retail: 71.5,
78
+ // tau_bench_airline: 48.8,
79
+ // mmmlu: 82.1,
80
+ mmmu: 70.4,
81
+ aime_24: 16.0, // NOTE(review): this comment used to read "average of 16.0 & 65.4", but that average is 40.7 while the stored value is 16.0 — confirm which figure is intended
82
+ },
83
+ },
84
+ ];
src/lib/benchmarks/google.ts CHANGED
@@ -43,6 +43,52 @@ export const googleBenchmarks: Benchmark[] = [
43
  mmmu: 82.0,
44
  },
45
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  {
47
  model: "Gemini 2.5 Flash (Thinking-enabled, default)",
48
  provider: "Google",
 
43
  mmmu: 82.0,
44
  },
45
  },
46
+
47
+ {
48
+ model: "Gemini 2.5 Pro Experimental (03-25)",
49
+ provider: "Google",
50
+ inputPrice: 1.25,
51
+ outputPrice: 10.0,
52
+ source: "https://blog.google/products/gemini/gemini-2-5-pro-updates/",
53
+ benchmark: {
54
+ livecodebench_v6: 70.4,
55
+ aider_polyglot: 74.0,
56
+ swe_bench_verified: 63.8,
57
+ gpqa_diamond: 84.0,
58
+ aime_2025: 86.7,
59
+ humanitys_last_exam: 18.8,
60
+ simpleqa: 52.9,
61
+ global_mmlu_lite: 89.8,
62
+ mrcr_v2_avg_128k: 94.5,
63
+ mrcr_v2_pointwise_1m: 83.1,
64
+ mmmu: 81.7,
65
+ // vibe_eval: 69.4,
66
+ // video_mme: not reported
67
+ },
68
+ },
69
+ {
70
+ model: "Gemini 2.5 Pro Preview (05-06)",
71
+ provider: "Google",
72
+ inputPrice: 1.25,
73
+ outputPrice: 10.0,
74
+ source: "https://blog.google/products/gemini/gemini-2-5-pro-updates/",
75
+ benchmark: {
76
+ livecodebench_v6: 75.6,
77
+ aider_polyglot: 76.5,
78
+ swe_bench_verified: 63.2,
79
+ gpqa_diamond: 83.0,
80
+ aime_2025: 83.0,
81
+ humanitys_last_exam: 17.8,
82
+ simpleqa: 50.8,
83
+ global_mmlu_lite: 88.6,
84
+ mrcr_v2_avg_128k: 93.0,
85
+ mrcr_v2_pointwise_1m: 82.9,
86
+ mmmu: 79.6,
87
+ // vibe_eval: 65.6,
88
+ // video_mme: 84.8,
89
+ },
90
+ },
91
+
92
  {
93
  model: "Gemini 2.5 Flash (Thinking-enabled, default)",
94
  provider: "Google",