Commit d4f1866 · 1 parent: 5a1ab27

src/lib/benchmarks/anthropic.ts (CHANGED)
@@ -12,7 +12,7 @@ export const anthropicBenchmarks: Benchmark[] = [
       //terminal_bench: 43.2,
       gpqa_diamond: 79.6,
       aime_2025: 75.5,
-
+      mmmlu: 88.8,
       mmmu: 76.5,
       // tau_bench_retail: 81.4,
       //tau_bench_airline: 59.6,
@@ -29,7 +29,7 @@ export const anthropicBenchmarks: Benchmark[] = [
       //terminal_bench: 35.5,
       gpqa_diamond: 75.4,
       aime_2025: 70.5,
-
+      mmmlu: 86.5,
       mmmu: 74.4,
       // tau_bench_retail: 80.5,
       // tau_bench_airline: 60.0,
@@ -46,7 +46,7 @@ export const anthropicBenchmarks: Benchmark[] = [

       // tau_bench_retail: 81.2,
       // tau_bench_airline: 58.4,
-
+      mmmlu: 86.1,
       mmmu: 75.0,
       aime_24: 61.3,
     },
@@ -60,7 +60,7 @@ export const anthropicBenchmarks: Benchmark[] = [
     benchmark: {
       gpqa_diamond: 68.0,
       swe_bench_verified: 62.3,
-
+      mmmlu: 83.2,
       mmmu: 71.8,
       aime_24: 51.7, // using average of 23.3 & 80.0
     },
@@ -76,9 +76,81 @@ export const anthropicBenchmarks: Benchmark[] = [
       swe_bench_verified: 49.0,
       // tau_bench_retail: 71.5,
       // tau_bench_airline: 48.8,
-
+      mmmlu: 82.1,
       mmmu: 70.4,
       aime_24: 16.0, // average of 16.0 & 65.4
     },
   },
+  {
+    model: "Claude 3 Opus",
+    provider: "Anthropic",
+    inputPrice: 15.0,
+    outputPrice: 75.0,
+    source: "https://www.anthropic.com/news/claude-3-opus-release",
+    benchmark: {
+      gpqa_diamond: 50.4,
+      mmmlu: 86.8,
+      mmmu: 59.4,
+      // gsm8k: 95.0,
+      // math: 60.1,
+      // mgsm: 90.7,
+      // humaneval: 84.9,
+      // drop: 83.1,
+      // big_bench_hard: 86.8,
+      // arc_challenge: 96.4,
+      // hellaswag: 95.4,
+      // mathvista: 50.5,
+      // ai2d: 88.1,
+      // chart_qa: 80.8,
+      // docvqa_anls: 89.3,
+    },
+  },
+  {
+    model: "Claude 3 Sonnet",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-3-opus-release",
+    benchmark: {
+      gpqa_diamond: 40.4,
+      mmmlu: 79.0,
+      mmmu: 53.1,
+      // gsm8k: 92.3,
+      // math: 43.1,
+      // mgsm: 83.5,
+      // humaneval: 73.0,
+      // drop: 78.9,
+      // big_bench_hard: 82.9,
+      // arc_challenge: 93.2,
+      // hellaswag: 89.0,
+      // mathvista: 47.9,
+      // ai2d: 88.7,
+      // chart_qa: 81.1,
+      // docvqa_anls: 89.5,
+    },
+  },
+  {
+    model: "Claude 3 Haiku",
+    provider: "Anthropic",
+    inputPrice: 0.25,
+    outputPrice: 1.25,
+    source: "https://www.anthropic.com/news/claude-3-opus-release",
+    benchmark: {
+      gpqa_diamond: 33.3,
+      mmmlu: 75.2,
+      mmmu: 50.2,
+      // gsm8k: 88.9,
+      // math: 38.9,
+      // mgsm: 75.1,
+      // humaneval: 75.9,
+      // drop: 78.4,
+      // big_bench_hard: 73.7,
+      // arc_challenge: 89.2,
+      // hellaswag: 85.9,
+      // mathvista: 46.4,
+      // ai2d: 86.7,
+      // chart_qa: 81.7,
+      // docvqa_anls: 88.8,
+    },
+  },
 ];
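For orientation, the entry shape these hunks write against can be read off the fields in use (model, provider, inputPrice, outputPrice, source, benchmark). A minimal sketch of that shape, inferred from the diff; the actual Benchmark definition lives in src/lib/benchmarks/types.ts and may differ in detail:

// Sketch only: entry shape inferred from the fields used in anthropic.ts,
// not the actual Benchmark definition from src/lib/benchmarks/types.ts.
import type { BenchmarkMetric } from "./types";

export interface Benchmark {
  model: string;
  provider: string;
  inputPrice: number;  // assumed unit: USD per million input tokens
  outputPrice: number; // assumed unit: USD per million output tokens
  source: string;      // URL the reported scores come from
  // Each model reports only a subset of metrics, so a partial record
  // over BenchmarkMetric matches the data above.
  benchmark: Partial<Record<BenchmarkMetric, number>>;
}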
src/lib/benchmarks/types.ts (CHANGED)
@@ -23,6 +23,7 @@ export type BenchmarkMetric =
   // General reasoning & robustness
   | "bigbench_extra_hard"
   | "global_mmlu_lite"
+  | "mmmlu"

   // Optional: less frequent but still potentially useful
   | "facts_grounding"
@@ -68,6 +69,7 @@ export const benchmarkMetricOrder: BenchmarkMetric[] = [
   // // General reasoning & robustness
   // "bigbench_extra_hard",
   // "global_mmlu_lite",
+  //"mmmlu"

   // // Optional: less frequent but still potentially useful
   // "facts_grounding",
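Worth noting: "mmmlu" joins the BenchmarkMetric union, but its line in benchmarkMetricOrder stays commented out. If that array drives which metrics get rendered, the new mmmlu scores are stored but not yet surfaced. A hypothetical consumer sketch of that assumption (visibleMetrics is illustrative, not a function in this repo):

// Illustrative only: assumes benchmarkMetricOrder controls which metrics
// appear and in what order. While "mmmlu" is commented out there, the
// mmmlu scores added in anthropic.ts would not show up in this output.
import { benchmarkMetricOrder, type BenchmarkMetric } from "./types";

function visibleMetrics(
  scores: Partial<Record<BenchmarkMetric, number>>,
): [BenchmarkMetric, number][] {
  return benchmarkMetricOrder.flatMap((metric) => {
    const value = scores[metric];
    return value === undefined
      ? []
      : [[metric, value] as [BenchmarkMetric, number]];
  });
}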