Presidentlin committed
Commit d7502bf · 1 Parent(s): f911da5
src/lib/benchmarks/anthropic.ts CHANGED
@@ -1,103 +1,102 @@
 import { Benchmark } from "./types";
 
 export const anthropicBenchmarks: Benchmark[] = [
-  {
-    model: "Claude Opus 4",
-    provider: "Anthropic",
-    inputPrice: 15.0,
-    outputPrice: 75.0,
-    source: "https://www.anthropic.com/news/claude-4",
-    benchmark: {
-      swe_bench_verified: 72.5,
-      //terminal_bench: 43.2,
-      gpqa_diamond: 79.6,
-      aime_2025: 75.5,
-      mmmlu: 88.8,
-      mmmu: 76.5,
-      // tau_bench_retail: 81.4,
-      //tau_bench_airline: 59.6,
-    },
+  {
+    model: "Claude Opus 4",
+    provider: "Anthropic",
+    inputPrice: 15.0,
+    outputPrice: 75.0,
+    source: "https://www.anthropic.com/news/claude-4",
+    benchmark: {
+      swe_bench_verified: 72.5,
+      //terminal_bench: 43.2,
+      gpqa_diamond: 79.6,
+      aime_2025: 75.5,
+      mmmlu: 88.8,
+      mmmu: 76.5,
+      tau_bench_retail: 81.4,
+      tau_bench_airline: 59.6,
+    },
   },
-  {
-    model: "Claude Sonnet 4",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/claude-4",
-    benchmark: {
-      swe_bench_verified: 72.7,
-      //terminal_bench: 35.5,
-      gpqa_diamond: 75.4,
-      aime_2025: 70.5,
-      mmmlu: 86.5,
-      mmmu: 74.4,
-      // tau_bench_retail: 80.5,
-      // tau_bench_airline: 60.0,
-    },
+  {
+    model: "Claude Sonnet 4",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-4",
+    benchmark: {
+      swe_bench_verified: 72.7,
+      //terminal_bench: 35.5,
+      gpqa_diamond: 75.4,
+      aime_2025: 70.5,
+      mmmlu: 86.5,
+      mmmu: 74.4,
+      tau_bench_retail: 80.5,
+      tau_bench_airline: 60.0,
+    },
   },
-  {
-    model: "Claude 3.7 Sonnet (Extended Thinking 64K)",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
-    benchmark: {
-      gpqa_diamond: 78.2,
-
-      // tau_bench_retail: 81.2,
-      // tau_bench_airline: 58.4,
-      mmmlu: 86.1,
-      mmmu: 75.0,
-      aime_24: 61.3,
-    },
+  {
+    model: "Claude 3.7 Sonnet (Extended Thinking 64K)",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
+    benchmark: {
+      gpqa_diamond: 78.2,
+      tau_bench_retail: 81.2,
+      tau_bench_airline: 58.4,
+      mmmlu: 86.1,
+      mmmu: 75.0,
+      aime_24: 61.3,
+    },
   },
-  {
-    model: "Claude 3.7 Sonnet (No Extended Thinking)",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
-    benchmark: {
-      gpqa_diamond: 68.0,
-      swe_bench_verified: 62.3,
-      mmmlu: 83.2,
-      mmmu: 71.8,
-      aime_24: 51.7, // using average of 23.3 & 80.0
-    },
+  {
+    model: "Claude 3.7 Sonnet (No Extended Thinking)",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
+    benchmark: {
+      gpqa_diamond: 68.0,
+      swe_bench_verified: 62.3,
+      mmmlu: 83.2,
+      mmmu: 71.8,
+      aime_24: 51.7, // using average of 23.3 & 80.0
+    },
   },
-  {
-    model: "Claude 3.5 Sonnet (New)",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
-    benchmark: {
-      gpqa_diamond: 65.0,
-      swe_bench_verified: 49.0,
-      // tau_bench_retail: 71.5,
-      // tau_bench_airline: 48.8,
-      mmmlu: 82.1,
-      mmmu: 70.4,
-      aime_24: 16.0, // average of 16.0 & 65.4
-    },
+  {
+    model: "Claude 3.5 Sonnet (New)",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
+    benchmark: {
+      gpqa_diamond: 65.0,
+      swe_bench_verified: 49.0,
+      tau_bench_retail: 71.5,
+      tau_bench_airline: 48.8,
+      mmmlu: 82.1,
+      mmmu: 70.4,
+      aime_24: 16.0, // average of 16.0 & 65.4
+    },
   },
-  {
-    model: "Claude 3.5 Haiku",
-    provider: "Anthropic",
-    inputPrice: 3.0,
-    outputPrice: 15.0,
-    source: "https://www.anthropic.com/news/3-5-models-and-computer-use",
-    benchmark: {
-      gpqa_diamond: 41.6,
-      swe_bench_verified: 49.0,
-      // tau_bench_retail: 51.0,
-      // tau_bench_airline: 22.8,
-      humaneval:88.1,
-      mmmlu: 65.0,
-      aime_24: 5.3,
-    },
+  {
+    model: "Claude 3.5 Haiku",
+    provider: "Anthropic",
+    inputPrice: 3.0,
+    outputPrice: 15.0,
+    source: "https://www.anthropic.com/news/3-5-models-and-computer-use",
+    benchmark: {
+      gpqa_diamond: 41.6,
+      swe_bench_verified: 49.0,
+      tau_bench_retail: 51.0,
+      tau_bench_airline: 22.8,
+      humaneval: 88.1,
+      mmmlu: 65.0,
+      aime_24: 5.3,
+    },
   },
   {
     model: "Claude 3 Opus",
     provider: "Anthropic",
     inputPrice: 15.0,
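
Each literal above must satisfy the Benchmark interface imported from "./types". That interface is not shown in this diff, so the following is only a minimal sketch of the shape the literals imply; the field semantics in the comments (e.g. prices as USD per million tokens) are assumptions, not confirmed by the source.

// Sketch only: reconstructed from the literals above, not the actual
// definition in src/lib/benchmarks/types.ts.
import type { BenchmarkMetric } from "./types";

export interface Benchmark {
  model: string;
  provider: string;
  inputPrice: number;  // assumed: USD per million input tokens
  outputPrice: number; // assumed: USD per million output tokens
  source?: string;     // announcement the scores were taken from
  // Assumed Partial: each model reports only a subset of metrics, and only
  // keys present in the BenchmarkMetric union type-check -- which is why the
  // tau_bench_* entries had to stay commented out until types.ts added them.
  benchmark: Partial<Record<BenchmarkMetric, number>>;
}

Under that assumption, the commit's ordering makes sense: extend the union in types.ts first, then uncomment the tau_bench_* scores here without breaking type checking.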
src/lib/benchmarks/deepseek.ts CHANGED
@@ -22,8 +22,8 @@ export const deepseekBenchmarks: Benchmark[] = [
       // Not in BenchmarkMetric, but useful (commented for type safety):
       // codeforces_div1: 1930,
       // frames: 83.0,
-      // tau_bench_airline: 53.5,
-      // tau_bench_retail: 63.9,
+      tau_bench_airline: 53.5,
+      tau_bench_retail: 63.9,
       // bfcl_v3_multiturn: 37.0,
       // cnmo_2024: 86.9,
       // hmmt_2025: 79.4,
@@ -87,7 +87,7 @@ export const deepseekBenchmarks: Benchmark[] = [
   {
     model: "DeepSeek-R1",
     provider: "DeepSeek",
-    inputPrice: 0.55, // Placeholder, update if pricing becomes available
+    inputPrice: 0.55, // Placeholder, update if pricing becomes available
     outputPrice: 2.19,
     benchmark: {
       mmlu: 90.8,
src/lib/benchmarks/openai.ts CHANGED
@@ -229,8 +229,8 @@ export const openaiBenchmarks: Benchmark[] = [
       aider_polyglot: 81.3, // "(whole)"
       //scale_multichallenge: 56.51,
       //browsecomp: 8.35, // "o3 with python +browsing*"
-      //tau_bench: 52.0, // "(Airline)"
-      // tau_bench_retail: 73.9, // "(Retail)"
+      tau_bench_airline: 52.0, // "(Airline)"
+      tau_bench_retail: 73.9, // "(Retail)"
     },
   },
   {
@@ -267,8 +267,8 @@ export const openaiBenchmarks: Benchmark[] = [
       aider_polyglot: 68.9, // "(whole)"
       //scale_multichallenge: 42.99,
       //browsecomp: 1.5, // "o4-mini with python +browsing** tools"
-      //tau_bench: 49.2, // "(Airline)"
-      //tau_bench_retail: 71.8, // "(Retail)"
+      tau_bench_airline: 49.2, // "(Airline)"
+      tau_bench_retail: 71.8, // "(Retail)"
     },
   },
   {
@@ -291,8 +291,8 @@ export const openaiBenchmarks: Benchmark[] = [
       aider_polyglot: 64.4, // "(whole)"
       //scale_multichallenge: 44.93,
       //browsecomp: 1.94, // "4o + browsing" - this seems to be a typo in the source, likely refers to o1's browsing capability
-      //tau_bench: 50.0, // "(Airline)"
-      //tau_bench_retail: 70.8, // "(Retail)"
+      tau_bench_airline: 50.0, // "(Airline)"
+      tau_bench_retail: 70.8, // "(Retail)"
     },
   },
   {
@@ -313,8 +313,8 @@ export const openaiBenchmarks: Benchmark[] = [
       aider_polyglot: 61.7, // "(diff)"
       //scale_multichallenge: 39.89,
       // BrowseComp not explicitly listed for o3-mini
-      //tau_bench: 32.4, // "(Airline)"
-      //tau_bench_retail: 57.6, // "(Retail)"
+      tau_bench_airline: 32.4, // "(Airline)"
+      tau_bench_retail: 57.6, // "(Retail)"
     },
   },
 ];
src/lib/benchmarks/types.ts CHANGED
@@ -11,6 +11,8 @@ export type BenchmarkMetric =
   | "aime_24"
   | "aime_2025"
   | "gpqa_diamond"
+  | "tau_bench_airline"
+  | "tau_bench_retail"
 
   // Code benchmarks (frequent)
   | "humaneval"
@@ -58,7 +60,9 @@ export const benchmarkMetricOrder: BenchmarkMetric[] = [
   "aime_24",
   "aime_2025",
   "gpqa_diamond",
-  "aider_polyglot"
+  "aider_polyglot",
+  "tau_bench_airline",
+  "tau_bench_retail"
 
   // // Code benchmarks (frequent)
   // "humaneval",