// Presidentlin's picture
// x
// 80fc9d2
import { Benchmark } from "./types";
/** Link cited as `source` by every entry in {@link qwenBenchmarks}. */
const QWEN3_TECHNICAL_REPORT_URL = "https://arxiv.org/pdf/2505.09388";

/**
 * Fields that carry the same value in every Qwen3 entry. They are spread
 * directly after `model` so each resulting object keeps the key order
 * model, provider, inputPrice, outputPrice, benchmark, source.
 */
const QWEN3_SHARED_FIELDS = {
  provider: "Qwen",
  inputPrice: 0,
  outputPrice: 0,
} as const;

/**
 * Benchmark scores for the Qwen3 model family: eight base models followed by
 * the thinking / non-thinking variants of each model size.
 *
 * Every entry uses provider "Qwen", records both prices as 0 and cites the
 * same source URL.
 *
 * Scores that are commented out are kept for reference only.
 * NOTE(review): presumably their keys are not part of the `Benchmark` type
 * yet — confirm against ./types before enabling any of them.
 */
export const qwenBenchmarks: Benchmark[] = [
  // ---------------------------------------------------------------------
  // Base models
  // ---------------------------------------------------------------------
  {
    model: "Qwen3-235B-A22B (Base Model)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      mmlu: 87.81,
      // "mmlu-redux": 87.40,
      // "mmlu-pro": 68.18,
      // supergpqa: 44.06,
      // bbh: 88.87,
      gpqa: 47.47,
      // gsm8k: 94.39,
      // math: 71.84,
      // evalplus: 77.60,
      // "multipl-e": 65.94,
      mbpp: 81.40,
      // "crux-o": 79.00,
      // mgsm: 83.53,
      mmmlu: 86.70,
      // include: 73.46,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-32B (Base Model)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      mmlu: 83.61,
      // "mmlu-redux": 83.41,
      // "mmlu-pro": 65.54,
      // supergpqa: 39.78,
      // bbh: 87.38,
      gpqa: 49.49,
      // gsm8k: 93.40,
      // math: 61.62,
      // evalplus: 72.05,
      // "multipl-e": 67.06,
      mbpp: 78.20,
      // "crux-o": 72.50,
      // mgsm: 83.06,
      mmmlu: 83.83,
      // include: 67.87,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-14B (Base Model)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      mmlu: 81.05,
      // "mmlu-redux": 79.88,
      // "mmlu-pro": 61.03,
      // supergpqa: 34.27,
      // bbh: 81.07,
      gpqa: 39.90,
      // gsm8k: 92.49,
      // math: 62.02,
      // evalplus: 72.23,
      // "multipl-e": 61.69,
      mbpp: 73.40,
      // "crux-o": 68.60,
      // mgsm: 79.20,
      // NOTE(review): identical to the Qwen3-30B-A3B base value below —
      // verify against the report that this is not a copy/paste slip.
      mmmlu: 81.46,
      // include: 64.55,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-30B-A3B (Base Model)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      mmlu: 81.38,
      // "mmlu-redux": 81.17,
      // "mmlu-pro": 61.49,
      // supergpqa: 35.72,
      // bbh: 81.54,
      gpqa: 43.94,
      // gsm8k: 91.81,
      // math: 59.04,
      // evalplus: 71.45,
      // "multipl-e": 66.53,
      mbpp: 74.40,
      // "crux-o": 67.20,
      // mgsm: 79.11,
      mmmlu: 81.46,
      // include: 67.00,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-8B (Base Model)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      mmlu: 76.89,
      // "mmlu-redux": 76.17,
      // "mmlu-pro": 56.73,
      // supergpqa: 31.64,
      // bbh: 78.40,
      gpqa: 44.44,
      // gsm8k: 89.84,
      // math: 60.80,
      // evalplus: 67.65,
      // "multipl-e": 58.75,
      mbpp: 69.80,
      // "crux-o": 62.00,
      // mgsm: 76.02,
      mmmlu: 75.72,
      // include: 59.40,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-4B (Base Model)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      mmlu: 72.99,
      // "mmlu-redux": 72.79,
      // "mmlu-pro": 50.58,
      // supergpqa: 28.43,
      // bbh: 72.59,
      gpqa: 36.87,
      // gsm8k: 87.79,
      // math: 54.10,
      // evalplus: 63.53,
      // "multipl-e": 53.13,
      mbpp: 67.00,
      // "crux-o": 55.00,
      // mgsm: 67.74,
      mmmlu: 71.42,
      // include: 56.29,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-1.7B (Base Model)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      mmlu: 62.63,
      // "mmlu-redux": 61.66,
      // "mmlu-pro": 36.76,
      // supergpqa: 20.92,
      // bbh: 54.47,
      gpqa: 28.28,
      // gsm8k: 75.44,
      // math: 43.50,
      // evalplus: 52.70,
      // "multipl-e": 42.71,
      mbpp: 55.40,
      // "crux-o": 36.40,
      // mgsm: 50.71,
      mmmlu: 63.27,
      // include: 45.57,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-0.6B (Base Model)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      mmlu: 52.81,
      // "mmlu-redux": 51.26,
      // "mmlu-pro": 24.74,
      // supergpqa: 15.03,
      // bbh: 41.47,
      gpqa: 26.77,
      // gsm8k: 59.59,
      // math: 32.44,
      // evalplus: 36.23,
      // "multipl-e": 24.58,
      mbpp: 36.60,
      // "crux-o": 27.00,
      // mgsm: 30.99,
      mmmlu: 50.16,
      // include: 34.26,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },

  // ---------------------------------------------------------------------
  // Thinking / non-thinking variants
  // ---------------------------------------------------------------------
  {
    model: "Qwen3-235B-A22B (Thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 92.7,
      gpqa_diamond: 71.1,
      // "c-eval": 89.6,
      // "livebench-2024-11-25": 77.1,
      // "ifeval-strict-prompt": 83.4,
      // "arena-hard": 95.6,
      // "alignbench-v1.1": 8.94,
      // "creative-writing-v3": 84.6,
      // writingbench: 8.03,
      // "math-500": 98.0,
      aime_24: 85.7,
      aime_2025: 81.5,
      // zebralogic: 80.3,
      // autologi: 89.0,
      // "bfcl-v3": 70.8,
      // "livecodebench-v5": 70.7,
      // "codeforces-rating": 2056,
      // "codeforces-percentile": 98.2,
      // "multi-if": 71.9,
      // include: 78.7,
      // "mmmlu-14-languages": 84.3,
      // "mt-aime2024": 80.8,
      // polymath: 54.7,
      // mlogiqa: 77.1,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-235B-A22B (Non-thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 89.2,
      gpqa_diamond: 62.9,
      // "c-eval": 86.1,
      // "livebench-2024-11-25": 62.5,
      // "ifeval-strict-prompt": 83.2,
      // "arena-hard": 96.1,
      // "alignbench-v1.1": 8.91,
      // "creative-writing-v3": 80.4,
      // writingbench: 7.70,
      // "math-500": 91.2,
      aime_24: 40.1,
      aime_2025: 24.7,
      // zebralogic: 37.7,
      // autologi: 83.3,
      // "bfcl-v3": 68.0,
      // "livecodebench-v5": 35.3,
      // "codeforces-rating": 1387,
      // "codeforces-percentile": 75.7,
      // "multi-if": 70.2,
      // include: 75.6,
      // "mmmlu-14-languages": 79.8,
      // "mt-aime2024": 32.4,
      // polymath: 27.0,
      // mlogiqa: 67.6,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-32B (Thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 90.9,
      gpqa_diamond: 68.4,
      // "c-eval": 87.3,
      // "livebench-2024-11-25": 74.9,
      // "ifeval-strict-prompt": 85.0,
      // "arena-hard": 93.8,
      // "alignbench-v1.1": 8.72,
      // "creative-writing-v3": 81.0,
      // writingbench: 7.90,
      // "math-500": 97.2,
      aime_24: 81.4,
      aime_2025: 72.9,
      // zebralogic: 88.8,
      // autologi: 87.3,
      // "bfcl-v3": 70.3,
      // "livecodebench-v5": 65.7,
      // "codeforces-rating": 1977,
      // "codeforces-percentile": 97.7,
      // "multi-if": 73.0,
      // include: 73.7,
      // "mmmlu-14-languages": 80.6,
      // "mt-aime2024": 75.0,
      // polymath: 47.4,
      // mlogiqa: 76.3,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-32B (Non-thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 85.7,
      gpqa_diamond: 54.6,
      // "c-eval": 83.3,
      // "livebench-2024-11-25": 59.8,
      // "ifeval-strict-prompt": 83.2,
      // "arena-hard": 92.8,
      // "alignbench-v1.1": 8.58,
      // "creative-writing-v3": 78.3,
      // writingbench: 7.54,
      // "math-500": 88.6,
      aime_24: 31.0,
      aime_2025: 20.2,
      // zebralogic: 29.2,
      // autologi: 78.5,
      // "bfcl-v3": 63.0,
      // "livecodebench-v5": 31.3,
      // "codeforces-rating": 1353,
      // "codeforces-percentile": 71.0,
      // "multi-if": 70.7,
      // include: 70.9,
      // "mmmlu-14-languages": 76.5,
      // "mt-aime2024": 24.1,
      // polymath: 22.5,
      // mlogiqa: 62.9,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-14B (Thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 88.6,
      gpqa_diamond: 64.0,
      // "c-eval": 86.2,
      // "livebench-2024-11-25": 71.3,
      // "ifeval-strict-prompt": 85.4,
      // "arena-hard": 91.7,
      // "alignbench-v1.1": 8.56,
      // "creative-writing-v3": 80.3,
      // writingbench: 7.80,
      // "math-500": 96.8,
      aime_24: 79.3,
      aime_2025: 70.4,
      // zebralogic: 88.5,
      // autologi: 89.2,
      // "bfcl-v3": 70.4,
      // "livecodebench-v5": 63.5,
      // "codeforces-rating": 1766,
      // "codeforces-percentile": 95.3,
      // "multi-if": 74.8,
      // include: 71.7,
      // "mmmlu-14-languages": 77.9,
      // "mt-aime2024": 73.3,
      // polymath: 45.8,
      // mlogiqa: 71.1,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-30B-A3B (Thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 89.5,
      gpqa_diamond: 65.8,
      // "c-eval": 86.6,
      // "livebench-2024-11-25": 74.3,
      // "ifeval-strict-prompt": 86.5,
      // "arena-hard": 91.0,
      // "alignbench-v1.1": 8.70,
      // "creative-writing-v3": 79.1,
      // writingbench: 7.70,
      // "math-500": 98.0,
      aime_24: 80.4,
      aime_2025: 70.9,
      // zebralogic: 89.5,
      // autologi: 88.7,
      // "bfcl-v3": 69.1,
      // "livecodebench-v5": 62.6,
      // "codeforces-rating": 1974,
      // "codeforces-percentile": 97.7,
      // "multi-if": 72.2,
      // include: 71.9,
      // "mmmlu-14-languages": 78.4,
      // "mt-aime2024": 73.9,
      // polymath: 46.1,
      // mlogiqa: 70.1,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-14B (Non-thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 82.0,
      gpqa_diamond: 54.8,
      // "c-eval": 81.0,
      // "livebench-2024-11-25": 59.6,
      // "ifeval-strict-prompt": 84.8,
      // "arena-hard": 86.3,
      // "alignbench-v1.1": 8.52,
      // "creative-writing-v3": 73.1,
      // writingbench: 7.24,
      // "math-500": 90.0,
      aime_24: 31.7,
      aime_2025: 23.3,
      // zebralogic: 33.0,
      // autologi: 82.0,
      // "bfcl-v3": 61.5,
      // "livecodebench-v5": 29.0,
      // "codeforces-rating": 1200,
      // "codeforces-percentile": 58.6,
      // "multi-if": 72.9,
      // include: 67.8,
      // "mmmlu-14-languages": 72.6,
      // "mt-aime2024": 23.2,
      // polymath: 22.0,
      // mlogiqa: 58.9,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-30B-A3B (Non-thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 84.1,
      gpqa_diamond: 54.8,
      // "c-eval": 82.9,
      // "livebench-2024-11-25": 59.4,
      // "ifeval-strict-prompt": 83.7,
      // "arena-hard": 88.0,
      // "alignbench-v1.1": 8.55,
      // "creative-writing-v3": 68.1,
      // writingbench: 7.22,
      // "math-500": 89.8,
      aime_24: 32.8,
      aime_2025: 21.6,
      // zebralogic: 33.2,
      // autologi: 81.5,
      // "bfcl-v3": 58.6,
      // "livecodebench-v5": 29.8,
      // "codeforces-rating": 1267,
      // "codeforces-percentile": 64.1,
      // "multi-if": 70.8,
      // include: 67.8,
      // "mmmlu-14-languages": 73.8,
      // "mt-aime2024": 24.6,
      // polymath: 23.3,
      // mlogiqa: 53.3,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-4B (Thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 83.7,
      gpqa_diamond: 55.9,
      // "c-eval": 77.5,
      // "livebench-2024-11-25": 63.6,
      // "ifeval-strict-prompt": 81.9,
      // "arena-hard": 76.6,
      // "alignbench-v1.1": 8.30,
      // "creative-writing-v3": 61.1,
      // writingbench: 7.35,
      // "math-500": 97.0,
      aime_24: 73.8,
      aime_2025: 65.6,
      // zebralogic: 81.0,
      // autologi: 87.9,
      // "bfcl-v3": 65.9,
      // "livecodebench-v5": 54.2,
      // "codeforces-rating": 1671,
      // "codeforces-percentile": 92.8,
      // "multi-if": 66.3,
      // include: 61.8,
      // "mmmlu-14-languages": 69.8,
      // "mt-aime2024": 60.7,
      // polymath: 40.0,
      // mlogiqa: 65.9,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-8B (Thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 87.5,
      gpqa_diamond: 62.0,
      // "c-eval": 83.4,
      // "livebench-2024-11-25": 67.1,
      // "ifeval-strict-prompt": 85.0,
      // "arena-hard": 85.8,
      // "alignbench-v1.1": 8.46,
      // "creative-writing-v3": 75.0,
      // writingbench: 7.59,
      // "math-500": 97.4,
      aime_24: 76.0,
      aime_2025: 67.3,
      // zebralogic: 84.8,
      // autologi: 89.1,
      // "bfcl-v3": 68.1,
      // "livecodebench-v5": 57.5,
      // "codeforces-rating": 1785,
      // "codeforces-percentile": 95.6,
      // "multi-if": 71.2,
      // include: 67.8,
      // "mmmlu-14-languages": 74.4,
      // "mt-aime2024": 65.4,
      // polymath: 42.7,
      // mlogiqa: 69.0,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-4B (Non-thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 77.3,
      gpqa_diamond: 41.7,
      // "c-eval": 72.2,
      // "livebench-2024-11-25": 48.4,
      // "ifeval-strict-prompt": 81.2,
      // "arena-hard": 66.2,
      // "alignbench-v1.1": 8.10,
      // "creative-writing-v3": 53.6,
      // writingbench: 6.85,
      // "math-500": 84.8,
      aime_24: 25.0,
      aime_2025: 19.1,
      // zebralogic: 35.2,
      // autologi: 76.3,
      // "bfcl-v3": 57.6,
      // "livecodebench-v5": 21.3,
      // "codeforces-rating": 842,
      // "codeforces-percentile": 33.7,
      // "multi-if": 61.3,
      // include: 53.8,
      // "mmmlu-14-languages": 61.7,
      // "mt-aime2024": 13.9,
      // polymath: 16.6,
      // mlogiqa: 49.9,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-8B (Non-thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 79.5,
      gpqa_diamond: 39.3,
      // "c-eval": 77.9,
      // "livebench-2024-11-25": 53.5,
      // "ifeval-strict-prompt": 83.0,
      // "arena-hard": 79.6,
      // "alignbench-v1.1": 8.38,
      // "creative-writing-v3": 64.5,
      // writingbench: 7.15,
      // "math-500": 87.4,
      aime_24: 29.1,
      aime_2025: 20.9,
      // zebralogic: 26.7,
      // autologi: 76.5,
      // "bfcl-v3": 60.2,
      // "livecodebench-v5": 22.8,
      // "codeforces-rating": 1110,
      // "codeforces-percentile": 52.4,
      // "multi-if": 69.2,
      // include: 62.5,
      // "mmmlu-14-languages": 66.9,
      // "mt-aime2024": 16.6,
      // polymath: 18.8,
      // mlogiqa: 51.4,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-0.6B (Thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 55.6,
      gpqa_diamond: 27.9,
      // "c-eval": 50.4,
      // "livebench-2024-11-25": 30.3,
      // "ifeval-strict-prompt": 59.2,
      // "arena-hard": 8.5,
      // "alignbench-v1.1": 6.10,
      // "creative-writing-v3": 30.6,
      // writingbench: 5.61,
      // "math-500": 77.6,
      aime_24: 10.7,
      aime_2025: 15.1,
      // zebralogic: 30.3,
      // autologi: 61.6,
      // "bfcl-v3": 46.4,
      // "livecodebench-v5": 12.3,
      // "multi-if": 36.1,
      // include: 35.9,
      // "mmmlu-14-languages": 43.1,
      // "mt-aime2024": 7.8,
      // polymath: 11.4,
      // mlogiqa: 40.9,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-1.7B (Thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 73.9,
      gpqa_diamond: 40.1,
      // "c-eval": 68.1,
      // "livebench-2024-11-25": 51.1,
      // "ifeval-strict-prompt": 72.5,
      // "arena-hard": 43.1,
      // "alignbench-v1.1": 7.60,
      // "creative-writing-v3": 48.0,
      // writingbench: 7.02,
      // "math-500": 93.4,
      aime_24: 48.3,
      aime_2025: 36.8,
      // zebralogic: 63.2,
      // autologi: 83.2,
      // "bfcl-v3": 56.6,
      // "livecodebench-v5": 33.2,
      // "multi-if": 51.2,
      // include: 51.8,
      // "mmmlu-14-languages": 59.1,
      // "mt-aime2024": 36.1,
      // polymath: 25.2,
      // mlogiqa: 56.0,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-0.6B (Non-thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 44.6,
      gpqa_diamond: 22.9,
      // "c-eval": 42.6,
      // "livebench-2024-11-25": 21.8,
      // "ifeval-strict-prompt": 54.5,
      // "arena-hard": 6.5,
      // "alignbench-v1.1": 5.60,
      // "creative-writing-v3": 28.4,
      // writingbench: 5.13,
      // "math-500": 55.2,
      aime_24: 3.4,
      aime_2025: 2.6,
      // zebralogic: 4.2,
      // autologi: 37.4,
      // "bfcl-v3": 44.1,
      // "livecodebench-v5": 3.6,
      // "multi-if": 33.3,
      // include: 34.4,
      // "mmmlu-14-languages": 37.1,
      // "mt-aime2024": 1.5,
      // polymath: 4.6,
      // mlogiqa: 37.3,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
  {
    model: "Qwen3-1.7B (Non-thinking Mode)",
    ...QWEN3_SHARED_FIELDS,
    benchmark: {
      // "mmlu-redux": 64.4,
      gpqa_diamond: 28.6,
      // "c-eval": 61.0,
      // "livebench-2024-11-25": 35.6,
      // "ifeval-strict-prompt": 68.2,
      // "arena-hard": 36.9,
      // "alignbench-v1.1": 7.20,
      // "creative-writing-v3": 43.6,
      // writingbench: 6.54,
      // "math-500": 73.0,
      aime_24: 13.4,
      aime_2025: 9.8,
      // zebralogic: 12.8,
      // autologi: 59.8,
      // "bfcl-v3": 52.2,
      // "livecodebench-v5": 11.6,
      // "multi-if": 44.7,
      // include: 42.6,
      // "mmmlu-14-languages": 48.3,
      // "mt-aime2024": 4.9,
      // polymath: 10.3,
      // mlogiqa: 41.1,
    },
    source: QWEN3_TECHNICAL_REPORT_URL,
  },
];