import { Benchmark } from "./types";
/**
 * Benchmark scores for the Qwen3 model family.
 *
 * Each model size appears once per evaluation setting, encoded in the
 * `model` label: "(Base Model)", "(Thinking Mode)" or "(Non-thinking Mode)".
 * Every entry cites the same `source` URL and sets `inputPrice` /
 * `outputPrice` to 0.
 *
 * Only a subset of the scores is live. The commented-out lines keep the
 * remaining scores next to the entry they belong to; they are presumably
 * disabled because `Benchmark` has no matching field — verify against
 * `./types` before enabling any of them. Hyphenated names are written as
 * quoted keys so that a line can be un-commented without further edits.
 */
export const qwenBenchmarks: Benchmark[] = [
  {
    model: "Qwen3-235B-A22B (Base Model)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      mmlu: 87.81,
      // "mmlu-redux": 87.40,
      // "mmlu-pro": 68.18,
      // supergpqa: 44.06,
      // bbh: 88.87,
      gpqa: 47.47,
      // gsm8k: 94.39,
      // math: 71.84,
      // evalplus: 77.60,
      // "multipl-e": 65.94,
      mbpp: 81.40,
      // "crux-o": 79.00,
      // mgsm: 83.53,
      mmmlu: 86.70,
      // include: 73.46,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-32B (Base Model)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      mmlu: 83.61,
      // "mmlu-redux": 83.41,
      // "mmlu-pro": 65.54,
      // supergpqa: 39.78,
      // bbh: 87.38,
      gpqa: 49.49,
      // gsm8k: 93.40,
      // math: 61.62,
      // evalplus: 72.05,
      // "multipl-e": 67.06,
      mbpp: 78.20,
      // "crux-o": 72.50,
      // mgsm: 83.06,
      mmmlu: 83.83,
      // include: 67.87,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-14B (Base Model)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      mmlu: 81.05,
      // "mmlu-redux": 79.88,
      // "mmlu-pro": 61.03,
      // supergpqa: 34.27,
      // bbh: 81.07,
      gpqa: 39.90,
      // gsm8k: 92.49,
      // math: 62.02,
      // evalplus: 72.23,
      // "multipl-e": 61.69,
      mbpp: 73.40,
      // "crux-o": 68.60,
      // mgsm: 79.20,
      // NOTE(review): identical to the mmmlu value of Qwen3-30B-A3B (Base Model)
      // in the next entry — possible copy/paste slip; verify against the source.
      mmmlu: 81.46,
      // include: 64.55,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-30B-A3B (Base Model)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      mmlu: 81.38,
      // "mmlu-redux": 81.17,
      // "mmlu-pro": 61.49,
      // supergpqa: 35.72,
      // bbh: 81.54,
      gpqa: 43.94,
      // gsm8k: 91.81,
      // math: 59.04,
      // evalplus: 71.45,
      // "multipl-e": 66.53,
      mbpp: 74.40,
      // "crux-o": 67.20,
      // mgsm: 79.11,
      mmmlu: 81.46,
      // include: 67.00,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-8B (Base Model)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      mmlu: 76.89,
      // "mmlu-redux": 76.17,
      // "mmlu-pro": 56.73,
      // supergpqa: 31.64,
      // bbh: 78.40,
      gpqa: 44.44,
      // gsm8k: 89.84,
      // math: 60.80,
      // evalplus: 67.65,
      // "multipl-e": 58.75,
      mbpp: 69.80,
      // "crux-o": 62.00,
      // mgsm: 76.02,
      mmmlu: 75.72,
      // include: 59.40,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-4B (Base Model)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      mmlu: 72.99,
      // "mmlu-redux": 72.79,
      // "mmlu-pro": 50.58,
      // supergpqa: 28.43,
      // bbh: 72.59,
      gpqa: 36.87,
      // gsm8k: 87.79,
      // math: 54.10,
      // evalplus: 63.53,
      // "multipl-e": 53.13,
      mbpp: 67.00,
      // "crux-o": 55.00,
      // mgsm: 67.74,
      mmmlu: 71.42,
      // include: 56.29,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-1.7B (Base Model)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      mmlu: 62.63,
      // "mmlu-redux": 61.66,
      // "mmlu-pro": 36.76,
      // supergpqa: 20.92,
      // bbh: 54.47,
      gpqa: 28.28,
      // gsm8k: 75.44,
      // math: 43.50,
      // evalplus: 52.70,
      // "multipl-e": 42.71,
      mbpp: 55.40,
      // "crux-o": 36.40,
      // mgsm: 50.71,
      mmmlu: 63.27,
      // include: 45.57,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-0.6B (Base Model)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      mmlu: 52.81,
      // "mmlu-redux": 51.26,
      // "mmlu-pro": 24.74,
      // supergpqa: 15.03,
      // bbh: 41.47,
      gpqa: 26.77,
      // gsm8k: 59.59,
      // math: 32.44,
      // evalplus: 36.23,
      // "multipl-e": 24.58,
      mbpp: 36.60,
      // "crux-o": 27.00,
      // mgsm: 30.99,
      mmmlu: 50.16,
      // include: 34.26,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-235B-A22B (Thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 92.7,
      gpqa_diamond: 71.1,
      // "c-eval": 89.6,
      // "livebench-2024-11-25": 77.1,
      // "ifeval-strict-prompt": 83.4,
      // "arena-hard": 95.6,
      // "alignbench-v1.1": 8.94,
      // "creative-writing-v3": 84.6,
      // writingbench: 8.03,
      // "math-500": 98.0,
      aime_24: 85.7,
      aime_2025: 81.5,
      // zebralogic: 80.3,
      // autologi: 89.0,
      // "bfcl-v3": 70.8,
      // "livecodebench-v5": 70.7,
      // "codeforces-rating": 2056,
      // "codeforces-percentile": 98.2,
      // "multi-if": 71.9,
      // include: 78.7,
      // "mmmlu-14-languages": 84.3,
      // "mt-aime2024": 80.8,
      // polymath: 54.7,
      // mlogiqa: 77.1,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-235B-A22B (Non-thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 89.2,
      gpqa_diamond: 62.9,
      // "c-eval": 86.1,
      // "livebench-2024-11-25": 62.5,
      // "ifeval-strict-prompt": 83.2,
      // "arena-hard": 96.1,
      // "alignbench-v1.1": 8.91,
      // "creative-writing-v3": 80.4,
      // writingbench: 7.70,
      // "math-500": 91.2,
      aime_24: 40.1,
      aime_2025: 24.7,
      // zebralogic: 37.7,
      // autologi: 83.3,
      // "bfcl-v3": 68.0,
      // "livecodebench-v5": 35.3,
      // "codeforces-rating": 1387,
      // "codeforces-percentile": 75.7,
      // "multi-if": 70.2,
      // include: 75.6,
      // "mmmlu-14-languages": 79.8,
      // "mt-aime2024": 32.4,
      // polymath: 27.0,
      // mlogiqa: 67.6,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-32B (Thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 90.9,
      gpqa_diamond: 68.4,
      // "c-eval": 87.3,
      // "livebench-2024-11-25": 74.9,
      // "ifeval-strict-prompt": 85.0,
      // "arena-hard": 93.8,
      // "alignbench-v1.1": 8.72,
      // "creative-writing-v3": 81.0,
      // writingbench: 7.90,
      // "math-500": 97.2,
      aime_24: 81.4,
      aime_2025: 72.9,
      // zebralogic: 88.8,
      // autologi: 87.3,
      // "bfcl-v3": 70.3,
      // "livecodebench-v5": 65.7,
      // "codeforces-rating": 1977,
      // "codeforces-percentile": 97.7,
      // "multi-if": 73.0,
      // include: 73.7,
      // "mmmlu-14-languages": 80.6,
      // "mt-aime2024": 75.0,
      // polymath: 47.4,
      // mlogiqa: 76.3,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-32B (Non-thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 85.7,
      gpqa_diamond: 54.6,
      // "c-eval": 83.3,
      // "livebench-2024-11-25": 59.8,
      // "ifeval-strict-prompt": 83.2,
      // "arena-hard": 92.8,
      // "alignbench-v1.1": 8.58,
      // "creative-writing-v3": 78.3,
      // writingbench: 7.54,
      // "math-500": 88.6,
      aime_24: 31.0,
      aime_2025: 20.2,
      // zebralogic: 29.2,
      // autologi: 78.5,
      // "bfcl-v3": 63.0,
      // "livecodebench-v5": 31.3,
      // "codeforces-rating": 1353,
      // "codeforces-percentile": 71.0,
      // "multi-if": 70.7,
      // include: 70.9,
      // "mmmlu-14-languages": 76.5,
      // "mt-aime2024": 24.1,
      // polymath: 22.5,
      // mlogiqa: 62.9,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-14B (Thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 88.6,
      gpqa_diamond: 64.0,
      // "c-eval": 86.2,
      // "livebench-2024-11-25": 71.3,
      // "ifeval-strict-prompt": 85.4,
      // "arena-hard": 91.7,
      // "alignbench-v1.1": 8.56,
      // "creative-writing-v3": 80.3,
      // writingbench: 7.80,
      // "math-500": 96.8,
      aime_24: 79.3,
      aime_2025: 70.4,
      // zebralogic: 88.5,
      // autologi: 89.2,
      // "bfcl-v3": 70.4,
      // "livecodebench-v5": 63.5,
      // "codeforces-rating": 1766,
      // "codeforces-percentile": 95.3,
      // "multi-if": 74.8,
      // include: 71.7,
      // "mmmlu-14-languages": 77.9,
      // "mt-aime2024": 73.3,
      // polymath: 45.8,
      // mlogiqa: 71.1,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-30B-A3B (Thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 89.5,
      gpqa_diamond: 65.8,
      // "c-eval": 86.6,
      // "livebench-2024-11-25": 74.3,
      // "ifeval-strict-prompt": 86.5,
      // "arena-hard": 91.0,
      // "alignbench-v1.1": 8.70,
      // "creative-writing-v3": 79.1,
      // writingbench: 7.70,
      // "math-500": 98.0,
      aime_24: 80.4,
      aime_2025: 70.9,
      // zebralogic: 89.5,
      // autologi: 88.7,
      // "bfcl-v3": 69.1,
      // "livecodebench-v5": 62.6,
      // "codeforces-rating": 1974,
      // "codeforces-percentile": 97.7,
      // "multi-if": 72.2,
      // include: 71.9,
      // "mmmlu-14-languages": 78.4,
      // "mt-aime2024": 73.9,
      // polymath: 46.1,
      // mlogiqa: 70.1,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-14B (Non-thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 82.0,
      gpqa_diamond: 54.8,
      // "c-eval": 81.0,
      // "livebench-2024-11-25": 59.6,
      // "ifeval-strict-prompt": 84.8,
      // "arena-hard": 86.3,
      // "alignbench-v1.1": 8.52,
      // "creative-writing-v3": 73.1,
      // writingbench: 7.24,
      // "math-500": 90.0,
      aime_24: 31.7,
      aime_2025: 23.3,
      // zebralogic: 33.0,
      // autologi: 82.0,
      // "bfcl-v3": 61.5,
      // "livecodebench-v5": 29.0,
      // "codeforces-rating": 1200,
      // "codeforces-percentile": 58.6,
      // "multi-if": 72.9,
      // include: 67.8,
      // "mmmlu-14-languages": 72.6,
      // "mt-aime2024": 23.2,
      // polymath: 22.0,
      // mlogiqa: 58.9,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-30B-A3B (Non-thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 84.1,
      gpqa_diamond: 54.8,
      // "c-eval": 82.9,
      // "livebench-2024-11-25": 59.4,
      // "ifeval-strict-prompt": 83.7,
      // "arena-hard": 88.0,
      // "alignbench-v1.1": 8.55,
      // "creative-writing-v3": 68.1,
      // writingbench: 7.22,
      // "math-500": 89.8,
      aime_24: 32.8,
      aime_2025: 21.6,
      // zebralogic: 33.2,
      // autologi: 81.5,
      // "bfcl-v3": 58.6,
      // "livecodebench-v5": 29.8,
      // "codeforces-rating": 1267,
      // "codeforces-percentile": 64.1,
      // "multi-if": 70.8,
      // include: 67.8,
      // "mmmlu-14-languages": 73.8,
      // "mt-aime2024": 24.6,
      // polymath: 23.3,
      // mlogiqa: 53.3,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-4B (Thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 83.7,
      gpqa_diamond: 55.9,
      // "c-eval": 77.5,
      // "livebench-2024-11-25": 63.6,
      // "ifeval-strict-prompt": 81.9,
      // "arena-hard": 76.6,
      // "alignbench-v1.1": 8.30,
      // "creative-writing-v3": 61.1,
      // writingbench: 7.35,
      // "math-500": 97.0,
      aime_24: 73.8,
      aime_2025: 65.6,
      // zebralogic: 81.0,
      // autologi: 87.9,
      // "bfcl-v3": 65.9,
      // "livecodebench-v5": 54.2,
      // "codeforces-rating": 1671,
      // "codeforces-percentile": 92.8,
      // "multi-if": 66.3,
      // include: 61.8,
      // "mmmlu-14-languages": 69.8,
      // "mt-aime2024": 60.7,
      // polymath: 40.0,
      // mlogiqa: 65.9,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-8B (Thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 87.5,
      gpqa_diamond: 62.0,
      // "c-eval": 83.4,
      // "livebench-2024-11-25": 67.1,
      // "ifeval-strict-prompt": 85.0,
      // "arena-hard": 85.8,
      // "alignbench-v1.1": 8.46,
      // "creative-writing-v3": 75.0,
      // writingbench: 7.59,
      // "math-500": 97.4,
      aime_24: 76.0,
      aime_2025: 67.3,
      // zebralogic: 84.8,
      // autologi: 89.1,
      // "bfcl-v3": 68.1,
      // "livecodebench-v5": 57.5,
      // "codeforces-rating": 1785,
      // "codeforces-percentile": 95.6,
      // "multi-if": 71.2,
      // include: 67.8,
      // "mmmlu-14-languages": 74.4,
      // "mt-aime2024": 65.4,
      // polymath: 42.7,
      // mlogiqa: 69.0,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-4B (Non-thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 77.3,
      gpqa_diamond: 41.7,
      // "c-eval": 72.2,
      // "livebench-2024-11-25": 48.4,
      // "ifeval-strict-prompt": 81.2,
      // "arena-hard": 66.2,
      // "alignbench-v1.1": 8.10,
      // "creative-writing-v3": 53.6,
      // writingbench: 6.85,
      // "math-500": 84.8,
      aime_24: 25.0,
      aime_2025: 19.1,
      // zebralogic: 35.2,
      // autologi: 76.3,
      // "bfcl-v3": 57.6,
      // "livecodebench-v5": 21.3,
      // "codeforces-rating": 842,
      // "codeforces-percentile": 33.7,
      // "multi-if": 61.3,
      // include: 53.8,
      // "mmmlu-14-languages": 61.7,
      // "mt-aime2024": 13.9,
      // polymath: 16.6,
      // mlogiqa: 49.9,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-8B (Non-thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 79.5,
      gpqa_diamond: 39.3,
      // "c-eval": 77.9,
      // "livebench-2024-11-25": 53.5,
      // "ifeval-strict-prompt": 83.0,
      // "arena-hard": 79.6,
      // "alignbench-v1.1": 8.38,
      // "creative-writing-v3": 64.5,
      // writingbench: 7.15,
      // "math-500": 87.4,
      aime_24: 29.1,
      aime_2025: 20.9,
      // zebralogic: 26.7,
      // autologi: 76.5,
      // "bfcl-v3": 60.2,
      // "livecodebench-v5": 22.8,
      // "codeforces-rating": 1110,
      // "codeforces-percentile": 52.4,
      // "multi-if": 69.2,
      // include: 62.5,
      // "mmmlu-14-languages": 66.9,
      // "mt-aime2024": 16.6,
      // polymath: 18.8,
      // mlogiqa: 51.4,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-0.6B (Thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 55.6,
      gpqa_diamond: 27.9,
      // "c-eval": 50.4,
      // "livebench-2024-11-25": 30.3,
      // "ifeval-strict-prompt": 59.2,
      // "arena-hard": 8.5,
      // "alignbench-v1.1": 6.10,
      // "creative-writing-v3": 30.6,
      // writingbench: 5.61,
      // "math-500": 77.6,
      aime_24: 10.7,
      aime_2025: 15.1,
      // zebralogic: 30.3,
      // autologi: 61.6,
      // "bfcl-v3": 46.4,
      // "livecodebench-v5": 12.3,
      // "multi-if": 36.1,
      // include: 35.9,
      // "mmmlu-14-languages": 43.1,
      // "mt-aime2024": 7.8,
      // polymath: 11.4,
      // mlogiqa: 40.9,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-1.7B (Thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 73.9,
      gpqa_diamond: 40.1,
      // "c-eval": 68.1,
      // "livebench-2024-11-25": 51.1,
      // "ifeval-strict-prompt": 72.5,
      // "arena-hard": 43.1,
      // "alignbench-v1.1": 7.60,
      // "creative-writing-v3": 48.0,
      // writingbench: 7.02,
      // "math-500": 93.4,
      aime_24: 48.3,
      aime_2025: 36.8,
      // zebralogic: 63.2,
      // autologi: 83.2,
      // "bfcl-v3": 56.6,
      // "livecodebench-v5": 33.2,
      // "multi-if": 51.2,
      // include: 51.8,
      // "mmmlu-14-languages": 59.1,
      // "mt-aime2024": 36.1,
      // polymath: 25.2,
      // mlogiqa: 56.0,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-0.6B (Non-thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 44.6,
      gpqa_diamond: 22.9,
      // "c-eval": 42.6,
      // "livebench-2024-11-25": 21.8,
      // "ifeval-strict-prompt": 54.5,
      // "arena-hard": 6.5,
      // "alignbench-v1.1": 5.60,
      // "creative-writing-v3": 28.4,
      // writingbench: 5.13,
      // "math-500": 55.2,
      aime_24: 3.4,
      aime_2025: 2.6,
      // zebralogic: 4.2,
      // autologi: 37.4,
      // "bfcl-v3": 44.1,
      // "livecodebench-v5": 3.6,
      // "multi-if": 33.3,
      // include: 34.4,
      // "mmmlu-14-languages": 37.1,
      // "mt-aime2024": 1.5,
      // polymath: 4.6,
      // mlogiqa: 37.3,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
  {
    model: "Qwen3-1.7B (Non-thinking Mode)",
    provider: "Qwen",
    inputPrice: 0,
    outputPrice: 0,
    benchmark: {
      // "mmlu-redux": 64.4,
      gpqa_diamond: 28.6,
      // "c-eval": 61.0,
      // "livebench-2024-11-25": 35.6,
      // "ifeval-strict-prompt": 68.2,
      // "arena-hard": 36.9,
      // "alignbench-v1.1": 7.20,
      // "creative-writing-v3": 43.6,
      // writingbench: 6.54,
      // "math-500": 73.0,
      aime_24: 13.4,
      aime_2025: 9.8,
      // zebralogic: 12.8,
      // autologi: 59.8,
      // "bfcl-v3": 52.2,
      // "livecodebench-v5": 11.6,
      // "multi-if": 44.7,
      // include: 42.6,
      // "mmmlu-14-languages": 48.3,
      // "mt-aime2024": 4.9,
      // polymath: 10.3,
      // mlogiqa: 41.1,
    },
    source: "https://arxiv.org/pdf/2505.09388",
  },
];