Upload summary_data.csv
runs/run_2025-08-14/summary_data.csv CHANGED
@@ -1,34 +1,34 @@
-Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate
-claude-3.5-haiku,393,3.586292962,1317,23326,0.634,0.008262832,11.51902452,17.98,4.15
-claude-opus-4-1,387,4.239909895,1446,58830,,0.091256434,48.62490598,32.86,5.61
-claude-sonnet-4,393,4.171968576,1399,61000,0.842,0.017099466,33.66639032,82.6,4.15
-deepSeek-R1-0528,385,4.18112906,1418,58740,0.849,0.006382309,119.174235,223.47,6.10
-deepSeek-V3-0324,392,3.945669087,1390,43990,0.819,0.00119639,40.30336432,106.53,4.39
-gemini-2.5-flash,387,4.32099389,1409,58430,0.759,0.004512314,48.7078753,140.54,5.61
-gemini-2.5-flash-lite,389,4.017202952,1351,44348,0.832,0.001052718,19.15509939,8.82,5.12
-gemini-2.5-pro,388,4.416904571,1458,64630,0.862,0.015866994,65.03115036,64.18,5.37
-gemma-3-27b-it,393,3.881640548,1363,25220,0.669,0.00028134,29.7215,79.12,4.15
-GLM-4.5,389,4.176558031,1414,56080,0.835,0.00629521,80.74437254,29.19,5.12
-GLM-4.5-Air,392,3.98464018,1379,49475,0.815,0.003611243,68.34050587,21.75,4.39
-gpt-4.1,392,4.165890881,1406,46770,0.806,0.009144648,32.86274006,23.32,4.39
-gpt-5,385,4.511567341,1481,68950,0.871,0.043676351,89.99818067,69.79,6.10
-gpt-5-mini,392,4.486571107,,63700,0.828,0.006324841,65.89701176,48.74,4.39
-gpt-5-nano,390,4.325926956,,53780,0.772,0.002414388,66.4959839,73.7,4.88
-gpt-oss-120b,388,4.479287977,1356,61340,0.808,0.001361942,27.00733404,94.45,5.37
-grok-3-mini,391,4.055940505,1360,58010,0.828,0.000895661,26.12147499,23.11,4.63
-grok-4,360,4.308828831,1430,67520,0.866,0.029202342,60.95525411,13.82,12.20
-Kimi-K2-Instruct,325,4.177138663,1420,48560,0.824,0.002379296,65.0222057,96.77,20.73
-llama-3_1-Nemotron-Ultra-253B-v1,391,4.020264345,1345,46420,0.825,0.003451924,61.53657957,29.62,4.63
-llama-3_3-Nemotron-Super-49B-v1,392,3.883310532,1324,40473,0.698,0.000455359,32.63831081,12.47,4.39
-llama-4-maverick,388,3.640194992,1330,41730,0.809,0.000496574,10.65014104,9.93,5.37
-llama-4-Scout,393,3.614481399,1318,33060,0.752,0.000410484,10.86684261,23.67,4.15
-magistral-small-2506,390,3.713933337,1347,35950,0.746,0.0019781,17.53939687,52.3,4.88
-mistral-large-2411,392,3.714675671,1313,27013,0.697,0.006101138,24.36368715,66.7,4.39
-nova-lite-v1,393,3.538201832,1262,24540,0.59,0.00018322,5.288625128,21.75,4.15
-nova-pro-v1,389,3.490835422,1289,28830,0.691,0.001800498,7.528192069,23.32,5.12
-o3,391,4.409851586,1451,67070,0.853,0.018504711,63.89621339,69.79,4.63
-o4-mini,393,4.27410734,1398,65050,0.832,0.008704364,39.05469579,48.74,4.15
-phi-4,392,3.657791802,1258,27950,0.714,0.000240436,7.744667446,73.7,4.39
-Qwen3-14B,392,3.976245179,,45235,0.774,0.000789184,61.11544056,94.45,4.39
-Qwen3-235B-A22B-Thinking-2507,331,4.394399183,1401,63590,0.843,0.00416518,78.79346155,23.11,19.27
-Qwen3-30B-A3B,390,3.952481327,1380,42340,0.777,0.000763337,72.64171253,13.82,4.88
+Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %,
+claude-3.5-haiku,393,3.586292962,1317,23326,0.634,0.008262832,11.51902452,17.98,4.15%,3.586292962
+claude-opus-4-1,387,4.239909895,1446,58830,,0.091256434,48.62490598,32.86,5.61%,4.239909895
+claude-sonnet-4,393,4.171968576,1399,61000,0.842,0.017099466,33.66639032,82.6,4.15%,4.171968576
+deepSeek-R1-0528,385,4.18112906,1418,58740,0.849,0.006382309,119.174235,223.47,6.10%,4.18112906
+deepSeek-V3-0324,392,3.945669087,1390,43990,0.819,0.00119639,40.30336432,106.53,4.39%,3.945669087
+gemini-2.5-flash,387,4.32099389,1409,58430,0.759,0.004512314,48.7078753,140.54,5.61%,4.32099389
+gemini-2.5-flash-lite,389,4.017202952,1351,44348,0.832,0.001052718,19.15509939,8.82,5.12%,4.017202952
+gemini-2.5-pro,388,4.416904571,1458,64630,0.862,0.015866994,65.03115036,64.18,5.37%,4.416904571
+gemma-3-27b-it,393,3.881640548,1363,25220,0.669,0.00028134,29.7215,79.12,4.15%,3.881640548
+GLM-4.5,389,4.176558031,1414,56080,0.835,0.00629521,80.74437254,29.19,5.12%,4.176558031
+GLM-4.5-Air,392,3.98464018,1379,49475,0.815,0.003611243,68.34050587,21.75,4.39%,3.98464018
+gpt-4.1,392,4.165890881,1406,46770,0.806,0.009144648,32.86274006,23.32,4.39%,4.165890881
+gpt-5,385,4.511567341,1481,68950,0.871,0.043676351,89.99818067,69.79,6.10%,4.511567341
+gpt-5-mini,392,4.486571107,,63700,0.828,0.006324841,65.89701176,48.74,4.39%,4.486571107
+gpt-5-nano,390,4.325926956,,53780,0.772,0.002414388,66.4959839,73.7,4.88%,4.325926956
+gpt-oss-120b,388,4.479287977,1356,61340,0.808,0.001361942,27.00733404,94.45,5.37%,4.479287977
+grok-3-mini,391,4.055940505,1360,58010,0.828,0.000895661,26.12147499,23.11,4.63%,4.055940505
+grok-4,360,4.308828831,1430,67520,0.866,0.029202342,60.95525411,13.82,12.20%,4.308828831
+Kimi-K2-Instruct,325,4.177138663,1420,48560,0.824,0.002379296,65.0222057,96.77,20.73%,4.177138663
+llama-3_1-Nemotron-Ultra-253B-v1,391,4.020264345,1345,46420,0.825,0.003451924,61.53657957,29.62,4.63%,4.020264345
+llama-3_3-Nemotron-Super-49B-v1,392,3.883310532,1324,40473,0.698,0.000455359,32.63831081,12.47,4.39%,3.883310532
+llama-4-maverick,388,3.640194992,1330,41730,0.809,0.000496574,10.65014104,9.93,5.37%,3.640194992
+llama-4-Scout,393,3.614481399,1318,33060,0.752,0.000410484,10.86684261,23.67,4.15%,3.614481399
+magistral-small-2506,390,3.713933337,1347,35950,0.746,0.0019781,17.53939687,52.3,4.88%,3.713933337
+mistral-large-2411,392,3.714675671,1313,27013,0.697,0.006101138,24.36368715,66.7,4.39%,3.714675671
+nova-lite-v1,393,3.538201832,1262,24540,0.59,0.00018322,5.288625128,21.75,4.15%,3.538201832
+nova-pro-v1,389,3.490835422,1289,28830,0.691,0.001800498,7.528192069,23.32,5.12%,3.490835422
+o3,391,4.409851586,1451,67070,0.853,0.018504711,63.89621339,69.79,4.63%,4.409851586
+o4-mini,393,4.27410734,1398,65050,0.832,0.008704364,39.05469579,48.74,4.15%,4.27410734
+phi-4,392,3.657791802,1258,27950,0.714,0.000240436,7.744667446,73.7,4.39%,3.657791802
+Qwen3-14B,392,3.976245179,,45235,0.774,0.000789184,61.11544056,94.45,4.39%,3.976245179
+Qwen3-235B-A22B-Thinking-2507,331,4.394399183,1401,63590,0.843,0.00416518,78.79346155,23.11,19.27%,4.394399183
+Qwen3-30B-A3B,390,3.952481327,1380,42340,0.777,0.000763337,72.64171253,13.82,4.88%,3.952481327
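In the updated file, each Fail Rate value carries a percent sign, and the trailing comma in the header introduces an unnamed eleventh column that simply repeats the AutoBench score. The Python sketch below shows one way a consumer might read the new layout; it is an illustration only, not part of the Space's own code, and it assumes pandas is available and the file path matches the one in this commit.

import pandas as pd

# Read the updated summary CSV from this commit's path.
df = pd.read_csv("runs/run_2025-08-14/summary_data.csv")

# The trailing comma in the header yields an extra column pandas names
# "Unnamed: 10"; it only duplicates the AutoBench score, so drop it.
df = df.loc[:, ~df.columns.str.startswith("Unnamed")]

# "Fail Rate %" is now a percent string (e.g. "4.15%"); strip the sign
# and cast back to float for numeric analysis.
df["Fail Rate %"] = df["Fail Rate %"].str.rstrip("%").astype(float)

print(df[["Model", "AutoBench", "Fail Rate %"]].head())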