Spaces:
Running
Running
Upload summary_data.csv
Browse files
runs/run_2025-08-14/summary_data.csv
CHANGED
@@ -1,34 +1,34 @@
|
|
1 |
-
Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate
|
2 |
-
claude-3.5-haiku,393,3.586292962,1317,23326,0.634,0.008262832,11.51902452,
|
3 |
-
claude-opus-4-1,387,4.239909895,1446,58830,,0.091256434,48.62490598,
|
4 |
-
claude-sonnet-4,393,4.171968576,1399,61000,0.842,0.017099466,33.66639032,
|
5 |
-
deepSeek-R1-0528,385,4.18112906,1418,58740,0.849,0.006382309,119.174235,
|
6 |
-
deepSeek-V3-0324,392,3.945669087,1390,43990,0.819,0.00119639,40.30336432,
|
7 |
-
gemini-2.5-flash,387,4.32099389,1409,58430,0.759,0.004512314,48.7078753,
|
8 |
-
gemini-2.5-flash-lite,389,4.017202952,1351,44348,0.832,0.001052718,19.15509939,
|
9 |
-
gemini-2.5-pro,388,4.416904571,1458,64630,0.862,0.015866994,65.03115036,
|
10 |
-
gemma-3-27b-it,393,3.881640548,1363,25220,0.669,0.00028134,29.7215,
|
11 |
-
GLM-4.5,389,4.176558031,1414,56080,0.835,0.00629521,80.74437254,
|
12 |
-
GLM-4.5-Air,392,3.98464018,1379,49475,0.815,0.003611243,68.34050587,
|
13 |
-
gpt-4.1,392,4.165890881,1406,46770,0.806,0.009144648,32.86274006,
|
14 |
-
gpt-5,385,4.511567341,1481,68950,0.871,0.043676351,89.99818067,
|
15 |
-
gpt-5-mini,392,4.486571107,,63700,0.828,0.006324841,65.89701176,
|
16 |
-
gpt-5-nano,390,4.325926956,,53780,0.772,0.002414388,66.4959839,
|
17 |
-
gpt-oss-120b,388,4.479287977,1356,61340,0.808,0.001361942,27.00733404,
|
18 |
-
grok-3-mini,391,4.055940505,1360,58010,0.828,0.000895661,26.12147499,
|
19 |
-
grok-4,360,4.308828831,1430,67520,0.866,0.029202342,60.95525411,
|
20 |
-
Kimi-K2-Instruct,325,4.177138663,1420,48560,0.824,0.002379296,65.0222057,
|
21 |
-
llama-3_1-Nemotron-Ultra-253B-v1,391,4.020264345,1345,46420,0.825,0.003451924,61.53657957,
|
22 |
-
llama-3_3-Nemotron-Super-49B-v1,392,3.883310532,1324,40473,0.698,0.000455359,32.63831081,
|
23 |
-
llama-4-maverick,388,3.640194992,1330,41730,0.809,0.000496574,10.65014104,
|
24 |
-
llama-4-Scout,393,3.614481399,1318,33060,0.752,0.000410484,10.86684261,
|
25 |
-
magistral-small-2506,390,3.713933337,1347,35950,0.746,0.0019781,17.53939687,
|
26 |
-
mistral-large-2411,392,3.714675671,1313,27013,0.697,0.006101138,24.36368715,
|
27 |
-
nova-lite-v1,393,3.538201832,1262,24540,0.59,0.00018322,5.288625128,
|
28 |
-
nova-pro-v1,389,3.490835422,1289,28830,0.691,0.001800498,7.528192069,
|
29 |
-
o3,391,4.409851586,1451,67070,0.853,0.018504711,63.89621339,
|
30 |
-
o4-mini,393,4.27410734,1398,65050,0.832,0.008704364,39.05469579,
|
31 |
-
phi-4,392,3.657791802,1258,27950,0.714,0.000240436,7.744667446,
|
32 |
-
Qwen3-14B,392,3.976245179,,45235,0.774,0.000789184,61.11544056,
|
33 |
-
Qwen3-235B-A22B-Thinking-2507,331,4.394399183,1401,63590,0.843,0.00416518,78.79346155,
|
34 |
-
Qwen3-30B-A3B,390,3.952481327,1380,42340,0.777,0.000763337,72.64171253,
|
|
|
1 |
+
Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
|
2 |
+
claude-3.5-haiku,393,3.586292962,1317,23326,0.634,0.008262832,11.51902452,25.3473,4.15%
|
3 |
+
claude-opus-4-1,387,4.239909895,1446,58830,,0.091256434,48.62490598,155.1479,5.61%
|
4 |
+
claude-sonnet-4,393,4.171968576,1399,61000,0.842,0.017099466,33.66639032,119.5972,4.15%
|
5 |
+
deepSeek-R1-0528,385,4.18112906,1418,58740,0.849,0.006382309,119.174235,265.5545,6.10%
|
6 |
+
deepSeek-V3-0324,392,3.945669087,1390,43990,0.819,0.00119639,40.30336432,199.711,4.39%
|
7 |
+
gemini-2.5-flash,387,4.32099389,1409,58430,0.759,0.004512314,48.7078753,244.1111,5.61%
|
8 |
+
gemini-2.5-flash-lite,389,4.017202952,1351,44348,0.832,0.001052718,19.15509939,127.4435,5.12%
|
9 |
+
gemini-2.5-pro,388,4.416904571,1458,64630,0.862,0.015866994,65.03115036,199.3419,5.37%
|
10 |
+
gemma-3-27b-it,393,3.881640548,1363,25220,0.669,0.00028134,29.7215,134.5146,4.15%
|
11 |
+
GLM-4.5,389,4.176558031,1414,56080,0.835,0.00629521,80.74437254,246.0464,5.12%
|
12 |
+
GLM-4.5-Air,392,3.98464018,1379,49475,0.815,0.003611243,68.34050587,240.499,4.39%
|
13 |
+
gpt-4.1,392,4.165890881,1406,46770,0.806,0.009144648,32.86274006,180.7419,4.39%
|
14 |
+
gpt-5,385,4.511567341,1481,68950,0.871,0.043676351,89.99818067,277.6722,6.10%
|
15 |
+
gpt-5-mini,392,4.486571107,,63700,0.828,0.006324841,65.89701176,231.3798,4.39%
|
16 |
+
gpt-5-nano,390,4.325926956,,53780,0.772,0.002414388,66.4959839,231.9208,4.88%
|
17 |
+
gpt-oss-120b,388,4.479287977,1356,61340,0.808,0.001361942,27.00733404,119.1503,5.37%
|
18 |
+
grok-3-mini,391,4.055940505,1360,58010,0.828,0.000895661,26.12147499,116.1006,4.63%
|
19 |
+
grok-4,360,4.308828831,1430,67520,0.866,0.029202342,60.95525411,262.5205,12.20%
|
20 |
+
Kimi-K2-Instruct,325,4.177138663,1420,48560,0.824,0.002379296,65.0222057,390.4685,20.73%
|
21 |
+
llama-3_1-Nemotron-Ultra-253B-v1,391,4.020264345,1345,46420,0.825,0.003451924,61.53657957,201.967,4.63%
|
22 |
+
llama-3_3-Nemotron-Super-49B-v1,392,3.883310532,1324,40473,0.698,0.000455359,32.63831081,151.4413,4.39%
|
23 |
+
llama-4-maverick,388,3.640194992,1330,41730,0.809,0.000496574,10.65014104,71.1375,5.37%
|
24 |
+
llama-4-Scout,393,3.614481399,1318,33060,0.752,0.000410484,10.86684261,39.6221,4.15%
|
25 |
+
magistral-small-2506,390,3.713933337,1347,35950,0.746,0.0019781,17.53939687,89.5235,4.88%
|
26 |
+
mistral-large-2411,392,3.714675671,1313,27013,0.697,0.006101138,24.36368715,96.8859,4.39%
|
27 |
+
nova-lite-v1,393,3.538201832,1262,24540,0.59,0.00018322,5.288625128,10.2766,4.15%
|
28 |
+
nova-pro-v1,389,3.490835422,1289,28830,0.691,0.001800498,7.528192069,20.1557,5.12%
|
29 |
+
o3,391,4.409851586,1451,67070,0.853,0.018504711,63.89621339,276.7346,4.63%
|
30 |
+
o4-mini,393,4.27410734,1398,65050,0.832,0.008704364,39.05469579,185.5417,4.15%
|
31 |
+
phi-4,392,3.657791802,1258,27950,0.714,0.000240436,7.744667446,19.1869,4.39%
|
32 |
+
Qwen3-14B,392,3.976245179,,45235,0.774,0.000789184,61.11544056,239.235,4.39%
|
33 |
+
Qwen3-235B-A22B-Thinking-2507,331,4.394399183,1401,63590,0.843,0.00416518,78.79346155,283.8425,19.27%
|
34 |
+
Qwen3-30B-A3B,390,3.952481327,1380,42340,0.777,0.000763337,72.64171253,243.0743,4.88%
|