PeterKruger committed on
Commit
ed18635
·
verified ·
1 Parent(s): fa11ba1

Upload summary_data.csv

Browse files
Files changed (1) hide show
  1. runs/run_2025-08-14/summary_data.csv +34 -34
runs/run_2025-08-14/summary_data.csv CHANGED
@@ -1,34 +1,34 @@
1
- Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
2
- claude-3.5-haiku,393,3.586292962,1317,23326,0.634,0.008262832,11.51902452,17.98,4.15%
3
- claude-opus-4-1,387,4.239909895,1446,58830,,0.091256434,48.62490598,32.86,5.61%
4
- claude-sonnet-4,393,4.171968576,1399,61000,0.842,0.017099466,33.66639032,82.6,4.15%
5
- deepSeek-R1-0528,385,4.18112906,1418,58740,0.849,0.006382309,119.174235,223.47,6.10%
6
- deepSeek-V3-0324,392,3.945669087,1390,43990,0.819,0.00119639,40.30336432,106.53,4.39%
7
- gemini-2.5-flash,387,4.32099389,1409,58430,0.759,0.004512314,48.7078753,140.54,5.61%
8
- gemini-2.5-flash-lite,389,4.017202952,1351,44348,0.832,0.001052718,19.15509939,8.82,5.12%
9
- gemini-2.5-pro,388,4.416904571,1458,64630,0.862,0.015866994,65.03115036,64.18,5.37%
10
- gemma-3-27b-it,393,3.881640548,1363,25220,0.669,0.00028134,29.7215,79.12,4.15%
11
- GLM-4.5,389,4.176558031,1414,56080,0.835,0.00629521,80.74437254,29.19,5.12%
12
- GLM-4.5-Air,392,3.98464018,1379,49475,0.815,0.003611243,68.34050587,21.75,4.39%
13
- gpt-4.1,392,4.165890881,1406,46770,0.806,0.009144648,32.86274006,23.32,4.39%
14
- gpt-5,385,4.511567341,1481,68950,0.871,0.043676351,89.99818067,69.79,6.10%
15
- gpt-5-mini,392,4.486571107,,63700,0.828,0.006324841,65.89701176,48.74,4.39%
16
- gpt-5-nano,390,4.325926956,,53780,0.772,0.002414388,66.4959839,73.7,4.88%
17
- gpt-oss-120b,388,4.479287977,1356,61340,0.808,0.001361942,27.00733404,94.45,5.37%
18
- grok-3-mini,391,4.055940505,1360,58010,0.828,0.000895661,26.12147499,23.11,4.63%
19
- grok-4,360,4.308828831,1430,67520,0.866,0.029202342,60.95525411,13.82,12.20%
20
- Kimi-K2-Instruct,325,4.177138663,1420,48560,0.824,0.002379296,65.0222057,96.77,20.73%
21
- llama-3_1-Nemotron-Ultra-253B-v1,391,4.020264345,1345,46420,0.825,0.003451924,61.53657957,29.62,4.63%
22
- llama-3_3-Nemotron-Super-49B-v1,392,3.883310532,1324,40473,0.698,0.000455359,32.63831081,12.47,4.39%
23
- llama-4-maverick,388,3.640194992,1330,41730,0.809,0.000496574,10.65014104,9.93,5.37%
24
- llama-4-Scout,393,3.614481399,1318,33060,0.752,0.000410484,10.86684261,23.67,4.15%
25
- magistral-small-2506,390,3.713933337,1347,35950,0.746,0.0019781,17.53939687,52.3,4.88%
26
- mistral-large-2411,392,3.714675671,1313,27013,0.697,0.006101138,24.36368715,66.7,4.39%
27
- nova-lite-v1,393,3.538201832,1262,24540,0.59,0.00018322,5.288625128,21.75,4.15%
28
- nova-pro-v1,389,3.490835422,1289,28830,0.691,0.001800498,7.528192069,23.32,5.12%
29
- o3,391,4.409851586,1451,67070,0.853,0.018504711,63.89621339,69.79,4.63%
30
- o4-mini,393,4.27410734,1398,65050,0.832,0.008704364,39.05469579,48.74,4.15%
31
- phi-4,392,3.657791802,1258,27950,0.714,0.000240436,7.744667446,73.7,4.39%
32
- Qwen3-14B,392,3.976245179,,45235,0.774,0.000789184,61.11544056,94.45,4.39%
33
- Qwen3-235B-A22B-Thinking-2507,331,4.394399183,1401,63590,0.843,0.00416518,78.79346155,23.11,19.27%
34
- Qwen3-30B-A3B,390,3.952481327,1380,42340,0.777,0.000763337,72.64171253,13.82,4.88%
 
1
+ Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %,
2
+ claude-3.5-haiku,393,3.586292962,1317,23326,0.634,0.008262832,11.51902452,17.98,4.15%,3.586292962
3
+ claude-opus-4-1,387,4.239909895,1446,58830,,0.091256434,48.62490598,32.86,5.61%,4.239909895
4
+ claude-sonnet-4,393,4.171968576,1399,61000,0.842,0.017099466,33.66639032,82.6,4.15%,4.171968576
5
+ deepSeek-R1-0528,385,4.18112906,1418,58740,0.849,0.006382309,119.174235,223.47,6.10%,4.18112906
6
+ deepSeek-V3-0324,392,3.945669087,1390,43990,0.819,0.00119639,40.30336432,106.53,4.39%,3.945669087
7
+ gemini-2.5-flash,387,4.32099389,1409,58430,0.759,0.004512314,48.7078753,140.54,5.61%,4.32099389
8
+ gemini-2.5-flash-lite,389,4.017202952,1351,44348,0.832,0.001052718,19.15509939,8.82,5.12%,4.017202952
9
+ gemini-2.5-pro,388,4.416904571,1458,64630,0.862,0.015866994,65.03115036,64.18,5.37%,4.416904571
10
+ gemma-3-27b-it,393,3.881640548,1363,25220,0.669,0.00028134,29.7215,79.12,4.15%,3.881640548
11
+ GLM-4.5,389,4.176558031,1414,56080,0.835,0.00629521,80.74437254,29.19,5.12%,4.176558031
12
+ GLM-4.5-Air,392,3.98464018,1379,49475,0.815,0.003611243,68.34050587,21.75,4.39%,3.98464018
13
+ gpt-4.1,392,4.165890881,1406,46770,0.806,0.009144648,32.86274006,23.32,4.39%,4.165890881
14
+ gpt-5,385,4.511567341,1481,68950,0.871,0.043676351,89.99818067,69.79,6.10%,4.511567341
15
+ gpt-5-mini,392,4.486571107,,63700,0.828,0.006324841,65.89701176,48.74,4.39%,4.486571107
16
+ gpt-5-nano,390,4.325926956,,53780,0.772,0.002414388,66.4959839,73.7,4.88%,4.325926956
17
+ gpt-oss-120b,388,4.479287977,1356,61340,0.808,0.001361942,27.00733404,94.45,5.37%,4.479287977
18
+ grok-3-mini,391,4.055940505,1360,58010,0.828,0.000895661,26.12147499,23.11,4.63%,4.055940505
19
+ grok-4,360,4.308828831,1430,67520,0.866,0.029202342,60.95525411,13.82,12.20%,4.308828831
20
+ Kimi-K2-Instruct,325,4.177138663,1420,48560,0.824,0.002379296,65.0222057,96.77,20.73%,4.177138663
21
+ llama-3_1-Nemotron-Ultra-253B-v1,391,4.020264345,1345,46420,0.825,0.003451924,61.53657957,29.62,4.63%,4.020264345
22
+ llama-3_3-Nemotron-Super-49B-v1,392,3.883310532,1324,40473,0.698,0.000455359,32.63831081,12.47,4.39%,3.883310532
23
+ llama-4-maverick,388,3.640194992,1330,41730,0.809,0.000496574,10.65014104,9.93,5.37%,3.640194992
24
+ llama-4-Scout,393,3.614481399,1318,33060,0.752,0.000410484,10.86684261,23.67,4.15%,3.614481399
25
+ magistral-small-2506,390,3.713933337,1347,35950,0.746,0.0019781,17.53939687,52.3,4.88%,3.713933337
26
+ mistral-large-2411,392,3.714675671,1313,27013,0.697,0.006101138,24.36368715,66.7,4.39%,3.714675671
27
+ nova-lite-v1,393,3.538201832,1262,24540,0.59,0.00018322,5.288625128,21.75,4.15%,3.538201832
28
+ nova-pro-v1,389,3.490835422,1289,28830,0.691,0.001800498,7.528192069,23.32,5.12%,3.490835422
29
+ o3,391,4.409851586,1451,67070,0.853,0.018504711,63.89621339,69.79,4.63%,4.409851586
30
+ o4-mini,393,4.27410734,1398,65050,0.832,0.008704364,39.05469579,48.74,4.15%,4.27410734
31
+ phi-4,392,3.657791802,1258,27950,0.714,0.000240436,7.744667446,73.7,4.39%,3.657791802
32
+ Qwen3-14B,392,3.976245179,,45235,0.774,0.000789184,61.11544056,94.45,4.39%,3.976245179
33
+ Qwen3-235B-A22B-Thinking-2507,331,4.394399183,1401,63590,0.843,0.00416518,78.79346155,23.11,19.27%,4.394399183
34
+ Qwen3-30B-A3B,390,3.952481327,1380,42340,0.777,0.000763337,72.64171253,13.82,4.88%,3.952481327