jansowa commited on
Commit
fa79b35
·
verified ·
1 Parent(s): c1a3e8e

Update data.json

Browse files
Files changed (1) hide show
  1. data.json +105 -65
data.json CHANGED
@@ -5,15 +5,23 @@
5
  "Sentiment": 4.230769230769231,
6
  "Language understanding": 4.0,
7
  "Phraseology": 3.86,
8
- "Tricky questions": 3.9
9
  },
10
  {
11
- "Model": "alpindale/WizardLM-2-8x22B",
 
 
 
 
 
 
 
 
12
  "Params": "141B",
13
  "Sentiment": 3.7051282051282053,
14
  "Language understanding": 3.815,
15
  "Phraseology": 4.22,
16
- "Tricky questions": 3.9
17
  },
18
  {
19
  "Model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
@@ -21,7 +29,7 @@
21
  "Sentiment": 4.326923076923077,
22
  "Language understanding": 3.91,
23
  "Phraseology": 3.25,
24
- "Tricky questions": 3.9
25
  },
26
  {
27
  "Model": "meta-llama/Meta-Llama-3-70B-Instruct",
@@ -29,7 +37,7 @@
29
  "Sentiment": 4.134615384615385,
30
  "Language understanding": 3.82,
31
  "Phraseology": 3.465,
32
- "Tricky questions": 3.9
33
  },
34
  {
35
  "Model": "speakleash/Bielik-11B-v2.3-Instruct",
@@ -37,15 +45,15 @@
37
  "Sentiment": 3.9743589743589745,
38
  "Language understanding": 3.785,
39
  "Phraseology": 3.55,
40
- "Tricky questions": 3.9
41
  },
42
  {
43
- "Model": "mistralai/Mixtral-8x22B-Instruct-v0.1",
44
  "Params": "141B",
45
  "Sentiment": 3.782051282051282,
46
  "Language understanding": 3.675,
47
  "Phraseology": 3.55,
48
- "Tricky questions": 3.9
49
  },
50
  {
51
  "Model": "speakleash/Bielik-11B-v2.1-Instruct",
@@ -53,7 +61,7 @@
53
  "Sentiment": 3.9551282051282053,
54
  "Language understanding": 3.915,
55
  "Phraseology": 3.105,
56
- "Tricky questions": 3.9
57
  },
58
  {
59
  "Model": "Qwen/Qwen2-72B-Instruct",
@@ -61,7 +69,7 @@
61
  "Sentiment": 3.7628205128205128,
62
  "Language understanding": 3.89,
63
  "Phraseology": 3.28,
64
- "Tricky questions": 3.9
65
  },
66
  {
67
  "Model": "speakleash/Bielik-11B-v2.0-Instruct",
@@ -69,7 +77,7 @@
69
  "Sentiment": 3.9743589743589745,
70
  "Language understanding": 3.745,
71
  "Phraseology": 3.125,
72
- "Tricky questions": 3.9
73
  },
74
  {
75
  "Model": "speakleash/Bielik-11B-v2.2-Instruct",
@@ -77,7 +85,7 @@
77
  "Sentiment": 3.717948717948718,
78
  "Language understanding": 3.73,
79
  "Phraseology": 3.25,
80
- "Tricky questions": 3.9
81
  },
82
  {
83
  "Model": "Qwen/Qwen1.5-72B-Chat",
@@ -85,7 +93,7 @@
85
  "Sentiment": 3.4743589743589745,
86
  "Language understanding": 3.515,
87
  "Phraseology": 2.975,
88
- "Tricky questions": 3.9
89
  },
90
  {
91
  "Model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -93,7 +101,7 @@
93
  "Sentiment": 3.9743589743589745,
94
  "Language understanding": 3.38,
95
  "Phraseology": 2.58,
96
- "Tricky questions": 3.9
97
  },
98
  {
99
  "Model": "THUDM/glm-4-9b-chat",
@@ -101,7 +109,7 @@
101
  "Sentiment": 3.58974358974359,
102
  "Language understanding": 3.455,
103
  "Phraseology": 2.78,
104
- "Tricky questions": 3.9
105
  },
106
  {
107
  "Model": "mistralai/Mistral-Nemo-Instruct-2407",
@@ -109,7 +117,7 @@
109
  "Sentiment": 3.641025641025641,
110
  "Language understanding": 3.29,
111
  "Phraseology": 2.74,
112
- "Tricky questions": 3.9
113
  },
114
  {
115
  "Model": "meta-llama/Meta-Llama-3-8B-Instruct",
@@ -117,7 +125,7 @@
117
  "Sentiment": 3.3333333333333335,
118
  "Language understanding": 3.15,
119
  "Phraseology": 3.035,
120
- "Tricky questions": 3.9
121
  },
122
  {
123
  "Model": "upstage/SOLAR-10.7B-Instruct-v1.0",
@@ -125,7 +133,7 @@
125
  "Sentiment": 2.967948717948718,
126
  "Language understanding": 3.18,
127
  "Phraseology": 3.255,
128
- "Tricky questions": 3.9
129
  },
130
  {
131
  "Model": "speakleash/Bielik-7B-Instruct-v0.1",
@@ -133,7 +141,7 @@
133
  "Sentiment": 3.58974358974359,
134
  "Language understanding": 3.475,
135
  "Phraseology": 2.315,
136
- "Tricky questions": 3.9
137
  },
138
  {
139
  "Model": "openchat/openchat-3.5-0106-gemma",
@@ -141,7 +149,7 @@
141
  "Sentiment": 3.730769230769231,
142
  "Language understanding": 3.08,
143
  "Phraseology": 2.445,
144
- "Tricky questions": 3.9
145
  },
146
  {
147
  "Model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -149,7 +157,7 @@
149
  "Sentiment": 3.0576923076923075,
150
  "Language understanding": 3.175,
151
  "Phraseology": 2.885,
152
- "Tricky questions": 3.9
153
  },
154
  {
155
  "Model": "mistralai/Mistral-7B-Instruct-v0.3",
@@ -157,7 +165,7 @@
157
  "Sentiment": 3.326923076923077,
158
  "Language understanding": 3.06,
159
  "Phraseology": 2.68,
160
- "Tricky questions": 3.9
161
  },
162
  {
163
  "Model": "berkeley-nest/Starling-LM-7B-alpha",
@@ -165,7 +173,7 @@
165
  "Sentiment": 3.0576923076923075,
166
  "Language understanding": 2.925,
167
  "Phraseology": 2.855,
168
- "Tricky questions": 3.9
169
  },
170
  {
171
  "Model": "openchat/openchat-3.5-0106",
@@ -173,7 +181,7 @@
173
  "Sentiment": 3.16025641025641,
174
  "Language understanding": 2.835,
175
  "Phraseology": 2.555,
176
- "Tricky questions": 3.9
177
  },
178
  {
179
  "Model": "internlm/internlm2-chat-20b",
@@ -181,7 +189,7 @@
181
  "Sentiment": 3.301282051282051,
182
  "Language understanding": 2.785,
183
  "Phraseology": 2.385,
184
- "Tricky questions": 3.9
185
  },
186
  {
187
  "Model": "01-ai/Yi-1.5-34B-Chat",
@@ -189,7 +197,7 @@
189
  "Sentiment": 3.076923076923077,
190
  "Language understanding": 2.87,
191
  "Phraseology": 2.38,
192
- "Tricky questions": 3.9
193
  },
194
  {
195
  "Model": "Voicelab/trurl-2-13b-academic",
@@ -197,7 +205,7 @@
197
  "Sentiment": 3.301282051282051,
198
  "Language understanding": 2.755,
199
  "Phraseology": 2.165,
200
- "Tricky questions": 3.9
201
  },
202
  {
203
  "Model": "google/gemma-2-2b-it",
@@ -205,7 +213,7 @@
205
  "Sentiment": 3.3974358974359,
206
  "Language understanding": 2.9,
207
  "Phraseology": 2.095,
208
- "Tricky questions": 3.9
209
  },
210
  {
211
  "Model": "Qwen/Qwen2.5-3B-Instruct",
@@ -213,7 +221,7 @@
213
  "Sentiment": 2.948717948717949,
214
  "Language understanding": 2.455,
215
  "Phraseology": 2.8,
216
- "Tricky questions": 3.9
217
  },
218
  {
219
  "Model": "NousResearch/Hermes-3-Llama-3.2-3B",
@@ -221,7 +229,7 @@
221
  "Sentiment": 2.6153846153846154,
222
  "Language understanding": 2.705,
223
  "Phraseology": 2.765,
224
- "Tricky questions": 3.9
225
  },
226
  {
227
  "Model": "ibm-granite/granite-3.1-2b-instruct",
@@ -229,7 +237,7 @@
229
  "Sentiment": 3.076923076923077,
230
  "Language understanding": 2.235,
231
  "Phraseology": 1.88,
232
- "Tricky questions": 3.9
233
  },
234
  {
235
  "Model": "meta-llama/Llama-3.2-1B-Instruct",
@@ -237,7 +245,7 @@
237
  "Sentiment": 3.076923076923077,
238
  "Language understanding": 1.735,
239
  "Phraseology": 2.34,
240
- "Tricky questions": 3.9
241
  },
242
  {
243
  "Model": "microsoft/Phi-3.5-mini-instruct",
@@ -245,7 +253,7 @@
245
  "Sentiment": 2.435897435897436,
246
  "Language understanding": 2.135,
247
  "Phraseology": 2.425,
248
- "Tricky questions": 3.9
249
  },
250
  {
251
  "Model": "meta-llama/Llama-3.2-3B-Instruct",
@@ -253,7 +261,7 @@
253
  "Sentiment": 2.7564102564102564,
254
  "Language understanding": 2.295,
255
  "Phraseology": 1.72,
256
- "Tricky questions": 3.9
257
  },
258
  {
259
  "Model": "h2oai/h2o-danube2-1.8b-chat",
@@ -261,7 +269,7 @@
261
  "Sentiment": 2.371794871794872,
262
  "Language understanding": 1.595,
263
  "Phraseology": 2.47,
264
- "Tricky questions": 3.9
265
  },
266
  {
267
  "Model": "Qwen/Qwen2.5-1.5B-Instruct",
@@ -269,7 +277,7 @@
269
  "Sentiment": 2.7948717948717947,
270
  "Language understanding": 1.35,
271
  "Phraseology": 2.225,
272
- "Tricky questions": 3.9
273
  },
274
  {
275
  "Model": "utter-project/EuroLLM-1.7B-Instruct",
@@ -277,7 +285,7 @@
277
  "Sentiment": 2.243589743589744,
278
  "Language understanding": 1.79,
279
  "Phraseology": 2.26,
280
- "Tricky questions": 3.9
281
  },
282
  {
283
  "Model": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct",
@@ -285,7 +293,7 @@
285
  "Sentiment": 1.9423076923076923,
286
  "Language understanding": 2.1155778894472363,
287
  "Phraseology": 2.130653266331658,
288
- "Tricky questions": 3.9
289
  },
290
  {
291
  "Model": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
@@ -293,7 +301,7 @@
293
  "Sentiment": 2.275641025641025,
294
  "Language understanding": 1.1,
295
  "Phraseology": 2.355,
296
- "Tricky questions": 3.9
297
  },
298
  {
299
  "Model": "Qwen/Qwen2.5-0.5B-Instruct",
@@ -301,7 +309,7 @@
301
  "Sentiment": 1.955128205128205,
302
  "Language understanding": 0.835,
303
  "Phraseology": 2.595,
304
- "Tricky questions": 3.9
305
  },
306
  {
307
  "Model": "CYFRAGOVPL/Llama-PLLuM-70B-chat",
@@ -309,7 +317,7 @@
309
  "Sentiment": 3.94,
310
  "Language understanding": 3.61,
311
  "Phraseology": 3.35,
312
- "Tricky questions": 3.9
313
  },
314
  {
315
  "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-instruct",
@@ -317,7 +325,7 @@
317
  "Sentiment": 3.88,
318
  "Language understanding": 3.59,
319
  "Phraseology": 3.22,
320
- "Tricky questions": 3.9
321
  },
322
  {
323
  "Model": "CYFRAGOVPL/Llama-PLLuM-70B-instruct",
@@ -325,7 +333,7 @@
325
  "Sentiment": 3.78,
326
  "Language understanding": 3.63,
327
  "Phraseology": 3.26,
328
- "Tricky questions": 3.9
329
  },
330
  {
331
  "Model": "CYFRAGOVPL/PLLuM-8x7B-instruct",
@@ -333,7 +341,7 @@
333
  "Sentiment": 3.59,
334
  "Language understanding": 3.47,
335
  "Phraseology": 3.46,
336
- "Tricky questions": 3.9
337
  },
338
  {
339
  "Model": "CYFRAGOVPL/PLLuM-12B-instruct",
@@ -341,7 +349,7 @@
341
  "Sentiment": 3.71,
342
  "Language understanding": 3.17,
343
  "Phraseology": 3.59,
344
- "Tricky questions": 3.9
345
  },
346
  {
347
  "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-chat",
@@ -349,7 +357,7 @@
349
  "Sentiment": 3.76,
350
  "Language understanding": 3.48,
351
  "Phraseology": 3.08,
352
- "Tricky questions": 3.9
353
  },
354
  {
355
  "Model": "CYFRAGOVPL/PLLuM-8x7B-chat",
@@ -357,7 +365,7 @@
357
  "Sentiment": 3.44,
358
  "Language understanding": 3.45,
359
  "Phraseology": 3.35,
360
- "Tricky questions": 3.9
361
  },
362
  {
363
  "Model": "CYFRAGOVPL/PLLuM-12B-chat",
@@ -365,7 +373,7 @@
365
  "Sentiment": 3.32,
366
  "Language understanding": 3.21,
367
  "Phraseology": 3.43,
368
- "Tricky questions": 3.9
369
  },
370
  {
371
  "Model": "CYFRAGOVPL/PLLuM-12B-nc-instruct",
@@ -373,7 +381,7 @@
373
  "Sentiment": 3.24,
374
  "Language understanding": 3.31,
375
  "Phraseology": 3.32,
376
- "Tricky questions": 3.9
377
  },
378
  {
379
  "Model": "CYFRAGOVPL/Llama-PLLuM-8B-instruct",
@@ -381,7 +389,7 @@
381
  "Sentiment": 3.24,
382
  "Language understanding": 2.90,
383
  "Phraseology": 3.46,
384
- "Tricky questions": 3.9
385
  },
386
  {
387
  "Model": "CYFRAGOVPL/Llama-PLLuM-8B-chat",
@@ -389,7 +397,7 @@
389
  "Sentiment": 3.13,
390
  "Language understanding": 2.93,
391
  "Phraseology": 3.36,
392
- "Tricky questions": 3.9
393
  },
394
  {
395
  "Model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
@@ -397,7 +405,7 @@
397
  "Sentiment": 3.22,
398
  "Language understanding": 3.23,
399
  "Phraseology": 3.54,
400
- "Tricky questions": 3.9
401
  },
402
  {
403
  "Model": "Qwen/Qwen2.5-72B-Instruct",
@@ -405,7 +413,7 @@
405
  "Sentiment": 4.076923076923077,
406
  "Language understanding": 3.97,
407
  "Phraseology": 3.93,
408
- "Tricky questions": 3.9
409
  },
410
  {
411
  "Model": "Qwen/Qwen2.5-32B-Instruct",
@@ -413,7 +421,7 @@
413
  "Sentiment": 3.8141025641025643,
414
  "Language understanding": 3.565,
415
  "Phraseology": 4.035,
416
- "Tricky questions": 3.9
417
  },
418
  {
419
  "Model": "mistralai/Mistral-Small-24B-Instruct-2501",
@@ -421,7 +429,7 @@
421
  "Sentiment": 3.91025641025641,
422
  "Language understanding": 3.6,
423
  "Phraseology": 3.875,
424
- "Tricky questions": 3.9
425
  },
426
  {
427
  "Model": "meta-llama/Llama-3.3-70B-Instruct",
@@ -429,7 +437,7 @@
429
  "Sentiment": 4.294871794871795,
430
  "Language understanding": 3.865,
431
  "Phraseology": 3.04,
432
- "Tricky questions": 3.9
433
  },
434
  {
435
  "Model": "Qwen/Qwen2.5-14B-Instruct",
@@ -437,7 +445,7 @@
437
  "Sentiment": 3.91025641025641,
438
  "Language understanding": 3.565,
439
  "Phraseology": 3.37,
440
- "Tricky questions": 3.9
441
  },
442
  {
443
  "Model": "microsoft/phi-4",
@@ -445,7 +453,7 @@
445
  "Sentiment": 3.717948717948718,
446
  "Language understanding": 3.54,
447
  "Phraseology": 3.235,
448
- "Tricky questions": 3.9
449
  },
450
  {
451
  "Model": "Qwen/Qwen2.5-7B-Instruct",
@@ -453,7 +461,7 @@
453
  "Sentiment": 3.5576923076923075,
454
  "Language understanding": 3.025,
455
  "Phraseology": 3.095,
456
- "Tricky questions": 3.9
457
  },
458
  {
459
  "Model": "microsoft/Phi-4-mini-instruct",
@@ -461,7 +469,7 @@
461
  "Sentiment": 2.6923076923076925,
462
  "Language understanding": 2.43,
463
  "Phraseology": 2.245,
464
- "Tricky questions": 3.9
465
  },
466
  {
467
  "Model": "gemini-2.0-flash-001",
@@ -469,7 +477,7 @@
469
  "Sentiment": 4.519230769230769,
470
  "Language understanding": 4.32,
471
  "Phraseology": 4.34,
472
- "Tricky questions": 3.9
473
  },
474
  {
475
  "Model": "gemini-2.0-flash-lite-001",
@@ -477,7 +485,7 @@
477
  "Sentiment": 4.230769230769231,
478
  "Language understanding": 4.055,
479
  "Phraseology": 4.235,
480
- "Tricky questions": 3.9
481
  },
482
  {
483
  "Model": "deepseek-ai/DeepSeek-V3 (API)",
@@ -485,7 +493,23 @@
485
  "Sentiment": 4.358974358974359,
486
  "Language understanding": 4.22,
487
  "Phraseology": 3.525,
488
- "Tricky questions": 3.9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  },
490
  {
491
  "Model": "google/gemma-3-27b-it (API)",
@@ -493,6 +517,22 @@
493
  "Sentiment": 3.878205128205128,
494
  "Language understanding": 3.785,
495
  "Phraseology": 4.025,
496
- "Tricky questions": 3.9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
  }
498
- ]
 
5
  "Sentiment": 4.230769230769231,
6
  "Language understanding": 4.0,
7
  "Phraseology": 3.86,
8
+ "Tricky questions": 3.646067415730337
9
  },
10
  {
11
+ "Model": "mistralai/Mistral-Large-Instruct-2411",
12
+ "Params": "123B",
13
+ "Sentiment": 4.326923076923077,
14
+ "Language understanding": 3.975,
15
+ "Phraseology": 3.99,
16
+ "Tricky questions": 3.7247191011235956
17
+ },
18
+ {
19
+ "Model": "alpindale/WizardLM-2-8x22B (API)",
20
  "Params": "141B",
21
  "Sentiment": 3.7051282051282053,
22
  "Language understanding": 3.815,
23
  "Phraseology": 4.22,
24
+ "Tricky questions": 3.056179775280899
25
  },
26
  {
27
  "Model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
 
29
  "Sentiment": 4.326923076923077,
30
  "Language understanding": 3.91,
31
  "Phraseology": 3.25,
32
+ "Tricky questions": 3.0112359550561796
33
  },
34
  {
35
  "Model": "meta-llama/Meta-Llama-3-70B-Instruct",
 
37
  "Sentiment": 4.134615384615385,
38
  "Language understanding": 3.82,
39
  "Phraseology": 3.465,
40
+ "Tricky questions": 3.707865168539326
41
  },
42
  {
43
  "Model": "speakleash/Bielik-11B-v2.3-Instruct",
 
45
  "Sentiment": 3.9743589743589745,
46
  "Language understanding": 3.785,
47
  "Phraseology": 3.55,
48
+ "Tricky questions": 3.2191011235955056
49
  },
50
  {
51
+ "Model": "mistralai/Mixtral-8x22B-Instruct-v0.1 (API)",
52
  "Params": "141B",
53
  "Sentiment": 3.782051282051282,
54
  "Language understanding": 3.675,
55
  "Phraseology": 3.55,
56
+ "Tricky questions": 3.235955056179775
57
  },
58
  {
59
  "Model": "speakleash/Bielik-11B-v2.1-Instruct",
 
61
  "Sentiment": 3.9551282051282053,
62
  "Language understanding": 3.915,
63
  "Phraseology": 3.105,
64
+ "Tricky questions": 3.4719101123595504
65
  },
66
  {
67
  "Model": "Qwen/Qwen2-72B-Instruct",
 
69
  "Sentiment": 3.7628205128205128,
70
  "Language understanding": 3.89,
71
  "Phraseology": 3.28,
72
+ "Tricky questions": 3.6797752808988764
73
  },
74
  {
75
  "Model": "speakleash/Bielik-11B-v2.0-Instruct",
 
77
  "Sentiment": 3.9743589743589745,
78
  "Language understanding": 3.745,
79
  "Phraseology": 3.125,
80
+ "Tricky questions": 2.196629213483146
81
  },
82
  {
83
  "Model": "speakleash/Bielik-11B-v2.2-Instruct",
 
85
  "Sentiment": 3.717948717948718,
86
  "Language understanding": 3.73,
87
  "Phraseology": 3.25,
88
+ "Tricky questions": 3.1235955056179776
89
  },
90
  {
91
  "Model": "Qwen/Qwen1.5-72B-Chat",
 
93
  "Sentiment": 3.4743589743589745,
94
  "Language understanding": 3.515,
95
  "Phraseology": 2.975,
96
+ "Tricky questions": 2.668539325842697
97
  },
98
  {
99
  "Model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 
101
  "Sentiment": 3.9743589743589745,
102
  "Language understanding": 3.38,
103
  "Phraseology": 2.58,
104
+ "Tricky questions": 2.1123595505617976
105
  },
106
  {
107
  "Model": "THUDM/glm-4-9b-chat",
 
109
  "Sentiment": 3.58974358974359,
110
  "Language understanding": 3.455,
111
  "Phraseology": 2.78,
112
+ "Tricky questions": 1.9831460674157304
113
  },
114
  {
115
  "Model": "mistralai/Mistral-Nemo-Instruct-2407",
 
117
  "Sentiment": 3.641025641025641,
118
  "Language understanding": 3.29,
119
  "Phraseology": 2.74,
120
+ "Tricky questions": 2.0898876404494384
121
  },
122
  {
123
  "Model": "meta-llama/Meta-Llama-3-8B-Instruct",
 
125
  "Sentiment": 3.3333333333333335,
126
  "Language understanding": 3.15,
127
  "Phraseology": 3.035,
128
+ "Tricky questions": 2.4775280898876404
129
  },
130
  {
131
  "Model": "upstage/SOLAR-10.7B-Instruct-v1.0",
 
133
  "Sentiment": 2.967948717948718,
134
  "Language understanding": 3.18,
135
  "Phraseology": 3.255,
136
+ "Tricky questions": 2.1235955056179776
137
  },
138
  {
139
  "Model": "speakleash/Bielik-7B-Instruct-v0.1",
 
141
  "Sentiment": 3.58974358974359,
142
  "Language understanding": 3.475,
143
  "Phraseology": 2.315,
144
+ "Tricky questions": 2.157303370786517
145
  },
146
  {
147
  "Model": "openchat/openchat-3.5-0106-gemma",
 
149
  "Sentiment": 3.730769230769231,
150
  "Language understanding": 3.08,
151
  "Phraseology": 2.445,
152
+ "Tricky questions": 1.6797752808988764
153
  },
154
  {
155
  "Model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
 
157
  "Sentiment": 3.0576923076923075,
158
  "Language understanding": 3.175,
159
  "Phraseology": 2.885,
160
+ "Tricky questions": 1.797752808988764
161
  },
162
  {
163
  "Model": "mistralai/Mistral-7B-Instruct-v0.3",
 
165
  "Sentiment": 3.326923076923077,
166
  "Language understanding": 3.06,
167
  "Phraseology": 2.68,
168
+ "Tricky questions": 1.9887640449438202
169
  },
170
  {
171
  "Model": "berkeley-nest/Starling-LM-7B-alpha",
 
173
  "Sentiment": 3.0576923076923075,
174
  "Language understanding": 2.925,
175
  "Phraseology": 2.855,
176
+ "Tricky questions": 1.6797752808988764
177
  },
178
  {
179
  "Model": "openchat/openchat-3.5-0106",
 
181
  "Sentiment": 3.16025641025641,
182
  "Language understanding": 2.835,
183
  "Phraseology": 2.555,
184
+ "Tricky questions": 1.9606741573033708
185
  },
186
  {
187
  "Model": "internlm/internlm2-chat-20b",
 
189
  "Sentiment": 3.301282051282051,
190
  "Language understanding": 2.785,
191
  "Phraseology": 2.385,
192
+ "Tricky questions": 0.12359550561797752
193
  },
194
  {
195
  "Model": "01-ai/Yi-1.5-34B-Chat",
 
197
  "Sentiment": 3.076923076923077,
198
  "Language understanding": 2.87,
199
  "Phraseology": 2.38,
200
+ "Tricky questions": 1.0
201
  },
202
  {
203
  "Model": "Voicelab/trurl-2-13b-academic",
 
205
  "Sentiment": 3.301282051282051,
206
  "Language understanding": 2.755,
207
  "Phraseology": 2.165,
208
+ "Tricky questions": 1.0168539325842696
209
  },
210
  {
211
  "Model": "google/gemma-2-2b-it",
 
213
  "Sentiment": 3.3974358974359,
214
  "Language understanding": 2.9,
215
  "Phraseology": 2.095,
216
+ "Tricky questions": 2.2134831460674156
217
  },
218
  {
219
  "Model": "Qwen/Qwen2.5-3B-Instruct",
 
221
  "Sentiment": 2.948717948717949,
222
  "Language understanding": 2.455,
223
  "Phraseology": 2.8,
224
+ "Tricky questions": 1.8089887640449438
225
  },
226
  {
227
  "Model": "NousResearch/Hermes-3-Llama-3.2-3B",
 
229
  "Sentiment": 2.6153846153846154,
230
  "Language understanding": 2.705,
231
  "Phraseology": 2.765,
232
+ "Tricky questions": 1.1404494382022472
233
  },
234
  {
235
  "Model": "ibm-granite/granite-3.1-2b-instruct",
 
237
  "Sentiment": 3.076923076923077,
238
  "Language understanding": 2.235,
239
  "Phraseology": 1.88,
240
+ "Tricky questions": 0.5898876404494382
241
  },
242
  {
243
  "Model": "meta-llama/Llama-3.2-1B-Instruct",
 
245
  "Sentiment": 3.076923076923077,
246
  "Language understanding": 1.735,
247
  "Phraseology": 2.34,
248
+ "Tricky questions": 0.5224719101123596
249
  },
250
  {
251
  "Model": "microsoft/Phi-3.5-mini-instruct",
 
253
  "Sentiment": 2.435897435897436,
254
  "Language understanding": 2.135,
255
  "Phraseology": 2.425,
256
+ "Tricky questions": 1.0449438202247192
257
  },
258
  {
259
  "Model": "meta-llama/Llama-3.2-3B-Instruct",
 
261
  "Sentiment": 2.7564102564102564,
262
  "Language understanding": 2.295,
263
  "Phraseology": 1.72,
264
+ "Tricky questions": 1.2191011235955056
265
  },
266
  {
267
  "Model": "h2oai/h2o-danube2-1.8b-chat",
 
269
  "Sentiment": 2.371794871794872,
270
  "Language understanding": 1.595,
271
  "Phraseology": 2.47,
272
+ "Tricky questions": 0.12921348314606743
273
  },
274
  {
275
  "Model": "Qwen/Qwen2.5-1.5B-Instruct",
 
277
  "Sentiment": 2.7948717948717947,
278
  "Language understanding": 1.35,
279
  "Phraseology": 2.225,
280
+ "Tricky questions": 0.6629213483146067
281
  },
282
  {
283
  "Model": "utter-project/EuroLLM-1.7B-Instruct",
 
285
  "Sentiment": 2.243589743589744,
286
  "Language understanding": 1.79,
287
  "Phraseology": 2.26,
288
+ "Tricky questions": 0.7584269662921348
289
  },
290
  {
291
  "Model": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct",
 
293
  "Sentiment": 1.9423076923076923,
294
  "Language understanding": 2.1155778894472363,
295
  "Phraseology": 2.130653266331658,
296
+ "Tricky questions": 0.4887640449438202
297
  },
298
  {
299
  "Model": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
 
301
  "Sentiment": 2.275641025641025,
302
  "Language understanding": 1.1,
303
  "Phraseology": 2.355,
304
+ "Tricky questions": 0.25280898876404495
305
  },
306
  {
307
  "Model": "Qwen/Qwen2.5-0.5B-Instruct",
 
309
  "Sentiment": 1.955128205128205,
310
  "Language understanding": 0.835,
311
  "Phraseology": 2.595,
312
+ "Tricky questions": 0.21910112359550563
313
  },
314
  {
315
  "Model": "CYFRAGOVPL/Llama-PLLuM-70B-chat",
 
317
  "Sentiment": 3.94,
318
  "Language understanding": 3.61,
319
  "Phraseology": 3.35,
320
+ "Tricky questions": 3.2134831460674156
321
  },
322
  {
323
  "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-instruct",
 
325
  "Sentiment": 3.88,
326
  "Language understanding": 3.59,
327
  "Phraseology": 3.22,
328
+ "Tricky questions": 1.7640449438202248
329
  },
330
  {
331
  "Model": "CYFRAGOVPL/Llama-PLLuM-70B-instruct",
 
333
  "Sentiment": 3.78,
334
  "Language understanding": 3.63,
335
  "Phraseology": 3.26,
336
+ "Tricky questions": 2.634831460674157
337
  },
338
  {
339
  "Model": "CYFRAGOVPL/PLLuM-8x7B-instruct",
 
341
  "Sentiment": 3.59,
342
  "Language understanding": 3.47,
343
  "Phraseology": 3.46,
344
+ "Tricky questions": 1.5056179775280898
345
  },
346
  {
347
  "Model": "CYFRAGOVPL/PLLuM-12B-instruct",
 
349
  "Sentiment": 3.71,
350
  "Language understanding": 3.17,
351
  "Phraseology": 3.59,
352
+ "Tricky questions": 1.904494382022472
353
  },
354
  {
355
  "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-chat",
 
357
  "Sentiment": 3.76,
358
  "Language understanding": 3.48,
359
  "Phraseology": 3.08,
360
+ "Tricky questions": 1.797752808988764
361
  },
362
  {
363
  "Model": "CYFRAGOVPL/PLLuM-8x7B-chat",
 
365
  "Sentiment": 3.44,
366
  "Language understanding": 3.45,
367
  "Phraseology": 3.35,
368
+ "Tricky questions": 1.7808988764044944
369
  },
370
  {
371
  "Model": "CYFRAGOVPL/PLLuM-12B-chat",
 
373
  "Sentiment": 3.32,
374
  "Language understanding": 3.21,
375
  "Phraseology": 3.43,
376
+ "Tricky questions": 2.5898876404494384
377
  },
378
  {
379
  "Model": "CYFRAGOVPL/PLLuM-12B-nc-instruct",
 
381
  "Sentiment": 3.24,
382
  "Language understanding": 3.31,
383
  "Phraseology": 3.32,
384
+ "Tricky questions": 1.9831460674157304
385
  },
386
  {
387
  "Model": "CYFRAGOVPL/Llama-PLLuM-8B-instruct",
 
389
  "Sentiment": 3.24,
390
  "Language understanding": 2.90,
391
  "Phraseology": 3.46,
392
+ "Tricky questions": 1.6629213483146068
393
  },
394
  {
395
  "Model": "CYFRAGOVPL/Llama-PLLuM-8B-chat",
 
397
  "Sentiment": 3.13,
398
  "Language understanding": 2.93,
399
  "Phraseology": 3.36,
400
+ "Tricky questions": 2.252808988764045
401
  },
402
  {
403
  "Model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
 
405
  "Sentiment": 3.22,
406
  "Language understanding": 3.23,
407
  "Phraseology": 3.54,
408
+ "Tricky questions": 2.6235955056179776
409
  },
410
  {
411
  "Model": "Qwen/Qwen2.5-72B-Instruct",
 
413
  "Sentiment": 4.076923076923077,
414
  "Language understanding": 3.97,
415
  "Phraseology": 3.93,
416
+ "Tricky questions": 3.808988764044944
417
  },
418
  {
419
  "Model": "Qwen/Qwen2.5-32B-Instruct",
 
421
  "Sentiment": 3.8141025641025643,
422
  "Language understanding": 3.565,
423
  "Phraseology": 4.035,
424
+ "Tricky questions": 3.5898876404494384
425
  },
426
  {
427
  "Model": "mistralai/Mistral-Small-24B-Instruct-2501",
 
429
  "Sentiment": 3.91025641025641,
430
  "Language understanding": 3.6,
431
  "Phraseology": 3.875,
432
+ "Tricky questions": 3.449438202247191
433
  },
434
  {
435
  "Model": "meta-llama/Llama-3.3-70B-Instruct",
 
437
  "Sentiment": 4.294871794871795,
438
  "Language understanding": 3.865,
439
  "Phraseology": 3.04,
440
+ "Tricky questions": 3.3764044943820224
441
  },
442
  {
443
  "Model": "Qwen/Qwen2.5-14B-Instruct",
 
445
  "Sentiment": 3.91025641025641,
446
  "Language understanding": 3.565,
447
  "Phraseology": 3.37,
448
+ "Tricky questions": 3.337078651685393
449
  },
450
  {
451
  "Model": "microsoft/phi-4",
 
453
  "Sentiment": 3.717948717948718,
454
  "Language understanding": 3.54,
455
  "Phraseology": 3.235,
456
+ "Tricky questions": 2.7247191011235956
457
  },
458
  {
459
  "Model": "Qwen/Qwen2.5-7B-Instruct",
 
461
  "Sentiment": 3.5576923076923075,
462
  "Language understanding": 3.025,
463
  "Phraseology": 3.095,
464
+ "Tricky questions": 2.5842696629213484
465
  },
466
  {
467
  "Model": "microsoft/Phi-4-mini-instruct",
 
469
  "Sentiment": 2.6923076923076925,
470
  "Language understanding": 2.43,
471
  "Phraseology": 2.245,
472
+ "Tricky questions": 1.303370786516854
473
  },
474
  {
475
  "Model": "gemini-2.0-flash-001",
 
477
  "Sentiment": 4.519230769230769,
478
  "Language understanding": 4.32,
479
  "Phraseology": 4.34,
480
+ "Tricky questions": 3.9887640449438204
481
  },
482
  {
483
  "Model": "gemini-2.0-flash-lite-001",
 
485
  "Sentiment": 4.230769230769231,
486
  "Language understanding": 4.055,
487
  "Phraseology": 4.235,
488
+ "Tricky questions": 3.853932584269663
489
  },
490
  {
491
  "Model": "deepseek-ai/DeepSeek-V3 (API)",
 
493
  "Sentiment": 4.358974358974359,
494
  "Language understanding": 4.22,
495
  "Phraseology": 3.525,
496
+ "Tricky questions": 3.9887640449438204
497
+ },
498
+ {
499
+ "Model": "deepseek-ai/DeepSeek-R1 (API)",
500
+ "Params": "685B",
501
+ "Sentiment": 4.487179487179487,
502
+ "Language understanding": 4.345,
503
+ "Phraseology": 3.6,
504
+ "Tricky questions": 4.117977528089888
505
+ },
506
+ {
507
+ "Model": "deepseek-ai/DeepSeek-V3-0324 (API)",
508
+ "Params": "685B",
509
+ "Sentiment": 4.358974358974359,
510
+ "Language understanding": 4.195,
511
+ "Phraseology": 3.54,
512
+ "Tricky questions": 4.022471910112359
513
  },
514
  {
515
  "Model": "google/gemma-3-27b-it (API)",
 
517
  "Sentiment": 3.878205128205128,
518
  "Language understanding": 3.785,
519
  "Phraseology": 4.025,
520
+ "Tricky questions": 3.533707865168539
521
+ },
522
+ {
523
+ "Model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 (API)",
524
+ "Params": "402B",
525
+ "Sentiment": 4.391025641025641,
526
+ "Language understanding": 4.11,
527
+ "Phraseology": 3.475,
528
+ "Tricky questions": 3.758426966292135
529
+ },
530
+ {
531
+ "Model": "meta-llama/Llama-4-Scout-17B-16E-Instruct (API)",
532
+ "Params": "109B",
533
+ "Sentiment": 4.102564102564102,
534
+ "Language understanding": 3.805,
535
+ "Phraseology": 3.9,
536
+ "Tricky questions": 3.191011235955056
537
  }
538
+ ]