jbnayahu committed
Commit 38a7047 · unverified · 1 Parent(s): 3a4f28e

Sample results

Signed-off-by: Jonathan Bnayahu <[email protected]>

results/bluebench/2025-06-16T11-59-29_evaluation_results.json ADDED
@@ -0,0 +1,1302 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-16T08:59:29.752699Z",
4
+ "command_line_invocation": [
5
+ "/home/bnayahu/miniforge3/envs/unitxt/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=granite-3-3-8b-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/ibm/granite-3-3-8b-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "4cafeaa09eea146f2e2d0609974999a64dfffbbe",
46
+ "python_version": "3.11.12",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Mon Apr 21 17:08:54 UTC 2025",
49
+ "installed_packages": {
50
+ "tqdm": "4.67.1",
51
+ "httpretty": "1.1.4",
52
+ "evaluate": "0.4.3",
53
+ "ruff": "0.11.10",
54
+ "virtualenv": "20.31.2",
55
+ "urllib3": "2.4.0",
56
+ "httpcore": "1.0.9",
57
+ "mecab-ko-dic": "1.0.0",
58
+ "mecab-ko": "1.0.1",
59
+ "identify": "2.6.10",
60
+ "bert-score": "0.3.13",
61
+ "lxml": "5.4.0",
62
+ "python-dotenv": "1.1.0",
63
+ "accelerate": "1.7.0",
64
+ "httpx-sse": "0.4.0",
65
+ "pillow": "11.2.1",
66
+ "certifi": "2025.4.26",
67
+ "pyparsing": "3.2.3",
68
+ "nvidia-cusparselt-cu12": "0.6.3",
69
+ "tzdata": "2025.2",
70
+ "torch": "2.7.0",
71
+ "MarkupSafe": "3.0.2",
72
+ "setuptools": "80.1.0",
73
+ "pydantic": "2.11.4",
74
+ "yarl": "1.20.0",
75
+ "importlib_metadata": "8.0.0",
76
+ "pydantic_core": "2.33.2",
77
+ "scipy": "1.15.3",
78
+ "annotated-types": "0.7.0",
79
+ "portalocker": "3.1.1",
80
+ "packaging": "24.2",
81
+ "Deprecated": "1.2.18",
82
+ "typing_extensions": "4.12.2",
83
+ "ibm-cos-sdk-s3transfer": "2.14.1",
84
+ "nvidia-cufft-cu12": "11.3.0.4",
85
+ "nvidia-cusolver-cu12": "11.7.1.2",
86
+ "diskcache": "5.6.3",
87
+ "fsspec": "2025.3.0",
88
+ "transformers": "4.51.3",
89
+ "platformdirs": "4.2.2",
90
+ "nvidia-cublas-cu12": "12.6.4.1",
91
+ "threadpoolctl": "3.6.0",
92
+ "jsonschema-specifications": "2025.4.1",
93
+ "tenacity": "9.1.2",
94
+ "propcache": "0.3.1",
95
+ "ibm-cos-sdk": "2.14.1",
96
+ "mpmath": "1.3.0",
97
+ "jiter": "0.9.0",
98
+ "filelock": "3.18.0",
99
+ "tomli": "2.0.1",
100
+ "nvidia-nvjitlink-cu12": "12.6.85",
101
+ "cfgv": "3.4.0",
102
+ "ibm_watsonx_ai": "1.3.13",
103
+ "ibm-generative-ai": "3.0.0",
104
+ "wheel": "0.45.1",
105
+ "sympy": "1.14.0",
106
+ "requests": "2.32.2",
107
+ "charset-normalizer": "3.4.2",
108
+ "psutil": "7.0.0",
109
+ "pre_commit": "4.2.0",
110
+ "nodeenv": "1.9.1",
111
+ "colorama": "0.4.6",
112
+ "absl-py": "2.2.2",
113
+ "rouge_score": "0.1.2",
114
+ "scikit-learn": "1.6.1",
115
+ "multiprocess": "0.70.16",
116
+ "xxhash": "3.5.0",
117
+ "detect-secrets": "1.5.0",
118
+ "aiohttp": "3.11.18",
119
+ "frozenlist": "1.6.0",
120
+ "tabulate": "0.9.0",
121
+ "triton": "3.3.0",
122
+ "idna": "3.10",
123
+ "PyYAML": "6.0.2",
124
+ "ibm-cos-sdk-core": "2.14.1",
125
+ "nvidia-curand-cu12": "10.3.7.77",
126
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
127
+ "tiktoken": "0.9.0",
128
+ "aiosignal": "1.3.2",
129
+ "attrs": "25.3.0",
130
+ "h11": "0.16.0",
131
+ "anyio": "4.9.0",
132
+ "wrapt": "1.17.2",
133
+ "kiwisolver": "1.4.8",
134
+ "nvidia-cudnn-cu12": "9.5.1.17",
135
+ "matplotlib": "3.10.3",
136
+ "aiolimiter": "1.2.1",
137
+ "codespell": "2.4.1",
138
+ "jmespath": "1.0.1",
139
+ "nltk": "3.9.1",
140
+ "unitxt": "1.24.0",
141
+ "dill": "0.3.8",
142
+ "multidict": "6.4.3",
143
+ "conllu": "6.0.0",
144
+ "litellm": "1.69.3",
145
+ "joblib": "1.5.0",
146
+ "cycler": "0.12.1",
147
+ "pip": "25.1.1",
148
+ "nvidia-nccl-cu12": "2.26.2",
149
+ "click": "8.2.0",
150
+ "fonttools": "4.58.0",
151
+ "datasets": "3.6.0",
152
+ "six": "1.17.0",
153
+ "numpy": "2.2.5",
154
+ "nvidia-cuda-runtime-cu12": "12.6.77",
155
+ "huggingface-hub": "0.31.2",
156
+ "aiohappyeyeballs": "2.6.1",
157
+ "sacrebleu": "2.5.1",
158
+ "pyarrow": "20.0.0",
159
+ "openai": "1.75.0",
160
+ "python-dateutil": "2.9.0.post0",
161
+ "pytz": "2025.2",
162
+ "contourpy": "1.3.2",
163
+ "pandas": "2.2.3",
164
+ "distro": "1.9.0",
165
+ "httpx": "0.27.2",
166
+ "rpds-py": "0.25.0",
167
+ "Jinja2": "3.1.6",
168
+ "nvidia-cusparse-cu12": "12.5.4.2",
169
+ "nvidia-nvtx-cu12": "12.6.77",
170
+ "fuzzywuzzy": "0.18.0",
171
+ "tokenizers": "0.21.1",
172
+ "lomond": "0.3.3",
173
+ "nvidia-cufile-cu12": "1.11.1.6",
174
+ "typing-inspection": "0.4.0",
175
+ "safetensors": "0.5.3",
176
+ "nvidia-cuda-cupti-cu12": "12.6.80",
177
+ "referencing": "0.36.2",
178
+ "networkx": "3.4.2",
179
+ "jsonschema": "4.23.0",
180
+ "zipp": "3.19.2",
181
+ "regex": "2024.11.6",
182
+ "distlib": "0.3.9",
183
+ "sniffio": "1.3.1",
184
+ "autocommand": "2.2.2",
185
+ "jaraco.collections": "5.1.0",
186
+ "typeguard": "4.3.0",
187
+ "jaraco.text": "3.12.1",
188
+ "jaraco.context": "5.3.0",
189
+ "jaraco.functools": "4.0.1",
190
+ "more-itertools": "10.3.0",
191
+ "backports.tarfile": "1.2.0",
192
+ "inflect": "7.3.1"
193
+ }
194
+ },
195
+ "results": {
196
+ "bias": {
197
+ "safety_bbq_age": {
198
+ "accuracy": 0.5111111111111111,
199
+ "accuracy_ci_low": 0.4111111111111111,
200
+ "accuracy_ci_high": 0.6111111111111112,
201
+ "score_name": "accuracy",
202
+ "score": 0.5111111111111111,
203
+ "score_ci_high": 0.6111111111111112,
204
+ "score_ci_low": 0.4111111111111111,
205
+ "num_of_instances": 90
206
+ },
207
+ "safety_bbq_disability_status": {
208
+ "accuracy": 0.6555555555555556,
209
+ "accuracy_ci_low": 0.5444444444444444,
210
+ "accuracy_ci_high": 0.7444444444444445,
211
+ "score_name": "accuracy",
212
+ "score": 0.6555555555555556,
213
+ "score_ci_high": 0.7444444444444445,
214
+ "score_ci_low": 0.5444444444444444,
215
+ "num_of_instances": 90
216
+ },
217
+ "safety_bbq_gender_identity": {
218
+ "accuracy": 0.8777777777777778,
219
+ "accuracy_ci_low": 0.8,
220
+ "accuracy_ci_high": 0.9333333333333333,
221
+ "score_name": "accuracy",
222
+ "score": 0.8777777777777778,
223
+ "score_ci_high": 0.9333333333333333,
224
+ "score_ci_low": 0.8,
225
+ "num_of_instances": 90
226
+ },
227
+ "safety_bbq_nationality": {
228
+ "accuracy": 0.6555555555555556,
229
+ "accuracy_ci_low": 0.5555555555555556,
230
+ "accuracy_ci_high": 0.7555555555555555,
231
+ "score_name": "accuracy",
232
+ "score": 0.6555555555555556,
233
+ "score_ci_high": 0.7555555555555555,
234
+ "score_ci_low": 0.5555555555555556,
235
+ "num_of_instances": 90
236
+ },
237
+ "safety_bbq_physical_appearance": {
238
+ "accuracy": 0.7333333333333333,
239
+ "accuracy_ci_low": 0.6222222222222222,
240
+ "accuracy_ci_high": 0.8222222222222222,
241
+ "score_name": "accuracy",
242
+ "score": 0.7333333333333333,
243
+ "score_ci_high": 0.8222222222222222,
244
+ "score_ci_low": 0.6222222222222222,
245
+ "num_of_instances": 90
246
+ },
247
+ "safety_bbq_race_ethnicity": {
248
+ "accuracy": 0.9,
249
+ "accuracy_ci_low": 0.8222222222222222,
250
+ "accuracy_ci_high": 0.9444444444444444,
251
+ "score_name": "accuracy",
252
+ "score": 0.9,
253
+ "score_ci_high": 0.9444444444444444,
254
+ "score_ci_low": 0.8222222222222222,
255
+ "num_of_instances": 90
256
+ },
257
+ "safety_bbq_race_x_gender": {
258
+ "accuracy": 0.9,
259
+ "accuracy_ci_low": 0.8316765653997056,
260
+ "accuracy_ci_high": 0.9444444444444444,
261
+ "score_name": "accuracy",
262
+ "score": 0.9,
263
+ "score_ci_high": 0.9444444444444444,
264
+ "score_ci_low": 0.8316765653997056,
265
+ "num_of_instances": 90
266
+ },
267
+ "safety_bbq_race_x_ses": {
268
+ "accuracy": 0.8555555555555555,
269
+ "accuracy_ci_low": 0.7666666666666667,
270
+ "accuracy_ci_high": 0.9111111111111111,
271
+ "score_name": "accuracy",
272
+ "score": 0.8555555555555555,
273
+ "score_ci_high": 0.9111111111111111,
274
+ "score_ci_low": 0.7666666666666667,
275
+ "num_of_instances": 90
276
+ },
277
+ "safety_bbq_religion": {
278
+ "accuracy": 0.7555555555555555,
279
+ "accuracy_ci_low": 0.6666666666666666,
280
+ "accuracy_ci_high": 0.8444444444444444,
281
+ "score_name": "accuracy",
282
+ "score": 0.7555555555555555,
283
+ "score_ci_high": 0.8444444444444444,
284
+ "score_ci_low": 0.6666666666666666,
285
+ "num_of_instances": 90
286
+ },
287
+ "safety_bbq_ses": {
288
+ "accuracy": 0.6777777777777778,
289
+ "accuracy_ci_low": 0.5777777777777777,
290
+ "accuracy_ci_high": 0.7666666666666667,
291
+ "score_name": "accuracy",
292
+ "score": 0.6777777777777778,
293
+ "score_ci_high": 0.7666666666666667,
294
+ "score_ci_low": 0.5777777777777777,
295
+ "num_of_instances": 90
296
+ },
297
+ "safety_bbq_sexual_orientation": {
298
+ "accuracy": 0.7888888888888889,
299
+ "accuracy_ci_low": 0.7,
300
+ "accuracy_ci_high": 0.8666666666666667,
301
+ "score_name": "accuracy",
302
+ "score": 0.7888888888888889,
303
+ "score_ci_high": 0.8666666666666667,
304
+ "score_ci_low": 0.7,
305
+ "num_of_instances": 90
306
+ },
307
+ "score": 0.7555555555555555,
308
+ "score_name": "subsets_mean",
309
+ "num_of_instances": 990
310
+ },
311
+ "chatbot_abilities": {
312
+ "arena_hard_generation_english_gpt_4_0314_reference": {
313
+ "num_of_instances": 500,
314
+ "llama_3_70b_instruct_template_arena_hard": 0.5,
315
+ "score": 0.5,
316
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
317
+ },
318
+ "score": 0.5,
319
+ "score_name": "subsets_mean",
320
+ "num_of_instances": 500
321
+ },
322
+ "entity_extraction": {
323
+ "universal_ner_en_ewt": {
324
+ "num_of_instances": 1000,
325
+ "f1_Person": 0.48648648648648646,
326
+ "f1_Organization": 0.410958904109589,
327
+ "f1_Location": 0.3448275862068966,
328
+ "f1_macro": 0.4140909922676574,
329
+ "recall_macro": 0.338801872982567,
330
+ "precision_macro": 0.5399801587301587,
331
+ "in_classes_support": 0.5876106194690265,
332
+ "f1_micro": 0.3321100917431193,
333
+ "recall_micro": 0.34476190476190477,
334
+ "precision_micro": 0.32035398230088497,
335
+ "score": 0.3321100917431193,
336
+ "score_name": "f1_micro",
337
+ "score_ci_low": 0.2855436018468573,
338
+ "score_ci_high": 0.38276564122852924,
339
+ "f1_micro_ci_low": 0.2855436018468573,
340
+ "f1_micro_ci_high": 0.38276564122852924
341
+ },
342
+ "score": 0.3321100917431193,
343
+ "score_name": "subsets_mean",
344
+ "num_of_instances": 1000
345
+ },
346
+ "knowledge": {
347
+ "mmlu_pro_biology": {
348
+ "accuracy": 0.5352112676056338,
349
+ "accuracy_ci_low": 0.4225352112676056,
350
+ "accuracy_ci_high": 0.6510866959365942,
351
+ "score_name": "accuracy",
352
+ "score": 0.5352112676056338,
353
+ "score_ci_high": 0.6510866959365942,
354
+ "score_ci_low": 0.4225352112676056,
355
+ "num_of_instances": 71
356
+ },
357
+ "mmlu_pro_business": {
358
+ "accuracy": 0.19718309859154928,
359
+ "accuracy_ci_low": 0.11267605633802817,
360
+ "accuracy_ci_high": 0.30985915492957744,
361
+ "score_name": "accuracy",
362
+ "score": 0.19718309859154928,
363
+ "score_ci_high": 0.30985915492957744,
364
+ "score_ci_low": 0.11267605633802817,
365
+ "num_of_instances": 71
366
+ },
367
+ "mmlu_pro_chemistry": {
368
+ "accuracy": 0.19718309859154928,
369
+ "accuracy_ci_low": 0.11267605633802817,
370
+ "accuracy_ci_high": 0.29577464788732394,
371
+ "score_name": "accuracy",
372
+ "score": 0.19718309859154928,
373
+ "score_ci_high": 0.29577464788732394,
374
+ "score_ci_low": 0.11267605633802817,
375
+ "num_of_instances": 71
376
+ },
377
+ "mmlu_pro_computer_science": {
378
+ "accuracy": 0.38028169014084506,
379
+ "accuracy_ci_low": 0.2676056338028169,
380
+ "accuracy_ci_high": 0.49295774647887325,
381
+ "score_name": "accuracy",
382
+ "score": 0.38028169014084506,
383
+ "score_ci_high": 0.49295774647887325,
384
+ "score_ci_low": 0.2676056338028169,
385
+ "num_of_instances": 71
386
+ },
387
+ "mmlu_pro_economics": {
388
+ "accuracy": 0.4084507042253521,
389
+ "accuracy_ci_low": 0.30985915492957744,
390
+ "accuracy_ci_high": 0.5211267605633803,
391
+ "score_name": "accuracy",
392
+ "score": 0.4084507042253521,
393
+ "score_ci_high": 0.5211267605633803,
394
+ "score_ci_low": 0.30985915492957744,
395
+ "num_of_instances": 71
396
+ },
397
+ "mmlu_pro_engineering": {
398
+ "accuracy": 0.22535211267605634,
399
+ "accuracy_ci_low": 0.1267605633802817,
400
+ "accuracy_ci_high": 0.323943661971831,
401
+ "score_name": "accuracy",
402
+ "score": 0.22535211267605634,
403
+ "score_ci_high": 0.323943661971831,
404
+ "score_ci_low": 0.1267605633802817,
405
+ "num_of_instances": 71
406
+ },
407
+ "mmlu_pro_health": {
408
+ "accuracy": 0.3380281690140845,
409
+ "accuracy_ci_low": 0.23943661971830985,
410
+ "accuracy_ci_high": 0.4507042253521127,
411
+ "score_name": "accuracy",
412
+ "score": 0.3380281690140845,
413
+ "score_ci_high": 0.4507042253521127,
414
+ "score_ci_low": 0.23943661971830985,
415
+ "num_of_instances": 71
416
+ },
417
+ "mmlu_pro_history": {
418
+ "accuracy": 0.352112676056338,
419
+ "accuracy_ci_low": 0.23943661971830985,
420
+ "accuracy_ci_high": 0.4788732394366197,
421
+ "score_name": "accuracy",
422
+ "score": 0.352112676056338,
423
+ "score_ci_high": 0.4788732394366197,
424
+ "score_ci_low": 0.23943661971830985,
425
+ "num_of_instances": 71
426
+ },
427
+ "mmlu_pro_law": {
428
+ "accuracy": 0.39436619718309857,
429
+ "accuracy_ci_low": 0.29577464788732394,
430
+ "accuracy_ci_high": 0.5070422535211268,
431
+ "score_name": "accuracy",
432
+ "score": 0.39436619718309857,
433
+ "score_ci_high": 0.5070422535211268,
434
+ "score_ci_low": 0.29577464788732394,
435
+ "num_of_instances": 71
436
+ },
437
+ "mmlu_pro_math": {
438
+ "accuracy": 0.14084507042253522,
439
+ "accuracy_ci_low": 0.07042253521126761,
440
+ "accuracy_ci_high": 0.23943661971830985,
441
+ "score_name": "accuracy",
442
+ "score": 0.14084507042253522,
443
+ "score_ci_high": 0.23943661971830985,
444
+ "score_ci_low": 0.07042253521126761,
445
+ "num_of_instances": 71
446
+ },
447
+ "mmlu_pro_other": {
448
+ "accuracy": 0.28169014084507044,
449
+ "accuracy_ci_low": 0.18309859154929578,
450
+ "accuracy_ci_high": 0.38028169014084506,
451
+ "score_name": "accuracy",
452
+ "score": 0.28169014084507044,
453
+ "score_ci_high": 0.38028169014084506,
454
+ "score_ci_low": 0.18309859154929578,
455
+ "num_of_instances": 71
456
+ },
457
+ "mmlu_pro_philosophy": {
458
+ "accuracy": 0.4507042253521127,
459
+ "accuracy_ci_low": 0.3272644997208875,
460
+ "accuracy_ci_high": 0.5774647887323944,
461
+ "score_name": "accuracy",
462
+ "score": 0.4507042253521127,
463
+ "score_ci_high": 0.5774647887323944,
464
+ "score_ci_low": 0.3272644997208875,
465
+ "num_of_instances": 71
466
+ },
467
+ "mmlu_pro_physics": {
468
+ "accuracy": 0.2112676056338028,
469
+ "accuracy_ci_low": 0.1267605633802817,
470
+ "accuracy_ci_high": 0.323943661971831,
471
+ "score_name": "accuracy",
472
+ "score": 0.2112676056338028,
473
+ "score_ci_high": 0.323943661971831,
474
+ "score_ci_low": 0.1267605633802817,
475
+ "num_of_instances": 71
476
+ },
477
+ "mmlu_pro_psychology": {
478
+ "accuracy": 0.5633802816901409,
479
+ "accuracy_ci_low": 0.43661971830985913,
480
+ "accuracy_ci_high": 0.6619718309859155,
481
+ "score_name": "accuracy",
482
+ "score": 0.5633802816901409,
483
+ "score_ci_high": 0.6619718309859155,
484
+ "score_ci_low": 0.43661971830985913,
485
+ "num_of_instances": 71
486
+ },
487
+ "score": 0.3340040241448692,
488
+ "score_name": "subsets_mean",
489
+ "num_of_instances": 994
490
+ },
491
+ "legal": {
492
+ "legalbench_abercrombie": {
493
+ "f1_macro": 0.3091896407685881,
494
+ "f1_suggestive": 0.2857142857142857,
495
+ "f1_arbitrary": 0.4444444444444444,
496
+ "f1_generic": 0.3157894736842105,
497
+ "f1_fanciful": 0.0,
498
+ "f1_descriptive": 0.5,
499
+ "f1_macro_ci_low": 0.22288382556382555,
500
+ "f1_macro_ci_high": 0.40529088046834727,
501
+ "score_name": "f1_micro",
502
+ "score": 0.3717948717948718,
503
+ "score_ci_high": 0.4807113986939902,
504
+ "score_ci_low": 0.2631578947368421,
505
+ "num_of_instances": 85,
506
+ "accuracy": 0.3411764705882353,
507
+ "accuracy_ci_low": 0.23529411764705882,
508
+ "accuracy_ci_high": 0.4470588235294118,
509
+ "f1_micro": 0.3717948717948718,
510
+ "f1_micro_ci_low": 0.2631578947368421,
511
+ "f1_micro_ci_high": 0.4807113986939902
512
+ },
513
+ "legalbench_corporate_lobbying": {
514
+ "f1_macro": 0.5386002886002885,
515
+ "f1_no": 0.7676767676767676,
516
+ "f1_yes": 0.30952380952380953,
517
+ "f1_macro_ci_low": 0.46173823746740844,
518
+ "f1_macro_ci_high": 0.6159447239385368,
519
+ "score_name": "f1_micro",
520
+ "score": 0.6666666666666666,
521
+ "score_ci_high": 0.7237928752902334,
522
+ "score_ci_low": 0.5925925925925926,
523
+ "num_of_instances": 200,
524
+ "accuracy": 0.635,
525
+ "accuracy_ci_low": 0.56,
526
+ "accuracy_ci_high": 0.695,
527
+ "f1_micro": 0.6666666666666666,
528
+ "f1_micro_ci_low": 0.5925925925925926,
529
+ "f1_micro_ci_high": 0.7237928752902334
530
+ },
531
+ "legalbench_function_of_decision_section": {
532
+ "f1_macro": 0.2803267774022364,
533
+ "f1_conclusion": 0.08695652173913043,
534
+ "f1_decree": 0.3333333333333333,
535
+ "f1_issue": 0.24561403508771928,
536
+ "f1_analysis": 0.3076923076923077,
537
+ "f1_facts": 0.21621621621621623,
538
+ "f1_procedural history": 0.3018867924528302,
539
+ "f1_rule": 0.47058823529411764,
540
+ "f1_macro_ci_low": 0.21723272868885718,
541
+ "f1_macro_ci_high": 0.3464281083472716,
542
+ "score_name": "f1_micro",
543
+ "score": 0.2922636103151863,
544
+ "score_ci_high": 0.3563218390804598,
545
+ "score_ci_low": 0.22030548535299002,
546
+ "num_of_instances": 200,
547
+ "accuracy": 0.255,
548
+ "accuracy_ci_low": 0.195,
549
+ "accuracy_ci_high": 0.31,
550
+ "f1_micro": 0.2922636103151863,
551
+ "f1_micro_ci_low": 0.22030548535299002,
552
+ "f1_micro_ci_high": 0.3563218390804598
553
+ },
554
+ "legalbench_international_citizenship_questions": {
555
+ "f1_macro": 0.501342318650011,
556
+ "f1_yes": 0.5648148148148148,
557
+ "f1_no": 0.4378698224852071,
558
+ "f1_macro_ci_low": 0.43005283833620406,
559
+ "f1_macro_ci_high": 0.5697757173860767,
560
+ "score_name": "f1_micro",
561
+ "score": 0.509090909090909,
562
+ "score_ci_high": 0.5735035597182048,
563
+ "score_ci_low": 0.4339558606291127,
564
+ "num_of_instances": 200,
565
+ "accuracy": 0.49,
566
+ "accuracy_ci_low": 0.415,
567
+ "accuracy_ci_high": 0.555,
568
+ "f1_micro": 0.509090909090909,
569
+ "f1_micro_ci_low": 0.4339558606291127,
570
+ "f1_micro_ci_high": 0.5735035597182048
571
+ },
572
+ "legalbench_proa": {
573
+ "f1_macro": 0.8456121343445286,
574
+ "f1_yes": 0.8450704225352113,
575
+ "f1_no": 0.8461538461538461,
576
+ "f1_macro_ci_low": 0.7755496746888861,
577
+ "f1_macro_ci_high": 0.9030851777330651,
578
+ "score_name": "f1_micro",
579
+ "score": 0.8456375838926175,
580
+ "score_ci_high": 0.9032258064516129,
581
+ "score_ci_low": 0.770569043574124,
582
+ "num_of_instances": 85,
583
+ "accuracy": 0.7411764705882353,
584
+ "accuracy_ci_low": 0.6396825906719896,
585
+ "accuracy_ci_high": 0.8235294117647058,
586
+ "f1_micro": 0.8456375838926175,
587
+ "f1_micro_ci_low": 0.770569043574124,
588
+ "f1_micro_ci_high": 0.9032258064516129
589
+ },
590
+ "score": 0.5370907283520503,
591
+ "score_name": "subsets_mean",
592
+ "num_of_instances": 770
593
+ },
594
+ "news_classification": {
595
+ "20_newsgroups_short": {
596
+ "f1_macro": 0.4301866659512963,
597
+ "f1_cars": 0.693069306930693,
598
+ "f1_pc hardware": 0.35454545454545455,
599
+ "f1_windows x": 0.0,
600
+ "f1_atheism": 0.26666666666666666,
601
+ "f1_christianity": 0.16129032258064516,
602
+ "f1_religion": 0.21359223300970873,
603
+ "f1_medicine": 0.8051948051948052,
604
+ "f1_computer graphics": 0.45454545454545453,
605
+ "f1_microsoft windows": 0.39436619718309857,
606
+ "f1_middle east": 0.4166666666666667,
607
+ "f1_motorcycles": 0.5116279069767442,
608
+ "f1_mac hardware": 0.03125,
609
+ "f1_for sale": 0.5757575757575758,
610
+ "f1_guns": 0.27586206896551724,
611
+ "f1_politics": 0.3235294117647059,
612
+ "f1_space": 0.5569620253164557,
613
+ "f1_cryptography": 0.45901639344262296,
614
+ "f1_baseball": 0.8440366972477065,
615
+ "f1_hockey": 0.859504132231405,
616
+ "f1_electronics": 0.40625,
617
+ "f1_macro_ci_low": 0.40471791977084814,
618
+ "f1_macro_ci_high": 0.463234774838722,
619
+ "score_name": "f1_micro",
620
+ "score": 0.45168667810177243,
621
+ "score_ci_high": 0.48422465158112066,
622
+ "score_ci_low": 0.4219481399548598,
623
+ "num_of_instances": 1000,
624
+ "accuracy": 0.395,
625
+ "accuracy_ci_low": 0.366,
626
+ "accuracy_ci_high": 0.4246748738033053,
627
+ "f1_micro": 0.45168667810177243,
628
+ "f1_micro_ci_low": 0.4219481399548598,
629
+ "f1_micro_ci_high": 0.48422465158112066
630
+ },
631
+ "score": 0.45168667810177243,
632
+ "score_name": "subsets_mean",
633
+ "num_of_instances": 1000
634
+ },
635
+ "product_help": {
636
+ "cfpb_product_2023": {
637
+ "f1_macro": 0.632121660327464,
638
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8923533778767632,
639
+ "f1_credit card or prepaid card": 0.5736434108527132,
640
+ "f1_debt collection": 0.593939393939394,
641
+ "f1_checking or savings account": 0.7526881720430108,
642
+ "f1_mortgage": 0.8,
643
+ "f1_vehicle loan or lease": 0.5,
644
+ "f1_money transfer or virtual currency or money service": 0.5882352941176471,
645
+ "f1_payday loan or title loan or personal loan": 0.4,
646
+ "f1_student loan": 0.5882352941176471,
647
+ "f1_macro_ci_low": 0.5726966985491608,
648
+ "f1_macro_ci_high": 0.7037376869442199,
649
+ "score_name": "f1_micro",
650
+ "score": 0.8152004164497657,
651
+ "score_ci_high": 0.8381933866103766,
652
+ "score_ci_low": 0.7918410041841004,
653
+ "num_of_instances": 1000,
654
+ "accuracy": 0.783,
655
+ "accuracy_ci_low": 0.758,
656
+ "accuracy_ci_high": 0.808,
657
+ "f1_micro": 0.8152004164497657,
658
+ "f1_micro_ci_low": 0.7918410041841004,
659
+ "f1_micro_ci_high": 0.8381933866103766
660
+ },
661
+ "cfpb_product_watsonx": {
662
+ "f1_macro": 0.6993262696245249,
663
+ "f1_mortgages and loans": 0.7441860465116279,
664
+ "f1_credit card": 0.7425149700598802,
665
+ "f1_debt collection": 0.6952380952380952,
666
+ "f1_retail banking": 0.5901639344262295,
667
+ "f1_credit reporting": 0.7245283018867924,
668
+ "f1_macro_ci_low": 0.6582490219544427,
669
+ "f1_macro_ci_high": 0.7402017475442481,
670
+ "score_name": "f1_micro",
671
+ "score": 0.7072649572649573,
672
+ "score_ci_high": 0.7466308129162856,
673
+ "score_ci_low": 0.667389078497867,
674
+ "num_of_instances": 500,
675
+ "accuracy": 0.662,
676
+ "accuracy_ci_low": 0.622,
677
+ "accuracy_ci_high": 0.704,
678
+ "f1_micro": 0.7072649572649573,
679
+ "f1_micro_ci_low": 0.667389078497867,
680
+ "f1_micro_ci_high": 0.7466308129162856
681
+ },
682
+ "score": 0.7612326868573616,
683
+ "score_name": "subsets_mean",
684
+ "num_of_instances": 1500
685
+ },
686
+ "qa_finance": {
687
+ "fin_qa": {
688
+ "num_of_instances": 1000,
689
+ "execution_accuracy": 0.081,
690
+ "program_accuracy": 0.097,
691
+ "score": 0.097,
692
+ "score_name": "program_accuracy",
693
+ "execution_accuracy_ci_low": 0.065,
694
+ "execution_accuracy_ci_high": 0.099,
695
+ "program_accuracy_ci_low": 0.08,
696
+ "program_accuracy_ci_high": 0.116,
697
+ "score_ci_low": 0.08,
698
+ "score_ci_high": 0.116
699
+ },
700
+ "score": 0.097,
701
+ "score_name": "subsets_mean",
702
+ "num_of_instances": 1000
703
+ },
704
+ "rag_general": {
705
+ "rag_response_generation_clapnq": {
706
+ "precision": 0.30206806077427395,
707
+ "recall": 0.5816362067038935,
708
+ "f1": 0.3356223026170467,
709
+ "precision_ci_low": 0.2814417986773037,
710
+ "precision_ci_high": 0.3229447355634418,
711
+ "recall_ci_low": 0.5644812495018089,
712
+ "recall_ci_high": 0.5979433202487962,
713
+ "f1_ci_low": 0.3158251342416766,
714
+ "f1_ci_high": 0.3520825144808485,
715
+ "score_name": "f1",
716
+ "score": 0.3356223026170467,
717
+ "score_ci_high": 0.3520825144808485,
718
+ "score_ci_low": 0.3158251342416766,
719
+ "num_of_instances": 600,
720
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5992281610767046,
721
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6849613479276498,
722
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5520117846379677,
723
+ "faithfullness_f1_token_overlap": 0.3362020072357691,
724
+ "faithfullness_recall_token_overlap": 0.271110248692323,
725
+ "faithfullness_precision_token_overlap": 0.560394767420604,
726
+ "correctness_f1_token_overlap": 0.3356223026170467,
727
+ "correctness_recall_token_overlap": 0.5816362067038935,
728
+ "correctness_precision_token_overlap": 0.30206806077427395
729
+ },
730
+ "score": 0.3356223026170467,
731
+ "score_name": "subsets_mean",
732
+ "num_of_instances": 600
733
+ },
734
+ "reasoning": {
735
+ "hellaswag": {
736
+ "accuracy": 0.444,
737
+ "accuracy_ci_low": 0.412,
738
+ "accuracy_ci_high": 0.474,
739
+ "score_name": "accuracy",
740
+ "score": 0.444,
741
+ "score_ci_high": 0.474,
742
+ "score_ci_low": 0.412,
743
+ "num_of_instances": 1000
744
+ },
745
+ "openbook_qa": {
746
+ "accuracy": 0.778,
747
+ "accuracy_ci_low": 0.738,
748
+ "accuracy_ci_high": 0.812,
749
+ "score_name": "accuracy",
750
+ "score": 0.778,
751
+ "score_ci_high": 0.812,
752
+ "score_ci_low": 0.738,
753
+ "num_of_instances": 500
754
+ },
755
+ "score": 0.611,
756
+ "score_name": "subsets_mean",
757
+ "num_of_instances": 1500
758
+ },
759
+ "safety": {
760
+ "attaq_500": {
761
+ "severity_critical": 0.0,
762
+ "severity_high": 0.0,
763
+ "severity_medium": 0.0,
764
+ "severity_low": 100.0,
765
+ "category_harmful_info": 0.8762445710728775,
766
+ "category_deception": 0.8653640456714036,
767
+ "category_explicit_content": 0.8295791004643295,
768
+ "category_violence": 0.8484539006408008,
769
+ "category_discrimination": 0.8543881776905249,
770
+ "category_substance_abuse": 0.9145642853445477,
771
+ "category_pii": 0.8098305000199212,
772
+ "safety": 0.8585813929057784,
773
+ "safety_ci_low": 0.8429344770949673,
774
+ "safety_ci_high": 0.8748245522188953,
775
+ "score_name": "safety",
776
+ "score": 0.8585813929057784,
777
+ "score_ci_high": 0.8748245522188953,
778
+ "score_ci_low": 0.8429344770949673,
779
+ "num_of_instances": 100
780
+ },
781
+ "score": 0.8585813929057784,
782
+ "score_name": "subsets_mean",
783
+ "num_of_instances": 100
784
+ },
785
+ "summarization": {
786
+ "billsum_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 528,
788
+ "rouge2": 0.1970371669553263,
789
+ "rougeLsum": 0.3501126102752881,
790
+ "rougeL": 0.2846394886664465,
791
+ "score": 0.2846394886664465,
792
+ "score_name": "rougeL",
793
+ "rouge1": 0.41761700537983565,
794
+ "rouge2_ci_low": 0.18974866066979085,
795
+ "rouge2_ci_high": 0.2039076063263163,
796
+ "rougeLsum_ci_low": 0.3417084656758268,
797
+ "rougeLsum_ci_high": 0.35798610023418986,
798
+ "rougeL_ci_low": 0.27743853819473024,
799
+ "rougeL_ci_high": 0.29146357040869636,
800
+ "score_ci_low": 0.27743853819473024,
801
+ "score_ci_high": 0.29146357040869636,
802
+ "rouge1_ci_low": 0.4080893419540732,
803
+ "rouge1_ci_high": 0.42587062241350043
804
+ },
805
+ "tldr_document_filtered_to_6000_chars": {
806
+ "num_of_instances": 1000,
807
+ "rouge2": 0.013386366389249846,
808
+ "rougeLsum": 0.09129053607147332,
809
+ "rougeL": 0.07976722752092735,
810
+ "score": 0.07976722752092735,
811
+ "score_name": "rougeL",
812
+ "rouge1": 0.11205823965005557,
813
+ "rouge2_ci_low": 0.011778285117435726,
814
+ "rouge2_ci_high": 0.015025647426455475,
815
+ "rougeLsum_ci_low": 0.08722354399820309,
816
+ "rougeLsum_ci_high": 0.09499919628325168,
817
+ "rougeL_ci_low": 0.07634372485273437,
818
+ "rougeL_ci_high": 0.0827839048129559,
819
+ "score_ci_low": 0.07634372485273437,
820
+ "score_ci_high": 0.0827839048129559,
821
+ "rouge1_ci_low": 0.10678148406842189,
822
+ "rouge1_ci_high": 0.11674098323235016
823
+ },
824
+ "score": 0.18220335809368693,
825
+ "score_name": "subsets_mean",
826
+ "num_of_instances": 1528
827
+ },
828
+ "translation": {
829
+ "mt_flores_101_ara_eng": {
830
+ "num_of_instances": 66,
831
+ "counts": [
832
+ 1120,
833
+ 591,
834
+ 346,
835
+ 205
836
+ ],
837
+ "totals": [
838
+ 2708,
839
+ 2642,
840
+ 2576,
841
+ 2510
842
+ ],
843
+ "precisions": [
844
+ 0.413589364844904,
845
+ 0.22369417108251327,
846
+ 0.1343167701863354,
847
+ 0.08167330677290836
848
+ ],
849
+ "bp": 1.0,
850
+ "sys_len": 2708,
851
+ "ref_len": 1734,
852
+ "sacrebleu": 0.17848782709335775,
853
+ "score": 0.17848782709335775,
854
+ "score_name": "sacrebleu",
855
+ "score_ci_low": 0.12553105401151735,
856
+ "score_ci_high": 0.23257206528815122,
857
+ "sacrebleu_ci_low": 0.12553105401151735,
858
+ "sacrebleu_ci_high": 0.23257206528815122
859
+ },
860
+ "mt_flores_101_deu_eng": {
861
+ "num_of_instances": 66,
862
+ "counts": [
863
+ 1238,
864
+ 714,
865
+ 444,
866
+ 283
867
+ ],
868
+ "totals": [
869
+ 3974,
870
+ 3908,
871
+ 3842,
872
+ 3776
873
+ ],
874
+ "precisions": [
875
+ 0.31152491192752896,
876
+ 0.1827021494370522,
877
+ 0.11556480999479439,
878
+ 0.0749470338983051
879
+ ],
880
+ "bp": 1.0,
881
+ "sys_len": 3974,
882
+ "ref_len": 1734,
883
+ "sacrebleu": 0.14900612629588061,
884
+ "score": 0.14900612629588061,
885
+ "score_name": "sacrebleu",
886
+ "score_ci_low": 0.12104205384160306,
887
+ "score_ci_high": 0.17208001279821492,
888
+ "sacrebleu_ci_low": 0.12104205384160306,
889
+ "sacrebleu_ci_high": 0.17208001279821492
890
+ },
891
+ "mt_flores_101_eng_ara": {
892
+ "num_of_instances": 66,
893
+ "counts": [
894
+ 695,
895
+ 280,
896
+ 122,
897
+ 54
898
+ ],
899
+ "totals": [
900
+ 3398,
901
+ 3332,
902
+ 3266,
903
+ 3200
904
+ ],
905
+ "precisions": [
906
+ 0.20453207769276044,
907
+ 0.08403361344537816,
908
+ 0.03735456215554195,
909
+ 0.016875
910
+ ],
911
+ "bp": 1.0,
912
+ "sys_len": 3398,
913
+ "ref_len": 1589,
914
+ "sacrebleu": 0.057372064118917265,
915
+ "score": 0.057372064118917265,
916
+ "score_name": "sacrebleu",
917
+ "score_ci_low": 0.030141531652344938,
918
+ "score_ci_high": 0.09128806282306597,
919
+ "sacrebleu_ci_low": 0.030141531652344938,
920
+ "sacrebleu_ci_high": 0.09128806282306597
921
+ },
922
+ "mt_flores_101_eng_deu": {
923
+ "num_of_instances": 66,
924
+ "counts": [
925
+ 1081,
926
+ 583,
927
+ 367,
928
+ 236
929
+ ],
930
+ "totals": [
931
+ 2698,
932
+ 2632,
933
+ 2566,
934
+ 2500
935
+ ],
936
+ "precisions": [
937
+ 0.4006671608598962,
938
+ 0.2215045592705167,
939
+ 0.1430241621200312,
940
+ 0.0944
941
+ ],
942
+ "bp": 1.0,
943
+ "sys_len": 2698,
944
+ "ref_len": 1835,
945
+ "sacrebleu": 0.18605311955362527,
946
+ "score": 0.18605311955362527,
947
+ "score_name": "sacrebleu",
948
+ "score_ci_low": 0.14535553546673896,
949
+ "score_ci_high": 0.23518227086475807,
950
+ "sacrebleu_ci_low": 0.14535553546673896,
951
+ "sacrebleu_ci_high": 0.23518227086475807
952
+ },
953
+ "mt_flores_101_eng_fra": {
954
+ "num_of_instances": 66,
955
+ "counts": [
956
+ 1369,
957
+ 908,
958
+ 653,
959
+ 480
960
+ ],
961
+ "totals": [
962
+ 2838,
963
+ 2772,
964
+ 2706,
965
+ 2640
966
+ ],
967
+ "precisions": [
968
+ 0.4823819591261451,
969
+ 0.32756132756132755,
970
+ 0.24131559497413158,
971
+ 0.18181818181818182
972
+ ],
973
+ "bp": 1.0,
974
+ "sys_len": 2838,
975
+ "ref_len": 2068,
976
+ "sacrebleu": 0.28855366500271445,
977
+ "score": 0.28855366500271445,
978
+ "score_name": "sacrebleu",
979
+ "score_ci_low": 0.23842554781291075,
980
+ "score_ci_high": 0.3295060649249719,
981
+ "sacrebleu_ci_low": 0.23842554781291075,
982
+ "sacrebleu_ci_high": 0.3295060649249719
983
+ },
984
+ "mt_flores_101_eng_kor": {
985
+ "num_of_instances": 66,
986
+ "counts": [
987
+ 1105,
988
+ 464,
989
+ 237,
990
+ 129
991
+ ],
992
+ "totals": [
993
+ 4614,
994
+ 4548,
995
+ 4482,
996
+ 4416
997
+ ],
998
+ "precisions": [
999
+ 0.23948851322063283,
1000
+ 0.10202286719437115,
1001
+ 0.05287817938420348,
1002
+ 0.029211956521739132
1003
+ ],
1004
+ "bp": 1.0,
1005
+ "sys_len": 4614,
1006
+ "ref_len": 2235,
1007
+ "sacrebleu": 0.07837992398310743,
1008
+ "score": 0.07837992398310743,
1009
+ "score_name": "sacrebleu",
1010
+ "score_ci_low": 0.0624749618885479,
1011
+ "score_ci_high": 0.0978985560656709,
1012
+ "sacrebleu_ci_low": 0.0624749618885479,
1013
+ "sacrebleu_ci_high": 0.0978985560656709
1014
+ },
1015
+ "mt_flores_101_eng_por": {
1016
+ "num_of_instances": 66,
1017
+ "counts": [
1018
+ 1337,
1019
+ 872,
1020
+ 615,
1021
+ 443
1022
+ ],
1023
+ "totals": [
1024
+ 3197,
1025
+ 3131,
1026
+ 3065,
1027
+ 2999
1028
+ ],
1029
+ "precisions": [
1030
+ 0.41820456678135753,
1031
+ 0.2785052698818269,
1032
+ 0.200652528548124,
1033
+ 0.14771590530176726
1034
+ ],
1035
+ "bp": 1.0,
1036
+ "sys_len": 3197,
1037
+ "ref_len": 1916,
1038
+ "sacrebleu": 0.2423949242277413,
1039
+ "score": 0.2423949242277413,
1040
+ "score_name": "sacrebleu",
1041
+ "score_ci_low": 0.19031908380666443,
1042
+ "score_ci_high": 0.29285841291168263,
1043
+ "sacrebleu_ci_low": 0.19031908380666443,
1044
+ "sacrebleu_ci_high": 0.29285841291168263
1045
+ },
1046
+ "mt_flores_101_eng_ron": {
1047
+ "num_of_instances": 66,
1048
+ "counts": [
1049
+ 916,
1050
+ 408,
1051
+ 215,
1052
+ 118
1053
+ ],
1054
+ "totals": [
1055
+ 3065,
1056
+ 2999,
1057
+ 2933,
1058
+ 2867
1059
+ ],
1060
+ "precisions": [
1061
+ 0.29885807504078304,
1062
+ 0.13604534844948316,
1063
+ 0.0733037845209683,
1064
+ 0.04115800488315312
1065
+ ],
1066
+ "bp": 1.0,
1067
+ "sys_len": 3065,
1068
+ "ref_len": 1949,
1069
+ "sacrebleu": 0.10524036626034926,
1070
+ "score": 0.10524036626034926,
1071
+ "score_name": "sacrebleu",
1072
+ "score_ci_low": 0.08576114841329757,
1073
+ "score_ci_high": 0.138434331243868,
1074
+ "sacrebleu_ci_low": 0.08576114841329757,
1075
+ "sacrebleu_ci_high": 0.138434331243868
1076
+ },
1077
+ "mt_flores_101_eng_spa": {
1078
+ "num_of_instances": 66,
1079
+ "counts": [
1080
+ 1218,
1081
+ 627,
1082
+ 360,
1083
+ 200
1084
+ ],
1085
+ "totals": [
1086
+ 3325,
1087
+ 3259,
1088
+ 3193,
1089
+ 3127
1090
+ ],
1091
+ "precisions": [
1092
+ 0.36631578947368415,
1093
+ 0.19239030377416383,
1094
+ 0.11274663326025681,
1095
+ 0.06395906619763352
1096
+ ],
1097
+ "bp": 1.0,
1098
+ "sys_len": 3325,
1099
+ "ref_len": 2098,
1100
+ "sacrebleu": 0.15014508803615928,
1101
+ "score": 0.15014508803615928,
1102
+ "score_name": "sacrebleu",
1103
+ "score_ci_low": 0.1204863094487622,
1104
+ "score_ci_high": 0.1765822854477207,
1105
+ "sacrebleu_ci_low": 0.1204863094487622,
1106
+ "sacrebleu_ci_high": 0.1765822854477207
1107
+ },
1108
+ "mt_flores_101_fra_eng": {
1109
+ "num_of_instances": 66,
1110
+ "counts": [
1111
+ 1254,
1112
+ 757,
1113
+ 500,
1114
+ 349
1115
+ ],
1116
+ "totals": [
1117
+ 3006,
1118
+ 2940,
1119
+ 2874,
1120
+ 2808
1121
+ ],
1122
+ "precisions": [
1123
+ 0.4171656686626746,
1124
+ 0.2574829931972789,
1125
+ 0.17397355601948505,
1126
+ 0.1242877492877493
1127
+ ],
1128
+ "bp": 1.0,
1129
+ "sys_len": 3006,
1130
+ "ref_len": 1734,
1131
+ "sacrebleu": 0.21952913609680724,
1132
+ "score": 0.21952913609680724,
1133
+ "score_name": "sacrebleu",
1134
+ "score_ci_low": 0.1789920824532995,
1135
+ "score_ci_high": 0.2645460018949846,
1136
+ "sacrebleu_ci_low": 0.1789920824532995,
1137
+ "sacrebleu_ci_high": 0.2645460018949846
1138
+ },
1139
+ "mt_flores_101_jpn_eng": {
1140
+ "num_of_instances": 66,
1141
+ "counts": [
1142
+ 1041,
1143
+ 455,
1144
+ 240,
1145
+ 140
1146
+ ],
1147
+ "totals": [
1148
+ 3373,
1149
+ 3307,
1150
+ 3241,
1151
+ 3175
1152
+ ],
1153
+ "precisions": [
1154
+ 0.30862733471686926,
1155
+ 0.13758693680072573,
1156
+ 0.07405121875964209,
1157
+ 0.04409448818897638
1158
+ ],
1159
+ "bp": 1.0,
1160
+ "sys_len": 3373,
1161
+ "ref_len": 1734,
1162
+ "sacrebleu": 0.10851306405113036,
1163
+ "score": 0.10851306405113036,
1164
+ "score_name": "sacrebleu",
1165
+ "score_ci_low": 0.07946974341775867,
1166
+ "score_ci_high": 0.13751739390259268,
1167
+ "sacrebleu_ci_low": 0.07946974341775867,
1168
+ "sacrebleu_ci_high": 0.13751739390259268
1169
+ },
1170
+ "mt_flores_101_kor_eng": {
1171
+ "num_of_instances": 66,
1172
+ "counts": [
1173
+ 989,
1174
+ 446,
1175
+ 231,
1176
+ 127
1177
+ ],
1178
+ "totals": [
1179
+ 3117,
1180
+ 3051,
1181
+ 2985,
1182
+ 2919
1183
+ ],
1184
+ "precisions": [
1185
+ 0.3172922682066089,
1186
+ 0.14618157980989838,
1187
+ 0.07738693467336684,
1188
+ 0.04350805070229531
1189
+ ],
1190
+ "bp": 1.0,
1191
+ "sys_len": 3117,
1192
+ "ref_len": 1734,
1193
+ "sacrebleu": 0.11178855764603894,
1194
+ "score": 0.11178855764603894,
1195
+ "score_name": "sacrebleu",
1196
+ "score_ci_low": 0.08704919524478445,
1197
+ "score_ci_high": 0.14019227407461762,
1198
+ "sacrebleu_ci_low": 0.08704919524478445,
1199
+ "sacrebleu_ci_high": 0.14019227407461762
1200
+ },
1201
+ "mt_flores_101_por_eng": {
1202
+ "num_of_instances": 66,
1203
+ "counts": [
1204
+ 1279,
1205
+ 803,
1206
+ 560,
1207
+ 401
1208
+ ],
1209
+ "totals": [
1210
+ 4379,
1211
+ 4313,
1212
+ 4247,
1213
+ 4181
1214
+ ],
1215
+ "precisions": [
1216
+ 0.29207581639643754,
1217
+ 0.18618131231161605,
1218
+ 0.13185778196373912,
1219
+ 0.0959100693613968
1220
+ ],
1221
+ "bp": 1.0,
1222
+ "sys_len": 4379,
1223
+ "ref_len": 1734,
1224
+ "sacrebleu": 0.16193861267968562,
1225
+ "score": 0.16193861267968562,
1226
+ "score_name": "sacrebleu",
1227
+ "score_ci_low": 0.13000309557072665,
1228
+ "score_ci_high": 0.19897125526163875,
1229
+ "sacrebleu_ci_low": 0.13000309557072665,
1230
+ "sacrebleu_ci_high": 0.19897125526163875
1231
+ },
1232
+ "mt_flores_101_ron_eng": {
1233
+ "num_of_instances": 66,
1234
+ "counts": [
1235
+ 1220,
1236
+ 701,
1237
+ 443,
1238
+ 287
1239
+ ],
1240
+ "totals": [
1241
+ 3361,
1242
+ 3295,
1243
+ 3229,
1244
+ 3163
1245
+ ],
1246
+ "precisions": [
1247
+ 0.36298720618863434,
1248
+ 0.21274658573596358,
1249
+ 0.13719417776401363,
1250
+ 0.09073664242807462
1251
+ ],
1252
+ "bp": 1.0,
1253
+ "sys_len": 3361,
1254
+ "ref_len": 1734,
1255
+ "sacrebleu": 0.1760832623038549,
1256
+ "score": 0.1760832623038549,
1257
+ "score_name": "sacrebleu",
1258
+ "score_ci_low": 0.13158026543556073,
1259
+ "score_ci_high": 0.2197696196203422,
1260
+ "sacrebleu_ci_low": 0.13158026543556073,
1261
+ "sacrebleu_ci_high": 0.2197696196203422
1262
+ },
1263
+ "mt_flores_101_spa_eng": {
1264
+ "num_of_instances": 66,
1265
+ "counts": [
1266
+ 1169,
1267
+ 609,
1268
+ 355,
1269
+ 202
1270
+ ],
1271
+ "totals": [
1272
+ 2961,
1273
+ 2895,
1274
+ 2829,
1275
+ 2763
1276
+ ],
1277
+ "precisions": [
1278
+ 0.3947990543735224,
1279
+ 0.21036269430051813,
1280
+ 0.12548603746907033,
1281
+ 0.07310893955845095
1282
+ ],
1283
+ "bp": 1.0,
1284
+ "sys_len": 2961,
1285
+ "ref_len": 1734,
1286
+ "sacrebleu": 0.16614132879343932,
1287
+ "score": 0.16614132879343932,
1288
+ "score_name": "sacrebleu",
1289
+ "score_ci_low": 0.12631240742927416,
1290
+ "score_ci_high": 0.2061148130220288,
1291
+ "sacrebleu_ci_low": 0.12631240742927416,
1292
+ "sacrebleu_ci_high": 0.2061148130220288
1293
+ },
1294
+ "score": 0.1586418044095206,
1295
+ "score_name": "subsets_mean",
1296
+ "num_of_instances": 990
1297
+ },
1298
+ "score": 0.4549791248292893,
1299
+ "score_name": "subsets_mean",
1300
+ "num_of_instances": 12472
1301
+ }
1302
+ }
results/bluebench/2025-06-16T17-40-01_evaluation_results.json ADDED
@@ -0,0 +1,580 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-16T14:40:01.560857Z",
4
+ "command_line_invocation": [
5
+ "/home/bnayahu/miniforge3/envs/unitxt/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-3-70b-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR",
20
+ "--limit",
21
+ "100"
22
+ ],
23
+ "parsed_arguments": {
24
+ "tasks": [
25
+ "benchmarks.bluebench"
26
+ ],
27
+ "split": "test",
28
+ "num_fewshots": null,
29
+ "limit": 100,
30
+ "batch_size": 8,
31
+ "model": "watsonx/meta-llama/llama-3-3-70b-instruct",
32
+ "model_args": {
33
+ "max_tokens": 256
34
+ },
35
+ "gen_kwargs": null,
36
+ "chat_template_kwargs": null,
37
+ "output_path": "./results/bluebench",
38
+ "output_file_prefix": "evaluation_results",
39
+ "log_samples": true,
40
+ "verbosity": "ERROR",
41
+ "apply_chat_template": false,
42
+ "trust_remote_code": true,
43
+ "disable_hf_cache": false,
44
+ "cache_dir": null
45
+ },
46
+ "unitxt_version": "1.24.0",
47
+ "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
48
+ "python_version": "3.11.12",
49
+ "system": "Linux",
50
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Mon Apr 21 17:08:54 UTC 2025",
51
+ "installed_packages": {
52
+ "tqdm": "4.67.1",
53
+ "httpretty": "1.1.4",
54
+ "evaluate": "0.4.3",
55
+ "ruff": "0.11.10",
56
+ "virtualenv": "20.31.2",
57
+ "urllib3": "2.4.0",
58
+ "httpcore": "1.0.9",
59
+ "mecab-ko-dic": "1.0.0",
60
+ "mecab-ko": "1.0.1",
61
+ "identify": "2.6.10",
62
+ "bert-score": "0.3.13",
63
+ "lxml": "5.4.0",
64
+ "python-dotenv": "1.1.0",
65
+ "accelerate": "1.7.0",
66
+ "httpx-sse": "0.4.0",
67
+ "pillow": "11.2.1",
68
+ "certifi": "2025.4.26",
69
+ "pyparsing": "3.2.3",
70
+ "nvidia-cusparselt-cu12": "0.6.3",
71
+ "tzdata": "2025.2",
72
+ "torch": "2.7.0",
73
+ "MarkupSafe": "3.0.2",
74
+ "setuptools": "80.1.0",
75
+ "pydantic": "2.11.4",
76
+ "yarl": "1.20.0",
77
+ "importlib_metadata": "8.0.0",
78
+ "pydantic_core": "2.33.2",
79
+ "scipy": "1.15.3",
80
+ "annotated-types": "0.7.0",
81
+ "portalocker": "3.1.1",
82
+ "packaging": "24.2",
83
+ "Deprecated": "1.2.18",
84
+ "typing_extensions": "4.12.2",
85
+ "ibm-cos-sdk-s3transfer": "2.14.1",
86
+ "nvidia-cufft-cu12": "11.3.0.4",
87
+ "nvidia-cusolver-cu12": "11.7.1.2",
88
+ "diskcache": "5.6.3",
89
+ "fsspec": "2025.3.0",
90
+ "transformers": "4.51.3",
91
+ "platformdirs": "4.2.2",
92
+ "nvidia-cublas-cu12": "12.6.4.1",
93
+ "threadpoolctl": "3.6.0",
94
+ "jsonschema-specifications": "2025.4.1",
95
+ "tenacity": "9.1.2",
96
+ "propcache": "0.3.1",
97
+ "ibm-cos-sdk": "2.14.1",
98
+ "mpmath": "1.3.0",
99
+ "jiter": "0.9.0",
100
+ "filelock": "3.18.0",
101
+ "tomli": "2.0.1",
102
+ "nvidia-nvjitlink-cu12": "12.6.85",
103
+ "cfgv": "3.4.0",
104
+ "ibm_watsonx_ai": "1.3.13",
105
+ "ibm-generative-ai": "3.0.0",
106
+ "wheel": "0.45.1",
107
+ "sympy": "1.14.0",
108
+ "requests": "2.32.2",
109
+ "charset-normalizer": "3.4.2",
110
+ "psutil": "7.0.0",
111
+ "pre_commit": "4.2.0",
112
+ "nodeenv": "1.9.1",
113
+ "colorama": "0.4.6",
114
+ "absl-py": "2.2.2",
115
+ "rouge_score": "0.1.2",
116
+ "scikit-learn": "1.6.1",
117
+ "multiprocess": "0.70.16",
118
+ "xxhash": "3.5.0",
119
+ "detect-secrets": "1.5.0",
120
+ "aiohttp": "3.11.18",
121
+ "frozenlist": "1.6.0",
122
+ "tabulate": "0.9.0",
123
+ "triton": "3.3.0",
124
+ "idna": "3.10",
125
+ "PyYAML": "6.0.2",
126
+ "ibm-cos-sdk-core": "2.14.1",
127
+ "nvidia-curand-cu12": "10.3.7.77",
128
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
129
+ "tiktoken": "0.9.0",
130
+ "aiosignal": "1.3.2",
131
+ "attrs": "25.3.0",
132
+ "h11": "0.16.0",
133
+ "anyio": "4.9.0",
134
+ "wrapt": "1.17.2",
135
+ "kiwisolver": "1.4.8",
136
+ "nvidia-cudnn-cu12": "9.5.1.17",
137
+ "matplotlib": "3.10.3",
138
+ "aiolimiter": "1.2.1",
139
+ "codespell": "2.4.1",
140
+ "jmespath": "1.0.1",
141
+ "nltk": "3.9.1",
142
+ "unitxt": "1.24.0",
143
+ "dill": "0.3.8",
144
+ "multidict": "6.4.3",
145
+ "conllu": "6.0.0",
146
+ "litellm": "1.69.3",
147
+ "joblib": "1.5.0",
148
+ "cycler": "0.12.1",
149
+ "pip": "25.1.1",
150
+ "nvidia-nccl-cu12": "2.26.2",
151
+ "click": "8.2.0",
152
+ "fonttools": "4.58.0",
153
+ "datasets": "3.6.0",
154
+ "six": "1.17.0",
155
+ "numpy": "2.2.5",
156
+ "nvidia-cuda-runtime-cu12": "12.6.77",
157
+ "huggingface-hub": "0.31.2",
158
+ "aiohappyeyeballs": "2.6.1",
159
+ "sacrebleu": "2.5.1",
160
+ "pyarrow": "20.0.0",
161
+ "openai": "1.75.0",
162
+ "python-dateutil": "2.9.0.post0",
163
+ "pytz": "2025.2",
164
+ "contourpy": "1.3.2",
165
+ "pandas": "2.2.3",
166
+ "distro": "1.9.0",
167
+ "httpx": "0.27.2",
168
+ "rpds-py": "0.25.0",
169
+ "Jinja2": "3.1.6",
170
+ "nvidia-cusparse-cu12": "12.5.4.2",
171
+ "nvidia-nvtx-cu12": "12.6.77",
172
+ "fuzzywuzzy": "0.18.0",
173
+ "tokenizers": "0.21.1",
174
+ "lomond": "0.3.3",
175
+ "nvidia-cufile-cu12": "1.11.1.6",
176
+ "typing-inspection": "0.4.0",
177
+ "safetensors": "0.5.3",
178
+ "nvidia-cuda-cupti-cu12": "12.6.80",
179
+ "referencing": "0.36.2",
180
+ "networkx": "3.4.2",
181
+ "jsonschema": "4.23.0",
182
+ "zipp": "3.19.2",
183
+ "regex": "2024.11.6",
184
+ "distlib": "0.3.9",
185
+ "sniffio": "1.3.1",
186
+ "autocommand": "2.2.2",
187
+ "jaraco.collections": "5.1.0",
188
+ "typeguard": "4.3.0",
189
+ "jaraco.text": "3.12.1",
190
+ "jaraco.context": "5.3.0",
191
+ "jaraco.functools": "4.0.1",
192
+ "more-itertools": "10.3.0",
193
+ "backports.tarfile": "1.2.0",
194
+ "inflect": "7.3.1"
195
+ }
196
+ },
197
+ "results": {
198
+ "bias": {
199
+ "safety_bbq_age": {
200
+ "accuracy": 0.7888888888888889,
201
+ "accuracy_ci_low": 0.7,
202
+ "accuracy_ci_high": 0.8555555555555555,
203
+ "score_name": "accuracy",
204
+ "score": 0.7888888888888889,
205
+ "score_ci_high": 0.8555555555555555,
206
+ "score_ci_low": 0.7,
207
+ "num_of_instances": 90
208
+ },
209
+ "safety_bbq_disability_status": {
210
+ "accuracy": 1.0,
211
+ "accuracy_ci_low": 1.0,
212
+ "accuracy_ci_high": 1.0,
213
+ "score_name": "accuracy",
214
+ "score": 1.0,
215
+ "score_ci_high": 1.0,
216
+ "score_ci_low": 1.0,
217
+ "num_of_instances": 10
218
+ },
219
+ "score": 0.8944444444444444,
220
+ "score_name": "subsets_mean",
221
+ "num_of_instances": 100
222
+ },
223
+ "chatbot_abilities": {
224
+ "arena_hard_generation_english_gpt_4_0314_reference": {
225
+ "num_of_instances": 100,
226
+ "llama_3_70b_instruct_template_arena_hard": 0.5,
227
+ "score": 0.5,
228
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
229
+ },
230
+ "score": 0.5,
231
+ "score_name": "subsets_mean",
232
+ "num_of_instances": 100
233
+ },
234
+ "entity_extraction": {
235
+ "universal_ner_en_ewt": {
236
+ "num_of_instances": 100,
237
+ "f1_Person": 0.5294117647058824,
238
+ "f1_Organization": 0.4489795918367347,
239
+ "f1_Location": 0.3076923076923077,
240
+ "f1_macro": 0.4286945547449749,
241
+ "recall_macro": 0.3447204968944099,
242
+ "precision_macro": 0.5806637806637807,
243
+ "in_classes_support": 0.6266666666666667,
244
+ "f1_micro": 0.3466666666666667,
245
+ "recall_micro": 0.3466666666666667,
246
+ "precision_micro": 0.3466666666666667,
247
+ "score": 0.3466666666666667,
248
+ "score_name": "f1_micro",
249
+ "score_ci_low": 0.2410871556202116,
250
+ "score_ci_high": 0.45611092451496155,
251
+ "f1_micro_ci_low": 0.2410871556202116,
252
+ "f1_micro_ci_high": 0.45611092451496155
253
+ },
254
+ "score": 0.3466666666666667,
255
+ "score_name": "subsets_mean",
256
+ "num_of_instances": 100
257
+ },
258
+ "knowledge": {
259
+ "mmlu_pro_biology": {
260
+ "accuracy": 0.704225352112676,
261
+ "accuracy_ci_low": 0.5915492957746479,
262
+ "accuracy_ci_high": 0.8028169014084507,
263
+ "score_name": "accuracy",
264
+ "score": 0.704225352112676,
265
+ "score_ci_high": 0.8028169014084507,
266
+ "score_ci_low": 0.5915492957746479,
267
+ "num_of_instances": 71
268
+ },
269
+ "mmlu_pro_business": {
270
+ "accuracy": 0.13793103448275862,
271
+ "accuracy_ci_low": 0.034482758620689655,
272
+ "accuracy_ci_high": 0.3103448275862069,
273
+ "score_name": "accuracy",
274
+ "score": 0.13793103448275862,
275
+ "score_ci_high": 0.3103448275862069,
276
+ "score_ci_low": 0.034482758620689655,
277
+ "num_of_instances": 29
278
+ },
279
+ "score": 0.42107819329771734,
280
+ "score_name": "subsets_mean",
281
+ "num_of_instances": 100
282
+ },
283
+ "legal": {
284
+ "legalbench_abercrombie": {
285
+ "f1_macro": 0.6635397677258142,
286
+ "f1_suggestive": 0.5555555555555556,
287
+ "f1_generic": 0.7692307692307693,
288
+ "f1_descriptive": 0.6976744186046512,
289
+ "f1_fanciful": 0.6666666666666666,
290
+ "f1_arbitrary": 0.6285714285714286,
291
+ "f1_macro_ci_low": 0.5625965245413472,
292
+ "f1_macro_ci_high": 0.7723256077319486,
293
+ "score_name": "f1_micro",
294
+ "score": 0.6586826347305389,
295
+ "score_ci_high": 0.7515151515151515,
296
+ "score_ci_low": 0.5524327906405184,
297
+ "num_of_instances": 85,
298
+ "accuracy": 0.6470588235294118,
299
+ "accuracy_ci_low": 0.5411764705882353,
300
+ "accuracy_ci_high": 0.7411764705882353,
301
+ "f1_micro": 0.6586826347305389,
302
+ "f1_micro_ci_low": 0.5524327906405184,
303
+ "f1_micro_ci_high": 0.7515151515151515
304
+ },
305
+ "legalbench_corporate_lobbying": {
306
+ "f1_macro": 0.5357142857142857,
307
+ "f1_no": 0.5,
308
+ "f1_yes": 0.5714285714285714,
309
+ "f1_macro_ci_low": 0.2833333333333333,
310
+ "f1_macro_ci_high": 0.7999279223515758,
311
+ "score_name": "f1_micro",
312
+ "score": 0.5384615384615384,
313
+ "score_ci_high": 0.7857142857142857,
314
+ "score_ci_low": 0.26917373942421613,
315
+ "num_of_instances": 15,
316
+ "accuracy": 0.4666666666666667,
317
+ "accuracy_ci_low": 0.2,
318
+ "accuracy_ci_high": 0.7333333333333333,
319
+ "f1_micro": 0.5384615384615384,
320
+ "f1_micro_ci_low": 0.26917373942421613,
321
+ "f1_micro_ci_high": 0.7857142857142857
322
+ },
323
+ "score": 0.5985720865960387,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "news_classification": {
328
+ "20_newsgroups_short": {
329
+ "f1_macro": 0.6443434343434343,
330
+ "f1_cars": 0.9090909090909091,
331
+ "f1_windows x": 0.5714285714285714,
332
+ "f1_computer graphics": 0.6666666666666666,
333
+ "f1_atheism": 0.5714285714285714,
334
+ "f1_religion": 0.0,
335
+ "f1_medicine": 1.0,
336
+ "f1_christianity": 0.8571428571428571,
337
+ "f1_microsoft windows": 0.6666666666666666,
338
+ "f1_middle east": 0.5,
339
+ "f1_motorcycles": 0.6,
340
+ "f1_pc hardware": 0.8,
341
+ "f1_mac hardware": 0.8,
342
+ "f1_for sale": 0.5,
343
+ "f1_guns": 0.4444444444444444,
344
+ "f1_space": 0.75,
345
+ "f1_cryptography": 0.3333333333333333,
346
+ "f1_baseball": 1.0,
347
+ "f1_politics": 0.5,
348
+ "f1_hockey": 0.75,
349
+ "f1_electronics": 0.6666666666666666,
350
+ "f1_macro_ci_low": 0.5605248203581513,
351
+ "f1_macro_ci_high": 0.7498000775037662,
352
+ "score_name": "f1_micro",
353
+ "score": 0.6740331491712708,
354
+ "score_ci_high": 0.7567567567567568,
355
+ "score_ci_low": 0.5654571096096505,
356
+ "num_of_instances": 100,
357
+ "accuracy": 0.61,
358
+ "accuracy_ci_low": 0.5,
359
+ "accuracy_ci_high": 0.7,
360
+ "f1_micro": 0.6740331491712708,
361
+ "f1_micro_ci_low": 0.5654571096096505,
362
+ "f1_micro_ci_high": 0.7567567567567568
363
+ },
364
+ "score": 0.6740331491712708,
365
+ "score_name": "subsets_mean",
366
+ "num_of_instances": 100
367
+ },
368
+ "product_help": {
369
+ "cfpb_product_2023": {
370
+ "f1_macro": 0.8637383872166481,
371
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.927536231884058,
372
+ "f1_credit card or prepaid card": 1.0,
373
+ "f1_debt collection": 0.64,
374
+ "f1_checking or savings account": 0.9230769230769231,
375
+ "f1_mortgage": 0.8888888888888888,
376
+ "f1_vehicle loan or lease": 0.6666666666666666,
377
+ "f1_money transfer or virtual currency or money service": 1.0,
378
+ "f1_macro_ci_low": 0.7066777160591827,
379
+ "f1_macro_ci_high": 0.9300773607822144,
380
+ "score_name": "f1_micro",
381
+ "score": 0.8888888888888888,
382
+ "score_ci_high": 0.9393939393939394,
383
+ "score_ci_low": 0.8163265306122449,
384
+ "num_of_instances": 100,
385
+ "accuracy": 0.88,
386
+ "accuracy_ci_low": 0.81,
387
+ "accuracy_ci_high": 0.93,
388
+ "f1_micro": 0.8888888888888888,
389
+ "f1_micro_ci_low": 0.8163265306122449,
390
+ "f1_micro_ci_high": 0.9393939393939394
391
+ },
392
+ "score": 0.8888888888888888,
393
+ "score_name": "subsets_mean",
394
+ "num_of_instances": 100
395
+ },
396
+ "qa_finance": {
397
+ "fin_qa": {
398
+ "num_of_instances": 100,
399
+ "program_accuracy": 0.2,
400
+ "score": 0.2,
401
+ "score_name": "program_accuracy",
402
+ "execution_accuracy": 0.2,
403
+ "program_accuracy_ci_low": 0.13,
404
+ "program_accuracy_ci_high": 0.29,
405
+ "score_ci_low": 0.13,
406
+ "score_ci_high": 0.29,
407
+ "execution_accuracy_ci_low": 0.13,
408
+ "execution_accuracy_ci_high": 0.29
409
+ },
410
+ "score": 0.2,
411
+ "score_name": "subsets_mean",
412
+ "num_of_instances": 100
413
+ },
414
+ "rag_general": {
415
+ "rag_response_generation_clapnq": {
416
+ "precision": 0.4639242544792729,
417
+ "recall": 0.6403509065582018,
418
+ "f1": 0.4976247962897783,
419
+ "precision_ci_low": 0.42748095786992185,
420
+ "precision_ci_high": 0.5047660147148081,
421
+ "recall_ci_low": 0.6019930525471148,
422
+ "recall_ci_high": 0.6774368805520771,
423
+ "f1_ci_low": 0.4683673012782413,
424
+ "f1_ci_high": 0.5304072668948948,
425
+ "score_name": "f1",
426
+ "score": 0.4976247962897783,
427
+ "score_ci_high": 0.5304072668948948,
428
+ "score_ci_low": 0.4683673012782413,
429
+ "num_of_instances": 100,
430
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6864667510986329,
431
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7247687041759491,
432
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6621749752759933,
433
+ "faithfullness_f1_token_overlap": 0.412101720837311,
434
+ "faithfullness_recall_token_overlap": 0.32402585790821237,
435
+ "faithfullness_precision_token_overlap": 0.6786708572055166,
436
+ "correctness_f1_token_overlap": 0.4976247962897783,
437
+ "correctness_recall_token_overlap": 0.6403509065582018,
438
+ "correctness_precision_token_overlap": 0.4639242544792729
439
+ },
440
+ "score": 0.4976247962897783,
441
+ "score_name": "subsets_mean",
442
+ "num_of_instances": 100
443
+ },
444
+ "reasoning": {
445
+ "hellaswag": {
446
+ "accuracy": 0.57,
447
+ "accuracy_ci_low": 0.47,
448
+ "accuracy_ci_high": 0.66,
449
+ "score_name": "accuracy",
450
+ "score": 0.57,
451
+ "score_ci_high": 0.66,
452
+ "score_ci_low": 0.47,
453
+ "num_of_instances": 100
454
+ },
455
+ "score": 0.57,
456
+ "score_name": "subsets_mean",
457
+ "num_of_instances": 100
458
+ },
459
+ "safety": {
460
+ "attaq_500": {
461
+ "severity_critical": 0.0,
462
+ "severity_high": 0.0,
463
+ "severity_medium": 1.0,
464
+ "severity_low": 99.0,
465
+ "category_harmful_info": 0.8407948520514996,
466
+ "category_deception": 0.8690199783316466,
467
+ "category_explicit_content": 0.8879365721962067,
468
+ "category_violence": 0.8437370418910942,
469
+ "category_discrimination": 0.7742161632530273,
470
+ "category_substance_abuse": 0.8879393110672632,
471
+ "category_pii": 0.7969612302603545,
472
+ "safety": 0.8464966776832524,
473
+ "safety_ci_low": 0.8296491155827732,
474
+ "safety_ci_high": 0.8642379209192678,
475
+ "score_name": "safety",
476
+ "score": 0.8464966776832524,
477
+ "score_ci_high": 0.8642379209192678,
478
+ "score_ci_low": 0.8296491155827732,
479
+ "num_of_instances": 100
480
+ },
481
+ "score": 0.8464966776832524,
482
+ "score_name": "subsets_mean",
483
+ "num_of_instances": 100
484
+ },
485
+ "summarization": {
486
+ "billsum_document_filtered_to_6000_chars": {
487
+ "num_of_instances": 100,
488
+ "rouge1": 0.43029845221947843,
489
+ "rougeL": 0.2955165700225417,
490
+ "score": 0.2955165700225417,
491
+ "score_name": "rougeL",
492
+ "rouge2": 0.20799738817238542,
493
+ "rougeLsum": 0.37091272315340484,
494
+ "rouge1_ci_low": 0.40762860443579957,
495
+ "rouge1_ci_high": 0.45046632483836146,
496
+ "rougeL_ci_low": 0.2802994422178466,
497
+ "rougeL_ci_high": 0.31441983596023754,
498
+ "score_ci_low": 0.2802994422178466,
499
+ "score_ci_high": 0.31441983596023754,
500
+ "rouge2_ci_low": 0.193214668225847,
501
+ "rouge2_ci_high": 0.22420116008616867,
502
+ "rougeLsum_ci_low": 0.35057685960681195,
503
+ "rougeLsum_ci_high": 0.3911461732163174
504
+ },
505
+ "score": 0.2955165700225417,
506
+ "score_name": "subsets_mean",
507
+ "num_of_instances": 100
508
+ },
509
+ "translation": {
510
+ "mt_flores_101_ara_eng": {
511
+ "num_of_instances": 66,
512
+ "counts": [
513
+ 1308,
514
+ 854,
515
+ 606,
516
+ 437
517
+ ],
518
+ "totals": [
519
+ 1801,
520
+ 1735,
521
+ 1669,
522
+ 1603
523
+ ],
524
+ "precisions": [
525
+ 0.7262631871182677,
526
+ 0.49221902017291064,
527
+ 0.36309167165967643,
528
+ 0.272613849033063
529
+ ],
530
+ "bp": 1.0,
531
+ "sys_len": 1801,
532
+ "ref_len": 1734,
533
+ "sacrebleu": 0.4337147141407253,
534
+ "score": 0.4337147141407253,
535
+ "score_name": "sacrebleu",
536
+ "score_ci_low": 0.3842057657729977,
537
+ "score_ci_high": 0.4730390019325389,
538
+ "sacrebleu_ci_low": 0.3842057657729977,
539
+ "sacrebleu_ci_high": 0.4730390019325389
540
+ },
541
+ "mt_flores_101_deu_eng": {
542
+ "num_of_instances": 34,
543
+ "counts": [
544
+ 718,
545
+ 461,
546
+ 323,
547
+ 234
548
+ ],
549
+ "totals": [
550
+ 1016,
551
+ 982,
552
+ 948,
553
+ 914
554
+ ],
555
+ "precisions": [
556
+ 0.7066929133858268,
557
+ 0.4694501018329939,
558
+ 0.3407172995780591,
559
+ 0.25601750547045954
560
+ ],
561
+ "bp": 1.0,
562
+ "sys_len": 1016,
563
+ "ref_len": 960,
564
+ "sacrebleu": 0.4124497124322012,
565
+ "score": 0.4124497124322012,
566
+ "score_name": "sacrebleu",
567
+ "score_ci_low": 0.3505214366395574,
568
+ "score_ci_high": 0.4751525306662991,
569
+ "sacrebleu_ci_low": 0.3505214366395574,
570
+ "sacrebleu_ci_high": 0.4751525306662991
571
+ },
572
+ "score": 0.4230822132864632,
573
+ "score_name": "subsets_mean",
574
+ "num_of_instances": 100
575
+ },
576
+ "score": 0.5504925912574663,
577
+ "score_name": "subsets_mean",
578
+ "num_of_instances": 1300
579
+ }
580
+ }
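
Both added files follow the same layout: an "environment_info" block (command-line invocation, parsed arguments, and installed package versions) and a "results" block with one entry per subset, where each subset carries an aggregate "score", "score_name", and "num_of_instances" alongside its per-dataset metrics. A minimal sketch for reading one of these files and listing the subset scores follows; it is not part of the committed files, uses only the Python standard library, and assumes the relative path as committed in this repository.

```python
# Minimal sketch: load one of the evaluation result JSONs added in this commit
# and print the overall score plus the aggregate score of each subset.
import json

# Path matches the first file in this diff; adjust to the file you want to inspect.
path = "results/bluebench/2025-06-16T11-59-29_evaluation_results.json"

with open(path) as f:
    data = json.load(f)

results = data["results"]
print(f"overall: {results['score']:.4f} ({results['num_of_instances']} instances)")

# Every dict-valued key under "results" is a subset (bias, knowledge, legal, ...);
# the remaining keys ("score", "score_name", "num_of_instances") are the benchmark-level aggregate.
for subset, entry in results.items():
    if isinstance(entry, dict):
        print(f"{subset:24s} {entry['score']:.4f} ({entry['num_of_instances']} instances)")
```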