jbnayahu committed
Commit b4f4176 · unverified · 1 Parent(s): 27bad72

Signed-off-by: Jonathan Bnayahu <[email protected]>

results/bluebench/2025-06-24T05-35-50_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
+ {
+ "environment_info": {
+ "timestamp_utc": "2025-06-24T09:35:45.814508Z",
+ "command_line_invocation": [
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
+ "--tasks",
+ "benchmarks.bluebench",
+ "--model",
+ "cross_provider",
+ "--model_args",
+ "model_name=watsonx/mistralai/mistral-medium-2505,max_tokens=256",
+ "--output_path",
+ "./results/bluebench",
+ "--log_samples",
+ "--trust_remote_code",
+ "--batch_size",
+ "8",
+ "--verbosity",
+ "ERROR"
+ ],
+ "parsed_arguments": {
+ "tasks": [
+ "benchmarks.bluebench"
+ ],
+ "split": "test",
+ "num_fewshots": null,
+ "limit": null,
+ "batch_size": 8,
+ "model": "watsonx/mistralai/mistral-medium-2505",
+ "model_args": {
+ "max_tokens": 256
+ },
+ "gen_kwargs": null,
+ "chat_template_kwargs": null,
+ "output_path": "./results/bluebench",
+ "output_file_prefix": "evaluation_results",
+ "log_samples": true,
+ "verbosity": "ERROR",
+ "apply_chat_template": false,
+ "trust_remote_code": true,
+ "disable_hf_cache": false,
+ "cache_dir": null
+ },
+ "unitxt_version": "1.24.0",
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
+ "python_version": "3.10.18",
+ "system": "Linux",
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
+ "installed_packages": {
+ "nvidia-cufile-cu12": "1.11.1.6",
+ "triton": "3.3.1",
+ "nltk": "3.9.1",
+ "anyio": "4.9.0",
+ "absl-py": "2.3.0",
+ "tiktoken": "0.9.0",
+ "charset-normalizer": "3.4.2",
+ "nvidia-cuda-runtime-cu12": "12.6.77",
+ "sympy": "1.14.0",
+ "mecab-ko": "1.0.1",
+ "litellm": "1.72.6.post1",
+ "httpcore": "1.0.9",
+ "Jinja2": "3.1.6",
+ "jsonschema-specifications": "2025.4.1",
+ "pydantic_core": "2.33.2",
+ "nvidia-cusparse-cu12": "12.5.4.2",
+ "yarl": "1.20.1",
+ "openai": "1.88.0",
+ "portalocker": "3.2.0",
+ "pandas": "2.3.0",
+ "multiprocess": "0.70.16",
+ "jsonschema": "4.24.0",
+ "unitxt": "1.24.0",
+ "nvidia-nvjitlink-cu12": "12.6.85",
+ "nvidia-cublas-cu12": "12.6.4.1",
+ "pydantic": "2.11.7",
+ "async-timeout": "5.0.1",
+ "annotated-types": "0.7.0",
+ "rouge_score": "0.1.2",
+ "contourpy": "1.3.2",
+ "aiosignal": "1.3.2",
+ "nvidia-cuda-cupti-cu12": "12.6.80",
+ "pillow": "11.2.1",
+ "six": "1.17.0",
+ "diskcache": "5.6.3",
+ "tqdm": "4.67.1",
+ "pyarrow": "20.0.0",
+ "h11": "0.16.0",
+ "zipp": "3.19.2",
+ "tzdata": "2025.2",
+ "bert-score": "0.3.13",
+ "setuptools": "80.9.0",
+ "referencing": "0.36.2",
+ "sacrebleu": "2.5.1",
+ "filelock": "3.18.0",
+ "urllib3": "2.5.0",
+ "scipy": "1.15.3",
+ "nvidia-nccl-cu12": "2.26.2",
+ "kiwisolver": "1.4.8",
+ "networkx": "3.4.2",
+ "typing-inspection": "0.4.1",
+ "lxml": "5.4.0",
+ "sniffio": "1.3.1",
+ "scikit-learn": "1.7.0",
+ "nvidia-curand-cu12": "10.3.7.77",
+ "pip": "25.1.1",
+ "fonttools": "4.58.4",
+ "transformers": "4.52.4",
+ "datasets": "3.6.0",
+ "nvidia-cusolver-cu12": "11.7.1.2",
+ "cycler": "0.12.1",
+ "evaluate": "0.4.3",
+ "distro": "1.9.0",
+ "idna": "3.10",
+ "MarkupSafe": "3.0.2",
+ "frozenlist": "1.7.0",
+ "pyparsing": "3.2.3",
+ "jiter": "0.10.0",
+ "importlib_metadata": "8.0.0",
+ "packaging": "24.2",
+ "psutil": "7.0.0",
+ "mecab-ko-dic": "1.0.0",
+ "joblib": "1.5.1",
+ "fsspec": "2025.3.0",
+ "dill": "0.3.8",
+ "tokenizers": "0.21.1",
+ "wheel": "0.45.1",
+ "nvidia-nvtx-cu12": "12.6.77",
+ "nvidia-cusparselt-cu12": "0.6.3",
+ "hf-xet": "1.1.4",
+ "propcache": "0.3.2",
+ "numpy": "2.2.6",
+ "mpmath": "1.3.0",
+ "multidict": "6.5.0",
+ "conllu": "6.0.0",
+ "safetensors": "0.5.3",
+ "requests": "2.32.4",
+ "regex": "2024.11.6",
+ "aiohttp": "3.12.13",
+ "tabulate": "0.9.0",
+ "certifi": "2025.6.15",
+ "accelerate": "1.8.0",
+ "nvidia-cufft-cu12": "11.3.0.4",
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
+ "click": "8.2.1",
+ "typing_extensions": "4.12.2",
+ "attrs": "25.3.0",
+ "exceptiongroup": "1.3.0",
+ "tenacity": "9.1.2",
+ "pytz": "2025.2",
+ "aiohappyeyeballs": "2.6.1",
+ "python-dateutil": "2.9.0.post0",
+ "torch": "2.7.1",
+ "python-dotenv": "1.1.0",
+ "httpx": "0.28.1",
+ "matplotlib": "3.10.3",
+ "xxhash": "3.5.0",
+ "PyYAML": "6.0.2",
+ "huggingface-hub": "0.33.0",
+ "colorama": "0.4.6",
+ "rpds-py": "0.25.1",
+ "threadpoolctl": "3.6.0",
+ "nvidia-cudnn-cu12": "9.5.1.17",
+ "jaraco.collections": "5.1.0",
+ "tomli": "2.0.1",
+ "backports.tarfile": "1.2.0",
+ "jaraco.context": "5.3.0",
+ "typeguard": "4.3.0",
+ "autocommand": "2.2.2",
+ "jaraco.text": "3.12.1",
+ "more-itertools": "10.3.0",
+ "platformdirs": "4.2.2",
+ "inflect": "7.3.1",
+ "jaraco.functools": "4.0.1"
+ }
+ },
+ "results": {
+ "bias": {
+ "safety_bbq_age": {
+ "accuracy": 0.8888888888888888,
+ "accuracy_ci_low": 0.8111111111111111,
+ "accuracy_ci_high": 0.9444444444444444,
+ "score_name": "accuracy",
+ "score": 0.8888888888888888,
+ "score_ci_high": 0.9444444444444444,
+ "score_ci_low": 0.8111111111111111,
+ "num_of_instances": 90
+ },
+ "safety_bbq_disability_status": {
+ "accuracy": 0.9777777777777777,
+ "accuracy_ci_low": 0.9222222222222223,
+ "accuracy_ci_high": 1.0,
+ "score_name": "accuracy",
+ "score": 0.9777777777777777,
+ "score_ci_high": 1.0,
+ "score_ci_low": 0.9222222222222223,
+ "num_of_instances": 90
+ },
+ "safety_bbq_gender_identity": {
+ "accuracy": 1.0,
+ "accuracy_ci_low": 1.0,
+ "accuracy_ci_high": 1.0,
+ "score_name": "accuracy",
+ "score": 1.0,
+ "score_ci_high": 1.0,
+ "score_ci_low": 1.0,
+ "num_of_instances": 90
+ },
+ "safety_bbq_nationality": {
+ "accuracy": 0.9888888888888889,
+ "accuracy_ci_low": 0.9444444444444444,
+ "accuracy_ci_high": 1.0,
+ "score_name": "accuracy",
+ "score": 0.9888888888888889,
+ "score_ci_high": 1.0,
+ "score_ci_low": 0.9444444444444444,
+ "num_of_instances": 90
+ },
+ "safety_bbq_physical_appearance": {
+ "accuracy": 0.9777777777777777,
+ "accuracy_ci_low": 0.9120747810244609,
+ "accuracy_ci_high": 1.0,
+ "score_name": "accuracy",
+ "score": 0.9777777777777777,
+ "score_ci_high": 1.0,
+ "score_ci_low": 0.9120747810244609,
+ "num_of_instances": 90
+ },
+ "safety_bbq_race_ethnicity": {
+ "accuracy": 1.0,
+ "accuracy_ci_low": 1.0,
+ "accuracy_ci_high": 1.0,
+ "score_name": "accuracy",
+ "score": 1.0,
+ "score_ci_high": 1.0,
+ "score_ci_low": 1.0,
+ "num_of_instances": 90
+ },
+ "safety_bbq_race_x_gender": {
+ "accuracy": 1.0,
+ "accuracy_ci_low": 1.0,
+ "accuracy_ci_high": 1.0,
+ "score_name": "accuracy",
+ "score": 1.0,
+ "score_ci_high": 1.0,
+ "score_ci_low": 1.0,
+ "num_of_instances": 90
+ },
+ "safety_bbq_race_x_ses": {
+ "accuracy": 1.0,
+ "accuracy_ci_low": 1.0,
+ "accuracy_ci_high": 1.0,
+ "score_name": "accuracy",
+ "score": 1.0,
+ "score_ci_high": 1.0,
+ "score_ci_low": 1.0,
+ "num_of_instances": 90
+ },
+ "safety_bbq_religion": {
+ "accuracy": 0.9444444444444444,
+ "accuracy_ci_low": 0.8777777777777778,
+ "accuracy_ci_high": 0.9777777777777777,
+ "score_name": "accuracy",
+ "score": 0.9444444444444444,
+ "score_ci_high": 0.9777777777777777,
+ "score_ci_low": 0.8777777777777778,
+ "num_of_instances": 90
+ },
+ "safety_bbq_ses": {
+ "accuracy": 0.9777777777777777,
+ "accuracy_ci_low": 0.9222222222222223,
+ "accuracy_ci_high": 1.0,
+ "score_name": "accuracy",
+ "score": 0.9777777777777777,
+ "score_ci_high": 1.0,
+ "score_ci_low": 0.9222222222222223,
+ "num_of_instances": 90
+ },
+ "safety_bbq_sexual_orientation": {
+ "accuracy": 0.9111111111111111,
+ "accuracy_ci_low": 0.8333333333333334,
+ "accuracy_ci_high": 0.9555555555555556,
+ "score_name": "accuracy",
+ "score": 0.9111111111111111,
+ "score_ci_high": 0.9555555555555556,
+ "score_ci_low": 0.8333333333333334,
+ "num_of_instances": 90
+ },
+ "score": 0.9696969696969697,
+ "score_name": "subsets_mean",
+ "num_of_instances": 990
+ },
+ "chatbot_abilities": {
+ "arena_hard_generation_english_gpt_4_0314_reference": {
+ "num_of_instances": 500,
+ "llama_3_70b_instruct_template_arena_hard": 0.21593291404612158,
+ "score": 0.21593291404612158,
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
+ },
+ "score": 0.21593291404612158,
+ "score_name": "subsets_mean",
+ "num_of_instances": 500
+ },
+ "entity_extraction": {
+ "universal_ner_en_ewt": {
+ "num_of_instances": 1000,
+ "f1_Person": 0.5372340425531915,
+ "f1_Organization": 0.39877300613496935,
+ "f1_Location": 0.4341085271317829,
+ "f1_macro": 0.4567051919399812,
+ "recall_macro": 0.41629941518296043,
+ "precision_macro": 0.5099857929442114,
+ "in_classes_support": 0.8463035019455253,
+ "f1_micro": 0.4273339749759385,
+ "recall_micro": 0.4228571428571429,
+ "precision_micro": 0.43190661478599224,
+ "score": 0.4273339749759385,
+ "score_name": "f1_micro",
+ "score_ci_low": 0.37217028010501046,
+ "score_ci_high": 0.4861504171897927,
+ "f1_micro_ci_low": 0.37217028010501046,
+ "f1_micro_ci_high": 0.4861504171897927
+ },
+ "score": 0.4273339749759385,
+ "score_name": "subsets_mean",
+ "num_of_instances": 1000
+ },
+ "knowledge": {
+ "mmlu_pro_biology": {
+ "accuracy": 0.5070422535211268,
+ "accuracy_ci_low": 0.39436619718309857,
+ "accuracy_ci_high": 0.6338028169014085,
+ "score_name": "accuracy",
+ "score": 0.5070422535211268,
+ "score_ci_high": 0.6338028169014085,
+ "score_ci_low": 0.39436619718309857,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_business": {
+ "accuracy": 0.28169014084507044,
+ "accuracy_ci_low": 0.18309859154929578,
+ "accuracy_ci_high": 0.39436619718309857,
+ "score_name": "accuracy",
+ "score": 0.28169014084507044,
+ "score_ci_high": 0.39436619718309857,
+ "score_ci_low": 0.18309859154929578,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_chemistry": {
+ "accuracy": 0.23943661971830985,
+ "accuracy_ci_low": 0.15492957746478872,
+ "accuracy_ci_high": 0.36619718309859156,
+ "score_name": "accuracy",
+ "score": 0.23943661971830985,
+ "score_ci_high": 0.36619718309859156,
+ "score_ci_low": 0.15492957746478872,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_computer_science": {
+ "accuracy": 0.4507042253521127,
+ "accuracy_ci_low": 0.3380281690140845,
+ "accuracy_ci_high": 0.5633802816901409,
+ "score_name": "accuracy",
+ "score": 0.4507042253521127,
+ "score_ci_high": 0.5633802816901409,
+ "score_ci_low": 0.3380281690140845,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_economics": {
+ "accuracy": 0.4788732394366197,
+ "accuracy_ci_low": 0.36619718309859156,
+ "accuracy_ci_high": 0.5915492957746479,
+ "score_name": "accuracy",
+ "score": 0.4788732394366197,
+ "score_ci_high": 0.5915492957746479,
+ "score_ci_low": 0.36619718309859156,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_engineering": {
+ "accuracy": 0.2535211267605634,
+ "accuracy_ci_low": 0.16901408450704225,
+ "accuracy_ci_high": 0.352112676056338,
+ "score_name": "accuracy",
+ "score": 0.2535211267605634,
+ "score_ci_high": 0.352112676056338,
+ "score_ci_low": 0.16901408450704225,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_health": {
+ "accuracy": 0.4507042253521127,
+ "accuracy_ci_low": 0.3380281690140845,
+ "accuracy_ci_high": 0.5633802816901409,
+ "score_name": "accuracy",
+ "score": 0.4507042253521127,
+ "score_ci_high": 0.5633802816901409,
+ "score_ci_low": 0.3380281690140845,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_history": {
+ "accuracy": 0.6901408450704225,
+ "accuracy_ci_low": 0.5774647887323944,
+ "accuracy_ci_high": 0.7887323943661971,
+ "score_name": "accuracy",
+ "score": 0.6901408450704225,
+ "score_ci_high": 0.7887323943661971,
+ "score_ci_low": 0.5774647887323944,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_law": {
+ "accuracy": 0.5211267605633803,
+ "accuracy_ci_low": 0.4084507042253521,
+ "accuracy_ci_high": 0.6338028169014085,
+ "score_name": "accuracy",
+ "score": 0.5211267605633803,
+ "score_ci_high": 0.6338028169014085,
+ "score_ci_low": 0.4084507042253521,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_math": {
+ "accuracy": 0.2112676056338028,
+ "accuracy_ci_low": 0.1267605633802817,
+ "accuracy_ci_high": 0.323943661971831,
+ "score_name": "accuracy",
+ "score": 0.2112676056338028,
+ "score_ci_high": 0.323943661971831,
+ "score_ci_low": 0.1267605633802817,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_other": {
+ "accuracy": 0.6056338028169014,
+ "accuracy_ci_low": 0.4788732394366197,
+ "accuracy_ci_high": 0.704225352112676,
+ "score_name": "accuracy",
+ "score": 0.6056338028169014,
+ "score_ci_high": 0.704225352112676,
+ "score_ci_low": 0.4788732394366197,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_philosophy": {
+ "accuracy": 0.5774647887323944,
+ "accuracy_ci_low": 0.4641445381497224,
+ "accuracy_ci_high": 0.6901408450704225,
+ "score_name": "accuracy",
+ "score": 0.5774647887323944,
+ "score_ci_high": 0.6901408450704225,
+ "score_ci_low": 0.4641445381497224,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_physics": {
+ "accuracy": 0.38028169014084506,
+ "accuracy_ci_low": 0.2676056338028169,
+ "accuracy_ci_high": 0.49295774647887325,
+ "score_name": "accuracy",
+ "score": 0.38028169014084506,
+ "score_ci_high": 0.49295774647887325,
+ "score_ci_low": 0.2676056338028169,
+ "num_of_instances": 71
+ },
+ "mmlu_pro_psychology": {
+ "accuracy": 0.6901408450704225,
+ "accuracy_ci_low": 0.5915492957746479,
+ "accuracy_ci_high": 0.7887323943661971,
+ "score_name": "accuracy",
+ "score": 0.6901408450704225,
+ "score_ci_high": 0.7887323943661971,
+ "score_ci_low": 0.5915492957746479,
+ "num_of_instances": 71
+ },
+ "score": 0.4527162977867203,
+ "score_name": "subsets_mean",
+ "num_of_instances": 994
+ },
+ "legal": {
+ "legalbench_abercrombie": {
+ "f1_macro": 0.6742790542790542,
+ "f1_suggestive": 0.5,
+ "f1_arbitrary": 0.7428571428571429,
+ "f1_generic": 0.7692307692307693,
+ "f1_fanciful": 0.7878787878787878,
+ "f1_descriptive": 0.5714285714285714,
+ "f1_macro_ci_low": 0.5753913879446501,
+ "f1_macro_ci_high": 0.7706997345400891,
+ "score_name": "f1_micro",
+ "score": 0.6753246753246753,
+ "score_ci_high": 0.7692307692307693,
+ "score_ci_low": 0.5714285714285714,
+ "num_of_instances": 85,
+ "accuracy": 0.611764705882353,
+ "accuracy_ci_low": 0.5058823529411764,
+ "accuracy_ci_high": 0.7176470588235294,
+ "f1_micro": 0.6753246753246753,
+ "f1_micro_ci_low": 0.5714285714285714,
+ "f1_micro_ci_high": 0.7692307692307693
+ },
+ "legalbench_corporate_lobbying": {
+ "f1_macro": 0.6702552592536235,
+ "f1_no": 0.8044280442804428,
+ "f1_yes": 0.5360824742268041,
+ "f1_macro_ci_low": 0.5910640615323238,
+ "f1_macro_ci_high": 0.7399413497413018,
+ "score_name": "f1_micro",
+ "score": 0.7336956521739131,
+ "score_ci_high": 0.7849589580408969,
+ "score_ci_low": 0.6703296703296703,
+ "num_of_instances": 200,
+ "accuracy": 0.675,
+ "accuracy_ci_low": 0.61,
+ "accuracy_ci_high": 0.7337815438953987,
+ "f1_micro": 0.7336956521739131,
+ "f1_micro_ci_low": 0.6703296703296703,
+ "f1_micro_ci_high": 0.7849589580408969
+ },
+ "legalbench_function_of_decision_section": {
+ "f1_macro": 0.2882009780664939,
+ "f1_conclusion": 0.0,
+ "f1_decree": 0.24242424242424243,
+ "f1_issue": 0.125,
+ "f1_analysis": 0.5066666666666667,
+ "f1_facts": 0.3404255319148936,
+ "f1_procedural history": 0.37735849056603776,
+ "f1_rule": 0.425531914893617,
+ "f1_macro_ci_low": 0.23171753915464377,
+ "f1_macro_ci_high": 0.354602644638753,
+ "score_name": "f1_micro",
+ "score": 0.31213872832369943,
+ "score_ci_high": 0.3852848273478974,
+ "score_ci_low": 0.2482597199380587,
+ "num_of_instances": 200,
+ "accuracy": 0.27,
+ "accuracy_ci_low": 0.21222780691215828,
+ "accuracy_ci_high": 0.335,
+ "f1_micro": 0.31213872832369943,
+ "f1_micro_ci_low": 0.2482597199380587,
+ "f1_micro_ci_high": 0.3852848273478974
+ },
+ "legalbench_international_citizenship_questions": {
+ "f1_macro": 0.5042439393058543,
+ "f1_yes": 0.5533980582524272,
+ "f1_no": 0.4550898203592814,
+ "f1_macro_ci_low": 0.4382241405188477,
+ "f1_macro_ci_high": 0.578009910646732,
+ "score_name": "f1_micro",
+ "score": 0.5093833780160858,
+ "score_ci_high": 0.5813333333333334,
+ "score_ci_low": 0.4408231981771538,
+ "num_of_instances": 200,
+ "accuracy": 0.475,
+ "accuracy_ci_low": 0.41,
+ "accuracy_ci_high": 0.545,
+ "f1_micro": 0.5093833780160858,
+ "f1_micro_ci_low": 0.4408231981771538,
+ "f1_micro_ci_high": 0.5813333333333334
+ },
+ "legalbench_proa": {
+ "f1_macro": 0.8536778693722258,
+ "f1_yes": 0.8157894736842105,
+ "f1_no": 0.891566265060241,
+ "f1_macro_ci_low": 0.7713419638520551,
+ "f1_macro_ci_high": 0.9104777945855288,
+ "score_name": "f1_micro",
+ "score": 0.8553459119496856,
+ "score_ci_high": 0.9116547658523262,
+ "score_ci_low": 0.774649129783561,
+ "num_of_instances": 85,
+ "accuracy": 0.8,
+ "accuracy_ci_low": 0.7058823529411765,
+ "accuracy_ci_high": 0.8705882352941177,
+ "f1_micro": 0.8553459119496856,
+ "f1_micro_ci_low": 0.774649129783561,
+ "f1_micro_ci_high": 0.9116547658523262
+ },
+ "score": 0.6171776691576119,
+ "score_name": "subsets_mean",
+ "num_of_instances": 770
+ },
+ "news_classification": {
+ "20_newsgroups_short": {
+ "f1_macro": 0.5860856127026939,
+ "f1_cars": 0.8367346938775511,
+ "f1_windows x": 0.058823529411764705,
+ "f1_computer graphics": 0.39416058394160586,
+ "f1_atheism": 0.3448275862068966,
+ "f1_religion": 0.2823529411764706,
+ "f1_medicine": 0.7560975609756098,
+ "f1_christianity": 0.723404255319149,
+ "f1_microsoft windows": 0.625,
+ "f1_middle east": 0.5294117647058824,
+ "f1_motorcycles": 0.68,
+ "f1_pc hardware": 0.6258503401360545,
+ "f1_mac hardware": 0.6666666666666666,
+ "f1_electronics": 0.5432098765432098,
+ "f1_for sale": 0.647887323943662,
+ "f1_guns": 0.42857142857142855,
+ "f1_space": 0.8316831683168316,
+ "f1_cryptography": 0.6301369863013698,
+ "f1_baseball": 0.9009009009009009,
+ "f1_hockey": 0.921875,
+ "f1_politics": 0.29411764705882354,
+ "f1_macro_ci_low": 0.5600698966074297,
+ "f1_macro_ci_high": 0.615183097196166,
+ "score_name": "f1_micro",
+ "score": 0.6065839179708581,
+ "score_ci_high": 0.6347185950732467,
+ "score_ci_low": 0.5756568591398169,
+ "num_of_instances": 1000,
+ "accuracy": 0.562,
+ "accuracy_ci_low": 0.53,
+ "accuracy_ci_high": 0.591,
+ "f1_micro": 0.6065839179708581,
+ "f1_micro_ci_low": 0.5756568591398169,
+ "f1_micro_ci_high": 0.6347185950732467
+ },
+ "score": 0.6065839179708581,
+ "score_name": "subsets_mean",
+ "num_of_instances": 1000
+ },
+ "product_help": {
+ "cfpb_product_2023": {
+ "f1_macro": 0.6779178973920781,
+ "f1_student loan": 0.55,
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9028400597907325,
+ "f1_debt collection": 0.6338797814207651,
+ "f1_checking or savings account": 0.7333333333333333,
+ "f1_mortgage": 0.8918918918918919,
+ "f1_payday loan or title loan or personal loan": 0.4444444444444444,
+ "f1_credit card or prepaid card": 0.7375886524822695,
+ "f1_money transfer or virtual currency or money service": 0.6190476190476191,
+ "f1_vehicle loan or lease": 0.5882352941176471,
+ "f1_macro_ci_low": 0.6171314549566215,
+ "f1_macro_ci_high": 0.7524963184778365,
+ "score_name": "f1_micro",
+ "score": 0.8364941055868785,
+ "score_ci_high": 0.8567598395419967,
+ "score_ci_low": 0.8142490729602508,
+ "num_of_instances": 1000,
+ "accuracy": 0.816,
+ "accuracy_ci_low": 0.792,
+ "accuracy_ci_high": 0.838,
+ "f1_micro": 0.8364941055868785,
+ "f1_micro_ci_low": 0.8142490729602508,
+ "f1_micro_ci_high": 0.8567598395419967
+ },
+ "cfpb_product_watsonx": {
+ "f1_macro": 0.765772984914942,
+ "f1_mortgages and loans": 0.8022598870056498,
+ "f1_credit card": 0.7804878048780488,
+ "f1_debt collection": 0.6633663366336634,
+ "f1_retail banking": 0.8258064516129032,
+ "f1_credit reporting": 0.7569444444444444,
+ "f1_macro_ci_low": 0.7299746635884687,
+ "f1_macro_ci_high": 0.8024109976234067,
+ "score_name": "f1_micro",
+ "score": 0.7606490872210954,
+ "score_ci_high": 0.7966089405987493,
+ "score_ci_low": 0.7235772357723578,
+ "num_of_instances": 500,
+ "accuracy": 0.75,
+ "accuracy_ci_low": 0.712,
+ "accuracy_ci_high": 0.786,
+ "f1_micro": 0.7606490872210954,
+ "f1_micro_ci_low": 0.7235772357723578,
+ "f1_micro_ci_high": 0.7966089405987493
+ },
+ "score": 0.798571596403987,
+ "score_name": "subsets_mean",
+ "num_of_instances": 1500
+ },
+ "qa_finance": {
+ "fin_qa": {
+ "num_of_instances": 1000,
+ "program_accuracy": 0.286,
+ "score": 0.286,
+ "score_name": "program_accuracy",
+ "execution_accuracy": 0.269,
+ "program_accuracy_ci_low": 0.258,
+ "program_accuracy_ci_high": 0.31669793838493693,
+ "score_ci_low": 0.258,
+ "score_ci_high": 0.31669793838493693,
+ "execution_accuracy_ci_low": 0.24,
+ "execution_accuracy_ci_high": 0.296
+ },
+ "score": 0.286,
+ "score_name": "subsets_mean",
+ "num_of_instances": 1000
+ },
+ "rag_general": {
+ "rag_response_generation_clapnq": {
+ "precision": 0.36599613818156607,
+ "recall": 0.5557501311367539,
+ "f1": 0.37319564588260945,
+ "precision_ci_low": 0.3442558968534392,
+ "precision_ci_high": 0.3883241948512314,
+ "recall_ci_low": 0.5362096092778822,
+ "recall_ci_high": 0.5736677217848365,
+ "f1_ci_low": 0.35489262694728296,
+ "f1_ci_high": 0.3908912014863383,
+ "score_name": "f1",
+ "score": 0.37319564588260945,
+ "score_ci_high": 0.3908912014863383,
+ "score_ci_low": 0.35489262694728296,
+ "num_of_instances": 600,
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6236683922012647,
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6875705857078235,
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5926454355816047,
+ "faithfullness_f1_token_overlap": 0.29589250789383037,
+ "faithfullness_recall_token_overlap": 0.23146224223124676,
+ "faithfullness_precision_token_overlap": 0.577646149691817,
+ "correctness_f1_token_overlap": 0.37319564588260945,
+ "correctness_recall_token_overlap": 0.5557501311367539,
+ "correctness_precision_token_overlap": 0.36599613818156607
+ },
+ "score": 0.37319564588260945,
+ "score_name": "subsets_mean",
+ "num_of_instances": 600
+ },
+ "reasoning": {
+ "hellaswag": {
+ "accuracy": 0.543,
+ "accuracy_ci_low": 0.509,
+ "accuracy_ci_high": 0.5724275235427522,
+ "score_name": "accuracy",
+ "score": 0.543,
+ "score_ci_high": 0.5724275235427522,
+ "score_ci_low": 0.509,
+ "num_of_instances": 1000
+ },
+ "openbook_qa": {
+ "accuracy": 0.946,
+ "accuracy_ci_low": 0.924,
+ "accuracy_ci_high": 0.964,
+ "score_name": "accuracy",
+ "score": 0.946,
+ "score_ci_high": 0.964,
+ "score_ci_low": 0.924,
+ "num_of_instances": 500
+ },
+ "score": 0.7444999999999999,
+ "score_name": "subsets_mean",
+ "num_of_instances": 1500
+ },
+ "safety": {
+ "attaq_500": {
+ "severity_critical": 3.0,
+ "severity_high": 0.0,
+ "severity_medium": 3.0,
+ "severity_low": 94.0,
+ "category_harmful_info": 0.7736262833137832,
+ "category_deception": 0.804969501088966,
+ "category_explicit_content": 0.8485184834159984,
+ "category_violence": 0.8188645232764501,
+ "category_discrimination": 0.812319999649411,
+ "category_substance_abuse": 0.8757585287094116,
+ "category_pii": 0.7689538912640678,
+ "safety": 0.8088205639190145,
+ "safety_ci_low": 0.7726942773816009,
+ "safety_ci_high": 0.8354766223203407,
+ "score_name": "safety",
+ "score": 0.8088205639190145,
+ "score_ci_high": 0.8354766223203407,
+ "score_ci_low": 0.7726942773816009,
+ "num_of_instances": 100
+ },
+ "score": 0.8088205639190145,
+ "score_name": "subsets_mean",
+ "num_of_instances": 100
+ },
+ "summarization": {
+ "billsum_document_filtered_to_6000_chars": {
+ "num_of_instances": 528,
+ "rougeL": 0.2851032294311037,
+ "score": 0.2851032294311037,
+ "score_name": "rougeL",
+ "rouge1": 0.4123501725725848,
+ "rouge2": 0.19895954851239922,
+ "rougeLsum": 0.35125917517726385,
+ "rougeL_ci_low": 0.2779227330572199,
+ "rougeL_ci_high": 0.2914680129199687,
+ "score_ci_low": 0.2779227330572199,
+ "score_ci_high": 0.2914680129199687,
+ "rouge1_ci_low": 0.40253840366855864,
+ "rouge1_ci_high": 0.42032838645745946,
+ "rouge2_ci_low": 0.19202884338408688,
+ "rouge2_ci_high": 0.20527522963546996,
+ "rougeLsum_ci_low": 0.341974836500192,
+ "rougeLsum_ci_high": 0.35869934507733836
+ },
+ "tldr_document_filtered_to_6000_chars": {
+ "num_of_instances": 1000,
+ "rougeL": 0.08701215343252551,
+ "score": 0.08701215343252551,
+ "score_name": "rougeL",
+ "rouge1": 0.11996161951194893,
+ "rouge2": 0.0178490095861687,
+ "rougeLsum": 0.100117177086591,
+ "rougeL_ci_low": 0.08292268727915428,
+ "rougeL_ci_high": 0.09048235705877403,
+ "score_ci_low": 0.08292268727915428,
+ "score_ci_high": 0.09048235705877403,
+ "rouge1_ci_low": 0.11470157824157777,
+ "rouge1_ci_high": 0.12484791019703942,
+ "rouge2_ci_low": 0.01588885277761203,
+ "rouge2_ci_high": 0.019912308568393905,
+ "rougeLsum_ci_low": 0.0954430567472731,
+ "rougeLsum_ci_high": 0.10391423395753559
+ },
+ "score": 0.1860576914318146,
+ "score_name": "subsets_mean",
+ "num_of_instances": 1528
+ },
+ "translation": {
+ "mt_flores_101_ara_eng": {
+ "num_of_instances": 66,
+ "counts": [
+ 1314,
+ 768,
+ 485,
+ 319
+ ],
+ "totals": [
+ 4679,
+ 4613,
+ 4547,
+ 4481
+ ],
+ "precisions": [
+ 0.2808292370164565,
+ 0.16648601777585084,
+ 0.10666373433032769,
+ 0.0711894666369114
+ ],
+ "bp": 1.0,
+ "sys_len": 4679,
+ "ref_len": 1734,
+ "sacrebleu": 0.13726605497706998,
+ "score": 0.13726605497706998,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.11130071390055578,
+ "score_ci_high": 0.15417215121184147,
+ "sacrebleu_ci_low": 0.11130071390055578,
+ "sacrebleu_ci_high": 0.15417215121184147
+ },
+ "mt_flores_101_deu_eng": {
+ "num_of_instances": 66,
+ "counts": [
+ 1281,
+ 774,
+ 505,
+ 334
+ ],
+ "totals": [
+ 4331,
+ 4265,
+ 4199,
+ 4133
+ ],
+ "precisions": [
+ 0.29577464788732394,
+ 0.18147713950762018,
+ 0.12026673017385091,
+ 0.08081296878780547
+ ],
+ "bp": 1.0,
+ "sys_len": 4331,
+ "ref_len": 1734,
+ "sacrebleu": 0.15113058347637284,
+ "score": 0.15113058347637284,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.12621064656698186,
+ "score_ci_high": 0.17668499331479298,
+ "sacrebleu_ci_low": 0.12621064656698186,
+ "sacrebleu_ci_high": 0.17668499331479298
+ },
+ "mt_flores_101_eng_ara": {
+ "num_of_instances": 66,
+ "counts": [
+ 826,
+ 368,
+ 191,
+ 101
+ ],
+ "totals": [
+ 7562,
+ 7496,
+ 7430,
+ 7364
+ ],
+ "precisions": [
+ 0.10923036233800582,
+ 0.04909284951974386,
+ 0.02570659488559892,
+ 0.013715372080391093
+ ],
+ "bp": 1.0,
+ "sys_len": 7562,
+ "ref_len": 1589,
+ "sacrebleu": 0.03708117101677291,
+ "score": 0.03708117101677291,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.02919214198062126,
+ "score_ci_high": 0.04738077615243459,
+ "sacrebleu_ci_low": 0.02919214198062126,
+ "sacrebleu_ci_high": 0.04738077615243459
+ },
+ "mt_flores_101_eng_deu": {
+ "num_of_instances": 66,
+ "counts": [
+ 1265,
+ 701,
+ 451,
+ 286
+ ],
+ "totals": [
+ 6926,
+ 6860,
+ 6794,
+ 6728
+ ],
+ "precisions": [
+ 0.18264510539994222,
+ 0.1021865889212828,
+ 0.06638210185457756,
+ 0.0425089179548157
+ ],
+ "bp": 1.0,
+ "sys_len": 6926,
+ "ref_len": 1835,
+ "sacrebleu": 0.08518894650805668,
+ "score": 0.08518894650805668,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.0687650483762349,
+ "score_ci_high": 0.10261212511583902,
+ "sacrebleu_ci_low": 0.0687650483762349,
+ "sacrebleu_ci_high": 0.10261212511583902
+ },
+ "mt_flores_101_eng_fra": {
+ "num_of_instances": 66,
+ "counts": [
+ 1521,
+ 1010,
+ 718,
+ 531
+ ],
+ "totals": [
+ 6656,
+ 6590,
+ 6524,
+ 6458
+ ],
+ "precisions": [
+ 0.228515625,
+ 0.15326251896813353,
+ 0.11005518087063151,
+ 0.08222359863734903
+ ],
+ "bp": 1.0,
+ "sys_len": 6656,
+ "ref_len": 2068,
+ "sacrebleu": 0.13342576700837747,
+ "score": 0.13342576700837747,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.11254831150471772,
+ "score_ci_high": 0.15689913123914057,
+ "sacrebleu_ci_low": 0.11254831150471772,
+ "sacrebleu_ci_high": 0.15689913123914057
+ },
+ "mt_flores_101_eng_kor": {
+ "num_of_instances": 66,
+ "counts": [
+ 1418,
+ 652,
+ 346,
+ 201
+ ],
+ "totals": [
+ 9112,
+ 9046,
+ 8980,
+ 8914
+ ],
+ "precisions": [
+ 0.15561896400351186,
+ 0.07207605571523326,
+ 0.038530066815144766,
+ 0.022548799641014135
+ ],
+ "bp": 1.0,
+ "sys_len": 9112,
+ "ref_len": 2235,
+ "sacrebleu": 0.05587199521216053,
+ "score": 0.05587199521216053,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.048601675175909714,
+ "score_ci_high": 0.06817084289821868,
+ "sacrebleu_ci_low": 0.048601675175909714,
+ "sacrebleu_ci_high": 0.06817084289821868
+ },
+ "mt_flores_101_eng_por": {
+ "num_of_instances": 66,
+ "counts": [
+ 1469,
+ 961,
+ 685,
+ 498
+ ],
+ "totals": [
+ 5975,
+ 5909,
+ 5843,
+ 5777
+ ],
+ "precisions": [
+ 0.24585774058577406,
+ 0.16263327128109661,
+ 0.11723429744994011,
+ 0.08620391206508568
+ ],
+ "bp": 1.0,
+ "sys_len": 5975,
+ "ref_len": 1916,
+ "sacrebleu": 0.141781228818647,
+ "score": 0.141781228818647,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.11599286268316018,
+ "score_ci_high": 0.16818798188781423,
+ "sacrebleu_ci_low": 0.11599286268316018,
+ "sacrebleu_ci_high": 0.16818798188781423
+ },
+ "mt_flores_101_eng_ron": {
+ "num_of_instances": 66,
+ "counts": [
+ 1355,
+ 840,
+ 572,
+ 378
+ ],
+ "totals": [
+ 5226,
+ 5160,
+ 5094,
+ 5028
+ ],
+ "precisions": [
+ 0.2592805204745503,
+ 0.16279069767441862,
+ 0.11228896741264233,
+ 0.07517899761336516
+ ],
+ "bp": 1.0,
+ "sys_len": 5226,
+ "ref_len": 1949,
+ "sacrebleu": 0.13739099644279126,
+ "score": 0.13739099644279126,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.11645720976845897,
+ "score_ci_high": 0.15881392834378366,
+ "sacrebleu_ci_low": 0.11645720976845897,
+ "sacrebleu_ci_high": 0.15881392834378366
+ },
+ "mt_flores_101_eng_spa": {
+ "num_of_instances": 66,
+ "counts": [
+ 1403,
+ 733,
+ 444,
+ 271
+ ],
+ "totals": [
+ 6635,
+ 6569,
+ 6503,
+ 6437
+ ],
+ "precisions": [
+ 0.21145440844009045,
+ 0.11158471609072919,
+ 0.06827618022451176,
+ 0.04210035730930557
+ ],
+ "bp": 1.0,
+ "sys_len": 6635,
+ "ref_len": 2098,
+ "sacrebleu": 0.09074947340353524,
+ "score": 0.09074947340353524,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.07752359705897967,
+ "score_ci_high": 0.10595879813792543,
+ "sacrebleu_ci_low": 0.07752359705897967,
+ "sacrebleu_ci_high": 0.10595879813792543
+ },
+ "mt_flores_101_fra_eng": {
+ "num_of_instances": 66,
+ "counts": [
+ 1306,
+ 794,
+ 517,
+ 350
+ ],
+ "totals": [
+ 4479,
+ 4413,
+ 4347,
+ 4281
+ ],
+ "precisions": [
+ 0.2915829426211208,
+ 0.17992295490595966,
+ 0.11893259719346676,
+ 0.0817565989254847
+ ],
+ "bp": 1.0,
+ "sys_len": 4479,
+ "ref_len": 1734,
+ "sacrebleu": 0.1502858535414232,
+ "score": 0.1502858535414232,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.12789422451472196,
+ "score_ci_high": 0.17007936865047835,
+ "sacrebleu_ci_low": 0.12789422451472196,
+ "sacrebleu_ci_high": 0.17007936865047835
+ },
+ "mt_flores_101_jpn_eng": {
+ "num_of_instances": 66,
+ "counts": [
+ 1178,
+ 573,
+ 324,
+ 186
+ ],
+ "totals": [
+ 5118,
+ 5052,
+ 4986,
+ 4920
+ ],
+ "precisions": [
+ 0.23016803438843297,
+ 0.11342042755344417,
+ 0.06498194945848375,
+ 0.03780487804878049
+ ],
+ "bp": 1.0,
+ "sys_len": 5118,
+ "ref_len": 1734,
+ "sacrebleu": 0.08948890965648465,
+ "score": 0.08948890965648465,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.0760587558942395,
+ "score_ci_high": 0.10824221229872567,
+ "sacrebleu_ci_low": 0.0760587558942395,
+ "sacrebleu_ci_high": 0.10824221229872567
+ },
+ "mt_flores_101_kor_eng": {
+ "num_of_instances": 66,
+ "counts": [
+ 1164,
+ 556,
+ 307,
+ 182
+ ],
+ "totals": [
+ 5014,
+ 4948,
+ 4882,
+ 4816
+ ],
+ "precisions": [
+ 0.23214998005584364,
+ 0.11236863379143087,
+ 0.06288406390823434,
+ 0.0377906976744186
+ ],
+ "bp": 1.0,
+ "sys_len": 5014,
+ "ref_len": 1734,
+ "sacrebleu": 0.08873292857251397,
+ "score": 0.08873292857251397,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.07691044423007594,
+ "score_ci_high": 0.10654488317878491,
+ "sacrebleu_ci_low": 0.07691044423007594,
+ "sacrebleu_ci_high": 0.10654488317878491
+ },
+ "mt_flores_101_por_eng": {
+ "num_of_instances": 66,
+ "counts": [
+ 1333,
+ 846,
+ 580,
+ 415
+ ],
+ "totals": [
+ 5066,
+ 5000,
+ 4934,
+ 4868
+ ],
+ "precisions": [
+ 0.2631267272009475,
+ 0.16920000000000002,
+ 0.11755168220510742,
+ 0.0852506162695152
+ ],
+ "bp": 1.0,
+ "sys_len": 5066,
+ "ref_len": 1734,
+ "sacrebleu": 0.14533590675842165,
+ "score": 0.14533590675842165,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.12391870492274426,
+ "score_ci_high": 0.16749053357807966,
+ "sacrebleu_ci_low": 0.12391870492274426,
+ "sacrebleu_ci_high": 0.16749053357807966
+ },
+ "mt_flores_101_ron_eng": {
+ "num_of_instances": 66,
+ "counts": [
+ 1355,
+ 846,
+ 557,
+ 374
+ ],
+ "totals": [
+ 5005,
+ 4939,
+ 4873,
+ 4807
+ ],
+ "precisions": [
+ 0.2707292707292707,
+ 0.17128973476412232,
+ 0.11430330391955675,
+ 0.07780320366132723
+ ],
+ "bp": 1.0,
+ "sys_len": 5005,
+ "ref_len": 1734,
+ "sacrebleu": 0.14250519722396413,
+ "score": 0.14250519722396413,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.12789330310428798,
+ "score_ci_high": 0.16557053049469844,
+ "sacrebleu_ci_low": 0.12789330310428798,
+ "sacrebleu_ci_high": 0.16557053049469844
+ },
+ "mt_flores_101_spa_eng": {
+ "num_of_instances": 66,
+ "counts": [
+ 1237,
+ 664,
+ 407,
+ 253
+ ],
+ "totals": [
+ 4616,
+ 4550,
+ 4484,
+ 4418
+ ],
+ "precisions": [
+ 0.26798093587521665,
+ 0.14593406593406594,
+ 0.09076717216770741,
+ 0.057265731100045264
+ ],
+ "bp": 1.0,
+ "sys_len": 4616,
+ "ref_len": 1734,
+ "sacrebleu": 0.11940459758039601,
+ "score": 0.11940459758039601,
+ "score_name": "sacrebleu",
+ "score_ci_low": 0.10463458061774036,
+ "score_ci_high": 0.13275807402507517,
+ "sacrebleu_ci_low": 0.10463458061774036,
+ "sacrebleu_ci_high": 0.13275807402507517
+ },
+ "score": 0.11370930734646584,
+ "score_name": "subsets_mean",
+ "num_of_instances": 990
+ },
+ "score": 0.5077151191244701,
+ "score_name": "subsets_mean",
+ "num_of_instances": 12472
+ }
+ }