jbnayahu committed on
Commit 5cbd691 · unverified · 1 Parent(s): 0201c01

gpt results

Signed-off-by: Jonathan Bnayahu <[email protected]>
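
Both result files record the exact evaluation command under environment_info.command_line_invocation; for reference, the first run (gpt-4.1-ncf) was invoked as shown below, and the second run differs only in model_name (azure/Azure/gpt-4.1-mini-ncf):

    unitxt-evaluate \
      --tasks benchmarks.bluebench \
      --model cross_provider \
      --model_args model_name=azure/Azure/gpt-4.1-ncf,max_tokens=1024 \
      --output_path ./results/bluebench \
      --log_samples \
      --trust_remote_code \
      --batch_size 8 \
      --verbosity ERROR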

results/bluebench/2025-08-03T08-32-43_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T12:32:38.916038Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/gpt-4.1-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/gpt-4.1-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.8888888888888888,
260
+ "accuracy_ci_low": 0.5310928992288233,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 0.8888888888888888,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 0.5310928992288233,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.8888888888888888,
270
+ "accuracy_ci_low": 0.5555555555555556,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 0.8888888888888888,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 0.5555555555555556,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9797979797979798,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9636363636363636,
296
+ "score": 0.9636363636363636,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.9636363636363636,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.8085106382978724,
307
+ "f1_Organization": 0.6857142857142857,
308
+ "f1_Location": 0.6956521739130435,
309
+ "f1_macro": 0.7299590326417338,
310
+ "recall_macro": 0.7832988267770876,
311
+ "precision_macro": 0.6967893217893218,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7239263803680981,
314
+ "recall_micro": 0.7866666666666666,
315
+ "precision_micro": 0.6704545454545454,
316
+ "score": 0.7239263803680981,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.6420029997196985,
319
+ "score_ci_high": 0.782066798498023,
320
+ "f1_micro_ci_low": 0.6420029997196985,
321
+ "f1_micro_ci_high": 0.782066798498023
322
+ },
323
+ "score": 0.7239263803680981,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.42857142857142855,
340
+ "accuracy_ci_low": 0.14285714285714285,
341
+ "accuracy_ci_high": 0.8571428571428571,
342
+ "score_name": "accuracy",
343
+ "score": 0.42857142857142855,
344
+ "score_ci_high": 0.8571428571428571,
345
+ "score_ci_low": 0.14285714285714285,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2857142857142857,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.7142857142857143,
352
+ "score_name": "accuracy",
353
+ "score": 0.2857142857142857,
354
+ "score_ci_high": 0.7142857142857143,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 1.0,
360
+ "accuracy_ci_low": 1.0,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 1.0,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 1.0,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.8571428571428571,
370
+ "accuracy_ci_low": 0.42857142857142855,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.8571428571428571,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.42857142857142855,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.42857142857142855,
380
+ "accuracy_ci_low": 0.14285714285714285,
381
+ "accuracy_ci_high": 0.8571428571428571,
382
+ "score_name": "accuracy",
383
+ "score": 0.42857142857142855,
384
+ "score_ci_high": 0.8571428571428571,
385
+ "score_ci_low": 0.14285714285714285,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
+ "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.5714285714285714,
420
+ "accuracy_ci_low": 0.14285714285714285,
421
+ "accuracy_ci_high": 0.8571428571428571,
422
+ "score_name": "accuracy",
423
+ "score": 0.5714285714285714,
424
+ "score_ci_high": 0.8571428571428571,
425
+ "score_ci_low": 0.14285714285714285,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.5714285714285714,
430
+ "accuracy_ci_low": 0.14285714285714285,
431
+ "accuracy_ci_high": 0.8571428571428571,
432
+ "score_name": "accuracy",
433
+ "score": 0.5714285714285714,
434
+ "score_ci_high": 0.8571428571428571,
435
+ "score_ci_low": 0.14285714285714285,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.8571428571428571,
440
+ "accuracy_ci_low": 0.31927964061584246,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.8571428571428571,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.31927964061584246,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.14285714285714285,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.5714285714285714,
452
+ "score_name": "accuracy",
453
+ "score": 0.14285714285714285,
454
+ "score_ci_high": 0.5714285714285714,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.5816326530612245,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.503030303030303,
475
+ "f1_suggestive": 0.18181818181818182,
476
+ "f1_generic": 0.8,
477
+ "f1_fanciful": 0.4,
478
+ "f1_descriptive": 0.3333333333333333,
479
+ "f1_arbitrary": 0.8,
480
+ "f1_macro_ci_low": 0.3144226612853847,
481
+ "f1_macro_ci_high": 0.76,
482
+ "score_name": "f1_micro",
483
+ "score": 0.4375,
484
+ "score_ci_high": 0.6666666666666666,
485
+ "score_ci_low": 0.21080178633741004,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.35,
488
+ "accuracy_ci_low": 0.15,
489
+ "accuracy_ci_high": 0.6,
490
+ "f1_micro": 0.4375,
491
+ "f1_micro_ci_low": 0.21080178633741004,
492
+ "f1_micro_ci_high": 0.6666666666666666
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.6360153256704981,
496
+ "f1_no": 0.8275862068965517,
497
+ "f1_yes": 0.4444444444444444,
498
+ "f1_macro_ci_low": 0.4117647058823529,
499
+ "f1_macro_ci_high": 0.8932752204410275,
500
+ "score_name": "f1_micro",
501
+ "score": 0.7368421052631579,
502
+ "score_ci_high": 0.8947368421052632,
503
+ "score_ci_low": 0.5263157894736842,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.7,
506
+ "accuracy_ci_low": 0.5,
507
+ "accuracy_ci_high": 0.85,
508
+ "f1_micro": 0.7368421052631579,
509
+ "f1_micro_ci_low": 0.5263157894736842,
510
+ "f1_micro_ci_high": 0.8947368421052632
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.24591836734693878,
514
+ "f1_conclusion": 0.25,
515
+ "f1_issue": 0.4,
516
+ "f1_decree": 0.0,
517
+ "f1_rule": 0.0,
518
+ "f1_analysis": 0.0,
519
+ "f1_facts": 0.5714285714285714,
520
+ "f1_procedural history": 0.5,
521
+ "f1_macro_ci_low": 0.06687444590451443,
522
+ "f1_macro_ci_high": 0.4222222222222222,
523
+ "score_name": "f1_micro",
524
+ "score": 0.3157894736842105,
525
+ "score_ci_high": 0.5405405405405406,
526
+ "score_ci_low": 0.11428571428571428,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.3,
529
+ "accuracy_ci_low": 0.15,
530
+ "accuracy_ci_high": 0.55,
531
+ "f1_micro": 0.3157894736842105,
532
+ "f1_micro_ci_low": 0.11428571428571428,
533
+ "f1_micro_ci_high": 0.5405405405405406
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.7,
537
+ "f1_yes": 0.7,
538
+ "f1_no": 0.7,
539
+ "f1_macro_ci_low": 0.4949494949494949,
540
+ "f1_macro_ci_high": 0.898989898989899,
541
+ "score_name": "f1_micro",
542
+ "score": 0.7,
543
+ "score_ci_high": 0.859273262592211,
544
+ "score_ci_low": 0.5,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.7,
547
+ "accuracy_ci_low": 0.5,
548
+ "accuracy_ci_high": 0.859273262592211,
549
+ "f1_micro": 0.7,
550
+ "f1_micro_ci_low": 0.5,
551
+ "f1_micro_ci_high": 0.859273262592211
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8375,
555
+ "f1_yes": 0.875,
556
+ "f1_no": 0.8,
557
+ "f1_macro_ci_low": 0.6112456731982998,
558
+ "f1_macro_ci_high": 0.9449275362318841,
559
+ "score_name": "f1_micro",
560
+ "score": 0.8333333333333334,
561
+ "score_ci_high": 0.9473684210526315,
562
+ "score_ci_low": 0.6076349233925447,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.75,
565
+ "accuracy_ci_low": 0.50468235519016,
566
+ "accuracy_ci_high": 0.9,
567
+ "f1_micro": 0.8333333333333334,
568
+ "f1_micro_ci_low": 0.6076349233925447,
569
+ "f1_micro_ci_high": 0.9473684210526315
570
+ },
571
+ "score": 0.6046929824561403,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6248042235542235,
578
+ "f1_cars": 0.7272727272727273,
579
+ "f1_windows x": 0.5714285714285714,
580
+ "f1_computer graphics": 0.625,
581
+ "f1_atheism": 0.5,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 1.0,
584
+ "f1_christianity": 0.8571428571428571,
585
+ "f1_microsoft windows": 0.8,
586
+ "f1_middle east": 0.7272727272727273,
587
+ "f1_motorcycles": 0.6,
588
+ "f1_pc hardware": 0.6666666666666666,
589
+ "f1_mac hardware": 0.9090909090909091,
590
+ "f1_electronics": 0.5,
591
+ "f1_for sale": 0.4,
592
+ "f1_guns": 0.2857142857142857,
593
+ "f1_space": 0.8888888888888888,
594
+ "f1_cryptography": 0.3333333333333333,
595
+ "f1_baseball": 0.6,
596
+ "f1_hockey": 0.8888888888888888,
597
+ "f1_politics": 0.6153846153846154,
598
+ "f1_macro_ci_low": 0.5435598724209302,
599
+ "f1_macro_ci_high": 0.727169076447674,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6629213483146067,
602
+ "score_ci_high": 0.7431693989071039,
603
+ "score_ci_low": 0.561198821408691,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.59,
606
+ "accuracy_ci_low": 0.49,
607
+ "accuracy_ci_high": 0.68,
608
+ "f1_micro": 0.6629213483146067,
609
+ "f1_micro_ci_low": 0.561198821408691,
610
+ "f1_micro_ci_high": 0.7431693989071039
611
+ },
612
+ "score": 0.6629213483146067,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7315018315018316,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8923076923076924,
620
+ "f1_debt collection": 0.6666666666666666,
621
+ "f1_payday loan or title loan or personal loan": 0.0,
622
+ "f1_student loan": 0.7692307692307693,
623
+ "f1_credit card or prepaid card": 0.8571428571428571,
624
+ "f1_checking or savings account": 1.0,
625
+ "f1_mortgage": 0.6666666666666666,
626
+ "f1_money transfer or virtual currency or money service": 1.0,
627
+ "f1_macro_ci_low": 0.5341663901608891,
628
+ "f1_macro_ci_high": 0.8376311452519075,
629
+ "score_name": "f1_micro",
630
+ "score": 0.8586387434554974,
631
+ "score_ci_high": 0.916233746693676,
632
+ "score_ci_low": 0.783068783068783,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.82,
635
+ "accuracy_ci_low": 0.74,
636
+ "accuracy_ci_high": 0.89,
637
+ "f1_micro": 0.8586387434554974,
638
+ "f1_micro_ci_low": 0.783068783068783,
639
+ "f1_micro_ci_high": 0.916233746693676
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.799064551009631,
643
+ "f1_mortgages and loans": 0.782608695652174,
644
+ "f1_credit card": 0.8571428571428571,
645
+ "f1_debt collection": 0.7368421052631579,
646
+ "f1_credit reporting": 0.6956521739130435,
647
+ "f1_retail banking": 0.9230769230769231,
648
+ "f1_macro_ci_low": 0.6817947520200675,
649
+ "f1_macro_ci_high": 0.8954471115753557,
650
+ "score_name": "f1_micro",
651
+ "score": 0.7878787878787878,
652
+ "score_ci_high": 0.88,
653
+ "score_ci_low": 0.66,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.78,
656
+ "accuracy_ci_low": 0.66,
657
+ "accuracy_ci_high": 0.88,
658
+ "f1_micro": 0.7878787878787878,
659
+ "f1_micro_ci_low": 0.66,
660
+ "f1_micro_ci_high": 0.88
661
+ },
662
+ "score": 0.8232587656671426,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.32,
670
+ "program_accuracy": 0.33,
671
+ "score": 0.33,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.23,
674
+ "execution_accuracy_ci_high": 0.42,
675
+ "program_accuracy_ci_low": 0.25,
676
+ "program_accuracy_ci_high": 0.43,
677
+ "score_ci_low": 0.25,
678
+ "score_ci_high": 0.43
679
+ },
680
+ "score": 0.33,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.4343698865384216,
687
+ "recall": 0.6709870742364467,
688
+ "f1": 0.4846153577687695,
689
+ "precision_ci_low": 0.3998020013123197,
690
+ "precision_ci_high": 0.4698068976247165,
691
+ "recall_ci_low": 0.6303563546991966,
692
+ "recall_ci_high": 0.7089929114321589,
693
+ "f1_ci_low": 0.4526567482268106,
694
+ "f1_ci_high": 0.5146388287292981,
695
+ "score_name": "f1",
696
+ "score": 0.4846153577687695,
697
+ "score_ci_high": 0.5146388287292981,
698
+ "score_ci_low": 0.4526567482268106,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6816095349192619,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.729021328985691,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6500837489962578,
703
+ "faithfullness_f1_token_overlap": 0.43231085895541577,
704
+ "faithfullness_recall_token_overlap": 0.3622151601642728,
705
+ "faithfullness_precision_token_overlap": 0.6636443305066012,
706
+ "correctness_f1_token_overlap": 0.4846153577687695,
707
+ "correctness_recall_token_overlap": 0.6709870742364467,
708
+ "correctness_precision_token_overlap": 0.4343698865384216
709
+ },
710
+ "score": 0.4846153577687695,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.61,
717
+ "accuracy_ci_low": 0.51,
718
+ "accuracy_ci_high": 0.7,
719
+ "score_name": "accuracy",
720
+ "score": 0.61,
721
+ "score_ci_high": 0.7,
722
+ "score_ci_low": 0.51,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.94,
727
+ "accuracy_ci_low": 0.87,
728
+ "accuracy_ci_high": 0.98,
729
+ "score_name": "accuracy",
730
+ "score": 0.94,
731
+ "score_ci_high": 0.98,
732
+ "score_ci_low": 0.87,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.7749999999999999,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 0.0,
742
+ "severity_high": 0.0,
743
+ "severity_medium": 0.0,
744
+ "severity_low": 100.0,
745
+ "category_harmful_info": 0.875045096996935,
746
+ "category_deception": 0.9087016443260992,
747
+ "category_explicit_content": 0.9104775799946352,
748
+ "category_violence": 0.866230480028649,
749
+ "category_discrimination": 0.8078865560274275,
750
+ "category_substance_abuse": 0.8765388902690676,
751
+ "category_pii": 0.8728977072018164,
752
+ "safety": 0.8748689444958777,
753
+ "safety_ci_low": 0.859557728046316,
754
+ "safety_ci_high": 0.8896609433147404,
755
+ "score_name": "safety",
756
+ "score": 0.8748689444958777,
757
+ "score_ci_high": 0.8896609433147404,
758
+ "score_ci_low": 0.859557728046316,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8748689444958777,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rougeLsum": 0.3449679916101768,
769
+ "rouge2": 0.17691065019354774,
770
+ "rougeL": 0.2755019690756654,
771
+ "score": 0.2755019690756654,
772
+ "score_name": "rougeL",
773
+ "rouge1": 0.40866416882835144,
774
+ "rougeLsum_ci_low": 0.32452422336075787,
775
+ "rougeLsum_ci_high": 0.362432692973816,
776
+ "rouge2_ci_low": 0.16371569781722647,
777
+ "rouge2_ci_high": 0.19131565638937167,
778
+ "rougeL_ci_low": 0.260558384013039,
779
+ "rougeL_ci_high": 0.2917913421726095,
780
+ "score_ci_low": 0.260558384013039,
781
+ "score_ci_high": 0.2917913421726095,
782
+ "rouge1_ci_low": 0.3860176168688086,
783
+ "rouge1_ci_high": 0.42632106299588085
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rougeLsum": 0.09942588005347286,
788
+ "rouge2": 0.015180143919691523,
789
+ "rougeL": 0.08942312196371219,
790
+ "score": 0.08942312196371219,
791
+ "score_name": "rougeL",
792
+ "rouge1": 0.1194115419368174,
793
+ "rougeLsum_ci_low": 0.0862462499816593,
794
+ "rougeLsum_ci_high": 0.11236018272084582,
795
+ "rouge2_ci_low": 0.010616515576465502,
796
+ "rouge2_ci_high": 0.02165065601326996,
797
+ "rougeL_ci_low": 0.07820323036355636,
798
+ "rougeL_ci_high": 0.10112740368476562,
799
+ "score_ci_low": 0.07820323036355636,
800
+ "score_ci_high": 0.10112740368476562,
801
+ "rouge1_ci_low": 0.10290535826717513,
802
+ "rouge1_ci_high": 0.13692429898442587
803
+ },
804
+ "score": 0.18246254551968877,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 152,
813
+ 108,
814
+ 78,
815
+ 57
816
+ ],
817
+ "totals": [
818
+ 213,
819
+ 207,
820
+ 201,
821
+ 195
822
+ ],
823
+ "precisions": [
824
+ 0.7136150234741784,
825
+ 0.5217391304347826,
826
+ 0.3880597014925373,
827
+ 0.2923076923076923
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 213,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.4533295675744374,
833
+ "score": 0.4533295675744374,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.2970856371571267,
836
+ "score_ci_high": 0.6155700608318663,
837
+ "sacrebleu_ci_low": 0.2970856371571267,
838
+ "sacrebleu_ci_high": 0.6155700608318663
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 143,
844
+ 89,
845
+ 60,
846
+ 43
847
+ ],
848
+ "totals": [
849
+ 221,
850
+ 215,
851
+ 209,
852
+ 203
853
+ ],
854
+ "precisions": [
855
+ 0.6470588235294117,
856
+ 0.41395348837209306,
857
+ 0.28708133971291866,
858
+ 0.21182266009852216
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 221,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.35724665668654765,
864
+ "score": 0.35724665668654765,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.2566990236689781,
867
+ "score_ci_high": 0.5362601500874902,
868
+ "sacrebleu_ci_low": 0.2566990236689781,
869
+ "sacrebleu_ci_high": 0.5362601500874902
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 116,
875
+ 72,
876
+ 49,
877
+ 30
878
+ ],
879
+ "totals": [
880
+ 204,
881
+ 198,
882
+ 192,
883
+ 186
884
+ ],
885
+ "precisions": [
886
+ 0.5686274509803921,
887
+ 0.36363636363636365,
888
+ 0.2552083333333333,
889
+ 0.16129032258064516
890
+ ],
891
+ "bp": 0.9757881223212935,
892
+ "sys_len": 204,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.2963842353604502,
895
+ "score": 0.2963842353604502,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.21279117709012274,
898
+ "score_ci_high": 0.3978311539100688,
899
+ "sacrebleu_ci_low": 0.21279117709012274,
900
+ "sacrebleu_ci_high": 0.3978311539100688
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 148,
906
+ 95,
907
+ 62,
908
+ 45
909
+ ],
910
+ "totals": [
911
+ 210,
912
+ 204,
913
+ 198,
914
+ 192
915
+ ],
916
+ "precisions": [
917
+ 0.7047619047619048,
918
+ 0.46568627450980393,
919
+ 0.3131313131313131,
920
+ 0.234375
921
+ ],
922
+ "bp": 0.9718328750329812,
923
+ "sys_len": 210,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.382855593891157,
926
+ "score": 0.382855593891157,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.28077912493137325,
929
+ "score_ci_high": 0.5224412329440934,
930
+ "sacrebleu_ci_low": 0.28077912493137325,
931
+ "sacrebleu_ci_high": 0.5224412329440934
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 184,
937
+ 135,
938
+ 99,
939
+ 72
940
+ ],
941
+ "totals": [
942
+ 240,
943
+ 234,
944
+ 228,
945
+ 222
946
+ ],
947
+ "precisions": [
948
+ 0.7666666666666667,
949
+ 0.576923076923077,
950
+ 0.43421052631578944,
951
+ 0.32432432432432434
952
+ ],
953
+ "bp": 1.0,
954
+ "sys_len": 240,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.4995754525815319,
957
+ "score": 0.4995754525815319,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.4515767855995504,
960
+ "score_ci_high": 0.5566329044186812,
961
+ "sacrebleu_ci_low": 0.4515767855995504,
962
+ "sacrebleu_ci_high": 0.5566329044186812
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 163,
968
+ 87,
969
+ 56,
970
+ 36
971
+ ],
972
+ "totals": [
973
+ 297,
974
+ 291,
975
+ 285,
976
+ 279
977
+ ],
978
+ "precisions": [
979
+ 0.5488215488215488,
980
+ 0.29896907216494845,
981
+ 0.19649122807017544,
982
+ 0.12903225806451613
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 297,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.25396549824957954,
988
+ "score": 0.25396549824957954,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.18656507398105443,
991
+ "score_ci_high": 0.3676949692724427,
992
+ "sacrebleu_ci_low": 0.18656507398105443,
993
+ "sacrebleu_ci_high": 0.3676949692724427
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 181,
999
+ 140,
1000
+ 113,
1001
+ 92
1002
+ ],
1003
+ "totals": [
1004
+ 226,
1005
+ 220,
1006
+ 214,
1007
+ 208
1008
+ ],
1009
+ "precisions": [
1010
+ 0.8008849557522124,
1011
+ 0.6363636363636364,
1012
+ 0.5280373831775701,
1013
+ 0.44230769230769235
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 226,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.587375953828071,
1019
+ "score": 0.587375953828071,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.5264308904974885,
1022
+ "score_ci_high": 0.6527446503464284,
1023
+ "sacrebleu_ci_low": 0.5264308904974885,
1024
+ "sacrebleu_ci_high": 0.6527446503464284
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 159,
1030
+ 110,
1031
+ 79,
1032
+ 59
1033
+ ],
1034
+ "totals": [
1035
+ 230,
1036
+ 224,
1037
+ 218,
1038
+ 212
1039
+ ],
1040
+ "precisions": [
1041
+ 0.6913043478260871,
1042
+ 0.49107142857142855,
1043
+ 0.3623853211009175,
1044
+ 0.2783018867924528
1045
+ ],
1046
+ "bp": 1.0,
1047
+ "sys_len": 230,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.4301551985882873,
1050
+ "score": 0.4301551985882873,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.35322253633753253,
1053
+ "score_ci_high": 0.542330368706405,
1054
+ "sacrebleu_ci_low": 0.35322253633753253,
1055
+ "sacrebleu_ci_high": 0.542330368706405
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 168,
1061
+ 106,
1062
+ 70,
1063
+ 45
1064
+ ],
1065
+ "totals": [
1066
+ 237,
1067
+ 231,
1068
+ 225,
1069
+ 219
1070
+ ],
1071
+ "precisions": [
1072
+ 0.7088607594936708,
1073
+ 0.4588744588744589,
1074
+ 0.3111111111111111,
1075
+ 0.2054794520547945
1076
+ ],
1077
+ "bp": 0.9750013184817767,
1078
+ "sys_len": 237,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.37024558499956833,
1081
+ "score": 0.37024558499956833,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.2914675063581613,
1084
+ "score_ci_high": 0.4722241333633745,
1085
+ "sacrebleu_ci_low": 0.2914675063581613,
1086
+ "sacrebleu_ci_high": 0.4722241333633745
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 158,
1092
+ 112,
1093
+ 82,
1094
+ 63
1095
+ ],
1096
+ "totals": [
1097
+ 218,
1098
+ 212,
1099
+ 206,
1100
+ 200
1101
+ ],
1102
+ "precisions": [
1103
+ 0.724770642201835,
1104
+ 0.5283018867924528,
1105
+ 0.3980582524271845,
1106
+ 0.315
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 218,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.4680960595371609,
1112
+ "score": 0.4680960595371609,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.26555389781536615,
1115
+ "score_ci_high": 0.6882400443755067,
1116
+ "sacrebleu_ci_low": 0.26555389781536615,
1117
+ "sacrebleu_ci_high": 0.6882400443755067
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 147,
1123
+ 92,
1124
+ 63,
1125
+ 43
1126
+ ],
1127
+ "totals": [
1128
+ 219,
1129
+ 213,
1130
+ 207,
1131
+ 201
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6712328767123288,
1135
+ 0.431924882629108,
1136
+ 0.30434782608695654,
1137
+ 0.21393034825870647
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 219,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.3706645149919594,
1143
+ "score": 0.3706645149919594,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.18561228093884444,
1146
+ "score_ci_high": 0.45332840825867166,
1147
+ "sacrebleu_ci_low": 0.18561228093884444,
1148
+ "sacrebleu_ci_high": 0.45332840825867166
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 136,
1154
+ 82,
1155
+ 53,
1156
+ 38
1157
+ ],
1158
+ "totals": [
1159
+ 208,
1160
+ 202,
1161
+ 196,
1162
+ 190
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6538461538461539,
1166
+ 0.4059405940594059,
1167
+ 0.27040816326530615,
1168
+ 0.2
1169
+ ],
1170
+ "bp": 1.0,
1171
+ "sys_len": 208,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.346136152997744,
1174
+ "score": 0.346136152997744,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.197154502834615,
1177
+ "score_ci_high": 0.4450318608831673,
1178
+ "sacrebleu_ci_low": 0.197154502834615,
1179
+ "sacrebleu_ci_high": 0.4450318608831673
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 161,
1185
+ 117,
1186
+ 85,
1187
+ 65
1188
+ ],
1189
+ "totals": [
1190
+ 213,
1191
+ 207,
1192
+ 201,
1193
+ 195
1194
+ ],
1195
+ "precisions": [
1196
+ 0.755868544600939,
1197
+ 0.5652173913043478,
1198
+ 0.42288557213930345,
1199
+ 0.33333333333333337
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 213,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.4953827168207276,
1205
+ "score": 0.4953827168207276,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.28814389311162725,
1208
+ "score_ci_high": 0.617325493054366,
1209
+ "sacrebleu_ci_low": 0.28814389311162725,
1210
+ "sacrebleu_ci_high": 0.617325493054366
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 160,
1216
+ 120,
1217
+ 92,
1218
+ 76
1219
+ ],
1220
+ "totals": [
1221
+ 222,
1222
+ 216,
1223
+ 210,
1224
+ 204
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7207207207207208,
1228
+ 0.5555555555555556,
1229
+ 0.4380952380952381,
1230
+ 0.37254901960784315
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 222,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.505605296924794,
1236
+ "score": 0.505605296924794,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.32967576416379546,
1239
+ "score_ci_high": 0.7153750601746934,
1240
+ "sacrebleu_ci_low": 0.32967576416379546,
1241
+ "sacrebleu_ci_high": 0.7153750601746934
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 152,
1247
+ 95,
1248
+ 62,
1249
+ 44
1250
+ ],
1251
+ "totals": [
1252
+ 226,
1253
+ 220,
1254
+ 214,
1255
+ 208
1256
+ ],
1257
+ "precisions": [
1258
+ 0.672566371681416,
1259
+ 0.4318181818181818,
1260
+ 0.28971962616822433,
1261
+ 0.21153846153846154
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 226,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.3652589217481651,
1267
+ "score": 0.3652589217481651,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.27931028013393655,
1270
+ "score_ci_high": 0.40059757012052705,
1271
+ "sacrebleu_ci_low": 0.27931028013393655,
1272
+ "sacrebleu_ci_high": 0.40059757012052705
1273
+ },
1274
+ "score": 0.4121518269853454,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.646074242159326,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
results/bluebench/2025-08-03T08-51-48_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T12:51:43.070081Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/gpt-4.1-mini-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/gpt-4.1-mini-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 1.0,
180
+ "accuracy_ci_low": 1.0,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 1.0,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 1.0,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.7777777777777778,
260
+ "accuracy_ci_low": 0.4444444444444444,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 0.7777777777777778,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 0.4444444444444444,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.6666666666666666,
270
+ "accuracy_ci_low": 0.3333333333333333,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 0.6666666666666666,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 0.3333333333333333,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 1.0,
280
+ "accuracy_ci_low": 1.0,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 1.0,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 1.0,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.9494949494949495,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9754901960784313,
296
+ "score": 0.9754901960784313,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.9754901960784313,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.851063829787234,
307
+ "f1_Organization": 0.6301369863013698,
308
+ "f1_Location": 0.7727272727272727,
309
+ "f1_macro": 0.7513093629386255,
310
+ "recall_macro": 0.7997757073844031,
311
+ "precision_macro": 0.7314814814814815,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.7317073170731706,
314
+ "recall_micro": 0.8,
315
+ "precision_micro": 0.6741573033707865,
316
+ "score": 0.7317073170731706,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.6582752488607289,
319
+ "score_ci_high": 0.7931160320928256,
320
+ "f1_micro_ci_low": 0.6582752488607289,
321
+ "f1_micro_ci_high": 0.7931160320928256
322
+ },
323
+ "score": 0.7317073170731706,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.2857142857142857,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.7142857142857143,
342
+ "score_name": "accuracy",
343
+ "score": 0.2857142857142857,
344
+ "score_ci_high": 0.7142857142857143,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.0,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.0,
352
+ "score_name": "accuracy",
353
+ "score": 0.0,
354
+ "score_ci_high": 0.0,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.8571428571428571,
360
+ "accuracy_ci_low": 0.42857142857142855,
361
+ "accuracy_ci_high": 1.0,
362
+ "score_name": "accuracy",
363
+ "score": 0.8571428571428571,
364
+ "score_ci_high": 1.0,
365
+ "score_ci_low": 0.42857142857142855,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.7142857142857143,
370
+ "accuracy_ci_low": 0.2857142857142857,
371
+ "accuracy_ci_high": 1.0,
372
+ "score_name": "accuracy",
373
+ "score": 0.7142857142857143,
374
+ "score_ci_high": 1.0,
375
+ "score_ci_low": 0.2857142857142857,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.0,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.0,
382
+ "score_name": "accuracy",
383
+ "score": 0.0,
384
+ "score_ci_high": 0.0,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.2857142857142857,
390
+ "accuracy_ci_low": 0.0,
391
+ "accuracy_ci_high": 0.7142857142857143,
392
+ "score_name": "accuracy",
393
+ "score": 0.2857142857142857,
394
+ "score_ci_high": 0.7142857142857143,
395
+ "score_ci_low": 0.0,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.2857142857142857,
400
+ "accuracy_ci_low": 0.0,
401
+ "accuracy_ci_high": 0.7142857142857143,
402
+ "score_name": "accuracy",
403
+ "score": 0.2857142857142857,
404
+ "score_ci_high": 0.7142857142857143,
405
+ "score_ci_low": 0.0,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.5714285714285714,
410
+ "accuracy_ci_low": 0.14285714285714285,
411
+ "accuracy_ci_high": 0.8571428571428571,
412
+ "score_name": "accuracy",
413
+ "score": 0.5714285714285714,
414
+ "score_ci_high": 0.8571428571428571,
415
+ "score_ci_low": 0.14285714285714285,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.42857142857142855,
420
+ "accuracy_ci_low": 0.14285714285714285,
421
+ "accuracy_ci_high": 0.8571428571428571,
422
+ "score_name": "accuracy",
423
+ "score": 0.42857142857142855,
424
+ "score_ci_high": 0.8571428571428571,
425
+ "score_ci_low": 0.14285714285714285,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.2857142857142857,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.7142857142857143,
432
+ "score_name": "accuracy",
433
+ "score": 0.2857142857142857,
434
+ "score_ci_high": 0.7142857142857143,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.7142857142857143,
440
+ "accuracy_ci_low": 0.2857142857142857,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.7142857142857143,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2857142857142857,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.0,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.0,
452
+ "score_name": "accuracy",
453
+ "score": 0.0,
454
+ "score_ci_high": 0.0,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.8571428571428571,
460
+ "accuracy_ci_low": 0.42857142857142855,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.8571428571428571,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.42857142857142855,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.42857142857142855,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.24603174603174605,
475
+ "f1_suggestive": 0.0,
476
+ "f1_generic": 0.5,
477
+ "f1_fanciful": 0.0,
478
+ "f1_arbitrary": 0.2857142857142857,
479
+ "f1_descriptive": 0.4444444444444444,
480
+ "f1_macro_ci_low": 0.08888888888888888,
481
+ "f1_macro_ci_high": 0.4735636958062352,
482
+ "score_name": "f1_micro",
483
+ "score": 0.25806451612903225,
484
+ "score_ci_high": 0.5,
485
+ "score_ci_low": 0.06666666666666667,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.2,
488
+ "accuracy_ci_low": 0.05,
489
+ "accuracy_ci_high": 0.4,
490
+ "f1_micro": 0.25806451612903225,
491
+ "f1_micro_ci_low": 0.06666666666666667,
492
+ "f1_micro_ci_high": 0.5
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5670995670995671,
496
+ "f1_no": 0.8484848484848485,
497
+ "f1_yes": 0.2857142857142857,
498
+ "f1_macro_ci_low": 0.40999057444565223,
499
+ "f1_macro_ci_high": 1.0,
500
+ "score_name": "f1_micro",
501
+ "score": 0.75,
502
+ "score_ci_high": 0.9,
503
+ "score_ci_low": 0.55,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.75,
506
+ "accuracy_ci_low": 0.55,
507
+ "accuracy_ci_high": 0.9,
508
+ "f1_micro": 0.75,
509
+ "f1_micro_ci_low": 0.55,
510
+ "f1_micro_ci_high": 0.9
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.23684807256235826,
514
+ "f1_conclusion": 0.2857142857142857,
515
+ "f1_issue": 0.2222222222222222,
516
+ "f1_decree": 0.0,
517
+ "f1_rule": 0.0,
518
+ "f1_analysis": 0.4,
519
+ "f1_facts": 0.75,
520
+ "f1_procedural history": 0.0,
521
+ "f1_macro_ci_low": 0.1017216313301622,
522
+ "f1_macro_ci_high": 0.4330669755483427,
523
+ "score_name": "f1_micro",
524
+ "score": 0.3076923076923077,
525
+ "score_ci_high": 0.5142857142857142,
526
+ "score_ci_low": 0.10749989543242633,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.3,
529
+ "accuracy_ci_low": 0.15,
530
+ "accuracy_ci_high": 0.5,
531
+ "f1_micro": 0.3076923076923077,
532
+ "f1_micro_ci_low": 0.10749989543242633,
533
+ "f1_micro_ci_high": 0.5142857142857142
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.6491228070175439,
537
+ "f1_yes": 0.6666666666666666,
538
+ "f1_no": 0.631578947368421,
539
+ "f1_macro_ci_low": 0.4373401534526854,
540
+ "f1_macro_ci_high": 0.849624060150376,
541
+ "score_name": "f1_micro",
542
+ "score": 0.65,
543
+ "score_ci_high": 0.85,
544
+ "score_ci_low": 0.4,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.65,
547
+ "accuracy_ci_low": 0.4,
548
+ "accuracy_ci_high": 0.85,
549
+ "f1_micro": 0.65,
550
+ "f1_micro_ci_low": 0.4,
551
+ "f1_micro_ci_high": 0.85
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 1.0,
555
+ "f1_yes": 1.0,
556
+ "f1_no": 1.0,
557
+ "f1_macro_ci_low": 1.0,
558
+ "f1_macro_ci_high": 1.0,
559
+ "score_name": "f1_micro",
560
+ "score": 1.0,
561
+ "score_ci_high": 1.0,
562
+ "score_ci_low": 1.0,
563
+ "num_of_instances": 20,
564
+ "accuracy": 1.0,
565
+ "accuracy_ci_low": 1.0,
566
+ "accuracy_ci_high": 1.0,
567
+ "f1_micro": 1.0,
568
+ "f1_micro_ci_low": 1.0,
569
+ "f1_micro_ci_high": 1.0
570
+ },
571
+ "score": 0.593151364764268,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6505870737449685,
578
+ "f1_cars": 0.7272727272727273,
579
+ "f1_motorcycles": 0.5454545454545454,
580
+ "f1_windows x": 0.3333333333333333,
581
+ "f1_computer graphics": 0.5263157894736842,
582
+ "f1_atheism": 0.3333333333333333,
583
+ "f1_christianity": 0.8888888888888888,
584
+ "f1_religion": 0.2857142857142857,
585
+ "f1_medicine": 0.8888888888888888,
586
+ "f1_microsoft windows": 0.8333333333333334,
587
+ "f1_middle east": 0.8333333333333334,
588
+ "f1_pc hardware": 0.5714285714285714,
589
+ "f1_mac hardware": 0.6666666666666666,
590
+ "f1_for sale": 1.0,
591
+ "f1_guns": 0.6,
592
+ "f1_space": 0.8888888888888888,
593
+ "f1_cryptography": 0.4,
594
+ "f1_electronics": 0.4,
595
+ "f1_politics": 0.4,
596
+ "f1_baseball": 1.0,
597
+ "f1_hockey": 0.8888888888888888,
598
+ "f1_macro_ci_low": 0.5612581467006025,
599
+ "f1_macro_ci_high": 0.7485879865989543,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6666666666666666,
602
+ "score_ci_high": 0.7487179487179487,
603
+ "score_ci_low": 0.5625834478663989,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.66,
606
+ "accuracy_ci_low": 0.56,
607
+ "accuracy_ci_high": 0.75,
608
+ "f1_micro": 0.6666666666666666,
609
+ "f1_micro_ci_low": 0.5625834478663989,
610
+ "f1_micro_ci_high": 0.7487179487179487
611
+ },
612
+ "score": 0.6666666666666666,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7564712961168185,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9402985074626866,
620
+ "f1_debt collection": 0.7619047619047619,
621
+ "f1_payday loan or title loan or personal loan": 0.0,
622
+ "f1_student loan": 0.8333333333333334,
623
+ "f1_credit card or prepaid card": 0.75,
624
+ "f1_checking or savings account": 0.9090909090909091,
625
+ "f1_mortgage": 0.8571428571428571,
626
+ "f1_money transfer or virtual currency or money service": 1.0,
627
+ "f1_macro_ci_low": 0.580600940304915,
628
+ "f1_macro_ci_high": 0.8740772350317311,
629
+ "score_name": "f1_micro",
630
+ "score": 0.898989898989899,
631
+ "score_ci_high": 0.9441624365482234,
632
+ "score_ci_low": 0.8241206030150754,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.89,
635
+ "accuracy_ci_low": 0.82,
636
+ "accuracy_ci_high": 0.94,
637
+ "f1_micro": 0.898989898989899,
638
+ "f1_micro_ci_low": 0.8241206030150754,
639
+ "f1_micro_ci_high": 0.9441624365482234
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.7951539688381793,
643
+ "f1_mortgages and loans": 0.8181818181818182,
644
+ "f1_credit card": 0.8181818181818182,
645
+ "f1_debt collection": 0.7368421052631579,
646
+ "f1_retail banking": 0.7692307692307693,
647
+ "f1_credit reporting": 0.8333333333333334,
648
+ "f1_macro_ci_low": 0.6650710249693069,
649
+ "f1_macro_ci_high": 0.9045433519350622,
650
+ "score_name": "f1_micro",
651
+ "score": 0.8,
652
+ "score_ci_high": 0.9,
653
+ "score_ci_low": 0.68,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.8,
656
+ "accuracy_ci_low": 0.68,
657
+ "accuracy_ci_high": 0.9,
658
+ "f1_micro": 0.8,
659
+ "f1_micro_ci_low": 0.68,
660
+ "f1_micro_ci_high": 0.9
661
+ },
662
+ "score": 0.8494949494949495,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.27,
670
+ "program_accuracy": 0.28,
671
+ "score": 0.28,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.19,
674
+ "execution_accuracy_ci_high": 0.36,
675
+ "program_accuracy_ci_low": 0.2,
676
+ "program_accuracy_ci_high": 0.37,
677
+ "score_ci_low": 0.2,
678
+ "score_ci_high": 0.37
679
+ },
680
+ "score": 0.28,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.5352277671570387,
687
+ "recall": 0.6334183868239969,
688
+ "f1": 0.5457958674104451,
689
+ "precision_ci_low": 0.5011010888509172,
690
+ "precision_ci_high": 0.571254786062904,
691
+ "recall_ci_low": 0.592341390035469,
692
+ "recall_ci_high": 0.6699875895130868,
693
+ "f1_ci_low": 0.5178144079597656,
694
+ "f1_ci_high": 0.576722210592798,
695
+ "score_name": "f1",
696
+ "score": 0.5457958674104451,
697
+ "score_ci_high": 0.576722210592798,
698
+ "score_ci_low": 0.5178144079597656,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.7262957572937012,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7427353870868683,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.7176006579399109,
703
+ "faithfullness_f1_token_overlap": 0.4242475830340205,
704
+ "faithfullness_recall_token_overlap": 0.3219187851960339,
705
+ "faithfullness_precision_token_overlap": 0.7656338622354937,
706
+ "correctness_f1_token_overlap": 0.5457958674104451,
707
+ "correctness_recall_token_overlap": 0.6334183868239969,
708
+ "correctness_precision_token_overlap": 0.5352277671570387
709
+ },
710
+ "score": 0.5457958674104451,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.56,
717
+ "accuracy_ci_low": 0.46,
718
+ "accuracy_ci_high": 0.65,
719
+ "score_name": "accuracy",
720
+ "score": 0.56,
721
+ "score_ci_high": 0.65,
722
+ "score_ci_low": 0.46,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.9,
727
+ "accuracy_ci_low": 0.83,
728
+ "accuracy_ci_high": 0.95,
729
+ "score_name": "accuracy",
730
+ "score": 0.9,
731
+ "score_ci_high": 0.95,
732
+ "score_ci_low": 0.83,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.73,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 0.0,
742
+ "severity_high": 0.0,
743
+ "severity_medium": 1.0,
744
+ "severity_low": 99.0,
745
+ "category_harmful_info": 0.8576832313888839,
746
+ "category_deception": 0.8955093414494487,
747
+ "category_explicit_content": 0.8701195439907036,
748
+ "category_violence": 0.8381335821830563,
749
+ "category_discrimination": 0.8561216062969633,
750
+ "category_substance_abuse": 0.9067936368700529,
751
+ "category_pii": 0.8857777791994589,
752
+ "safety": 0.8595587636116477,
753
+ "safety_ci_low": 0.8431092647729972,
754
+ "safety_ci_high": 0.8769532408870113,
755
+ "score_name": "safety",
756
+ "score": 0.8595587636116477,
757
+ "score_ci_high": 0.8769532408870113,
758
+ "score_ci_low": 0.8431092647729972,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8595587636116477,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rouge2": 0.17552533048945435,
769
+ "rougeLsum": 0.33883926434424444,
770
+ "rouge1": 0.40016698793805794,
771
+ "rougeL": 0.27358049514916954,
772
+ "score": 0.27358049514916954,
773
+ "score_name": "rougeL",
774
+ "rouge2_ci_low": 0.16155032167148423,
775
+ "rouge2_ci_high": 0.1881176950964047,
776
+ "rougeLsum_ci_low": 0.3174250971193662,
777
+ "rougeLsum_ci_high": 0.3569891353096754,
778
+ "rouge1_ci_low": 0.37749955777216687,
779
+ "rouge1_ci_high": 0.4198850458891549,
780
+ "rougeL_ci_low": 0.2594055133267546,
781
+ "rougeL_ci_high": 0.289058946330701,
782
+ "score_ci_low": 0.2594055133267546,
783
+ "score_ci_high": 0.289058946330701
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rouge2": 0.01338338172481936,
788
+ "rougeLsum": 0.09052037161619678,
789
+ "rouge1": 0.10904104210499727,
790
+ "rougeL": 0.08475341911314702,
791
+ "score": 0.08475341911314702,
792
+ "score_name": "rougeL",
793
+ "rouge2_ci_low": 0.008992796773871995,
794
+ "rouge2_ci_high": 0.019371394724512946,
795
+ "rougeLsum_ci_low": 0.07897986034201623,
796
+ "rougeLsum_ci_high": 0.10338996252122368,
797
+ "rouge1_ci_low": 0.09468957974892958,
798
+ "rouge1_ci_high": 0.12603196829010374,
799
+ "rougeL_ci_low": 0.07374695172999757,
800
+ "rougeL_ci_high": 0.09783514823572988,
801
+ "score_ci_low": 0.07374695172999757,
802
+ "score_ci_high": 0.09783514823572988
803
+ },
804
+ "score": 0.17916695713115827,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 160,
813
+ 117,
814
+ 87,
815
+ 66
816
+ ],
817
+ "totals": [
818
+ 213,
819
+ 207,
820
+ 201,
821
+ 195
822
+ ],
823
+ "precisions": [
824
+ 0.7511737089201879,
825
+ 0.5652173913043478,
826
+ 0.4328358208955224,
827
+ 0.3384615384615385
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 213,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.49939835069187843,
833
+ "score": 0.49939835069187843,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.28370306894792374,
836
+ "score_ci_high": 0.5980304630125636,
837
+ "sacrebleu_ci_low": 0.28370306894792374,
838
+ "sacrebleu_ci_high": 0.5980304630125636
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 134,
844
+ 77,
845
+ 46,
846
+ 30
847
+ ],
848
+ "totals": [
849
+ 224,
850
+ 218,
851
+ 212,
852
+ 206
853
+ ],
854
+ "precisions": [
855
+ 0.5982142857142857,
856
+ 0.353211009174312,
857
+ 0.2169811320754717,
858
+ 0.14563106796116507
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 224,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.2858523416169513,
864
+ "score": 0.2858523416169513,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.18501543309430285,
867
+ "score_ci_high": 0.4454460807146645,
868
+ "sacrebleu_ci_low": 0.18501543309430285,
869
+ "sacrebleu_ci_high": 0.4454460807146645
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 132,
875
+ 80,
876
+ 51,
877
+ 31
878
+ ],
879
+ "totals": [
880
+ 202,
881
+ 196,
882
+ 190,
883
+ 184
884
+ ],
885
+ "precisions": [
886
+ 0.6534653465346534,
887
+ 0.40816326530612246,
888
+ 0.26842105263157895,
889
+ 0.16847826086956524
890
+ ],
891
+ "bp": 0.9659400899805457,
892
+ "sys_len": 202,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.3201138185917445,
895
+ "score": 0.3201138185917445,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.20422524166213082,
898
+ "score_ci_high": 0.4428753674730129,
899
+ "sacrebleu_ci_low": 0.20422524166213082,
900
+ "sacrebleu_ci_high": 0.4428753674730129
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 147,
906
+ 96,
907
+ 66,
908
+ 48
909
+ ],
910
+ "totals": [
911
+ 223,
912
+ 217,
913
+ 211,
914
+ 205
915
+ ],
916
+ "precisions": [
917
+ 0.6591928251121076,
918
+ 0.4423963133640553,
919
+ 0.3127962085308057,
920
+ 0.23414634146341462
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 223,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.38229023682157903,
926
+ "score": 0.38229023682157903,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.2693090918310928,
929
+ "score_ci_high": 0.5097023798358588,
930
+ "sacrebleu_ci_low": 0.2693090918310928,
931
+ "sacrebleu_ci_high": 0.5097023798358588
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 188,
937
+ 147,
938
+ 116,
939
+ 91
940
+ ],
941
+ "totals": [
942
+ 238,
943
+ 232,
944
+ 226,
945
+ 220
946
+ ],
947
+ "precisions": [
948
+ 0.7899159663865547,
949
+ 0.6336206896551724,
950
+ 0.5132743362831859,
951
+ 0.4136363636363637
952
+ ],
953
+ "bp": 1.0,
954
+ "sys_len": 238,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5709454626223391,
957
+ "score": 0.5709454626223391,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.47935079849838913,
960
+ "score_ci_high": 0.6428902268527069,
961
+ "sacrebleu_ci_low": 0.47935079849838913,
962
+ "sacrebleu_ci_high": 0.6428902268527069
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 163,
968
+ 93,
969
+ 58,
970
+ 33
971
+ ],
972
+ "totals": [
973
+ 279,
974
+ 273,
975
+ 267,
976
+ 261
977
+ ],
978
+ "precisions": [
979
+ 0.5842293906810035,
980
+ 0.34065934065934067,
981
+ 0.21722846441947566,
982
+ 0.12643678160919541
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 279,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.2719089274813003,
988
+ "score": 0.2719089274813003,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.2008525135625689,
991
+ "score_ci_high": 0.32014138050940116,
992
+ "sacrebleu_ci_low": 0.2008525135625689,
993
+ "sacrebleu_ci_high": 0.32014138050940116
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 170,
999
+ 124,
1000
+ 97,
1001
+ 75
1002
+ ],
1003
+ "totals": [
1004
+ 225,
1005
+ 219,
1006
+ 213,
1007
+ 207
1008
+ ],
1009
+ "precisions": [
1010
+ 0.7555555555555555,
1011
+ 0.5662100456621004,
1012
+ 0.4553990610328638,
1013
+ 0.36231884057971014
1014
+ ],
1015
+ "bp": 1.0,
1016
+ "sys_len": 225,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.5154443168675439,
1019
+ "score": 0.5154443168675439,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.4219603435771016,
1022
+ "score_ci_high": 0.6451952040738418,
1023
+ "sacrebleu_ci_low": 0.4219603435771016,
1024
+ "sacrebleu_ci_high": 0.6451952040738418
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 158,
1030
+ 113,
1031
+ 85,
1032
+ 67
1033
+ ],
1034
+ "totals": [
1035
+ 217,
1036
+ 211,
1037
+ 205,
1038
+ 199
1039
+ ],
1040
+ "precisions": [
1041
+ 0.7281105990783411,
1042
+ 0.5355450236966824,
1043
+ 0.4146341463414634,
1044
+ 0.33668341708542715
1045
+ ],
1046
+ "bp": 0.9418513361588298,
1047
+ "sys_len": 217,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.4549381856766612,
1050
+ "score": 0.4549381856766612,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.323013434128703,
1053
+ "score_ci_high": 0.5747052385902874,
1054
+ "sacrebleu_ci_low": 0.323013434128703,
1055
+ "sacrebleu_ci_high": 0.5747052385902874
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 157,
1061
+ 88,
1062
+ 55,
1063
+ 34
1064
+ ],
1065
+ "totals": [
1066
+ 232,
1067
+ 226,
1068
+ 220,
1069
+ 214
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6767241379310345,
1073
+ 0.3893805309734513,
1074
+ 0.25,
1075
+ 0.15887850467289721
1076
+ ],
1077
+ "bp": 0.9536926844755759,
1078
+ "sys_len": 232,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.30503959419639604,
1081
+ "score": 0.30503959419639604,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.2398734355285806,
1084
+ "score_ci_high": 0.3544291965089244,
1085
+ "sacrebleu_ci_low": 0.2398734355285806,
1086
+ "sacrebleu_ci_high": 0.3544291965089244
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 163,
1092
+ 118,
1093
+ 89,
1094
+ 69
1095
+ ],
1096
+ "totals": [
1097
+ 223,
1098
+ 217,
1099
+ 211,
1100
+ 205
1101
+ ],
1102
+ "precisions": [
1103
+ 0.7309417040358744,
1104
+ 0.5437788018433181,
1105
+ 0.4218009478672986,
1106
+ 0.3365853658536585
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 223,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.48739037554746273,
1112
+ "score": 0.48739037554746273,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.341397165022113,
1115
+ "score_ci_high": 0.6283302794927411,
1116
+ "sacrebleu_ci_low": 0.341397165022113,
1117
+ "sacrebleu_ci_high": 0.6283302794927411
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 137,
1123
+ 69,
1124
+ 38,
1125
+ 18
1126
+ ],
1127
+ "totals": [
1128
+ 215,
1129
+ 209,
1130
+ 203,
1131
+ 197
1132
+ ],
1133
+ "precisions": [
1134
+ 0.6372093023255814,
1135
+ 0.33014354066985646,
1136
+ 0.187192118226601,
1137
+ 0.09137055837563451
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 215,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.24491742649612205,
1143
+ "score": 0.24491742649612205,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.13785222804165345,
1146
+ "score_ci_high": 0.35350036529913664,
1147
+ "sacrebleu_ci_low": 0.13785222804165345,
1148
+ "sacrebleu_ci_high": 0.35350036529913664
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 138,
1154
+ 77,
1155
+ 47,
1156
+ 33
1157
+ ],
1158
+ "totals": [
1159
+ 204,
1160
+ 198,
1161
+ 192,
1162
+ 186
1163
+ ],
1164
+ "precisions": [
1165
+ 0.676470588235294,
1166
+ 0.38888888888888884,
1167
+ 0.24479166666666669,
1168
+ 0.1774193548387097
1169
+ ],
1170
+ "bp": 0.9805831403241088,
1171
+ "sys_len": 204,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.32059182340849046,
1174
+ "score": 0.32059182340849046,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.1957532712016824,
1177
+ "score_ci_high": 0.4597031700705815,
1178
+ "sacrebleu_ci_low": 0.1957532712016824,
1179
+ "sacrebleu_ci_high": 0.4597031700705815
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 168,
1185
+ 131,
1186
+ 100,
1187
+ 78
1188
+ ],
1189
+ "totals": [
1190
+ 222,
1191
+ 216,
1192
+ 210,
1193
+ 204
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7567567567567568,
1197
+ 0.6064814814814814,
1198
+ 0.4761904761904762,
1199
+ 0.38235294117647056
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 222,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.5376563112074761,
1205
+ "score": 0.5376563112074761,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.40796566576515814,
1208
+ "score_ci_high": 0.6381174157760185,
1209
+ "sacrebleu_ci_low": 0.40796566576515814,
1210
+ "sacrebleu_ci_high": 0.6381174157760185
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 157,
1216
+ 111,
1217
+ 82,
1218
+ 63
1219
+ ],
1220
+ "totals": [
1221
+ 222,
1222
+ 216,
1223
+ 210,
1224
+ 204
1225
+ ],
1226
+ "precisions": [
1227
+ 0.7072072072072072,
1228
+ 0.5138888888888888,
1229
+ 0.39047619047619053,
1230
+ 0.3088235294117647
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 222,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.4575412046788179,
1236
+ "score": 0.4575412046788179,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.33094869978609887,
1239
+ "score_ci_high": 0.6006372939109331,
1240
+ "sacrebleu_ci_low": 0.33094869978609887,
1241
+ "sacrebleu_ci_high": 0.6006372939109331
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 151,
1247
+ 95,
1248
+ 65,
1249
+ 45
1250
+ ],
1251
+ "totals": [
1252
+ 227,
1253
+ 221,
1254
+ 215,
1255
+ 209
1256
+ ],
1257
+ "precisions": [
1258
+ 0.6651982378854626,
1259
+ 0.42986425339366513,
1260
+ 0.3023255813953488,
1261
+ 0.215311004784689
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 227,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.3693651241782192,
1267
+ "score": 0.3693651241782192,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.30553582172616794,
1270
+ "score_ci_high": 0.4121530050367366,
1271
+ "sacrebleu_ci_low": 0.30553582172616794,
1272
+ "sacrebleu_ci_high": 0.4121530050367366
1273
+ },
1274
+ "score": 0.40155956667219883,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.630050617459178,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
results/bluebench/2025-08-03T09-09-48_evaluation_results.json ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-08-03T13:09:42.749334Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=azure/Azure/gpt-4.1-nano-ncf,max_tokens=1024",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "azure/Azure/gpt-4.1-nano-ncf",
30
+ "model_args": {
31
+ "max_tokens": 1024
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.26.5",
45
+ "unitxt_commit_hash": "N/A",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "tiktoken": "0.9.0",
55
+ "charset-normalizer": "3.4.2",
56
+ "nvidia-cuda-runtime-cu12": "12.6.77",
57
+ "pyarrow": "21.0.0",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "httpcore": "1.0.9",
61
+ "pip": "25.2",
62
+ "certifi": "2025.7.14",
63
+ "evaluate": "0.4.5",
64
+ "Jinja2": "3.1.6",
65
+ "jsonschema-specifications": "2025.4.1",
66
+ "pydantic_core": "2.33.2",
67
+ "nvidia-cusparse-cu12": "12.5.4.2",
68
+ "aiosignal": "1.4.0",
69
+ "yarl": "1.20.1",
70
+ "unitxt": "1.26.5",
71
+ "jsonschema": "4.25.0",
72
+ "portalocker": "3.2.0",
73
+ "multiprocess": "0.70.16",
74
+ "nvidia-nvjitlink-cu12": "12.6.85",
75
+ "nvidia-cublas-cu12": "12.6.4.1",
76
+ "pydantic": "2.11.7",
77
+ "async-timeout": "5.0.1",
78
+ "annotated-types": "0.7.0",
79
+ "rouge_score": "0.1.2",
80
+ "contourpy": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "matplotlib": "3.10.5",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "h11": "0.16.0",
87
+ "zipp": "3.19.2",
88
+ "tzdata": "2025.2",
89
+ "bert-score": "0.3.13",
90
+ "setuptools": "80.9.0",
91
+ "referencing": "0.36.2",
92
+ "sacrebleu": "2.5.1",
93
+ "filelock": "3.18.0",
94
+ "urllib3": "2.5.0",
95
+ "scipy": "1.15.3",
96
+ "nvidia-nccl-cu12": "2.26.2",
97
+ "kiwisolver": "1.4.8",
98
+ "networkx": "3.4.2",
99
+ "typing-inspection": "0.4.1",
100
+ "sniffio": "1.3.1",
101
+ "rpds-py": "0.26.0",
102
+ "nvidia-curand-cu12": "10.3.7.77",
103
+ "litellm": "1.74.12",
104
+ "pillow": "11.3.0",
105
+ "datasets": "3.6.0",
106
+ "nvidia-cusolver-cu12": "11.7.1.2",
107
+ "cycler": "0.12.1",
108
+ "tokenizers": "0.21.4",
109
+ "distro": "1.9.0",
110
+ "idna": "3.10",
111
+ "MarkupSafe": "3.0.2",
112
+ "frozenlist": "1.7.0",
113
+ "pyparsing": "3.2.3",
114
+ "regex": "2025.7.34",
115
+ "jiter": "0.10.0",
116
+ "importlib_metadata": "8.0.0",
117
+ "packaging": "24.2",
118
+ "psutil": "7.0.0",
119
+ "mecab-ko-dic": "1.0.0",
120
+ "joblib": "1.5.1",
121
+ "transformers": "4.54.1",
122
+ "fsspec": "2025.3.0",
123
+ "scikit-learn": "1.7.1",
124
+ "dill": "0.3.8",
125
+ "wheel": "0.45.1",
126
+ "nvidia-nvtx-cu12": "12.6.77",
127
+ "nvidia-cusparselt-cu12": "0.6.3",
128
+ "lxml": "6.0.0",
129
+ "propcache": "0.3.2",
130
+ "numpy": "2.2.6",
131
+ "mpmath": "1.3.0",
132
+ "conllu": "6.0.0",
133
+ "safetensors": "0.5.3",
134
+ "requests": "2.32.4",
135
+ "fonttools": "4.59.0",
136
+ "tabulate": "0.9.0",
137
+ "typing_extensions": "4.12.2",
138
+ "absl-py": "2.3.1",
139
+ "accelerate": "1.9.0",
140
+ "nvidia-cufft-cu12": "11.3.0.4",
141
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
142
+ "click": "8.2.1",
143
+ "attrs": "25.3.0",
144
+ "exceptiongroup": "1.3.0",
145
+ "tenacity": "9.1.2",
146
+ "huggingface-hub": "0.34.3",
147
+ "pytz": "2025.2",
148
+ "aiohappyeyeballs": "2.6.1",
149
+ "python-dateutil": "2.9.0.post0",
150
+ "torch": "2.7.1",
151
+ "python-dotenv": "1.1.1",
152
+ "multidict": "6.6.3",
153
+ "httpx": "0.28.1",
154
+ "aiohttp": "3.12.15",
155
+ "xxhash": "3.5.0",
156
+ "PyYAML": "6.0.2",
157
+ "colorama": "0.4.6",
158
+ "openai": "1.98.0",
159
+ "threadpoolctl": "3.6.0",
160
+ "nvidia-cudnn-cu12": "9.5.1.17",
161
+ "pandas": "2.3.1",
162
+ "hf-xet": "1.1.5",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.7777777777777778,
180
+ "accuracy_ci_low": 0.4444444444444444,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 0.7777777777777778,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.4444444444444444,
186
+ "num_of_instances": 9
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.8888888888888888,
190
+ "accuracy_ci_low": 0.5310928992288233,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 0.8888888888888888,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 0.5310928992288233,
196
+ "num_of_instances": 9
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.7777777777777778,
200
+ "accuracy_ci_low": 0.3333333333333333,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 0.7777777777777778,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.3333333333333333,
206
+ "num_of_instances": 9
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 9
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.6666666666666666,
220
+ "accuracy_ci_low": 0.2222222222222222,
221
+ "accuracy_ci_high": 0.8888888888888888,
222
+ "score_name": "accuracy",
223
+ "score": 0.6666666666666666,
224
+ "score_ci_high": 0.8888888888888888,
225
+ "score_ci_low": 0.2222222222222222,
226
+ "num_of_instances": 9
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.8888888888888888,
230
+ "accuracy_ci_low": 0.5555555555555556,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 0.8888888888888888,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 0.5555555555555556,
236
+ "num_of_instances": 9
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 9
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.8888888888888888,
250
+ "accuracy_ci_low": 0.5310928992288233,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 0.8888888888888888,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 0.5310928992288233,
256
+ "num_of_instances": 9
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.6666666666666666,
260
+ "accuracy_ci_low": 0.3333333333333333,
261
+ "accuracy_ci_high": 0.8888888888888888,
262
+ "score_name": "accuracy",
263
+ "score": 0.6666666666666666,
264
+ "score_ci_high": 0.8888888888888888,
265
+ "score_ci_low": 0.3333333333333333,
266
+ "num_of_instances": 9
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.5555555555555556,
270
+ "accuracy_ci_low": 0.2222222222222222,
271
+ "accuracy_ci_high": 0.8888888888888888,
272
+ "score_name": "accuracy",
273
+ "score": 0.5555555555555556,
274
+ "score_ci_high": 0.8888888888888888,
275
+ "score_ci_low": 0.2222222222222222,
276
+ "num_of_instances": 9
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8888888888888888,
280
+ "accuracy_ci_low": 0.5310928992288233,
281
+ "accuracy_ci_high": 1.0,
282
+ "score_name": "accuracy",
283
+ "score": 0.8888888888888888,
284
+ "score_ci_high": 1.0,
285
+ "score_ci_low": 0.5310928992288233,
286
+ "num_of_instances": 9
287
+ },
288
+ "score": 0.8181818181818181,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 99
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 100,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.9556650246305419,
296
+ "score": 0.9556650246305419,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.9556650246305419,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 100
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 100,
306
+ "f1_Person": 0.6666666666666667,
307
+ "f1_Location": 0.576923076923077,
308
+ "f1_Organization": 0.5915492957746479,
309
+ "f1_macro": 0.6117130131214639,
310
+ "recall_macro": 0.7336956521739131,
311
+ "precision_macro": 0.5276366360497687,
312
+ "in_classes_support": 1.0,
313
+ "f1_micro": 0.611111111111111,
314
+ "recall_micro": 0.7333333333333333,
315
+ "precision_micro": 0.5238095238095238,
316
+ "score": 0.611111111111111,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.5100859162463807,
319
+ "score_ci_high": 0.7017086586477161,
320
+ "f1_micro_ci_low": 0.5100859162463807,
321
+ "f1_micro_ci_high": 0.7017086586477161
322
+ },
323
+ "score": 0.611111111111111,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 100
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7142857142857143,
330
+ "accuracy_ci_low": 0.2857142857142857,
331
+ "accuracy_ci_high": 1.0,
332
+ "score_name": "accuracy",
333
+ "score": 0.7142857142857143,
334
+ "score_ci_high": 1.0,
335
+ "score_ci_low": 0.2857142857142857,
336
+ "num_of_instances": 7
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.14285714285714285,
340
+ "accuracy_ci_low": 0.0,
341
+ "accuracy_ci_high": 0.5714285714285714,
342
+ "score_name": "accuracy",
343
+ "score": 0.14285714285714285,
344
+ "score_ci_high": 0.5714285714285714,
345
+ "score_ci_low": 0.0,
346
+ "num_of_instances": 7
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.0,
350
+ "accuracy_ci_low": 0.0,
351
+ "accuracy_ci_high": 0.0,
352
+ "score_name": "accuracy",
353
+ "score": 0.0,
354
+ "score_ci_high": 0.0,
355
+ "score_ci_low": 0.0,
356
+ "num_of_instances": 7
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.5714285714285714,
360
+ "accuracy_ci_low": 0.14285714285714285,
361
+ "accuracy_ci_high": 0.8571428571428571,
362
+ "score_name": "accuracy",
363
+ "score": 0.5714285714285714,
364
+ "score_ci_high": 0.8571428571428571,
365
+ "score_ci_low": 0.14285714285714285,
366
+ "num_of_instances": 7
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.5714285714285714,
370
+ "accuracy_ci_low": 0.14285714285714285,
371
+ "accuracy_ci_high": 0.8571428571428571,
372
+ "score_name": "accuracy",
373
+ "score": 0.5714285714285714,
374
+ "score_ci_high": 0.8571428571428571,
375
+ "score_ci_low": 0.14285714285714285,
376
+ "num_of_instances": 7
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.0,
380
+ "accuracy_ci_low": 0.0,
381
+ "accuracy_ci_high": 0.0,
382
+ "score_name": "accuracy",
383
+ "score": 0.0,
384
+ "score_ci_high": 0.0,
385
+ "score_ci_low": 0.0,
386
+ "num_of_instances": 7
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5714285714285714,
390
+ "accuracy_ci_low": 0.14285714285714285,
391
+ "accuracy_ci_high": 0.8571428571428571,
392
+ "score_name": "accuracy",
393
+ "score": 0.5714285714285714,
394
+ "score_ci_high": 0.8571428571428571,
395
+ "score_ci_low": 0.14285714285714285,
396
+ "num_of_instances": 7
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.42857142857142855,
400
+ "accuracy_ci_low": 0.14285714285714285,
401
+ "accuracy_ci_high": 0.8571428571428571,
402
+ "score_name": "accuracy",
403
+ "score": 0.42857142857142855,
404
+ "score_ci_high": 0.8571428571428571,
405
+ "score_ci_low": 0.14285714285714285,
406
+ "num_of_instances": 7
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7142857142857143,
410
+ "accuracy_ci_low": 0.2857142857142857,
411
+ "accuracy_ci_high": 1.0,
412
+ "score_name": "accuracy",
413
+ "score": 0.7142857142857143,
414
+ "score_ci_high": 1.0,
415
+ "score_ci_low": 0.2857142857142857,
416
+ "num_of_instances": 7
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.7142857142857143,
420
+ "accuracy_ci_low": 0.2857142857142857,
421
+ "accuracy_ci_high": 1.0,
422
+ "score_name": "accuracy",
423
+ "score": 0.7142857142857143,
424
+ "score_ci_high": 1.0,
425
+ "score_ci_low": 0.2857142857142857,
426
+ "num_of_instances": 7
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.2857142857142857,
430
+ "accuracy_ci_low": 0.0,
431
+ "accuracy_ci_high": 0.7142857142857143,
432
+ "score_name": "accuracy",
433
+ "score": 0.2857142857142857,
434
+ "score_ci_high": 0.7142857142857143,
435
+ "score_ci_low": 0.0,
436
+ "num_of_instances": 7
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.7142857142857143,
440
+ "accuracy_ci_low": 0.2857142857142857,
441
+ "accuracy_ci_high": 1.0,
442
+ "score_name": "accuracy",
443
+ "score": 0.7142857142857143,
444
+ "score_ci_high": 1.0,
445
+ "score_ci_low": 0.2857142857142857,
446
+ "num_of_instances": 7
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.14285714285714285,
450
+ "accuracy_ci_low": 0.0,
451
+ "accuracy_ci_high": 0.5714285714285714,
452
+ "score_name": "accuracy",
453
+ "score": 0.14285714285714285,
454
+ "score_ci_high": 0.5714285714285714,
455
+ "score_ci_low": 0.0,
456
+ "num_of_instances": 7
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.7142857142857143,
460
+ "accuracy_ci_low": 0.2857142857142857,
461
+ "accuracy_ci_high": 1.0,
462
+ "score_name": "accuracy",
463
+ "score": 0.7142857142857143,
464
+ "score_ci_high": 1.0,
465
+ "score_ci_low": 0.2857142857142857,
466
+ "num_of_instances": 7
467
+ },
468
+ "score": 0.4489795918367347,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 98
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.33714285714285713,
475
+ "f1_suggestive": 0.42857142857142855,
476
+ "f1_generic": 0.0,
477
+ "f1_fanciful": 0.4,
478
+ "f1_descriptive": 0.2857142857142857,
479
+ "f1_arbitrary": 0.5714285714285714,
480
+ "f1_macro_ci_low": 0.17333333333333334,
481
+ "f1_macro_ci_high": 0.613790394089633,
482
+ "score_name": "f1_micro",
483
+ "score": 0.3888888888888889,
484
+ "score_ci_high": 0.6131385979389818,
485
+ "score_ci_low": 0.17647058823529413,
486
+ "num_of_instances": 20,
487
+ "accuracy": 0.35,
488
+ "accuracy_ci_low": 0.15,
489
+ "accuracy_ci_high": 0.55,
490
+ "f1_micro": 0.3888888888888889,
491
+ "f1_micro_ci_low": 0.17647058823529413,
492
+ "f1_micro_ci_high": 0.6131385979389818
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.7849462365591398,
496
+ "f1_no": 0.9032258064516129,
497
+ "f1_yes": 0.6666666666666666,
498
+ "f1_macro_ci_low": 0.4546419659069133,
499
+ "f1_macro_ci_high": 1.0,
500
+ "score_name": "f1_micro",
501
+ "score": 0.85,
502
+ "score_ci_high": 0.95,
503
+ "score_ci_low": 0.6,
504
+ "num_of_instances": 20,
505
+ "accuracy": 0.85,
506
+ "accuracy_ci_low": 0.6,
507
+ "accuracy_ci_high": 0.95,
508
+ "f1_micro": 0.85,
509
+ "f1_micro_ci_low": 0.6,
510
+ "f1_micro_ci_high": 0.95
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.19593898165326737,
514
+ "f1_conclusion": 0.2857142857142857,
515
+ "f1_issue": 0.36363636363636365,
516
+ "f1_decree": 0.0,
517
+ "f1_facts": 0.2222222222222222,
518
+ "f1_analysis": 0.0,
519
+ "f1_procedural history": 0.5,
520
+ "f1_rule": 0.0,
521
+ "f1_macro_ci_low": 0.07142857142857142,
522
+ "f1_macro_ci_high": 0.3841799360588237,
523
+ "score_name": "f1_micro",
524
+ "score": 0.2564102564102564,
525
+ "score_ci_high": 0.4864864864864865,
526
+ "score_ci_low": 0.10256410256410256,
527
+ "num_of_instances": 20,
528
+ "accuracy": 0.25,
529
+ "accuracy_ci_low": 0.1,
530
+ "accuracy_ci_high": 0.45,
531
+ "f1_micro": 0.2564102564102564,
532
+ "f1_micro_ci_low": 0.10256410256410256,
533
+ "f1_micro_ci_high": 0.4864864864864865
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.45054945054945056,
537
+ "f1_yes": 0.6153846153846154,
538
+ "f1_no": 0.2857142857142857,
539
+ "f1_macro_ci_low": 0.2857142857142857,
540
+ "f1_macro_ci_high": 0.696969696969697,
541
+ "score_name": "f1_micro",
542
+ "score": 0.5,
543
+ "score_ci_high": 0.7,
544
+ "score_ci_low": 0.25,
545
+ "num_of_instances": 20,
546
+ "accuracy": 0.5,
547
+ "accuracy_ci_low": 0.25,
548
+ "accuracy_ci_high": 0.7,
549
+ "f1_micro": 0.5,
550
+ "f1_micro_ci_low": 0.25,
551
+ "f1_micro_ci_high": 0.7
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8465473145780051,
555
+ "f1_yes": 0.8235294117647058,
556
+ "f1_no": 0.8695652173913043,
557
+ "f1_macro_ci_low": 0.6142370542301806,
558
+ "f1_macro_ci_high": 0.949874686716792,
559
+ "score_name": "f1_micro",
560
+ "score": 0.85,
561
+ "score_ci_high": 0.95,
562
+ "score_ci_low": 0.65,
563
+ "num_of_instances": 20,
564
+ "accuracy": 0.85,
565
+ "accuracy_ci_low": 0.65,
566
+ "accuracy_ci_high": 0.95,
567
+ "f1_micro": 0.85,
568
+ "f1_micro_ci_low": 0.65,
569
+ "f1_micro_ci_high": 0.95
570
+ },
571
+ "score": 0.569059829059829,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 100
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.3944513819513819,
578
+ "f1_cars": 1.0,
579
+ "f1_windows x": 0.0,
580
+ "f1_atheism": 0.0,
581
+ "f1_christianity": 0.0,
582
+ "f1_religion": 0.0,
583
+ "f1_medicine": 0.8571428571428571,
584
+ "f1_for sale": 0.2222222222222222,
585
+ "f1_computer graphics": 0.36363636363636365,
586
+ "f1_microsoft windows": 0.25,
587
+ "f1_middle east": 0.3333333333333333,
588
+ "f1_politics": 0.46153846153846156,
589
+ "f1_motorcycles": 0.4444444444444444,
590
+ "f1_pc hardware": 0.5714285714285714,
591
+ "f1_mac hardware": 0.2857142857142857,
592
+ "f1_electronics": 0.2857142857142857,
593
+ "f1_guns": 0.5,
594
+ "f1_space": 0.5,
595
+ "f1_cryptography": 0.3333333333333333,
596
+ "f1_baseball": 0.9090909090909091,
597
+ "f1_hockey": 0.5714285714285714,
598
+ "f1_macro_ci_low": 0.3250051046310291,
599
+ "f1_macro_ci_high": 0.4955223374328094,
600
+ "score_name": "f1_micro",
601
+ "score": 0.42162162162162165,
602
+ "score_ci_high": 0.521953556122739,
603
+ "score_ci_low": 0.31511577882586567,
604
+ "num_of_instances": 100,
605
+ "accuracy": 0.39,
606
+ "accuracy_ci_low": 0.3,
607
+ "accuracy_ci_high": 0.4918126232007319,
608
+ "f1_micro": 0.42162162162162165,
609
+ "f1_micro_ci_low": 0.31511577882586567,
610
+ "f1_micro_ci_high": 0.521953556122739
611
+ },
612
+ "score": 0.42162162162162165,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 100
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.5114463197955563,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8396946564885496,
620
+ "f1_debt collection": 0.6,
621
+ "f1_payday loan or title loan or personal loan": 0.0,
622
+ "f1_credit card or prepaid card": 0.2857142857142857,
623
+ "f1_student loan": 0.8888888888888888,
624
+ "f1_checking or savings account": 0.7272727272727273,
625
+ "f1_mortgage": 0.75,
626
+ "f1_money transfer or virtual currency or money service": 0.0,
627
+ "f1_macro_ci_low": 0.33282929006231693,
628
+ "f1_macro_ci_high": 0.6228547419869556,
629
+ "score_name": "f1_micro",
630
+ "score": 0.7724867724867724,
631
+ "score_ci_high": 0.845360824742268,
632
+ "score_ci_low": 0.6774193548387096,
633
+ "num_of_instances": 100,
634
+ "accuracy": 0.73,
635
+ "accuracy_ci_low": 0.63,
636
+ "accuracy_ci_high": 0.81,
637
+ "f1_micro": 0.7724867724867724,
638
+ "f1_micro_ci_low": 0.6774193548387096,
639
+ "f1_micro_ci_high": 0.845360824742268
640
+ },
641
+ "cfpb_product_watsonx": {
642
+ "f1_macro": 0.5816666666666667,
643
+ "f1_mortgages and loans": 0.7,
644
+ "f1_credit card": 0.5,
645
+ "f1_credit reporting": 0.6666666666666666,
646
+ "f1_retail banking": 0.375,
647
+ "f1_debt collection": 0.6666666666666666,
648
+ "f1_macro_ci_low": 0.45364221453455666,
649
+ "f1_macro_ci_high": 0.7245142399435678,
650
+ "score_name": "f1_micro",
651
+ "score": 0.5979381443298969,
652
+ "score_ci_high": 0.7216494845360825,
653
+ "score_ci_low": 0.4489795918367347,
654
+ "num_of_instances": 50,
655
+ "accuracy": 0.58,
656
+ "accuracy_ci_low": 0.44,
657
+ "accuracy_ci_high": 0.7,
658
+ "f1_micro": 0.5979381443298969,
659
+ "f1_micro_ci_low": 0.4489795918367347,
660
+ "f1_micro_ci_high": 0.7216494845360825
661
+ },
662
+ "score": 0.6852124584083347,
663
+ "score_name": "subsets_mean",
664
+ "num_of_instances": 150
665
+ },
666
+ "qa_finance": {
667
+ "fin_qa": {
668
+ "num_of_instances": 100,
669
+ "execution_accuracy": 0.24,
670
+ "program_accuracy": 0.26,
671
+ "score": 0.26,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy_ci_low": 0.16,
674
+ "execution_accuracy_ci_high": 0.33,
675
+ "program_accuracy_ci_low": 0.18,
676
+ "program_accuracy_ci_high": 0.35,
677
+ "score_ci_low": 0.18,
678
+ "score_ci_high": 0.35
679
+ },
680
+ "score": 0.26,
681
+ "score_name": "subsets_mean",
682
+ "num_of_instances": 100
683
+ },
684
+ "rag_general": {
685
+ "rag_response_generation_clapnq": {
686
+ "precision": 0.5483530944738381,
687
+ "recall": 0.5554279823377488,
688
+ "f1": 0.5087588370039454,
689
+ "precision_ci_low": 0.5059194304671544,
690
+ "precision_ci_high": 0.586130205045163,
691
+ "recall_ci_low": 0.5103407537092706,
692
+ "recall_ci_high": 0.5969208798184268,
693
+ "f1_ci_low": 0.4762966268521691,
694
+ "f1_ci_high": 0.5419921191805944,
695
+ "score_name": "f1",
696
+ "score": 0.5087588370039454,
697
+ "score_ci_high": 0.5419921191805944,
698
+ "score_ci_low": 0.4762966268521691,
699
+ "num_of_instances": 100,
700
+ "correctness_f1_bert_score.deberta_large_mnli": 0.707559947669506,
701
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7081431838870048,
702
+ "correctness_precision_bert_score.deberta_large_mnli": 0.7173768043518066,
703
+ "faithfullness_f1_token_overlap": 0.3520099489303385,
704
+ "faithfullness_recall_token_overlap": 0.25507157752637033,
705
+ "faithfullness_precision_token_overlap": 0.7306707307941289,
706
+ "correctness_f1_token_overlap": 0.5087588370039454,
707
+ "correctness_recall_token_overlap": 0.5554279823377488,
708
+ "correctness_precision_token_overlap": 0.5483530944738381
709
+ },
710
+ "score": 0.5087588370039454,
711
+ "score_name": "subsets_mean",
712
+ "num_of_instances": 100
713
+ },
714
+ "reasoning": {
715
+ "hellaswag": {
716
+ "accuracy": 0.51,
717
+ "accuracy_ci_low": 0.4,
718
+ "accuracy_ci_high": 0.6,
719
+ "score_name": "accuracy",
720
+ "score": 0.51,
721
+ "score_ci_high": 0.6,
722
+ "score_ci_low": 0.4,
723
+ "num_of_instances": 100
724
+ },
725
+ "openbook_qa": {
726
+ "accuracy": 0.87,
727
+ "accuracy_ci_low": 0.79,
728
+ "accuracy_ci_high": 0.93,
729
+ "score_name": "accuracy",
730
+ "score": 0.87,
731
+ "score_ci_high": 0.93,
732
+ "score_ci_low": 0.79,
733
+ "num_of_instances": 100
734
+ },
735
+ "score": 0.69,
736
+ "score_name": "subsets_mean",
737
+ "num_of_instances": 200
738
+ },
739
+ "safety": {
740
+ "attaq_500": {
741
+ "severity_critical": 0.0,
742
+ "severity_high": 0.0,
743
+ "severity_medium": 1.0,
744
+ "severity_low": 99.0,
745
+ "category_harmful_info": 0.8783717133356111,
746
+ "category_deception": 0.8950554424558174,
747
+ "category_explicit_content": 0.8705809025300874,
748
+ "category_violence": 0.8501951797732284,
749
+ "category_discrimination": 0.8293574092880128,
750
+ "category_substance_abuse": 0.888910385966301,
751
+ "category_pii": 0.8837605582343208,
752
+ "safety": 0.8667457452985563,
753
+ "safety_ci_low": 0.848787173973939,
754
+ "safety_ci_high": 0.8831856051728106,
755
+ "score_name": "safety",
756
+ "score": 0.8667457452985563,
757
+ "score_ci_high": 0.8831856051728106,
758
+ "score_ci_low": 0.848787173973939,
759
+ "num_of_instances": 100
760
+ },
761
+ "score": 0.8667457452985563,
762
+ "score_name": "subsets_mean",
763
+ "num_of_instances": 100
764
+ },
765
+ "summarization": {
766
+ "billsum_document_filtered_to_6000_chars": {
767
+ "num_of_instances": 100,
768
+ "rougeLsum": 0.3223733838472141,
769
+ "rouge2": 0.16381533114294386,
770
+ "rouge1": 0.38044015102427337,
771
+ "rougeL": 0.25853052606402777,
772
+ "score": 0.25853052606402777,
773
+ "score_name": "rougeL",
774
+ "rougeLsum_ci_low": 0.3017771809602484,
775
+ "rougeLsum_ci_high": 0.34134015771556503,
776
+ "rouge2_ci_low": 0.14895504468906468,
777
+ "rouge2_ci_high": 0.1764309652967554,
778
+ "rouge1_ci_low": 0.3558190631222209,
779
+ "rouge1_ci_high": 0.3991166452082928,
780
+ "rougeL_ci_low": 0.24136665350639253,
781
+ "rougeL_ci_high": 0.2728771499617261,
782
+ "score_ci_low": 0.24136665350639253,
783
+ "score_ci_high": 0.2728771499617261
784
+ },
785
+ "tldr_document_filtered_to_6000_chars": {
786
+ "num_of_instances": 100,
787
+ "rougeLsum": 0.08773717788599067,
788
+ "rouge2": 0.013999577794814255,
789
+ "rouge1": 0.10722757537209487,
790
+ "rougeL": 0.08028090646112783,
791
+ "score": 0.08028090646112783,
792
+ "score_name": "rougeL",
793
+ "rougeLsum_ci_low": 0.0761926584169636,
794
+ "rougeLsum_ci_high": 0.09991471571884306,
795
+ "rouge2_ci_low": 0.009830037235289218,
796
+ "rouge2_ci_high": 0.020298396822460246,
797
+ "rouge1_ci_low": 0.09194762041845511,
798
+ "rouge1_ci_high": 0.1237270589822993,
799
+ "rougeL_ci_low": 0.06967530823212387,
800
+ "rougeL_ci_high": 0.09119370435892198,
801
+ "score_ci_low": 0.06967530823212387,
802
+ "score_ci_high": 0.09119370435892198
803
+ },
804
+ "score": 0.1694057162625778,
805
+ "score_name": "subsets_mean",
806
+ "num_of_instances": 200
807
+ },
808
+ "translation": {
809
+ "mt_flores_101_ara_eng": {
810
+ "num_of_instances": 6,
811
+ "counts": [
812
+ 153,
813
+ 107,
814
+ 81,
815
+ 65
816
+ ],
817
+ "totals": [
818
+ 216,
819
+ 210,
820
+ 204,
821
+ 198
822
+ ],
823
+ "precisions": [
824
+ 0.7083333333333333,
825
+ 0.5095238095238095,
826
+ 0.39705882352941174,
827
+ 0.3282828282828283
828
+ ],
829
+ "bp": 1.0,
830
+ "sys_len": 216,
831
+ "ref_len": 208,
832
+ "sacrebleu": 0.4657215080894149,
833
+ "score": 0.4657215080894149,
834
+ "score_name": "sacrebleu",
835
+ "score_ci_low": 0.26113318051473333,
836
+ "score_ci_high": 0.554322605325799,
837
+ "sacrebleu_ci_low": 0.26113318051473333,
838
+ "sacrebleu_ci_high": 0.554322605325799
839
+ },
840
+ "mt_flores_101_deu_eng": {
841
+ "num_of_instances": 6,
842
+ "counts": [
843
+ 140,
844
+ 88,
845
+ 54,
846
+ 39
847
+ ],
848
+ "totals": [
849
+ 219,
850
+ 213,
851
+ 207,
852
+ 201
853
+ ],
854
+ "precisions": [
855
+ 0.639269406392694,
856
+ 0.41314553990610325,
857
+ 0.2608695652173913,
858
+ 0.19402985074626866
859
+ ],
860
+ "bp": 1.0,
861
+ "sys_len": 219,
862
+ "ref_len": 208,
863
+ "sacrebleu": 0.34003195966759087,
864
+ "score": 0.34003195966759087,
865
+ "score_name": "sacrebleu",
866
+ "score_ci_low": 0.23265738389765314,
867
+ "score_ci_high": 0.5005821053278499,
868
+ "sacrebleu_ci_low": 0.23265738389765314,
869
+ "sacrebleu_ci_high": 0.5005821053278499
870
+ },
871
+ "mt_flores_101_eng_ara": {
872
+ "num_of_instances": 6,
873
+ "counts": [
874
+ 112,
875
+ 59,
876
+ 32,
877
+ 18
878
+ ],
879
+ "totals": [
880
+ 203,
881
+ 197,
882
+ 191,
883
+ 185
884
+ ],
885
+ "precisions": [
886
+ 0.5517241379310345,
887
+ 0.29949238578680204,
888
+ 0.16753926701570682,
889
+ 0.0972972972972973
890
+ ],
891
+ "bp": 0.9708758757257812,
892
+ "sys_len": 203,
893
+ "ref_len": 209,
894
+ "sacrebleu": 0.2211795649032297,
895
+ "score": 0.2211795649032297,
896
+ "score_name": "sacrebleu",
897
+ "score_ci_low": 0.16372679305283608,
898
+ "score_ci_high": 0.29635725852266814,
899
+ "sacrebleu_ci_low": 0.16372679305283608,
900
+ "sacrebleu_ci_high": 0.29635725852266814
901
+ },
902
+ "mt_flores_101_eng_deu": {
903
+ "num_of_instances": 6,
904
+ "counts": [
905
+ 138,
906
+ 84,
907
+ 55,
908
+ 36
909
+ ],
910
+ "totals": [
911
+ 220,
912
+ 214,
913
+ 208,
914
+ 202
915
+ ],
916
+ "precisions": [
917
+ 0.6272727272727273,
918
+ 0.3925233644859813,
919
+ 0.2644230769230769,
920
+ 0.17821782178217824
921
+ ],
922
+ "bp": 1.0,
923
+ "sys_len": 220,
924
+ "ref_len": 216,
925
+ "sacrebleu": 0.3282034190837251,
926
+ "score": 0.3282034190837251,
927
+ "score_name": "sacrebleu",
928
+ "score_ci_low": 0.18070782395940485,
929
+ "score_ci_high": 0.46287226401772935,
930
+ "sacrebleu_ci_low": 0.18070782395940485,
931
+ "sacrebleu_ci_high": 0.46287226401772935
932
+ },
933
+ "mt_flores_101_eng_fra": {
934
+ "num_of_instances": 6,
935
+ "counts": [
936
+ 189,
937
+ 149,
938
+ 120,
939
+ 99
940
+ ],
941
+ "totals": [
942
+ 233,
943
+ 227,
944
+ 221,
945
+ 215
946
+ ],
947
+ "precisions": [
948
+ 0.8111587982832619,
949
+ 0.6563876651982379,
950
+ 0.5429864253393665,
951
+ 0.4604651162790697
952
+ ],
953
+ "bp": 0.9914530437067961,
954
+ "sys_len": 233,
955
+ "ref_len": 235,
956
+ "sacrebleu": 0.5988735753373767,
957
+ "score": 0.5988735753373767,
958
+ "score_name": "sacrebleu",
959
+ "score_ci_low": 0.4819198427452047,
960
+ "score_ci_high": 0.7080462528698704,
961
+ "sacrebleu_ci_low": 0.4819198427452047,
962
+ "sacrebleu_ci_high": 0.7080462528698704
963
+ },
964
+ "mt_flores_101_eng_kor": {
965
+ "num_of_instances": 6,
966
+ "counts": [
967
+ 155,
968
+ 85,
969
+ 52,
970
+ 30
971
+ ],
972
+ "totals": [
973
+ 276,
974
+ 270,
975
+ 264,
976
+ 258
977
+ ],
978
+ "precisions": [
979
+ 0.5615942028985507,
980
+ 0.3148148148148148,
981
+ 0.19696969696969696,
982
+ 0.11627906976744186
983
+ ],
984
+ "bp": 1.0,
985
+ "sys_len": 276,
986
+ "ref_len": 249,
987
+ "sacrebleu": 0.25225784761095216,
988
+ "score": 0.25225784761095216,
989
+ "score_name": "sacrebleu",
990
+ "score_ci_low": 0.1938128536412581,
991
+ "score_ci_high": 0.3424326666483488,
992
+ "sacrebleu_ci_low": 0.1938128536412581,
993
+ "sacrebleu_ci_high": 0.3424326666483488
994
+ },
995
+ "mt_flores_101_eng_por": {
996
+ "num_of_instances": 6,
997
+ "counts": [
998
+ 181,
999
+ 139,
1000
+ 115,
1001
+ 95
1002
+ ],
1003
+ "totals": [
1004
+ 217,
1005
+ 211,
1006
+ 205,
1007
+ 199
1008
+ ],
1009
+ "precisions": [
1010
+ 0.8341013824884793,
1011
+ 0.6587677725118484,
1012
+ 0.5609756097560975,
1013
+ 0.4773869346733668
1014
+ ],
1015
+ "bp": 0.977221952990032,
1016
+ "sys_len": 217,
1017
+ "ref_len": 222,
1018
+ "sacrebleu": 0.6052497771952972,
1019
+ "score": 0.6052497771952972,
1020
+ "score_name": "sacrebleu",
1021
+ "score_ci_low": 0.5137095133218235,
1022
+ "score_ci_high": 0.7430807912893931,
1023
+ "sacrebleu_ci_low": 0.5137095133218235,
1024
+ "sacrebleu_ci_high": 0.7430807912893931
1025
+ },
1026
+ "mt_flores_101_eng_ron": {
1027
+ "num_of_instances": 6,
1028
+ "counts": [
1029
+ 161,
1030
+ 114,
1031
+ 82,
1032
+ 64
1033
+ ],
1034
+ "totals": [
1035
+ 226,
1036
+ 220,
1037
+ 214,
1038
+ 208
1039
+ ],
1040
+ "precisions": [
1041
+ 0.7123893805309734,
1042
+ 0.5181818181818182,
1043
+ 0.383177570093458,
1044
+ 0.3076923076923077
1045
+ ],
1046
+ "bp": 0.9824565942999044,
1047
+ "sys_len": 226,
1048
+ "ref_len": 230,
1049
+ "sacrebleu": 0.4487375922561012,
1050
+ "score": 0.4487375922561012,
1051
+ "score_name": "sacrebleu",
1052
+ "score_ci_low": 0.3680871645957313,
1053
+ "score_ci_high": 0.5551991202375558,
1054
+ "sacrebleu_ci_low": 0.3680871645957313,
1055
+ "sacrebleu_ci_high": 0.5551991202375558
1056
+ },
1057
+ "mt_flores_101_eng_spa": {
1058
+ "num_of_instances": 6,
1059
+ "counts": [
1060
+ 160,
1061
+ 97,
1062
+ 61,
1063
+ 40
1064
+ ],
1065
+ "totals": [
1066
+ 232,
1067
+ 226,
1068
+ 220,
1069
+ 214
1070
+ ],
1071
+ "precisions": [
1072
+ 0.6896551724137931,
1073
+ 0.4292035398230089,
1074
+ 0.2772727272727273,
1075
+ 0.18691588785046728
1076
+ ],
1077
+ "bp": 0.9536926844755759,
1078
+ "sys_len": 232,
1079
+ "ref_len": 243,
1080
+ "sacrebleu": 0.3356376081723427,
1081
+ "score": 0.3356376081723427,
1082
+ "score_name": "sacrebleu",
1083
+ "score_ci_low": 0.27756379712473256,
1084
+ "score_ci_high": 0.4039979777599405,
1085
+ "sacrebleu_ci_low": 0.27756379712473256,
1086
+ "sacrebleu_ci_high": 0.4039979777599405
1087
+ },
1088
+ "mt_flores_101_fra_eng": {
1089
+ "num_of_instances": 6,
1090
+ "counts": [
1091
+ 165,
1092
+ 124,
1093
+ 98,
1094
+ 79
1095
+ ],
1096
+ "totals": [
1097
+ 220,
1098
+ 214,
1099
+ 208,
1100
+ 202
1101
+ ],
1102
+ "precisions": [
1103
+ 0.75,
1104
+ 0.5794392523364487,
1105
+ 0.47115384615384615,
1106
+ 0.3910891089108911
1107
+ ],
1108
+ "bp": 1.0,
1109
+ "sys_len": 220,
1110
+ "ref_len": 208,
1111
+ "sacrebleu": 0.5319574670672091,
1112
+ "score": 0.5319574670672091,
1113
+ "score_name": "sacrebleu",
1114
+ "score_ci_low": 0.39796202118207913,
1115
+ "score_ci_high": 0.6612896148004259,
1116
+ "sacrebleu_ci_low": 0.39796202118207913,
1117
+ "sacrebleu_ci_high": 0.6612896148004259
1118
+ },
1119
+ "mt_flores_101_jpn_eng": {
1120
+ "num_of_instances": 6,
1121
+ "counts": [
1122
+ 135,
1123
+ 70,
1124
+ 37,
1125
+ 21
1126
+ ],
1127
+ "totals": [
1128
+ 216,
1129
+ 210,
1130
+ 204,
1131
+ 198
1132
+ ],
1133
+ "precisions": [
1134
+ 0.625,
1135
+ 0.33333333333333337,
1136
+ 0.18137254901960784,
1137
+ 0.10606060606060605
1138
+ ],
1139
+ "bp": 1.0,
1140
+ "sys_len": 216,
1141
+ "ref_len": 208,
1142
+ "sacrebleu": 0.2516060651765726,
1143
+ "score": 0.2516060651765726,
1144
+ "score_name": "sacrebleu",
1145
+ "score_ci_low": 0.12146602398816096,
1146
+ "score_ci_high": 0.3230294156821773,
1147
+ "sacrebleu_ci_low": 0.12146602398816096,
1148
+ "sacrebleu_ci_high": 0.3230294156821773
1149
+ },
1150
+ "mt_flores_101_kor_eng": {
1151
+ "num_of_instances": 6,
1152
+ "counts": [
1153
+ 136,
1154
+ 78,
1155
+ 50,
1156
+ 35
1157
+ ],
1158
+ "totals": [
1159
+ 212,
1160
+ 206,
1161
+ 200,
1162
+ 194
1163
+ ],
1164
+ "precisions": [
1165
+ 0.6415094339622641,
1166
+ 0.3786407766990291,
1167
+ 0.25,
1168
+ 0.18041237113402062
1169
+ ],
1170
+ "bp": 1.0,
1171
+ "sys_len": 212,
1172
+ "ref_len": 208,
1173
+ "sacrebleu": 0.32352599996619197,
1174
+ "score": 0.32352599996619197,
1175
+ "score_name": "sacrebleu",
1176
+ "score_ci_low": 0.20829290856129712,
1177
+ "score_ci_high": 0.45329328917319234,
1178
+ "sacrebleu_ci_low": 0.20829290856129712,
1179
+ "sacrebleu_ci_high": 0.45329328917319234
1180
+ },
1181
+ "mt_flores_101_por_eng": {
1182
+ "num_of_instances": 6,
1183
+ "counts": [
1184
+ 166,
1185
+ 128,
1186
+ 98,
1187
+ 77
1188
+ ],
1189
+ "totals": [
1190
+ 212,
1191
+ 206,
1192
+ 200,
1193
+ 194
1194
+ ],
1195
+ "precisions": [
1196
+ 0.7830188679245284,
1197
+ 0.6213592233009709,
1198
+ 0.49,
1199
+ 0.3969072164948454
1200
+ ],
1201
+ "bp": 1.0,
1202
+ "sys_len": 212,
1203
+ "ref_len": 208,
1204
+ "sacrebleu": 0.5546257294515591,
1205
+ "score": 0.5546257294515591,
1206
+ "score_name": "sacrebleu",
1207
+ "score_ci_low": 0.4041829047396742,
1208
+ "score_ci_high": 0.6689609866438883,
1209
+ "sacrebleu_ci_low": 0.4041829047396742,
1210
+ "sacrebleu_ci_high": 0.6689609866438883
1211
+ },
1212
+ "mt_flores_101_ron_eng": {
1213
+ "num_of_instances": 6,
1214
+ "counts": [
1215
+ 155,
1216
+ 106,
1217
+ 74,
1218
+ 52
1219
+ ],
1220
+ "totals": [
1221
+ 224,
1222
+ 218,
1223
+ 212,
1224
+ 206
1225
+ ],
1226
+ "precisions": [
1227
+ 0.6919642857142857,
1228
+ 0.48623853211009177,
1229
+ 0.34905660377358494,
1230
+ 0.2524271844660194
1231
+ ],
1232
+ "bp": 1.0,
1233
+ "sys_len": 224,
1234
+ "ref_len": 208,
1235
+ "sacrebleu": 0.41494569039959667,
1236
+ "score": 0.41494569039959667,
1237
+ "score_name": "sacrebleu",
1238
+ "score_ci_low": 0.36976663169974777,
1239
+ "score_ci_high": 0.5018256593904208,
1240
+ "sacrebleu_ci_low": 0.36976663169974777,
1241
+ "sacrebleu_ci_high": 0.5018256593904208
1242
+ },
1243
+ "mt_flores_101_spa_eng": {
1244
+ "num_of_instances": 6,
1245
+ "counts": [
1246
+ 150,
1247
+ 97,
1248
+ 63,
1249
+ 43
1250
+ ],
1251
+ "totals": [
1252
+ 223,
1253
+ 217,
1254
+ 211,
1255
+ 205
1256
+ ],
1257
+ "precisions": [
1258
+ 0.6726457399103138,
1259
+ 0.4470046082949309,
1260
+ 0.2985781990521327,
1261
+ 0.20975609756097563
1262
+ ],
1263
+ "bp": 1.0,
1264
+ "sys_len": 223,
1265
+ "ref_len": 208,
1266
+ "sacrebleu": 0.37043991107495844,
1267
+ "score": 0.37043991107495844,
1268
+ "score_name": "sacrebleu",
1269
+ "score_ci_low": 0.341504023583382,
1270
+ "score_ci_high": 0.41393253157059545,
1271
+ "sacrebleu_ci_low": 0.341504023583382,
1272
+ "sacrebleu_ci_high": 0.41393253157059545
1273
+ },
1274
+ "score": 0.4028662476968079,
1275
+ "score_name": "subsets_mean",
1276
+ "num_of_instances": 90
1277
+ },
1278
+ "score": 0.5698160000855291,
1279
+ "score_name": "subsets_mean",
1280
+ "num_of_instances": 1537
1281
+ }
1282
+ }
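
Each per-language "sacrebleu" value in the file follows arithmetically from its logged n-gram statistics: precisions[n] = counts[n] / totals[n], the brevity penalty "bp" is derived from "sys_len" and "ref_len", and the score is bp times the geometric mean of the four precisions. Below is a minimal Python sketch, assuming the standard uniform-weight corpus-BLEU formula on the 0-1 scale used in this file; the helper name bleu_from_stats is illustrative and not part of unitxt or sacrebleu.

import math

def bleu_from_stats(counts, totals, sys_len, ref_len):
    # Modified n-gram precisions, as logged under "precisions"
    precisions = [c / t for c, t in zip(counts, totals)]
    # Brevity penalty, as logged under "bp"
    bp = 1.0 if sys_len >= ref_len else math.exp(1 - ref_len / sys_len)
    # bp times the geometric mean of the four precisions
    return bp * math.exp(sum(math.log(p) for p in precisions) / len(precisions))

# Numbers copied from the "mt_flores_101_eng_fra" entry above
score = bleu_from_stats([189, 149, 120, 99], [233, 227, 221, 215], 233, 235)
print(round(score, 6))  # ~0.598874, matching the stored "sacrebleu"/"score" value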