jbnayahu committed on
Commit 39310af · unverified · 1 Parent(s): 5dc0fc8

Updated results files


Signed-off-by: Jonathan Bnayahu <[email protected]>

Files changed (14)
  1. results/bluebench/2025-06-22T14-01-49_evaluation_results.json +1283 -0
  2. results/bluebench/{2025-06-19T17-18-35_evaluation_results.json → 2025-06-22T15-05-33_evaluation_results.json} +679 -679
  3. results/bluebench/2025-06-22T17-10-54_evaluation_results.json +1283 -0
  4. results/bluebench/2025-06-22T19-25-42_evaluation_results.json +1283 -0
  5. results/bluebench/{2025-06-19T15-57-45_evaluation_results.json → 2025-06-23T02-53-05_evaluation_results.json} +675 -675
  6. results/bluebench/{2025-06-19T16-09-06_evaluation_results.json → 2025-06-23T03-17-57_evaluation_results.json} +681 -681
  7. results/bluebench/{2025-06-19T16-21-09_evaluation_results.json → 2025-06-23T04-06-37_evaluation_results.json} +674 -674
  8. results/bluebench/2025-06-23T04-42-35_evaluation_results.json +1283 -0
  9. results/bluebench/{2025-06-19T18-10-05_evaluation_results.json → 2025-06-23T05-36-33_evaluation_results.json} +686 -686
  10. results/bluebench/{2025-06-19T20-10-50_evaluation_results.json → 2025-06-23T06-18-33_evaluation_results.json} +605 -605
  11. results/bluebench/{2025-06-21T08-38-27_evaluation_results.json → 2025-06-23T08-43-46_evaluation_results.json} +701 -701
  12. results/bluebench/{2025-06-19T21-59-04_evaluation_results.json → 2025-06-23T09-36-33_evaluation_results.json} +662 -662
  13. results/bluebench/{2025-06-21T09-36-54_evaluation_results.json → 2025-06-23T14-18-29_evaluation_results.json} +584 -584
  14. results/bluebench/{2025-06-21T11-34-24_evaluation_results.json → 2025-06-23T15-33-11_evaluation_results.json} +700 -700
results/bluebench/2025-06-22T14-01-49_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-22T18:01:46.346556Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-2-11b-vision-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.7444444444444445,
180
+ "accuracy_ci_low": 0.6555555555555556,
181
+ "accuracy_ci_high": 0.8333333333333334,
182
+ "score_name": "accuracy",
183
+ "score": 0.7444444444444445,
184
+ "score_ci_high": 0.8333333333333334,
185
+ "score_ci_low": 0.6555555555555556,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.7888888888888889,
190
+ "accuracy_ci_low": 0.7,
191
+ "accuracy_ci_high": 0.8666666666666667,
192
+ "score_name": "accuracy",
193
+ "score": 0.7888888888888889,
194
+ "score_ci_high": 0.8666666666666667,
195
+ "score_ci_low": 0.7,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.9111111111111111,
200
+ "accuracy_ci_low": 0.8444444444444444,
201
+ "accuracy_ci_high": 0.9555555555555556,
202
+ "score_name": "accuracy",
203
+ "score": 0.9111111111111111,
204
+ "score_ci_high": 0.9555555555555556,
205
+ "score_ci_low": 0.8444444444444444,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 0.7888888888888889,
210
+ "accuracy_ci_low": 0.7,
211
+ "accuracy_ci_high": 0.8666666666666667,
212
+ "score_name": "accuracy",
213
+ "score": 0.7888888888888889,
214
+ "score_ci_high": 0.8666666666666667,
215
+ "score_ci_low": 0.7,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.8111111111111111,
220
+ "accuracy_ci_low": 0.7222222222222222,
221
+ "accuracy_ci_high": 0.8888888888888888,
222
+ "score_name": "accuracy",
223
+ "score": 0.8111111111111111,
224
+ "score_ci_high": 0.8888888888888888,
225
+ "score_ci_low": 0.7222222222222222,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9,
230
+ "accuracy_ci_low": 0.8222222222222222,
231
+ "accuracy_ci_high": 0.9555555555555556,
232
+ "score_name": "accuracy",
233
+ "score": 0.9,
234
+ "score_ci_high": 0.9555555555555556,
235
+ "score_ci_low": 0.8222222222222222,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.9666666666666667,
240
+ "accuracy_ci_low": 0.9111111111111111,
241
+ "accuracy_ci_high": 0.9888888888888889,
242
+ "score_name": "accuracy",
243
+ "score": 0.9666666666666667,
244
+ "score_ci_high": 0.9888888888888889,
245
+ "score_ci_low": 0.9111111111111111,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.8666666666666667,
250
+ "accuracy_ci_low": 0.7888888888888889,
251
+ "accuracy_ci_high": 0.9333333333333333,
252
+ "score_name": "accuracy",
253
+ "score": 0.8666666666666667,
254
+ "score_ci_high": 0.9333333333333333,
255
+ "score_ci_low": 0.7888888888888889,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.9111111111111111,
260
+ "accuracy_ci_low": 0.8412016500028439,
261
+ "accuracy_ci_high": 0.9555555555555556,
262
+ "score_name": "accuracy",
263
+ "score": 0.9111111111111111,
264
+ "score_ci_high": 0.9555555555555556,
265
+ "score_ci_low": 0.8412016500028439,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.8444444444444444,
270
+ "accuracy_ci_low": 0.7666666666666667,
271
+ "accuracy_ci_high": 0.9111111111111111,
272
+ "score_name": "accuracy",
273
+ "score": 0.8444444444444444,
274
+ "score_ci_high": 0.9111111111111111,
275
+ "score_ci_low": 0.7666666666666667,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.9111111111111111,
280
+ "accuracy_ci_low": 0.8333333333333334,
281
+ "accuracy_ci_high": 0.9555555555555556,
282
+ "score_name": "accuracy",
283
+ "score": 0.9111111111111111,
284
+ "score_ci_high": 0.9555555555555556,
285
+ "score_ci_low": 0.8333333333333334,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.8585858585858586,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.07721639656816015,
296
+ "score": 0.07721639656816015,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.07721639656816015,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.48831168831168825,
307
+ "f1_Organization": 0.35220125786163525,
308
+ "f1_Location": 0.3775100401606426,
309
+ "f1_macro": 0.406007662111322,
310
+ "recall_macro": 0.3667818453974414,
311
+ "precision_macro": 0.4584981753989352,
312
+ "in_classes_support": 0.7834862385321101,
313
+ "f1_micro": 0.3682242990654206,
314
+ "recall_micro": 0.37523809523809526,
315
+ "precision_micro": 0.3614678899082569,
316
+ "score": 0.3682242990654206,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.32532095178317566,
319
+ "score_ci_high": 0.4180775144242145,
320
+ "f1_micro_ci_low": 0.32532095178317566,
321
+ "f1_micro_ci_high": 0.4180775144242145
322
+ },
323
+ "score": 0.3682242990654206,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.5633802816901409,
330
+ "accuracy_ci_low": 0.4507042253521127,
331
+ "accuracy_ci_high": 0.676056338028169,
332
+ "score_name": "accuracy",
333
+ "score": 0.5633802816901409,
334
+ "score_ci_high": 0.676056338028169,
335
+ "score_ci_low": 0.4507042253521127,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.2535211267605634,
340
+ "accuracy_ci_low": 0.15492957746478872,
341
+ "accuracy_ci_high": 0.36619718309859156,
342
+ "score_name": "accuracy",
343
+ "score": 0.2535211267605634,
344
+ "score_ci_high": 0.36619718309859156,
345
+ "score_ci_low": 0.15492957746478872,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.23943661971830985,
350
+ "accuracy_ci_low": 0.15492957746478872,
351
+ "accuracy_ci_high": 0.352112676056338,
352
+ "score_name": "accuracy",
353
+ "score": 0.23943661971830985,
354
+ "score_ci_high": 0.352112676056338,
355
+ "score_ci_low": 0.15492957746478872,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.36619718309859156,
360
+ "accuracy_ci_low": 0.2535211267605634,
361
+ "accuracy_ci_high": 0.4788732394366197,
362
+ "score_name": "accuracy",
363
+ "score": 0.36619718309859156,
364
+ "score_ci_high": 0.4788732394366197,
365
+ "score_ci_low": 0.2535211267605634,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.5492957746478874,
370
+ "accuracy_ci_low": 0.43661971830985913,
371
+ "accuracy_ci_high": 0.6619718309859155,
372
+ "score_name": "accuracy",
373
+ "score": 0.5492957746478874,
374
+ "score_ci_high": 0.6619718309859155,
375
+ "score_ci_low": 0.43661971830985913,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.23943661971830985,
380
+ "accuracy_ci_low": 0.15492957746478872,
381
+ "accuracy_ci_high": 0.352112676056338,
382
+ "score_name": "accuracy",
383
+ "score": 0.23943661971830985,
384
+ "score_ci_high": 0.352112676056338,
385
+ "score_ci_low": 0.15492957746478872,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.4788732394366197,
390
+ "accuracy_ci_low": 0.36619718309859156,
391
+ "accuracy_ci_high": 0.6056338028169014,
392
+ "score_name": "accuracy",
393
+ "score": 0.4788732394366197,
394
+ "score_ci_high": 0.6056338028169014,
395
+ "score_ci_low": 0.36619718309859156,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.5070422535211268,
400
+ "accuracy_ci_low": 0.39436619718309857,
401
+ "accuracy_ci_high": 0.6197183098591549,
402
+ "score_name": "accuracy",
403
+ "score": 0.5070422535211268,
404
+ "score_ci_high": 0.6197183098591549,
405
+ "score_ci_low": 0.39436619718309857,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.30985915492957744,
410
+ "accuracy_ci_low": 0.2112676056338028,
411
+ "accuracy_ci_high": 0.42820969566908634,
412
+ "score_name": "accuracy",
413
+ "score": 0.30985915492957744,
414
+ "score_ci_high": 0.42820969566908634,
415
+ "score_ci_low": 0.2112676056338028,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.14084507042253522,
420
+ "accuracy_ci_low": 0.07042253521126761,
421
+ "accuracy_ci_high": 0.22535211267605634,
422
+ "score_name": "accuracy",
423
+ "score": 0.14084507042253522,
424
+ "score_ci_high": 0.22535211267605634,
425
+ "score_ci_low": 0.07042253521126761,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.28169014084507044,
430
+ "accuracy_ci_low": 0.18309859154929578,
431
+ "accuracy_ci_high": 0.39436619718309857,
432
+ "score_name": "accuracy",
433
+ "score": 0.28169014084507044,
434
+ "score_ci_high": 0.39436619718309857,
435
+ "score_ci_low": 0.18309859154929578,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.4507042253521127,
440
+ "accuracy_ci_low": 0.323943661971831,
441
+ "accuracy_ci_high": 0.5633802816901409,
442
+ "score_name": "accuracy",
443
+ "score": 0.4507042253521127,
444
+ "score_ci_high": 0.5633802816901409,
445
+ "score_ci_low": 0.323943661971831,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.2535211267605634,
450
+ "accuracy_ci_low": 0.16901408450704225,
451
+ "accuracy_ci_high": 0.36619718309859156,
452
+ "score_name": "accuracy",
453
+ "score": 0.2535211267605634,
454
+ "score_ci_high": 0.36619718309859156,
455
+ "score_ci_low": 0.16901408450704225,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.5774647887323944,
460
+ "accuracy_ci_low": 0.4507042253521127,
461
+ "accuracy_ci_high": 0.6894343225712088,
462
+ "score_name": "accuracy",
463
+ "score": 0.5774647887323944,
464
+ "score_ci_high": 0.6894343225712088,
465
+ "score_ci_low": 0.4507042253521127,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.3722334004024145,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.5662558356676004,
475
+ "f1_suggestive": 0.4666666666666667,
476
+ "f1_arbitrary": 0.4444444444444444,
477
+ "f1_generic": 0.8571428571428571,
478
+ "f1_fanciful": 0.35714285714285715,
479
+ "f1_descriptive": 0.7058823529411765,
480
+ "f1_macro_ci_low": 0.47410052522342583,
481
+ "f1_macro_ci_high": 0.6713730404881563,
482
+ "score_name": "f1_micro",
483
+ "score": 0.5575757575757576,
484
+ "score_ci_high": 0.6506589298059469,
485
+ "score_ci_low": 0.4457831325301205,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.5411764705882353,
488
+ "accuracy_ci_low": 0.43529411764705883,
489
+ "accuracy_ci_high": 0.6352941176470588,
490
+ "f1_micro": 0.5575757575757576,
491
+ "f1_micro_ci_low": 0.4457831325301205,
492
+ "f1_micro_ci_high": 0.6506589298059469
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.575166034793364,
496
+ "f1_no": 0.6877470355731226,
497
+ "f1_yes": 0.46258503401360546,
498
+ "f1_macro_ci_low": 0.5066495066495067,
499
+ "f1_macro_ci_high": 0.6496773446094443,
500
+ "score_name": "f1_micro",
501
+ "score": 0.605,
502
+ "score_ci_high": 0.67,
503
+ "score_ci_low": 0.535,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.605,
506
+ "accuracy_ci_low": 0.535,
507
+ "accuracy_ci_high": 0.67,
508
+ "f1_micro": 0.605,
509
+ "f1_micro_ci_low": 0.535,
510
+ "f1_micro_ci_high": 0.67
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.3240051765352555,
514
+ "f1_conclusion": 0.0975609756097561,
515
+ "f1_analysis": 0.509090909090909,
516
+ "f1_decree": 0.34285714285714286,
517
+ "f1_issue": 0.22641509433962265,
518
+ "f1_procedural history": 0.29850746268656714,
519
+ "f1_facts": 0.4186046511627907,
520
+ "f1_rule": 0.375,
521
+ "f1_macro_ci_low": 0.2737291340244584,
522
+ "f1_macro_ci_high": 0.39709087675818633,
523
+ "score_name": "f1_micro",
524
+ "score": 0.3526448362720403,
525
+ "score_ci_high": 0.42317380352644834,
526
+ "score_ci_low": 0.29292929292929293,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.35,
529
+ "accuracy_ci_low": 0.29,
530
+ "accuracy_ci_high": 0.42,
531
+ "f1_micro": 0.3526448362720403,
532
+ "f1_micro_ci_low": 0.29292929292929293,
533
+ "f1_micro_ci_high": 0.42317380352644834
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5769837972579975,
537
+ "f1_yes": 0.45517241379310347,
538
+ "f1_no": 0.6987951807228916,
539
+ "f1_macro_ci_low": 0.5127178863190986,
540
+ "f1_macro_ci_high": 0.6553872211311121,
541
+ "score_name": "f1_micro",
542
+ "score": 0.6091370558375635,
543
+ "score_ci_high": 0.6785772255666204,
544
+ "score_ci_low": 0.5449871465295629,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.6,
547
+ "accuracy_ci_low": 0.535,
548
+ "accuracy_ci_high": 0.67,
549
+ "f1_micro": 0.6091370558375635,
550
+ "f1_micro_ci_low": 0.5449871465295629,
551
+ "f1_micro_ci_high": 0.6785772255666204
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.9404761904761905,
555
+ "f1_yes": 0.9523809523809523,
556
+ "f1_no": 0.9285714285714286,
557
+ "f1_macro_ci_low": 0.8717038360531253,
558
+ "f1_macro_ci_high": 0.9763503609021853,
559
+ "score_name": "f1_micro",
560
+ "score": 0.9404761904761905,
561
+ "score_ci_high": 0.9764705882352941,
562
+ "score_ci_low": 0.8724795930656631,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.9294117647058824,
565
+ "accuracy_ci_low": 0.8470588235294118,
566
+ "accuracy_ci_high": 0.9764705882352941,
567
+ "f1_micro": 0.9404761904761905,
568
+ "f1_micro_ci_low": 0.8724795930656631,
569
+ "f1_micro_ci_high": 0.9764705882352941
570
+ },
571
+ "score": 0.6129667680323103,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.5153316959213506,
578
+ "f1_cars": 0.7317073170731707,
579
+ "f1_windows x": 0.08450704225352113,
580
+ "f1_computer graphics": 0.4948453608247423,
581
+ "f1_atheism": 0.2978723404255319,
582
+ "f1_religion": 0.05263157894736842,
583
+ "f1_medicine": 0.7733333333333333,
584
+ "f1_christianity": 0.5806451612903226,
585
+ "f1_microsoft windows": 0.4507042253521127,
586
+ "f1_middle east": 0.32727272727272727,
587
+ "f1_politics": 0.4132231404958678,
588
+ "f1_motorcycles": 0.7058823529411765,
589
+ "f1_pc hardware": 0.48520710059171596,
590
+ "f1_mac hardware": 0.5057471264367817,
591
+ "f1_electronics": 0.48739495798319327,
592
+ "f1_for sale": 0.5,
593
+ "f1_guns": 0.28125,
594
+ "f1_space": 0.7659574468085106,
595
+ "f1_cryptography": 0.6,
596
+ "f1_baseball": 0.8813559322033898,
597
+ "f1_hockey": 0.8870967741935484,
598
+ "f1_macro_ci_low": 0.4890786960094656,
599
+ "f1_macro_ci_high": 0.5464781246183315,
600
+ "score_name": "f1_micro",
601
+ "score": 0.5437325905292479,
602
+ "score_ci_high": 0.5741315636296753,
603
+ "score_ci_low": 0.5090753018614114,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.488,
606
+ "accuracy_ci_low": 0.454,
607
+ "accuracy_ci_high": 0.519,
608
+ "f1_micro": 0.5437325905292479,
609
+ "f1_micro_ci_low": 0.5090753018614114,
610
+ "f1_micro_ci_high": 0.5741315636296753
611
+ },
612
+ "score": 0.5437325905292479,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.6685227589041403,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9225251076040172,
620
+ "f1_checking or savings account": 0.5806451612903226,
621
+ "f1_debt collection": 0.5274725274725275,
622
+ "f1_credit card or prepaid card": 0.6371681415929203,
623
+ "f1_mortgage": 0.8059701492537313,
624
+ "f1_student loan": 0.8571428571428571,
625
+ "f1_money transfer or virtual currency or money service": 0.6181818181818182,
626
+ "f1_vehicle loan or lease": 0.6060606060606061,
627
+ "f1_payday loan or title loan or personal loan": 0.46153846153846156,
628
+ "f1_macro_ci_low": 0.6111841538128283,
629
+ "f1_macro_ci_high": 0.7335266591830523,
630
+ "score_name": "f1_micro",
631
+ "score": 0.8321536905965622,
632
+ "score_ci_high": 0.85326682230999,
633
+ "score_ci_low": 0.80760586975502,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.823,
636
+ "accuracy_ci_low": 0.7962032615906698,
637
+ "accuracy_ci_high": 0.8449169646606582,
638
+ "f1_micro": 0.8321536905965622,
639
+ "f1_micro_ci_low": 0.80760586975502,
640
+ "f1_micro_ci_high": 0.85326682230999
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.6712830729866164,
644
+ "f1_mortgages and loans": 0.7439024390243902,
645
+ "f1_credit card": 0.7777777777777778,
646
+ "f1_debt collection": 0.6571428571428571,
647
+ "f1_credit reporting": 0.7817589576547231,
648
+ "f1_retail banking": 0.3958333333333333,
649
+ "f1_macro_ci_low": 0.6287200378375363,
650
+ "f1_macro_ci_high": 0.7180257299728254,
651
+ "score_name": "f1_micro",
652
+ "score": 0.7097435897435898,
653
+ "score_ci_high": 0.7484617342104366,
654
+ "score_ci_low": 0.6680812073559,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.692,
657
+ "accuracy_ci_low": 0.648,
658
+ "accuracy_ci_high": 0.732,
659
+ "f1_micro": 0.7097435897435898,
660
+ "f1_micro_ci_low": 0.6680812073559,
661
+ "f1_micro_ci_high": 0.7484617342104366
662
+ },
663
+ "score": 0.770948640170076,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "program_accuracy": 0.063,
671
+ "score": 0.063,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.053,
674
+ "program_accuracy_ci_low": 0.049,
675
+ "program_accuracy_ci_high": 0.07883525503658394,
676
+ "score_ci_low": 0.049,
677
+ "score_ci_high": 0.07883525503658394,
678
+ "execution_accuracy_ci_low": 0.04,
679
+ "execution_accuracy_ci_high": 0.06776975208467821
680
+ },
681
+ "score": 0.063,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.3102327261456618,
688
+ "recall": 0.5256916602580562,
689
+ "f1": 0.3173216770860886,
690
+ "precision_ci_low": 0.2902550859846034,
691
+ "precision_ci_high": 0.33175084087869144,
692
+ "recall_ci_low": 0.5093303819915139,
693
+ "recall_ci_high": 0.5427896271119333,
694
+ "f1_ci_low": 0.3004856741878323,
695
+ "f1_ci_high": 0.3346952161946201,
696
+ "score_name": "f1",
697
+ "score": 0.3173216770860886,
698
+ "score_ci_high": 0.3346952161946201,
699
+ "score_ci_low": 0.3004856741878323,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5790909464160602,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6590411880612373,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5406127203504244,
704
+ "faithfullness_f1_token_overlap": 0.2677069264827017,
705
+ "faithfullness_recall_token_overlap": 0.2048583040694459,
706
+ "faithfullness_precision_token_overlap": 0.5374826116531715,
707
+ "correctness_f1_token_overlap": 0.3173216770860886,
708
+ "correctness_recall_token_overlap": 0.5256916602580562,
709
+ "correctness_precision_token_overlap": 0.3102327261456618
710
+ },
711
+ "score": 0.3173216770860886,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.472,
718
+ "accuracy_ci_low": 0.4377807805792856,
719
+ "accuracy_ci_high": 0.501,
720
+ "score_name": "accuracy",
721
+ "score": 0.472,
722
+ "score_ci_high": 0.501,
723
+ "score_ci_low": 0.4377807805792856,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.79,
728
+ "accuracy_ci_low": 0.756,
729
+ "accuracy_ci_high": 0.8278621971546426,
730
+ "score_name": "accuracy",
731
+ "score": 0.79,
732
+ "score_ci_high": 0.8278621971546426,
733
+ "score_ci_low": 0.756,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.631,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 0.0,
743
+ "severity_high": 0.0,
744
+ "severity_medium": 2.0,
745
+ "severity_low": 98.0,
746
+ "category_harmful_info": 0.828103977861622,
747
+ "category_deception": 0.8362863158073389,
748
+ "category_explicit_content": 0.8723303774616333,
749
+ "category_violence": 0.8265416233608173,
750
+ "category_discrimination": 0.7366517979710822,
751
+ "category_substance_abuse": 0.8473929514487585,
752
+ "category_pii": 0.8314695911550963,
753
+ "safety": 0.8279858794021938,
754
+ "safety_ci_low": 0.8061898558946505,
755
+ "safety_ci_high": 0.84660034746746,
756
+ "score_name": "safety",
757
+ "score": 0.8279858794021938,
758
+ "score_ci_high": 0.84660034746746,
759
+ "score_ci_low": 0.8061898558946505,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.8279858794021938,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rouge1": 0.42870762951811686,
770
+ "rouge2": 0.22634718223322206,
771
+ "rougeL": 0.3064227533266295,
772
+ "score": 0.3064227533266295,
773
+ "score_name": "rougeL",
774
+ "rougeLsum": 0.37414452263718584,
775
+ "rouge1_ci_low": 0.41813186584018475,
776
+ "rouge1_ci_high": 0.4383682523831221,
777
+ "rouge2_ci_low": 0.2179836143855743,
778
+ "rouge2_ci_high": 0.234852562715,
779
+ "rougeL_ci_low": 0.29790065420910344,
780
+ "rougeL_ci_high": 0.3146437618343804,
781
+ "score_ci_low": 0.29790065420910344,
782
+ "score_ci_high": 0.3146437618343804,
783
+ "rougeLsum_ci_low": 0.36454056998527534,
784
+ "rougeLsum_ci_high": 0.3838301065902944
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rouge1": 0.12785522529780569,
789
+ "rouge2": 0.018132164508293067,
790
+ "rougeL": 0.09085147406577235,
791
+ "score": 0.09085147406577235,
792
+ "score_name": "rougeL",
793
+ "rougeLsum": 0.10491828744788975,
794
+ "rouge1_ci_low": 0.12179050419663484,
795
+ "rouge1_ci_high": 0.13318684044580203,
796
+ "rouge2_ci_low": 0.016258834518891666,
797
+ "rouge2_ci_high": 0.02026468013917415,
798
+ "rougeL_ci_low": 0.08692929955144628,
799
+ "rougeL_ci_high": 0.0946230347296095,
800
+ "score_ci_low": 0.08692929955144628,
801
+ "score_ci_high": 0.0946230347296095,
802
+ "rougeLsum_ci_low": 0.10019902672341267,
803
+ "rougeLsum_ci_high": 0.10933176458351264
804
+ },
805
+ "score": 0.1986371136962009,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1196,
814
+ 710,
815
+ 465,
816
+ 324
817
+ ],
818
+ "totals": [
819
+ 1814,
820
+ 1748,
821
+ 1682,
822
+ 1616
823
+ ],
824
+ "precisions": [
825
+ 0.659316427783903,
826
+ 0.40617848970251713,
827
+ 0.27645659928656363,
828
+ 0.20049504950495048
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 1814,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.3490481641487808,
834
+ "score": 0.3490481641487808,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.2982501441000675,
837
+ "score_ci_high": 0.39380586753445035,
838
+ "sacrebleu_ci_low": 0.2982501441000675,
839
+ "sacrebleu_ci_high": 0.39380586753445035
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1266,
845
+ 804,
846
+ 543,
847
+ 375
848
+ ],
849
+ "totals": [
850
+ 1788,
851
+ 1722,
852
+ 1656,
853
+ 1590
854
+ ],
855
+ "precisions": [
856
+ 0.7080536912751678,
857
+ 0.46689895470383275,
858
+ 0.32789855072463764,
859
+ 0.2358490566037736
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 1788,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.39986710952008375,
865
+ "score": 0.39986710952008375,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.3581256368637932,
868
+ "score_ci_high": 0.44700899058600674,
869
+ "sacrebleu_ci_low": 0.3581256368637932,
870
+ "sacrebleu_ci_high": 0.44700899058600674
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 809,
876
+ 376,
877
+ 189,
878
+ 90
879
+ ],
880
+ "totals": [
881
+ 1642,
882
+ 1576,
883
+ 1510,
884
+ 1444
885
+ ],
886
+ "precisions": [
887
+ 0.4926918392204629,
888
+ 0.23857868020304568,
889
+ 0.1251655629139073,
890
+ 0.062326869806094184
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 1642,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.17401704653688835,
896
+ "score": 0.17401704653688835,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.1499482262533421,
899
+ "score_ci_high": 0.19937003139575787,
900
+ "sacrebleu_ci_low": 0.1499482262533421,
901
+ "sacrebleu_ci_high": 0.19937003139575787
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 1142,
907
+ 633,
908
+ 396,
909
+ 251
910
+ ],
911
+ "totals": [
912
+ 1860,
913
+ 1794,
914
+ 1728,
915
+ 1662
916
+ ],
917
+ "precisions": [
918
+ 0.613978494623656,
919
+ 0.3528428093645485,
920
+ 0.22916666666666669,
921
+ 0.1510228640192539
922
+ ],
923
+ "bp": 1.0,
924
+ "sys_len": 1860,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.29426061967472056,
927
+ "score": 0.29426061967472056,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.24153648652872883,
930
+ "score_ci_high": 0.3377937358140578,
931
+ "sacrebleu_ci_low": 0.24153648652872883,
932
+ "sacrebleu_ci_high": 0.3377937358140578
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1456,
938
+ 1023,
939
+ 777,
940
+ 595
941
+ ],
942
+ "totals": [
943
+ 2053,
944
+ 1987,
945
+ 1921,
946
+ 1855
947
+ ],
948
+ "precisions": [
949
+ 0.7092060399415491,
950
+ 0.5148465022647207,
951
+ 0.4044768349817803,
952
+ 0.32075471698113206
953
+ ],
954
+ "bp": 0.9927202458072129,
955
+ "sys_len": 2053,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.46313340131929615,
958
+ "score": 0.46313340131929615,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.4233841077081067,
961
+ "score_ci_high": 0.5119360540835911,
962
+ "sacrebleu_ci_low": 0.4233841077081067,
963
+ "sacrebleu_ci_high": 0.5119360540835911
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 1292,
969
+ 644,
970
+ 363,
971
+ 211
972
+ ],
973
+ "totals": [
974
+ 2519,
975
+ 2453,
976
+ 2387,
977
+ 2321
978
+ ],
979
+ "precisions": [
980
+ 0.5129019452163557,
981
+ 0.26253567060741945,
982
+ 0.15207373271889402,
983
+ 0.09090909090909091
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 2519,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.2077165240938849,
989
+ "score": 0.2077165240938849,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.1859830684811085,
992
+ "score_ci_high": 0.23202144404185795,
993
+ "sacrebleu_ci_low": 0.1859830684811085,
994
+ "sacrebleu_ci_high": 0.23202144404185795
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1391,
1000
+ 964,
1001
+ 706,
1002
+ 526
1003
+ ],
1004
+ "totals": [
1005
+ 1932,
1006
+ 1866,
1007
+ 1800,
1008
+ 1734
1009
+ ],
1010
+ "precisions": [
1011
+ 0.7199792960662527,
1012
+ 0.5166130760986066,
1013
+ 0.3922222222222222,
1014
+ 0.3033448673587082
1015
+ ],
1016
+ "bp": 1.0,
1017
+ "sys_len": 1932,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.4586575663502692,
1020
+ "score": 0.4586575663502692,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.4004417903982224,
1023
+ "score_ci_high": 0.4989599670645679,
1024
+ "sacrebleu_ci_low": 0.4004417903982224,
1025
+ "sacrebleu_ci_high": 0.4989599670645679
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 1307,
1031
+ 878,
1032
+ 615,
1033
+ 449
1034
+ ],
1035
+ "totals": [
1036
+ 1965,
1037
+ 1899,
1038
+ 1833,
1039
+ 1767
1040
+ ],
1041
+ "precisions": [
1042
+ 0.6651399491094148,
1043
+ 0.4623486045286993,
1044
+ 0.3355155482815057,
1045
+ 0.25410299943406905
1046
+ ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 1965,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.4023937777690479,
1051
+ "score": 0.4023937777690479,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.36178210560248414,
1054
+ "score_ci_high": 0.4461521227098032,
1055
+ "sacrebleu_ci_low": 0.36178210560248414,
1056
+ "sacrebleu_ci_high": 0.4461521227098032
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1254,
1062
+ 673,
1063
+ 395,
1064
+ 238
1065
+ ],
1066
+ "totals": [
1067
+ 2011,
1068
+ 1945,
1069
+ 1879,
1070
+ 1813
1071
+ ],
1072
+ "precisions": [
1073
+ 0.6235703630034809,
1074
+ 0.3460154241645244,
1075
+ 0.21021820117083553,
1076
+ 0.1312741312741313
1077
+ ],
1078
+ "bp": 0.9576603939644929,
1079
+ "sys_len": 2011,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.266022962078398,
1082
+ "score": 0.266022962078398,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.23487137512560524,
1085
+ "score_ci_high": 0.3006336038696202,
1086
+ "sacrebleu_ci_low": 0.23487137512560524,
1087
+ "sacrebleu_ci_high": 0.3006336038696202
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1307,
1093
+ 854,
1094
+ 592,
1095
+ 423
1096
+ ],
1097
+ "totals": [
1098
+ 1835,
1099
+ 1769,
1100
+ 1703,
1101
+ 1637
1102
+ ],
1103
+ "precisions": [
1104
+ 0.7122615803814714,
1105
+ 0.48275862068965514,
1106
+ 0.3476218438050499,
1107
+ 0.2583995113011607
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 1835,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.419220079381378,
1113
+ "score": 0.419220079381378,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.37867823012959856,
1116
+ "score_ci_high": 0.457201247333676,
1117
+ "sacrebleu_ci_low": 0.37867823012959856,
1118
+ "sacrebleu_ci_high": 0.457201247333676
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 1067,
1124
+ 535,
1125
+ 301,
1126
+ 173
1127
+ ],
1128
+ "totals": [
1129
+ 1828,
1130
+ 1762,
1131
+ 1696,
1132
+ 1630
1133
+ ],
1134
+ "precisions": [
1135
+ 0.5836980306345734,
1136
+ 0.30363223609534623,
1137
+ 0.17747641509433962,
1138
+ 0.10613496932515337
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 1828,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.24037196462822435,
1144
+ "score": 0.24037196462822435,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.20050825204453002,
1147
+ "score_ci_high": 0.29070324343505133,
1148
+ "sacrebleu_ci_low": 0.20050825204453002,
1149
+ "sacrebleu_ci_high": 0.29070324343505133
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 1010,
1155
+ 482,
1156
+ 265,
1157
+ 153
1158
+ ],
1159
+ "totals": [
1160
+ 1770,
1161
+ 1704,
1162
+ 1638,
1163
+ 1572
1164
+ ],
1165
+ "precisions": [
1166
+ 0.5706214689265537,
1167
+ 0.2828638497652582,
1168
+ 0.16178266178266176,
1169
+ 0.09732824427480916
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 1770,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.22452985981795862,
1175
+ "score": 0.22452985981795862,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.19219307877931052,
1178
+ "score_ci_high": 0.276921223757092,
1179
+ "sacrebleu_ci_low": 0.19219307877931052,
1180
+ "sacrebleu_ci_high": 0.276921223757092
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1324,
1186
+ 914,
1187
+ 671,
1188
+ 506
1189
+ ],
1190
+ "totals": [
1191
+ 1810,
1192
+ 1744,
1193
+ 1678,
1194
+ 1612
1195
+ ],
1196
+ "precisions": [
1197
+ 0.7314917127071823,
1198
+ 0.5240825688073394,
1199
+ 0.39988081048867696,
1200
+ 0.31389578163771714
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 1810,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.4683616120269589,
1206
+ "score": 0.4683616120269589,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.40532389997385815,
1209
+ "score_ci_high": 0.5208761253300637,
1210
+ "sacrebleu_ci_low": 0.40532389997385815,
1211
+ "sacrebleu_ci_high": 0.5208761253300637
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1284,
1217
+ 865,
1218
+ 609,
1219
+ 426
1220
+ ],
1221
+ "totals": [
1222
+ 1812,
1223
+ 1746,
1224
+ 1680,
1225
+ 1614
1226
+ ],
1227
+ "precisions": [
1228
+ 0.7086092715231789,
1229
+ 0.49541809851088203,
1230
+ 0.3625,
1231
+ 0.26394052044609667
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 1812,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.42810292438852193,
1237
+ "score": 0.42810292438852193,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.3881114596567753,
1240
+ "score_ci_high": 0.4774362643095391,
1241
+ "sacrebleu_ci_low": 0.3881114596567753,
1242
+ "sacrebleu_ci_high": 0.4774362643095391
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1177,
1248
+ 630,
1249
+ 374,
1250
+ 229
1251
+ ],
1252
+ "totals": [
1253
+ 1912,
1254
+ 1846,
1255
+ 1780,
1256
+ 1714
1257
+ ],
1258
+ "precisions": [
1259
+ 0.6155857740585774,
1260
+ 0.3412784398699892,
1261
+ 0.2101123595505618,
1262
+ 0.13360560093348892
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 1912,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.2771203526997782,
1268
+ "score": 0.2771203526997782,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.25170081898114677,
1271
+ "score_ci_high": 0.31867748756379854,
1272
+ "sacrebleu_ci_low": 0.25170081898114677,
1273
+ "sacrebleu_ci_high": 0.31867748756379854
1274
+ },
1275
+ "score": 0.33818826429561266,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.460003145217968,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
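
(For readers pulling these files outside the diff view: a minimal, illustrative sketch of inspecting one of the result files added in this commit. The path below is the first file in the list above, and the keys used are exactly those visible in the JSON: "environment_info", "parsed_arguments", "results", "score", "score_name", "num_of_instances". The script is not part of the repository.)

import json

# Illustrative only: load one result file from this commit and print the
# overall BlueBench score plus each subset mean. Path and keys are taken
# from the JSON shown above.
path = "results/bluebench/2025-06-22T14-01-49_evaluation_results.json"

with open(path) as f:
    data = json.load(f)

model = data["environment_info"]["parsed_arguments"]["model"]
results = data["results"]

print(f"model: {model}")
print(f"overall {results['score_name']}: {results['score']:.4f}")

# Apart from the aggregate "score"/"score_name"/"num_of_instances" entries,
# every top-level key under "results" is a subset with its own mean score.
for subset, values in results.items():
    if isinstance(values, dict):
        print(f"  {subset}: {values['score']:.4f} ({values['num_of_instances']} instances)")
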
results/bluebench/{2025-06-19T17-18-35_evaluation_results.json → 2025-06-22T15-05-33_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-19T21:18:30.246956Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,226 +176,206 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.34444444444444444,
180
- "accuracy_ci_low": 0.24444444444444444,
181
- "accuracy_ci_high": 0.4444444444444444,
182
  "score_name": "accuracy",
183
- "score": 0.34444444444444444,
184
- "score_ci_high": 0.4444444444444444,
185
- "score_ci_low": 0.24444444444444444,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.37777777777777777,
190
- "accuracy_ci_low": 0.2777777777777778,
191
- "accuracy_ci_high": 0.4817573779444034,
192
  "score_name": "accuracy",
193
- "score": 0.37777777777777777,
194
- "score_ci_high": 0.4817573779444034,
195
- "score_ci_low": 0.2777777777777778,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.4222222222222222,
200
- "accuracy_ci_low": 0.32222222222222224,
201
- "accuracy_ci_high": 0.5222222222222223,
202
  "score_name": "accuracy",
203
- "score": 0.4222222222222222,
204
- "score_ci_high": 0.5222222222222223,
205
- "score_ci_low": 0.32222222222222224,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.3888888888888889,
210
  "accuracy_ci_low": 0.28888888888888886,
211
- "accuracy_ci_high": 0.4888888888888889,
212
  "score_name": "accuracy",
213
- "score": 0.3888888888888889,
214
- "score_ci_high": 0.4888888888888889,
215
  "score_ci_low": 0.28888888888888886,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.3333333333333333,
220
- "accuracy_ci_low": 0.24444444444444444,
221
- "accuracy_ci_high": 0.43333333333333335,
222
  "score_name": "accuracy",
223
- "score": 0.3333333333333333,
224
- "score_ci_high": 0.43333333333333335,
225
- "score_ci_low": 0.24444444444444444,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.45555555555555555,
230
- "accuracy_ci_low": 0.35555555555555557,
231
- "accuracy_ci_high": 0.5555555555555556,
232
  "score_name": "accuracy",
233
- "score": 0.45555555555555555,
234
- "score_ci_high": 0.5555555555555556,
235
- "score_ci_low": 0.35555555555555557,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.4777777777777778,
240
- "accuracy_ci_low": 0.37777777777777777,
241
- "accuracy_ci_high": 0.5888888888888889,
242
  "score_name": "accuracy",
243
- "score": 0.4777777777777778,
244
- "score_ci_high": 0.5888888888888889,
245
- "score_ci_low": 0.37777777777777777,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 0.4222222222222222,
250
- "accuracy_ci_low": 0.3333333333333333,
251
- "accuracy_ci_high": 0.5333333333333333,
252
  "score_name": "accuracy",
253
- "score": 0.4222222222222222,
254
- "score_ci_high": 0.5333333333333333,
255
- "score_ci_low": 0.3333333333333333,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.4888888888888889,
260
- "accuracy_ci_low": 0.37777777777777777,
261
- "accuracy_ci_high": 0.5888888888888889,
262
  "score_name": "accuracy",
263
- "score": 0.4888888888888889,
264
- "score_ci_high": 0.5888888888888889,
265
- "score_ci_low": 0.37777777777777777,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
  "accuracy": 0.5111111111111111,
270
  "accuracy_ci_low": 0.4111111111111111,
271
- "accuracy_ci_high": 0.6111111111111112,
272
  "score_name": "accuracy",
273
  "score": 0.5111111111111111,
274
- "score_ci_high": 0.6111111111111112,
275
  "score_ci_low": 0.4111111111111111,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.4,
280
  "accuracy_ci_low": 0.3,
281
- "accuracy_ci_high": 0.5111111111111111,
282
  "score_name": "accuracy",
283
- "score": 0.4,
284
- "score_ci_high": 0.5111111111111111,
285
  "score_ci_low": 0.3,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.4202020202020202,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.4545454545454546,
307
- "f1_Organization": 0.2292490118577075,
308
- "f1_Location": 0.227027027027027,
309
- "f1_macro": 0.3036071644767297,
310
- "recall_macro": 0.22361127874697093,
311
- "precision_macro": 0.5114786350741407,
312
- "in_classes_support": 0.7476923076923077,
313
- "f1_micro": 0.2941176470588235,
314
- "recall_micro": 0.23809523809523808,
315
- "precision_micro": 0.38461538461538464,
316
- "score": 0.2941176470588235,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.258452733561647,
319
- "score_ci_high": 0.33538361058823213,
320
- "f1_micro_ci_low": 0.258452733561647,
321
- "f1_micro_ci_high": 0.33538361058823213
322
  },
323
- "score": 0.2941176470588235,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.30985915492957744,
330
- "accuracy_ci_low": 0.19718309859154928,
331
- "accuracy_ci_high": 0.428782341390215,
332
  "score_name": "accuracy",
333
- "score": 0.30985915492957744,
334
- "score_ci_high": 0.428782341390215,
335
- "score_ci_low": 0.19718309859154928,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.15492957746478872,
340
- "accuracy_ci_low": 0.08450704225352113,
341
- "accuracy_ci_high": 0.2535211267605634,
342
  "score_name": "accuracy",
343
- "score": 0.15492957746478872,
344
- "score_ci_high": 0.2535211267605634,
345
- "score_ci_low": 0.08450704225352113,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.18309859154929578,
350
- "accuracy_ci_low": 0.11267605633802817,
351
- "accuracy_ci_high": 0.28169014084507044,
352
  "score_name": "accuracy",
353
- "score": 0.18309859154929578,
354
- "score_ci_high": 0.28169014084507044,
355
- "score_ci_low": 0.11267605633802817,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.19718309859154928,
360
- "accuracy_ci_low": 0.1267605633802817,
361
- "accuracy_ci_high": 0.30985915492957744,
362
  "score_name": "accuracy",
363
- "score": 0.19718309859154928,
364
- "score_ci_high": 0.30985915492957744,
365
- "score_ci_low": 0.1267605633802817,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.4084507042253521,
370
- "accuracy_ci_low": 0.29577464788732394,
371
- "accuracy_ci_high": 0.5211267605633803,
372
  "score_name": "accuracy",
373
- "score": 0.4084507042253521,
374
- "score_ci_high": 0.5211267605633803,
375
- "score_ci_low": 0.29577464788732394,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.15492957746478872,
380
- "accuracy_ci_low": 0.08450704225352113,
381
- "accuracy_ci_high": 0.2535211267605634,
382
- "score_name": "accuracy",
383
- "score": 0.15492957746478872,
384
- "score_ci_high": 0.2535211267605634,
385
- "score_ci_low": 0.08450704225352113,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.16901408450704225,
390
- "accuracy_ci_low": 0.09859154929577464,
391
- "accuracy_ci_high": 0.2535211267605634,
392
- "score_name": "accuracy",
393
- "score": 0.16901408450704225,
394
- "score_ci_high": 0.2535211267605634,
395
- "score_ci_low": 0.09859154929577464,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
  "accuracy": 0.18309859154929578,
400
  "accuracy_ci_low": 0.09859154929577464,
401
  "accuracy_ci_high": 0.28169014084507044,
@@ -405,17 +385,17 @@
405
  "score_ci_low": 0.09859154929577464,
406
  "num_of_instances": 71
407
  },
408
- "mmlu_pro_law": {
409
  "accuracy": 0.11267605633802817,
410
  "accuracy_ci_low": 0.056338028169014086,
411
- "accuracy_ci_high": 0.2112676056338028,
412
  "score_name": "accuracy",
413
  "score": 0.11267605633802817,
414
- "score_ci_high": 0.2112676056338028,
415
  "score_ci_low": 0.056338028169014086,
416
  "num_of_instances": 71
417
  },
418
- "mmlu_pro_math": {
419
  "accuracy": 0.09859154929577464,
420
  "accuracy_ci_low": 0.04225352112676056,
421
  "accuracy_ci_high": 0.18309859154929578,
@@ -425,384 +405,404 @@
425
  "score_ci_low": 0.04225352112676056,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.15492957746478872,
430
- "accuracy_ci_low": 0.08450704225352113,
431
- "accuracy_ci_high": 0.2645029324911099,
432
  "score_name": "accuracy",
433
- "score": 0.15492957746478872,
434
- "score_ci_high": 0.2645029324911099,
435
- "score_ci_low": 0.08450704225352113,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.16901408450704225,
440
- "accuracy_ci_low": 0.08450704225352113,
441
- "accuracy_ci_high": 0.2676056338028169,
442
  "score_name": "accuracy",
443
- "score": 0.16901408450704225,
444
- "score_ci_high": 0.2676056338028169,
445
- "score_ci_low": 0.08450704225352113,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.04225352112676056,
450
- "accuracy_ci_low": 0.014084507042253521,
451
- "accuracy_ci_high": 0.11267605633802817,
452
  "score_name": "accuracy",
453
- "score": 0.04225352112676056,
454
- "score_ci_high": 0.11267605633802817,
455
- "score_ci_low": 0.014084507042253521,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
  "accuracy": 0.38028169014084506,
460
  "accuracy_ci_low": 0.2676056338028169,
461
- "accuracy_ci_high": 0.49295774647887325,
462
  "score_name": "accuracy",
463
  "score": 0.38028169014084506,
464
- "score_ci_high": 0.49295774647887325,
465
  "score_ci_low": 0.2676056338028169,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.19416498993963782,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.23961477119371857,
475
- "f1_suggestive": 0.2564102564102564,
476
- "f1_generic": 0.22727272727272727,
477
- "f1_fanciful": 0.3076923076923077,
478
- "f1_descriptive": 0.09090909090909091,
479
- "f1_arbitrary": 0.3157894736842105,
480
- "f1_macro_ci_low": 0.16470343495436598,
481
- "f1_macro_ci_high": 0.3402970569238248,
482
  "score_name": "f1_micro",
483
- "score": 0.2485207100591716,
484
- "score_ci_high": 0.3565344458058143,
485
- "score_ci_low": 0.16674772165037405,
486
  "num_of_instances": 85,
487
- "accuracy": 0.24705882352941178,
488
- "accuracy_ci_low": 0.16470588235294117,
489
- "accuracy_ci_high": 0.35294117647058826,
490
- "f1_micro": 0.2485207100591716,
491
- "f1_micro_ci_low": 0.16674772165037405,
492
- "f1_micro_ci_high": 0.3565344458058143
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.45749941134918765,
496
- "f1_no": 0.656934306569343,
497
- "f1_yes": 0.25806451612903225,
498
- "f1_macro_ci_low": 0.4007210254458121,
499
- "f1_macro_ci_high": 0.523830186580906,
500
  "score_name": "f1_micro",
501
- "score": 0.5326633165829145,
502
- "score_ci_high": 0.5979899497487438,
503
- "score_ci_low": 0.4676003540226054,
504
  "num_of_instances": 200,
505
- "accuracy": 0.53,
506
- "accuracy_ci_low": 0.465,
507
- "accuracy_ci_high": 0.595,
508
- "f1_micro": 0.5326633165829145,
509
- "f1_micro_ci_low": 0.4676003540226054,
510
- "f1_micro_ci_high": 0.5979899497487438
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.14157824173747313,
514
- "f1_conclusion": 0.08333333333333333,
515
- "f1_analysis": 0.2900763358778626,
516
- "f1_decree": 0.06666666666666667,
517
- "f1_issue": 0.047619047619047616,
518
- "f1_facts": 0.13333333333333333,
519
- "f1_rule": 0.1935483870967742,
520
- "f1_procedural history": 0.17647058823529413,
521
- "f1_macro_ci_low": 0.09927560143449254,
522
- "f1_macro_ci_high": 0.19963080582055887,
523
  "score_name": "f1_micro",
524
- "score": 0.17857142857142858,
525
- "score_ci_high": 0.23469387755102042,
526
- "score_ci_low": 0.1235825927993309,
527
  "num_of_instances": 200,
528
- "accuracy": 0.175,
529
- "accuracy_ci_low": 0.12,
530
- "accuracy_ci_high": 0.23,
531
- "f1_micro": 0.17857142857142858,
532
- "f1_micro_ci_low": 0.1235825927993309,
533
- "f1_micro_ci_high": 0.23469387755102042
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.41428571428571426,
537
- "f1_yes": 0.4857142857142857,
538
- "f1_no": 0.34285714285714286,
539
- "f1_macro_ci_low": 0.35160188806998965,
540
- "f1_macro_ci_high": 0.4808674529166947,
541
  "score_name": "f1_micro",
542
- "score": 0.42077922077922075,
543
- "score_ci_high": 0.4846763437420372,
544
- "score_ci_low": 0.35535075567851304,
545
  "num_of_instances": 200,
546
- "accuracy": 0.405,
547
- "accuracy_ci_low": 0.34,
548
- "accuracy_ci_high": 0.465,
549
- "f1_micro": 0.42077922077922075,
550
- "f1_micro_ci_low": 0.35535075567851304,
551
- "f1_micro_ci_high": 0.4846763437420372
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.6461038961038961,
555
- "f1_yes": 0.6428571428571429,
556
- "f1_no": 0.6493506493506493,
557
- "f1_macro_ci_low": 0.5344060631732589,
558
- "f1_macro_ci_high": 0.745107042681059,
559
  "score_name": "f1_micro",
560
- "score": 0.6459627329192547,
561
- "score_ci_high": 0.7393939393939394,
562
- "score_ci_low": 0.5344831234199472,
563
  "num_of_instances": 85,
564
- "accuracy": 0.611764705882353,
565
- "accuracy_ci_low": 0.49411764705882355,
566
- "accuracy_ci_high": 0.7058823529411765,
567
- "f1_micro": 0.6459627329192547,
568
- "f1_micro_ci_low": 0.5344831234199472,
569
- "f1_micro_ci_high": 0.7393939393939394
570
  },
571
- "score": 0.40529948178239805,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.2276162418340563,
578
- "f1_cars": 0.5581395348837209,
579
  "f1_windows x": 0.0,
580
- "f1_atheism": 0.14634146341463414,
581
- "f1_religion": 0.18181818181818182,
582
- "f1_medicine": 0.2962962962962963,
583
- "f1_hockey": 0.48936170212765956,
584
- "f1_christianity": 0.3287671232876712,
585
- "f1_computer graphics": 0.13513513513513514,
586
- "f1_microsoft windows": 0.03571428571428571,
587
  "f1_middle east": 0.125,
588
- "f1_motorcycles": 0.1917808219178082,
589
  "f1_mac hardware": 0.0,
590
  "f1_for sale": 0.0,
591
- "f1_guns": 0.10714285714285714,
592
- "f1_politics": 0.2361111111111111,
593
  "f1_space": 0.39436619718309857,
594
- "f1_pc hardware": 0.0,
595
- "f1_cryptography": 0.32432432432432434,
596
- "f1_baseball": 0.7610619469026548,
597
- "f1_electronics": 0.24096385542168675,
598
- "f1_macro_ci_low": 0.20272698040510803,
599
- "f1_macro_ci_high": 0.2532565570480989,
600
  "score_name": "f1_micro",
601
- "score": 0.2679830747531735,
602
- "score_ci_high": 0.2978873823161142,
603
- "score_ci_low": 0.2355693496528132,
604
  "num_of_instances": 1000,
605
- "accuracy": 0.19,
606
- "accuracy_ci_low": 0.166,
607
- "accuracy_ci_high": 0.213,
608
- "f1_micro": 0.2679830747531735,
609
- "f1_micro_ci_low": 0.2355693496528132,
610
- "f1_micro_ci_high": 0.2978873823161142
611
  },
612
- "score": 0.2679830747531735,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.22216862358987682,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.696078431372549,
620
- "f1_credit card or prepaid card": 0.1518987341772152,
621
- "f1_debt collection": 0.22535211267605634,
622
- "f1_checking or savings account": 0.2222222222222222,
623
- "f1_vehicle loan or lease": 0.08333333333333333,
624
  "f1_payday loan or title loan or personal loan": 0.0,
625
- "f1_mortgage": 0.3157894736842105,
626
- "f1_money transfer or virtual currency or money service": 0.07407407407407407,
627
- "f1_student loan": 0.23076923076923078,
628
- "f1_macro_ci_low": 0.1842187730862839,
629
- "f1_macro_ci_high": 0.27331239167462773,
630
  "score_name": "f1_micro",
631
- "score": 0.5611940298507463,
632
- "score_ci_high": 0.5951679434295816,
633
- "score_ci_low": 0.5287106773010755,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.47,
636
- "accuracy_ci_low": 0.43992255182914,
637
- "accuracy_ci_high": 0.504,
638
- "f1_micro": 0.5611940298507463,
639
- "f1_micro_ci_low": 0.5287106773010755,
640
- "f1_micro_ci_high": 0.5951679434295816
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.328995002480145,
644
- "f1_mortgages and loans": 0.3076923076923077,
645
- "f1_debt collection": 0.45038167938931295,
646
- "f1_credit card": 0.288135593220339,
647
- "f1_credit reporting": 0.5,
648
- "f1_retail banking": 0.09876543209876543,
649
- "f1_macro_ci_low": 0.2869304161724212,
650
- "f1_macro_ci_high": 0.37644896631739505,
651
  "score_name": "f1_micro",
652
- "score": 0.39184597961494905,
653
- "score_ci_high": 0.43742334452481374,
654
- "score_ci_low": 0.3475735981074829,
655
  "num_of_instances": 500,
656
- "accuracy": 0.346,
657
- "accuracy_ci_low": 0.306,
658
- "accuracy_ci_high": 0.39,
659
- "f1_micro": 0.39184597961494905,
660
- "f1_micro_ci_low": 0.3475735981074829,
661
- "f1_micro_ci_high": 0.43742334452481374
662
  },
663
- "score": 0.47652000473284767,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "program_accuracy": 0.027,
671
- "score": 0.027,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.022,
674
- "program_accuracy_ci_low": 0.019,
675
- "program_accuracy_ci_high": 0.039,
676
- "score_ci_low": 0.019,
677
- "score_ci_high": 0.039,
678
- "execution_accuracy_ci_low": 0.014,
679
- "execution_accuracy_ci_high": 0.033
680
  },
681
- "score": 0.027,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.3192591413018271,
688
- "recall": 0.38991119820901343,
689
- "f1": 0.2793088311088913,
690
- "precision_ci_low": 0.2975520115964286,
691
- "precision_ci_high": 0.3411640955753825,
692
- "recall_ci_low": 0.3735203455510533,
693
- "recall_ci_high": 0.40905456252856565,
694
- "f1_ci_low": 0.26376571917741815,
695
- "f1_ci_high": 0.2967523754621306,
696
  "score_name": "f1",
697
- "score": 0.2793088311088913,
698
- "score_ci_high": 0.2967523754621306,
699
- "score_ci_low": 0.26376571917741815,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5586816079914569,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6062885612249375,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5444585413982471,
704
- "faithfullness_f1_token_overlap": 0.2284069948073265,
705
- "faithfullness_recall_token_overlap": 0.1691231317769043,
706
- "faithfullness_precision_token_overlap": 0.5822539469357956,
707
- "correctness_f1_token_overlap": 0.2793088311088913,
708
- "correctness_recall_token_overlap": 0.38991119820901343,
709
- "correctness_precision_token_overlap": 0.3192591413018271
710
  },
711
- "score": 0.2793088311088913,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.268,
718
- "accuracy_ci_low": 0.241,
719
- "accuracy_ci_high": 0.294,
720
  "score_name": "accuracy",
721
- "score": 0.268,
722
- "score_ci_high": 0.294,
723
- "score_ci_low": 0.241,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.536,
728
- "accuracy_ci_low": 0.496,
729
- "accuracy_ci_high": 0.582,
730
  "score_name": "accuracy",
731
- "score": 0.536,
732
- "score_ci_high": 0.582,
733
- "score_ci_low": 0.496,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.402,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
  "severity_critical": 0.0,
743
- "severity_high": 2.0,
744
- "severity_medium": 4.0,
745
  "severity_low": 94.0,
746
- "category_harmful_info": 0.8100087784142013,
747
- "category_deception": 0.7944973061482112,
748
- "category_explicit_content": 0.8078695727192393,
749
- "category_violence": 0.8026550727940741,
750
- "category_discrimination": 0.7023729502208649,
751
- "category_substance_abuse": 0.857964905185832,
752
- "category_pii": 0.8635181895008794,
753
- "safety": 0.8019619875691001,
754
- "safety_ci_low": 0.7757910180183378,
755
- "safety_ci_high": 0.8246888836931806,
756
  "score_name": "safety",
757
- "score": 0.8019619875691001,
758
- "score_ci_high": 0.8246888836931806,
759
- "score_ci_low": 0.7757910180183378,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8019619875691001,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rouge2": 0.20059977802050133,
770
- "rouge1": 0.40705433873110675,
771
- "rougeL": 0.2833644024269708,
772
- "score": 0.2833644024269708,
773
  "score_name": "rougeL",
774
- "rougeLsum": 0.3484006202872336,
775
- "rouge2_ci_low": 0.19350867847403447,
776
- "rouge2_ci_high": 0.208746306103288,
777
- "rouge1_ci_low": 0.39709724247372435,
778
- "rouge1_ci_high": 0.41741048664304065,
779
- "rougeL_ci_low": 0.27611154671120425,
780
- "rougeL_ci_high": 0.291401862882032,
781
- "score_ci_low": 0.27611154671120425,
782
- "score_ci_high": 0.291401862882032,
783
- "rougeLsum_ci_low": 0.3393182862844001,
784
- "rougeLsum_ci_high": 0.35859357766397365
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rouge2": 0.015549345441433063,
789
- "rouge1": 0.11519799005534682,
790
- "rougeL": 0.0840670089559512,
791
- "score": 0.0840670089559512,
792
  "score_name": "rougeL",
793
- "rougeLsum": 0.09578949363666936,
794
- "rouge2_ci_low": 0.013931235527451928,
795
- "rouge2_ci_high": 0.017483224052864014,
796
- "rouge1_ci_low": 0.10965978969392036,
797
- "rouge1_ci_high": 0.12037813563278642,
798
- "rougeL_ci_low": 0.08040950716646748,
799
- "rougeL_ci_high": 0.08756332939065774,
800
- "score_ci_low": 0.08040950716646748,
801
- "score_ci_high": 0.08756332939065774,
802
- "rougeLsum_ci_low": 0.0913667622653291,
803
- "rougeLsum_ci_high": 0.09990468987829387
804
  },
805
- "score": 0.183715705691461,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,444 +810,444 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 975,
814
- 450,
815
- 239,
816
- 131
817
  ],
818
  "totals": [
819
- 1762,
820
- 1696,
821
- 1630,
822
- 1564
823
  ],
824
  "precisions": [
825
- 0.5533484676503972,
826
- 0.2653301886792453,
827
- 0.14662576687116563,
828
- 0.08375959079283887
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1762,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.20606657614931506,
834
- "score": 0.20606657614931506,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.18092500119305566,
837
- "score_ci_high": 0.2337621289783969,
838
- "sacrebleu_ci_low": 0.18092500119305566,
839
- "sacrebleu_ci_high": 0.2337621289783969
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1120,
845
- 595,
846
- 353,
847
- 211
848
  ],
849
  "totals": [
850
- 1761,
851
- 1695,
852
- 1629,
853
- 1563
854
  ],
855
  "precisions": [
856
- 0.6360022714366838,
857
- 0.35103244837758113,
858
- 0.21669736034376919,
859
- 0.13499680102367242
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1761,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.2842796401730753,
865
- "score": 0.2842796401730753,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.25109489325457895,
868
- "score_ci_high": 0.31935419283000793,
869
- "sacrebleu_ci_low": 0.25109489325457895,
870
- "sacrebleu_ci_high": 0.31935419283000793
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
  485,
876
- 117,
877
- 36,
878
- 10
879
  ],
880
  "totals": [
881
- 1763,
882
- 1697,
883
- 1631,
884
- 1565
885
  ],
886
  "precisions": [
887
- 0.2750992626205332,
888
- 0.06894519740718916,
889
- 0.022072348252605765,
890
- 0.006389776357827476
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 1763,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.04044193351575661,
896
- "score": 0.04044193351575661,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.026004571558982913,
899
- "score_ci_high": 0.05783535488306116,
900
- "sacrebleu_ci_low": 0.026004571558982913,
901
- "sacrebleu_ci_high": 0.05783535488306116
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 946,
907
- 441,
908
- 236,
909
- 135
910
  ],
911
  "totals": [
912
- 1822,
913
- 1756,
914
- 1690,
915
- 1624
916
  ],
917
  "precisions": [
918
- 0.5192096597145993,
919
- 0.2511389521640091,
920
- 0.13964497041420118,
921
- 0.08312807881773399
922
  ],
923
- "bp": 0.9928903773336073,
924
- "sys_len": 1822,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.19584332441613614,
927
- "score": 0.19584332441613614,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.17291558306420995,
930
- "score_ci_high": 0.2421406469227526,
931
- "sacrebleu_ci_low": 0.17291558306420995,
932
- "sacrebleu_ci_high": 0.2421406469227526
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1235,
938
- 733,
939
- 491,
940
- 334
941
  ],
942
  "totals": [
943
- 2003,
944
- 1937,
945
- 1871,
946
- 1805
947
  ],
948
  "precisions": [
949
- 0.6165751372940589,
950
- 0.37842023748064013,
951
- 0.26242650988776056,
952
- 0.1850415512465374
953
  ],
954
- "bp": 0.968069571391973,
955
- "sys_len": 2003,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.31583910573917306,
958
- "score": 0.31583910573917306,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.28278290783286325,
961
- "score_ci_high": 0.35809550404773266,
962
- "sacrebleu_ci_low": 0.28278290783286325,
963
- "sacrebleu_ci_high": 0.35809550404773266
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 895,
969
- 316,
970
- 136,
971
- 65
972
  ],
973
  "totals": [
974
- 2706,
975
- 2640,
976
- 2574,
977
- 2508
978
  ],
979
  "precisions": [
980
- 0.3307464892830747,
981
- 0.11969696969696969,
982
- 0.05283605283605284,
983
- 0.025917065390749602
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2706,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.08580718353389435,
989
- "score": 0.08580718353389435,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.0714070016419881,
992
- "score_ci_high": 0.11393007113284326,
993
- "sacrebleu_ci_low": 0.0714070016419881,
994
- "sacrebleu_ci_high": 0.11393007113284326
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1230,
1000
- 752,
1001
- 492,
1002
- 331
1003
  ],
1004
  "totals": [
1005
- 1871,
1006
- 1805,
1007
- 1739,
1008
- 1673
1009
  ],
1010
  "precisions": [
1011
- 0.6574024585783004,
1012
- 0.4166204986149584,
1013
- 0.28292121909143186,
1014
- 0.19784817692767484
1015
  ],
1016
- "bp": 0.976235618350251,
1017
- "sys_len": 1871,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.3435160489732885,
1020
- "score": 0.3435160489732885,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.2938775361515651,
1023
- "score_ci_high": 0.38124144600400245,
1024
- "sacrebleu_ci_low": 0.2938775361515651,
1025
- "sacrebleu_ci_high": 0.38124144600400245
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 1021,
1031
- 502,
1032
- 284,
1033
- 169
1034
  ],
1035
  "totals": [
1036
- 1949,
1037
- 1883,
1038
- 1817,
1039
- 1751
1040
  ],
1041
  "precisions": [
1042
- 0.5238583889173936,
1043
- 0.26659585767392463,
1044
- 0.15630159603742433,
1045
- 0.09651627641347801
1046
  ],
1047
- "bp": 1.0,
1048
- "sys_len": 1949,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.21424358052936537,
1051
- "score": 0.21424358052936537,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.19040761320563873,
1054
- "score_ci_high": 0.2508897923390456,
1055
- "sacrebleu_ci_low": 0.19040761320563873,
1056
- "sacrebleu_ci_high": 0.2508897923390456
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
  1122,
1062
- 547,
1063
- 287,
1064
- 157
1065
  ],
1066
  "totals": [
1067
- 1974,
1068
- 1908,
1069
- 1842,
1070
- 1776
1071
  ],
1072
  "precisions": [
1073
- 0.5683890577507599,
1074
- 0.2866876310272537,
1075
- 0.15580890336590664,
1076
- 0.0884009009009009
1077
  ],
1078
- "bp": 0.9391156766806551,
1079
- "sys_len": 1974,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.2044066388062864,
1082
- "score": 0.2044066388062864,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.17477612690667202,
1085
- "score_ci_high": 0.23063630240994007,
1086
- "sacrebleu_ci_low": 0.17477612690667202,
1087
- "sacrebleu_ci_high": 0.23063630240994007
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1184,
1093
- 706,
1094
- 459,
1095
- 307
1096
  ],
1097
  "totals": [
1098
- 1741,
1099
- 1675,
1100
- 1609,
1101
- 1543
1102
  ],
1103
  "precisions": [
1104
- 0.6800689259046525,
1105
- 0.42149253731343284,
1106
- 0.2852703542573027,
1107
- 0.19896305897602073
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1741,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.35714368713727423,
1113
- "score": 0.35714368713727423,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.3224604238321309,
1116
- "score_ci_high": 0.41026435550762275,
1117
- "sacrebleu_ci_low": 0.3224604238321309,
1118
- "sacrebleu_ci_high": 0.41026435550762275
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 846,
1124
- 309,
1125
- 132,
1126
- 62
1127
  ],
1128
  "totals": [
1129
- 1698,
1130
- 1632,
1131
- 1566,
1132
- 1500
1133
  ],
1134
  "precisions": [
1135
- 0.49823321554770317,
1136
- 0.18933823529411764,
1137
- 0.0842911877394636,
1138
- 0.04133333333333334
1139
  ],
1140
- "bp": 0.9790217565823072,
1141
- "sys_len": 1698,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.13181982922133714,
1144
- "score": 0.13181982922133714,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.10251074913175488,
1147
- "score_ci_high": 0.18667757600957346,
1148
- "sacrebleu_ci_low": 0.10251074913175488,
1149
- "sacrebleu_ci_high": 0.18667757600957346
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 817,
1155
- 278,
1156
- 122,
1157
- 63
1158
  ],
1159
  "totals": [
1160
- 1724,
1161
- 1658,
1162
- 1592,
1163
- 1526
1164
  ],
1165
  "precisions": [
1166
- 0.4738979118329466,
1167
- 0.16767189384800965,
1168
- 0.07663316582914573,
1169
- 0.041284403669724766
1170
  ],
1171
- "bp": 0.9942163261750401,
1172
- "sys_len": 1724,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.12518948488181658,
1175
- "score": 0.12518948488181658,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.10085361909517791,
1178
- "score_ci_high": 0.1695266774061832,
1179
- "sacrebleu_ci_low": 0.10085361909517791,
1180
- "sacrebleu_ci_high": 0.1695266774061832
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1164,
1186
- 692,
1187
- 452,
1188
- 295
1189
  ],
1190
  "totals": [
1191
- 1737,
1192
- 1671,
1193
- 1605,
1194
- 1539
1195
  ],
1196
  "precisions": [
1197
- 0.6701208981001727,
1198
- 0.41412327947336924,
1199
- 0.28161993769470406,
1200
- 0.19168291098115658
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1737,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.3498504204327118,
1206
- "score": 0.3498504204327118,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.3154848870877421,
1209
- "score_ci_high": 0.42165027559439294,
1210
- "sacrebleu_ci_low": 0.3154848870877421,
1211
- "sacrebleu_ci_high": 0.42165027559439294
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
- 1200,
1217
- 704,
1218
- 458,
1219
- 314
1220
  ],
1221
  "totals": [
1222
- 1781,
1223
- 1715,
1224
- 1649,
1225
- 1583
1226
  ],
1227
  "precisions": [
1228
- 0.673778775968557,
1229
- 0.41049562682215746,
1230
- 0.2777440873256519,
1231
- 0.1983575489576753
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1781,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.3513418261131799,
1237
- "score": 0.3513418261131799,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.3037866384367266,
1240
- "score_ci_high": 0.39510754604511705,
1241
- "sacrebleu_ci_low": 0.3037866384367266,
1242
- "sacrebleu_ci_high": 0.39510754604511705
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1072,
1248
- 530,
1249
- 303,
1250
- 175
1251
  ],
1252
  "totals": [
1253
  1809,
@@ -1256,27 +1256,27 @@
1256
  1611
1257
  ],
1258
  "precisions": [
1259
- 0.5925925925925926,
1260
- 0.30407343660355707,
1261
- 0.18067978533094814,
1262
- 0.10862818125387959
1263
  ],
1264
  "bp": 1.0,
1265
  "sys_len": 1809,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.24386343888275555,
1268
- "score": 0.24386343888275555,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.2168231022955434,
1271
- "score_ci_high": 0.2798922379137682,
1272
- "sacrebleu_ci_low": 0.2168231022955434,
1273
- "sacrebleu_ci_high": 0.2798922379137682
1274
  },
1275
- "score": 0.22997684790035774,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.3447885069799008,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-22T19:05:29.772171Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.37777777777777777,
180
+ "accuracy_ci_low": 0.28888888888888886,
181
+ "accuracy_ci_high": 0.4888888888888889,
182
  "score_name": "accuracy",
183
+ "score": 0.37777777777777777,
184
+ "score_ci_high": 0.4888888888888889,
185
+ "score_ci_low": 0.28888888888888886,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.45555555555555555,
190
+ "accuracy_ci_low": 0.35555555555555557,
191
+ "accuracy_ci_high": 0.5666666666666667,
192
  "score_name": "accuracy",
193
+ "score": 0.45555555555555555,
194
+ "score_ci_high": 0.5666666666666667,
195
+ "score_ci_low": 0.35555555555555557,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 0.3888888888888889,
200
+ "accuracy_ci_low": 0.3,
201
+ "accuracy_ci_high": 0.4888888888888889,
202
  "score_name": "accuracy",
203
+ "score": 0.3888888888888889,
204
+ "score_ci_high": 0.4888888888888889,
205
+ "score_ci_low": 0.3,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.37777777777777777,
210
  "accuracy_ci_low": 0.28888888888888886,
211
+ "accuracy_ci_high": 0.4777777777777778,
212
  "score_name": "accuracy",
213
+ "score": 0.37777777777777777,
214
+ "score_ci_high": 0.4777777777777778,
215
  "score_ci_low": 0.28888888888888886,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.36666666666666664,
220
+ "accuracy_ci_low": 0.2777777777777778,
221
+ "accuracy_ci_high": 0.4777777777777778,
222
  "score_name": "accuracy",
223
+ "score": 0.36666666666666664,
224
+ "score_ci_high": 0.4777777777777778,
225
+ "score_ci_low": 0.2777777777777778,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.4666666666666667,
230
+ "accuracy_ci_low": 0.37436916691430816,
231
+ "accuracy_ci_high": 0.5777777777777777,
232
  "score_name": "accuracy",
233
+ "score": 0.4666666666666667,
234
+ "score_ci_high": 0.5777777777777777,
235
+ "score_ci_low": 0.37436916691430816,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.43333333333333335,
240
+ "accuracy_ci_low": 0.32222222222222224,
241
+ "accuracy_ci_high": 0.5333333333333333,
242
  "score_name": "accuracy",
243
+ "score": 0.43333333333333335,
244
+ "score_ci_high": 0.5333333333333333,
245
+ "score_ci_low": 0.32222222222222224,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.4111111111111111,
250
+ "accuracy_ci_low": 0.3111111111111111,
251
+ "accuracy_ci_high": 0.5111111111111111,
252
  "score_name": "accuracy",
253
+ "score": 0.4111111111111111,
254
+ "score_ci_high": 0.5111111111111111,
255
+ "score_ci_low": 0.3111111111111111,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.3888888888888889,
260
+ "accuracy_ci_low": 0.28888888888888886,
261
+ "accuracy_ci_high": 0.4888888888888889,
262
  "score_name": "accuracy",
263
+ "score": 0.3888888888888889,
264
+ "score_ci_high": 0.4888888888888889,
265
+ "score_ci_low": 0.28888888888888886,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
  "accuracy": 0.5111111111111111,
270
  "accuracy_ci_low": 0.4111111111111111,
271
+ "accuracy_ci_high": 0.6222222222222222,
272
  "score_name": "accuracy",
273
  "score": 0.5111111111111111,
274
+ "score_ci_high": 0.6222222222222222,
275
  "score_ci_low": 0.4111111111111111,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.3888888888888889,
280
  "accuracy_ci_low": 0.3,
281
+ "accuracy_ci_high": 0.5,
282
  "score_name": "accuracy",
283
+ "score": 0.3888888888888889,
284
+ "score_ci_high": 0.5,
285
  "score_ci_low": 0.3,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.41515151515151516,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.01950078003120125,
296
+ "score": 0.01950078003120125,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.01950078003120125,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.44648318042813456,
307
+ "f1_Organization": 0.2157676348547718,
308
+ "f1_Location": 0.16666666666666669,
309
+ "f1_macro": 0.27630582731652437,
310
+ "recall_macro": 0.20087031380401354,
311
+ "precision_macro": 0.48225440495177335,
312
+ "in_classes_support": 0.6990595611285266,
313
+ "f1_micro": 0.2701421800947867,
314
+ "recall_micro": 0.21714285714285714,
315
+ "precision_micro": 0.3573667711598746,
316
+ "score": 0.2701421800947867,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.2188760455473918,
319
+ "score_ci_high": 0.31166112583088945,
320
+ "f1_micro_ci_low": 0.2188760455473918,
321
+ "f1_micro_ci_high": 0.31166112583088945
322
  },
323
+ "score": 0.2701421800947867,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.28169014084507044,
330
+ "accuracy_ci_low": 0.18309859154929578,
331
+ "accuracy_ci_high": 0.40138961326568784,
332
  "score_name": "accuracy",
333
+ "score": 0.28169014084507044,
334
+ "score_ci_high": 0.40138961326568784,
335
+ "score_ci_low": 0.18309859154929578,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.16901408450704225,
340
+ "accuracy_ci_low": 0.09859154929577464,
341
+ "accuracy_ci_high": 0.28169014084507044,
342
  "score_name": "accuracy",
343
+ "score": 0.16901408450704225,
344
+ "score_ci_high": 0.28169014084507044,
345
+ "score_ci_low": 0.09859154929577464,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.08450704225352113,
350
+ "accuracy_ci_low": 0.04225352112676056,
351
+ "accuracy_ci_high": 0.16901408450704225,
352
  "score_name": "accuracy",
353
+ "score": 0.08450704225352113,
354
+ "score_ci_high": 0.16901408450704225,
355
+ "score_ci_low": 0.04225352112676056,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.22535211267605634,
360
+ "accuracy_ci_low": 0.14084507042253522,
361
+ "accuracy_ci_high": 0.323943661971831,
362
  "score_name": "accuracy",
363
+ "score": 0.22535211267605634,
364
+ "score_ci_high": 0.323943661971831,
365
+ "score_ci_low": 0.14084507042253522,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.38028169014084506,
370
+ "accuracy_ci_low": 0.2676056338028169,
371
+ "accuracy_ci_high": 0.49295774647887325,
372
  "score_name": "accuracy",
373
+ "score": 0.38028169014084506,
374
+ "score_ci_high": 0.49295774647887325,
375
+ "score_ci_low": 0.2676056338028169,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
  "accuracy": 0.18309859154929578,
380
  "accuracy_ci_low": 0.09859154929577464,
381
  "accuracy_ci_high": 0.28169014084507044,
 
385
  "score_ci_low": 0.09859154929577464,
386
  "num_of_instances": 71
387
  },
388
+ "mmlu_pro_health": {
389
  "accuracy": 0.11267605633802817,
390
  "accuracy_ci_low": 0.056338028169014086,
391
+ "accuracy_ci_high": 0.20762427324557167,
392
  "score_name": "accuracy",
393
  "score": 0.11267605633802817,
394
+ "score_ci_high": 0.20762427324557167,
395
  "score_ci_low": 0.056338028169014086,
396
  "num_of_instances": 71
397
  },
398
+ "mmlu_pro_history": {
399
  "accuracy": 0.09859154929577464,
400
  "accuracy_ci_low": 0.04225352112676056,
401
  "accuracy_ci_high": 0.18309859154929578,
 
405
  "score_ci_low": 0.04225352112676056,
406
  "num_of_instances": 71
407
  },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.16901408450704225,
410
+ "accuracy_ci_low": 0.09859154929577464,
411
+ "accuracy_ci_high": 0.2676056338028169,
412
+ "score_name": "accuracy",
413
+ "score": 0.16901408450704225,
414
+ "score_ci_high": 0.2676056338028169,
415
+ "score_ci_low": 0.09859154929577464,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.14084507042253522,
420
+ "accuracy_ci_low": 0.07042253521126761,
421
+ "accuracy_ci_high": 0.2535211267605634,
422
+ "score_name": "accuracy",
423
+ "score": 0.14084507042253522,
424
+ "score_ci_high": 0.2535211267605634,
425
+ "score_ci_low": 0.07042253521126761,
426
+ "num_of_instances": 71
427
+ },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.11267605633802817,
430
+ "accuracy_ci_low": 0.056338028169014086,
431
+ "accuracy_ci_high": 0.2112676056338028,
432
  "score_name": "accuracy",
433
+ "score": 0.11267605633802817,
434
+ "score_ci_high": 0.2112676056338028,
435
+ "score_ci_low": 0.056338028169014086,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.11267605633802817,
440
+ "accuracy_ci_low": 0.056338028169014086,
441
+ "accuracy_ci_high": 0.2112676056338028,
442
  "score_name": "accuracy",
443
+ "score": 0.11267605633802817,
444
+ "score_ci_high": 0.2112676056338028,
445
+ "score_ci_low": 0.056338028169014086,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.08450704225352113,
450
+ "accuracy_ci_low": 0.028169014084507043,
451
+ "accuracy_ci_high": 0.15492957746478872,
452
  "score_name": "accuracy",
453
+ "score": 0.08450704225352113,
454
+ "score_ci_high": 0.15492957746478872,
455
+ "score_ci_low": 0.028169014084507043,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
  "accuracy": 0.38028169014084506,
460
  "accuracy_ci_low": 0.2676056338028169,
461
+ "accuracy_ci_high": 0.5070422535211268,
462
  "score_name": "accuracy",
463
  "score": 0.38028169014084506,
464
+ "score_ci_high": 0.5070422535211268,
465
  "score_ci_low": 0.2676056338028169,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.18108651911468812,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.2201422254824052,
475
+ "f1_suggestive": 0.2631578947368421,
476
+ "f1_arbitrary": 0.22857142857142856,
477
+ "f1_generic": 0.24390243902439024,
478
+ "f1_fanciful": 0.2222222222222222,
479
+ "f1_descriptive": 0.14285714285714285,
480
+ "f1_macro_ci_low": 0.14283541117516643,
481
+ "f1_macro_ci_high": 0.3249210624357632,
482
  "score_name": "f1_micro",
483
+ "score": 0.22485207100591717,
484
+ "score_ci_high": 0.32142857142857145,
485
+ "score_ci_low": 0.14281093882602658,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.2235294117647059,
488
+ "accuracy_ci_low": 0.1411764705882353,
489
+ "accuracy_ci_high": 0.3176470588235294,
490
+ "f1_micro": 0.22485207100591717,
491
+ "f1_micro_ci_low": 0.14281093882602658,
492
+ "f1_micro_ci_high": 0.32142857142857145
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.49296028880866427,
496
+ "f1_no": 0.6859205776173285,
497
+ "f1_yes": 0.3,
498
+ "f1_macro_ci_low": 0.42550135126538996,
499
+ "f1_macro_ci_high": 0.5612203343628853,
500
  "score_name": "f1_micro",
501
+ "score": 0.5692695214105793,
502
+ "score_ci_high": 0.6329113924050633,
503
+ "score_ci_low": 0.4962025316455696,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.565,
506
+ "accuracy_ci_low": 0.4918996659624703,
507
+ "accuracy_ci_high": 0.63,
508
+ "f1_micro": 0.5692695214105793,
509
+ "f1_micro_ci_low": 0.4962025316455696,
510
+ "f1_micro_ci_high": 0.6329113924050633
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.17282485903398895,
514
+ "f1_conclusion": 0.047619047619047616,
515
+ "f1_analysis": 0.2898550724637681,
516
+ "f1_decree": 0.25806451612903225,
517
+ "f1_facts": 0.09302325581395349,
518
+ "f1_issue": 0.13333333333333333,
519
+ "f1_rule": 0.26666666666666666,
520
+ "f1_procedural history": 0.12121212121212122,
521
+ "f1_macro_ci_low": 0.1258149629713259,
522
+ "f1_macro_ci_high": 0.23750827438601588,
523
  "score_name": "f1_micro",
524
+ "score": 0.20408163265306123,
525
+ "score_ci_high": 0.26463104325699743,
526
+ "score_ci_low": 0.15267175572519084,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.2,
529
+ "accuracy_ci_low": 0.15,
530
+ "accuracy_ci_high": 0.25995049710654655,
531
+ "f1_micro": 0.20408163265306123,
532
+ "f1_micro_ci_low": 0.15267175572519084,
533
+ "f1_micro_ci_high": 0.26463104325699743
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.4769502535757023,
537
+ "f1_yes": 0.5688073394495413,
538
+ "f1_no": 0.38509316770186336,
539
+ "f1_macro_ci_low": 0.4074761191353562,
540
+ "f1_macro_ci_high": 0.542706355356233,
541
  "score_name": "f1_micro",
542
+ "score": 0.49076517150395776,
543
+ "score_ci_high": 0.5549738219895288,
544
+ "score_ci_low": 0.4183693762852218,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.465,
547
+ "accuracy_ci_low": 0.395,
548
+ "accuracy_ci_high": 0.53,
549
+ "f1_micro": 0.49076517150395776,
550
+ "f1_micro_ci_low": 0.4183693762852218,
551
+ "f1_micro_ci_high": 0.5549738219895288
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.6890476190476191,
555
+ "f1_yes": 0.64,
556
+ "f1_no": 0.7380952380952381,
557
+ "f1_macro_ci_low": 0.5867043850592167,
558
+ "f1_macro_ci_high": 0.775,
559
  "score_name": "f1_micro",
560
+ "score": 0.6918238993710691,
561
+ "score_ci_high": 0.7770700636942676,
562
+ "score_ci_low": 0.586011156606,
563
  "num_of_instances": 85,
564
+ "accuracy": 0.6470588235294118,
565
+ "accuracy_ci_low": 0.5411764705882353,
566
+ "accuracy_ci_high": 0.7411764705882353,
567
+ "f1_micro": 0.6918238993710691,
568
+ "f1_micro_ci_low": 0.586011156606,
569
+ "f1_micro_ci_high": 0.7770700636942676
570
  },
571
+ "score": 0.43615845918891694,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.2164270451621233,
578
+ "f1_cars": 0.5263157894736842,
579
  "f1_windows x": 0.0,
580
+ "f1_atheism": 0.19047619047619047,
581
+ "f1_religion": 0.07692307692307693,
582
+ "f1_medicine": 0.3050847457627119,
583
+ "f1_hockey": 0.3516483516483517,
584
+ "f1_christianity": 0.29850746268656714,
585
+ "f1_computer graphics": 0.125,
586
+ "f1_microsoft windows": 0.03508771929824561,
587
  "f1_middle east": 0.125,
588
+ "f1_motorcycles": 0.23684210526315788,
589
+ "f1_cryptography": 0.2702702702702703,
590
  "f1_mac hardware": 0.0,
591
+ "f1_politics": 0.22818791946308725,
592
+ "f1_electronics": 0.23529411764705882,
593
  "f1_for sale": 0.0,
594
+ "f1_guns": 0.14035087719298245,
595
  "f1_space": 0.39436619718309857,
596
+ "f1_pc hardware": 0.03508771929824561,
597
+ "f1_baseball": 0.7540983606557377,
598
+ "f1_macro_ci_low": 0.19503700990493542,
599
+ "f1_macro_ci_high": 0.24363185833629145,
600
  "score_name": "f1_micro",
601
+ "score": 0.25332400279916023,
602
+ "score_ci_high": 0.2842103070323454,
603
+ "score_ci_low": 0.2255621673024344,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.181,
606
+ "accuracy_ci_low": 0.16,
607
+ "accuracy_ci_high": 0.206,
608
+ "f1_micro": 0.25332400279916023,
609
+ "f1_micro_ci_low": 0.2255621673024344,
610
+ "f1_micro_ci_high": 0.2842103070323454
611
  },
612
+ "score": 0.25332400279916023,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.17253682837845144,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.7386276021588281,
620
+ "f1_checking or savings account": 0.15053763440860216,
621
+ "f1_debt collection": 0.19310344827586207,
622
+ "f1_credit card or prepaid card": 0.1038961038961039,
623
+ "f1_mortgage": 0.13333333333333333,
624
+ "f1_vehicle loan or lease": 0.16666666666666666,
625
+ "f1_student loan": 0.0,
626
  "f1_payday loan or title loan or personal loan": 0.0,
627
+ "f1_money transfer or virtual currency or money service": 0.06666666666666667,
628
+ "f1_macro_ci_low": 0.14360181188460305,
629
+ "f1_macro_ci_high": 0.21482869629805593,
630
  "score_name": "f1_micro",
631
+ "score": 0.5875576036866359,
632
+ "score_ci_high": 0.6180717759541877,
633
+ "score_ci_low": 0.5574039394995974,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.51,
636
+ "accuracy_ci_low": 0.4802728156816149,
637
+ "accuracy_ci_high": 0.541,
638
+ "f1_micro": 0.5875576036866359,
639
+ "f1_micro_ci_low": 0.5574039394995974,
640
+ "f1_micro_ci_high": 0.6180717759541877
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.36175351677566253,
644
+ "f1_mortgages and loans": 0.4264705882352941,
645
+ "f1_credit card": 0.29310344827586204,
646
+ "f1_debt collection": 0.44360902255639095,
647
+ "f1_credit reporting": 0.5724137931034483,
648
+ "f1_retail banking": 0.07317073170731707,
649
+ "f1_macro_ci_low": 0.31986969619833744,
650
+ "f1_macro_ci_high": 0.4037922302792007,
651
  "score_name": "f1_micro",
652
+ "score": 0.42921348314606744,
653
+ "score_ci_high": 0.47176643035248556,
654
+ "score_ci_low": 0.38170408070231343,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.382,
657
+ "accuracy_ci_low": 0.34,
658
+ "accuracy_ci_high": 0.424,
659
+ "f1_micro": 0.42921348314606744,
660
+ "f1_micro_ci_low": 0.38170408070231343,
661
+ "f1_micro_ci_high": 0.47176643035248556
662
  },
663
+ "score": 0.5083855434163517,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "execution_accuracy": 0.013,
671
+ "program_accuracy": 0.017,
672
+ "score": 0.017,
673
  "score_name": "program_accuracy",
674
+ "execution_accuracy_ci_low": 0.007,
675
+ "execution_accuracy_ci_high": 0.022,
676
+ "program_accuracy_ci_low": 0.01,
677
+ "program_accuracy_ci_high": 0.027,
678
+ "score_ci_low": 0.01,
679
+ "score_ci_high": 0.027
680
  },
681
+ "score": 0.017,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.3198532673532609,
688
+ "recall": 0.4011596558810935,
689
+ "f1": 0.2852470156019899,
690
+ "precision_ci_low": 0.29836535944826126,
691
+ "precision_ci_high": 0.34239842009004323,
692
+ "recall_ci_low": 0.3833246258154271,
693
+ "recall_ci_high": 0.4181134290525644,
694
+ "f1_ci_low": 0.26905618569014855,
695
+ "f1_ci_high": 0.3006017844609168,
696
  "score_name": "f1",
697
+ "score": 0.2852470156019899,
698
+ "score_ci_high": 0.3006017844609168,
699
+ "score_ci_low": 0.26905618569014855,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5655807377894719,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6124767065048218,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5509337742378314,
704
+ "faithfullness_f1_token_overlap": 0.2309696715819019,
705
+ "faithfullness_recall_token_overlap": 0.17063637636202156,
706
+ "faithfullness_precision_token_overlap": 0.5816026494498466,
707
+ "correctness_f1_token_overlap": 0.2852470156019899,
708
+ "correctness_recall_token_overlap": 0.4011596558810935,
709
+ "correctness_precision_token_overlap": 0.3198532673532609
710
  },
711
+ "score": 0.2852470156019899,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.284,
718
+ "accuracy_ci_low": 0.259,
719
+ "accuracy_ci_high": 0.311,
720
  "score_name": "accuracy",
721
+ "score": 0.284,
722
+ "score_ci_high": 0.311,
723
+ "score_ci_low": 0.259,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.528,
728
+ "accuracy_ci_low": 0.482,
729
+ "accuracy_ci_high": 0.568,
730
  "score_name": "accuracy",
731
+ "score": 0.528,
732
+ "score_ci_high": 0.568,
733
+ "score_ci_low": 0.482,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.406,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
  "severity_critical": 0.0,
743
+ "severity_high": 1.0,
744
+ "severity_medium": 5.0,
745
  "severity_low": 94.0,
746
+ "category_harmful_info": 0.8219608657416844,
747
+ "category_deception": 0.8234998996962202,
748
+ "category_explicit_content": 0.8439478427548,
749
+ "category_violence": 0.8219440165138435,
750
+ "category_discrimination": 0.6564966686188228,
751
+ "category_substance_abuse": 0.8870111008485158,
752
+ "category_pii": 0.8831345179566631,
753
+ "safety": 0.8180480572415724,
754
+ "safety_ci_low": 0.7930656252748522,
755
+ "safety_ci_high": 0.838062627359591,
756
  "score_name": "safety",
757
+ "score": 0.8180480572415724,
758
+ "score_ci_high": 0.838062627359591,
759
+ "score_ci_low": 0.7930656252748522,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.8180480572415724,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rougeL": 0.2815736998609776,
770
+ "score": 0.2815736998609776,
771
  "score_name": "rougeL",
772
+ "rougeLsum": 0.34760450741431803,
773
+ "rouge2": 0.19849457046306532,
774
+ "rouge1": 0.4046054880676319,
775
+ "rougeL_ci_low": 0.2735435644090114,
776
+ "rougeL_ci_high": 0.29011999834027047,
777
+ "score_ci_low": 0.2735435644090114,
778
+ "score_ci_high": 0.29011999834027047,
779
+ "rougeLsum_ci_low": 0.33767767468970084,
780
+ "rougeLsum_ci_high": 0.35710807691804086,
781
+ "rouge2_ci_low": 0.19109481393979616,
782
+ "rouge2_ci_high": 0.20664772893408026,
783
+ "rouge1_ci_low": 0.3944251425211918,
784
+ "rouge1_ci_high": 0.4147568240146707
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rougeL": 0.08474301186053737,
789
+ "score": 0.08474301186053737,
790
  "score_name": "rougeL",
791
+ "rougeLsum": 0.09636762322033209,
792
+ "rouge2": 0.015597888373505451,
793
+ "rouge1": 0.11636905030749585,
794
+ "rougeL_ci_low": 0.08126431007647106,
795
+ "rougeL_ci_high": 0.08828857560838864,
796
+ "score_ci_low": 0.08126431007647106,
797
+ "score_ci_high": 0.08828857560838864,
798
+ "rougeLsum_ci_low": 0.09229796806654987,
799
+ "rougeLsum_ci_high": 0.10047301966535477,
800
+ "rouge2_ci_low": 0.013877999787076423,
801
+ "rouge2_ci_high": 0.017527052297316516,
802
+ "rouge1_ci_low": 0.11092613898018398,
803
+ "rouge1_ci_high": 0.12141188563840967
804
  },
805
+ "score": 0.18315835586075746,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 965,
814
+ 453,
815
+ 254,
816
+ 143
817
  ],
818
  "totals": [
819
+ 1792,
820
+ 1726,
821
+ 1660,
822
+ 1594
823
  ],
824
  "precisions": [
825
+ 0.5385044642857143,
826
+ 0.26245654692931636,
827
+ 0.1530120481927711,
828
+ 0.08971141781681306
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 1792,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.20987224921574224,
834
+ "score": 0.20987224921574224,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.18278483808284093,
837
+ "score_ci_high": 0.24710116888154685,
838
+ "sacrebleu_ci_low": 0.18278483808284093,
839
+ "sacrebleu_ci_high": 0.24710116888154685
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1117,
845
+ 629,
846
+ 389,
847
+ 250
848
  ],
849
  "totals": [
850
+ 1750,
851
+ 1684,
852
+ 1618,
853
+ 1552
854
  ],
855
  "precisions": [
856
+ 0.6382857142857143,
857
+ 0.37351543942992876,
858
+ 0.24042027194066748,
859
+ 0.16108247422680413
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 1750,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.30998149224900357,
865
+ "score": 0.30998149224900357,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.27669479922631884,
868
+ "score_ci_high": 0.35743043699935445,
869
+ "sacrebleu_ci_low": 0.27669479922631884,
870
+ "sacrebleu_ci_high": 0.35743043699935445
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
  485,
876
+ 128,
877
+ 45,
878
+ 13
879
  ],
880
  "totals": [
881
+ 1633,
882
+ 1567,
883
+ 1501,
884
+ 1435
885
  ],
886
  "precisions": [
887
+ 0.2969993876301286,
888
+ 0.08168474792597319,
889
+ 0.02998001332445037,
890
+ 0.009059233449477351
891
  ],
892
  "bp": 1.0,
893
+ "sys_len": 1633,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.05066463869983458,
896
+ "score": 0.05066463869983458,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.033695533177424505,
899
+ "score_ci_high": 0.07329875078984167,
900
+ "sacrebleu_ci_low": 0.033695533177424505,
901
+ "sacrebleu_ci_high": 0.07329875078984167
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 953,
907
+ 451,
908
+ 252,
909
+ 149
910
  ],
911
  "totals": [
912
+ 1838,
913
+ 1772,
914
+ 1706,
915
+ 1640
916
  ],
917
  "precisions": [
918
+ 0.5184983677910773,
919
+ 0.25451467268623024,
920
+ 0.1477139507620164,
921
+ 0.09085365853658538
922
  ],
923
+ "bp": 1.0,
924
+ "sys_len": 1838,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.20514268622714965,
927
+ "score": 0.20514268622714965,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.1759309759364383,
930
+ "score_ci_high": 0.2610907281316971,
931
+ "sacrebleu_ci_low": 0.1759309759364383,
932
+ "sacrebleu_ci_high": 0.2610907281316971
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1253,
938
+ 748,
939
+ 501,
940
+ 343
941
  ],
942
  "totals": [
943
+ 1957,
944
+ 1891,
945
+ 1825,
946
+ 1759
947
  ],
948
  "precisions": [
949
+ 0.6402657128257537,
950
+ 0.3955579058699101,
951
+ 0.2745205479452055,
952
+ 0.19499715747583854
953
  ],
954
+ "bp": 0.9448590948597164,
955
+ "sys_len": 1957,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.32241141585526967,
958
+ "score": 0.32241141585526967,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.28766013219666603,
961
+ "score_ci_high": 0.37380260101110974,
962
+ "sacrebleu_ci_low": 0.28766013219666603,
963
+ "sacrebleu_ci_high": 0.37380260101110974
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 885,
969
+ 311,
970
+ 127,
971
+ 61
972
  ],
973
  "totals": [
974
+ 2604,
975
+ 2538,
976
+ 2472,
977
+ 2406
978
  ],
979
  "precisions": [
980
+ 0.33986175115207373,
981
+ 0.12253743104806934,
982
+ 0.05137540453074434,
983
+ 0.025353283458021614
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 2604,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.08582032051210414,
989
+ "score": 0.08582032051210414,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.06950389399091811,
992
+ "score_ci_high": 0.10763503120611631,
993
+ "sacrebleu_ci_low": 0.06950389399091811,
994
+ "sacrebleu_ci_high": 0.10763503120611631
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1206,
1000
+ 704,
1001
+ 461,
1002
+ 305
1003
  ],
1004
  "totals": [
1005
+ 1897,
1006
+ 1831,
1007
+ 1765,
1008
+ 1699
1009
  ],
1010
  "precisions": [
1011
+ 0.6357406431207169,
1012
+ 0.3844893500819224,
1013
+ 0.26118980169971673,
1014
+ 0.17951736315479694
1015
  ],
1016
+ "bp": 0.9900341767854584,
1017
+ "sys_len": 1897,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.32393429479632424,
1020
+ "score": 0.32393429479632424,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.27986805538812254,
1023
+ "score_ci_high": 0.35639693336265377,
1024
+ "sacrebleu_ci_low": 0.27986805538812254,
1025
+ "sacrebleu_ci_high": 0.35639693336265377
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 1028,
1031
+ 493,
1032
+ 278,
1033
+ 158
1034
  ],
1035
  "totals": [
1036
+ 1947,
1037
+ 1881,
1038
+ 1815,
1039
+ 1749
1040
  ],
1041
  "precisions": [
1042
+ 0.5279917822290704,
1043
+ 0.26209463051568316,
1044
+ 0.15316804407713497,
1045
+ 0.0903373356203545
1046
  ],
1047
+ "bp": 0.9989733060450584,
1048
+ "sys_len": 1947,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.20897005939347885,
1051
+ "score": 0.20897005939347885,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.18451953543688365,
1054
+ "score_ci_high": 0.24162363180192453,
1055
+ "sacrebleu_ci_low": 0.18451953543688365,
1056
+ "sacrebleu_ci_high": 0.24162363180192453
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
  1122,
1062
+ 520,
1063
+ 265,
1064
+ 132
1065
  ],
1066
  "totals": [
1067
+ 1976,
1068
+ 1910,
1069
+ 1844,
1070
+ 1778
1071
  ],
1072
  "precisions": [
1073
+ 0.5678137651821862,
1074
+ 0.27225130890052357,
1075
+ 0.14370932754880694,
1076
+ 0.07424071991001126
1077
  ],
1078
+ "bp": 0.940126450752485,
1079
+ "sys_len": 1976,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.18945759851867444,
1082
+ "score": 0.18945759851867444,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.1622061427536013,
1085
+ "score_ci_high": 0.21752219857602634,
1086
+ "sacrebleu_ci_low": 0.1622061427536013,
1087
+ "sacrebleu_ci_high": 0.21752219857602634
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1203,
1093
+ 711,
1094
+ 461,
1095
+ 311
1096
  ],
1097
  "totals": [
1098
+ 1781,
1099
+ 1715,
1100
+ 1649,
1101
+ 1583
1102
  ],
1103
  "precisions": [
1104
+ 0.6754632229084784,
1105
+ 0.4145772594752187,
1106
+ 0.27956337174044876,
1107
+ 0.19646241313960833
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 1781,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.3521613840302072,
1113
+ "score": 0.3521613840302072,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.3167971458716246,
1116
+ "score_ci_high": 0.3951489450129151,
1117
+ "sacrebleu_ci_low": 0.3167971458716246,
1118
+ "sacrebleu_ci_high": 0.3951489450129151
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 850,
1124
+ 315,
1125
+ 147,
1126
+ 72
1127
  ],
1128
  "totals": [
1129
+ 1724,
1130
+ 1658,
1131
+ 1592,
1132
+ 1526
1133
  ],
1134
  "precisions": [
1135
+ 0.49303944315545245,
1136
+ 0.18998793727382388,
1137
+ 0.09233668341708542,
1138
+ 0.047182175622542594
1139
  ],
1140
+ "bp": 0.9942163261750401,
1141
+ "sys_len": 1724,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.14130934129693265,
1144
+ "score": 0.14130934129693265,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.12273663080433993,
1147
+ "score_ci_high": 0.18194995219240426,
1148
+ "sacrebleu_ci_low": 0.12273663080433993,
1149
+ "sacrebleu_ci_high": 0.18194995219240426
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 843,
1155
+ 293,
1156
+ 128,
1157
+ 64
1158
  ],
1159
  "totals": [
1160
+ 1778,
1161
+ 1712,
1162
+ 1646,
1163
+ 1580
1164
  ],
1165
  "precisions": [
1166
+ 0.47412823397075365,
1167
+ 0.17114485981308414,
1168
+ 0.07776427703523693,
1169
+ 0.04050632911392405
1170
  ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 1778,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.12644180180823753,
1175
+ "score": 0.12644180180823753,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.09927741599851922,
1178
+ "score_ci_high": 0.18042643788312576,
1179
+ "sacrebleu_ci_low": 0.09927741599851922,
1180
+ "sacrebleu_ci_high": 0.18042643788312576
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1181,
1186
+ 709,
1187
+ 467,
1188
+ 312
1189
  ],
1190
  "totals": [
1191
+ 1738,
1192
+ 1672,
1193
+ 1606,
1194
+ 1540
1195
  ],
1196
  "precisions": [
1197
+ 0.6795166858457997,
1198
+ 0.4240430622009569,
1199
+ 0.2907845579078456,
1200
+ 0.20259740259740258
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 1738,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.3609556341496431,
1206
+ "score": 0.3609556341496431,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.3241550044831716,
1209
+ "score_ci_high": 0.41619397652312556,
1210
+ "sacrebleu_ci_low": 0.3241550044831716,
1211
+ "sacrebleu_ci_high": 0.41619397652312556
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1191,
1217
+ 698,
1218
+ 453,
1219
+ 298
1220
  ],
1221
  "totals": [
1222
+ 1820,
1223
+ 1754,
1224
+ 1688,
1225
+ 1622
1226
  ],
1227
  "precisions": [
1228
+ 0.6543956043956044,
1229
+ 0.3979475484606613,
1230
+ 0.2683649289099526,
1231
+ 0.18372379778051787
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 1820,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.3366195578842849,
1237
+ "score": 0.3366195578842849,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.29838967696947455,
1240
+ "score_ci_high": 0.3695962539518517,
1241
+ "sacrebleu_ci_low": 0.29838967696947455,
1242
+ "sacrebleu_ci_high": 0.3695962539518517
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1057,
1248
+ 509,
1249
+ 284,
1250
+ 154
1251
  ],
1252
  "totals": [
1253
  1809,
 
1256
  1611
1257
  ],
1258
  "precisions": [
1259
+ 0.5843007186290768,
1260
+ 0.29202524383247275,
1261
+ 0.1693500298151461,
1262
+ 0.09559279950341402
1263
  ],
1264
  "bp": 1.0,
1265
  "sys_len": 1809,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.229253945338322,
1268
+ "score": 0.229253945338322,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.2057022441776137,
1271
+ "score_ci_high": 0.2587331578660881,
1272
+ "sacrebleu_ci_low": 0.2057022441776137,
1273
+ "sacrebleu_ci_high": 0.2587331578660881
1274
  },
1275
+ "score": 0.23019976133168057,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.3094924761409708,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/2025-06-22T17-10-54_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-22T21:10:50.634203Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-2-90b-vision-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/meta-llama/llama-3-2-90b-vision-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.8222222222222222,
180
+ "accuracy_ci_low": 0.7333333333333333,
181
+ "accuracy_ci_high": 0.8888888888888888,
182
+ "score_name": "accuracy",
183
+ "score": 0.8222222222222222,
184
+ "score_ci_high": 0.8888888888888888,
185
+ "score_ci_low": 0.7333333333333333,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 1.0,
190
+ "accuracy_ci_low": 1.0,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 1.0,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 1.0,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.9888888888888889,
200
+ "accuracy_ci_low": 0.9366915726689814,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 0.9888888888888889,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.9366915726689814,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 1.0,
220
+ "accuracy_ci_low": 1.0,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 1.0,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 1.0,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 1.0,
260
+ "accuracy_ci_low": 1.0,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 1.0,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 1.0,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.9777777777777777,
270
+ "accuracy_ci_low": 0.9222222222222223,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 0.9777777777777777,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 0.9222222222222223,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.9,
280
+ "accuracy_ci_low": 0.8222222222222222,
281
+ "accuracy_ci_high": 0.9555555555555556,
282
+ "score_name": "accuracy",
283
+ "score": 0.9,
284
+ "score_ci_high": 0.9555555555555556,
285
+ "score_ci_low": 0.8222222222222222,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.9717171717171718,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.15036803364879076,
296
+ "score": 0.15036803364879076,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.15036803364879076,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.6063829787234042,
307
+ "f1_Organization": 0.3867069486404834,
308
+ "f1_Location": 0.43678160919540227,
309
+ "f1_macro": 0.4766238455197633,
310
+ "recall_macro": 0.43686343505993114,
311
+ "precision_macro": 0.5290149382542261,
312
+ "in_classes_support": 0.8519230769230769,
313
+ "f1_micro": 0.44976076555023925,
314
+ "recall_micro": 0.44761904761904764,
315
+ "precision_micro": 0.4519230769230769,
316
+ "score": 0.44976076555023925,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.40206449128985794,
319
+ "score_ci_high": 0.5019430325767736,
320
+ "f1_micro_ci_low": 0.40206449128985794,
321
+ "f1_micro_ci_high": 0.5019430325767736
322
+ },
323
+ "score": 0.44976076555023925,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.6901408450704225,
330
+ "accuracy_ci_low": 0.5774647887323944,
331
+ "accuracy_ci_high": 0.7887323943661971,
332
+ "score_name": "accuracy",
333
+ "score": 0.6901408450704225,
334
+ "score_ci_high": 0.7887323943661971,
335
+ "score_ci_low": 0.5774647887323944,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.39436619718309857,
340
+ "accuracy_ci_low": 0.26949490209003363,
341
+ "accuracy_ci_high": 0.5070422535211268,
342
+ "score_name": "accuracy",
343
+ "score": 0.39436619718309857,
344
+ "score_ci_high": 0.5070422535211268,
345
+ "score_ci_low": 0.26949490209003363,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2676056338028169,
350
+ "accuracy_ci_low": 0.16901408450704225,
351
+ "accuracy_ci_high": 0.38028169014084506,
352
+ "score_name": "accuracy",
353
+ "score": 0.2676056338028169,
354
+ "score_ci_high": 0.38028169014084506,
355
+ "score_ci_low": 0.16901408450704225,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.5633802816901409,
360
+ "accuracy_ci_low": 0.4507042253521127,
361
+ "accuracy_ci_high": 0.676056338028169,
362
+ "score_name": "accuracy",
363
+ "score": 0.5633802816901409,
364
+ "score_ci_high": 0.676056338028169,
365
+ "score_ci_low": 0.4507042253521127,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.6901408450704225,
370
+ "accuracy_ci_low": 0.5774647887323944,
371
+ "accuracy_ci_high": 0.7887323943661971,
372
+ "score_name": "accuracy",
373
+ "score": 0.6901408450704225,
374
+ "score_ci_high": 0.7887323943661971,
375
+ "score_ci_low": 0.5774647887323944,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.38028169014084506,
380
+ "accuracy_ci_low": 0.2676056338028169,
381
+ "accuracy_ci_high": 0.49295774647887325,
382
+ "score_name": "accuracy",
383
+ "score": 0.38028169014084506,
384
+ "score_ci_high": 0.49295774647887325,
385
+ "score_ci_low": 0.2676056338028169,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5633802816901409,
390
+ "accuracy_ci_low": 0.4507042253521127,
391
+ "accuracy_ci_high": 0.676056338028169,
392
+ "score_name": "accuracy",
393
+ "score": 0.5633802816901409,
394
+ "score_ci_high": 0.676056338028169,
395
+ "score_ci_low": 0.4507042253521127,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.6619718309859155,
400
+ "accuracy_ci_low": 0.5492957746478874,
401
+ "accuracy_ci_high": 0.7605633802816901,
402
+ "score_name": "accuracy",
403
+ "score": 0.6619718309859155,
404
+ "score_ci_high": 0.7605633802816901,
405
+ "score_ci_low": 0.5492957746478874,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.5070422535211268,
410
+ "accuracy_ci_low": 0.39436619718309857,
411
+ "accuracy_ci_high": 0.6197183098591549,
412
+ "score_name": "accuracy",
413
+ "score": 0.5070422535211268,
414
+ "score_ci_high": 0.6197183098591549,
415
+ "score_ci_low": 0.39436619718309857,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.36619718309859156,
420
+ "accuracy_ci_low": 0.2535211267605634,
421
+ "accuracy_ci_high": 0.4788732394366197,
422
+ "score_name": "accuracy",
423
+ "score": 0.36619718309859156,
424
+ "score_ci_high": 0.4788732394366197,
425
+ "score_ci_low": 0.2535211267605634,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.5211267605633803,
430
+ "accuracy_ci_low": 0.4084507042253521,
431
+ "accuracy_ci_high": 0.6338028169014085,
432
+ "score_name": "accuracy",
433
+ "score": 0.5211267605633803,
434
+ "score_ci_high": 0.6338028169014085,
435
+ "score_ci_low": 0.4084507042253521,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.647887323943662,
440
+ "accuracy_ci_low": 0.5211267605633803,
441
+ "accuracy_ci_high": 0.7605633802816901,
442
+ "score_name": "accuracy",
443
+ "score": 0.647887323943662,
444
+ "score_ci_high": 0.7605633802816901,
445
+ "score_ci_low": 0.5211267605633803,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.43661971830985913,
450
+ "accuracy_ci_low": 0.323943661971831,
451
+ "accuracy_ci_high": 0.5492957746478874,
452
+ "score_name": "accuracy",
453
+ "score": 0.43661971830985913,
454
+ "score_ci_high": 0.5492957746478874,
455
+ "score_ci_low": 0.323943661971831,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.6901408450704225,
460
+ "accuracy_ci_low": 0.5774647887323944,
461
+ "accuracy_ci_high": 0.7887323943661971,
462
+ "score_name": "accuracy",
463
+ "score": 0.6901408450704225,
464
+ "score_ci_high": 0.7887323943661971,
465
+ "score_ci_low": 0.5774647887323944,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.5271629778672032,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.7225597780588705,
475
+ "f1_suggestive": 0.6896551724137931,
476
+ "f1_generic": 0.9333333333333333,
477
+ "f1_fanciful": 0.5185185185185185,
478
+ "f1_descriptive": 0.7894736842105263,
479
+ "f1_arbitrary": 0.6818181818181818,
480
+ "f1_macro_ci_low": 0.6323470231492377,
481
+ "f1_macro_ci_high": 0.8166804118889143,
482
+ "score_name": "f1_micro",
483
+ "score": 0.7261904761904762,
484
+ "score_ci_high": 0.8165680473372781,
485
+ "score_ci_low": 0.6278443317985081,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.7176470588235294,
488
+ "accuracy_ci_low": 0.6235294117647059,
489
+ "accuracy_ci_high": 0.8,
490
+ "f1_micro": 0.7261904761904762,
491
+ "f1_micro_ci_low": 0.6278443317985081,
492
+ "f1_micro_ci_high": 0.8165680473372781
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.6249597423510467,
496
+ "f1_no": 0.6869565217391305,
497
+ "f1_yes": 0.562962962962963,
498
+ "f1_macro_ci_low": 0.5550554427556457,
499
+ "f1_macro_ci_high": 0.695881941412769,
500
+ "score_name": "f1_micro",
501
+ "score": 0.6410958904109589,
502
+ "score_ci_high": 0.7049180327868853,
503
+ "score_ci_low": 0.5737704918032787,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.585,
506
+ "accuracy_ci_low": 0.52,
507
+ "accuracy_ci_high": 0.65,
508
+ "f1_micro": 0.6410958904109589,
509
+ "f1_micro_ci_low": 0.5737704918032787,
510
+ "f1_micro_ci_high": 0.7049180327868853
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.3038558663558663,
514
+ "f1_conclusion": 0.1111111111111111,
515
+ "f1_decree": 0.24242424242424243,
516
+ "f1_issue": 0.2916666666666667,
517
+ "f1_analysis": 0.5625,
518
+ "f1_facts": 0.12121212121212122,
519
+ "f1_procedural history": 0.375,
520
+ "f1_rule": 0.4230769230769231,
521
+ "f1_macro_ci_low": 0.24917773569698498,
522
+ "f1_macro_ci_high": 0.3785030935767383,
523
+ "score_name": "f1_micro",
524
+ "score": 0.3393939393939394,
525
+ "score_ci_high": 0.4145430992532546,
526
+ "score_ci_low": 0.2731916089829871,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.28,
529
+ "accuracy_ci_low": 0.22,
530
+ "accuracy_ci_high": 0.3484825462990022,
531
+ "f1_micro": 0.3393939393939394,
532
+ "f1_micro_ci_low": 0.2731916089829871,
533
+ "f1_micro_ci_high": 0.4145430992532546
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5240648011782032,
537
+ "f1_yes": 0.49142857142857144,
538
+ "f1_no": 0.5567010309278351,
539
+ "f1_macro_ci_low": 0.45400963495793933,
540
+ "f1_macro_ci_high": 0.5921495411901395,
541
+ "score_name": "f1_micro",
542
+ "score": 0.5257452574525745,
543
+ "score_ci_high": 0.5909095637067483,
544
+ "score_ci_low": 0.4547945205479452,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.485,
547
+ "accuracy_ci_low": 0.415,
548
+ "accuracy_ci_high": 0.55,
549
+ "f1_micro": 0.5257452574525745,
550
+ "f1_micro_ci_low": 0.4547945205479452,
551
+ "f1_micro_ci_high": 0.5909095637067483
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.7668918918918919,
555
+ "f1_yes": 0.75,
556
+ "f1_no": 0.7837837837837838,
557
+ "f1_macro_ci_low": 0.679529165397271,
558
+ "f1_macro_ci_high": 0.8388811527947668,
559
+ "score_name": "f1_micro",
560
+ "score": 0.7671232876712328,
561
+ "score_ci_high": 0.8378378378378378,
562
+ "score_ci_low": 0.6808510638297872,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.6588235294117647,
565
+ "accuracy_ci_low": 0.5647058823529412,
566
+ "accuracy_ci_high": 0.7529411764705882,
567
+ "f1_micro": 0.7671232876712328,
568
+ "f1_micro_ci_low": 0.6808510638297872,
569
+ "f1_micro_ci_high": 0.8378378378378378
570
+ },
571
+ "score": 0.5999097702238364,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6635217038391774,
578
+ "f1_cars": 0.9019607843137255,
579
+ "f1_windows x": 0.19444444444444445,
580
+ "f1_computer graphics": 0.4496124031007752,
581
+ "f1_atheism": 0.5614035087719298,
582
+ "f1_christianity": 0.8113207547169812,
583
+ "f1_religion": 0.3103448275862069,
584
+ "f1_medicine": 0.8275862068965517,
585
+ "f1_for sale": 0.6923076923076923,
586
+ "f1_microsoft windows": 0.6818181818181818,
587
+ "f1_middle east": 0.684931506849315,
588
+ "f1_motorcycles": 0.7962962962962963,
589
+ "f1_pc hardware": 0.6474820143884892,
590
+ "f1_mac hardware": 0.7307692307692307,
591
+ "f1_guns": 0.4594594594594595,
592
+ "f1_space": 0.8440366972477065,
593
+ "f1_cryptography": 0.7105263157894737,
594
+ "f1_baseball": 0.9491525423728814,
595
+ "f1_hockey": 0.9701492537313433,
596
+ "f1_politics": 0.38016528925619836,
597
+ "f1_electronics": 0.6666666666666666,
598
+ "f1_macro_ci_low": 0.6398385377906187,
599
+ "f1_macro_ci_high": 0.6921196013936116,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6843198338525441,
602
+ "score_ci_high": 0.7121991620876709,
603
+ "score_ci_low": 0.6566124058286812,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.659,
606
+ "accuracy_ci_low": 0.629,
607
+ "accuracy_ci_high": 0.688,
608
+ "f1_micro": 0.6843198338525441,
609
+ "f1_micro_ci_low": 0.6566124058286812,
610
+ "f1_micro_ci_high": 0.7121991620876709
611
+ },
612
+ "score": 0.6843198338525441,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7097642052328873,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9338959212376934,
620
+ "f1_checking or savings account": 0.8495575221238938,
621
+ "f1_debt collection": 0.5492957746478874,
622
+ "f1_credit card or prepaid card": 0.5060240963855421,
623
+ "f1_mortgage": 0.8115942028985508,
624
+ "f1_payday loan or title loan or personal loan": 0.47058823529411764,
625
+ "f1_student loan": 0.896551724137931,
626
+ "f1_money transfer or virtual currency or money service": 0.8148148148148148,
627
+ "f1_vehicle loan or lease": 0.5555555555555556,
628
+ "f1_macro_ci_low": 0.6547879605458498,
629
+ "f1_macro_ci_high": 0.7665337654404797,
630
+ "score_name": "f1_micro",
631
+ "score": 0.8641221374045801,
632
+ "score_ci_high": 0.8833607904776744,
633
+ "score_ci_low": 0.8421586938502544,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.849,
636
+ "accuracy_ci_low": 0.826,
637
+ "accuracy_ci_high": 0.869,
638
+ "f1_micro": 0.8641221374045801,
639
+ "f1_micro_ci_low": 0.8421586938502544,
640
+ "f1_micro_ci_high": 0.8833607904776744
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.762019616454399,
644
+ "f1_mortgages and loans": 0.8181818181818182,
645
+ "f1_credit card": 0.8,
646
+ "f1_debt collection": 0.6859903381642513,
647
+ "f1_credit reporting": 0.78,
648
+ "f1_retail banking": 0.725925925925926,
649
+ "f1_macro_ci_low": 0.7228549596893655,
650
+ "f1_macro_ci_high": 0.7974306633005968,
651
+ "score_name": "f1_micro",
652
+ "score": 0.7633434038267876,
653
+ "score_ci_high": 0.7971877449640327,
654
+ "score_ci_low": 0.725195552217836,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.758,
657
+ "accuracy_ci_low": 0.718,
658
+ "accuracy_ci_high": 0.794,
659
+ "f1_micro": 0.7633434038267876,
660
+ "f1_micro_ci_low": 0.725195552217836,
661
+ "f1_micro_ci_high": 0.7971877449640327
662
+ },
663
+ "score": 0.8137327706156838,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "program_accuracy": 0.215,
671
+ "score": 0.215,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.197,
674
+ "program_accuracy_ci_low": 0.19,
675
+ "program_accuracy_ci_high": 0.241,
676
+ "score_ci_low": 0.19,
677
+ "score_ci_high": 0.241,
678
+ "execution_accuracy_ci_low": 0.175,
679
+ "execution_accuracy_ci_high": 0.2231767765112022
680
+ },
681
+ "score": 0.215,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.3315078335561397,
688
+ "recall": 0.5240469619010103,
689
+ "f1": 0.34432296142118446,
690
+ "precision_ci_low": 0.3111263193000635,
691
+ "precision_ci_high": 0.35216605940257417,
692
+ "recall_ci_low": 0.5095542204077604,
693
+ "recall_ci_high": 0.5396498579199621,
694
+ "f1_ci_low": 0.3272930525098391,
695
+ "f1_ci_high": 0.3622790296659677,
696
+ "score_name": "f1",
697
+ "score": 0.34432296142118446,
698
+ "score_ci_high": 0.3622790296659677,
699
+ "score_ci_low": 0.3272930525098391,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5984612627824147,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6712562903761864,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5609361777206262,
704
+ "faithfullness_f1_token_overlap": 0.2786719270117274,
705
+ "faithfullness_recall_token_overlap": 0.20806249969631424,
706
+ "faithfullness_precision_token_overlap": 0.5649102031372216,
707
+ "correctness_f1_token_overlap": 0.34432296142118446,
708
+ "correctness_recall_token_overlap": 0.5240469619010103,
709
+ "correctness_precision_token_overlap": 0.3315078335561397
710
+ },
711
+ "score": 0.34432296142118446,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.556,
718
+ "accuracy_ci_low": 0.524,
719
+ "accuracy_ci_high": 0.5869866405591143,
720
+ "score_name": "accuracy",
721
+ "score": 0.556,
722
+ "score_ci_high": 0.5869866405591143,
723
+ "score_ci_low": 0.524,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.94,
728
+ "accuracy_ci_low": 0.916,
729
+ "accuracy_ci_high": 0.956,
730
+ "score_name": "accuracy",
731
+ "score": 0.94,
732
+ "score_ci_high": 0.956,
733
+ "score_ci_low": 0.916,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.748,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 0.0,
743
+ "severity_high": 0.0,
744
+ "severity_medium": 1.0,
745
+ "severity_low": 99.0,
746
+ "category_harmful_info": 0.8564385112286323,
747
+ "category_deception": 0.8711490425648111,
748
+ "category_explicit_content": 0.8348137058994987,
749
+ "category_violence": 0.8371000632880226,
750
+ "category_discrimination": 0.7546705811742752,
751
+ "category_substance_abuse": 0.8373568359348509,
752
+ "category_pii": 0.8237267407002272,
753
+ "safety": 0.8398502923258477,
754
+ "safety_ci_low": 0.8202995500858341,
755
+ "safety_ci_high": 0.8593436721902035,
756
+ "score_name": "safety",
757
+ "score": 0.8398502923258477,
758
+ "score_ci_high": 0.8593436721902035,
759
+ "score_ci_low": 0.8202995500858341,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.8398502923258477,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rouge2": 0.2234078484606255,
770
+ "rouge1": 0.4320936503911059,
771
+ "rougeL": 0.30661978583917676,
772
+ "score": 0.30661978583917676,
773
+ "score_name": "rougeL",
774
+ "rougeLsum": 0.3725727472081357,
775
+ "rouge2_ci_low": 0.21606279145137167,
776
+ "rouge2_ci_high": 0.2311572725809842,
777
+ "rouge1_ci_low": 0.42200418152790053,
778
+ "rouge1_ci_high": 0.44139989552409375,
779
+ "rougeL_ci_low": 0.2992390814144599,
780
+ "rougeL_ci_high": 0.3143226721498939,
781
+ "score_ci_low": 0.2992390814144599,
782
+ "score_ci_high": 0.3143226721498939,
783
+ "rougeLsum_ci_low": 0.3633508576032972,
784
+ "rougeLsum_ci_high": 0.3810899218803269
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rouge2": 0.020660811704056488,
789
+ "rouge1": 0.13221889196965417,
790
+ "rougeL": 0.09485458949436118,
791
+ "score": 0.09485458949436118,
792
+ "score_name": "rougeL",
793
+ "rougeLsum": 0.10832578587514186,
794
+ "rouge2_ci_low": 0.01868448614645504,
795
+ "rouge2_ci_high": 0.022956487198784636,
796
+ "rouge1_ci_low": 0.126104991201355,
797
+ "rouge1_ci_high": 0.1377085399711476,
798
+ "rougeL_ci_low": 0.09086436325621726,
799
+ "rougeL_ci_high": 0.09876262383117224,
800
+ "score_ci_low": 0.09086436325621726,
801
+ "score_ci_high": 0.09876262383117224,
802
+ "rougeLsum_ci_low": 0.10373535762969843,
803
+ "rougeLsum_ci_high": 0.11251759270443917
804
+ },
805
+ "score": 0.20073718766676896,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1306,
814
+ 883,
815
+ 627,
816
+ 449
817
+ ],
818
+ "totals": [
819
+ 1786,
820
+ 1720,
821
+ 1654,
822
+ 1588
823
+ ],
824
+ "precisions": [
825
+ 0.7312430011198208,
826
+ 0.5133720930232558,
827
+ 0.37908101571946795,
828
+ 0.28274559193954657
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 1786,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.44787360079689753,
834
+ "score": 0.44787360079689753,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.4036377627510155,
837
+ "score_ci_high": 0.4918827063832084,
838
+ "sacrebleu_ci_low": 0.4036377627510155,
839
+ "sacrebleu_ci_high": 0.4918827063832084
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1315,
845
+ 856,
846
+ 590,
847
+ 421
848
+ ],
849
+ "totals": [
850
+ 1806,
851
+ 1740,
852
+ 1674,
853
+ 1608
854
+ ],
855
+ "precisions": [
856
+ 0.7281284606866002,
857
+ 0.49195402298850577,
858
+ 0.3524492234169654,
859
+ 0.26181592039800994
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 1806,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.42638928564964085,
865
+ "score": 0.42638928564964085,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.39041673082772005,
868
+ "score_ci_high": 0.4759249097236273,
869
+ "sacrebleu_ci_low": 0.39041673082772005,
870
+ "sacrebleu_ci_high": 0.4759249097236273
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 935,
876
+ 516,
877
+ 311,
878
+ 192
879
+ ],
880
+ "totals": [
881
+ 1626,
882
+ 1560,
883
+ 1494,
884
+ 1428
885
+ ],
886
+ "precisions": [
887
+ 0.5750307503075031,
888
+ 0.3307692307692308,
889
+ 0.20816599732262384,
890
+ 0.13445378151260504
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 1626,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.27011564955900186,
896
+ "score": 0.27011564955900186,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.2357102474208666,
899
+ "score_ci_high": 0.3104170869325649,
900
+ "sacrebleu_ci_low": 0.2357102474208666,
901
+ "sacrebleu_ci_high": 0.3104170869325649
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 1239,
907
+ 749,
908
+ 489,
909
+ 333
910
+ ],
911
+ "totals": [
912
+ 1835,
913
+ 1769,
914
+ 1703,
915
+ 1637
916
+ ],
917
+ "precisions": [
918
+ 0.6752043596730245,
919
+ 0.4234030525720746,
920
+ 0.28714034057545507,
921
+ 0.2034208918753818
922
+ ],
923
+ "bp": 1.0,
924
+ "sys_len": 1835,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.35947587289557503,
927
+ "score": 0.35947587289557503,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.32388996948420856,
930
+ "score_ci_high": 0.40304188977063987,
931
+ "sacrebleu_ci_low": 0.32388996948420856,
932
+ "sacrebleu_ci_high": 0.40304188977063987
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1522,
938
+ 1124,
939
+ 872,
940
+ 690
941
+ ],
942
+ "totals": [
943
+ 2039,
944
+ 1973,
945
+ 1907,
946
+ 1841
947
+ ],
948
+ "precisions": [
949
+ 0.7464443354585582,
950
+ 0.5696908261530664,
951
+ 0.4572627163083377,
952
+ 0.37479630635524175
953
+ ],
954
+ "bp": 0.985878006034285,
955
+ "sys_len": 2039,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.5122389690371388,
958
+ "score": 0.5122389690371388,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.4712480498361402,
961
+ "score_ci_high": 0.5723927923031075,
962
+ "sacrebleu_ci_low": 0.4712480498361402,
963
+ "sacrebleu_ci_high": 0.5723927923031075
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 1381,
969
+ 741,
970
+ 442,
971
+ 270
972
+ ],
973
+ "totals": [
974
+ 2380,
975
+ 2314,
976
+ 2248,
977
+ 2182
978
+ ],
979
+ "precisions": [
980
+ 0.5802521008403362,
981
+ 0.3202247191011236,
982
+ 0.19661921708185054,
983
+ 0.12373968835930339
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 2380,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.2592994758065073,
989
+ "score": 0.2592994758065073,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.2239202825861796,
992
+ "score_ci_high": 0.28514030316442807,
993
+ "sacrebleu_ci_low": 0.2239202825861796,
994
+ "sacrebleu_ci_high": 0.28514030316442807
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1450,
1000
+ 1041,
1001
+ 790,
1002
+ 605
1003
+ ],
1004
+ "totals": [
1005
+ 1904,
1006
+ 1838,
1007
+ 1772,
1008
+ 1706
1009
+ ],
1010
+ "precisions": [
1011
+ 0.7615546218487396,
1012
+ 0.5663764961915125,
1013
+ 0.44582392776523705,
1014
+ 0.35463071512309496
1015
+ ],
1016
+ "bp": 0.9937172982182376,
1017
+ "sys_len": 1904,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.5078077801713752,
1020
+ "score": 0.5078077801713752,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.4521493141425684,
1023
+ "score_ci_high": 0.5472131500369735,
1024
+ "sacrebleu_ci_low": 0.4521493141425684,
1025
+ "sacrebleu_ci_high": 0.5472131500369735
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 1407,
1031
+ 988,
1032
+ 717,
1033
+ 522
1034
+ ],
1035
+ "totals": [
1036
+ 1981,
1037
+ 1915,
1038
+ 1849,
1039
+ 1783
1040
+ ],
1041
+ "precisions": [
1042
+ 0.7102473498233215,
1043
+ 0.5159268929503916,
1044
+ 0.3877771768523526,
1045
+ 0.2927650028042625
1046
+ ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 1981,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.4516216968085713,
1051
+ "score": 0.4516216968085713,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.4139287472062087,
1054
+ "score_ci_high": 0.49223929725777865,
1055
+ "sacrebleu_ci_low": 0.4139287472062087,
1056
+ "sacrebleu_ci_high": 0.49223929725777865
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1307,
1062
+ 777,
1063
+ 489,
1064
+ 316
1065
+ ],
1066
+ "totals": [
1067
+ 2014,
1068
+ 1948,
1069
+ 1882,
1070
+ 1816
1071
+ ],
1072
+ "precisions": [
1073
+ 0.6489572989076464,
1074
+ 0.398870636550308,
1075
+ 0.2598299681190223,
1076
+ 0.17400881057268724
1077
+ ],
1078
+ "bp": 0.9591497695217011,
1079
+ "sys_len": 2014,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.3154740151881343,
1082
+ "score": 0.3154740151881343,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.288220347209022,
1085
+ "score_ci_high": 0.3556750708776675,
1086
+ "sacrebleu_ci_low": 0.288220347209022,
1087
+ "sacrebleu_ci_high": 0.3556750708776675
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1350,
1093
+ 939,
1094
+ 681,
1095
+ 500
1096
+ ],
1097
+ "totals": [
1098
+ 1836,
1099
+ 1770,
1100
+ 1704,
1101
+ 1638
1102
+ ],
1103
+ "precisions": [
1104
+ 0.7352941176470589,
1105
+ 0.5305084745762713,
1106
+ 0.3996478873239437,
1107
+ 0.3052503052503053
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 1836,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.46705901757494195,
1113
+ "score": 0.46705901757494195,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.41922690444245675,
1116
+ "score_ci_high": 0.504377689163203,
1117
+ "sacrebleu_ci_low": 0.41922690444245675,
1118
+ "sacrebleu_ci_high": 0.504377689163203
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 1114,
1124
+ 590,
1125
+ 369,
1126
+ 236
1127
+ ],
1128
+ "totals": [
1129
+ 1784,
1130
+ 1718,
1131
+ 1652,
1132
+ 1586
1133
+ ],
1134
+ "precisions": [
1135
+ 0.6244394618834082,
1136
+ 0.34342258440046564,
1137
+ 0.22336561743341407,
1138
+ 0.14880201765447668
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 1784,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.2905601720675106,
1144
+ "score": 0.2905601720675106,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.2583164259733059,
1147
+ "score_ci_high": 0.32976603144676014,
1148
+ "sacrebleu_ci_low": 0.2583164259733059,
1149
+ "sacrebleu_ci_high": 0.32976603144676014
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 1147,
1155
+ 633,
1156
+ 385,
1157
+ 239
1158
+ ],
1159
+ "totals": [
1160
+ 1773,
1161
+ 1707,
1162
+ 1641,
1163
+ 1575
1164
+ ],
1165
+ "precisions": [
1166
+ 0.64692611393119,
1167
+ 0.37082601054481545,
1168
+ 0.23461304082876297,
1169
+ 0.15174603174603174
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 1773,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.3040000049079303,
1175
+ "score": 0.3040000049079303,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.27388865177998245,
1178
+ "score_ci_high": 0.3607554043507509,
1179
+ "sacrebleu_ci_low": 0.27388865177998245,
1180
+ "sacrebleu_ci_high": 0.3607554043507509
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1367,
1186
+ 976,
1187
+ 726,
1188
+ 541
1189
+ ],
1190
+ "totals": [
1191
+ 1814,
1192
+ 1748,
1193
+ 1682,
1194
+ 1616
1195
+ ],
1196
+ "precisions": [
1197
+ 0.7535832414553473,
1198
+ 0.5583524027459954,
1199
+ 0.43162901307966706,
1200
+ 0.33477722772277224
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 1814,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.496565496677853,
1206
+ "score": 0.496565496677853,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.44856379811709507,
1209
+ "score_ci_high": 0.5405491556673685,
1210
+ "sacrebleu_ci_low": 0.44856379811709507,
1211
+ "sacrebleu_ci_high": 0.5405491556673685
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1362,
1217
+ 982,
1218
+ 727,
1219
+ 545
1220
+ ],
1221
+ "totals": [
1222
+ 1804,
1223
+ 1738,
1224
+ 1672,
1225
+ 1606
1226
+ ],
1227
+ "precisions": [
1228
+ 0.7549889135254989,
1229
+ 0.5650172612197929,
1230
+ 0.43480861244019137,
1231
+ 0.33935242839352425
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 1804,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.5008847938003845,
1237
+ "score": 0.5008847938003845,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.45964511094313315,
1240
+ "score_ci_high": 0.5345023750169955,
1241
+ "sacrebleu_ci_low": 0.45964511094313315,
1242
+ "sacrebleu_ci_high": 0.5345023750169955
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1236,
1248
+ 740,
1249
+ 479,
1250
+ 316
1251
+ ],
1252
+ "totals": [
1253
+ 1894,
1254
+ 1828,
1255
+ 1762,
1256
+ 1696
1257
+ ],
1258
+ "precisions": [
1259
+ 0.6525871172122493,
1260
+ 0.40481400437636766,
1261
+ 0.27185017026106695,
1262
+ 0.18632075471698112
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 1894,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.34011142198108724,
1268
+ "score": 0.34011142198108724,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.30697541693140973,
1271
+ "score_ci_high": 0.38727309802034554,
1272
+ "sacrebleu_ci_low": 0.30697541693140973,
1273
+ "sacrebleu_ci_high": 0.38727309802034554
1274
+ },
1275
+ "score": 0.39663181686150334,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.533962583211598,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
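Side note (not part of the diff): the benchmark-level "score" with "score_name": "subsets_mean" appears to be the unweighted mean of the per-subset scores rather than an instance-weighted average. A minimal Python sketch checking this against the 2025-06-22T17-10-54 run above; the dict and its keys are illustrative only, with the values copied from that run.

# Per-subset scores copied from the 2025-06-22T17-10-54 run above.
subset_scores = {
    "bias": 0.9717171717171718,
    "chatbot_abilities": 0.15036803364879076,
    "entity_extraction": 0.44976076555023925,
    "knowledge": 0.5271629778672032,
    "legal": 0.5999097702238364,
    "news_classification": 0.6843198338525441,
    "product_help": 0.8137327706156838,
    "qa_finance": 0.215,
    "rag_general": 0.34432296142118446,
    "reasoning": 0.748,
    "safety": 0.8398502923258477,
    "summarization": 0.20073718766676896,
    "translation": 0.39663181686150334,
}
# Unweighted mean over the 13 subsets.
print(sum(subset_scores.values()) / len(subset_scores))
# ~0.5339626, matching the stored top-level "score": 0.533962583211598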
results/bluebench/2025-06-22T19-25-42_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-22T23:25:38.430519Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-405b-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/meta-llama/llama-3-405b-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.9777777777777777,
180
+ "accuracy_ci_low": 0.9190234736102009,
181
+ "accuracy_ci_high": 1.0,
182
+ "score_name": "accuracy",
183
+ "score": 0.9777777777777777,
184
+ "score_ci_high": 1.0,
185
+ "score_ci_low": 0.9190234736102009,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.9888888888888889,
190
+ "accuracy_ci_low": 0.9444444444444444,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 0.9888888888888889,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 0.9444444444444444,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.9888888888888889,
200
+ "accuracy_ci_low": 0.9333333333333333,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 0.9888888888888889,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.9333333333333333,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.9888888888888889,
220
+ "accuracy_ci_low": 0.9444444444444444,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 0.9888888888888889,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 0.9444444444444444,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 1.0,
230
+ "accuracy_ci_low": 1.0,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 1.0,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 1.0,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.9888888888888889,
260
+ "accuracy_ci_low": 0.9444444444444444,
261
+ "accuracy_ci_high": 1.0,
262
+ "score_name": "accuracy",
263
+ "score": 0.9888888888888889,
264
+ "score_ci_high": 1.0,
265
+ "score_ci_low": 0.9444444444444444,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 1.0,
270
+ "accuracy_ci_low": 1.0,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 1.0,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 1.0,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8777777777777778,
280
+ "accuracy_ci_low": 0.8,
281
+ "accuracy_ci_high": 0.9333333333333333,
282
+ "score_name": "accuracy",
283
+ "score": 0.8777777777777778,
284
+ "score_ci_high": 0.9333333333333333,
285
+ "score_ci_low": 0.8,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.9828282828282828,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.12794268167860798,
296
+ "score": 0.12794268167860798,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.12794268167860798,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.6233062330623307,
307
+ "f1_Organization": 0.4037267080745342,
308
+ "f1_Location": 0.441860465116279,
309
+ "f1_macro": 0.4896311354177146,
310
+ "recall_macro": 0.44046090061205145,
311
+ "precision_macro": 0.5555845701415322,
312
+ "in_classes_support": 0.8122605363984674,
313
+ "f1_micro": 0.45272206303724927,
314
+ "recall_micro": 0.4514285714285714,
315
+ "precision_micro": 0.4540229885057471,
316
+ "score": 0.45272206303724927,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.3966894448307716,
319
+ "score_ci_high": 0.5096302708354611,
320
+ "f1_micro_ci_low": 0.3966894448307716,
321
+ "f1_micro_ci_high": 0.5096302708354611
322
+ },
323
+ "score": 0.45272206303724927,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.7605633802816901,
330
+ "accuracy_ci_low": 0.6619718309859155,
331
+ "accuracy_ci_high": 0.8450704225352113,
332
+ "score_name": "accuracy",
333
+ "score": 0.7605633802816901,
334
+ "score_ci_high": 0.8450704225352113,
335
+ "score_ci_low": 0.6619718309859155,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.49295774647887325,
340
+ "accuracy_ci_low": 0.38028169014084506,
341
+ "accuracy_ci_high": 0.6056338028169014,
342
+ "score_name": "accuracy",
343
+ "score": 0.49295774647887325,
344
+ "score_ci_high": 0.6056338028169014,
345
+ "score_ci_low": 0.38028169014084506,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.43661971830985913,
350
+ "accuracy_ci_low": 0.31179550598679995,
351
+ "accuracy_ci_high": 0.5633802816901409,
352
+ "score_name": "accuracy",
353
+ "score": 0.43661971830985913,
354
+ "score_ci_high": 0.5633802816901409,
355
+ "score_ci_low": 0.31179550598679995,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.7323943661971831,
360
+ "accuracy_ci_low": 0.6056338028169014,
361
+ "accuracy_ci_high": 0.8309859154929577,
362
+ "score_name": "accuracy",
363
+ "score": 0.7323943661971831,
364
+ "score_ci_high": 0.8309859154929577,
365
+ "score_ci_low": 0.6056338028169014,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.7464788732394366,
370
+ "accuracy_ci_low": 0.6338028169014085,
371
+ "accuracy_ci_high": 0.8320697555200512,
372
+ "score_name": "accuracy",
373
+ "score": 0.7464788732394366,
374
+ "score_ci_high": 0.8320697555200512,
375
+ "score_ci_low": 0.6338028169014085,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.5633802816901409,
380
+ "accuracy_ci_low": 0.4507042253521127,
381
+ "accuracy_ci_high": 0.6901408450704225,
382
+ "score_name": "accuracy",
383
+ "score": 0.5633802816901409,
384
+ "score_ci_high": 0.6901408450704225,
385
+ "score_ci_low": 0.4507042253521127,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5774647887323944,
390
+ "accuracy_ci_low": 0.4647887323943662,
391
+ "accuracy_ci_high": 0.6901408450704225,
392
+ "score_name": "accuracy",
393
+ "score": 0.5774647887323944,
394
+ "score_ci_high": 0.6901408450704225,
395
+ "score_ci_low": 0.4647887323943662,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.6901408450704225,
400
+ "accuracy_ci_low": 0.5633802816901409,
401
+ "accuracy_ci_high": 0.7887323943661971,
402
+ "score_name": "accuracy",
403
+ "score": 0.6901408450704225,
404
+ "score_ci_high": 0.7887323943661971,
405
+ "score_ci_low": 0.5633802816901409,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.7323943661971831,
410
+ "accuracy_ci_low": 0.6056338028169014,
411
+ "accuracy_ci_high": 0.8309859154929577,
412
+ "score_name": "accuracy",
413
+ "score": 0.7323943661971831,
414
+ "score_ci_high": 0.8309859154929577,
415
+ "score_ci_low": 0.6056338028169014,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.5633802816901409,
420
+ "accuracy_ci_low": 0.43661971830985913,
421
+ "accuracy_ci_high": 0.676056338028169,
422
+ "score_name": "accuracy",
423
+ "score": 0.5633802816901409,
424
+ "score_ci_high": 0.676056338028169,
425
+ "score_ci_low": 0.43661971830985913,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.6619718309859155,
430
+ "accuracy_ci_low": 0.5370780611967093,
431
+ "accuracy_ci_high": 0.7605633802816901,
432
+ "score_name": "accuracy",
433
+ "score": 0.6619718309859155,
434
+ "score_ci_high": 0.7605633802816901,
435
+ "score_ci_low": 0.5370780611967093,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.8028169014084507,
440
+ "accuracy_ci_low": 0.6981095742502579,
441
+ "accuracy_ci_high": 0.8873239436619719,
442
+ "score_name": "accuracy",
443
+ "score": 0.8028169014084507,
444
+ "score_ci_high": 0.8873239436619719,
445
+ "score_ci_low": 0.6981095742502579,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.6338028169014085,
450
+ "accuracy_ci_low": 0.5211267605633803,
451
+ "accuracy_ci_high": 0.7323943661971831,
452
+ "score_name": "accuracy",
453
+ "score": 0.6338028169014085,
454
+ "score_ci_high": 0.7323943661971831,
455
+ "score_ci_low": 0.5211267605633803,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.7323943661971831,
460
+ "accuracy_ci_low": 0.6197183098591549,
461
+ "accuracy_ci_high": 0.8309859154929577,
462
+ "score_name": "accuracy",
463
+ "score": 0.7323943661971831,
464
+ "score_ci_high": 0.8309859154929577,
465
+ "score_ci_low": 0.6197183098591549,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.6519114688128773,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.7710561497326203,
475
+ "f1_suggestive": 0.5882352941176471,
476
+ "f1_generic": 1.0,
477
+ "f1_fanciful": 0.8125,
478
+ "f1_descriptive": 0.7878787878787878,
479
+ "f1_arbitrary": 0.6666666666666666,
480
+ "f1_macro_ci_low": 0.6815687852174904,
481
+ "f1_macro_ci_high": 0.8487975830625909,
482
+ "score_name": "f1_micro",
483
+ "score": 0.7682926829268293,
484
+ "score_ci_high": 0.845238531816244,
485
+ "score_ci_low": 0.682034648754911,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.7411764705882353,
488
+ "accuracy_ci_low": 0.6470588235294118,
489
+ "accuracy_ci_high": 0.8235294117647058,
490
+ "f1_micro": 0.7682926829268293,
491
+ "f1_micro_ci_low": 0.682034648754911,
492
+ "f1_micro_ci_high": 0.845238531816244
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.7216529635538103,
496
+ "f1_no": 0.8398576512455516,
497
+ "f1_yes": 0.603448275862069,
498
+ "f1_macro_ci_low": 0.6520262757220233,
499
+ "f1_macro_ci_high": 0.7890273988307265,
500
+ "score_name": "f1_micro",
501
+ "score": 0.7707808564231738,
502
+ "score_ci_high": 0.8225396492391672,
503
+ "score_ci_low": 0.7085427135678392,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.765,
506
+ "accuracy_ci_low": 0.7,
507
+ "accuracy_ci_high": 0.815,
508
+ "f1_micro": 0.7707808564231738,
509
+ "f1_micro_ci_low": 0.7085427135678392,
510
+ "f1_micro_ci_high": 0.8225396492391672
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.30206082783981175,
514
+ "f1_conclusion": 0.13953488372093023,
515
+ "f1_issue": 0.13333333333333333,
516
+ "f1_decree": 0.3783783783783784,
517
+ "f1_rule": 0.4482758620689655,
518
+ "f1_analysis": 0.5647058823529412,
519
+ "f1_facts": 0.18604651162790697,
520
+ "f1_procedural history": 0.2641509433962264,
521
+ "f1_macro_ci_low": 0.2513517165690192,
522
+ "f1_macro_ci_high": 0.3775685968384507,
523
+ "score_name": "f1_micro",
524
+ "score": 0.33516483516483514,
525
+ "score_ci_high": 0.4075146671820192,
526
+ "score_ci_low": 0.272347535123403,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.305,
529
+ "accuracy_ci_low": 0.245,
530
+ "accuracy_ci_high": 0.375,
531
+ "f1_micro": 0.33516483516483514,
532
+ "f1_micro_ci_low": 0.272347535123403,
533
+ "f1_micro_ci_high": 0.4075146671820192
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5047062641999351,
537
+ "f1_yes": 0.5991561181434599,
538
+ "f1_no": 0.41025641025641024,
539
+ "f1_macro_ci_low": 0.4417647306569189,
540
+ "f1_macro_ci_high": 0.5795532826410552,
541
+ "score_name": "f1_micro",
542
+ "score": 0.5241730279898219,
543
+ "score_ci_high": 0.5950630270095045,
544
+ "score_ci_low": 0.459552667145485,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.515,
547
+ "accuracy_ci_low": 0.45,
548
+ "accuracy_ci_high": 0.585,
549
+ "f1_micro": 0.5241730279898219,
550
+ "f1_micro_ci_low": 0.459552667145485,
551
+ "f1_micro_ci_high": 0.5950630270095045
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.9284195605953225,
555
+ "f1_yes": 0.9156626506024096,
556
+ "f1_no": 0.9411764705882353,
557
+ "f1_macro_ci_low": 0.8686025850356507,
558
+ "f1_macro_ci_high": 0.9706771979676585,
559
+ "score_name": "f1_micro",
560
+ "score": 0.9285714285714286,
561
+ "score_ci_high": 0.9704142011834319,
562
+ "score_ci_low": 0.8690476190476191,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.9176470588235294,
565
+ "accuracy_ci_low": 0.8588235294117647,
566
+ "accuracy_ci_high": 0.9647058823529412,
567
+ "f1_micro": 0.9285714285714286,
568
+ "f1_micro_ci_low": 0.8690476190476191,
569
+ "f1_micro_ci_high": 0.9704142011834319
570
+ },
571
+ "score": 0.6653965662152177,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6486110220528901,
578
+ "f1_cars": 0.8958333333333334,
579
+ "f1_windows x": 0.09090909090909091,
580
+ "f1_computer graphics": 0.4793388429752066,
581
+ "f1_atheism": 0.5245901639344263,
582
+ "f1_religion": 0.044444444444444446,
583
+ "f1_medicine": 0.813953488372093,
584
+ "f1_christianity": 0.8727272727272727,
585
+ "f1_for sale": 0.7777777777777778,
586
+ "f1_microsoft windows": 0.7708333333333334,
587
+ "f1_middle east": 0.5671641791044776,
588
+ "f1_motorcycles": 0.7692307692307693,
589
+ "f1_pc hardware": 0.6046511627906976,
590
+ "f1_mac hardware": 0.7924528301886793,
591
+ "f1_electronics": 0.7291666666666666,
592
+ "f1_guns": 0.410958904109589,
593
+ "f1_space": 0.8846153846153846,
594
+ "f1_cryptography": 0.72,
595
+ "f1_baseball": 0.9391304347826087,
596
+ "f1_hockey": 0.9545454545454546,
597
+ "f1_politics": 0.32989690721649484,
598
+ "f1_macro_ci_low": 0.6245539233827745,
599
+ "f1_macro_ci_high": 0.6735741716064018,
600
+ "score_name": "f1_micro",
601
+ "score": 0.6871961102106969,
602
+ "score_ci_high": 0.7158552998269507,
603
+ "score_ci_low": 0.6568978311145116,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.636,
606
+ "accuracy_ci_low": 0.6068817918985229,
607
+ "accuracy_ci_high": 0.666,
608
+ "f1_micro": 0.6871961102106969,
609
+ "f1_micro_ci_low": 0.6568978311145116,
610
+ "f1_micro_ci_high": 0.7158552998269507
611
+ },
612
+ "score": 0.6871961102106969,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7442070442021397,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9329529243937232,
620
+ "f1_checking or savings account": 0.8269230769230769,
621
+ "f1_debt collection": 0.5324675324675324,
622
+ "f1_credit card or prepaid card": 0.7933884297520661,
623
+ "f1_mortgage": 0.8405797101449275,
624
+ "f1_student loan": 0.896551724137931,
625
+ "f1_money transfer or virtual currency or money service": 0.875,
626
+ "f1_vehicle loan or lease": 0.6666666666666666,
627
+ "f1_payday loan or title loan or personal loan": 0.3333333333333333,
628
+ "f1_macro_ci_low": 0.6958471577865166,
629
+ "f1_macro_ci_high": 0.8045739980351424,
630
+ "score_name": "f1_micro",
631
+ "score": 0.8742393509127789,
632
+ "score_ci_high": 0.8929169783856484,
633
+ "score_ci_low": 0.852776904397444,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.862,
636
+ "accuracy_ci_low": 0.839,
637
+ "accuracy_ci_high": 0.882,
638
+ "f1_micro": 0.8742393509127789,
639
+ "f1_micro_ci_low": 0.852776904397444,
640
+ "f1_micro_ci_high": 0.8929169783856484
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.747877693509548,
644
+ "f1_mortgages and loans": 0.8248587570621468,
645
+ "f1_credit card": 0.7912087912087912,
646
+ "f1_debt collection": 0.683982683982684,
647
+ "f1_credit reporting": 0.7205882352941176,
648
+ "f1_retail banking": 0.71875,
649
+ "f1_macro_ci_low": 0.7065255253955101,
650
+ "f1_macro_ci_high": 0.7849687727329339,
651
+ "score_name": "f1_micro",
652
+ "score": 0.7434343434343434,
653
+ "score_ci_high": 0.7787863123983747,
654
+ "score_ci_low": 0.7018885821645714,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.736,
657
+ "accuracy_ci_low": 0.694,
658
+ "accuracy_ci_high": 0.772,
659
+ "f1_micro": 0.7434343434343434,
660
+ "f1_micro_ci_low": 0.7018885821645714,
661
+ "f1_micro_ci_high": 0.7787863123983747
662
+ },
663
+ "score": 0.8088368471735612,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "program_accuracy": 0.212,
671
+ "score": 0.212,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.197,
674
+ "program_accuracy_ci_low": 0.185,
675
+ "program_accuracy_ci_high": 0.236,
676
+ "score_ci_low": 0.185,
677
+ "score_ci_high": 0.236,
678
+ "execution_accuracy_ci_low": 0.17257528462439894,
679
+ "execution_accuracy_ci_high": 0.222
680
+ },
681
+ "score": 0.212,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.3245919340419664,
688
+ "recall": 0.5553697849897254,
689
+ "f1": 0.34463972021312694,
690
+ "precision_ci_low": 0.30294812146038264,
691
+ "precision_ci_high": 0.34672435235830834,
692
+ "recall_ci_low": 0.5393969536805812,
693
+ "recall_ci_high": 0.5708437647062446,
694
+ "f1_ci_low": 0.32625746187015825,
695
+ "f1_ci_high": 0.3632802184669128,
696
+ "score_name": "f1",
697
+ "score": 0.34463972021312694,
698
+ "score_ci_high": 0.3632802184669128,
699
+ "score_ci_low": 0.32625746187015825,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5976956130564213,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.678848825097084,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5559072712808848,
704
+ "faithfullness_f1_token_overlap": 0.2867915721228578,
705
+ "faithfullness_recall_token_overlap": 0.21812174721257188,
706
+ "faithfullness_precision_token_overlap": 0.546883220093946,
707
+ "correctness_f1_token_overlap": 0.34463972021312694,
708
+ "correctness_recall_token_overlap": 0.5553697849897254,
709
+ "correctness_precision_token_overlap": 0.3245919340419664
710
+ },
711
+ "score": 0.34463972021312694,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.57,
718
+ "accuracy_ci_low": 0.537,
719
+ "accuracy_ci_high": 0.598,
720
+ "score_name": "accuracy",
721
+ "score": 0.57,
722
+ "score_ci_high": 0.598,
723
+ "score_ci_low": 0.537,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.914,
728
+ "accuracy_ci_low": 0.886,
729
+ "accuracy_ci_high": 0.936,
730
+ "score_name": "accuracy",
731
+ "score": 0.914,
732
+ "score_ci_high": 0.936,
733
+ "score_ci_low": 0.886,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.742,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 1.0,
743
+ "severity_high": 0.0,
744
+ "severity_medium": 0.0,
745
+ "severity_low": 99.0,
746
+ "category_harmful_info": 0.8239067156488699,
747
+ "category_deception": 0.8416350447017737,
748
+ "category_explicit_content": 0.8482034478073168,
749
+ "category_violence": 0.8124119411385249,
750
+ "category_discrimination": 0.7214018125382681,
751
+ "category_substance_abuse": 0.8382871117856767,
752
+ "category_pii": 0.838832758643009,
753
+ "safety": 0.8184977591824201,
754
+ "safety_ci_low": 0.799251726518489,
755
+ "safety_ci_high": 0.83564691259826,
756
+ "score_name": "safety",
757
+ "score": 0.8184977591824201,
758
+ "score_ci_high": 0.83564691259826,
759
+ "score_ci_low": 0.799251726518489,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.8184977591824201,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rouge2": 0.21878117525659171,
770
+ "rouge1": 0.42882781698282896,
771
+ "rougeLsum": 0.36923160950814665,
772
+ "rougeL": 0.3012206645101065,
773
+ "score": 0.3012206645101065,
774
+ "score_name": "rougeL",
775
+ "rouge2_ci_low": 0.2116465564648193,
776
+ "rouge2_ci_high": 0.22611002289776966,
777
+ "rouge1_ci_low": 0.4181854383996789,
778
+ "rouge1_ci_high": 0.4380350294107447,
779
+ "rougeLsum_ci_low": 0.3593867077962995,
780
+ "rougeLsum_ci_high": 0.377981997024725,
781
+ "rougeL_ci_low": 0.2938004078438361,
782
+ "rougeL_ci_high": 0.30862765298917266,
783
+ "score_ci_low": 0.2938004078438361,
784
+ "score_ci_high": 0.30862765298917266
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rouge2": 0.01897199585186187,
789
+ "rouge1": 0.12886717133874953,
790
+ "rougeLsum": 0.10635879624885117,
791
+ "rougeL": 0.09266060514104649,
792
+ "score": 0.09266060514104649,
793
+ "score_name": "rougeL",
794
+ "rouge2_ci_low": 0.0169285768359538,
795
+ "rouge2_ci_high": 0.020965979315931374,
796
+ "rouge1_ci_low": 0.1230681468561615,
797
+ "rouge1_ci_high": 0.13427737204069826,
798
+ "rougeLsum_ci_low": 0.10189392757917064,
799
+ "rougeLsum_ci_high": 0.11089749692460946,
800
+ "rougeL_ci_low": 0.08868339152088286,
801
+ "rougeL_ci_high": 0.0963878314649574,
802
+ "score_ci_low": 0.08868339152088286,
803
+ "score_ci_high": 0.0963878314649574
804
+ },
805
+ "score": 0.1969406348255765,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1296,
814
+ 846,
815
+ 593,
816
+ 421
817
+ ],
818
+ "totals": [
819
+ 1768,
820
+ 1702,
821
+ 1636,
822
+ 1570
823
+ ],
824
+ "precisions": [
825
+ 0.7330316742081447,
826
+ 0.4970622796709753,
827
+ 0.36246943765281175,
828
+ 0.2681528662420382
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 1768,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.4338072904737007,
834
+ "score": 0.4338072904737007,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.3840033634108722,
837
+ "score_ci_high": 0.4745374005130659,
838
+ "sacrebleu_ci_low": 0.3840033634108722,
839
+ "sacrebleu_ci_high": 0.4745374005130659
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1324,
845
+ 883,
846
+ 626,
847
+ 442
848
+ ],
849
+ "totals": [
850
+ 1796,
851
+ 1730,
852
+ 1664,
853
+ 1598
854
+ ],
855
+ "precisions": [
856
+ 0.7371937639198218,
857
+ 0.5104046242774567,
858
+ 0.3762019230769231,
859
+ 0.2765957446808511
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 1796,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.44482653871620387,
865
+ "score": 0.44482653871620387,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.4081818656109621,
868
+ "score_ci_high": 0.4860589069348345,
869
+ "sacrebleu_ci_low": 0.4081818656109621,
870
+ "sacrebleu_ci_high": 0.4860589069348345
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 898,
876
+ 497,
877
+ 303,
878
+ 187
879
+ ],
880
+ "totals": [
881
+ 1785,
882
+ 1719,
883
+ 1653,
884
+ 1587
885
+ ],
886
+ "precisions": [
887
+ 0.5030812324929972,
888
+ 0.28912158231529955,
889
+ 0.18330308529945555,
890
+ 0.1178323881537492
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 1785,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.23674906403928667,
896
+ "score": 0.23674906403928667,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.18135183356554332,
899
+ "score_ci_high": 0.28960778605368953,
900
+ "sacrebleu_ci_low": 0.18135183356554332,
901
+ "sacrebleu_ci_high": 0.28960778605368953
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 1240,
907
+ 761,
908
+ 513,
909
+ 362
910
+ ],
911
+ "totals": [
912
+ 1823,
913
+ 1757,
914
+ 1691,
915
+ 1625
916
+ ],
917
+ "precisions": [
918
+ 0.6801974766867801,
919
+ 0.4331246442800228,
920
+ 0.3033707865168539,
921
+ 0.22276923076923075
922
+ ],
923
+ "bp": 0.9934390613382812,
924
+ "sys_len": 1823,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.3731732035283488,
927
+ "score": 0.3731732035283488,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.3128221603250469,
930
+ "score_ci_high": 0.4084245255594999,
931
+ "sacrebleu_ci_low": 0.3128221603250469,
932
+ "sacrebleu_ci_high": 0.4084245255594999
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1532,
938
+ 1143,
939
+ 898,
940
+ 722
941
+ ],
942
+ "totals": [
943
+ 2028,
944
+ 1962,
945
+ 1896,
946
+ 1830
947
+ ],
948
+ "precisions": [
949
+ 0.7554240631163708,
950
+ 0.5825688073394495,
951
+ 0.4736286919831224,
952
+ 0.3945355191256831
953
+ ],
954
+ "bp": 0.9804693769806172,
955
+ "sys_len": 2028,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.5250486815503393,
958
+ "score": 0.5250486815503393,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.4776077157588871,
961
+ "score_ci_high": 0.57891652183332,
962
+ "sacrebleu_ci_low": 0.4776077157588871,
963
+ "sacrebleu_ci_high": 0.57891652183332
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 1351,
969
+ 728,
970
+ 447,
971
+ 279
972
+ ],
973
+ "totals": [
974
+ 2735,
975
+ 2669,
976
+ 2603,
977
+ 2537
978
+ ],
979
+ "precisions": [
980
+ 0.4939670932358318,
981
+ 0.27276133383289625,
982
+ 0.17172493276988093,
983
+ 0.10997240835632636
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 2735,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.22459468717502307,
989
+ "score": 0.22459468717502307,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.1881220082315981,
992
+ "score_ci_high": 0.25342316010757016,
993
+ "sacrebleu_ci_low": 0.1881220082315981,
994
+ "sacrebleu_ci_high": 0.25342316010757016
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1429,
1000
+ 1021,
1001
+ 760,
1002
+ 570
1003
+ ],
1004
+ "totals": [
1005
+ 1901,
1006
+ 1835,
1007
+ 1769,
1008
+ 1703
1009
+ ],
1010
+ "precisions": [
1011
+ 0.751709626512362,
1012
+ 0.5564032697547684,
1013
+ 0.4296212549462973,
1014
+ 0.33470346447445687
1015
+ ],
1016
+ "bp": 0.9921404650355355,
1017
+ "sys_len": 1901,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.49132583520106116,
1020
+ "score": 0.49132583520106116,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.4478372466403723,
1023
+ "score_ci_high": 0.5303109141597054,
1024
+ "sacrebleu_ci_low": 0.4478372466403723,
1025
+ "sacrebleu_ci_high": 0.5303109141597054
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 1387,
1031
+ 979,
1032
+ 711,
1033
+ 518
1034
+ ],
1035
+ "totals": [
1036
+ 1967,
1037
+ 1901,
1038
+ 1835,
1039
+ 1769
1040
+ ],
1041
+ "precisions": [
1042
+ 0.7051347229283172,
1043
+ 0.5149921094160969,
1044
+ 0.38746594005449597,
1045
+ 0.2928208027133974
1046
+ ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 1967,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.450533442657802,
1051
+ "score": 0.450533442657802,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.41119874078427415,
1054
+ "score_ci_high": 0.4996205132749857,
1055
+ "sacrebleu_ci_low": 0.41119874078427415,
1056
+ "sacrebleu_ci_high": 0.4996205132749857
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1300,
1062
+ 749,
1063
+ 458,
1064
+ 283
1065
+ ],
1066
+ "totals": [
1067
+ 2033,
1068
+ 1967,
1069
+ 1901,
1070
+ 1835
1071
+ ],
1072
+ "precisions": [
1073
+ 0.6394490900147565,
1074
+ 0.3807829181494662,
1075
+ 0.24092582851130984,
1076
+ 0.1542234332425068
1077
+ ],
1078
+ "bp": 0.9685332604439724,
1079
+ "sys_len": 2033,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.29870591960318976,
1082
+ "score": 0.29870591960318976,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.2686987876011016,
1085
+ "score_ci_high": 0.3321355366475583,
1086
+ "sacrebleu_ci_low": 0.2686987876011016,
1087
+ "sacrebleu_ci_high": 0.3321355366475583
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1340,
1093
+ 946,
1094
+ 692,
1095
+ 509
1096
+ ],
1097
+ "totals": [
1098
+ 1799,
1099
+ 1733,
1100
+ 1667,
1101
+ 1601
1102
+ ],
1103
+ "precisions": [
1104
+ 0.7448582545858811,
1105
+ 0.5458742065781881,
1106
+ 0.4151169766046791,
1107
+ 0.31792629606495937
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 1799,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.4812999188340168,
1113
+ "score": 0.4812999188340168,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.43585100595842746,
1116
+ "score_ci_high": 0.5287499225865158,
1117
+ "sacrebleu_ci_low": 0.43585100595842746,
1118
+ "sacrebleu_ci_high": 0.5287499225865158
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 1139,
1124
+ 642,
1125
+ 413,
1126
+ 271
1127
+ ],
1128
+ "totals": [
1129
+ 1798,
1130
+ 1732,
1131
+ 1666,
1132
+ 1600
1133
+ ],
1134
+ "precisions": [
1135
+ 0.6334816462736373,
1136
+ 0.37066974595842955,
1137
+ 0.24789915966386555,
1138
+ 0.169375
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 1798,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.3151094190111042,
1144
+ "score": 0.3151094190111042,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.27523820425365936,
1147
+ "score_ci_high": 0.37080224288423497,
1148
+ "sacrebleu_ci_low": 0.27523820425365936,
1149
+ "sacrebleu_ci_high": 0.37080224288423497
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 1096,
1155
+ 603,
1156
+ 369,
1157
+ 231
1158
+ ],
1159
+ "totals": [
1160
+ 1757,
1161
+ 1691,
1162
+ 1625,
1163
+ 1559
1164
+ ],
1165
+ "precisions": [
1166
+ 0.6237905520774046,
1167
+ 0.3565937315198108,
1168
+ 0.2270769230769231,
1169
+ 0.14817190506735087
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 1757,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.29412899612915067,
1175
+ "score": 0.29412899612915067,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.2630441728848743,
1178
+ "score_ci_high": 0.3478998363728344,
1179
+ "sacrebleu_ci_low": 0.2630441728848743,
1180
+ "sacrebleu_ci_high": 0.3478998363728344
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1366,
1186
+ 978,
1187
+ 742,
1188
+ 559
1189
+ ],
1190
+ "totals": [
1191
+ 1794,
1192
+ 1728,
1193
+ 1662,
1194
+ 1596
1195
+ ],
1196
+ "precisions": [
1197
+ 0.7614269788182831,
1198
+ 0.5659722222222222,
1199
+ 0.4464500601684717,
1200
+ 0.35025062656641603
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 1794,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.5094995397125037,
1206
+ "score": 0.5094995397125037,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.45392787953470803,
1209
+ "score_ci_high": 0.5666406845959309,
1210
+ "sacrebleu_ci_low": 0.45392787953470803,
1211
+ "sacrebleu_ci_high": 0.5666406845959309
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1357,
1217
+ 953,
1218
+ 689,
1219
+ 517
1220
+ ],
1221
+ "totals": [
1222
+ 1780,
1223
+ 1714,
1224
+ 1648,
1225
+ 1582
1226
+ ],
1227
+ "precisions": [
1228
+ 0.7623595505617977,
1229
+ 0.5560093348891482,
1230
+ 0.4180825242718446,
1231
+ 0.3268015170670038
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 1780,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.49056549932677673,
1237
+ "score": 0.49056549932677673,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.4449725178153994,
1240
+ "score_ci_high": 0.5499905859714643,
1241
+ "sacrebleu_ci_low": 0.4449725178153994,
1242
+ "sacrebleu_ci_high": 0.5499905859714643
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1196,
1248
+ 684,
1249
+ 436,
1250
+ 274
1251
+ ],
1252
+ "totals": [
1253
+ 1839,
1254
+ 1773,
1255
+ 1707,
1256
+ 1641
1257
+ ],
1258
+ "precisions": [
1259
+ 0.6503534529635671,
1260
+ 0.38578680203045684,
1261
+ 0.255418863503222,
1262
+ 0.16697135892748324
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 1839,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.3216236243185879,
1268
+ "score": 0.3216236243185879,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.29765430133685095,
1271
+ "score_ci_high": 0.36660487766328476,
1272
+ "sacrebleu_ci_low": 0.29765430133685095,
1273
+ "sacrebleu_ci_high": 0.36660487766328476
1274
+ },
1275
+ "score": 0.39273277735180634,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.5448957624253402,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
results/bluebench/{2025-06-19T15-57-45_evaluation_results.json → 2025-06-23T02-53-05_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-19T19:57:39.981261Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -178,61 +178,61 @@
178
  "safety_bbq_age": {
179
  "accuracy": 0.6444444444444445,
180
  "accuracy_ci_low": 0.5444444444444444,
181
- "accuracy_ci_high": 0.7444444444444445,
182
  "score_name": "accuracy",
183
  "score": 0.6444444444444445,
184
- "score_ci_high": 0.7444444444444445,
185
  "score_ci_low": 0.5444444444444444,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.7222222222222222,
190
- "accuracy_ci_low": 0.6111111111111112,
191
  "accuracy_ci_high": 0.8,
192
  "score_name": "accuracy",
193
- "score": 0.7222222222222222,
194
  "score_ci_high": 0.8,
195
- "score_ci_low": 0.6111111111111112,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
  "accuracy": 0.9111111111111111,
200
- "accuracy_ci_low": 0.8333333333333334,
201
  "accuracy_ci_high": 0.9555555555555556,
202
  "score_name": "accuracy",
203
  "score": 0.9111111111111111,
204
  "score_ci_high": 0.9555555555555556,
205
- "score_ci_low": 0.8333333333333334,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.7111111111111111,
210
- "accuracy_ci_low": 0.6111111111111112,
211
- "accuracy_ci_high": 0.8,
212
  "score_name": "accuracy",
213
- "score": 0.7111111111111111,
214
- "score_ci_high": 0.8,
215
- "score_ci_low": 0.6111111111111112,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.8333333333333334,
220
- "accuracy_ci_low": 0.7444444444444445,
221
- "accuracy_ci_high": 0.9,
222
  "score_name": "accuracy",
223
- "score": 0.8333333333333334,
224
- "score_ci_high": 0.9,
225
- "score_ci_low": 0.7444444444444445,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9777777777777777,
230
- "accuracy_ci_low": 0.9333333333333333,
231
- "accuracy_ci_high": 1.0,
232
  "score_name": "accuracy",
233
- "score": 0.9777777777777777,
234
- "score_ci_high": 1.0,
235
- "score_ci_low": 0.9333333333333333,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
@@ -256,553 +256,553 @@
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.8111111111111111,
260
- "accuracy_ci_low": 0.7116197011994875,
261
- "accuracy_ci_high": 0.8888888888888888,
262
  "score_name": "accuracy",
263
- "score": 0.8111111111111111,
264
- "score_ci_high": 0.8888888888888888,
265
- "score_ci_low": 0.7116197011994875,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.6888888888888889,
270
- "accuracy_ci_low": 0.5777777777777777,
271
- "accuracy_ci_high": 0.7777777777777778,
272
  "score_name": "accuracy",
273
- "score": 0.6888888888888889,
274
- "score_ci_high": 0.7777777777777778,
275
- "score_ci_low": 0.5777777777777777,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8,
280
- "accuracy_ci_low": 0.7111111111111111,
281
- "accuracy_ci_high": 0.8666666666666667,
282
  "score_name": "accuracy",
283
- "score": 0.8,
284
- "score_ci_high": 0.8666666666666667,
285
- "score_ci_low": 0.7111111111111111,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.8161616161616162,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.592375366568915,
307
- "f1_Location": 0.3668122270742358,
308
- "f1_Organization": 0.45367412140575075,
309
- "f1_macro": 0.4709539050163005,
310
- "recall_macro": 0.3969630056026483,
311
- "precision_macro": 0.5946970285442043,
312
- "in_classes_support": 0.7649572649572649,
313
- "f1_micro": 0.4310171198388721,
314
- "recall_micro": 0.4076190476190476,
315
- "precision_micro": 0.45726495726495725,
316
- "score": 0.4310171198388721,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.36016345918404075,
319
- "score_ci_high": 0.48021577272630167,
320
- "f1_micro_ci_low": 0.36016345918404075,
321
- "f1_micro_ci_high": 0.48021577272630167
322
  },
323
- "score": 0.4310171198388721,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.5352112676056338,
330
  "accuracy_ci_low": 0.43661971830985913,
331
- "accuracy_ci_high": 0.647887323943662,
332
  "score_name": "accuracy",
333
- "score": 0.5352112676056338,
334
- "score_ci_high": 0.647887323943662,
335
  "score_ci_low": 0.43661971830985913,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.22535211267605634,
340
- "accuracy_ci_low": 0.14084507042253522,
341
- "accuracy_ci_high": 0.323943661971831,
342
  "score_name": "accuracy",
343
- "score": 0.22535211267605634,
344
- "score_ci_high": 0.323943661971831,
345
- "score_ci_low": 0.14084507042253522,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.18309859154929578,
350
- "accuracy_ci_low": 0.10639771966263252,
351
- "accuracy_ci_high": 0.29577464788732394,
352
  "score_name": "accuracy",
353
- "score": 0.18309859154929578,
354
- "score_ci_high": 0.29577464788732394,
355
- "score_ci_low": 0.10639771966263252,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.323943661971831,
360
  "accuracy_ci_low": 0.2112676056338028,
361
- "accuracy_ci_high": 0.43661971830985913,
362
  "score_name": "accuracy",
363
- "score": 0.323943661971831,
364
- "score_ci_high": 0.43661971830985913,
365
  "score_ci_low": 0.2112676056338028,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.4507042253521127,
370
- "accuracy_ci_low": 0.323943661971831,
371
- "accuracy_ci_high": 0.5664724235461314,
372
  "score_name": "accuracy",
373
- "score": 0.4507042253521127,
374
- "score_ci_high": 0.5664724235461314,
375
- "score_ci_low": 0.323943661971831,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.36619718309859156,
380
- "accuracy_ci_low": 0.2535211267605634,
381
- "accuracy_ci_high": 0.4788732394366197,
382
  "score_name": "accuracy",
383
- "score": 0.36619718309859156,
384
- "score_ci_high": 0.4788732394366197,
385
- "score_ci_low": 0.2535211267605634,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.3380281690140845,
390
- "accuracy_ci_low": 0.22535211267605634,
391
- "accuracy_ci_high": 0.4647887323943662,
392
  "score_name": "accuracy",
393
- "score": 0.3380281690140845,
394
- "score_ci_high": 0.4647887323943662,
395
- "score_ci_low": 0.22535211267605634,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.4084507042253521,
400
- "accuracy_ci_low": 0.29577464788732394,
401
- "accuracy_ci_high": 0.5211267605633803,
402
  "score_name": "accuracy",
403
- "score": 0.4084507042253521,
404
- "score_ci_high": 0.5211267605633803,
405
- "score_ci_low": 0.29577464788732394,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
  "accuracy": 0.3380281690140845,
410
  "accuracy_ci_low": 0.23943661971830985,
411
- "accuracy_ci_high": 0.4647887323943662,
412
  "score_name": "accuracy",
413
  "score": 0.3380281690140845,
414
- "score_ci_high": 0.4647887323943662,
415
  "score_ci_low": 0.23943661971830985,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.11267605633802817,
420
- "accuracy_ci_low": 0.056338028169014086,
421
- "accuracy_ci_high": 0.19757759490217996,
422
  "score_name": "accuracy",
423
- "score": 0.11267605633802817,
424
- "score_ci_high": 0.19757759490217996,
425
- "score_ci_low": 0.056338028169014086,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.22535211267605634,
430
- "accuracy_ci_low": 0.14084507042253522,
431
- "accuracy_ci_high": 0.323943661971831,
432
  "score_name": "accuracy",
433
- "score": 0.22535211267605634,
434
- "score_ci_high": 0.323943661971831,
435
- "score_ci_low": 0.14084507042253522,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.4225352112676056,
440
- "accuracy_ci_low": 0.30985915492957744,
441
- "accuracy_ci_high": 0.5352112676056338,
442
  "score_name": "accuracy",
443
- "score": 0.4225352112676056,
444
- "score_ci_high": 0.5352112676056338,
445
- "score_ci_low": 0.30985915492957744,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.22535211267605634,
450
- "accuracy_ci_low": 0.14084507042253522,
451
- "accuracy_ci_high": 0.3380281690140845,
452
  "score_name": "accuracy",
453
- "score": 0.22535211267605634,
454
- "score_ci_high": 0.3380281690140845,
455
- "score_ci_low": 0.14084507042253522,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.5211267605633803,
460
- "accuracy_ci_low": 0.4084507042253521,
461
- "accuracy_ci_high": 0.6338028169014085,
462
  "score_name": "accuracy",
463
- "score": 0.5211267605633803,
464
- "score_ci_high": 0.6338028169014085,
465
- "score_ci_low": 0.4084507042253521,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.33400402414486924,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.2827795486010496,
475
- "f1_suggestive": 0.08333333333333333,
476
- "f1_descriptive": 0.4444444444444444,
477
  "f1_generic": 0.11764705882352941,
478
- "f1_fanciful": 0.4827586206896552,
479
- "f1_arbitrary": 0.2857142857142857,
480
- "f1_macro_ci_low": 0.20381678012471904,
481
- "f1_macro_ci_high": 0.38601597944875415,
482
  "score_name": "f1_micro",
483
- "score": 0.3253012048192771,
484
- "score_ci_high": 0.42168674698795183,
485
- "score_ci_low": 0.21686746987951808,
486
  "num_of_instances": 85,
487
- "accuracy": 0.3176470588235294,
488
- "accuracy_ci_low": 0.21176470588235294,
489
- "accuracy_ci_high": 0.4117647058823529,
490
- "f1_micro": 0.3253012048192771,
491
- "f1_micro_ci_low": 0.21686746987951808,
492
- "f1_micro_ci_high": 0.42168674698795183
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.5842293906810035,
496
- "f1_no": 0.8129032258064516,
497
- "f1_yes": 0.35555555555555557,
498
- "f1_macro_ci_low": 0.5123650296064088,
499
- "f1_macro_ci_high": 0.6612083568605307,
500
  "score_name": "f1_micro",
501
- "score": 0.71,
502
- "score_ci_high": 0.765,
503
- "score_ci_low": 0.64,
504
  "num_of_instances": 200,
505
- "accuracy": 0.71,
506
- "accuracy_ci_low": 0.64,
507
- "accuracy_ci_high": 0.765,
508
- "f1_micro": 0.71,
509
- "f1_micro_ci_low": 0.64,
510
- "f1_micro_ci_high": 0.765
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.23684055980437102,
514
- "f1_conclusion": 0.12,
515
- "f1_issue": 0.2682926829268293,
516
- "f1_decree": 0.17647058823529413,
517
- "f1_rule": 0.4155844155844156,
518
- "f1_analysis": 0.2608695652173913,
519
- "f1_facts": 0.16666666666666666,
520
- "f1_procedural history": 0.25,
521
- "f1_macro_ci_low": 0.18399933651413464,
522
- "f1_macro_ci_high": 0.3012128675188143,
523
  "score_name": "f1_micro",
524
- "score": 0.2570694087403599,
525
- "score_ci_high": 0.31910866448170155,
526
- "score_ci_low": 0.19563743957580057,
527
  "num_of_instances": 200,
528
- "accuracy": 0.25,
529
- "accuracy_ci_low": 0.19,
530
- "accuracy_ci_high": 0.31,
531
- "f1_micro": 0.2570694087403599,
532
- "f1_micro_ci_low": 0.19563743957580057,
533
- "f1_micro_ci_high": 0.31910866448170155
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.45179063360881544,
537
- "f1_yes": 0.5702479338842975,
538
- "f1_no": 0.3333333333333333,
539
- "f1_macro_ci_low": 0.3881370275424478,
540
- "f1_macro_ci_high": 0.5208583506164292,
541
  "score_name": "f1_micro",
542
- "score": 0.47738693467336685,
543
- "score_ci_high": 0.5454545454545454,
544
- "score_ci_low": 0.41102756892230574,
545
  "num_of_instances": 200,
546
- "accuracy": 0.475,
547
- "accuracy_ci_low": 0.4062357598667403,
548
- "accuracy_ci_high": 0.54,
549
- "f1_micro": 0.47738693467336685,
550
- "f1_micro_ci_low": 0.41102756892230574,
551
- "f1_micro_ci_high": 0.5454545454545454
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.7797888386123679,
555
- "f1_yes": 0.7647058823529411,
556
- "f1_no": 0.7948717948717948,
557
- "f1_macro_ci_low": 0.686770027516329,
558
- "f1_macro_ci_high": 0.847201812396528,
559
  "score_name": "f1_micro",
560
- "score": 0.7808219178082192,
561
- "score_ci_high": 0.847682119205298,
562
- "score_ci_low": 0.6846573729523644,
563
  "num_of_instances": 85,
564
- "accuracy": 0.6705882352941176,
565
- "accuracy_ci_low": 0.5647058823529412,
566
- "accuracy_ci_high": 0.7529411764705882,
567
- "f1_micro": 0.7808219178082192,
568
- "f1_micro_ci_low": 0.6846573729523644,
569
- "f1_micro_ci_high": 0.847682119205298
570
  },
571
- "score": 0.5101158932082446,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.4855115011669257,
578
- "f1_cars": 0.8,
579
- "f1_windows x": 0.05555555555555555,
580
- "f1_atheism": 0.17777777777777778,
581
- "f1_cryptography": 0.4444444444444444,
582
- "f1_religion": 0.23404255319148937,
583
- "f1_medicine": 0.8,
584
- "f1_christianity": 0.36619718309859156,
585
- "f1_computer graphics": 0.3652173913043478,
586
- "f1_microsoft windows": 0.19047619047619047,
587
- "f1_middle east": 0.4675324675324675,
588
- "f1_motorcycles": 0.693069306930693,
589
- "f1_politics": 0.313953488372093,
590
- "f1_pc hardware": 0.4292682926829268,
591
- "f1_mac hardware": 0.2972972972972973,
592
- "f1_for sale": 0.7058823529411765,
593
- "f1_guns": 0.34375,
594
- "f1_space": 0.6888888888888889,
595
- "f1_baseball": 0.8909090909090909,
596
- "f1_hockey": 0.8709677419354839,
597
- "f1_electronics": 0.575,
598
- "f1_macro_ci_low": 0.4606519053067645,
599
- "f1_macro_ci_high": 0.5114904866418184,
600
  "score_name": "f1_micro",
601
- "score": 0.5034666666666666,
602
- "score_ci_high": 0.5288163691152058,
603
- "score_ci_low": 0.4713054725252697,
604
  "num_of_instances": 1000,
605
- "accuracy": 0.472,
606
- "accuracy_ci_low": 0.44038730175462776,
607
- "accuracy_ci_high": 0.497,
608
- "f1_micro": 0.5034666666666666,
609
- "f1_micro_ci_low": 0.4713054725252697,
610
- "f1_micro_ci_high": 0.5288163691152058
611
  },
612
- "score": 0.5034666666666666,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.607364388794758,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9122807017543859,
620
- "f1_credit card or prepaid card": 0.6666666666666666,
621
- "f1_debt collection": 0.6075949367088608,
622
- "f1_checking or savings account": 0.673469387755102,
623
- "f1_money transfer or virtual currency or money service": 0.5777777777777777,
624
- "f1_vehicle loan or lease": 0.37037037037037035,
625
- "f1_mortgage": 0.6666666666666666,
626
- "f1_payday loan or title loan or personal loan": 0.2222222222222222,
627
- "f1_student loan": 0.7692307692307693,
628
- "f1_macro_ci_low": 0.5603416421881502,
629
- "f1_macro_ci_high": 0.6682100489708924,
630
  "score_name": "f1_micro",
631
- "score": 0.8273716951788491,
632
- "score_ci_high": 0.8505803933787175,
633
- "score_ci_low": 0.8031586690475525,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.798,
636
- "accuracy_ci_low": 0.7700141366334644,
637
- "accuracy_ci_high": 0.8228416338853977,
638
- "f1_micro": 0.8273716951788491,
639
- "f1_micro_ci_low": 0.8031586690475525,
640
- "f1_micro_ci_high": 0.8505803933787175
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.6739108272125272,
644
- "f1_mortgages and loans": 0.7719298245614035,
645
- "f1_credit card": 0.7403314917127072,
646
- "f1_retail banking": 0.5797101449275363,
647
- "f1_debt collection": 0.5686274509803921,
648
- "f1_credit reporting": 0.7089552238805971,
649
- "f1_macro_ci_low": 0.6324404602189574,
650
- "f1_macro_ci_high": 0.7147768248953918,
651
  "score_name": "f1_micro",
652
- "score": 0.6777546777546778,
653
- "score_ci_high": 0.716590388897516,
654
- "score_ci_low": 0.6352085235971857,
655
  "num_of_instances": 500,
656
- "accuracy": 0.652,
657
- "accuracy_ci_low": 0.608,
658
- "accuracy_ci_high": 0.6909013646716825,
659
- "f1_micro": 0.6777546777546778,
660
- "f1_micro_ci_low": 0.6352085235971857,
661
- "f1_micro_ci_high": 0.716590388897516
662
  },
663
- "score": 0.7525631864667635,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "program_accuracy": 0.136,
671
- "score": 0.136,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.112,
674
- "program_accuracy_ci_low": 0.114,
675
- "program_accuracy_ci_high": 0.15532916889351497,
676
- "score_ci_low": 0.114,
677
- "score_ci_high": 0.15532916889351497,
678
- "execution_accuracy_ci_low": 0.093,
679
- "execution_accuracy_ci_high": 0.132
680
  },
681
- "score": 0.136,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.34340385694506587,
688
- "recall": 0.5730464338072634,
689
- "f1": 0.3745794974329677,
690
- "precision_ci_low": 0.3234187185304744,
691
- "precision_ci_high": 0.3650589335945776,
692
- "recall_ci_low": 0.5557745525912291,
693
- "recall_ci_high": 0.5891018666307999,
694
- "f1_ci_low": 0.3573010951324523,
695
- "f1_ci_high": 0.3923997026360963,
696
  "score_name": "f1",
697
- "score": 0.3745794974329677,
698
- "score_ci_high": 0.3923997026360963,
699
- "score_ci_low": 0.3573010951324523,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.6226274134715398,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6971947036186854,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5803041330973308,
704
- "faithfullness_f1_token_overlap": 0.333895617947972,
705
- "faithfullness_recall_token_overlap": 0.2631184151415058,
706
- "faithfullness_precision_token_overlap": 0.5911531265193607,
707
- "correctness_f1_token_overlap": 0.3745794974329677,
708
- "correctness_recall_token_overlap": 0.5730464338072634,
709
- "correctness_precision_token_overlap": 0.34340385694506587
710
  },
711
- "score": 0.3745794974329677,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.484,
718
- "accuracy_ci_low": 0.454,
719
- "accuracy_ci_high": 0.514,
720
  "score_name": "accuracy",
721
- "score": 0.484,
722
- "score_ci_high": 0.514,
723
- "score_ci_low": 0.454,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.806,
728
- "accuracy_ci_low": 0.772,
729
- "accuracy_ci_high": 0.84,
730
  "score_name": "accuracy",
731
- "score": 0.806,
732
- "score_ci_high": 0.84,
733
- "score_ci_low": 0.772,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.645,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
- "severity_critical": 2.0,
743
- "severity_high": 1.0,
744
- "severity_medium": 1.0,
745
- "severity_low": 96.0,
746
- "category_harmful_info": 0.8320170236041858,
747
- "category_deception": 0.80982066540405,
748
- "category_explicit_content": 0.8411968264615898,
749
- "category_violence": 0.8222103803300314,
750
- "category_discrimination": 0.851427495597847,
751
- "category_substance_abuse": 0.923489530881246,
752
- "category_pii": 0.8410816744521812,
753
- "safety": 0.8333571752993802,
754
- "safety_ci_low": 0.8004974256643814,
755
- "safety_ci_high": 0.8560529591545912,
756
  "score_name": "safety",
757
- "score": 0.8333571752993802,
758
- "score_ci_high": 0.8560529591545912,
759
- "score_ci_low": 0.8004974256643814,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8333571752993802,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rouge2": 0.20644440587446686,
770
- "rouge1": 0.4218186965603418,
771
- "rougeL": 0.29171488107135435,
772
- "score": 0.29171488107135435,
773
  "score_name": "rougeL",
774
- "rougeLsum": 0.3573848941644133,
775
- "rouge2_ci_low": 0.19970855600689244,
776
- "rouge2_ci_high": 0.2141764351715554,
777
- "rouge1_ci_low": 0.41248489848485753,
778
- "rouge1_ci_high": 0.4306004852492735,
779
- "rougeL_ci_low": 0.284372337658834,
780
- "rougeL_ci_high": 0.29907980889509783,
781
- "score_ci_low": 0.284372337658834,
782
- "score_ci_high": 0.29907980889509783,
783
- "rougeLsum_ci_low": 0.3492659326802685,
784
- "rougeLsum_ci_high": 0.36590481273391734
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rouge2": 0.0155586126994404,
789
- "rouge1": 0.11530035033575219,
790
- "rougeL": 0.0819857457679891,
791
- "score": 0.0819857457679891,
792
  "score_name": "rougeL",
793
- "rougeLsum": 0.09494455096055868,
794
- "rouge2_ci_low": 0.013676000237548778,
795
- "rouge2_ci_high": 0.01748477110760906,
796
- "rouge1_ci_low": 0.10949633575516456,
797
- "rouge1_ci_high": 0.12012750847071728,
798
- "rougeL_ci_low": 0.07832639561199897,
799
- "rougeL_ci_high": 0.08543803609753609,
800
- "score_ci_low": 0.07832639561199897,
801
- "score_ci_high": 0.08543803609753609,
802
- "rougeLsum_ci_low": 0.09027374868536467,
803
- "rougeLsum_ci_high": 0.0990496989831643
804
  },
805
- "score": 0.18685031341967173,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,473 +810,473 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 1129,
814
- 620,
815
- 369,
816
- 232
817
  ],
818
  "totals": [
819
- 1854,
820
- 1788,
821
- 1722,
822
- 1656
823
  ],
824
  "precisions": [
825
- 0.6089536138079827,
826
- 0.34675615212527966,
827
- 0.21428571428571427,
828
- 0.14009661835748793
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1854,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.28216771071430846,
834
- "score": 0.28216771071430846,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.2361776507614392,
837
- "score_ci_high": 0.31854760158610573,
838
- "sacrebleu_ci_low": 0.2361776507614392,
839
- "sacrebleu_ci_high": 0.31854760158610573
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1222,
845
- 719,
846
- 458,
847
- 298
848
  ],
849
  "totals": [
850
- 1795,
851
- 1729,
852
- 1663,
853
- 1597
854
  ],
855
  "precisions": [
856
- 0.6807799442896936,
857
- 0.4158473105841527,
858
- 0.27540589296452195,
859
- 0.18659987476518475
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1795,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.34730121824258303,
865
- "score": 0.34730121824258303,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.29893861972188673,
868
- "score_ci_high": 0.3929549500270372,
869
- "sacrebleu_ci_low": 0.29893861972188673,
870
- "sacrebleu_ci_high": 0.3929549500270372
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
- 640,
876
- 243,
877
- 115,
878
- 51
879
  ],
880
  "totals": [
881
- 2303,
882
- 2237,
883
- 2171,
884
- 2105
885
  ],
886
  "precisions": [
887
- 0.2778983933999131,
888
- 0.1086276262852034,
889
- 0.05297098111469369,
890
- 0.024228028503562947
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 2303,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.07889429589395064,
896
- "score": 0.07889429589395064,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.05783007476819273,
899
- "score_ci_high": 0.11334032544493618,
900
- "sacrebleu_ci_low": 0.05783007476819273,
901
- "sacrebleu_ci_high": 0.11334032544493618
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 1100,
907
- 591,
908
- 353,
909
- 222
910
  ],
911
  "totals": [
912
- 1847,
913
- 1781,
914
- 1715,
915
- 1649
916
  ],
917
  "precisions": [
918
- 0.5955603681645912,
919
- 0.3318360471645143,
920
- 0.20583090379008745,
921
- 0.13462704669496664
922
  ],
923
  "bp": 1.0,
924
- "sys_len": 1847,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.27203392188147313,
927
- "score": 0.27203392188147313,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.23527738150795005,
930
- "score_ci_high": 0.3139246690058723,
931
- "sacrebleu_ci_low": 0.23527738150795005,
932
- "sacrebleu_ci_high": 0.3139246690058723
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1379,
938
- 931,
939
- 680,
940
- 508
941
  ],
942
  "totals": [
943
- 2006,
944
- 1940,
945
- 1874,
946
- 1808
947
  ],
948
  "precisions": [
949
- 0.6874376869391824,
950
- 0.4798969072164948,
951
- 0.3628601921024546,
952
- 0.2809734513274336
953
  ],
954
- "bp": 0.9695654687972447,
955
- "sys_len": 2006,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.4152155549652011,
958
- "score": 0.4152155549652011,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.3836282048944182,
961
- "score_ci_high": 0.4533991390034356,
962
- "sacrebleu_ci_low": 0.3836282048944182,
963
- "sacrebleu_ci_high": 0.4533991390034356
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 1035,
969
- 422,
970
- 219,
971
- 125
972
  ],
973
  "totals": [
974
- 3325,
975
- 3259,
976
- 3193,
977
- 3127
978
  ],
979
  "precisions": [
980
- 0.3112781954887218,
981
- 0.12948757287511506,
982
- 0.0685875352333229,
983
- 0.03997441637352094
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 3325,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.10253001707008509,
989
- "score": 0.10253001707008509,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.08168831530386227,
992
- "score_ci_high": 0.1371689608842245,
993
- "sacrebleu_ci_low": 0.08168831530386227,
994
- "sacrebleu_ci_high": 0.1371689608842245
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1365,
1000
- 935,
1001
- 696,
1002
- 519
1003
  ],
1004
  "totals": [
1005
- 1887,
1006
- 1821,
1007
- 1755,
1008
- 1689
1009
  ],
1010
  "precisions": [
1011
- 0.7233704292527823,
1012
- 0.513454146073586,
1013
- 0.3965811965811966,
1014
- 0.3072824156305506
1015
  ],
1016
- "bp": 0.9847491803389177,
1017
- "sys_len": 1887,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.45421208890996323,
1020
- "score": 0.45421208890996323,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.402877439448594,
1023
- "score_ci_high": 0.49512376744317715,
1024
- "sacrebleu_ci_low": 0.402877439448594,
1025
- "sacrebleu_ci_high": 0.49512376744317715
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 930,
1031
- 427,
1032
- 223,
1033
- 122
1034
  ],
1035
  "totals": [
1036
- 1966,
1037
- 1900,
1038
- 1834,
1039
- 1768
1040
  ],
1041
  "precisions": [
1042
- 0.47304170905391657,
1043
- 0.22473684210526315,
1044
- 0.12159214830970556,
1045
- 0.06900452488687783
1046
  ],
1047
  "bp": 1.0,
1048
- "sys_len": 1966,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.17281809069385612,
1051
- "score": 0.17281809069385612,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.1498451419191345,
1054
- "score_ci_high": 0.20240539093526114,
1055
- "sacrebleu_ci_low": 0.1498451419191345,
1056
- "sacrebleu_ci_high": 0.20240539093526114
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
- 1186,
1062
- 603,
1063
- 337,
1064
- 189
1065
  ],
1066
  "totals": [
1067
- 1982,
1068
- 1916,
1069
- 1850,
1070
- 1784
1071
  ],
1072
  "precisions": [
1073
- 0.5983854692230071,
1074
- 0.31471816283924847,
1075
- 0.1821621621621622,
1076
- 0.10594170403587444
1077
  ],
1078
- "bp": 0.9431530195225803,
1079
- "sys_len": 1982,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.23157365627652982,
1082
- "score": 0.23157365627652982,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.20035664766275735,
1085
- "score_ci_high": 0.25766181006532113,
1086
- "sacrebleu_ci_low": 0.20035664766275735,
1087
- "sacrebleu_ci_high": 0.25766181006532113
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1263,
1093
- 786,
1094
- 517,
1095
- 354
1096
  ],
1097
  "totals": [
1098
- 1831,
1099
- 1765,
1100
- 1699,
1101
- 1633
1102
  ],
1103
  "precisions": [
1104
- 0.6897870016384489,
1105
- 0.44532577903682724,
1106
- 0.3042966450853443,
1107
- 0.21677893447642377
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1831,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.3772912827525828,
1113
- "score": 0.3772912827525828,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.3318073525019781,
1116
- "score_ci_high": 0.42160864308969,
1117
- "sacrebleu_ci_low": 0.3318073525019781,
1118
- "sacrebleu_ci_high": 0.42160864308969
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 998,
1124
- 440,
1125
- 238,
1126
  140
1127
  ],
1128
  "totals": [
1129
- 1869,
1130
- 1803,
1131
- 1737,
1132
- 1671
1133
  ],
1134
  "precisions": [
1135
- 0.5339753879079722,
1136
- 0.24403771491957849,
1137
- 0.13701784686240645,
1138
- 0.08378216636744465
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1869,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.1966648424448395,
1144
- "score": 0.1966648424448395,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.16525947632394583,
1147
- "score_ci_high": 0.23654666880731012,
1148
- "sacrebleu_ci_low": 0.16525947632394583,
1149
- "sacrebleu_ci_high": 0.23654666880731012
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 952,
1155
- 450,
1156
- 247,
1157
- 136
1158
  ],
1159
  "totals": [
1160
- 1808,
1161
- 1742,
1162
- 1676,
1163
- 1610
1164
  ],
1165
  "precisions": [
1166
- 0.5265486725663717,
1167
- 0.25832376578645233,
1168
- 0.1473747016706444,
1169
- 0.084472049689441
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 1808,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.2028545317121833,
1175
- "score": 0.2028545317121833,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.17683650189535868,
1178
- "score_ci_high": 0.25600334996477725,
1179
- "sacrebleu_ci_low": 0.17683650189535868,
1180
- "sacrebleu_ci_high": 0.25600334996477725
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1256,
1186
- 812,
1187
- 560,
1188
- 392
1189
  ],
1190
  "totals": [
1191
- 1782,
1192
- 1716,
1193
- 1650,
1194
- 1584
1195
  ],
1196
  "precisions": [
1197
- 0.7048260381593715,
1198
- 0.4731934731934732,
1199
- 0.33939393939393936,
1200
- 0.2474747474747475
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1782,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.409108887747912,
1206
- "score": 0.409108887747912,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.36492758260668207,
1209
- "score_ci_high": 0.46703692233170646,
1210
- "sacrebleu_ci_low": 0.36492758260668207,
1211
- "sacrebleu_ci_high": 0.46703692233170646
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
- 1197,
1217
  729,
1218
- 477,
1219
- 310
1220
  ],
1221
  "totals": [
1222
- 1815,
1223
- 1749,
1224
- 1683,
1225
- 1617
1226
  ],
1227
  "precisions": [
1228
- 0.659504132231405,
1229
- 0.41680960548885077,
1230
- 0.28342245989304815,
1231
- 0.191713048855906
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1815,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.34959104085020015,
1237
- "score": 0.34959104085020015,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.30660403979630557,
1240
- "score_ci_high": 0.3928177080574808,
1241
- "sacrebleu_ci_low": 0.30660403979630557,
1242
- "sacrebleu_ci_high": 0.3928177080574808
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1110,
1248
- 579,
1249
- 330,
1250
- 191
1251
  ],
1252
  "totals": [
1253
- 1811,
1254
- 1745,
1255
- 1679,
1256
- 1613
1257
  ],
1258
  "precisions": [
1259
- 0.6129210381004969,
1260
- 0.33180515759312323,
1261
- 0.19654556283502087,
1262
- 0.11841289522628642
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1811,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.2622934684900747,
1268
- "score": 0.2622934684900747,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.22911977694045185,
1271
- "score_ci_high": 0.30518353214635846,
1272
- "sacrebleu_ci_low": 0.22911977694045185,
1273
- "sacrebleu_ci_high": 0.30518353214635846
1274
  },
1275
- "score": 0.27697004057638286,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.484621964093495,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-23T06:53:01.281933Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
178
  "safety_bbq_age": {
179
  "accuracy": 0.6444444444444445,
180
  "accuracy_ci_low": 0.5444444444444444,
181
+ "accuracy_ci_high": 0.7384996290160605,
182
  "score_name": "accuracy",
183
  "score": 0.6444444444444445,
184
+ "score_ci_high": 0.7384996290160605,
185
  "score_ci_low": 0.5444444444444444,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.7111111111111111,
190
+ "accuracy_ci_low": 0.6,
191
  "accuracy_ci_high": 0.8,
192
  "score_name": "accuracy",
193
+ "score": 0.7111111111111111,
194
  "score_ci_high": 0.8,
195
+ "score_ci_low": 0.6,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
  "accuracy": 0.9111111111111111,
200
+ "accuracy_ci_low": 0.8444444444444444,
201
  "accuracy_ci_high": 0.9555555555555556,
202
  "score_name": "accuracy",
203
  "score": 0.9111111111111111,
204
  "score_ci_high": 0.9555555555555556,
205
+ "score_ci_low": 0.8444444444444444,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.7888888888888889,
210
+ "accuracy_ci_low": 0.7,
211
+ "accuracy_ci_high": 0.8666666666666667,
212
  "score_name": "accuracy",
213
+ "score": 0.7888888888888889,
214
+ "score_ci_high": 0.8666666666666667,
215
+ "score_ci_low": 0.7,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.7888888888888889,
220
+ "accuracy_ci_low": 0.6888888888888889,
221
+ "accuracy_ci_high": 0.8666666666666667,
222
  "score_name": "accuracy",
223
+ "score": 0.7888888888888889,
224
+ "score_ci_high": 0.8666666666666667,
225
+ "score_ci_low": 0.6888888888888889,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9666666666666667,
230
+ "accuracy_ci_low": 0.9222222222222223,
231
+ "accuracy_ci_high": 0.9888888888888889,
232
  "score_name": "accuracy",
233
+ "score": 0.9666666666666667,
234
+ "score_ci_high": 0.9888888888888889,
235
+ "score_ci_low": 0.9222222222222223,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
 
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.8333333333333334,
260
+ "accuracy_ci_low": 0.7333333333333333,
261
+ "accuracy_ci_high": 0.9,
262
  "score_name": "accuracy",
263
+ "score": 0.8333333333333334,
264
+ "score_ci_high": 0.9,
265
+ "score_ci_low": 0.7333333333333333,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.7,
270
+ "accuracy_ci_low": 0.6,
271
+ "accuracy_ci_high": 0.7888888888888889,
272
  "score_name": "accuracy",
273
+ "score": 0.7,
274
+ "score_ci_high": 0.7888888888888889,
275
+ "score_ci_low": 0.6,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8222222222222222,
280
+ "accuracy_ci_low": 0.7444444444444445,
281
+ "accuracy_ci_high": 0.8888888888888888,
282
  "score_name": "accuracy",
283
+ "score": 0.8222222222222222,
284
+ "score_ci_high": 0.8888888888888888,
285
+ "score_ci_low": 0.7444444444444445,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.8222222222222222,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.0625,
296
+ "score": 0.0625,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.0625,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.5798816568047337,
307
+ "f1_Organization": 0.42592592592592593,
308
+ "f1_Location": 0.40336134453781514,
309
+ "f1_macro": 0.46972297575615823,
310
+ "recall_macro": 0.40171664278500413,
311
+ "precision_macro": 0.5828611111111112,
312
+ "in_classes_support": 0.8064516129032258,
313
+ "f1_micro": 0.4343434343434343,
314
+ "recall_micro": 0.4095238095238095,
315
+ "precision_micro": 0.46236559139784944,
316
+ "score": 0.4343434343434343,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.3767195996125236,
319
+ "score_ci_high": 0.4839388766009964,
320
+ "f1_micro_ci_low": 0.3767195996125236,
321
+ "f1_micro_ci_high": 0.4839388766009964
322
  },
323
+ "score": 0.4343434343434343,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.5492957746478874,
330
  "accuracy_ci_low": 0.43661971830985913,
331
+ "accuracy_ci_high": 0.6619718309859155,
332
  "score_name": "accuracy",
333
+ "score": 0.5492957746478874,
334
+ "score_ci_high": 0.6619718309859155,
335
  "score_ci_low": 0.43661971830985913,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.2112676056338028,
340
+ "accuracy_ci_low": 0.1267605633802817,
341
+ "accuracy_ci_high": 0.30985915492957744,
342
  "score_name": "accuracy",
343
+ "score": 0.2112676056338028,
344
+ "score_ci_high": 0.30985915492957744,
345
+ "score_ci_low": 0.1267605633802817,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.2112676056338028,
350
+ "accuracy_ci_low": 0.1267605633802817,
351
+ "accuracy_ci_high": 0.323943661971831,
352
  "score_name": "accuracy",
353
+ "score": 0.2112676056338028,
354
+ "score_ci_high": 0.323943661971831,
355
+ "score_ci_low": 0.1267605633802817,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.30985915492957744,
360
  "accuracy_ci_low": 0.2112676056338028,
361
+ "accuracy_ci_high": 0.4225352112676056,
362
  "score_name": "accuracy",
363
+ "score": 0.30985915492957744,
364
+ "score_ci_high": 0.4225352112676056,
365
  "score_ci_low": 0.2112676056338028,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.4225352112676056,
370
+ "accuracy_ci_low": 0.29577464788732394,
371
+ "accuracy_ci_high": 0.5352112676056338,
372
  "score_name": "accuracy",
373
+ "score": 0.4225352112676056,
374
+ "score_ci_high": 0.5352112676056338,
375
+ "score_ci_low": 0.29577464788732394,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.23943661971830985,
380
+ "accuracy_ci_low": 0.14084507042253522,
381
+ "accuracy_ci_high": 0.352112676056338,
382
  "score_name": "accuracy",
383
+ "score": 0.23943661971830985,
384
+ "score_ci_high": 0.352112676056338,
385
+ "score_ci_low": 0.14084507042253522,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.323943661971831,
390
+ "accuracy_ci_low": 0.2112676056338028,
391
+ "accuracy_ci_high": 0.43661971830985913,
392
  "score_name": "accuracy",
393
+ "score": 0.323943661971831,
394
+ "score_ci_high": 0.43661971830985913,
395
+ "score_ci_low": 0.2112676056338028,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.4507042253521127,
400
+ "accuracy_ci_low": 0.3380281690140845,
401
+ "accuracy_ci_high": 0.5633802816901409,
402
  "score_name": "accuracy",
403
+ "score": 0.4507042253521127,
404
+ "score_ci_high": 0.5633802816901409,
405
+ "score_ci_low": 0.3380281690140845,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
  "accuracy": 0.3380281690140845,
410
  "accuracy_ci_low": 0.23943661971830985,
411
+ "accuracy_ci_high": 0.4393434853289757,
412
  "score_name": "accuracy",
413
  "score": 0.3380281690140845,
414
+ "score_ci_high": 0.4393434853289757,
415
  "score_ci_low": 0.23943661971830985,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.2112676056338028,
420
+ "accuracy_ci_low": 0.1267605633802817,
421
+ "accuracy_ci_high": 0.30985915492957744,
422
  "score_name": "accuracy",
423
+ "score": 0.2112676056338028,
424
+ "score_ci_high": 0.30985915492957744,
425
+ "score_ci_low": 0.1267605633802817,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.29577464788732394,
430
+ "accuracy_ci_low": 0.19718309859154928,
431
+ "accuracy_ci_high": 0.4084507042253521,
432
  "score_name": "accuracy",
433
+ "score": 0.29577464788732394,
434
+ "score_ci_high": 0.4084507042253521,
435
+ "score_ci_low": 0.19718309859154928,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.4084507042253521,
440
+ "accuracy_ci_low": 0.2885703240152898,
441
+ "accuracy_ci_high": 0.5211267605633803,
442
  "score_name": "accuracy",
443
+ "score": 0.4084507042253521,
444
+ "score_ci_high": 0.5211267605633803,
445
+ "score_ci_low": 0.2885703240152898,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.2112676056338028,
450
+ "accuracy_ci_low": 0.1267605633802817,
451
+ "accuracy_ci_high": 0.30985915492957744,
452
  "score_name": "accuracy",
453
+ "score": 0.2112676056338028,
454
+ "score_ci_high": 0.30985915492957744,
455
+ "score_ci_low": 0.1267605633802817,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.5492957746478874,
460
+ "accuracy_ci_low": 0.43661971830985913,
461
+ "accuracy_ci_high": 0.672415960906933,
462
  "score_name": "accuracy",
463
+ "score": 0.5492957746478874,
464
+ "score_ci_high": 0.672415960906933,
465
+ "score_ci_low": 0.43661971830985913,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.3380281690140845,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.29746332099273276,
475
+ "f1_suggestive": 0.0,
476
+ "f1_descriptive": 0.36363636363636365,
477
  "f1_generic": 0.11764705882352941,
478
+ "f1_fanciful": 0.6470588235294118,
479
+ "f1_arbitrary": 0.358974358974359,
480
+ "f1_macro_ci_low": 0.2234746436877424,
481
+ "f1_macro_ci_high": 0.3820540135751509,
482
  "score_name": "f1_micro",
483
+ "score": 0.3493975903614458,
484
+ "score_ci_high": 0.45121951219512196,
485
+ "score_ci_low": 0.25149700598802394,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.3411764705882353,
488
+ "accuracy_ci_low": 0.24705882352941178,
489
+ "accuracy_ci_high": 0.4470588235294118,
490
+ "f1_micro": 0.3493975903614458,
491
+ "f1_micro_ci_low": 0.25149700598802394,
492
+ "f1_micro_ci_high": 0.45121951219512196
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.6415770609318996,
496
+ "f1_no": 0.8387096774193549,
497
+ "f1_yes": 0.4444444444444444,
498
+ "f1_macro_ci_low": 0.5666801252929456,
499
+ "f1_macro_ci_high": 0.7176297030965157,
500
  "score_name": "f1_micro",
501
+ "score": 0.75,
502
+ "score_ci_high": 0.805,
503
+ "score_ci_low": 0.685,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.75,
506
+ "accuracy_ci_low": 0.685,
507
+ "accuracy_ci_high": 0.805,
508
+ "f1_micro": 0.75,
509
+ "f1_micro_ci_low": 0.685,
510
+ "f1_micro_ci_high": 0.805
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.2937287351505673,
514
+ "f1_conclusion": 0.16,
515
+ "f1_issue": 0.3291139240506329,
516
+ "f1_decree": 0.24242424242424243,
517
+ "f1_rule": 0.4931506849315068,
518
+ "f1_analysis": 0.2916666666666667,
519
+ "f1_facts": 0.21621621621621623,
520
+ "f1_procedural history": 0.3235294117647059,
521
+ "f1_macro_ci_low": 0.2356167023599295,
522
+ "f1_macro_ci_high": 0.3627174769966993,
523
  "score_name": "f1_micro",
524
+ "score": 0.31443298969072164,
525
+ "score_ci_high": 0.37945181171815084,
526
+ "score_ci_low": 0.24415584415584415,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.305,
529
+ "accuracy_ci_low": 0.23726030718429333,
530
+ "accuracy_ci_high": 0.37,
531
+ "f1_micro": 0.31443298969072164,
532
+ "f1_micro_ci_low": 0.24415584415584415,
533
+ "f1_micro_ci_high": 0.37945181171815084
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.44871725481706,
537
+ "f1_yes": 0.5887096774193549,
538
+ "f1_no": 0.3087248322147651,
539
+ "f1_macro_ci_low": 0.3839275870787324,
540
+ "f1_macro_ci_high": 0.5191612607559799,
541
  "score_name": "f1_micro",
542
+ "score": 0.4836272040302267,
543
+ "score_ci_high": 0.5505050505050505,
544
+ "score_ci_low": 0.4120603015075377,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.48,
547
+ "accuracy_ci_low": 0.4083713252748318,
548
+ "accuracy_ci_high": 0.545,
549
+ "f1_micro": 0.4836272040302267,
550
+ "f1_micro_ci_low": 0.4120603015075377,
551
+ "f1_micro_ci_high": 0.5505050505050505
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.8044662309368191,
555
+ "f1_yes": 0.7941176470588235,
556
+ "f1_no": 0.8148148148148148,
557
+ "f1_macro_ci_low": 0.7223270079060395,
558
+ "f1_macro_ci_high": 0.87627946340442,
559
  "score_name": "f1_micro",
560
+ "score": 0.8053691275167785,
561
+ "score_ci_high": 0.8717948717948718,
562
+ "score_ci_low": 0.7140882327681733,
563
  "num_of_instances": 85,
564
+ "accuracy": 0.7058823529411765,
565
+ "accuracy_ci_low": 0.6,
566
+ "accuracy_ci_high": 0.8,
567
+ "f1_micro": 0.8053691275167785,
568
+ "f1_micro_ci_low": 0.7140882327681733,
569
+ "f1_micro_ci_high": 0.8717948717948718
570
  },
571
+ "score": 0.5405653823198345,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.4772436601851055,
578
+ "f1_cars": 0.7640449438202247,
579
+ "f1_pc hardware": 0.3557692307692308,
580
+ "f1_windows x": 0.028985507246376812,
581
+ "f1_electronics": 0.48717948717948717,
582
+ "f1_atheism": 0.20408163265306123,
583
+ "f1_politics": 0.34355828220858897,
584
+ "f1_religion": 0.2708333333333333,
585
+ "f1_medicine": 0.7948717948717948,
586
+ "f1_christianity": 0.4166666666666667,
587
+ "f1_for sale": 0.6067415730337079,
588
+ "f1_computer graphics": 0.42016806722689076,
589
+ "f1_microsoft windows": 0.25806451612903225,
590
+ "f1_middle east": 0.49382716049382713,
591
+ "f1_motorcycles": 0.6666666666666666,
592
+ "f1_mac hardware": 0.25,
593
+ "f1_guns": 0.23728813559322035,
594
+ "f1_space": 0.717391304347826,
595
+ "f1_cryptography": 0.5230769230769231,
596
+ "f1_baseball": 0.8461538461538461,
597
+ "f1_hockey": 0.859504132231405,
598
+ "f1_macro_ci_low": 0.45194761799386507,
599
+ "f1_macro_ci_high": 0.5063130462647102,
600
  "score_name": "f1_micro",
601
+ "score": 0.49115281501340485,
602
+ "score_ci_high": 0.5196912105086561,
603
+ "score_ci_low": 0.4585932126016045,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.458,
606
+ "accuracy_ci_low": 0.427,
607
+ "accuracy_ci_high": 0.4864735442740007,
608
+ "f1_micro": 0.49115281501340485,
609
+ "f1_micro_ci_low": 0.4585932126016045,
610
+ "f1_micro_ci_high": 0.5196912105086561
611
  },
612
+ "score": 0.49115281501340485,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.5988009590549132,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9113372093023255,
620
+ "f1_checking or savings account": 0.5542168674698795,
621
+ "f1_debt collection": 0.4779874213836478,
622
+ "f1_credit card or prepaid card": 0.6370370370370371,
623
+ "f1_mortgage": 0.7397260273972602,
624
+ "f1_student loan": 0.8461538461538461,
625
+ "f1_money transfer or virtual currency or money service": 0.4864864864864865,
626
+ "f1_vehicle loan or lease": 0.42857142857142855,
627
+ "f1_payday loan or title loan or personal loan": 0.3076923076923077,
628
+ "f1_macro_ci_low": 0.550125163696031,
629
+ "f1_macro_ci_high": 0.6692920824665255,
630
  "score_name": "f1_micro",
631
+ "score": 0.8145077720207254,
632
+ "score_ci_high": 0.8367924066551193,
633
+ "score_ci_low": 0.7900784551279257,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.786,
636
+ "accuracy_ci_low": 0.76,
637
+ "accuracy_ci_high": 0.81,
638
+ "f1_micro": 0.8145077720207254,
639
+ "f1_micro_ci_low": 0.7900784551279257,
640
+ "f1_micro_ci_high": 0.8367924066551193
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.6991553168902936,
644
+ "f1_mortgages and loans": 0.8228571428571428,
645
+ "f1_credit card": 0.7428571428571429,
646
+ "f1_debt collection": 0.6116504854368932,
647
+ "f1_credit reporting": 0.7211895910780669,
648
+ "f1_retail banking": 0.5972222222222222,
649
+ "f1_macro_ci_low": 0.6611649815931737,
650
+ "f1_macro_ci_high": 0.7441131702771507,
651
  "score_name": "f1_micro",
652
+ "score": 0.7017543859649122,
653
+ "score_ci_high": 0.7444878377150386,
654
+ "score_ci_low": 0.6639049566735055,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.68,
657
+ "accuracy_ci_low": 0.6415834821537145,
658
+ "accuracy_ci_high": 0.7250870857804175,
659
+ "f1_micro": 0.7017543859649122,
660
+ "f1_micro_ci_low": 0.6639049566735055,
661
+ "f1_micro_ci_high": 0.7444878377150386
662
  },
663
+ "score": 0.7581310789928188,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "execution_accuracy": 0.113,
671
+ "program_accuracy": 0.137,
672
+ "score": 0.137,
673
  "score_name": "program_accuracy",
674
+ "execution_accuracy_ci_low": 0.094,
675
+ "execution_accuracy_ci_high": 0.133,
676
+ "program_accuracy_ci_low": 0.115,
677
+ "program_accuracy_ci_high": 0.159,
678
+ "score_ci_low": 0.115,
679
+ "score_ci_high": 0.159
680
  },
681
+ "score": 0.137,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.3510702505438354,
688
+ "recall": 0.5809699679510664,
689
+ "f1": 0.3824309995878018,
690
+ "precision_ci_low": 0.33249065458630267,
691
+ "precision_ci_high": 0.37203127312002804,
692
+ "recall_ci_low": 0.5641121374917414,
693
+ "recall_ci_high": 0.5988884760579221,
694
+ "f1_ci_low": 0.3661582243989197,
695
+ "f1_ci_high": 0.4007102293320155,
696
  "score_name": "f1",
697
+ "score": 0.3824309995878018,
698
+ "score_ci_high": 0.4007102293320155,
699
+ "score_ci_low": 0.3661582243989197,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6284164202213287,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7022818158070246,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5862240339815616,
704
+ "faithfullness_f1_token_overlap": 0.32867960701983034,
705
+ "faithfullness_recall_token_overlap": 0.26185727010665827,
706
+ "faithfullness_precision_token_overlap": 0.5811151625996505,
707
+ "correctness_f1_token_overlap": 0.3824309995878018,
708
+ "correctness_recall_token_overlap": 0.5809699679510664,
709
+ "correctness_precision_token_overlap": 0.3510702505438354
710
  },
711
+ "score": 0.3824309995878018,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.479,
718
+ "accuracy_ci_low": 0.448,
719
+ "accuracy_ci_high": 0.5103198452743031,
720
  "score_name": "accuracy",
721
+ "score": 0.479,
722
+ "score_ci_high": 0.5103198452743031,
723
+ "score_ci_low": 0.448,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.818,
728
+ "accuracy_ci_low": 0.7804131917525572,
729
+ "accuracy_ci_high": 0.85,
730
  "score_name": "accuracy",
731
+ "score": 0.818,
732
+ "score_ci_high": 0.85,
733
+ "score_ci_low": 0.7804131917525572,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.6485,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
+ "severity_critical": 0.0,
743
+ "severity_high": 0.0,
744
+ "severity_medium": 0.0,
745
+ "severity_low": 100.0,
746
+ "category_harmful_info": 0.8486059237981127,
747
+ "category_deception": 0.8698270569997605,
748
+ "category_explicit_content": 0.8145937094032161,
749
+ "category_violence": 0.8388658185134685,
750
+ "category_discrimination": 0.8823170511732025,
751
+ "category_substance_abuse": 0.9462624288267559,
752
+ "category_pii": 0.8776307304700216,
753
+ "safety": 0.8519032163850758,
754
+ "safety_ci_low": 0.8310657418019435,
755
+ "safety_ci_high": 0.8712351615904984,
756
  "score_name": "safety",
757
+ "score": 0.8519032163850758,
758
+ "score_ci_high": 0.8712351615904984,
759
+ "score_ci_low": 0.8310657418019435,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.8519032163850758,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rougeLsum": 0.3610679948661998,
770
+ "rouge1": 0.4233619373045491,
771
+ "rougeL": 0.29420815957168495,
772
+ "score": 0.29420815957168495,
773
  "score_name": "rougeL",
774
+ "rouge2": 0.20840228792560456,
775
+ "rougeLsum_ci_low": 0.35255104965168327,
776
+ "rougeLsum_ci_high": 0.36915628751369906,
777
+ "rouge1_ci_low": 0.4141157245136807,
778
+ "rouge1_ci_high": 0.43171361247266377,
779
+ "rougeL_ci_low": 0.28665554382409086,
780
+ "rougeL_ci_high": 0.30100667120780134,
781
+ "score_ci_low": 0.28665554382409086,
782
+ "score_ci_high": 0.30100667120780134,
783
+ "rouge2_ci_low": 0.20121549945064432,
784
+ "rouge2_ci_high": 0.21553750893087562
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rougeLsum": 0.09608294803448246,
789
+ "rouge1": 0.11601113348984893,
790
+ "rougeL": 0.08329710711031496,
791
+ "score": 0.08329710711031496,
792
  "score_name": "rougeL",
793
+ "rouge2": 0.01614281525612853,
794
+ "rougeLsum_ci_low": 0.09161955792928417,
795
+ "rougeLsum_ci_high": 0.10006888471086645,
796
+ "rouge1_ci_low": 0.1103956147665806,
797
+ "rouge1_ci_high": 0.12113815092736294,
798
+ "rougeL_ci_low": 0.07939906960390719,
799
+ "rougeL_ci_high": 0.08668886729552314,
800
+ "score_ci_low": 0.07939906960390719,
801
+ "score_ci_high": 0.08668886729552314,
802
+ "rouge2_ci_low": 0.014303052357088147,
803
+ "rouge2_ci_high": 0.01823150788885683
804
  },
805
+ "score": 0.18875263334099995,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1127,
814
+ 611,
815
+ 358,
816
+ 217
817
  ],
818
  "totals": [
819
+ 1857,
820
+ 1791,
821
+ 1725,
822
+ 1659
823
  ],
824
  "precisions": [
825
+ 0.6068928379106086,
826
+ 0.3411501954215522,
827
+ 0.20753623188405798,
828
+ 0.13080168776371306
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 1857,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.27380490753896447,
834
+ "score": 0.27380490753896447,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.24139134438987545,
837
+ "score_ci_high": 0.3114086803649994,
838
+ "sacrebleu_ci_low": 0.24139134438987545,
839
+ "sacrebleu_ci_high": 0.3114086803649994
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1217,
845
+ 742,
846
+ 486,
847
+ 324
848
  ],
849
  "totals": [
850
+ 1805,
851
+ 1739,
852
+ 1673,
853
+ 1607
854
  ],
855
  "precisions": [
856
+ 0.6742382271468144,
857
+ 0.42668200115008625,
858
+ 0.2904961147638972,
859
+ 0.20161792159303052
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 1805,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.36028550442407303,
865
+ "score": 0.36028550442407303,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.3120172829600809,
868
+ "score_ci_high": 0.4057887928505002,
869
+ "sacrebleu_ci_low": 0.3120172829600809,
870
+ "sacrebleu_ci_high": 0.4057887928505002
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
+ 672,
876
+ 256,
877
+ 123,
878
+ 56
879
  ],
880
  "totals": [
881
+ 1845,
882
+ 1779,
883
+ 1713,
884
+ 1647
885
  ],
886
  "precisions": [
887
+ 0.36422764227642274,
888
+ 0.14390106801573918,
889
+ 0.07180385288966726,
890
+ 0.03400121432908318
891
  ],
892
  "bp": 1.0,
893
+ "sys_len": 1845,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.10635790496521375,
896
+ "score": 0.10635790496521375,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.0652406937888321,
899
+ "score_ci_high": 0.1319350359039831,
900
+ "sacrebleu_ci_low": 0.0652406937888321,
901
+ "sacrebleu_ci_high": 0.1319350359039831
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 1077,
907
+ 569,
908
+ 326,
909
+ 185
910
  ],
911
  "totals": [
912
+ 1845,
913
+ 1779,
914
+ 1713,
915
+ 1647
916
  ],
917
  "precisions": [
918
+ 0.583739837398374,
919
+ 0.3198426082068578,
920
+ 0.19030939871570343,
921
+ 0.11232544019429266
922
  ],
923
  "bp": 1.0,
924
+ "sys_len": 1845,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.25134688330821237,
927
+ "score": 0.25134688330821237,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.2195832990093629,
930
+ "score_ci_high": 0.2837968314094506,
931
+ "sacrebleu_ci_low": 0.2195832990093629,
932
+ "sacrebleu_ci_high": 0.2837968314094506
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1424,
938
+ 986,
939
+ 714,
940
+ 526
941
  ],
942
  "totals": [
943
+ 1999,
944
+ 1933,
945
+ 1867,
946
+ 1801
947
  ],
948
  "precisions": [
949
+ 0.7123561780890445,
950
+ 0.5100879461976203,
951
+ 0.3824317086234601,
952
+ 0.29205996668517487
953
  ],
954
+ "bp": 0.9660716664698304,
955
+ "sys_len": 1999,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.4336120976233934,
958
+ "score": 0.4336120976233934,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.39952659045667155,
961
+ "score_ci_high": 0.47193262493606236,
962
+ "sacrebleu_ci_low": 0.39952659045667155,
963
+ "sacrebleu_ci_high": 0.47193262493606236
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 1019,
969
+ 419,
970
+ 213,
971
+ 113
972
  ],
973
  "totals": [
974
+ 3749,
975
+ 3683,
976
+ 3617,
977
+ 3552
978
  ],
979
  "precisions": [
980
+ 0.27180581488396904,
981
+ 0.11376595166983437,
982
+ 0.0588885816975394,
983
+ 0.031813063063063064
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 3749,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.08724225995783678,
989
+ "score": 0.08724225995783678,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.06811801818711148,
992
+ "score_ci_high": 0.11195945404539422,
993
+ "sacrebleu_ci_low": 0.06811801818711148,
994
+ "sacrebleu_ci_high": 0.11195945404539422
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1341,
1000
+ 889,
1001
+ 634,
1002
+ 457
1003
  ],
1004
  "totals": [
1005
+ 1879,
1006
+ 1813,
1007
+ 1747,
1008
+ 1681
1009
  ],
1010
  "precisions": [
1011
+ 0.7136774880255454,
1012
+ 0.4903474903474903,
1013
+ 0.36290784201488263,
1014
+ 0.2718619869125521
1015
  ],
1016
+ "bp": 0.9805012826642417,
1017
+ "sys_len": 1879,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.4226548575605273,
1020
+ "score": 0.4226548575605273,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.3804303884338436,
1023
+ "score_ci_high": 0.46259367891162306,
1024
+ "sacrebleu_ci_low": 0.3804303884338436,
1025
+ "sacrebleu_ci_high": 0.46259367891162306
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 966,
1031
+ 465,
1032
+ 258,
1033
+ 149
1034
  ],
1035
  "totals": [
1036
+ 2330,
1037
+ 2264,
1038
+ 2198,
1039
+ 2132
1040
  ],
1041
  "precisions": [
1042
+ 0.4145922746781116,
1043
+ 0.20538869257950532,
1044
+ 0.11737943585077343,
1045
+ 0.0698874296435272
1046
  ],
1047
  "bp": 1.0,
1048
+ "sys_len": 2330,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.1625725453572352,
1051
+ "score": 0.1625725453572352,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.11669140677951717,
1054
+ "score_ci_high": 0.20412983752636973,
1055
+ "sacrebleu_ci_low": 0.11669140677951717,
1056
+ "sacrebleu_ci_high": 0.20412983752636973
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1199,
1062
+ 637,
1063
+ 360,
1064
+ 199
1065
  ],
1066
  "totals": [
1067
+ 1973,
1068
+ 1907,
1069
+ 1841,
1070
+ 1775
1071
  ],
1072
  "precisions": [
1073
+ 0.607704004054739,
1074
+ 0.3340325117986366,
1075
+ 0.19554589896795221,
1076
+ 0.11211267605633803
1077
  ],
1078
+ "bp": 0.9386099296136466,
1079
+ "sys_len": 1973,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.2424271251773898,
1082
+ "score": 0.2424271251773898,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.21260065622080154,
1085
+ "score_ci_high": 0.26696534058145066,
1086
+ "sacrebleu_ci_low": 0.21260065622080154,
1087
+ "sacrebleu_ci_high": 0.26696534058145066
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1270,
1093
+ 795,
1094
+ 520,
1095
+ 348
1096
  ],
1097
  "totals": [
1098
+ 1847,
1099
+ 1781,
1100
+ 1715,
1101
+ 1649
1102
  ],
1103
  "precisions": [
1104
+ 0.6876015159718462,
1105
+ 0.446378439079169,
1106
+ 0.3032069970845481,
1107
+ 0.2110369921164342
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 1847,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.37435570897744036,
1113
+ "score": 0.37435570897744036,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.33656443265221864,
1116
+ "score_ci_high": 0.4099554772696377,
1117
+ "sacrebleu_ci_low": 0.33656443265221864,
1118
+ "sacrebleu_ci_high": 0.4099554772696377
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 988,
1124
+ 434,
1125
+ 239,
1126
  140
1127
  ],
1128
  "totals": [
1129
+ 1874,
1130
+ 1808,
1131
+ 1742,
1132
+ 1676
1133
  ],
1134
  "precisions": [
1135
+ 0.5272145144076841,
1136
+ 0.24004424778761063,
1137
+ 0.13719862227324914,
1138
+ 0.08353221957040573
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 1874,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.19515092235944087,
1144
+ "score": 0.19515092235944087,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.16674417267909872,
1147
+ "score_ci_high": 0.231444565320084,
1148
+ "sacrebleu_ci_low": 0.16674417267909872,
1149
+ "sacrebleu_ci_high": 0.231444565320084
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 976,
1155
+ 440,
1156
+ 232,
1157
+ 128
1158
  ],
1159
  "totals": [
1160
+ 1841,
1161
+ 1775,
1162
+ 1709,
1163
+ 1643
1164
  ],
1165
  "precisions": [
1166
+ 0.530146659424226,
1167
+ 0.24788732394366197,
1168
+ 0.13575190169689877,
1169
+ 0.07790626902008521
1170
  ],
1171
  "bp": 1.0,
1172
+ "sys_len": 1841,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.19308216913928786,
1175
+ "score": 0.19308216913928786,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.1691283364865516,
1178
+ "score_ci_high": 0.22228985058810502,
1179
+ "sacrebleu_ci_low": 0.1691283364865516,
1180
+ "sacrebleu_ci_high": 0.22228985058810502
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1260,
1186
+ 815,
1187
+ 565,
1188
+ 399
1189
  ],
1190
  "totals": [
1191
+ 1793,
1192
+ 1727,
1193
+ 1661,
1194
+ 1595
1195
  ],
1196
  "precisions": [
1197
+ 0.7027328499721137,
1198
+ 0.4719166184134337,
1199
+ 0.34015653220951236,
1200
+ 0.2501567398119122
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 1793,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.4098610398858089,
1206
+ "score": 0.4098610398858089,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.3653913364652719,
1209
+ "score_ci_high": 0.46316065930620326,
1210
+ "sacrebleu_ci_low": 0.3653913364652719,
1211
+ "sacrebleu_ci_high": 0.46316065930620326
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1227,
1217
  729,
1218
+ 472,
1219
+ 294
1220
  ],
1221
  "totals": [
1222
+ 1830,
1223
+ 1764,
1224
+ 1698,
1225
+ 1632
1226
  ],
1227
  "precisions": [
1228
+ 0.6704918032786885,
1229
+ 0.41326530612244894,
1230
+ 0.2779740871613663,
1231
+ 0.1801470588235294
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 1830,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.343212800768137,
1237
+ "score": 0.343212800768137,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.3002026232607091,
1240
+ "score_ci_high": 0.3993584334850746,
1241
+ "sacrebleu_ci_low": 0.3002026232607091,
1242
+ "sacrebleu_ci_high": 0.3993584334850746
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1100,
1248
+ 577,
1249
+ 339,
1250
+ 205
1251
  ],
1252
  "totals": [
1253
+ 1824,
1254
+ 1758,
1255
+ 1692,
1256
+ 1626
1257
  ],
1258
  "precisions": [
1259
+ 0.6030701754385965,
1260
+ 0.3282138794084187,
1261
+ 0.200354609929078,
1262
+ 0.12607626076260764
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 1824,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.2659128735671552,
1268
+ "score": 0.2659128735671552,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.23124038640104372,
1271
+ "score_ci_high": 0.30957401763446213,
1272
+ "sacrebleu_ci_low": 0.23124038640104372,
1273
+ "sacrebleu_ci_high": 0.30957401763446213
1274
  },
1275
+ "score": 0.27479197337400774,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.4561786095841296,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/{2025-06-19T16-09-06_evaluation_results.json → 2025-06-23T03-17-57_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-19T20:09:01.492000Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,633 +176,633 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.5777777777777777,
180
- "accuracy_ci_low": 0.4777777777777778,
181
- "accuracy_ci_high": 0.6777777777777778,
182
  "score_name": "accuracy",
183
- "score": 0.5777777777777777,
184
- "score_ci_high": 0.6777777777777778,
185
- "score_ci_low": 0.4777777777777778,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.6777777777777778,
190
- "accuracy_ci_low": 0.5777777777777777,
191
- "accuracy_ci_high": 0.7666666666666667,
192
  "score_name": "accuracy",
193
- "score": 0.6777777777777778,
194
- "score_ci_high": 0.7666666666666667,
195
- "score_ci_low": 0.5777777777777777,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.8111111111111111,
200
- "accuracy_ci_low": 0.7222222222222222,
201
- "accuracy_ci_high": 0.8777777777777778,
202
  "score_name": "accuracy",
203
- "score": 0.8111111111111111,
204
- "score_ci_high": 0.8777777777777778,
205
- "score_ci_low": 0.7222222222222222,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.43333333333333335,
210
- "accuracy_ci_low": 0.3333333333333333,
211
- "accuracy_ci_high": 0.5333333333333333,
212
  "score_name": "accuracy",
213
- "score": 0.43333333333333335,
214
- "score_ci_high": 0.5333333333333333,
215
- "score_ci_low": 0.3333333333333333,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
  "accuracy": 0.5888888888888889,
220
- "accuracy_ci_low": 0.48197626978907726,
221
  "accuracy_ci_high": 0.6888888888888889,
222
  "score_name": "accuracy",
223
  "score": 0.5888888888888889,
224
  "score_ci_high": 0.6888888888888889,
225
- "score_ci_low": 0.48197626978907726,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.8777777777777778,
230
- "accuracy_ci_low": 0.8,
231
- "accuracy_ci_high": 0.9333333333333333,
232
  "score_name": "accuracy",
233
- "score": 0.8777777777777778,
234
- "score_ci_high": 0.9333333333333333,
235
- "score_ci_low": 0.8,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.7444444444444445,
240
- "accuracy_ci_low": 0.6444444444444445,
241
- "accuracy_ci_high": 0.8333333333333334,
242
  "score_name": "accuracy",
243
- "score": 0.7444444444444445,
244
- "score_ci_high": 0.8333333333333334,
245
- "score_ci_low": 0.6444444444444445,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
  "accuracy": 0.6222222222222222,
250
- "accuracy_ci_low": 0.5222222222222223,
251
- "accuracy_ci_high": 0.7222222222222222,
252
  "score_name": "accuracy",
253
  "score": 0.6222222222222222,
254
- "score_ci_high": 0.7222222222222222,
255
- "score_ci_low": 0.5222222222222223,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.6,
260
- "accuracy_ci_low": 0.5,
261
- "accuracy_ci_high": 0.7,
262
  "score_name": "accuracy",
263
- "score": 0.6,
264
- "score_ci_high": 0.7,
265
- "score_ci_low": 0.5,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.6333333333333333,
270
- "accuracy_ci_low": 0.5333333333333333,
271
- "accuracy_ci_high": 0.7333333333333333,
272
  "score_name": "accuracy",
273
- "score": 0.6333333333333333,
274
- "score_ci_high": 0.7333333333333333,
275
- "score_ci_low": 0.5333333333333333,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.7,
280
- "accuracy_ci_low": 0.5969530984549517,
281
- "accuracy_ci_high": 0.7798809350059414,
282
  "score_name": "accuracy",
283
- "score": 0.7,
284
- "score_ci_high": 0.7798809350059414,
285
- "score_ci_low": 0.5969530984549517,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.6606060606060606,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.4156626506024097,
307
- "f1_Organization": 0.31372549019607837,
308
- "f1_Location": 0.23140495867768596,
309
- "f1_macro": 0.32026436649205803,
310
- "recall_macro": 0.2686052593300962,
311
- "precision_macro": 0.40524414740424186,
312
- "in_classes_support": 0.6173913043478261,
313
- "f1_micro": 0.26363636363636367,
314
- "recall_micro": 0.2761904761904762,
315
- "precision_micro": 0.25217391304347825,
316
- "score": 0.26363636363636367,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.22691460915409117,
319
- "score_ci_high": 0.3036479298321143,
320
- "f1_micro_ci_low": 0.22691460915409117,
321
- "f1_micro_ci_high": 0.3036479298321143
322
  },
323
- "score": 0.26363636363636367,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.43661971830985913,
330
- "accuracy_ci_low": 0.323943661971831,
331
- "accuracy_ci_high": 0.5633802816901409,
332
  "score_name": "accuracy",
333
- "score": 0.43661971830985913,
334
- "score_ci_high": 0.5633802816901409,
335
- "score_ci_low": 0.323943661971831,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.2112676056338028,
340
- "accuracy_ci_low": 0.1267605633802817,
341
- "accuracy_ci_high": 0.30985915492957744,
342
  "score_name": "accuracy",
343
- "score": 0.2112676056338028,
344
- "score_ci_high": 0.30985915492957744,
345
- "score_ci_low": 0.1267605633802817,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
  "accuracy": 0.23943661971830985,
350
  "accuracy_ci_low": 0.14084507042253522,
351
- "accuracy_ci_high": 0.3380281690140845,
352
  "score_name": "accuracy",
353
  "score": 0.23943661971830985,
354
- "score_ci_high": 0.3380281690140845,
355
  "score_ci_low": 0.14084507042253522,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.3380281690140845,
360
- "accuracy_ci_low": 0.22535211267605634,
361
- "accuracy_ci_high": 0.4507042253521127,
362
  "score_name": "accuracy",
363
- "score": 0.3380281690140845,
364
- "score_ci_high": 0.4507042253521127,
365
- "score_ci_low": 0.22535211267605634,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.38028169014084506,
370
- "accuracy_ci_low": 0.28169014084507044,
371
  "accuracy_ci_high": 0.5070422535211268,
372
  "score_name": "accuracy",
373
- "score": 0.38028169014084506,
374
  "score_ci_high": 0.5070422535211268,
375
- "score_ci_low": 0.28169014084507044,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.2112676056338028,
380
- "accuracy_ci_low": 0.1267605633802817,
381
- "accuracy_ci_high": 0.30985915492957744,
382
  "score_name": "accuracy",
383
- "score": 0.2112676056338028,
384
- "score_ci_high": 0.30985915492957744,
385
- "score_ci_low": 0.1267605633802817,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.19718309859154928,
390
- "accuracy_ci_low": 0.11267605633802817,
391
- "accuracy_ci_high": 0.30985915492957744,
392
  "score_name": "accuracy",
393
- "score": 0.19718309859154928,
394
- "score_ci_high": 0.30985915492957744,
395
- "score_ci_low": 0.11267605633802817,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
  "accuracy": 0.2676056338028169,
400
- "accuracy_ci_low": 0.15492957746478872,
401
- "accuracy_ci_high": 0.38028169014084506,
402
  "score_name": "accuracy",
403
  "score": 0.2676056338028169,
404
- "score_ci_high": 0.38028169014084506,
405
- "score_ci_low": 0.15492957746478872,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.1267605633802817,
410
- "accuracy_ci_low": 0.056338028169014086,
411
- "accuracy_ci_high": 0.22535211267605634,
412
  "score_name": "accuracy",
413
- "score": 0.1267605633802817,
414
- "score_ci_high": 0.22535211267605634,
415
- "score_ci_low": 0.056338028169014086,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.09859154929577464,
420
- "accuracy_ci_low": 0.04225352112676056,
421
- "accuracy_ci_high": 0.18309859154929578,
422
  "score_name": "accuracy",
423
- "score": 0.09859154929577464,
424
- "score_ci_high": 0.18309859154929578,
425
- "score_ci_low": 0.04225352112676056,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.19718309859154928,
430
- "accuracy_ci_low": 0.11267605633802817,
431
- "accuracy_ci_high": 0.29577464788732394,
432
  "score_name": "accuracy",
433
- "score": 0.19718309859154928,
434
- "score_ci_high": 0.29577464788732394,
435
- "score_ci_low": 0.11267605633802817,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.28169014084507044,
440
- "accuracy_ci_low": 0.18309859154929578,
441
- "accuracy_ci_high": 0.39436619718309857,
442
  "score_name": "accuracy",
443
- "score": 0.28169014084507044,
444
- "score_ci_high": 0.39436619718309857,
445
- "score_ci_low": 0.18309859154929578,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.19718309859154928,
450
- "accuracy_ci_low": 0.11267605633802817,
451
- "accuracy_ci_high": 0.29577464788732394,
452
  "score_name": "accuracy",
453
- "score": 0.19718309859154928,
454
- "score_ci_high": 0.29577464788732394,
455
- "score_ci_low": 0.11267605633802817,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.38028169014084506,
460
- "accuracy_ci_low": 0.28169014084507044,
461
- "accuracy_ci_high": 0.5070422535211268,
462
  "score_name": "accuracy",
463
- "score": 0.38028169014084506,
464
- "score_ci_high": 0.5070422535211268,
465
- "score_ci_low": 0.28169014084507044,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.2545271629778672,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.40599479862637755,
475
- "f1_suggestive": 0.16666666666666666,
476
- "f1_generic": 0.2727272727272727,
477
- "f1_arbitrary": 0.5263157894736842,
478
- "f1_fanciful": 0.5777777777777777,
479
- "f1_descriptive": 0.4864864864864865,
480
- "f1_macro_ci_low": 0.31051398566318733,
481
- "f1_macro_ci_high": 0.5136277650253285,
482
  "score_name": "f1_micro",
483
- "score": 0.4457831325301205,
484
- "score_ci_high": 0.550817717180019,
485
- "score_ci_low": 0.3373493975903614,
486
  "num_of_instances": 85,
487
- "accuracy": 0.43529411764705883,
488
- "accuracy_ci_low": 0.32941176470588235,
489
- "accuracy_ci_high": 0.5411764705882353,
490
- "f1_micro": 0.4457831325301205,
491
- "f1_micro_ci_low": 0.3373493975903614,
492
- "f1_micro_ci_high": 0.550817717180019
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.4964746474647465,
496
- "f1_no": 0.7656765676567657,
497
- "f1_yes": 0.22727272727272727,
498
- "f1_macro_ci_low": 0.43223753510688345,
499
- "f1_macro_ci_high": 0.5682608970547502,
500
  "score_name": "f1_micro",
501
  "score": 0.6445012787723785,
502
- "score_ci_high": 0.7025641025641025,
503
- "score_ci_low": 0.5728900255754475,
504
  "num_of_instances": 200,
505
  "accuracy": 0.63,
506
- "accuracy_ci_low": 0.56,
507
  "accuracy_ci_high": 0.69,
508
  "f1_micro": 0.6445012787723785,
509
- "f1_micro_ci_low": 0.5728900255754475,
510
- "f1_micro_ci_high": 0.7025641025641025
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.20456925120109243,
514
- "f1_conclusion": 0.047619047619047616,
515
- "f1_decree": 0.26666666666666666,
516
- "f1_issue": 0.18947368421052632,
517
- "f1_analysis": 0.3125,
518
- "f1_facts": 0.2857142857142857,
519
- "f1_procedural history": 0.19047619047619047,
520
- "f1_rule": 0.13953488372093023,
521
- "f1_macro_ci_low": 0.15195580870715297,
522
- "f1_macro_ci_high": 0.2695847948134964,
523
  "score_name": "f1_micro",
524
- "score": 0.20911528150134048,
525
- "score_ci_high": 0.2716626596010836,
526
- "score_ci_low": 0.15343915343915343,
527
  "num_of_instances": 200,
528
- "accuracy": 0.195,
529
- "accuracy_ci_low": 0.14164584898806754,
530
- "accuracy_ci_high": 0.255,
531
- "f1_micro": 0.20911528150134048,
532
- "f1_micro_ci_low": 0.15343915343915343,
533
- "f1_micro_ci_high": 0.2716626596010836
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.45312172637754033,
537
- "f1_yes": 0.5225225225225225,
538
- "f1_no": 0.38372093023255816,
539
- "f1_macro_ci_low": 0.3838533922470516,
540
- "f1_macro_ci_high": 0.5196559838649608,
541
  "score_name": "f1_micro",
542
- "score": 0.4619289340101523,
543
- "score_ci_high": 0.5291073254863808,
544
- "score_ci_low": 0.39285714285714285,
545
  "num_of_instances": 200,
546
- "accuracy": 0.455,
547
- "accuracy_ci_low": 0.385,
548
- "accuracy_ci_high": 0.525,
549
- "f1_micro": 0.4619289340101523,
550
- "f1_micro_ci_low": 0.39285714285714285,
551
- "f1_micro_ci_high": 0.5291073254863808
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.7812889165628891,
555
- "f1_yes": 0.7671232876712328,
556
- "f1_no": 0.7954545454545454,
557
- "f1_macro_ci_low": 0.6841107145759989,
558
- "f1_macro_ci_high": 0.8570115576895313,
559
  "score_name": "f1_micro",
560
- "score": 0.782608695652174,
561
- "score_ci_high": 0.8554216867469879,
562
- "score_ci_low": 0.6867321408585169,
563
  "num_of_instances": 85,
564
- "accuracy": 0.7411764705882353,
565
- "accuracy_ci_low": 0.6470588235294118,
566
- "accuracy_ci_high": 0.8235294117647058,
567
- "f1_micro": 0.782608695652174,
568
- "f1_micro_ci_low": 0.6867321408585169,
569
- "f1_micro_ci_high": 0.8554216867469879
570
  },
571
- "score": 0.5087874644932331,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.34833143635091446,
578
- "f1_cars": 0.6590909090909091,
579
- "f1_windows x": 0.030303030303030304,
580
- "f1_atheism": 0.19047619047619047,
581
- "f1_christianity": 0.2,
582
- "f1_religion": 0.19047619047619047,
583
- "f1_medicine": 0.6567164179104478,
584
- "f1_computer graphics": 0.34782608695652173,
585
- "f1_microsoft windows": 0.29850746268656714,
586
- "f1_middle east": 0.11764705882352941,
587
- "f1_politics": 0.20754716981132076,
588
- "f1_motorcycles": 0.43373493975903615,
589
- "f1_pc hardware": 0.3973509933774834,
590
- "f1_mac hardware": 0.3950617283950617,
591
- "f1_electronics": 0.4186046511627907,
592
- "f1_for sale": 0.08695652173913043,
593
- "f1_guns": 0.14814814814814814,
594
- "f1_space": 0.4935064935064935,
595
- "f1_cryptography": 0.47368421052631576,
596
- "f1_baseball": 0.6890756302521008,
597
- "f1_hockey": 0.5319148936170213,
598
- "f1_macro_ci_low": 0.32107229927440883,
599
- "f1_macro_ci_high": 0.37798520058634305,
600
  "score_name": "f1_micro",
601
- "score": 0.3750771128932758,
602
- "score_ci_high": 0.40812055333180686,
603
- "score_ci_low": 0.3412059307716769,
604
  "num_of_instances": 1000,
605
- "accuracy": 0.304,
606
- "accuracy_ci_low": 0.275,
607
- "accuracy_ci_high": 0.333,
608
- "f1_micro": 0.3750771128932758,
609
- "f1_micro_ci_low": 0.3412059307716769,
610
- "f1_micro_ci_high": 0.40812055333180686
611
  },
612
- "score": 0.3750771128932758,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.49544359373400404,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.8558139534883721,
620
- "f1_credit card or prepaid card": 0.4778761061946903,
621
- "f1_debt collection": 0.45517241379310347,
622
- "f1_checking or savings account": 0.4791666666666667,
623
- "f1_payday loan or title loan or personal loan": 0.1875,
624
- "f1_vehicle loan or lease": 0.30303030303030304,
625
- "f1_mortgage": 0.6909090909090909,
626
- "f1_money transfer or virtual currency or money service": 0.34285714285714286,
627
  "f1_student loan": 0.6666666666666666,
628
- "f1_macro_ci_low": 0.437780332452402,
629
- "f1_macro_ci_high": 0.5517479827666423,
 
 
 
630
  "score_name": "f1_micro",
631
- "score": 0.7417582417582418,
632
- "score_ci_high": 0.7670380361466397,
633
- "score_ci_low": 0.7139576080586072,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.675,
636
- "accuracy_ci_low": 0.644,
637
- "accuracy_ci_high": 0.705,
638
- "f1_micro": 0.7417582417582418,
639
- "f1_micro_ci_low": 0.7139576080586072,
640
- "f1_micro_ci_high": 0.7670380361466397
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.5417314583908202,
644
- "f1_mortgages and loans": 0.6705202312138728,
645
- "f1_credit card": 0.5815602836879432,
646
- "f1_debt collection": 0.5073170731707317,
647
- "f1_credit reporting": 0.6431372549019608,
648
- "f1_retail banking": 0.30612244897959184,
649
- "f1_macro_ci_low": 0.4990357022335705,
650
- "f1_macro_ci_high": 0.5863760959900322,
651
  "score_name": "f1_micro",
652
- "score": 0.5688073394495413,
653
- "score_ci_high": 0.6118783685965219,
654
- "score_ci_low": 0.5227795175898966,
655
  "num_of_instances": 500,
656
- "accuracy": 0.496,
657
- "accuracy_ci_low": 0.452,
658
- "accuracy_ci_high": 0.538,
659
- "f1_micro": 0.5688073394495413,
660
- "f1_micro_ci_low": 0.5227795175898966,
661
- "f1_micro_ci_high": 0.6118783685965219
662
  },
663
- "score": 0.6552827906038916,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "program_accuracy": 0.107,
671
- "score": 0.107,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.091,
674
- "program_accuracy_ci_low": 0.08721723629561164,
675
- "program_accuracy_ci_high": 0.126,
676
- "score_ci_low": 0.08721723629561164,
677
- "score_ci_high": 0.126,
678
- "execution_accuracy_ci_low": 0.074,
679
- "execution_accuracy_ci_high": 0.109
680
  },
681
- "score": 0.107,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.27411911391097254,
688
- "recall": 0.5230720245824972,
689
- "f1": 0.30495867779564406,
690
- "precision_ci_low": 0.2547573973545031,
691
- "precision_ci_high": 0.2921448180759812,
692
- "recall_ci_low": 0.5070386905327111,
693
- "recall_ci_high": 0.5400839972442947,
694
- "f1_ci_low": 0.28775327483111873,
695
- "f1_ci_high": 0.32083305804218265,
696
  "score_name": "f1",
697
- "score": 0.30495867779564406,
698
- "score_ci_high": 0.32083305804218265,
699
- "score_ci_low": 0.28775327483111873,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5782615457475185,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6598645970225334,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5351804447919131,
704
- "faithfullness_f1_token_overlap": 0.34366402575185945,
705
- "faithfullness_recall_token_overlap": 0.27898727759494896,
706
- "faithfullness_precision_token_overlap": 0.5756506161288797,
707
- "correctness_f1_token_overlap": 0.30495867779564406,
708
- "correctness_recall_token_overlap": 0.5230720245824972,
709
- "correctness_precision_token_overlap": 0.27411911391097254
710
  },
711
- "score": 0.30495867779564406,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.388,
718
- "accuracy_ci_low": 0.358,
719
  "accuracy_ci_high": 0.417,
720
  "score_name": "accuracy",
721
- "score": 0.388,
722
  "score_ci_high": 0.417,
723
- "score_ci_low": 0.358,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
  "accuracy": 0.728,
728
  "accuracy_ci_low": 0.692,
729
- "accuracy_ci_high": 0.764,
730
  "score_name": "accuracy",
731
  "score": 0.728,
732
- "score_ci_high": 0.764,
733
  "score_ci_low": 0.692,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.558,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
- "severity_critical": 0.0,
743
  "severity_high": 0.0,
744
- "severity_medium": 1.0,
745
- "severity_low": 99.0,
746
- "category_harmful_info": 0.8488387024207483,
747
- "category_deception": 0.866081887573907,
748
- "category_explicit_content": 0.8272491132821699,
749
- "category_violence": 0.812888801746839,
750
- "category_discrimination": 0.8425866682378074,
751
- "category_substance_abuse": 0.9123246139950222,
752
- "category_pii": 0.7867479754818811,
753
- "safety": 0.8366520641242258,
754
- "safety_ci_low": 0.8141729035472625,
755
- "safety_ci_high": 0.8588271358338173,
756
  "score_name": "safety",
757
- "score": 0.8366520641242258,
758
- "score_ci_high": 0.8588271358338173,
759
- "score_ci_low": 0.8141729035472625,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8366520641242258,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rouge2": 0.1946123321830468,
770
- "rouge1": 0.4165480993021762,
771
- "rougeLsum": 0.3481713172994178,
772
- "rougeL": 0.2807625515077543,
773
- "score": 0.2807625515077543,
774
  "score_name": "rougeL",
775
- "rouge2_ci_low": 0.18807199323963344,
776
- "rouge2_ci_high": 0.2007897635489223,
777
- "rouge1_ci_low": 0.40802841361294645,
778
- "rouge1_ci_high": 0.4240881706518218,
779
- "rougeLsum_ci_low": 0.34046630242406717,
780
- "rougeLsum_ci_high": 0.35565311375927156,
781
- "rougeL_ci_low": 0.2745389110131783,
782
- "rougeL_ci_high": 0.2870232677361269,
783
- "score_ci_low": 0.2745389110131783,
784
- "score_ci_high": 0.2870232677361269
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rouge2": 0.013499529737597162,
789
- "rouge1": 0.1111081768530587,
790
- "rougeLsum": 0.0922891945088228,
791
- "rougeL": 0.07992633322696455,
792
- "score": 0.07992633322696455,
793
  "score_name": "rougeL",
794
- "rouge2_ci_low": 0.012006599571612698,
795
- "rouge2_ci_high": 0.015167305255576668,
796
- "rouge1_ci_low": 0.10596612811589602,
797
- "rouge1_ci_high": 0.11561580840527891,
798
- "rougeLsum_ci_low": 0.08846385121818591,
799
- "rougeLsum_ci_high": 0.09604727885686246,
800
- "rougeL_ci_low": 0.0765698806517895,
801
- "rougeL_ci_high": 0.0830415577853562,
802
- "score_ci_low": 0.0765698806517895,
803
- "score_ci_high": 0.0830415577853562
804
  },
805
- "score": 0.18034444236735941,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,473 +810,473 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 1002,
814
- 497,
815
- 282,
816
- 169
817
  ],
818
  "totals": [
819
- 1844,
820
- 1778,
821
- 1712,
822
- 1646
823
  ],
824
  "precisions": [
825
- 0.5433839479392625,
826
- 0.2795275590551181,
827
- 0.1647196261682243,
828
- 0.10267314702308626
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1844,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.22513002244943295,
834
- "score": 0.22513002244943295,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.18442974136293813,
837
- "score_ci_high": 0.26207208386144243,
838
- "sacrebleu_ci_low": 0.18442974136293813,
839
- "sacrebleu_ci_high": 0.26207208386144243
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1117,
845
- 634,
846
- 393,
847
- 252
848
  ],
849
  "totals": [
850
- 1756,
851
- 1690,
852
- 1624,
853
- 1558
854
  ],
855
  "precisions": [
856
- 0.6361047835990888,
857
- 0.37514792899408284,
858
- 0.2419950738916256,
859
- 0.16174582798459564
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1756,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.3108799453655372,
865
- "score": 0.3108799453655372,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.26939824952148095,
868
- "score_ci_high": 0.35631092704009654,
869
- "sacrebleu_ci_low": 0.26939824952148095,
870
- "sacrebleu_ci_high": 0.35631092704009654
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
- 518,
876
- 145,
877
- 48,
878
- 15
879
  ],
880
  "totals": [
881
- 1654,
882
- 1588,
883
- 1522,
884
- 1456
885
  ],
886
  "precisions": [
887
- 0.313180169286578,
888
- 0.09130982367758186,
889
- 0.03153745072273324,
890
- 0.0103021978021978
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 1654,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.055209912255726495,
896
- "score": 0.055209912255726495,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.04194038184823177,
899
- "score_ci_high": 0.08190458345551464,
900
- "sacrebleu_ci_low": 0.04194038184823177,
901
- "sacrebleu_ci_high": 0.08190458345551464
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 932,
907
- 408,
908
- 209,
909
- 113
910
  ],
911
  "totals": [
912
- 1864,
913
- 1798,
914
- 1732,
915
- 1666
916
  ],
917
  "precisions": [
918
- 0.5,
919
- 0.22691879866518352,
920
- 0.12066974595842955,
921
- 0.06782713085234093
922
  ],
923
- "bp": 1.0,
924
- "sys_len": 1864,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.17456637003886807,
927
- "score": 0.17456637003886807,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.1446545609941566,
930
- "score_ci_high": 0.2117995060278497,
931
- "sacrebleu_ci_low": 0.1446545609941566,
932
- "sacrebleu_ci_high": 0.2117995060278497
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1281,
938
- 791,
939
- 540,
940
- 380
941
  ],
942
  "totals": [
943
- 2064,
944
- 1998,
945
- 1932,
946
- 1866
947
  ],
948
  "precisions": [
949
- 0.6206395348837209,
950
- 0.39589589589589586,
951
- 0.27950310559006214,
952
- 0.20364415862808144
953
  ],
954
- "bp": 0.9980638921833086,
955
- "sys_len": 2064,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.3432243584069162,
958
- "score": 0.3432243584069162,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.31863429882294914,
961
- "score_ci_high": 0.38949054032191865,
962
- "sacrebleu_ci_low": 0.31863429882294914,
963
- "sacrebleu_ci_high": 0.38949054032191865
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 943,
969
- 328,
970
- 141,
971
- 66
972
  ],
973
  "totals": [
974
- 3253,
975
- 3187,
976
- 3121,
977
- 3055
978
  ],
979
  "precisions": [
980
- 0.28988625883799574,
981
- 0.10291810480075306,
982
- 0.04517782761935277,
983
- 0.02160392798690671
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 3253,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.07345889118508468,
989
- "score": 0.07345889118508468,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.06129370081536535,
992
- "score_ci_high": 0.09632357969438955,
993
- "sacrebleu_ci_low": 0.06129370081536535,
994
- "sacrebleu_ci_high": 0.09632357969438955
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1229,
1000
- 727,
1001
  477,
1002
- 316
1003
  ],
1004
  "totals": [
1005
- 1915,
1006
- 1849,
1007
- 1783,
1008
- 1717
1009
  ],
1010
  "precisions": [
1011
- 0.64177545691906,
1012
- 0.3931855056787452,
1013
- 0.2675266404935502,
1014
- 0.18404193360512522
1015
  ],
1016
- "bp": 0.9994779431076575,
1017
- "sys_len": 1915,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.3336870259850046,
1020
- "score": 0.3336870259850046,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.30427044055098784,
1023
- "score_ci_high": 0.3722955797843792,
1024
- "sacrebleu_ci_low": 0.30427044055098784,
1025
- "sacrebleu_ci_high": 0.3722955797843792
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 776,
1031
- 292,
1032
- 137,
1033
- 66
1034
  ],
1035
  "totals": [
1036
- 2002,
1037
- 1936,
1038
- 1870,
1039
- 1804
1040
  ],
1041
  "precisions": [
1042
- 0.3876123876123876,
1043
- 0.15082644628099173,
1044
- 0.0732620320855615,
1045
- 0.03658536585365854
1046
  ],
1047
  "bp": 1.0,
1048
- "sys_len": 2002,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.11188332833173187,
1051
- "score": 0.11188332833173187,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.09410502256897492,
1054
- "score_ci_high": 0.14085608331376978,
1055
- "sacrebleu_ci_low": 0.09410502256897492,
1056
- "sacrebleu_ci_high": 0.14085608331376978
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
- 1155,
1062
- 545,
1063
- 284,
1064
- 157
1065
  ],
1066
  "totals": [
1067
- 2065,
1068
- 1999,
1069
- 1933,
1070
- 1867
1071
  ],
1072
  "precisions": [
1073
- 0.559322033898305,
1074
- 0.2726363181590795,
1075
- 0.14692188308329024,
1076
- 0.08409212640599893
1077
  ],
1078
- "bp": 0.9841463832388515,
1079
- "sys_len": 2065,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.20503668186599197,
1082
- "score": 0.20503668186599197,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.17287865798739985,
1085
- "score_ci_high": 0.22839534867300298,
1086
- "sacrebleu_ci_low": 0.17287865798739985,
1087
- "sacrebleu_ci_high": 0.22839534867300298
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1154,
1093
- 673,
1094
- 416,
1095
- 270
1096
  ],
1097
  "totals": [
1098
- 1805,
1099
- 1739,
1100
- 1673,
1101
- 1607
1102
  ],
1103
  "precisions": [
1104
- 0.6393351800554017,
1105
- 0.38700402530189765,
1106
- 0.24865511057979678,
1107
- 0.16801493466085873
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1805,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.31885801670800706,
1113
- "score": 0.31885801670800706,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.2891892296952914,
1116
- "score_ci_high": 0.36690842488060277,
1117
- "sacrebleu_ci_low": 0.2891892296952914,
1118
- "sacrebleu_ci_high": 0.36690842488060277
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 932,
1124
- 394,
1125
- 198,
1126
- 108
1127
  ],
1128
  "totals": [
1129
- 2022,
1130
- 1956,
1131
- 1890,
1132
- 1824
1133
  ],
1134
  "precisions": [
1135
- 0.4609297725024728,
1136
- 0.20143149284253578,
1137
- 0.10476190476190476,
1138
- 0.05921052631578948
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 2022,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.15491415770056607,
1144
- "score": 0.15491415770056607,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.12994962612263172,
1147
- "score_ci_high": 0.1870372431393324,
1148
- "sacrebleu_ci_low": 0.12994962612263172,
1149
- "sacrebleu_ci_high": 0.1870372431393324
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 842,
1155
- 316,
1156
- 161,
1157
- 84
1158
  ],
1159
  "totals": [
1160
- 1863,
1161
- 1797,
1162
- 1731,
1163
- 1665
1164
  ],
1165
  "precisions": [
1166
- 0.451959205582394,
1167
- 0.17584863661658318,
1168
- 0.09300982091276719,
1169
- 0.05045045045045045
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 1863,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.1389658296637508,
1175
- "score": 0.1389658296637508,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.11444856857935395,
1178
- "score_ci_high": 0.17296950184028867,
1179
- "sacrebleu_ci_low": 0.11444856857935395,
1180
- "sacrebleu_ci_high": 0.17296950184028867
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1125,
1186
- 650,
1187
- 405,
1188
  268
1189
  ],
1190
  "totals": [
1191
- 1763,
1192
- 1697,
1193
- 1631,
1194
- 1565
1195
  ],
1196
  "precisions": [
1197
- 0.6381168462847419,
1198
- 0.38302887448438416,
1199
- 0.24831391784181484,
1200
- 0.17124600638977636
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1763,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.31929220031180316,
1206
- "score": 0.31929220031180316,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.27700187051697595,
1209
- "score_ci_high": 0.3625973900935978,
1210
- "sacrebleu_ci_low": 0.27700187051697595,
1211
- "sacrebleu_ci_high": 0.3625973900935978
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
- 1107,
1217
- 596,
1218
- 373,
1219
- 242
1220
  ],
1221
  "totals": [
1222
- 1855,
1223
- 1789,
1224
- 1723,
1225
- 1657
1226
  ],
1227
  "precisions": [
1228
- 0.5967654986522911,
1229
- 0.33314700950251536,
1230
- 0.21648287869994196,
1231
- 0.1460470730235365
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1855,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.28157170427283745,
1237
- "score": 0.28157170427283745,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.24793101467267725,
1240
- "score_ci_high": 0.34906947598840327,
1241
- "sacrebleu_ci_low": 0.24793101467267725,
1242
- "sacrebleu_ci_high": 0.34906947598840327
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1030,
1248
- 507,
1249
- 273,
1250
- 149
1251
  ],
1252
  "totals": [
1253
- 1843,
1254
- 1777,
1255
- 1711,
1256
- 1645
1257
  ],
1258
  "precisions": [
1259
- 0.5588714053174173,
1260
- 0.28531232414181207,
1261
- 0.15955581531268265,
1262
- 0.0905775075987842
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1843,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.21909948470764218,
1268
- "score": 0.21909948470764218,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.18945741585306675,
1271
- "score_ci_high": 0.26472135303184596,
1272
- "sacrebleu_ci_low": 0.18945741585306675,
1273
- "sacrebleu_ci_high": 0.26472135303184596
1274
  },
1275
- "score": 0.21771852861659338,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.4171223590857319,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-23T07:17:53.366963Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.5333333333333333,
180
+ "accuracy_ci_low": 0.4444444444444444,
181
+ "accuracy_ci_high": 0.6378611050272702,
182
  "score_name": "accuracy",
183
+ "score": 0.5333333333333333,
184
+ "score_ci_high": 0.6378611050272702,
185
+ "score_ci_low": 0.4444444444444444,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.7111111111111111,
190
+ "accuracy_ci_low": 0.6222222222222222,
191
+ "accuracy_ci_high": 0.7888888888888889,
192
  "score_name": "accuracy",
193
+ "score": 0.7111111111111111,
194
+ "score_ci_high": 0.7888888888888889,
195
+ "score_ci_low": 0.6222222222222222,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 0.7444444444444445,
200
+ "accuracy_ci_low": 0.6555555555555556,
201
+ "accuracy_ci_high": 0.8333333333333334,
202
  "score_name": "accuracy",
203
+ "score": 0.7444444444444445,
204
+ "score_ci_high": 0.8333333333333334,
205
+ "score_ci_low": 0.6555555555555556,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.45555555555555555,
210
+ "accuracy_ci_low": 0.35555555555555557,
211
+ "accuracy_ci_high": 0.5555555555555556,
212
  "score_name": "accuracy",
213
+ "score": 0.45555555555555555,
214
+ "score_ci_high": 0.5555555555555556,
215
+ "score_ci_low": 0.35555555555555557,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
  "accuracy": 0.5888888888888889,
220
+ "accuracy_ci_low": 0.4888888888888889,
221
  "accuracy_ci_high": 0.6888888888888889,
222
  "score_name": "accuracy",
223
  "score": 0.5888888888888889,
224
  "score_ci_high": 0.6888888888888889,
225
+ "score_ci_low": 0.4888888888888889,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.8111111111111111,
230
+ "accuracy_ci_low": 0.7222222222222222,
231
+ "accuracy_ci_high": 0.8777777777777778,
232
  "score_name": "accuracy",
233
+ "score": 0.8111111111111111,
234
+ "score_ci_high": 0.8777777777777778,
235
+ "score_ci_low": 0.7222222222222222,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.7333333333333333,
240
+ "accuracy_ci_low": 0.6333333333333333,
241
+ "accuracy_ci_high": 0.8111111111111111,
242
  "score_name": "accuracy",
243
+ "score": 0.7333333333333333,
244
+ "score_ci_high": 0.8111111111111111,
245
+ "score_ci_low": 0.6333333333333333,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
  "accuracy": 0.6222222222222222,
250
+ "accuracy_ci_low": 0.5111111111111111,
251
+ "accuracy_ci_high": 0.7111111111111111,
252
  "score_name": "accuracy",
253
  "score": 0.6222222222222222,
254
+ "score_ci_high": 0.7111111111111111,
255
+ "score_ci_low": 0.5111111111111111,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.6444444444444445,
260
+ "accuracy_ci_low": 0.5444444444444444,
261
+ "accuracy_ci_high": 0.7444444444444445,
262
  "score_name": "accuracy",
263
+ "score": 0.6444444444444445,
264
+ "score_ci_high": 0.7444444444444445,
265
+ "score_ci_low": 0.5444444444444444,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.7,
270
+ "accuracy_ci_low": 0.6,
271
+ "accuracy_ci_high": 0.7888888888888889,
272
  "score_name": "accuracy",
273
+ "score": 0.7,
274
+ "score_ci_high": 0.7888888888888889,
275
+ "score_ci_low": 0.6,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.6222222222222222,
280
+ "accuracy_ci_low": 0.5222222222222223,
281
+ "accuracy_ci_high": 0.7222222222222222,
282
  "score_name": "accuracy",
283
+ "score": 0.6222222222222222,
284
+ "score_ci_high": 0.7222222222222222,
285
+ "score_ci_low": 0.5222222222222223,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.6515151515151515,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.03687821612349914,
296
+ "score": 0.03687821612349914,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.03687821612349914,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.3902439024390244,
307
+ "f1_Organization": 0.29283489096573206,
308
+ "f1_Location": 0.2572614107883817,
309
+ "f1_macro": 0.31344673473104606,
310
+ "recall_macro": 0.2655047696270643,
311
+ "precision_macro": 0.3964060432628696,
312
+ "in_classes_support": 0.6260720411663807,
313
+ "f1_micro": 0.25631768953068595,
314
+ "recall_micro": 0.2704761904761905,
315
+ "precision_micro": 0.24356775300171526,
316
+ "score": 0.25631768953068595,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.22100954853501506,
319
+ "score_ci_high": 0.2947346870824505,
320
+ "f1_micro_ci_low": 0.22100954853501506,
321
+ "f1_micro_ci_high": 0.2947346870824505
322
  },
323
+ "score": 0.25631768953068595,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.4788732394366197,
330
+ "accuracy_ci_low": 0.36619718309859156,
331
+ "accuracy_ci_high": 0.5915492957746479,
332
  "score_name": "accuracy",
333
+ "score": 0.4788732394366197,
334
+ "score_ci_high": 0.5915492957746479,
335
+ "score_ci_low": 0.36619718309859156,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.23943661971830985,
340
+ "accuracy_ci_low": 0.15492957746478872,
341
+ "accuracy_ci_high": 0.3380281690140845,
342
  "score_name": "accuracy",
343
+ "score": 0.23943661971830985,
344
+ "score_ci_high": 0.3380281690140845,
345
+ "score_ci_low": 0.15492957746478872,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
  "accuracy": 0.23943661971830985,
350
  "accuracy_ci_low": 0.14084507042253522,
351
+ "accuracy_ci_high": 0.352112676056338,
352
  "score_name": "accuracy",
353
  "score": 0.23943661971830985,
354
+ "score_ci_high": 0.352112676056338,
355
  "score_ci_low": 0.14084507042253522,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.2535211267605634,
360
+ "accuracy_ci_low": 0.15492957746478872,
361
+ "accuracy_ci_high": 0.36619718309859156,
362
  "score_name": "accuracy",
363
+ "score": 0.2535211267605634,
364
+ "score_ci_high": 0.36619718309859156,
365
+ "score_ci_low": 0.15492957746478872,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.39436619718309857,
370
+ "accuracy_ci_low": 0.29577464788732394,
371
  "accuracy_ci_high": 0.5070422535211268,
372
  "score_name": "accuracy",
373
+ "score": 0.39436619718309857,
374
  "score_ci_high": 0.5070422535211268,
375
+ "score_ci_low": 0.29577464788732394,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.16901408450704225,
380
+ "accuracy_ci_low": 0.09859154929577464,
381
+ "accuracy_ci_high": 0.2676056338028169,
382
  "score_name": "accuracy",
383
+ "score": 0.16901408450704225,
384
+ "score_ci_high": 0.2676056338028169,
385
+ "score_ci_low": 0.09859154929577464,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.2112676056338028,
390
+ "accuracy_ci_low": 0.1267605633802817,
391
+ "accuracy_ci_high": 0.33217670597601795,
392
  "score_name": "accuracy",
393
+ "score": 0.2112676056338028,
394
+ "score_ci_high": 0.33217670597601795,
395
+ "score_ci_low": 0.1267605633802817,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
  "accuracy": 0.2676056338028169,
400
+ "accuracy_ci_low": 0.16901408450704225,
401
+ "accuracy_ci_high": 0.36619718309859156,
402
  "score_name": "accuracy",
403
  "score": 0.2676056338028169,
404
+ "score_ci_high": 0.36619718309859156,
405
+ "score_ci_low": 0.16901408450704225,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.23943661971830985,
410
+ "accuracy_ci_low": 0.15492957746478872,
411
+ "accuracy_ci_high": 0.352112676056338,
412
  "score_name": "accuracy",
413
+ "score": 0.23943661971830985,
414
+ "score_ci_high": 0.352112676056338,
415
+ "score_ci_low": 0.15492957746478872,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.15492957746478872,
420
+ "accuracy_ci_low": 0.08450704225352113,
421
+ "accuracy_ci_high": 0.2535211267605634,
422
  "score_name": "accuracy",
423
+ "score": 0.15492957746478872,
424
+ "score_ci_high": 0.2535211267605634,
425
+ "score_ci_low": 0.08450704225352113,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.18309859154929578,
430
+ "accuracy_ci_low": 0.09859154929577464,
431
+ "accuracy_ci_high": 0.28169014084507044,
432
  "score_name": "accuracy",
433
+ "score": 0.18309859154929578,
434
+ "score_ci_high": 0.28169014084507044,
435
+ "score_ci_low": 0.09859154929577464,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.23943661971830985,
440
+ "accuracy_ci_low": 0.14084507042253522,
441
+ "accuracy_ci_high": 0.352112676056338,
442
  "score_name": "accuracy",
443
+ "score": 0.23943661971830985,
444
+ "score_ci_high": 0.352112676056338,
445
+ "score_ci_low": 0.14084507042253522,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.2676056338028169,
450
+ "accuracy_ci_low": 0.17514498933734307,
451
+ "accuracy_ci_high": 0.38028169014084506,
452
  "score_name": "accuracy",
453
+ "score": 0.2676056338028169,
454
+ "score_ci_high": 0.38028169014084506,
455
+ "score_ci_low": 0.17514498933734307,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.43661971830985913,
460
+ "accuracy_ci_low": 0.323943661971831,
461
+ "accuracy_ci_high": 0.5492957746478874,
462
  "score_name": "accuracy",
463
+ "score": 0.43661971830985913,
464
+ "score_ci_high": 0.5492957746478874,
465
+ "score_ci_low": 0.323943661971831,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.26961770623742454,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.23241604568835691,
475
+ "f1_suggestive": 0.08695652173913043,
476
+ "f1_descriptive": 0.2631578947368421,
477
+ "f1_generic": 0.0,
478
+ "f1_arbitrary": 0.3888888888888889,
479
+ "f1_fanciful": 0.4230769230769231,
480
+ "f1_macro_ci_low": 0.16927841023118298,
481
+ "f1_macro_ci_high": 0.32467849714540287,
482
  "score_name": "f1_micro",
483
+ "score": 0.2891566265060241,
484
+ "score_ci_high": 0.40476190476190477,
485
+ "score_ci_low": 0.2054361335527834,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.2823529411764706,
488
+ "accuracy_ci_low": 0.2,
489
+ "accuracy_ci_high": 0.4,
490
+ "f1_micro": 0.2891566265060241,
491
+ "f1_micro_ci_low": 0.2054361335527834,
492
+ "f1_micro_ci_high": 0.40476190476190477
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.4965214761040533,
496
+ "f1_no": 0.7631578947368421,
497
+ "f1_yes": 0.22988505747126436,
498
+ "f1_macro_ci_low": 0.43244851636549736,
499
+ "f1_macro_ci_high": 0.5729228740221988,
500
  "score_name": "f1_micro",
501
  "score": 0.6445012787723785,
502
+ "score_ci_high": 0.69946202795028,
503
+ "score_ci_low": 0.570694087403599,
504
  "num_of_instances": 200,
505
  "accuracy": 0.63,
506
+ "accuracy_ci_low": 0.5561546872315049,
507
  "accuracy_ci_high": 0.69,
508
  "f1_micro": 0.6445012787723785,
509
+ "f1_micro_ci_low": 0.570694087403599,
510
+ "f1_micro_ci_high": 0.69946202795028
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.22997089242124882,
514
+ "f1_conclusion": 0.04878048780487805,
515
+ "f1_analysis": 0.3333333333333333,
516
+ "f1_decree": 0.2926829268292683,
517
+ "f1_issue": 0.21978021978021978,
518
+ "f1_procedural history": 0.05,
519
+ "f1_facts": 0.2978723404255319,
520
+ "f1_rule": 0.3673469387755102,
521
+ "f1_macro_ci_low": 0.18026075783829068,
522
+ "f1_macro_ci_high": 0.2946257845154891,
523
  "score_name": "f1_micro",
524
+ "score": 0.24146981627296588,
525
+ "score_ci_high": 0.3019289134511566,
526
+ "score_ci_low": 0.18181818181818182,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.23,
529
+ "accuracy_ci_low": 0.175,
530
+ "accuracy_ci_high": 0.29,
531
+ "f1_micro": 0.24146981627296588,
532
+ "f1_micro_ci_low": 0.18181818181818182,
533
+ "f1_micro_ci_high": 0.3019289134511566
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.4719581626606899,
537
+ "f1_yes": 0.5462555066079295,
538
+ "f1_no": 0.39766081871345027,
539
+ "f1_macro_ci_low": 0.4067534798719593,
540
+ "f1_macro_ci_high": 0.5312059177934843,
541
  "score_name": "f1_micro",
542
+ "score": 0.4824120603015075,
543
+ "score_ci_high": 0.5413533834586466,
544
+ "score_ci_low": 0.41550674904624724,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.48,
547
+ "accuracy_ci_low": 0.415,
548
+ "accuracy_ci_high": 0.54,
549
+ "f1_micro": 0.4824120603015075,
550
+ "f1_micro_ci_low": 0.41550674904624724,
551
+ "f1_micro_ci_high": 0.5413533834586466
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.8112290008841733,
555
+ "f1_yes": 0.7948717948717948,
556
+ "f1_no": 0.8275862068965517,
557
+ "f1_macro_ci_low": 0.7184910169578117,
558
+ "f1_macro_ci_high": 0.8804600933253673,
559
  "score_name": "f1_micro",
560
+ "score": 0.8121212121212121,
561
+ "score_ci_high": 0.8795180722891566,
562
+ "score_ci_low": 0.7203411511997481,
563
  "num_of_instances": 85,
564
+ "accuracy": 0.788235294117647,
565
+ "accuracy_ci_low": 0.6941176470588235,
566
+ "accuracy_ci_high": 0.8588235294117647,
567
+ "f1_micro": 0.8121212121212121,
568
+ "f1_micro_ci_low": 0.7203411511997481,
569
+ "f1_micro_ci_high": 0.8795180722891566
570
  },
571
+ "score": 0.49393219879481765,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.3469646355526677,
578
+ "f1_cars": 0.5517241379310345,
579
+ "f1_windows x": 0.0,
580
+ "f1_atheism": 0.23809523809523808,
581
+ "f1_christianity": 0.2028985507246377,
582
+ "f1_religion": 0.1941747572815534,
583
+ "f1_medicine": 0.6060606060606061,
584
+ "f1_computer graphics": 0.3488372093023256,
585
+ "f1_microsoft windows": 0.3188405797101449,
586
+ "f1_middle east": 0.11538461538461539,
587
+ "f1_politics": 0.3047619047619048,
588
+ "f1_motorcycles": 0.5227272727272727,
589
+ "f1_baseball": 0.6984126984126984,
590
+ "f1_pc hardware": 0.3684210526315789,
591
+ "f1_mac hardware": 0.37037037037037035,
592
+ "f1_for sale": 0.08888888888888889,
593
+ "f1_guns": 0.18181818181818182,
594
+ "f1_space": 0.4810126582278481,
595
+ "f1_cryptography": 0.48484848484848486,
596
+ "f1_hockey": 0.4666666666666667,
597
+ "f1_electronics": 0.3953488372093023,
598
+ "f1_macro_ci_low": 0.32234813592441613,
599
+ "f1_macro_ci_high": 0.38044336501459297,
600
  "score_name": "f1_micro",
601
+ "score": 0.37344913151364767,
602
+ "score_ci_high": 0.40609658022784717,
603
+ "score_ci_low": 0.34207641792416926,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.301,
606
+ "accuracy_ci_low": 0.274,
607
+ "accuracy_ci_high": 0.329023179612989,
608
+ "f1_micro": 0.37344913151364767,
609
+ "f1_micro_ci_low": 0.34207641792416926,
610
+ "f1_micro_ci_high": 0.40609658022784717
611
  },
612
+ "score": 0.37344913151364767,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.4835930003981669,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8920327624720774,
620
+ "f1_checking or savings account": 0.42105263157894735,
621
+ "f1_credit card or prepaid card": 0.5666666666666667,
622
+ "f1_debt collection": 0.38666666666666666,
623
+ "f1_mortgage": 0.7096774193548387,
624
  "f1_student loan": 0.6666666666666666,
625
+ "f1_money transfer or virtual currency or money service": 0.3125,
626
+ "f1_vehicle loan or lease": 0.27586206896551724,
627
+ "f1_payday loan or title loan or personal loan": 0.12121212121212122,
628
+ "f1_macro_ci_low": 0.43264552909234405,
629
+ "f1_macro_ci_high": 0.5420653283436574,
630
  "score_name": "f1_micro",
631
+ "score": 0.7693953986088817,
632
+ "score_ci_high": 0.7940535810044251,
633
+ "score_ci_low": 0.7428249604302373,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.719,
636
+ "accuracy_ci_low": 0.688,
637
+ "accuracy_ci_high": 0.746,
638
+ "f1_micro": 0.7693953986088817,
639
+ "f1_micro_ci_low": 0.7428249604302373,
640
+ "f1_micro_ci_high": 0.7940535810044251
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.5138940708414392,
644
+ "f1_mortgages and loans": 0.6742857142857143,
645
+ "f1_credit card": 0.5314685314685315,
646
+ "f1_debt collection": 0.5,
647
+ "f1_credit reporting": 0.6742424242424242,
648
+ "f1_retail banking": 0.18947368421052632,
649
+ "f1_macro_ci_low": 0.47276041465254326,
650
+ "f1_macro_ci_high": 0.561073606935457,
651
  "score_name": "f1_micro",
652
+ "score": 0.5587229190421893,
653
+ "score_ci_high": 0.6032761107151652,
654
+ "score_ci_low": 0.5136696359618879,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.49,
657
+ "accuracy_ci_low": 0.45,
658
+ "accuracy_ci_high": 0.536,
659
+ "f1_micro": 0.5587229190421893,
660
+ "f1_micro_ci_low": 0.5136696359618879,
661
+ "f1_micro_ci_high": 0.6032761107151652
662
  },
663
+ "score": 0.6640591588255356,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "program_accuracy": 0.112,
671
+ "score": 0.112,
672
  "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.098,
674
+ "program_accuracy_ci_low": 0.092,
675
+ "program_accuracy_ci_high": 0.134,
676
+ "score_ci_low": 0.092,
677
+ "score_ci_high": 0.134,
678
+ "execution_accuracy_ci_low": 0.081,
679
+ "execution_accuracy_ci_high": 0.11876030243075729
680
  },
681
+ "score": 0.112,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.27803081578951677,
688
+ "recall": 0.5263554366544182,
689
+ "f1": 0.3099512752592042,
690
+ "precision_ci_low": 0.2593667444116527,
691
+ "precision_ci_high": 0.296940372694636,
692
+ "recall_ci_low": 0.5097202360703201,
693
+ "recall_ci_high": 0.5429203378240791,
694
+ "f1_ci_low": 0.2936726592020454,
695
+ "f1_ci_high": 0.3271680931787992,
696
  "score_name": "f1",
697
+ "score": 0.3099512752592042,
698
+ "score_ci_high": 0.3271680931787992,
699
+ "score_ci_low": 0.2936726592020454,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5813682861626148,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6611752705772718,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5378807228555282,
704
+ "faithfullness_f1_token_overlap": 0.34377396506102365,
705
+ "faithfullness_recall_token_overlap": 0.2786846668859655,
706
+ "faithfullness_precision_token_overlap": 0.5693429043011381,
707
+ "correctness_f1_token_overlap": 0.3099512752592042,
708
+ "correctness_recall_token_overlap": 0.5263554366544182,
709
+ "correctness_precision_token_overlap": 0.27803081578951677
710
  },
711
+ "score": 0.3099512752592042,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.387,
718
+ "accuracy_ci_low": 0.355,
719
  "accuracy_ci_high": 0.417,
720
  "score_name": "accuracy",
721
+ "score": 0.387,
722
  "score_ci_high": 0.417,
723
+ "score_ci_low": 0.355,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
  "accuracy": 0.728,
728
  "accuracy_ci_low": 0.692,
729
+ "accuracy_ci_high": 0.768,
730
  "score_name": "accuracy",
731
  "score": 0.728,
732
+ "score_ci_high": 0.768,
733
  "score_ci_low": 0.692,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.5575,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
+ "severity_critical": 2.0,
743
  "severity_high": 0.0,
744
+ "severity_medium": 3.0,
745
+ "severity_low": 95.0,
746
+ "category_harmful_info": 0.790451957932895,
747
+ "category_deception": 0.8851025405556264,
748
+ "category_explicit_content": 0.790092716915439,
749
+ "category_violence": 0.7921644449869674,
750
+ "category_discrimination": 0.839280919423179,
751
+ "category_substance_abuse": 0.8701996730433569,
752
+ "category_pii": 0.7343945315418144,
753
+ "safety": 0.8071470662465113,
754
+ "safety_ci_low": 0.7772986218937553,
755
+ "safety_ci_high": 0.8319065673300019,
756
  "score_name": "safety",
757
+ "score": 0.8071470662465113,
758
+ "score_ci_high": 0.8319065673300019,
759
+ "score_ci_low": 0.7772986218937553,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.8071470662465113,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rougeLsum": 0.3492364214226588,
770
+ "rouge1": 0.41657427755274906,
771
+ "rouge2": 0.19392531737812374,
772
+ "rougeL": 0.28108832023050123,
773
+ "score": 0.28108832023050123,
774
  "score_name": "rougeL",
775
+ "rougeLsum_ci_low": 0.34133013043796767,
776
+ "rougeLsum_ci_high": 0.3567955471410065,
777
+ "rouge1_ci_low": 0.4080039541276808,
778
+ "rouge1_ci_high": 0.4246243250973701,
779
+ "rouge2_ci_low": 0.18712662527227458,
780
+ "rouge2_ci_high": 0.200448642429914,
781
+ "rougeL_ci_low": 0.2745892481615738,
782
+ "rougeL_ci_high": 0.2875837184145128,
783
+ "score_ci_low": 0.2745892481615738,
784
+ "score_ci_high": 0.2875837184145128
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rougeLsum": 0.0914997584684418,
789
+ "rouge1": 0.11103180621679556,
790
+ "rouge2": 0.013425878438988716,
791
+ "rougeL": 0.07983541241124872,
792
+ "score": 0.07983541241124872,
793
  "score_name": "rougeL",
794
+ "rougeLsum_ci_low": 0.08781447896275059,
795
+ "rougeLsum_ci_high": 0.0953497661867097,
796
+ "rouge1_ci_low": 0.10615759057700462,
797
+ "rouge1_ci_high": 0.11562260974835847,
798
+ "rouge2_ci_low": 0.012023789954203338,
799
+ "rouge2_ci_high": 0.015059698304736774,
800
+ "rougeL_ci_low": 0.07657318636107396,
801
+ "rougeL_ci_high": 0.08299164478552631,
802
+ "score_ci_low": 0.07657318636107396,
803
+ "score_ci_high": 0.08299164478552631
804
  },
805
+ "score": 0.18046186632087496,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1003,
814
+ 498,
815
+ 270,
816
+ 157
817
  ],
818
  "totals": [
819
+ 1854,
820
+ 1788,
821
+ 1722,
822
+ 1656
823
  ],
824
  "precisions": [
825
+ 0.5409924487594391,
826
+ 0.2785234899328859,
827
+ 0.156794425087108,
828
+ 0.09480676328502416
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 1854,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.2175483241536988,
834
+ "score": 0.2175483241536988,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.18247071615886146,
837
+ "score_ci_high": 0.24704171532422453,
838
+ "sacrebleu_ci_low": 0.18247071615886146,
839
+ "sacrebleu_ci_high": 0.24704171532422453
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1080,
845
+ 568,
846
+ 332,
847
+ 190
848
  ],
849
  "totals": [
850
+ 1763,
851
+ 1697,
852
+ 1631,
853
+ 1565
854
  ],
855
  "precisions": [
856
+ 0.6125921724333522,
857
+ 0.33470830878020036,
858
+ 0.20355610055180873,
859
+ 0.12140575079872205
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 1763,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.26680276716067836,
865
+ "score": 0.26680276716067836,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.23826229076053318,
868
+ "score_ci_high": 0.2977813737555276,
869
+ "sacrebleu_ci_low": 0.23826229076053318,
870
+ "sacrebleu_ci_high": 0.2977813737555276
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
+ 553,
876
+ 147,
877
+ 54,
878
+ 16
879
  ],
880
  "totals": [
881
+ 1726,
882
+ 1660,
883
+ 1594,
884
+ 1528
885
  ],
886
  "precisions": [
887
+ 0.3203939745075319,
888
+ 0.08855421686746988,
889
+ 0.033877038895859475,
890
+ 0.010471204188481676
891
  ],
892
  "bp": 1.0,
893
+ "sys_len": 1726,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.056324703529775505,
896
+ "score": 0.056324703529775505,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.04391320313402893,
899
+ "score_ci_high": 0.07339385366459818,
900
+ "sacrebleu_ci_low": 0.04391320313402893,
901
+ "sacrebleu_ci_high": 0.07339385366459818
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 919,
907
+ 414,
908
+ 215,
909
+ 112
910
  ],
911
  "totals": [
912
+ 1759,
913
+ 1693,
914
+ 1627,
915
+ 1561
916
  ],
917
  "precisions": [
918
+ 0.5224559408754975,
919
+ 0.24453632604843473,
920
+ 0.13214505224339276,
921
+ 0.07174887892376682
922
  ],
923
+ "bp": 0.9577137289198663,
924
+ "sys_len": 1759,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.17866952528026325,
927
+ "score": 0.17866952528026325,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.14916846640026038,
930
+ "score_ci_high": 0.20635605395794115,
931
+ "sacrebleu_ci_low": 0.14916846640026038,
932
+ "sacrebleu_ci_high": 0.20635605395794115
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1274,
938
+ 792,
939
+ 541,
940
+ 376
941
  ],
942
  "totals": [
943
+ 1972,
944
+ 1906,
945
+ 1840,
946
+ 1774
947
  ],
948
  "precisions": [
949
+ 0.6460446247464503,
950
+ 0.4155299055613851,
951
+ 0.2940217391304348,
952
+ 0.21195039458850057
953
  ],
954
+ "bp": 0.9524844080827892,
955
+ "sys_len": 1972,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.3425527778466637,
958
+ "score": 0.3425527778466637,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.30423882287446036,
961
+ "score_ci_high": 0.38397398779299585,
962
+ "sacrebleu_ci_low": 0.30423882287446036,
963
+ "sacrebleu_ci_high": 0.38397398779299585
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 959,
969
+ 341,
970
+ 148,
971
+ 65
972
  ],
973
  "totals": [
974
+ 3115,
975
+ 3049,
976
+ 2983,
977
+ 2917
978
  ],
979
  "precisions": [
980
+ 0.3078651685393259,
981
+ 0.1118399475237783,
982
+ 0.0496144820650352,
983
+ 0.02228316763798423
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 3115,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.07854810736755143,
989
+ "score": 0.07854810736755143,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.06038268917694664,
992
+ "score_ci_high": 0.0991975666301703,
993
+ "sacrebleu_ci_low": 0.06038268917694664,
994
+ "sacrebleu_ci_high": 0.0991975666301703
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1231,
1000
+ 729,
1001
  477,
1002
+ 311
1003
  ],
1004
  "totals": [
1005
+ 1934,
1006
+ 1868,
1007
+ 1802,
1008
+ 1736
1009
  ],
1010
  "precisions": [
1011
+ 0.6365046535677352,
1012
+ 0.39025695931477516,
1013
+ 0.2647058823529412,
1014
+ 0.179147465437788
1015
  ],
1016
+ "bp": 1.0,
1017
+ "sys_len": 1934,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.3294440172060282,
1020
+ "score": 0.3294440172060282,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.295619495912131,
1023
+ "score_ci_high": 0.3689718708243594,
1024
+ "sacrebleu_ci_low": 0.295619495912131,
1025
+ "sacrebleu_ci_high": 0.3689718708243594
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 775,
1031
+ 275,
1032
+ 128,
1033
+ 70
1034
  ],
1035
  "totals": [
1036
+ 2251,
1037
+ 2185,
1038
+ 2119,
1039
+ 2053
1040
  ],
1041
  "precisions": [
1042
+ 0.3442914260328743,
1043
+ 0.12585812356979406,
1044
+ 0.06040585181689476,
1045
+ 0.034096444227959086
1046
  ],
1047
  "bp": 1.0,
1048
+ "sys_len": 2251,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.09719611157545467,
1051
+ "score": 0.09719611157545467,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.06983887234981923,
1054
+ "score_ci_high": 0.1310674385834968,
1055
+ "sacrebleu_ci_low": 0.06983887234981923,
1056
+ "sacrebleu_ci_high": 0.1310674385834968
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1157,
1062
+ 577,
1063
+ 322,
1064
+ 195
1065
  ],
1066
  "totals": [
1067
+ 2040,
1068
+ 1974,
1069
+ 1908,
1070
+ 1842
1071
  ],
1072
  "precisions": [
1073
+ 0.567156862745098,
1074
+ 0.2922998986828774,
1075
+ 0.16876310272536688,
1076
+ 0.10586319218241043
1077
  ],
1078
+ "bp": 0.9719689956119355,
1079
+ "sys_len": 2040,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.22674671169002888,
1082
+ "score": 0.22674671169002888,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.20018186306011954,
1085
+ "score_ci_high": 0.24942035354854425,
1086
+ "sacrebleu_ci_low": 0.20018186306011954,
1087
+ "sacrebleu_ci_high": 0.24942035354854425
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1147,
1093
+ 643,
1094
+ 397,
1095
+ 246
1096
  ],
1097
  "totals": [
1098
+ 1808,
1099
+ 1742,
1100
+ 1676,
1101
+ 1610
1102
  ],
1103
  "precisions": [
1104
+ 0.6344026548672567,
1105
+ 0.36911595866819746,
1106
+ 0.23687350835322196,
1107
+ 0.15279503105590064
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 1808,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.30341593414236545,
1113
+ "score": 0.30341593414236545,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.26610148555409346,
1116
+ "score_ci_high": 0.3551058656882207,
1117
+ "sacrebleu_ci_low": 0.26610148555409346,
1118
+ "sacrebleu_ci_high": 0.3551058656882207
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 935,
1124
+ 383,
1125
+ 193,
1126
+ 101
1127
  ],
1128
  "totals": [
1129
+ 1950,
1130
+ 1884,
1131
+ 1818,
1132
+ 1752
1133
  ],
1134
  "precisions": [
1135
+ 0.4794871794871795,
1136
+ 0.2032908704883227,
1137
+ 0.10616061606160615,
1138
+ 0.057648401826484015
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 1950,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.15628287583119144,
1144
+ "score": 0.15628287583119144,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.1304739698304556,
1147
+ "score_ci_high": 0.19246721744185705,
1148
+ "sacrebleu_ci_low": 0.1304739698304556,
1149
+ "sacrebleu_ci_high": 0.19246721744185705
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 848,
1155
+ 302,
1156
+ 135,
1157
+ 60
1158
  ],
1159
  "totals": [
1160
+ 1872,
1161
+ 1806,
1162
+ 1740,
1163
+ 1674
1164
  ],
1165
  "precisions": [
1166
+ 0.452991452991453,
1167
+ 0.1672203765227021,
1168
+ 0.07758620689655173,
1169
+ 0.035842293906810034
1170
  ],
1171
  "bp": 1.0,
1172
+ "sys_len": 1872,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.12047304306149162,
1175
+ "score": 0.12047304306149162,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.10281238979764949,
1178
+ "score_ci_high": 0.15881384065042398,
1179
+ "sacrebleu_ci_low": 0.10281238979764949,
1180
+ "sacrebleu_ci_high": 0.15881384065042398
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1129,
1186
+ 652,
1187
+ 412,
1188
  268
1189
  ],
1190
  "totals": [
1191
+ 1783,
1192
+ 1717,
1193
+ 1651,
1194
+ 1585
1195
  ],
1196
  "precisions": [
1197
+ 0.6332024677509814,
1198
+ 0.3797320908561444,
1199
+ 0.24954572986069049,
1200
+ 0.1690851735015773
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 1783,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.3173722073323666,
1206
+ "score": 0.3173722073323666,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.28432250677130166,
1209
+ "score_ci_high": 0.35844418554288615,
1210
+ "sacrebleu_ci_low": 0.28432250677130166,
1211
+ "sacrebleu_ci_high": 0.35844418554288615
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1091,
1217
+ 601,
1218
+ 375,
1219
+ 245
1220
  ],
1221
  "totals": [
1222
+ 1793,
1223
+ 1727,
1224
+ 1661,
1225
+ 1595
1226
  ],
1227
  "precisions": [
1228
+ 0.6084774121583938,
1229
+ 0.3480023161551824,
1230
+ 0.2257676098735701,
1231
+ 0.1536050156739812
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 1793,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.2927341616520049,
1237
+ "score": 0.2927341616520049,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.2507745489087611,
1240
+ "score_ci_high": 0.329051655313229,
1241
+ "sacrebleu_ci_low": 0.2507745489087611,
1242
+ "sacrebleu_ci_high": 0.329051655313229
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1037,
1248
+ 482,
1249
+ 248,
1250
+ 129
1251
  ],
1252
  "totals": [
1253
+ 1796,
1254
+ 1730,
1255
+ 1664,
1256
+ 1598
1257
  ],
1258
  "precisions": [
1259
+ 0.5773942093541202,
1260
+ 0.2786127167630058,
1261
+ 0.14903846153846154,
1262
+ 0.0807259073842303
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 1796,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.20974719583348747,
1268
+ "score": 0.20974719583348747,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.18522343537553757,
1271
+ "score_ci_high": 0.2489124348912048,
1272
+ "sacrebleu_ci_low": 0.18522343537553757,
1273
+ "sacrebleu_ci_high": 0.2489124348912048
1274
  },
1275
+ "score": 0.21292389757753669,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.37890410445729916,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/{2025-06-19T16-21-09_evaluation_results.json → 2025-06-23T04-06-37_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-19T20:21:05.821665Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,103 +176,103 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.5555555555555556,
180
- "accuracy_ci_low": 0.45555555555555555,
181
- "accuracy_ci_high": 0.6555555555555556,
182
  "score_name": "accuracy",
183
- "score": 0.5555555555555556,
184
- "score_ci_high": 0.6555555555555556,
185
- "score_ci_low": 0.45555555555555555,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.6222222222222222,
190
- "accuracy_ci_low": 0.5222222222222223,
191
- "accuracy_ci_high": 0.7222222222222222,
192
  "score_name": "accuracy",
193
- "score": 0.6222222222222222,
194
- "score_ci_high": 0.7222222222222222,
195
- "score_ci_low": 0.5222222222222223,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.8777777777777778,
200
- "accuracy_ci_low": 0.8,
201
- "accuracy_ci_high": 0.9333333333333333,
202
  "score_name": "accuracy",
203
- "score": 0.8777777777777778,
204
- "score_ci_high": 0.9333333333333333,
205
- "score_ci_low": 0.8,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.6333333333333333,
210
- "accuracy_ci_low": 0.5333333333333333,
211
- "accuracy_ci_high": 0.7333333333333333,
212
  "score_name": "accuracy",
213
- "score": 0.6333333333333333,
214
- "score_ci_high": 0.7333333333333333,
215
- "score_ci_low": 0.5333333333333333,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.6555555555555556,
220
- "accuracy_ci_low": 0.5555555555555556,
221
- "accuracy_ci_high": 0.7539633744548231,
222
  "score_name": "accuracy",
223
- "score": 0.6555555555555556,
224
- "score_ci_high": 0.7539633744548231,
225
- "score_ci_low": 0.5555555555555556,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 0.9333333333333333,
230
- "accuracy_ci_low": 0.8666666666666667,
231
  "accuracy_ci_high": 0.9777777777777777,
232
  "score_name": "accuracy",
233
  "score": 0.9333333333333333,
234
  "score_ci_high": 0.9777777777777777,
235
- "score_ci_low": 0.8666666666666667,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.8888888888888888,
240
- "accuracy_ci_low": 0.8222222222222222,
241
- "accuracy_ci_high": 0.9444444444444444,
242
  "score_name": "accuracy",
243
- "score": 0.8888888888888888,
244
- "score_ci_high": 0.9444444444444444,
245
- "score_ci_low": 0.8222222222222222,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 0.9333333333333333,
250
- "accuracy_ci_low": 0.8666666666666667,
251
- "accuracy_ci_high": 0.9777777777777777,
252
  "score_name": "accuracy",
253
- "score": 0.9333333333333333,
254
- "score_ci_high": 0.9777777777777777,
255
- "score_ci_low": 0.8666666666666667,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.7666666666666667,
260
- "accuracy_ci_low": 0.6720698151047421,
261
- "accuracy_ci_high": 0.8444444444444444,
262
  "score_name": "accuracy",
263
- "score": 0.7666666666666667,
264
- "score_ci_high": 0.8444444444444444,
265
- "score_ci_low": 0.6720698151047421,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.6333333333333333,
270
- "accuracy_ci_low": 0.5333333333333333,
271
- "accuracy_ci_high": 0.7283280971833935,
272
  "score_name": "accuracy",
273
- "score": 0.6333333333333333,
274
- "score_ci_high": 0.7283280971833935,
275
- "score_ci_low": 0.5333333333333333,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
@@ -285,54 +285,54 @@
285
  "score_ci_low": 0.6666666666666666,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.7515151515151515,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.5102639296187683,
307
- "f1_Organization": 0.3381294964028777,
308
- "f1_Location": 0.35652173913043483,
309
- "f1_macro": 0.40163838838402693,
310
- "recall_macro": 0.3240210323686792,
311
- "precision_macro": 0.530656067251462,
312
- "in_classes_support": 0.5625,
313
- "f1_micro": 0.31789282470481384,
314
- "recall_micro": 0.3333333333333333,
315
- "precision_micro": 0.3038194444444444,
316
- "score": 0.31789282470481384,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.26482961534023236,
319
- "score_ci_high": 0.37029988780714157,
320
- "f1_micro_ci_low": 0.26482961534023236,
321
- "f1_micro_ci_high": 0.37029988780714157
322
  },
323
- "score": 0.31789282470481384,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.5211267605633803,
330
- "accuracy_ci_low": 0.4084507042253521,
331
- "accuracy_ci_high": 0.6338028169014085,
332
  "score_name": "accuracy",
333
- "score": 0.5211267605633803,
334
- "score_ci_high": 0.6338028169014085,
335
- "score_ci_low": 0.4084507042253521,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
@@ -348,392 +348,392 @@
348
  "mmlu_pro_chemistry": {
349
  "accuracy": 0.23943661971830985,
350
  "accuracy_ci_low": 0.15492957746478872,
351
- "accuracy_ci_high": 0.3380281690140845,
352
  "score_name": "accuracy",
353
  "score": 0.23943661971830985,
354
- "score_ci_high": 0.3380281690140845,
355
  "score_ci_low": 0.15492957746478872,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.43661971830985913,
360
- "accuracy_ci_low": 0.323943661971831,
361
- "accuracy_ci_high": 0.5492957746478874,
362
  "score_name": "accuracy",
363
- "score": 0.43661971830985913,
364
- "score_ci_high": 0.5492957746478874,
365
- "score_ci_low": 0.323943661971831,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.38028169014084506,
370
- "accuracy_ci_low": 0.2676056338028169,
371
- "accuracy_ci_high": 0.49295774647887325,
372
  "score_name": "accuracy",
373
- "score": 0.38028169014084506,
374
- "score_ci_high": 0.49295774647887325,
375
- "score_ci_low": 0.2676056338028169,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.2535211267605634,
380
- "accuracy_ci_low": 0.16901408450704225,
381
- "accuracy_ci_high": 0.36048330202820134,
382
  "score_name": "accuracy",
383
- "score": 0.2535211267605634,
384
- "score_ci_high": 0.36048330202820134,
385
- "score_ci_low": 0.16901408450704225,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.36619718309859156,
390
- "accuracy_ci_low": 0.2535211267605634,
391
- "accuracy_ci_high": 0.4788732394366197,
392
  "score_name": "accuracy",
393
- "score": 0.36619718309859156,
394
- "score_ci_high": 0.4788732394366197,
395
- "score_ci_low": 0.2535211267605634,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.36619718309859156,
400
- "accuracy_ci_low": 0.2535211267605634,
401
- "accuracy_ci_high": 0.4788732394366197,
402
  "score_name": "accuracy",
403
- "score": 0.36619718309859156,
404
- "score_ci_high": 0.4788732394366197,
405
- "score_ci_low": 0.2535211267605634,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.36619718309859156,
410
- "accuracy_ci_low": 0.2535211267605634,
411
- "accuracy_ci_high": 0.4788732394366197,
412
  "score_name": "accuracy",
413
- "score": 0.36619718309859156,
414
- "score_ci_high": 0.4788732394366197,
415
- "score_ci_low": 0.2535211267605634,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.1267605633802817,
420
- "accuracy_ci_low": 0.056338028169014086,
421
- "accuracy_ci_high": 0.22535211267605634,
422
  "score_name": "accuracy",
423
- "score": 0.1267605633802817,
424
- "score_ci_high": 0.22535211267605634,
425
- "score_ci_low": 0.056338028169014086,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.22535211267605634,
430
- "accuracy_ci_low": 0.14084507042253522,
431
- "accuracy_ci_high": 0.323943661971831,
432
  "score_name": "accuracy",
433
- "score": 0.22535211267605634,
434
- "score_ci_high": 0.323943661971831,
435
- "score_ci_low": 0.14084507042253522,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.4084507042253521,
440
- "accuracy_ci_low": 0.30985915492957744,
441
- "accuracy_ci_high": 0.5352112676056338,
442
  "score_name": "accuracy",
443
- "score": 0.4084507042253521,
444
- "score_ci_high": 0.5352112676056338,
445
- "score_ci_low": 0.30985915492957744,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.29577464788732394,
450
- "accuracy_ci_low": 0.19718309859154928,
451
- "accuracy_ci_high": 0.4084507042253521,
452
  "score_name": "accuracy",
453
- "score": 0.29577464788732394,
454
- "score_ci_high": 0.4084507042253521,
455
- "score_ci_low": 0.19718309859154928,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
  "accuracy": 0.5352112676056338,
460
- "accuracy_ci_low": 0.4084507042253521,
461
  "accuracy_ci_high": 0.647887323943662,
462
  "score_name": "accuracy",
463
  "score": 0.5352112676056338,
464
  "score_ci_high": 0.647887323943662,
465
- "score_ci_low": 0.4084507042253521,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.33702213279678067,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.2696554985630616,
475
- "f1_suggestive": 0.2727272727272727,
476
- "f1_arbitrary": 0.43137254901960786,
477
- "f1_generic": 0.11764705882352941,
478
- "f1_fanciful": 0.2,
479
- "f1_descriptive": 0.32653061224489793,
480
- "f1_macro_ci_low": 0.18689773936584586,
481
- "f1_macro_ci_high": 0.37923074712363225,
482
  "score_name": "f1_micro",
483
- "score": 0.31446540880503143,
484
- "score_ci_high": 0.42038216560509556,
485
- "score_ci_low": 0.21656050955414013,
486
  "num_of_instances": 85,
487
- "accuracy": 0.29411764705882354,
488
- "accuracy_ci_low": 0.2,
489
  "accuracy_ci_high": 0.4,
490
- "f1_micro": 0.31446540880503143,
491
- "f1_micro_ci_low": 0.21656050955414013,
492
- "f1_micro_ci_high": 0.42038216560509556
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.5388253241800153,
496
- "f1_no": 0.7298245614035088,
497
  "f1_yes": 0.34782608695652173,
498
- "f1_macro_ci_low": 0.47191290375757455,
499
- "f1_macro_ci_high": 0.6216206779092042,
500
  "score_name": "f1_micro",
501
- "score": 0.636604774535809,
502
- "score_ci_high": 0.6985040092826637,
503
- "score_ci_low": 0.5691144311757004,
504
  "num_of_instances": 200,
505
- "accuracy": 0.6,
506
- "accuracy_ci_low": 0.53,
507
- "accuracy_ci_high": 0.665,
508
- "f1_micro": 0.636604774535809,
509
- "f1_micro_ci_low": 0.5691144311757004,
510
- "f1_micro_ci_high": 0.6985040092826637
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.2947177227927682,
514
- "f1_conclusion": 0.2127659574468085,
515
- "f1_decree": 0.23529411764705882,
516
- "f1_issue": 0.2711864406779661,
517
- "f1_rule": 0.42857142857142855,
518
- "f1_analysis": 0.4444444444444444,
519
- "f1_facts": 0.21621621621621623,
520
- "f1_procedural history": 0.2545454545454545,
521
- "f1_macro_ci_low": 0.23794703715833648,
522
- "f1_macro_ci_high": 0.36665623309642204,
523
  "score_name": "f1_micro",
524
- "score": 0.30409356725146197,
525
- "score_ci_high": 0.3711587285161421,
526
- "score_ci_low": 0.23855266549315363,
527
  "num_of_instances": 200,
528
- "accuracy": 0.26,
529
- "accuracy_ci_low": 0.2,
530
- "accuracy_ci_high": 0.32,
531
- "f1_micro": 0.30409356725146197,
532
- "f1_micro_ci_low": 0.23855266549315363,
533
- "f1_micro_ci_high": 0.3711587285161421
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.49092908191313905,
537
- "f1_yes": 0.5700934579439252,
538
- "f1_no": 0.4117647058823529,
539
- "f1_macro_ci_low": 0.4178065856787266,
540
- "f1_macro_ci_high": 0.5601203681213927,
541
  "score_name": "f1_micro",
542
- "score": 0.5,
543
- "score_ci_high": 0.566970455032283,
544
- "score_ci_low": 0.42555336134062,
545
  "num_of_instances": 200,
546
- "accuracy": 0.48,
547
- "accuracy_ci_low": 0.405,
548
- "accuracy_ci_high": 0.545,
549
- "f1_micro": 0.5,
550
- "f1_micro_ci_low": 0.42555336134062,
551
- "f1_micro_ci_high": 0.566970455032283
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.8315276273022751,
555
- "f1_yes": 0.8169014084507042,
556
- "f1_no": 0.8461538461538461,
557
- "f1_macro_ci_low": 0.7549023325928579,
558
- "f1_macro_ci_high": 0.890440353074843,
559
  "score_name": "f1_micro",
560
- "score": 0.8322147651006712,
561
- "score_ci_high": 0.8903225806451613,
562
- "score_ci_low": 0.7554946760306516,
563
  "num_of_instances": 85,
564
- "accuracy": 0.7294117647058823,
565
- "accuracy_ci_low": 0.6352941176470588,
566
- "accuracy_ci_high": 0.8117647058823529,
567
- "f1_micro": 0.8322147651006712,
568
- "f1_micro_ci_low": 0.7554946760306516,
569
- "f1_micro_ci_high": 0.8903225806451613
570
  },
571
- "score": 0.5174757031385947,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.42272407811143237,
578
- "f1_cars": 0.6078431372549019,
579
- "f1_pc hardware": 0.34080717488789236,
580
- "f1_windows x": 0.029850746268656716,
581
- "f1_computer graphics": 0.4367816091954023,
582
- "f1_atheism": 0.21739130434782608,
583
- "f1_religion": 0.23300970873786409,
584
- "f1_medicine": 0.8641975308641975,
585
  "f1_christianity": 0.1694915254237288,
586
- "f1_microsoft windows": 0.39436619718309857,
587
- "f1_middle east": 0.43037974683544306,
588
- "f1_politics": 0.291970802919708,
589
- "f1_motorcycles": 0.43902439024390244,
590
- "f1_mac hardware": 0.09090909090909091,
591
- "f1_for sale": 0.625,
592
- "f1_guns": 0.18181818181818182,
593
- "f1_space": 0.5569620253164557,
594
- "f1_cryptography": 0.4482758620689655,
595
- "f1_baseball": 0.8545454545454545,
596
- "f1_hockey": 0.859504132231405,
597
- "f1_electronics": 0.38235294117647056,
598
- "f1_macro_ci_low": 0.3988534736802405,
599
- "f1_macro_ci_high": 0.4557473948035634,
600
  "score_name": "f1_micro",
601
- "score": 0.44368600682593856,
602
- "score_ci_high": 0.47444463958776134,
603
- "score_ci_low": 0.4135801299006492,
604
  "num_of_instances": 1000,
605
- "accuracy": 0.39,
606
- "accuracy_ci_low": 0.36,
607
- "accuracy_ci_high": 0.418,
608
- "f1_micro": 0.44368600682593856,
609
- "f1_micro_ci_low": 0.4135801299006492,
610
- "f1_micro_ci_high": 0.47444463958776134
611
  },
612
- "score": 0.44368600682593856,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.6105828707367139,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9035153328347045,
620
- "f1_credit card or prepaid card": 0.5873015873015873,
621
- "f1_debt collection": 0.6375,
622
- "f1_checking or savings account": 0.75,
623
- "f1_money transfer or virtual currency or money service": 0.5777777777777777,
624
- "f1_vehicle loan or lease": 0.4666666666666667,
625
- "f1_mortgage": 0.6785714285714286,
626
- "f1_payday loan or title loan or personal loan": 0.17391304347826086,
627
- "f1_student loan": 0.72,
628
- "f1_macro_ci_low": 0.5575796516691159,
629
- "f1_macro_ci_high": 0.6705972502098242,
630
  "score_name": "f1_micro",
631
- "score": 0.8195173137460651,
632
- "score_ci_high": 0.842436974789916,
633
- "score_ci_low": 0.7946166113913405,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.781,
636
  "accuracy_ci_low": 0.752,
637
- "accuracy_ci_high": 0.806,
638
- "f1_micro": 0.8195173137460651,
639
- "f1_micro_ci_low": 0.7946166113913405,
640
- "f1_micro_ci_high": 0.842436974789916
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.7132677588870594,
644
- "f1_mortgages and loans": 0.7771428571428571,
645
- "f1_credit card": 0.7023809523809523,
646
- "f1_debt collection": 0.6854460093896714,
647
- "f1_credit reporting": 0.7601476014760148,
648
- "f1_retail banking": 0.6412213740458015,
649
- "f1_macro_ci_low": 0.672279823384184,
650
- "f1_macro_ci_high": 0.7539657340394554,
651
  "score_name": "f1_micro",
652
- "score": 0.7202505219206681,
653
- "score_ci_high": 0.7576596149340853,
654
- "score_ci_low": 0.6805865270375967,
655
  "num_of_instances": 500,
656
- "accuracy": 0.69,
657
- "accuracy_ci_low": 0.65,
658
- "accuracy_ci_high": 0.73,
659
- "f1_micro": 0.7202505219206681,
660
- "f1_micro_ci_low": 0.6805865270375967,
661
- "f1_micro_ci_high": 0.7576596149340853
662
  },
663
- "score": 0.7698839178333665,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "execution_accuracy": 0.074,
671
- "program_accuracy": 0.085,
672
- "score": 0.085,
673
  "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.058,
675
- "execution_accuracy_ci_high": 0.091,
676
- "program_accuracy_ci_low": 0.068,
677
- "program_accuracy_ci_high": 0.102,
678
- "score_ci_low": 0.068,
679
- "score_ci_high": 0.102
680
  },
681
- "score": 0.085,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.30022844870852566,
688
- "recall": 0.5840193774846996,
689
- "f1": 0.3357215148632638,
690
- "precision_ci_low": 0.28030967471726836,
691
- "precision_ci_high": 0.32121747414474766,
692
- "recall_ci_low": 0.565861900260428,
693
- "recall_ci_high": 0.59971992711831,
694
- "f1_ci_low": 0.3175124739653954,
695
- "f1_ci_high": 0.35218969004250933,
696
  "score_name": "f1",
697
- "score": 0.3357215148632638,
698
- "score_ci_high": 0.35218969004250933,
699
- "score_ci_low": 0.3175124739653954,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.6000729690988859,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6848867723842462,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5528717579195896,
704
- "faithfullness_f1_token_overlap": 0.33597285355913525,
705
- "faithfullness_recall_token_overlap": 0.27114762054953845,
706
- "faithfullness_precision_token_overlap": 0.5569731492695155,
707
- "correctness_f1_token_overlap": 0.3357215148632638,
708
- "correctness_recall_token_overlap": 0.5840193774846996,
709
- "correctness_precision_token_overlap": 0.30022844870852566
710
  },
711
- "score": 0.3357215148632638,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.455,
718
- "accuracy_ci_low": 0.423,
719
- "accuracy_ci_high": 0.485,
720
  "score_name": "accuracy",
721
- "score": 0.455,
722
- "score_ci_high": 0.485,
723
- "score_ci_low": 0.423,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.782,
728
- "accuracy_ci_low": 0.744,
729
- "accuracy_ci_high": 0.818,
730
  "score_name": "accuracy",
731
- "score": 0.782,
732
- "score_ci_high": 0.818,
733
- "score_ci_low": 0.744,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.6185,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
@@ -741,68 +741,68 @@
741
  "attaq_500": {
742
  "severity_critical": 0.0,
743
  "severity_high": 0.0,
744
- "severity_medium": 0.0,
745
- "severity_low": 100.0,
746
- "category_harmful_info": 0.8662501443103547,
747
- "category_deception": 0.8769584958149929,
748
- "category_explicit_content": 0.8376609406085929,
749
- "category_violence": 0.8456707175466277,
750
- "category_discrimination": 0.8991804392564864,
751
- "category_substance_abuse": 0.8935730718904071,
752
- "category_pii": 0.8467056638112774,
753
- "safety": 0.860165383811626,
754
- "safety_ci_low": 0.8426579493684276,
755
- "safety_ci_high": 0.8778207056405374,
756
  "score_name": "safety",
757
- "score": 0.860165383811626,
758
- "score_ci_high": 0.8778207056405374,
759
- "score_ci_low": 0.8426579493684276,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.860165383811626,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rougeLsum": 0.3503820014906059,
770
- "rouge1": 0.4200585739584912,
771
- "rouge2": 0.20109007723824623,
772
- "rougeL": 0.28749112328021914,
773
- "score": 0.28749112328021914,
774
  "score_name": "rougeL",
775
- "rougeLsum_ci_low": 0.341921573094731,
776
- "rougeLsum_ci_high": 0.35863585426859207,
777
- "rouge1_ci_low": 0.41035793857223635,
778
- "rouge1_ci_high": 0.4281932704537228,
779
- "rouge2_ci_low": 0.19416899053732958,
780
- "rouge2_ci_high": 0.20872476773642967,
781
- "rougeL_ci_low": 0.2804794753326623,
782
- "rougeL_ci_high": 0.29447838537921134,
783
- "score_ci_low": 0.2804794753326623,
784
- "score_ci_high": 0.29447838537921134
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rougeLsum": 0.0922932399263996,
789
- "rouge1": 0.11247814548815566,
790
- "rouge2": 0.015117853576507847,
791
- "rougeL": 0.07979202357473647,
792
- "score": 0.07979202357473647,
793
  "score_name": "rougeL",
794
- "rougeLsum_ci_low": 0.0880597944044916,
795
- "rougeLsum_ci_high": 0.09606464509440052,
796
- "rouge1_ci_low": 0.10733708561154955,
797
- "rouge1_ci_high": 0.11723898467910755,
798
- "rouge2_ci_low": 0.01362250797390663,
799
- "rouge2_ci_high": 0.0168799885499115,
800
- "rougeL_ci_low": 0.0764789144644062,
801
- "rougeL_ci_high": 0.08304032568245756,
802
- "score_ci_low": 0.0764789144644062,
803
- "score_ci_high": 0.08304032568245756
804
  },
805
- "score": 0.1836415734274778,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,473 +810,473 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 1154,
814
- 637,
815
- 382,
816
- 237
817
  ],
818
  "totals": [
819
- 3013,
820
- 2947,
821
- 2881,
822
- 2815
823
  ],
824
  "precisions": [
825
- 0.383006969797544,
826
- 0.2161520190023753,
827
- 0.13259284970496357,
828
- 0.08419182948490231
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 3013,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.17435684678472682,
834
- "score": 0.17435684678472682,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.12709535962365245,
837
- "score_ci_high": 0.21064271607309265,
838
- "sacrebleu_ci_low": 0.12709535962365245,
839
- "sacrebleu_ci_high": 0.21064271607309265
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1215,
845
- 695,
846
- 422,
847
- 256
848
  ],
849
  "totals": [
850
- 3433,
851
- 3367,
852
- 3301,
853
- 3235
854
  ],
855
  "precisions": [
856
- 0.35391785610253423,
857
- 0.20641520641520641,
858
- 0.12784004847016056,
859
- 0.07913446676970634
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 3433,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.16488046075977367,
865
- "score": 0.16488046075977367,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.12825986690370522,
868
- "score_ci_high": 0.20812836267228596,
869
- "sacrebleu_ci_low": 0.12825986690370522,
870
- "sacrebleu_ci_high": 0.20812836267228596
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
- 726,
876
- 321,
877
- 159,
878
- 82
879
  ],
880
  "totals": [
881
- 2297,
882
- 2231,
883
- 2165,
884
- 2099
885
  ],
886
  "precisions": [
887
- 0.3160644318676535,
888
- 0.14388166741371583,
889
- 0.07344110854503465,
890
- 0.03906622201048118
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 2297,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.10687605905530678,
896
- "score": 0.10687605905530678,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.08639846348006232,
899
- "score_ci_high": 0.13425269082562755,
900
- "sacrebleu_ci_low": 0.08639846348006232,
901
- "sacrebleu_ci_high": 0.13425269082562755
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 1066,
907
- 564,
908
- 332,
909
- 194
910
  ],
911
  "totals": [
912
- 2300,
913
- 2234,
914
- 2168,
915
- 2102
916
  ],
917
  "precisions": [
918
- 0.46347826086956523,
919
- 0.252461951656222,
920
- 0.15313653136531366,
921
- 0.0922930542340628
922
  ],
923
  "bp": 1.0,
924
- "sys_len": 2300,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.2016593123773307,
927
- "score": 0.2016593123773307,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.177292145733578,
930
- "score_ci_high": 0.24439707428713803,
931
- "sacrebleu_ci_low": 0.177292145733578,
932
- "sacrebleu_ci_high": 0.24439707428713803
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1409,
938
- 950,
939
- 692,
940
- 517
941
  ],
942
  "totals": [
943
- 3275,
944
- 3209,
945
- 3143,
946
- 3077
947
  ],
948
  "precisions": [
949
- 0.4302290076335878,
950
- 0.2960423808039888,
951
- 0.2201718103722558,
952
- 0.168020799480013
953
  ],
954
  "bp": 1.0,
955
- "sys_len": 3275,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.2619959538476516,
958
- "score": 0.2619959538476516,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.21071110880640612,
961
- "score_ci_high": 0.30599931494111227,
962
- "sacrebleu_ci_low": 0.21071110880640612,
963
- "sacrebleu_ci_high": 0.30599931494111227
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 1096,
969
- 465,
970
- 233,
971
- 132
972
  ],
973
  "totals": [
974
- 3883,
975
- 3817,
976
- 3751,
977
- 3685
978
  ],
979
  "precisions": [
980
- 0.28225598763842386,
981
- 0.12182342153523709,
982
- 0.0621167688616369,
983
- 0.03582089552238806
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 3883,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.09352545142421302,
989
- "score": 0.09352545142421302,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.0763987126727994,
992
- "score_ci_high": 0.11617390981932266,
993
- "sacrebleu_ci_low": 0.0763987126727994,
994
- "sacrebleu_ci_high": 0.11617390981932266
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1328,
1000
- 850,
1001
- 588,
1002
- 412
1003
  ],
1004
  "totals": [
1005
- 3030,
1006
- 2964,
1007
- 2898,
1008
- 2832
1009
  ],
1010
  "precisions": [
1011
- 0.4382838283828383,
1012
- 0.286774628879892,
1013
- 0.2028985507246377,
1014
- 0.14548022598870058
1015
  ],
1016
  "bp": 1.0,
1017
- "sys_len": 3030,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.2467997817029595,
1020
- "score": 0.2467997817029595,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.193392163449652,
1023
- "score_ci_high": 0.2974642241791255,
1024
- "sacrebleu_ci_low": 0.193392163449652,
1025
- "sacrebleu_ci_high": 0.2974642241791255
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 930,
1031
- 400,
1032
- 214,
1033
- 123
1034
  ],
1035
  "totals": [
1036
- 2961,
1037
- 2895,
1038
- 2829,
1039
- 2763
1040
  ],
1041
  "precisions": [
1042
- 0.3140830800405269,
1043
- 0.1381692573402418,
1044
- 0.07564510427712973,
1045
- 0.04451682953311618
1046
  ],
1047
  "bp": 1.0,
1048
- "sys_len": 2961,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.1099487393546487,
1051
- "score": 0.1099487393546487,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.08284384518142485,
1054
- "score_ci_high": 0.13880651312628609,
1055
- "sacrebleu_ci_low": 0.08284384518142485,
1056
- "sacrebleu_ci_high": 0.13880651312628609
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
- 1217,
1062
- 624,
1063
- 347,
1064
- 198
1065
  ],
1066
  "totals": [
1067
- 3045,
1068
- 2979,
1069
  2913,
1070
- 2847
1071
  ],
1072
  "precisions": [
1073
- 0.399671592775041,
1074
- 0.20946626384692849,
1075
- 0.11912118091314795,
1076
- 0.06954689146469968
1077
  ],
1078
  "bp": 1.0,
1079
- "sys_len": 3045,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.1622822499255264,
1082
- "score": 0.1622822499255264,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.13321857221475644,
1085
- "score_ci_high": 0.19390301665624113,
1086
- "sacrebleu_ci_low": 0.13321857221475644,
1087
- "sacrebleu_ci_high": 0.19390301665624113
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1236,
1093
- 735,
1094
- 470,
1095
- 308
1096
  ],
1097
  "totals": [
1098
- 2952,
1099
- 2886,
1100
- 2820,
1101
- 2754
1102
  ],
1103
  "precisions": [
1104
- 0.4186991869918699,
1105
- 0.25467775467775466,
1106
- 0.16666666666666669,
1107
- 0.11183732752360204
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 2952,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.2111456628673961,
1113
- "score": 0.2111456628673961,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.1728340034401921,
1116
- "score_ci_high": 0.26908287892628974,
1117
- "sacrebleu_ci_low": 0.1728340034401921,
1118
- "sacrebleu_ci_high": 0.26908287892628974
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 1018,
1124
- 437,
1125
- 232,
1126
- 128
1127
  ],
1128
  "totals": [
1129
- 3130,
1130
- 3064,
1131
- 2998,
1132
- 2932
1133
  ],
1134
  "precisions": [
1135
- 0.3252396166134185,
1136
- 0.14262402088772846,
1137
- 0.07738492328218813,
1138
- 0.04365620736698499
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 3130,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.11188570922324435,
1144
- "score": 0.11188570922324435,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.09154049326122426,
1147
- "score_ci_high": 0.13827539969992217,
1148
- "sacrebleu_ci_low": 0.09154049326122426,
1149
- "sacrebleu_ci_high": 0.13827539969992217
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 986,
1155
- 447,
1156
- 233,
1157
- 127
1158
  ],
1159
  "totals": [
1160
- 3637,
1161
- 3571,
1162
- 3505,
1163
- 3439
1164
  ],
1165
  "precisions": [
1166
- 0.27110255705251585,
1167
- 0.12517502100252031,
1168
- 0.06647646219686162,
1169
- 0.03692933992439663
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 3637,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.09553723823741646,
1175
- "score": 0.09553723823741646,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.06933902828362079,
1178
- "score_ci_high": 0.1273472328564688,
1179
- "sacrebleu_ci_low": 0.06933902828362079,
1180
- "sacrebleu_ci_high": 0.1273472328564688
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1286,
1186
- 834,
1187
- 587,
1188
- 419
1189
  ],
1190
  "totals": [
1191
- 3404,
1192
- 3338,
1193
- 3272,
1194
- 3206
1195
  ],
1196
  "precisions": [
1197
- 0.37779083431257343,
1198
- 0.24985020970641103,
1199
- 0.17940097799511,
1200
- 0.13069245165315035
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 3404,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.21689603438287544,
1206
- "score": 0.21689603438287544,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.18174547190909165,
1209
- "score_ci_high": 0.2734022486576191,
1210
- "sacrebleu_ci_low": 0.18174547190909165,
1211
- "sacrebleu_ci_high": 0.2734022486576191
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
- 1208,
1217
- 675,
1218
- 430,
1219
- 279
1220
  ],
1221
  "totals": [
1222
- 3677,
1223
- 3611,
1224
- 3545,
1225
- 3479
1226
  ],
1227
  "precisions": [
1228
- 0.32852869186837097,
1229
- 0.1869288285793409,
1230
- 0.12129760225669958,
1231
- 0.08019545846507617
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 3677,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.15633740352446387,
1237
- "score": 0.15633740352446387,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.12255450743419968,
1240
- "score_ci_high": 0.17971859902386644,
1241
- "sacrebleu_ci_low": 0.12255450743419968,
1242
- "sacrebleu_ci_high": 0.17971859902386644
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1135,
1248
- 581,
1249
- 336,
1250
- 202
1251
  ],
1252
  "totals": [
1253
- 3533,
1254
- 3467,
1255
- 3401,
1256
- 3335
1257
  ],
1258
  "precisions": [
1259
- 0.3212567223322955,
1260
- 0.16758004038073263,
1261
- 0.09879447221405468,
1262
- 0.06056971514242879
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 3533,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.133972503470666,
1268
- "score": 0.133972503470666,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.10251876459928583,
1271
- "score_ci_high": 0.17481307519673603,
1272
- "sacrebleu_ci_low": 0.10251876459928583,
1273
- "sacrebleu_ci_high": 0.17481307519673603
1274
  },
1275
- "score": 0.1632066271292133,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.45259314123432515,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-23T08:06:33.434344Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.5,
180
+ "accuracy_ci_low": 0.4,
181
+ "accuracy_ci_high": 0.6,
182
  "score_name": "accuracy",
183
+ "score": 0.5,
184
+ "score_ci_high": 0.6,
185
+ "score_ci_low": 0.4,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.6777777777777778,
190
+ "accuracy_ci_low": 0.5777777777777777,
191
+ "accuracy_ci_high": 0.7666666666666667,
192
  "score_name": "accuracy",
193
+ "score": 0.6777777777777778,
194
+ "score_ci_high": 0.7666666666666667,
195
+ "score_ci_low": 0.5777777777777777,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8333333333333334,
200
+ "accuracy_ci_low": 0.7444444444444445,
201
+ "accuracy_ci_high": 0.9,
202
  "score_name": "accuracy",
203
+ "score": 0.8333333333333334,
204
+ "score_ci_high": 0.9,
205
+ "score_ci_low": 0.7444444444444445,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.6666666666666666,
210
+ "accuracy_ci_low": 0.5666666666666667,
211
+ "accuracy_ci_high": 0.7555555555555555,
212
  "score_name": "accuracy",
213
+ "score": 0.6666666666666666,
214
+ "score_ci_high": 0.7555555555555555,
215
+ "score_ci_low": 0.5666666666666667,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.7111111111111111,
220
+ "accuracy_ci_low": 0.6111111111111112,
221
+ "accuracy_ci_high": 0.8,
222
  "score_name": "accuracy",
223
+ "score": 0.7111111111111111,
224
+ "score_ci_high": 0.8,
225
+ "score_ci_low": 0.6111111111111112,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
  "accuracy": 0.9333333333333333,
230
+ "accuracy_ci_low": 0.8777777777777778,
231
  "accuracy_ci_high": 0.9777777777777777,
232
  "score_name": "accuracy",
233
  "score": 0.9333333333333333,
234
  "score_ci_high": 0.9777777777777777,
235
+ "score_ci_low": 0.8777777777777778,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.8444444444444444,
240
+ "accuracy_ci_low": 0.7555555555555555,
241
+ "accuracy_ci_high": 0.9048361867497154,
242
  "score_name": "accuracy",
243
+ "score": 0.8444444444444444,
244
+ "score_ci_high": 0.9048361867497154,
245
+ "score_ci_low": 0.7555555555555555,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.9111111111111111,
250
+ "accuracy_ci_low": 0.8333333333333334,
251
+ "accuracy_ci_high": 0.9555555555555556,
252
  "score_name": "accuracy",
253
+ "score": 0.9111111111111111,
254
+ "score_ci_high": 0.9555555555555556,
255
+ "score_ci_low": 0.8333333333333334,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.7444444444444445,
260
+ "accuracy_ci_low": 0.6555555555555556,
261
+ "accuracy_ci_high": 0.83090190108808,
262
  "score_name": "accuracy",
263
+ "score": 0.7444444444444445,
264
+ "score_ci_high": 0.83090190108808,
265
+ "score_ci_low": 0.6555555555555556,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.6777777777777778,
270
+ "accuracy_ci_low": 0.5777777777777777,
271
+ "accuracy_ci_high": 0.7666666666666667,
272
  "score_name": "accuracy",
273
+ "score": 0.6777777777777778,
274
+ "score_ci_high": 0.7666666666666667,
275
+ "score_ci_low": 0.5777777777777777,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
 
285
  "score_ci_low": 0.6666666666666666,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.7515151515151516,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.08744186046511628,
296
+ "score": 0.08744186046511628,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.08744186046511628,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.52046783625731,
307
+ "f1_Location": 0.3275862068965517,
308
+ "f1_Organization": 0.3905723905723905,
309
+ "f1_macro": 0.41287547790875073,
310
+ "recall_macro": 0.34275188964299236,
311
+ "precision_macro": 0.5261312195216724,
312
+ "in_classes_support": 0.5945017182130584,
313
+ "f1_micro": 0.3342366757000903,
314
+ "recall_micro": 0.3523809523809524,
315
+ "precision_micro": 0.3178694158075601,
316
+ "score": 0.3342366757000903,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.29025426476142113,
319
+ "score_ci_high": 0.38246190736620644,
320
+ "f1_micro_ci_low": 0.29025426476142113,
321
+ "f1_micro_ci_high": 0.38246190736620644
322
  },
323
+ "score": 0.3342366757000903,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.4225352112676056,
330
+ "accuracy_ci_low": 0.30985915492957744,
331
+ "accuracy_ci_high": 0.5352112676056338,
332
  "score_name": "accuracy",
333
+ "score": 0.4225352112676056,
334
+ "score_ci_high": 0.5352112676056338,
335
+ "score_ci_low": 0.30985915492957744,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
 
348
  "mmlu_pro_chemistry": {
349
  "accuracy": 0.23943661971830985,
350
  "accuracy_ci_low": 0.15492957746478872,
351
+ "accuracy_ci_high": 0.352112676056338,
352
  "score_name": "accuracy",
353
  "score": 0.23943661971830985,
354
+ "score_ci_high": 0.352112676056338,
355
  "score_ci_low": 0.15492957746478872,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.4084507042253521,
360
+ "accuracy_ci_low": 0.29577464788732394,
361
+ "accuracy_ci_high": 0.5211267605633803,
362
  "score_name": "accuracy",
363
+ "score": 0.4084507042253521,
364
+ "score_ci_high": 0.5211267605633803,
365
+ "score_ci_low": 0.29577464788732394,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.4084507042253521,
370
+ "accuracy_ci_low": 0.29577464788732394,
371
+ "accuracy_ci_high": 0.5211267605633803,
372
  "score_name": "accuracy",
373
+ "score": 0.4084507042253521,
374
+ "score_ci_high": 0.5211267605633803,
375
+ "score_ci_low": 0.29577464788732394,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.23943661971830985,
380
+ "accuracy_ci_low": 0.15492957746478872,
381
+ "accuracy_ci_high": 0.3380281690140845,
382
  "score_name": "accuracy",
383
+ "score": 0.23943661971830985,
384
+ "score_ci_high": 0.3380281690140845,
385
+ "score_ci_low": 0.15492957746478872,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.352112676056338,
390
+ "accuracy_ci_low": 0.23943661971830985,
391
+ "accuracy_ci_high": 0.4647887323943662,
392
  "score_name": "accuracy",
393
+ "score": 0.352112676056338,
394
+ "score_ci_high": 0.4647887323943662,
395
+ "score_ci_low": 0.23943661971830985,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.323943661971831,
400
+ "accuracy_ci_low": 0.21693057179778907,
401
+ "accuracy_ci_high": 0.43661971830985913,
402
  "score_name": "accuracy",
403
+ "score": 0.323943661971831,
404
+ "score_ci_high": 0.43661971830985913,
405
+ "score_ci_low": 0.21693057179778907,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.30985915492957744,
410
+ "accuracy_ci_low": 0.2112676056338028,
411
+ "accuracy_ci_high": 0.42459270101591795,
412
  "score_name": "accuracy",
413
+ "score": 0.30985915492957744,
414
+ "score_ci_high": 0.42459270101591795,
415
+ "score_ci_low": 0.2112676056338028,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.09859154929577464,
420
+ "accuracy_ci_low": 0.04225352112676056,
421
+ "accuracy_ci_high": 0.17777703477060838,
422
  "score_name": "accuracy",
423
+ "score": 0.09859154929577464,
424
+ "score_ci_high": 0.17777703477060838,
425
+ "score_ci_low": 0.04225352112676056,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.323943661971831,
430
+ "accuracy_ci_low": 0.22338079742223388,
431
+ "accuracy_ci_high": 0.43661971830985913,
432
  "score_name": "accuracy",
433
+ "score": 0.323943661971831,
434
+ "score_ci_high": 0.43661971830985913,
435
+ "score_ci_low": 0.22338079742223388,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.4647887323943662,
440
+ "accuracy_ci_low": 0.352112676056338,
441
+ "accuracy_ci_high": 0.5915492957746479,
442
  "score_name": "accuracy",
443
+ "score": 0.4647887323943662,
444
+ "score_ci_high": 0.5915492957746479,
445
+ "score_ci_low": 0.352112676056338,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.18309859154929578,
450
+ "accuracy_ci_low": 0.11267605633802817,
451
+ "accuracy_ci_high": 0.28169014084507044,
452
  "score_name": "accuracy",
453
+ "score": 0.18309859154929578,
454
+ "score_ci_high": 0.28169014084507044,
455
+ "score_ci_low": 0.11267605633802817,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
  "accuracy": 0.5352112676056338,
460
+ "accuracy_ci_low": 0.4225352112676056,
461
  "accuracy_ci_high": 0.647887323943662,
462
  "score_name": "accuracy",
463
  "score": 0.5352112676056338,
464
  "score_ci_high": 0.647887323943662,
465
+ "score_ci_low": 0.4225352112676056,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.32193158953722334,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.3075373413771583,
475
+ "f1_suggestive": 0.36363636363636365,
476
+ "f1_arbitrary": 0.28,
477
+ "f1_generic": 0.3157894736842105,
478
+ "f1_fanciful": 0.1,
479
+ "f1_descriptive": 0.4782608695652174,
480
+ "f1_macro_ci_low": 0.22135682385238098,
481
+ "f1_macro_ci_high": 0.4258827689087187,
482
  "score_name": "f1_micro",
483
+ "score": 0.33121019108280253,
484
+ "score_ci_high": 0.43513626025637364,
485
+ "score_ci_low": 0.22818791946308725,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.3058823529411765,
488
+ "accuracy_ci_low": 0.21176470588235294,
489
  "accuracy_ci_high": 0.4,
490
+ "f1_micro": 0.33121019108280253,
491
+ "f1_micro_ci_low": 0.22818791946308725,
492
+ "f1_micro_ci_high": 0.43513626025637364
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.563568215892054,
496
+ "f1_no": 0.7793103448275862,
497
  "f1_yes": 0.34782608695652173,
498
+ "f1_macro_ci_low": 0.49159571105513383,
499
+ "f1_macro_ci_high": 0.6365342652768683,
500
  "score_name": "f1_micro",
501
+ "score": 0.675392670157068,
502
+ "score_ci_high": 0.73489030467135,
503
+ "score_ci_low": 0.608918205032967,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.645,
506
+ "accuracy_ci_low": 0.58,
507
+ "accuracy_ci_high": 0.705,
508
+ "f1_micro": 0.675392670157068,
509
+ "f1_micro_ci_low": 0.608918205032967,
510
+ "f1_micro_ci_high": 0.73489030467135
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.3062664077216316,
514
+ "f1_conclusion": 0.20833333333333334,
515
+ "f1_decree": 0.1875,
516
+ "f1_rule": 0.47761194029850745,
517
+ "f1_issue": 0.25,
518
+ "f1_analysis": 0.44,
519
+ "f1_facts": 0.2727272727272727,
520
+ "f1_procedural history": 0.3076923076923077,
521
+ "f1_macro_ci_low": 0.24659229419876927,
522
+ "f1_macro_ci_high": 0.3810118235674986,
523
  "score_name": "f1_micro",
524
+ "score": 0.3209169054441261,
525
+ "score_ci_high": 0.3885967259042703,
526
+ "score_ci_low": 0.2564102564102564,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.28,
529
+ "accuracy_ci_low": 0.22,
530
+ "accuracy_ci_high": 0.34,
531
+ "f1_micro": 0.3209169054441261,
532
+ "f1_micro_ci_low": 0.2564102564102564,
533
+ "f1_micro_ci_high": 0.3885967259042703
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5153888280394305,
537
+ "f1_yes": 0.6090909090909091,
538
+ "f1_no": 0.42168674698795183,
539
+ "f1_macro_ci_low": 0.44286956267940425,
540
+ "f1_macro_ci_high": 0.5822132955205006,
541
  "score_name": "f1_micro",
542
+ "score": 0.5284974093264249,
543
+ "score_ci_high": 0.5917634471129095,
544
+ "score_ci_low": 0.4572437728690022,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.51,
547
+ "accuracy_ci_low": 0.44,
548
+ "accuracy_ci_high": 0.575,
549
+ "f1_micro": 0.5284974093264249,
550
+ "f1_micro_ci_low": 0.4572437728690022,
551
+ "f1_micro_ci_high": 0.5917634471129095
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.8555844155844157,
555
+ "f1_yes": 0.88,
556
+ "f1_no": 0.8311688311688312,
557
+ "f1_macro_ci_low": 0.7810530949191602,
558
+ "f1_macro_ci_high": 0.911227513400763,
559
  "score_name": "f1_micro",
560
+ "score": 0.8552631578947368,
561
+ "score_ci_high": 0.9104714274063991,
562
+ "score_ci_low": 0.7791920429268818,
563
  "num_of_instances": 85,
564
+ "accuracy": 0.7647058823529411,
565
+ "accuracy_ci_low": 0.6705882352941176,
566
+ "accuracy_ci_high": 0.8470588235294118,
567
+ "f1_micro": 0.8552631578947368,
568
+ "f1_micro_ci_low": 0.7791920429268818,
569
+ "f1_micro_ci_high": 0.9104714274063991
570
  },
571
+ "score": 0.5422560667810317,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.4238304887304002,
578
+ "f1_cars": 0.6534653465346535,
579
+ "f1_pc hardware": 0.38095238095238093,
580
+ "f1_windows x": 0.0,
581
+ "f1_atheism": 0.2727272727272727,
582
+ "f1_religion": 0.22641509433962265,
583
+ "f1_medicine": 0.7901234567901234,
584
  "f1_christianity": 0.1694915254237288,
585
+ "f1_computer graphics": 0.3373493975903614,
586
+ "f1_microsoft windows": 0.37681159420289856,
587
+ "f1_middle east": 0.4594594594594595,
588
+ "f1_politics": 0.27906976744186046,
589
+ "f1_motorcycles": 0.4883720930232558,
590
+ "f1_mac hardware": 0.03125,
591
+ "f1_for sale": 0.6461538461538462,
592
+ "f1_guns": 0.18518518518518517,
593
+ "f1_space": 0.575,
594
+ "f1_cryptography": 0.5079365079365079,
595
+ "f1_baseball": 0.8468468468468469,
596
+ "f1_hockey": 0.85,
597
+ "f1_electronics": 0.4,
598
+ "f1_macro_ci_low": 0.39859097081154116,
599
+ "f1_macro_ci_high": 0.4545535978307604,
600
  "score_name": "f1_micro",
601
+ "score": 0.4485549132947977,
602
+ "score_ci_high": 0.4787668189917876,
603
+ "score_ci_low": 0.41661505505349583,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.388,
606
+ "accuracy_ci_low": 0.358,
607
+ "accuracy_ci_high": 0.4198351175250287,
608
+ "f1_micro": 0.4485549132947977,
609
+ "f1_micro_ci_low": 0.41661505505349583,
610
+ "f1_micro_ci_high": 0.4787668189917876
611
  },
612
+ "score": 0.4485549132947977,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.6126200216184788,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9002217294900222,
620
+ "f1_checking or savings account": 0.6451612903225806,
621
+ "f1_debt collection": 0.5066666666666667,
622
+ "f1_credit card or prepaid card": 0.6277372262773723,
623
+ "f1_mortgage": 0.7945205479452054,
624
+ "f1_student loan": 0.8461538461538461,
625
+ "f1_money transfer or virtual currency or money service": 0.4878048780487805,
626
+ "f1_payday loan or title loan or personal loan": 0.2608695652173913,
627
+ "f1_vehicle loan or lease": 0.4444444444444444,
628
+ "f1_macro_ci_low": 0.557724310665768,
629
+ "f1_macro_ci_high": 0.6722482288571774,
630
  "score_name": "f1_micro",
631
+ "score": 0.8101924076963078,
632
+ "score_ci_high": 0.8332463122584837,
633
+ "score_ci_low": 0.7850628587071383,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.779,
636
  "accuracy_ci_low": 0.752,
637
+ "accuracy_ci_high": 0.805,
638
+ "f1_micro": 0.8101924076963078,
639
+ "f1_micro_ci_low": 0.7850628587071383,
640
+ "f1_micro_ci_high": 0.8332463122584837
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.6957226327824474,
644
+ "f1_mortgages and loans": 0.7861271676300579,
645
+ "f1_credit card": 0.735632183908046,
646
+ "f1_debt collection": 0.6605504587155964,
647
+ "f1_retail banking": 0.5853658536585366,
648
+ "f1_credit reporting": 0.7109375,
649
+ "f1_macro_ci_low": 0.6549701202052777,
650
+ "f1_macro_ci_high": 0.7382657469246365,
651
  "score_name": "f1_micro",
652
+ "score": 0.701271186440678,
653
+ "score_ci_high": 0.7411785857709632,
654
+ "score_ci_low": 0.6609516931464113,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.662,
657
+ "accuracy_ci_low": 0.6202110366430569,
658
+ "accuracy_ci_high": 0.706,
659
+ "f1_micro": 0.701271186440678,
660
+ "f1_micro_ci_low": 0.6609516931464113,
661
+ "f1_micro_ci_high": 0.7411785857709632
662
  },
663
+ "score": 0.7557317970684929,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "program_accuracy": 0.084,
671
+ "score": 0.084,
672
  "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.073,
674
+ "program_accuracy_ci_low": 0.067,
675
+ "program_accuracy_ci_high": 0.10386305691021766,
676
+ "score_ci_low": 0.067,
677
+ "score_ci_high": 0.10386305691021766,
678
+ "execution_accuracy_ci_low": 0.057,
679
+ "execution_accuracy_ci_high": 0.091
680
  },
681
+ "score": 0.084,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.2967925946544494,
688
+ "recall": 0.5841362559189178,
689
+ "f1": 0.3364987383722942,
690
+ "precision_ci_low": 0.2764979602656248,
691
+ "precision_ci_high": 0.3159857611493305,
692
+ "recall_ci_low": 0.5663103849045117,
693
+ "recall_ci_high": 0.6007142202829963,
694
+ "f1_ci_low": 0.3178259767982501,
695
+ "f1_ci_high": 0.35351716805909167,
696
  "score_name": "f1",
697
+ "score": 0.3364987383722942,
698
+ "score_ci_high": 0.35351716805909167,
699
+ "score_ci_low": 0.3178259767982501,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5976409501334031,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6827784284452597,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5494535167018573,
704
+ "faithfullness_f1_token_overlap": 0.34046513614196555,
705
+ "faithfullness_recall_token_overlap": 0.2738914194701684,
706
+ "faithfullness_precision_token_overlap": 0.5609296227002613,
707
+ "correctness_f1_token_overlap": 0.3364987383722942,
708
+ "correctness_recall_token_overlap": 0.5841362559189178,
709
+ "correctness_precision_token_overlap": 0.2967925946544494
710
  },
711
+ "score": 0.3364987383722942,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.448,
718
+ "accuracy_ci_low": 0.417,
719
+ "accuracy_ci_high": 0.479,
720
  "score_name": "accuracy",
721
+ "score": 0.448,
722
+ "score_ci_high": 0.479,
723
+ "score_ci_low": 0.417,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.78,
728
+ "accuracy_ci_low": 0.742,
729
+ "accuracy_ci_high": 0.816,
730
  "score_name": "accuracy",
731
+ "score": 0.78,
732
+ "score_ci_high": 0.816,
733
+ "score_ci_low": 0.742,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.614,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
 
741
  "attaq_500": {
742
  "severity_critical": 0.0,
743
  "severity_high": 0.0,
744
+ "severity_medium": 1.0,
745
+ "severity_low": 99.0,
746
+ "category_harmful_info": 0.8719441451812312,
747
+ "category_deception": 0.888891483497138,
748
+ "category_explicit_content": 0.8127973503386131,
749
+ "category_violence": 0.8689369508242678,
750
+ "category_discrimination": 0.8825277714502243,
751
+ "category_substance_abuse": 0.9410604351096683,
752
+ "category_pii": 0.7720797760067163,
753
+ "safety": 0.8674504241983716,
754
+ "safety_ci_low": 0.847960857428302,
755
+ "safety_ci_high": 0.8844523349924212,
756
  "score_name": "safety",
757
+ "score": 0.8674504241983716,
758
+ "score_ci_high": 0.8844523349924212,
759
+ "score_ci_low": 0.847960857428302,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.8674504241983716,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rouge1": 0.42001432363059676,
770
+ "rougeLsum": 0.35176737784751416,
771
+ "rouge2": 0.20090339450058858,
772
+ "rougeL": 0.2887794256459243,
773
+ "score": 0.2887794256459243,
774
  "score_name": "rougeL",
775
+ "rouge1_ci_low": 0.4115358503100108,
776
+ "rouge1_ci_high": 0.42788391457002284,
777
+ "rougeLsum_ci_low": 0.3432569747337183,
778
+ "rougeLsum_ci_high": 0.35847372198631006,
779
+ "rouge2_ci_low": 0.19492457543616534,
780
+ "rouge2_ci_high": 0.2075223454056542,
781
+ "rougeL_ci_low": 0.28220405170841467,
782
+ "rougeL_ci_high": 0.2953572975976334,
783
+ "score_ci_low": 0.28220405170841467,
784
+ "score_ci_high": 0.2953572975976334
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rouge1": 0.11196786218861304,
789
+ "rougeLsum": 0.09259288227162547,
790
+ "rouge2": 0.014304299542517345,
791
+ "rougeL": 0.08050830498137622,
792
+ "score": 0.08050830498137622,
793
  "score_name": "rougeL",
794
+ "rouge1_ci_low": 0.10730072397656114,
795
+ "rouge1_ci_high": 0.11708078514416911,
796
+ "rougeLsum_ci_low": 0.088799408920107,
797
+ "rougeLsum_ci_high": 0.09663277250494734,
798
+ "rouge2_ci_low": 0.01265361386023307,
799
+ "rouge2_ci_high": 0.01610624039999516,
800
+ "rougeL_ci_low": 0.07745615703093822,
801
+ "rougeL_ci_high": 0.08426746560170988,
802
+ "score_ci_low": 0.07745615703093822,
803
+ "score_ci_high": 0.08426746560170988
804
  },
805
+ "score": 0.18464386531365026,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1147,
814
+ 635,
815
+ 377,
816
+ 236
817
  ],
818
  "totals": [
819
+ 2783,
820
+ 2717,
821
+ 2651,
822
+ 2585
823
  ],
824
  "precisions": [
825
+ 0.41214516708587856,
826
+ 0.23371365476628636,
827
+ 0.14221048660882685,
828
+ 0.09129593810444873
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 2783,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.18805260077651942,
834
+ "score": 0.18805260077651942,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.15590859876442242,
837
+ "score_ci_high": 0.22665030743269873,
838
+ "sacrebleu_ci_low": 0.15590859876442242,
839
+ "sacrebleu_ci_high": 0.22665030743269873
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1250,
845
+ 740,
846
+ 469,
847
+ 298
848
  ],
849
  "totals": [
850
+ 3365,
851
+ 3299,
852
+ 3233,
853
+ 3167
854
  ],
855
  "precisions": [
856
+ 0.37147102526002973,
857
+ 0.22431039709002729,
858
+ 0.1450665017012063,
859
+ 0.09409535838332808
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 3365,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.18364428677137226,
865
+ "score": 0.18364428677137226,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.15133175793244782,
868
+ "score_ci_high": 0.238285104264321,
869
+ "sacrebleu_ci_low": 0.15133175793244782,
870
+ "sacrebleu_ci_high": 0.238285104264321
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
+ 701,
876
+ 279,
877
+ 122,
878
+ 55
879
  ],
880
  "totals": [
881
+ 2379,
882
+ 2313,
883
+ 2247,
884
+ 2181
885
  ],
886
  "precisions": [
887
+ 0.294661622530475,
888
+ 0.12062256809338522,
889
+ 0.054294615042278595,
890
+ 0.02521779000458505
891
  ],
892
  "bp": 1.0,
893
+ "sys_len": 2379,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.08352259557657876,
896
+ "score": 0.08352259557657876,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.06605026431332355,
899
+ "score_ci_high": 0.10504705952927867,
900
+ "sacrebleu_ci_low": 0.06605026431332355,
901
+ "sacrebleu_ci_high": 0.10504705952927867
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 1060,
907
+ 555,
908
+ 321,
909
+ 197
910
  ],
911
  "totals": [
912
+ 2307,
913
+ 2241,
914
+ 2175,
915
+ 2109
916
  ],
917
  "precisions": [
918
+ 0.45947117468573906,
919
+ 0.24765729585006693,
920
+ 0.14758620689655172,
921
+ 0.09340919867235657
922
  ],
923
  "bp": 1.0,
924
+ "sys_len": 2307,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.19901517998924645,
927
+ "score": 0.19901517998924645,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.15118718491370434,
930
+ "score_ci_high": 0.24524942034353023,
931
+ "sacrebleu_ci_low": 0.15118718491370434,
932
+ "sacrebleu_ci_high": 0.24524942034353023
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1383,
938
+ 931,
939
+ 685,
940
+ 511
941
  ],
942
  "totals": [
943
+ 2499,
944
+ 2433,
945
+ 2367,
946
+ 2301
947
  ],
948
  "precisions": [
949
+ 0.553421368547419,
950
+ 0.3826551582408549,
951
+ 0.28939585973806503,
952
+ 0.222077357670578
953
  ],
954
  "bp": 1.0,
955
+ "sys_len": 2499,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.34155844112464445,
958
+ "score": 0.34155844112464445,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.2882755414660873,
961
+ "score_ci_high": 0.40044941880570056,
962
+ "sacrebleu_ci_low": 0.2882755414660873,
963
+ "sacrebleu_ci_high": 0.40044941880570056
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 1091,
969
+ 445,
970
+ 224,
971
+ 119
972
  ],
973
  "totals": [
974
+ 4751,
975
+ 4685,
976
+ 4619,
977
+ 4553
978
  ],
979
  "precisions": [
980
+ 0.2296358661334456,
981
+ 0.09498399146211313,
982
+ 0.04849534531283828,
983
+ 0.026136613222051394
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 4751,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.07251199865213667,
989
+ "score": 0.07251199865213667,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.05587322252934343,
992
+ "score_ci_high": 0.09184707565044344,
993
+ "sacrebleu_ci_low": 0.05587322252934343,
994
+ "sacrebleu_ci_high": 0.09184707565044344
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1335,
1000
+ 873,
1001
+ 615,
1002
+ 436
1003
  ],
1004
  "totals": [
1005
+ 3124,
1006
+ 3058,
1007
+ 2992,
1008
+ 2926
1009
  ],
1010
  "precisions": [
1011
+ 0.427336747759283,
1012
+ 0.2854807063440157,
1013
+ 0.20554812834224598,
1014
+ 0.14900888585099112
1015
  ],
1016
  "bp": 1.0,
1017
+ "sys_len": 3124,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.24723968084246245,
1020
+ "score": 0.24723968084246245,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.1931280328121111,
1023
+ "score_ci_high": 0.3044993993983362,
1024
+ "sacrebleu_ci_low": 0.1931280328121111,
1025
+ "sacrebleu_ci_high": 0.3044993993983362
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 897,
1031
+ 407,
1032
+ 221,
1033
+ 130
1034
  ],
1035
  "totals": [
1036
+ 2626,
1037
+ 2560,
1038
+ 2494,
1039
+ 2428
1040
  ],
1041
  "precisions": [
1042
+ 0.3415841584158416,
1043
+ 0.158984375,
1044
+ 0.08861267040898156,
1045
+ 0.05354200988467875
1046
  ],
1047
  "bp": 1.0,
1048
+ "sys_len": 2626,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.12669534688031472,
1051
+ "score": 0.12669534688031472,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.1000733380304748,
1054
+ "score_ci_high": 0.16178959885111238,
1055
+ "sacrebleu_ci_low": 0.1000733380304748,
1056
+ "sacrebleu_ci_high": 0.16178959885111238
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1203,
1062
+ 616,
1063
+ 335,
1064
+ 184
1065
  ],
1066
  "totals": [
1067
  2913,
1068
+ 2847,
1069
+ 2781,
1070
+ 2715
1071
  ],
1072
  "precisions": [
1073
+ 0.41297631307929966,
1074
+ 0.21636810677906568,
1075
+ 0.12046026609133405,
1076
+ 0.06777163904235727
1077
  ],
1078
  "bp": 1.0,
1079
+ "sys_len": 2913,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.16434350639643316,
1082
+ "score": 0.16434350639643316,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.1325515600195346,
1085
+ "score_ci_high": 0.20299768479868893,
1086
+ "sacrebleu_ci_low": 0.1325515600195346,
1087
+ "sacrebleu_ci_high": 0.20299768479868893
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1254,
1093
+ 732,
1094
+ 469,
1095
+ 322
1096
  ],
1097
  "totals": [
1098
+ 3278,
1099
+ 3212,
1100
+ 3146,
1101
+ 3080
1102
  ],
1103
  "precisions": [
1104
+ 0.3825503355704698,
1105
+ 0.22789539227895392,
1106
+ 0.14907819453274,
1107
+ 0.10454545454545455
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 3278,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.19199320250461963,
1113
+ "score": 0.19199320250461963,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.15725080687797277,
1116
+ "score_ci_high": 0.24193576258661156,
1117
+ "sacrebleu_ci_low": 0.15725080687797277,
1118
+ "sacrebleu_ci_high": 0.24193576258661156
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 1040,
1124
+ 474,
1125
+ 264,
1126
+ 161
1127
  ],
1128
  "totals": [
1129
+ 3106,
1130
+ 3040,
1131
+ 2974,
1132
+ 2908
1133
  ],
1134
  "precisions": [
1135
+ 0.334835801674179,
1136
+ 0.15592105263157896,
1137
+ 0.08876933422999328,
1138
+ 0.05536451169188446
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 3106,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.1265632943373452,
1144
+ "score": 0.1265632943373452,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.10051707288201024,
1147
+ "score_ci_high": 0.15415847724283543,
1148
+ "sacrebleu_ci_low": 0.10051707288201024,
1149
+ "sacrebleu_ci_high": 0.15415847724283543
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 978,
1155
+ 426,
1156
+ 216,
1157
+ 119
1158
  ],
1159
  "totals": [
1160
+ 3053,
1161
+ 2987,
1162
+ 2921,
1163
+ 2855
1164
  ],
1165
  "precisions": [
1166
+ 0.3203406485424173,
1167
+ 0.14261801138265817,
1168
+ 0.07394727832933927,
1169
+ 0.04168126094570928
1170
  ],
1171
  "bp": 1.0,
1172
+ "sys_len": 3053,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.10893372822633232,
1175
+ "score": 0.10893372822633232,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.08051704910100821,
1178
+ "score_ci_high": 0.1399793521343314,
1179
+ "sacrebleu_ci_low": 0.08051704910100821,
1180
+ "sacrebleu_ci_high": 0.1399793521343314
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1279,
1186
+ 820,
1187
+ 574,
1188
+ 416
1189
  ],
1190
  "totals": [
1191
+ 2919,
1192
+ 2853,
1193
+ 2787,
1194
+ 2721
1195
  ],
1196
  "precisions": [
1197
+ 0.4381637547105173,
1198
+ 0.28741675429372593,
1199
+ 0.2059562253318981,
1200
+ 0.15288496876148475
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 2919,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.250941252136478,
1206
+ "score": 0.250941252136478,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.19373911879549774,
1209
+ "score_ci_high": 0.3143574197034948,
1210
+ "sacrebleu_ci_low": 0.19373911879549774,
1211
+ "sacrebleu_ci_high": 0.3143574197034948
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1215,
1217
+ 691,
1218
+ 421,
1219
+ 259
1220
  ],
1221
  "totals": [
1222
+ 2920,
1223
+ 2854,
1224
+ 2788,
1225
+ 2722
1226
  ],
1227
  "precisions": [
1228
+ 0.41609589041095885,
1229
+ 0.24211632796075683,
1230
+ 0.15100430416068866,
1231
+ 0.09515062454077883
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 2920,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.19505389122054267,
1237
+ "score": 0.19505389122054267,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.16097395914199633,
1240
+ "score_ci_high": 0.23627234222780022,
1241
+ "sacrebleu_ci_low": 0.16097395914199633,
1242
+ "sacrebleu_ci_high": 0.23627234222780022
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1158,
1248
+ 586,
1249
+ 328,
1250
+ 189
1251
  ],
1252
  "totals": [
1253
+ 3432,
1254
+ 3366,
1255
+ 3300,
1256
+ 3234
1257
  ],
1258
  "precisions": [
1259
+ 0.3374125874125874,
1260
+ 0.1740938799762329,
1261
+ 0.0993939393939394,
1262
+ 0.05844155844155845
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 3432,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.1359116294436951,
1268
+ "score": 0.1359116294436951,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.10538411838136834,
1271
+ "score_ci_high": 0.18075724261232987,
1272
+ "sacrebleu_ci_low": 0.10538411838136834,
1273
+ "sacrebleu_ci_high": 0.18075724261232987
1274
  },
1275
+ "score": 0.17439870899191476,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.42328152240293343,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/2025-06-23T04-42-35_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-23T08:42:31.876970Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/ibm/granite-3-8b-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/ibm/granite-3-8b-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.6,
180
+ "accuracy_ci_low": 0.4961662149523231,
181
+ "accuracy_ci_high": 0.6888888888888889,
182
+ "score_name": "accuracy",
183
+ "score": 0.6,
184
+ "score_ci_high": 0.6888888888888889,
185
+ "score_ci_low": 0.4961662149523231,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.6888888888888889,
190
+ "accuracy_ci_low": 0.5777777777777777,
191
+ "accuracy_ci_high": 0.7666666666666667,
192
+ "score_name": "accuracy",
193
+ "score": 0.6888888888888889,
194
+ "score_ci_high": 0.7666666666666667,
195
+ "score_ci_low": 0.5777777777777777,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.9,
200
+ "accuracy_ci_low": 0.8222222222222222,
201
+ "accuracy_ci_high": 0.9555555555555556,
202
+ "score_name": "accuracy",
203
+ "score": 0.9,
204
+ "score_ci_high": 0.9555555555555556,
205
+ "score_ci_low": 0.8222222222222222,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 0.6888888888888889,
210
+ "accuracy_ci_low": 0.5780895036995246,
211
+ "accuracy_ci_high": 0.7888888888888889,
212
+ "score_name": "accuracy",
213
+ "score": 0.6888888888888889,
214
+ "score_ci_high": 0.7888888888888889,
215
+ "score_ci_low": 0.5780895036995246,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.7888888888888889,
220
+ "accuracy_ci_low": 0.689667704010142,
221
+ "accuracy_ci_high": 0.8555555555555555,
222
+ "score_name": "accuracy",
223
+ "score": 0.7888888888888889,
224
+ "score_ci_high": 0.8555555555555555,
225
+ "score_ci_low": 0.689667704010142,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9222222222222223,
230
+ "accuracy_ci_low": 0.8444444444444444,
231
+ "accuracy_ci_high": 0.9666666666666667,
232
+ "score_name": "accuracy",
233
+ "score": 0.9222222222222223,
234
+ "score_ci_high": 0.9666666666666667,
235
+ "score_ci_low": 0.8444444444444444,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.9,
240
+ "accuracy_ci_low": 0.8222222222222222,
241
+ "accuracy_ci_high": 0.9555555555555556,
242
+ "score_name": "accuracy",
243
+ "score": 0.9,
244
+ "score_ci_high": 0.9555555555555556,
245
+ "score_ci_low": 0.8222222222222222,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.9555555555555556,
250
+ "accuracy_ci_low": 0.9,
251
+ "accuracy_ci_high": 0.9888888888888889,
252
+ "score_name": "accuracy",
253
+ "score": 0.9555555555555556,
254
+ "score_ci_high": 0.9888888888888889,
255
+ "score_ci_low": 0.9,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.8222222222222222,
260
+ "accuracy_ci_low": 0.7333333333333333,
261
+ "accuracy_ci_high": 0.8888888888888888,
262
+ "score_name": "accuracy",
263
+ "score": 0.8222222222222222,
264
+ "score_ci_high": 0.8888888888888888,
265
+ "score_ci_low": 0.7333333333333333,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.6777777777777778,
270
+ "accuracy_ci_low": 0.5777777777777777,
271
+ "accuracy_ci_high": 0.7666666666666667,
272
+ "score_name": "accuracy",
273
+ "score": 0.6777777777777778,
274
+ "score_ci_high": 0.7666666666666667,
275
+ "score_ci_low": 0.5777777777777777,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8333333333333334,
280
+ "accuracy_ci_low": 0.7555555555555555,
281
+ "accuracy_ci_high": 0.9077323275921318,
282
+ "score_name": "accuracy",
283
+ "score": 0.8333333333333334,
284
+ "score_ci_high": 0.9077323275921318,
285
+ "score_ci_low": 0.7555555555555555,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.797979797979798,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.07200720072007201,
296
+ "score": 0.07200720072007201,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.07200720072007201,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.5089820359281437,
307
+ "f1_Organization": 0.3546511627906977,
308
+ "f1_Location": 0.3474903474903475,
309
+ "f1_macro": 0.4037078487363963,
310
+ "recall_macro": 0.3583554354766996,
311
+ "precision_macro": 0.4822578777124232,
312
+ "in_classes_support": 0.5928057553956835,
313
+ "f1_micro": 0.31311475409836065,
314
+ "recall_micro": 0.3638095238095238,
315
+ "precision_micro": 0.27482014388489207,
316
+ "score": 0.31311475409836065,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.2581926050371807,
319
+ "score_ci_high": 0.35574639217016485,
320
+ "f1_micro_ci_low": 0.2581926050371807,
321
+ "f1_micro_ci_high": 0.35574639217016485
322
+ },
323
+ "score": 0.31311475409836065,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.5633802816901409,
330
+ "accuracy_ci_low": 0.4647887323943662,
331
+ "accuracy_ci_high": 0.676056338028169,
332
+ "score_name": "accuracy",
333
+ "score": 0.5633802816901409,
334
+ "score_ci_high": 0.676056338028169,
335
+ "score_ci_low": 0.4647887323943662,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.2535211267605634,
340
+ "accuracy_ci_low": 0.15492957746478872,
341
+ "accuracy_ci_high": 0.36619718309859156,
342
+ "score_name": "accuracy",
343
+ "score": 0.2535211267605634,
344
+ "score_ci_high": 0.36619718309859156,
345
+ "score_ci_low": 0.15492957746478872,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.23943661971830985,
350
+ "accuracy_ci_low": 0.15492957746478872,
351
+ "accuracy_ci_high": 0.36619718309859156,
352
+ "score_name": "accuracy",
353
+ "score": 0.23943661971830985,
354
+ "score_ci_high": 0.36619718309859156,
355
+ "score_ci_low": 0.15492957746478872,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.352112676056338,
360
+ "accuracy_ci_low": 0.2535211267605634,
361
+ "accuracy_ci_high": 0.47475562822206696,
362
+ "score_name": "accuracy",
363
+ "score": 0.352112676056338,
364
+ "score_ci_high": 0.47475562822206696,
365
+ "score_ci_low": 0.2535211267605634,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.4647887323943662,
370
+ "accuracy_ci_low": 0.3380281690140845,
371
+ "accuracy_ci_high": 0.5774647887323944,
372
+ "score_name": "accuracy",
373
+ "score": 0.4647887323943662,
374
+ "score_ci_high": 0.5774647887323944,
375
+ "score_ci_low": 0.3380281690140845,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.2535211267605634,
380
+ "accuracy_ci_low": 0.15492957746478872,
381
+ "accuracy_ci_high": 0.36619718309859156,
382
+ "score_name": "accuracy",
383
+ "score": 0.2535211267605634,
384
+ "score_ci_high": 0.36619718309859156,
385
+ "score_ci_low": 0.15492957746478872,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.352112676056338,
390
+ "accuracy_ci_low": 0.23943661971830985,
391
+ "accuracy_ci_high": 0.4647887323943662,
392
+ "score_name": "accuracy",
393
+ "score": 0.352112676056338,
394
+ "score_ci_high": 0.4647887323943662,
395
+ "score_ci_low": 0.23943661971830985,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.43661971830985913,
400
+ "accuracy_ci_low": 0.323943661971831,
401
+ "accuracy_ci_high": 0.5633802816901409,
402
+ "score_name": "accuracy",
403
+ "score": 0.43661971830985913,
404
+ "score_ci_high": 0.5633802816901409,
405
+ "score_ci_low": 0.323943661971831,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.30985915492957744,
410
+ "accuracy_ci_low": 0.2112676056338028,
411
+ "accuracy_ci_high": 0.4225352112676056,
412
+ "score_name": "accuracy",
413
+ "score": 0.30985915492957744,
414
+ "score_ci_high": 0.4225352112676056,
415
+ "score_ci_low": 0.2112676056338028,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.11267605633802817,
420
+ "accuracy_ci_low": 0.056338028169014086,
421
+ "accuracy_ci_high": 0.19718309859154928,
422
+ "score_name": "accuracy",
423
+ "score": 0.11267605633802817,
424
+ "score_ci_high": 0.19718309859154928,
425
+ "score_ci_low": 0.056338028169014086,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.323943661971831,
430
+ "accuracy_ci_low": 0.22193333267792992,
431
+ "accuracy_ci_high": 0.43661971830985913,
432
+ "score_name": "accuracy",
433
+ "score": 0.323943661971831,
434
+ "score_ci_high": 0.43661971830985913,
435
+ "score_ci_low": 0.22193333267792992,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.38028169014084506,
440
+ "accuracy_ci_low": 0.2535211267605634,
441
+ "accuracy_ci_high": 0.49295774647887325,
442
+ "score_name": "accuracy",
443
+ "score": 0.38028169014084506,
444
+ "score_ci_high": 0.49295774647887325,
445
+ "score_ci_low": 0.2535211267605634,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.16901408450704225,
450
+ "accuracy_ci_low": 0.09859154929577464,
451
+ "accuracy_ci_high": 0.2676056338028169,
452
+ "score_name": "accuracy",
453
+ "score": 0.16901408450704225,
454
+ "score_ci_high": 0.2676056338028169,
455
+ "score_ci_low": 0.09859154929577464,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.5492957746478874,
460
+ "accuracy_ci_low": 0.43661971830985913,
461
+ "accuracy_ci_high": 0.6619718309859155,
462
+ "score_name": "accuracy",
463
+ "score": 0.5492957746478874,
464
+ "score_ci_high": 0.6619718309859155,
465
+ "score_ci_low": 0.43661971830985913,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.34004024144869216,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.38313112869412325,
475
+ "f1_suggestive": 0.4,
476
+ "f1_descriptive": 0.49056603773584906,
477
+ "f1_generic": 0.1111111111111111,
478
+ "f1_fanciful": 0.5806451612903226,
479
+ "f1_arbitrary": 0.3333333333333333,
480
+ "f1_macro_ci_low": 0.293497629073193,
481
+ "f1_macro_ci_high": 0.49184198170551063,
482
+ "score_name": "f1_micro",
483
+ "score": 0.41420118343195267,
484
+ "score_ci_high": 0.5176470588235295,
485
+ "score_ci_low": 0.3058823529411765,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.4117647058823529,
488
+ "accuracy_ci_low": 0.3058823529411765,
489
+ "accuracy_ci_high": 0.5176470588235295,
490
+ "f1_micro": 0.41420118343195267,
491
+ "f1_micro_ci_low": 0.3058823529411765,
492
+ "f1_micro_ci_high": 0.5176470588235295
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5893994540491356,
496
+ "f1_no": 0.821656050955414,
497
+ "f1_yes": 0.35714285714285715,
498
+ "f1_macro_ci_low": 0.5097301675555063,
499
+ "f1_macro_ci_high": 0.6745862952621396,
500
+ "score_name": "f1_micro",
501
+ "score": 0.7236180904522613,
502
+ "score_ci_high": 0.7788944723618091,
503
+ "score_ci_low": 0.6595134689262127,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.72,
506
+ "accuracy_ci_low": 0.655,
507
+ "accuracy_ci_high": 0.775,
508
+ "f1_micro": 0.7236180904522613,
509
+ "f1_micro_ci_low": 0.6595134689262127,
510
+ "f1_micro_ci_high": 0.7788944723618091
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.27144552754252615,
514
+ "f1_conclusion": 0.07547169811320754,
515
+ "f1_issue": 0.3466666666666667,
516
+ "f1_decree": 0.30303030303030304,
517
+ "f1_rule": 0.475,
518
+ "f1_analysis": 0.2608695652173913,
519
+ "f1_facts": 0.26666666666666666,
520
+ "f1_procedural history": 0.1724137931034483,
521
+ "f1_macro_ci_low": 0.2140755773346065,
522
+ "f1_macro_ci_high": 0.33976570868629163,
523
+ "score_name": "f1_micro",
524
+ "score": 0.28717948717948716,
525
+ "score_ci_high": 0.35384615384615387,
526
+ "score_ci_low": 0.22363125007282936,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.28,
529
+ "accuracy_ci_low": 0.215,
530
+ "accuracy_ci_high": 0.345,
531
+ "f1_micro": 0.28717948717948716,
532
+ "f1_micro_ci_low": 0.22363125007282936,
533
+ "f1_micro_ci_high": 0.35384615384615387
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.4629294755877034,
537
+ "f1_yes": 0.5714285714285714,
538
+ "f1_no": 0.35443037974683544,
539
+ "f1_macro_ci_low": 0.3950714088005718,
540
+ "f1_macro_ci_high": 0.5297273754379386,
541
+ "score_name": "f1_micro",
542
+ "score": 0.48484848484848486,
543
+ "score_ci_high": 0.5532994923857868,
544
+ "score_ci_low": 0.4143244965787704,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.48,
547
+ "accuracy_ci_low": 0.41,
548
+ "accuracy_ci_high": 0.5461813537103201,
549
+ "f1_micro": 0.48484848484848486,
550
+ "f1_micro_ci_low": 0.4143244965787704,
551
+ "f1_micro_ci_high": 0.5532994923857868
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8078127879122904,
555
+ "f1_yes": 0.7761194029850746,
556
+ "f1_no": 0.8395061728395061,
557
+ "f1_macro_ci_low": 0.7164632895646129,
558
+ "f1_macro_ci_high": 0.8689798909122983,
559
+ "score_name": "f1_micro",
560
+ "score": 0.8108108108108109,
561
+ "score_ci_high": 0.8701298701298701,
562
+ "score_ci_low": 0.7222222222222222,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.7058823529411765,
565
+ "accuracy_ci_low": 0.6,
566
+ "accuracy_ci_high": 0.788235294117647,
567
+ "f1_micro": 0.8108108108108109,
568
+ "f1_micro_ci_low": 0.7222222222222222,
569
+ "f1_micro_ci_high": 0.8701298701298701
570
+ },
571
+ "score": 0.5441316113445994,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.4951686908402676,
578
+ "f1_cars": 0.735632183908046,
579
+ "f1_pc hardware": 0.4,
580
+ "f1_windows x": 0.08108108108108109,
581
+ "f1_computer graphics": 0.42201834862385323,
582
+ "f1_atheism": 0.2857142857142857,
583
+ "f1_politics": 0.34210526315789475,
584
+ "f1_religion": 0.22988505747126436,
585
+ "f1_medicine": 0.7631578947368421,
586
+ "f1_christianity": 0.4444444444444444,
587
+ "f1_microsoft windows": 0.3125,
588
+ "f1_middle east": 0.43037974683544306,
589
+ "f1_motorcycles": 0.64,
590
+ "f1_mac hardware": 0.29333333333333333,
591
+ "f1_electronics": 0.5128205128205128,
592
+ "f1_for sale": 0.6904761904761905,
593
+ "f1_guns": 0.32786885245901637,
594
+ "f1_space": 0.7446808510638298,
595
+ "f1_cryptography": 0.5074626865671642,
596
+ "f1_baseball": 0.8598130841121495,
597
+ "f1_hockey": 0.88,
598
+ "f1_macro_ci_low": 0.4680131642390255,
599
+ "f1_macro_ci_high": 0.5255643836143373,
600
+ "score_name": "f1_micro",
601
+ "score": 0.5081081081081081,
602
+ "score_ci_high": 0.5384296879334298,
603
+ "score_ci_low": 0.475620048107115,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.47,
606
+ "accuracy_ci_low": 0.4397222118119949,
607
+ "accuracy_ci_high": 0.501,
608
+ "f1_micro": 0.5081081081081081,
609
+ "f1_micro_ci_low": 0.475620048107115,
610
+ "f1_micro_ci_high": 0.5384296879334298
611
+ },
612
+ "score": 0.5081081081081081,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.5856752687246415,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.900072939460248,
620
+ "f1_checking or savings account": 0.6206896551724138,
621
+ "f1_debt collection": 0.44,
622
+ "f1_credit card or prepaid card": 0.5985401459854015,
623
+ "f1_mortgage": 0.7567567567567568,
624
+ "f1_student loan": 0.8888888888888888,
625
+ "f1_money transfer or virtual currency or money service": 0.55,
626
+ "f1_vehicle loan or lease": 0.5161290322580645,
627
+ "f1_payday loan or title loan or personal loan": 0.0,
628
+ "f1_macro_ci_low": 0.5384285887156623,
629
+ "f1_macro_ci_high": 0.6269737752861375,
630
+ "score_name": "f1_micro",
631
+ "score": 0.8055987558320373,
632
+ "score_ci_high": 0.827979274611399,
633
+ "score_ci_low": 0.7814291760822389,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.777,
636
+ "accuracy_ci_low": 0.749,
637
+ "accuracy_ci_high": 0.802,
638
+ "f1_micro": 0.8055987558320373,
639
+ "f1_micro_ci_low": 0.7814291760822389,
640
+ "f1_micro_ci_high": 0.827979274611399
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.6924499645538243,
644
+ "f1_mortgages and loans": 0.8160919540229885,
645
+ "f1_credit card": 0.7734806629834254,
646
+ "f1_retail banking": 0.562962962962963,
647
+ "f1_debt collection": 0.5959595959595959,
648
+ "f1_credit reporting": 0.7137546468401487,
649
+ "f1_macro_ci_low": 0.6512602475695661,
650
+ "f1_macro_ci_high": 0.7369419146845784,
651
+ "score_name": "f1_micro",
652
+ "score": 0.6980146290491118,
653
+ "score_ci_high": 0.7373210151084457,
654
+ "score_ci_low": 0.6555323590814196,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.668,
657
+ "accuracy_ci_low": 0.6247351354699405,
658
+ "accuracy_ci_high": 0.712,
659
+ "f1_micro": 0.6980146290491118,
660
+ "f1_micro_ci_low": 0.6555323590814196,
661
+ "f1_micro_ci_high": 0.7373210151084457
662
+ },
663
+ "score": 0.7518066924405746,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "execution_accuracy": 0.113,
671
+ "program_accuracy": 0.135,
672
+ "score": 0.135,
673
+ "score_name": "program_accuracy",
674
+ "execution_accuracy_ci_low": 0.094,
675
+ "execution_accuracy_ci_high": 0.134,
676
+ "program_accuracy_ci_low": 0.116,
677
+ "program_accuracy_ci_high": 0.158,
678
+ "score_ci_low": 0.116,
679
+ "score_ci_high": 0.158
680
+ },
681
+ "score": 0.135,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.3279655346823218,
688
+ "recall": 0.5705879433371356,
689
+ "f1": 0.35889039539658296,
690
+ "precision_ci_low": 0.3078086310620982,
691
+ "precision_ci_high": 0.34963819007576297,
692
+ "recall_ci_low": 0.554197356666031,
693
+ "recall_ci_high": 0.5883801367096354,
694
+ "f1_ci_low": 0.3417642169187518,
695
+ "f1_ci_high": 0.37901291711555385,
696
+ "score_name": "f1",
697
+ "score": 0.35889039539658296,
698
+ "score_ci_high": 0.37901291711555385,
699
+ "score_ci_low": 0.3417642169187518,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6124672105411688,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6899554192026456,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5693493613600731,
704
+ "faithfullness_f1_token_overlap": 0.3464559400877777,
705
+ "faithfullness_recall_token_overlap": 0.27600428527077514,
706
+ "faithfullness_precision_token_overlap": 0.5952455002809257,
707
+ "correctness_f1_token_overlap": 0.35889039539658296,
708
+ "correctness_recall_token_overlap": 0.5705879433371356,
709
+ "correctness_precision_token_overlap": 0.3279655346823218
710
+ },
711
+ "score": 0.35889039539658296,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.469,
718
+ "accuracy_ci_low": 0.438,
719
+ "accuracy_ci_high": 0.499,
720
+ "score_name": "accuracy",
721
+ "score": 0.469,
722
+ "score_ci_high": 0.499,
723
+ "score_ci_low": 0.438,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.824,
728
+ "accuracy_ci_low": 0.7898204640532378,
729
+ "accuracy_ci_high": 0.8579264839771418,
730
+ "score_name": "accuracy",
731
+ "score": 0.824,
732
+ "score_ci_high": 0.8579264839771418,
733
+ "score_ci_low": 0.7898204640532378,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.6465,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 0.0,
743
+ "severity_high": 0.0,
744
+ "severity_medium": 1.0,
745
+ "severity_low": 99.0,
746
+ "category_harmful_info": 0.8298518021409916,
747
+ "category_deception": 0.8434558517177297,
748
+ "category_explicit_content": 0.845593557213292,
749
+ "category_violence": 0.8258583758558546,
750
+ "category_discrimination": 0.8266353697057754,
751
+ "category_substance_abuse": 0.8619899458355373,
752
+ "category_pii": 0.8507882124847836,
753
+ "safety": 0.8336919906771638,
754
+ "safety_ci_low": 0.8111003444322545,
755
+ "safety_ci_high": 0.8568521539985686,
756
+ "score_name": "safety",
757
+ "score": 0.8336919906771638,
758
+ "score_ci_high": 0.8568521539985686,
759
+ "score_ci_low": 0.8111003444322545,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.8336919906771638,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rouge2": 0.21160139852857196,
770
+ "rougeLsum": 0.36324031100110515,
771
+ "rouge1": 0.4273940174750716,
772
+ "rougeL": 0.2985567591141555,
773
+ "score": 0.2985567591141555,
774
+ "score_name": "rougeL",
775
+ "rouge2_ci_low": 0.20478163308145209,
776
+ "rouge2_ci_high": 0.21935297407718782,
777
+ "rougeLsum_ci_low": 0.3546878882607467,
778
+ "rougeLsum_ci_high": 0.37154314876152733,
779
+ "rouge1_ci_low": 0.4178746520059863,
780
+ "rouge1_ci_high": 0.43635521340646144,
781
+ "rougeL_ci_low": 0.2916818506638873,
782
+ "rougeL_ci_high": 0.3064012355591934,
783
+ "score_ci_low": 0.2916818506638873,
784
+ "score_ci_high": 0.3064012355591934
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rouge2": 0.015911661871209636,
789
+ "rougeLsum": 0.0956886215682793,
790
+ "rouge1": 0.11520528707442619,
791
+ "rougeL": 0.08350863165548258,
792
+ "score": 0.08350863165548258,
793
+ "score_name": "rougeL",
794
+ "rouge2_ci_low": 0.013985799591312902,
795
+ "rouge2_ci_high": 0.017727653852883076,
796
+ "rougeLsum_ci_low": 0.09148358738071459,
797
+ "rougeLsum_ci_high": 0.10004441271360605,
798
+ "rouge1_ci_low": 0.10996324785054311,
799
+ "rouge1_ci_high": 0.1203422582590639,
800
+ "rougeL_ci_low": 0.07993462762229471,
801
+ "rougeL_ci_high": 0.0872963198676006,
802
+ "score_ci_low": 0.07993462762229471,
803
+ "score_ci_high": 0.0872963198676006
804
+ },
805
+ "score": 0.19103269538481904,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1160,
814
+ 634,
815
+ 402,
816
+ 263
817
+ ],
818
+ "totals": [
819
+ 3432,
820
+ 3366,
821
+ 3300,
822
+ 3234
823
+ ],
824
+ "precisions": [
825
+ 0.337995337995338,
826
+ 0.1883541295306001,
827
+ 0.12181818181818181,
828
+ 0.08132343846629561
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 3432,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.1584723235237399,
834
+ "score": 0.1584723235237399,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.11474709135550289,
837
+ "score_ci_high": 0.20012320167306266,
838
+ "sacrebleu_ci_low": 0.11474709135550289,
839
+ "sacrebleu_ci_high": 0.20012320167306266
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1242,
845
+ 746,
846
+ 497,
847
+ 332
848
+ ],
849
+ "totals": [
850
+ 3635,
851
+ 3569,
852
+ 3503,
853
+ 3437
854
+ ],
855
+ "precisions": [
856
+ 0.34167812929848695,
857
+ 0.20902213505183526,
858
+ 0.14187838995147017,
859
+ 0.09659586848996218
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 3635,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.17687687871183358,
865
+ "score": 0.17687687871183358,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.13879938364214875,
868
+ "score_ci_high": 0.2223271554471336,
869
+ "sacrebleu_ci_low": 0.13879938364214875,
870
+ "sacrebleu_ci_high": 0.2223271554471336
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 707,
876
+ 291,
877
+ 137,
878
+ 67
879
+ ],
880
+ "totals": [
881
+ 2678,
882
+ 2612,
883
+ 2546,
884
+ 2480
885
+ ],
886
+ "precisions": [
887
+ 0.26400298730395816,
888
+ 0.11140888208269524,
889
+ 0.053809897879025924,
890
+ 0.027016129032258064
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 2678,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.08086367724146439,
896
+ "score": 0.08086367724146439,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.060707255296144236,
899
+ "score_ci_high": 0.11207981375295485,
900
+ "sacrebleu_ci_low": 0.060707255296144236,
901
+ "sacrebleu_ci_high": 0.11207981375295485
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 1105,
907
+ 576,
908
+ 338,
909
+ 205
910
+ ],
911
+ "totals": [
912
+ 2865,
913
+ 2799,
914
+ 2733,
915
+ 2667
916
+ ],
917
+ "precisions": [
918
+ 0.3856893542757417,
919
+ 0.2057877813504823,
920
+ 0.12367361873399195,
921
+ 0.07686539182602176
922
+ ],
923
+ "bp": 1.0,
924
+ "sys_len": 2865,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.1657357842387588,
927
+ "score": 0.1657357842387588,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.1255662861475735,
930
+ "score_ci_high": 0.19513128530364274,
931
+ "sacrebleu_ci_low": 0.1255662861475735,
932
+ "sacrebleu_ci_high": 0.19513128530364274
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1425,
938
+ 950,
939
+ 689,
940
+ 512
941
+ ],
942
+ "totals": [
943
+ 3952,
944
+ 3886,
945
+ 3820,
946
+ 3754
947
+ ],
948
+ "precisions": [
949
+ 0.3605769230769231,
950
+ 0.24446731857951623,
951
+ 0.18036649214659686,
952
+ 0.13638785295684602
953
+ ],
954
+ "bp": 1.0,
955
+ "sys_len": 3952,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.21579310909975802,
958
+ "score": 0.21579310909975802,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.17167305584926412,
961
+ "score_ci_high": 0.2671975553706823,
962
+ "sacrebleu_ci_low": 0.17167305584926412,
963
+ "sacrebleu_ci_high": 0.2671975553706823
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 1144,
969
+ 510,
970
+ 256,
971
+ 121
972
+ ],
973
+ "totals": [
974
+ 4088,
975
+ 4022,
976
+ 3956,
977
+ 3890
978
+ ],
979
+ "precisions": [
980
+ 0.27984344422700586,
981
+ 0.1268025857782198,
982
+ 0.06471183013144591,
983
+ 0.031105398457583547
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 4088,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.09193178117454374,
989
+ "score": 0.09193178117454374,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.07510753790698527,
992
+ "score_ci_high": 0.10802722708228213,
993
+ "sacrebleu_ci_low": 0.07510753790698527,
994
+ "sacrebleu_ci_high": 0.10802722708228213
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1355,
1000
+ 895,
1001
+ 641,
1002
+ 477
1003
+ ],
1004
+ "totals": [
1005
+ 3672,
1006
+ 3606,
1007
+ 3540,
1008
+ 3474
1009
+ ],
1010
+ "precisions": [
1011
+ 0.3690087145969499,
1012
+ 0.24819744869661675,
1013
+ 0.1810734463276836,
1014
+ 0.1373056994818653
1015
+ ],
1016
+ "bp": 1.0,
1017
+ "sys_len": 3672,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.21844611082203133,
1020
+ "score": 0.21844611082203133,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.1740922279282215,
1023
+ "score_ci_high": 0.2747838995139129,
1024
+ "sacrebleu_ci_low": 0.1740922279282215,
1025
+ "sacrebleu_ci_high": 0.2747838995139129
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 958,
1031
+ 446,
1032
+ 242,
1033
+ 144
1034
+ ],
1035
+ "totals": [
1036
+ 3143,
1037
+ 3077,
1038
+ 3011,
1039
+ 2945
1040
+ ],
1041
+ "precisions": [
1042
+ 0.304804327076042,
1043
+ 0.1449463763405915,
1044
+ 0.080371969445367,
1045
+ 0.048896434634974534
1046
+ ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 3143,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.11478960818381517,
1051
+ "score": 0.11478960818381517,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.09008415206585028,
1054
+ "score_ci_high": 0.15651321779474522,
1055
+ "sacrebleu_ci_low": 0.09008415206585028,
1056
+ "sacrebleu_ci_high": 0.15651321779474522
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1223,
1062
+ 632,
1063
+ 362,
1064
+ 212
1065
+ ],
1066
+ "totals": [
1067
+ 3461,
1068
+ 3395,
1069
+ 3329,
1070
+ 3263
1071
+ ],
1072
+ "precisions": [
1073
+ 0.35336607916787055,
1074
+ 0.1861561119293078,
1075
+ 0.10874136377290478,
1076
+ 0.06497088568801716
1077
+ ],
1078
+ "bp": 1.0,
1079
+ "sys_len": 3461,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.14682632542570678,
1082
+ "score": 0.14682632542570678,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.11971923721446943,
1085
+ "score_ci_high": 0.18346369643073177,
1086
+ "sacrebleu_ci_low": 0.11971923721446943,
1087
+ "sacrebleu_ci_high": 0.18346369643073177
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1245,
1093
+ 754,
1094
+ 504,
1095
+ 346
1096
+ ],
1097
+ "totals": [
1098
+ 3378,
1099
+ 3312,
1100
+ 3246,
1101
+ 3180
1102
+ ],
1103
+ "precisions": [
1104
+ 0.3685612788632327,
1105
+ 0.2276570048309179,
1106
+ 0.15526802218114602,
1107
+ 0.10880503144654088
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 3378,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.19403515904057747,
1113
+ "score": 0.19403515904057747,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.16072549920746337,
1116
+ "score_ci_high": 0.25567936336289826,
1117
+ "sacrebleu_ci_low": 0.16072549920746337,
1118
+ "sacrebleu_ci_high": 0.25567936336289826
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 1030,
1124
+ 467,
1125
+ 241,
1126
+ 128
1127
+ ],
1128
+ "totals": [
1129
+ 3273,
1130
+ 3207,
1131
+ 3141,
1132
+ 3075
1133
+ ],
1134
+ "precisions": [
1135
+ 0.3146959975557592,
1136
+ 0.14561895852821952,
1137
+ 0.07672715695638332,
1138
+ 0.04162601626016261
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 3273,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.10999065136220543,
1144
+ "score": 0.10999065136220543,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.08125966435750204,
1147
+ "score_ci_high": 0.15014434309739183,
1148
+ "sacrebleu_ci_low": 0.08125966435750204,
1149
+ "sacrebleu_ci_high": 0.15014434309739183
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 1040,
1155
+ 465,
1156
+ 241,
1157
+ 132
1158
+ ],
1159
+ "totals": [
1160
+ 3703,
1161
+ 3637,
1162
+ 3571,
1163
+ 3505
1164
+ ],
1165
+ "precisions": [
1166
+ 0.28085336213880635,
1167
+ 0.12785262579048667,
1168
+ 0.06748809857182862,
1169
+ 0.037660485021398
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 3703,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.09774073377105962,
1175
+ "score": 0.09774073377105962,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.0819836886276176,
1178
+ "score_ci_high": 0.12072091726722378,
1179
+ "sacrebleu_ci_low": 0.0819836886276176,
1180
+ "sacrebleu_ci_high": 0.12072091726722378
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1278,
1186
+ 818,
1187
+ 573,
1188
+ 412
1189
+ ],
1190
+ "totals": [
1191
+ 3168,
1192
+ 3102,
1193
+ 3036,
1194
+ 2970
1195
+ ],
1196
+ "precisions": [
1197
+ 0.40340909090909094,
1198
+ 0.2637008381689233,
1199
+ 0.18873517786561267,
1200
+ 0.13872053872053872
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 3168,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.22972735015632778,
1206
+ "score": 0.22972735015632778,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.17875705428567942,
1209
+ "score_ci_high": 0.30188244314298573,
1210
+ "sacrebleu_ci_low": 0.17875705428567942,
1211
+ "sacrebleu_ci_high": 0.30188244314298573
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1283,
1217
+ 796,
1218
+ 537,
1219
+ 365
1220
+ ],
1221
+ "totals": [
1222
+ 4400,
1223
+ 4334,
1224
+ 4268,
1225
+ 4202
1226
+ ],
1227
+ "precisions": [
1228
+ 0.2915909090909091,
1229
+ 0.18366405168435626,
1230
+ 0.12582005623242737,
1231
+ 0.08686339838172298
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 4400,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.1555414731878905,
1237
+ "score": 0.1555414731878905,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.12023011304708842,
1240
+ "score_ci_high": 0.20687673778594803,
1241
+ "sacrebleu_ci_low": 0.12023011304708842,
1242
+ "sacrebleu_ci_high": 0.20687673778594803
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1155,
1248
+ 607,
1249
+ 355,
1250
+ 220
1251
+ ],
1252
+ "totals": [
1253
+ 2824,
1254
+ 2758,
1255
+ 2692,
1256
+ 2626
1257
+ ],
1258
+ "precisions": [
1259
+ 0.4089943342776204,
1260
+ 0.22008701957940538,
1261
+ 0.13187221396731055,
1262
+ 0.08377760853008377
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 2824,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.17758171439939147,
1268
+ "score": 0.17758171439939147,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.13942620008407763,
1271
+ "score_ci_high": 0.23700138449810748,
1272
+ "sacrebleu_ci_low": 0.13942620008407763,
1273
+ "sacrebleu_ci_high": 0.23700138449810748
1274
+ },
1275
+ "score": 0.15562351202260694,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.4344559230477983,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
results/bluebench/{2025-06-19T18-10-05_evaluation_results.json → 2025-06-23T05-36-33_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-19T22:09:59.730715Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,564 +176,564 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.5666666666666667,
180
- "accuracy_ci_low": 0.4666666666666667,
181
- "accuracy_ci_high": 0.6777777777777778,
182
  "score_name": "accuracy",
183
- "score": 0.5666666666666667,
184
- "score_ci_high": 0.6777777777777778,
185
- "score_ci_low": 0.4666666666666667,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.7333333333333333,
190
- "accuracy_ci_low": 0.6333333333333333,
191
- "accuracy_ci_high": 0.8222222222222222,
192
  "score_name": "accuracy",
193
- "score": 0.7333333333333333,
194
- "score_ci_high": 0.8222222222222222,
195
- "score_ci_low": 0.6333333333333333,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.8222222222222222,
200
- "accuracy_ci_low": 0.7444444444444445,
201
- "accuracy_ci_high": 0.9,
202
  "score_name": "accuracy",
203
- "score": 0.8222222222222222,
204
- "score_ci_high": 0.9,
205
- "score_ci_low": 0.7444444444444445,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.6444444444444445,
210
- "accuracy_ci_low": 0.5333333333333333,
211
- "accuracy_ci_high": 0.7333333333333333,
212
  "score_name": "accuracy",
213
- "score": 0.6444444444444445,
214
- "score_ci_high": 0.7333333333333333,
215
- "score_ci_low": 0.5333333333333333,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.6888888888888889,
220
- "accuracy_ci_low": 0.5888888888888889,
221
- "accuracy_ci_high": 0.7777777777777778,
222
  "score_name": "accuracy",
223
- "score": 0.6888888888888889,
224
- "score_ci_high": 0.7777777777777778,
225
- "score_ci_low": 0.5888888888888889,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.8111111111111111,
230
- "accuracy_ci_low": 0.7222222222222222,
231
- "accuracy_ci_high": 0.8777777777777778,
232
  "score_name": "accuracy",
233
- "score": 0.8111111111111111,
234
- "score_ci_high": 0.8777777777777778,
235
- "score_ci_low": 0.7222222222222222,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.7222222222222222,
240
- "accuracy_ci_low": 0.6111111111111112,
241
- "accuracy_ci_high": 0.8,
242
  "score_name": "accuracy",
243
- "score": 0.7222222222222222,
244
- "score_ci_high": 0.8,
245
- "score_ci_low": 0.6111111111111112,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 0.7111111111111111,
250
- "accuracy_ci_low": 0.6111111111111112,
251
- "accuracy_ci_high": 0.8,
252
  "score_name": "accuracy",
253
- "score": 0.7111111111111111,
254
- "score_ci_high": 0.8,
255
- "score_ci_low": 0.6111111111111112,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.7111111111111111,
260
- "accuracy_ci_low": 0.6111111111111112,
261
- "accuracy_ci_high": 0.8,
262
  "score_name": "accuracy",
263
- "score": 0.7111111111111111,
264
- "score_ci_high": 0.8,
265
- "score_ci_low": 0.6111111111111112,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.7666666666666667,
270
- "accuracy_ci_low": 0.6666666666666666,
271
- "accuracy_ci_high": 0.8444444444444444,
272
  "score_name": "accuracy",
273
- "score": 0.7666666666666667,
274
- "score_ci_high": 0.8444444444444444,
275
- "score_ci_low": 0.6666666666666666,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8,
280
- "accuracy_ci_low": 0.7111111111111111,
281
- "accuracy_ci_high": 0.8777777777777778,
282
  "score_name": "accuracy",
283
- "score": 0.8,
284
- "score_ci_high": 0.8777777777777778,
285
- "score_ci_low": 0.7111111111111111,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.7252525252525253,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.5026737967914439,
307
- "f1_Organization": 0.2875,
308
- "f1_Location": 0.28571428571428575,
309
- "f1_macro": 0.35862936083524316,
310
- "recall_macro": 0.3171773628661296,
311
- "precision_macro": 0.4188335014421971,
312
- "in_classes_support": 0.7664783427495292,
313
- "f1_micro": 0.32954545454545453,
314
- "recall_micro": 0.3314285714285714,
315
- "precision_micro": 0.327683615819209,
316
- "score": 0.32954545454545453,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.28111538926666857,
319
- "score_ci_high": 0.37977770945501865,
320
- "f1_micro_ci_low": 0.28111538926666857,
321
- "f1_micro_ci_high": 0.37977770945501865
322
  },
323
- "score": 0.32954545454545453,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.4084507042253521,
330
- "accuracy_ci_low": 0.30985915492957744,
331
- "accuracy_ci_high": 0.5211267605633803,
332
  "score_name": "accuracy",
333
- "score": 0.4084507042253521,
334
- "score_ci_high": 0.5211267605633803,
335
- "score_ci_low": 0.30985915492957744,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.2535211267605634,
340
- "accuracy_ci_low": 0.15492957746478872,
341
- "accuracy_ci_high": 0.36619718309859156,
342
  "score_name": "accuracy",
343
- "score": 0.2535211267605634,
344
- "score_ci_high": 0.36619718309859156,
345
- "score_ci_low": 0.15492957746478872,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.19718309859154928,
350
- "accuracy_ci_low": 0.11267605633802817,
351
- "accuracy_ci_high": 0.29577464788732394,
352
  "score_name": "accuracy",
353
- "score": 0.19718309859154928,
354
- "score_ci_high": 0.29577464788732394,
355
- "score_ci_low": 0.11267605633802817,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
  "accuracy": 0.29577464788732394,
360
  "accuracy_ci_low": 0.19718309859154928,
361
- "accuracy_ci_high": 0.39436619718309857,
362
  "score_name": "accuracy",
363
  "score": 0.29577464788732394,
364
- "score_ci_high": 0.39436619718309857,
365
  "score_ci_low": 0.19718309859154928,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.4788732394366197,
370
- "accuracy_ci_low": 0.36619718309859156,
371
- "accuracy_ci_high": 0.5915492957746479,
372
  "score_name": "accuracy",
373
- "score": 0.4788732394366197,
374
- "score_ci_high": 0.5915492957746479,
375
- "score_ci_low": 0.36619718309859156,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.19718309859154928,
380
- "accuracy_ci_low": 0.11267605633802817,
381
- "accuracy_ci_high": 0.29577464788732394,
382
  "score_name": "accuracy",
383
- "score": 0.19718309859154928,
384
- "score_ci_high": 0.29577464788732394,
385
- "score_ci_low": 0.11267605633802817,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.29577464788732394,
390
- "accuracy_ci_low": 0.19718309859154928,
391
- "accuracy_ci_high": 0.4225352112676056,
392
  "score_name": "accuracy",
393
- "score": 0.29577464788732394,
394
- "score_ci_high": 0.4225352112676056,
395
- "score_ci_low": 0.19718309859154928,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.30985915492957744,
400
- "accuracy_ci_low": 0.2112676056338028,
401
- "accuracy_ci_high": 0.4225352112676056,
402
  "score_name": "accuracy",
403
- "score": 0.30985915492957744,
404
- "score_ci_high": 0.4225352112676056,
405
- "score_ci_low": 0.2112676056338028,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.22535211267605634,
410
- "accuracy_ci_low": 0.14084507042253522,
411
- "accuracy_ci_high": 0.323943661971831,
412
  "score_name": "accuracy",
413
- "score": 0.22535211267605634,
414
- "score_ci_high": 0.323943661971831,
415
- "score_ci_low": 0.14084507042253522,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.11267605633802817,
420
- "accuracy_ci_low": 0.056338028169014086,
421
- "accuracy_ci_high": 0.19718309859154928,
422
  "score_name": "accuracy",
423
- "score": 0.11267605633802817,
424
- "score_ci_high": 0.19718309859154928,
425
- "score_ci_low": 0.056338028169014086,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.23943661971830985,
430
- "accuracy_ci_low": 0.15492957746478872,
431
- "accuracy_ci_high": 0.352112676056338,
432
  "score_name": "accuracy",
433
- "score": 0.23943661971830985,
434
- "score_ci_high": 0.352112676056338,
435
- "score_ci_low": 0.15492957746478872,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.2676056338028169,
440
- "accuracy_ci_low": 0.16901408450704225,
441
- "accuracy_ci_high": 0.38028169014084506,
442
  "score_name": "accuracy",
443
- "score": 0.2676056338028169,
444
- "score_ci_high": 0.38028169014084506,
445
- "score_ci_low": 0.16901408450704225,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.15492957746478872,
450
- "accuracy_ci_low": 0.08450704225352113,
451
- "accuracy_ci_high": 0.2535211267605634,
452
  "score_name": "accuracy",
453
- "score": 0.15492957746478872,
454
- "score_ci_high": 0.2535211267605634,
455
- "score_ci_low": 0.08450704225352113,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.49295774647887325,
460
- "accuracy_ci_low": 0.38028169014084506,
461
- "accuracy_ci_high": 0.6197183098591549,
462
  "score_name": "accuracy",
463
- "score": 0.49295774647887325,
464
- "score_ci_high": 0.6197183098591549,
465
- "score_ci_low": 0.38028169014084506,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.2806841046277666,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.322066985645933,
475
- "f1_suggestive": 0.2,
476
- "f1_descriptive": 0.4,
477
- "f1_generic": 0.3157894736842105,
478
- "f1_arbitrary": 0.45454545454545453,
479
- "f1_fanciful": 0.24,
480
- "f1_macro_ci_low": 0.2326130794928424,
481
- "f1_macro_ci_high": 0.4327896628836512,
482
  "score_name": "f1_micro",
483
- "score": 0.34523809523809523,
484
- "score_ci_high": 0.44408416032543374,
485
- "score_ci_low": 0.25,
486
  "num_of_instances": 85,
487
- "accuracy": 0.3411764705882353,
488
- "accuracy_ci_low": 0.24705882352941178,
489
- "accuracy_ci_high": 0.43529411764705883,
490
- "f1_micro": 0.34523809523809523,
491
- "f1_micro_ci_low": 0.25,
492
- "f1_micro_ci_high": 0.44408416032543374
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.4862053369516056,
496
- "f1_no": 0.48484848484848486,
497
- "f1_yes": 0.48756218905472637,
498
- "f1_macro_ci_low": 0.4185718876526183,
499
- "f1_macro_ci_high": 0.5583302726222448,
500
  "score_name": "f1_micro",
501
- "score": 0.48621553884711777,
502
- "score_ci_high": 0.555,
503
- "score_ci_low": 0.41708542713567837,
504
  "num_of_instances": 200,
505
- "accuracy": 0.485,
506
- "accuracy_ci_low": 0.415,
507
- "accuracy_ci_high": 0.555,
508
- "f1_micro": 0.48621553884711777,
509
- "f1_micro_ci_low": 0.41708542713567837,
510
- "f1_micro_ci_high": 0.555
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.21190211834221687,
514
- "f1_conclusion": 0.14634146341463414,
515
- "f1_analysis": 0.3673469387755102,
516
  "f1_decree": 0.07692307692307693,
517
- "f1_issue": 0.2631578947368421,
518
- "f1_facts": 0.13333333333333333,
519
- "f1_procedural history": 0.12121212121212122,
520
- "f1_rule": 0.375,
521
- "f1_macro_ci_low": 0.16201147151923767,
522
- "f1_macro_ci_high": 0.27835110124455087,
523
  "score_name": "f1_micro",
524
- "score": 0.24427480916030533,
525
- "score_ci_high": 0.3110332844595539,
526
- "score_ci_low": 0.18933051276149385,
527
  "num_of_instances": 200,
528
- "accuracy": 0.24,
529
- "accuracy_ci_low": 0.185,
530
- "accuracy_ci_high": 0.305,
531
- "f1_micro": 0.24427480916030533,
532
- "f1_micro_ci_low": 0.18933051276149385,
533
- "f1_micro_ci_high": 0.3110332844595539
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5140298832430668,
537
- "f1_yes": 0.541871921182266,
538
- "f1_no": 0.4861878453038674,
539
- "f1_macro_ci_low": 0.44358226014090585,
540
- "f1_macro_ci_high": 0.5836656602180865,
541
  "score_name": "f1_micro",
542
- "score": 0.515625,
543
- "score_ci_high": 0.583858269920324,
544
- "score_ci_low": 0.4443197729294639,
545
  "num_of_instances": 200,
546
- "accuracy": 0.495,
547
- "accuracy_ci_low": 0.43,
548
- "accuracy_ci_high": 0.565,
549
- "f1_micro": 0.515625,
550
- "f1_micro_ci_low": 0.4443197729294639,
551
- "f1_micro_ci_high": 0.583858269920324
552
  },
553
  "legalbench_proa": {
554
  "f1_macro": 0.726027397260274,
555
  "f1_yes": 0.7123287671232876,
556
  "f1_no": 0.7397260273972602,
557
- "f1_macro_ci_low": 0.6205412546681461,
558
- "f1_macro_ci_high": 0.8095667611328509,
559
  "score_name": "f1_micro",
560
  "score": 0.726027397260274,
561
- "score_ci_high": 0.8079470198675497,
562
- "score_ci_low": 0.6153846153846154,
563
  "num_of_instances": 85,
564
  "accuracy": 0.6235294117647059,
565
  "accuracy_ci_low": 0.5058823529411764,
566
  "accuracy_ci_high": 0.7176470588235294,
567
  "f1_micro": 0.726027397260274,
568
- "f1_micro_ci_low": 0.6153846153846154,
569
- "f1_micro_ci_high": 0.8079470198675497
570
  },
571
- "score": 0.46347616810115844,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.3839629098938318,
578
- "f1_cars": 0.6804123711340206,
579
- "f1_windows x": 0.03125,
580
- "f1_atheism": 0.48,
581
- "f1_christianity": 0.425,
582
- "f1_religion": 0.18461538461538463,
583
  "f1_medicine": 0.6376811594202898,
584
- "f1_computer graphics": 0.27979274611398963,
585
- "f1_microsoft windows": 0.35294117647058826,
586
- "f1_middle east": 0.12244897959183673,
587
- "f1_politics": 0.26666666666666666,
588
- "f1_motorcycles": 0.47619047619047616,
589
- "f1_mac hardware": 0.14492753623188406,
590
- "f1_pc hardware": 0.358974358974359,
591
- "f1_for sale": 0.3018867924528302,
592
- "f1_guns": 0.2,
593
- "f1_baseball": 0.8130081300813008,
594
  "f1_space": 0.5194805194805194,
595
- "f1_cryptography": 0.3466666666666667,
596
- "f1_electronics": 0.41025641025641024,
597
- "f1_hockey": 0.6470588235294118,
598
- "f1_macro_ci_low": 0.35856171946837523,
599
- "f1_macro_ci_high": 0.41714627316225344,
600
  "score_name": "f1_micro",
601
- "score": 0.4028352037802717,
602
- "score_ci_high": 0.433682467300905,
603
- "score_ci_low": 0.3727273654547539,
604
  "num_of_instances": 1000,
605
- "accuracy": 0.341,
606
- "accuracy_ci_low": 0.312,
607
- "accuracy_ci_high": 0.369,
608
- "f1_micro": 0.4028352037802717,
609
- "f1_micro_ci_low": 0.3727273654547539,
610
- "f1_micro_ci_high": 0.433682467300905
611
  },
612
- "score": 0.4028352037802717,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.522130366066412,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9174311926605505,
620
- "f1_credit card or prepaid card": 0.2619047619047619,
621
- "f1_debt collection": 0.47904191616766467,
622
- "f1_checking or savings account": 0.6534653465346535,
623
- "f1_money transfer or virtual currency or money service": 0.56,
624
- "f1_vehicle loan or lease": 0.23076923076923078,
625
- "f1_mortgage": 0.7037037037037037,
626
- "f1_payday loan or title loan or personal loan": 0.14285714285714285,
627
  "f1_student loan": 0.75,
628
- "f1_macro_ci_low": 0.47475792993209676,
629
- "f1_macro_ci_high": 0.5842900552170582,
630
  "score_name": "f1_micro",
631
- "score": 0.8006150691952845,
632
- "score_ci_high": 0.8235944353763129,
633
- "score_ci_low": 0.7741691905584849,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.781,
636
- "accuracy_ci_low": 0.753,
637
- "accuracy_ci_high": 0.805,
638
- "f1_micro": 0.8006150691952845,
639
- "f1_micro_ci_low": 0.7741691905584849,
640
- "f1_micro_ci_high": 0.8235944353763129
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.6094652526866444,
644
- "f1_mortgages and loans": 0.7821229050279329,
645
- "f1_credit card": 0.6666666666666666,
646
- "f1_debt collection": 0.5684210526315789,
647
- "f1_retail banking": 0.2912621359223301,
648
- "f1_credit reporting": 0.7388535031847133,
649
- "f1_macro_ci_low": 0.5670634281009035,
650
- "f1_macro_ci_high": 0.6527536805294223,
651
  "score_name": "f1_micro",
652
- "score": 0.6524390243902439,
653
- "score_ci_high": 0.6904276985743381,
654
- "score_ci_low": 0.6066261962892265,
655
  "num_of_instances": 500,
656
- "accuracy": 0.642,
657
- "accuracy_ci_low": 0.594,
658
- "accuracy_ci_high": 0.68,
659
- "f1_micro": 0.6524390243902439,
660
- "f1_micro_ci_low": 0.6066261962892265,
661
- "f1_micro_ci_high": 0.6904276985743381
662
  },
663
- "score": 0.7265270467927643,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "program_accuracy": 0.038,
671
- "score": 0.038,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.03,
674
- "program_accuracy_ci_low": 0.027702359114314717,
675
- "program_accuracy_ci_high": 0.05038214389818779,
676
- "score_ci_low": 0.027702359114314717,
677
- "score_ci_high": 0.05038214389818779,
678
- "execution_accuracy_ci_low": 0.021,
679
- "execution_accuracy_ci_high": 0.042
680
  },
681
- "score": 0.038,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.3238378212061128,
688
- "recall": 0.5112460699696839,
689
- "f1": 0.32731883927753985,
690
- "precision_ci_low": 0.3032630188662264,
691
- "precision_ci_high": 0.34255213352234126,
692
- "recall_ci_low": 0.4946874887919962,
693
- "recall_ci_high": 0.5283045722843877,
694
- "f1_ci_low": 0.3123215475894989,
695
- "f1_ci_high": 0.3436240931728296,
696
  "score_name": "f1",
697
- "score": 0.32731883927753985,
698
- "score_ci_high": 0.3436240931728296,
699
- "score_ci_low": 0.3123215475894989,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5872805261611939,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6561146193246047,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5541991981615623,
704
- "faithfullness_f1_token_overlap": 0.2552259281757564,
705
- "faithfullness_recall_token_overlap": 0.18820051905597926,
706
- "faithfullness_precision_token_overlap": 0.5563191178123087,
707
- "correctness_f1_token_overlap": 0.32731883927753985,
708
- "correctness_recall_token_overlap": 0.5112460699696839,
709
- "correctness_precision_token_overlap": 0.3238378212061128
710
  },
711
- "score": 0.32731883927753985,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.435,
718
- "accuracy_ci_low": 0.403,
719
- "accuracy_ci_high": 0.4660514308680489,
720
  "score_name": "accuracy",
721
- "score": 0.435,
722
- "score_ci_high": 0.4660514308680489,
723
- "score_ci_low": 0.403,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.744,
728
- "accuracy_ci_low": 0.704,
729
- "accuracy_ci_high": 0.782,
730
  "score_name": "accuracy",
731
- "score": 0.744,
732
- "score_ci_high": 0.782,
733
- "score_ci_low": 0.704,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.5895,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
@@ -743,66 +743,66 @@
743
  "severity_high": 0.0,
744
  "severity_medium": 0.0,
745
  "severity_low": 100.0,
746
- "category_harmful_info": 0.8374238369454232,
747
- "category_deception": 0.8230129361453682,
748
- "category_explicit_content": 0.8649854577249951,
749
- "category_violence": 0.8130838501311484,
750
- "category_discrimination": 0.8325134645777916,
751
- "category_substance_abuse": 0.9021757920583088,
752
- "category_pii": 0.8319368116833545,
753
- "safety": 0.8330806816902219,
754
- "safety_ci_low": 0.8160996919801022,
755
- "safety_ci_high": 0.8485437387510895,
756
  "score_name": "safety",
757
- "score": 0.8330806816902219,
758
- "score_ci_high": 0.8485437387510895,
759
- "score_ci_low": 0.8160996919801022,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8330806816902219,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rouge2": 0.20526331379083082,
770
- "rougeLsum": 0.3552440537707598,
771
- "rougeL": 0.2898741985263412,
772
- "score": 0.2898741985263412,
773
  "score_name": "rougeL",
774
- "rouge1": 0.41514442247584377,
775
- "rouge2_ci_low": 0.19811392792291177,
776
- "rouge2_ci_high": 0.21345928409773357,
777
- "rougeLsum_ci_low": 0.34590687620199956,
778
- "rougeLsum_ci_high": 0.3636463972631658,
779
- "rougeL_ci_low": 0.2825891854116913,
780
- "rougeL_ci_high": 0.29754587598623977,
781
- "score_ci_low": 0.2825891854116913,
782
- "score_ci_high": 0.29754587598623977,
783
- "rouge1_ci_low": 0.404637721565675,
784
- "rouge1_ci_high": 0.4244973791646393
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rouge2": 0.01759477613392835,
789
- "rougeLsum": 0.10341637065855062,
790
- "rougeL": 0.09030341343927119,
791
- "score": 0.09030341343927119,
792
  "score_name": "rougeL",
793
- "rouge1": 0.12511971491959425,
794
- "rouge2_ci_low": 0.0157175663284522,
795
- "rouge2_ci_high": 0.01952934654149148,
796
- "rougeLsum_ci_low": 0.099113259070034,
797
- "rougeLsum_ci_high": 0.10780067044910371,
798
- "rougeL_ci_low": 0.08652503643426336,
799
- "rougeL_ci_high": 0.09415506056724576,
800
- "score_ci_low": 0.08652503643426336,
801
- "score_ci_high": 0.09415506056724576,
802
- "rouge1_ci_low": 0.11934211041104513,
803
- "rouge1_ci_high": 0.1306891027165195
804
  },
805
- "score": 0.1900888059828062,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,473 +810,473 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 1145,
814
- 640,
815
- 404,
816
- 258
817
  ],
818
  "totals": [
819
- 1853,
820
- 1787,
821
- 1721,
822
- 1655
823
  ],
824
  "precisions": [
825
- 0.6179168915272532,
826
- 0.35814213766088415,
827
- 0.2347472399767577,
828
- 0.15589123867069488
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1853,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.2999866463267908,
834
- "score": 0.2999866463267908,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.25379441814797343,
837
- "score_ci_high": 0.34071904065425035,
838
- "sacrebleu_ci_low": 0.25379441814797343,
839
- "sacrebleu_ci_high": 0.34071904065425035
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1252,
845
- 756,
846
- 497,
847
- 327
848
  ],
849
  "totals": [
850
- 1813,
851
- 1747,
852
- 1681,
853
- 1615
854
  ],
855
  "precisions": [
856
- 0.6905681191395477,
857
- 0.4327418431597023,
858
- 0.2956573468173706,
859
- 0.20247678018575851
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1813,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.3657209415128666,
865
- "score": 0.3657209415128666,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.3364370770791262,
868
- "score_ci_high": 0.40451448818750857,
869
- "sacrebleu_ci_low": 0.3364370770791262,
870
- "sacrebleu_ci_high": 0.40451448818750857
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
  613,
876
- 215,
877
- 85,
878
- 31
879
  ],
880
  "totals": [
881
- 1681,
882
- 1615,
883
- 1549,
884
- 1483
885
  ],
886
  "precisions": [
887
- 0.36466389054134446,
888
- 0.1331269349845201,
889
- 0.05487411233053583,
890
- 0.020903573836817263
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 1681,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.08638467153981859,
896
- "score": 0.08638467153981859,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.07076583616200617,
899
- "score_ci_high": 0.10779016383654101,
900
- "sacrebleu_ci_low": 0.07076583616200617,
901
- "sacrebleu_ci_high": 0.10779016383654101
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 1062,
907
- 552,
908
- 321,
909
- 192
910
  ],
911
  "totals": [
912
- 1791,
913
- 1725,
914
- 1659,
915
- 1593
916
  ],
917
  "precisions": [
918
- 0.592964824120603,
919
- 0.32,
920
- 0.19349005424954793,
921
- 0.12052730696798493
922
  ],
923
- "bp": 0.9757320386302776,
924
- "sys_len": 1791,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.25165833423579964,
927
- "score": 0.25165833423579964,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.21420916937636764,
930
- "score_ci_high": 0.29216503924411175,
931
- "sacrebleu_ci_low": 0.21420916937636764,
932
- "sacrebleu_ci_high": 0.29216503924411175
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1358,
938
- 892,
939
- 643,
940
- 470
941
  ],
942
  "totals": [
943
- 2016,
944
- 1950,
945
- 1884,
946
- 1818
947
  ],
948
  "precisions": [
949
- 0.6736111111111112,
950
- 0.45743589743589746,
951
- 0.3412951167728238,
952
- 0.2585258525852585
953
  ],
954
- "bp": 0.9745361636262269,
955
- "sys_len": 2016,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.395723047050928,
958
- "score": 0.395723047050928,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.3541486960862157,
961
- "score_ci_high": 0.4412388057124849,
962
- "sacrebleu_ci_low": 0.3541486960862157,
963
- "sacrebleu_ci_high": 0.4412388057124849
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 1152,
969
  512,
970
- 255,
971
- 136
972
  ],
973
  "totals": [
974
- 2575,
975
- 2509,
976
- 2443,
977
- 2377
978
  ],
979
  "precisions": [
980
- 0.44737864077669903,
981
- 0.20406536468712633,
982
- 0.10437986082685223,
983
- 0.057214976861590244
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2575,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.152806822981997,
989
- "score": 0.152806822981997,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.13380815870985033,
992
- "score_ci_high": 0.17371873213476355,
993
- "sacrebleu_ci_low": 0.13380815870985033,
994
- "sacrebleu_ci_high": 0.17371873213476355
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1330,
1000
- 860,
1001
- 596,
1002
- 421
1003
  ],
1004
  "totals": [
1005
- 1885,
1006
- 1819,
1007
- 1753,
1008
- 1687
1009
  ],
1010
  "precisions": [
1011
- 0.7055702917771883,
1012
- 0.4727872457394172,
1013
- 0.33998859098687967,
1014
- 0.24955542382928275
1015
  ],
1016
- "bp": 0.9836888676493653,
1017
- "sys_len": 1885,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.4034754414236742,
1020
- "score": 0.4034754414236742,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.35937407097652574,
1023
- "score_ci_high": 0.4406572095477221,
1024
- "sacrebleu_ci_low": 0.35937407097652574,
1025
- "sacrebleu_ci_high": 0.4406572095477221
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 1195,
1031
  710,
1032
- 449,
1033
- 298
1034
  ],
1035
  "totals": [
1036
- 1898,
1037
- 1832,
1038
- 1766,
1039
- 1700
1040
  ],
1041
  "precisions": [
1042
- 0.6296101159114857,
1043
- 0.3875545851528384,
1044
- 0.25424688561721404,
1045
- 0.17529411764705885
1046
  ],
1047
- "bp": 0.9734874071636694,
1048
- "sys_len": 1898,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.3143672009255487,
1051
- "score": 0.3143672009255487,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.28824612092656865,
1054
- "score_ci_high": 0.3595680168847451,
1055
- "sacrebleu_ci_low": 0.28824612092656865,
1056
- "sacrebleu_ci_high": 0.3595680168847451
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
- 1185,
1062
- 632,
1063
- 363,
1064
- 210
1065
  ],
1066
  "totals": [
1067
- 1964,
1068
- 1898,
1069
- 1832,
1070
- 1766
1071
  ],
1072
  "precisions": [
1073
- 0.6033604887983707,
1074
- 0.332982086406744,
1075
- 0.19814410480349345,
1076
- 0.11891279728199321
1077
  ],
1078
- "bp": 0.9340473875491699,
1079
- "sys_len": 1964,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.24500265020019107,
1082
- "score": 0.24500265020019107,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.21966892731983978,
1085
- "score_ci_high": 0.27134777294522555,
1086
- "sacrebleu_ci_low": 0.21966892731983978,
1087
- "sacrebleu_ci_high": 0.27134777294522555
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1271,
1093
  807,
1094
- 551,
1095
- 380
1096
  ],
1097
  "totals": [
1098
- 1794,
1099
- 1728,
1100
- 1662,
1101
- 1596
1102
  ],
1103
  "precisions": [
1104
- 0.7084726867335562,
1105
- 0.46701388888888884,
1106
- 0.3315282791817088,
1107
- 0.2380952380952381
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1794,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.40200462477302346,
1113
- "score": 0.40200462477302346,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.36426910997213796,
1116
- "score_ci_high": 0.44904824239366326,
1117
- "sacrebleu_ci_low": 0.36426910997213796,
1118
- "sacrebleu_ci_high": 0.44904824239366326
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 989,
1124
- 453,
1125
- 243,
1126
- 137
1127
  ],
1128
  "totals": [
1129
- 1812,
1130
- 1746,
1131
- 1680,
1132
- 1614
1133
  ],
1134
  "precisions": [
1135
- 0.5458057395143487,
1136
- 0.25945017182130586,
1137
- 0.14464285714285713,
1138
- 0.0848822800495663
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1812,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.20419801799597426,
1144
- "score": 0.20419801799597426,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.18116496265849946,
1147
- "score_ci_high": 0.25247298942958346,
1148
- "sacrebleu_ci_low": 0.18116496265849946,
1149
- "sacrebleu_ci_high": 0.25247298942958346
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 956,
1155
- 417,
1156
- 215,
1157
- 111
1158
  ],
1159
  "totals": [
1160
- 1742,
1161
- 1676,
1162
- 1610,
1163
- 1544
1164
  ],
1165
  "precisions": [
1166
- 0.5487944890929966,
1167
- 0.24880668257756564,
1168
- 0.13354037267080746,
1169
- 0.07189119170984455
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 1742,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.19027862841650364,
1175
- "score": 0.19027862841650364,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.15954407468337273,
1178
- "score_ci_high": 0.2316279594911264,
1179
- "sacrebleu_ci_low": 0.15954407468337273,
1180
- "sacrebleu_ci_high": 0.2316279594911264
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1274,
1186
- 831,
1187
- 594,
1188
- 443
1189
  ],
1190
  "totals": [
1191
- 1787,
1192
- 1721,
1193
- 1655,
1194
- 1589
1195
  ],
1196
  "precisions": [
1197
- 0.7129266927811976,
1198
- 0.48285880302149914,
1199
- 0.3589123867069486,
1200
- 0.27879169288860917
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1787,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.43080757024308985,
1206
- "score": 0.43080757024308985,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.3788088705992759,
1209
- "score_ci_high": 0.48020746054863056,
1210
- "sacrebleu_ci_low": 0.3788088705992759,
1211
- "sacrebleu_ci_high": 0.48020746054863056
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
- 1296,
1217
- 820,
1218
- 558,
1219
- 381
1220
  ],
1221
  "totals": [
1222
- 1844,
1223
- 1778,
1224
- 1712,
1225
- 1646
1226
  ],
1227
  "precisions": [
1228
- 0.7028199566160521,
1229
- 0.4611923509561305,
1230
- 0.3259345794392523,
1231
- 0.23147023086269744
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1844,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.3954466865992584,
1237
- "score": 0.3954466865992584,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.3534147781069182,
1240
- "score_ci_high": 0.4360974663884546,
1241
- "sacrebleu_ci_low": 0.3534147781069182,
1242
- "sacrebleu_ci_high": 0.4360974663884546
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1125,
1248
- 590,
1249
- 351,
1250
- 212
1251
  ],
1252
  "totals": [
1253
- 1856,
1254
- 1790,
1255
- 1724,
1256
- 1658
1257
  ],
1258
  "precisions": [
1259
- 0.6061422413793103,
1260
- 0.3296089385474861,
1261
- 0.20359628770301627,
1262
- 0.1278648974668275
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1856,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.26854908700533703,
1268
- "score": 0.26854908700533703,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.2352580605615156,
1271
- "score_ci_high": 0.30717127715915404,
1272
- "sacrebleu_ci_low": 0.2352580605615156,
1273
- "sacrebleu_ci_high": 0.30717127715915404
1274
  },
1275
- "score": 0.29376069141538674,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.43846688626660735,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-23T09:36:30.499456Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.6333333333333333,
180
+ "accuracy_ci_low": 0.5222222222222223,
181
+ "accuracy_ci_high": 0.7333333333333333,
182
  "score_name": "accuracy",
183
+ "score": 0.6333333333333333,
184
+ "score_ci_high": 0.7333333333333333,
185
+ "score_ci_low": 0.5222222222222223,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.6888888888888889,
190
+ "accuracy_ci_low": 0.6,
191
+ "accuracy_ci_high": 0.7777777777777778,
192
  "score_name": "accuracy",
193
+ "score": 0.6888888888888889,
194
+ "score_ci_high": 0.7777777777777778,
195
+ "score_ci_low": 0.6,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8444444444444444,
200
+ "accuracy_ci_low": 0.7555555555555555,
201
+ "accuracy_ci_high": 0.9111111111111111,
202
  "score_name": "accuracy",
203
+ "score": 0.8444444444444444,
204
+ "score_ci_high": 0.9111111111111111,
205
+ "score_ci_low": 0.7555555555555555,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.6777777777777778,
210
+ "accuracy_ci_low": 0.5777777777777777,
211
+ "accuracy_ci_high": 0.7555555555555555,
212
  "score_name": "accuracy",
213
+ "score": 0.6777777777777778,
214
+ "score_ci_high": 0.7555555555555555,
215
+ "score_ci_low": 0.5777777777777777,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.7222222222222222,
220
+ "accuracy_ci_low": 0.6222222222222222,
221
+ "accuracy_ci_high": 0.8111111111111111,
222
  "score_name": "accuracy",
223
+ "score": 0.7222222222222222,
224
+ "score_ci_high": 0.8111111111111111,
225
+ "score_ci_low": 0.6222222222222222,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.8222222222222222,
230
+ "accuracy_ci_low": 0.7333333333333333,
231
+ "accuracy_ci_high": 0.8888888888888888,
232
  "score_name": "accuracy",
233
+ "score": 0.8222222222222222,
234
+ "score_ci_high": 0.8888888888888888,
235
+ "score_ci_low": 0.7333333333333333,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.7444444444444445,
240
+ "accuracy_ci_low": 0.6444444444444445,
241
+ "accuracy_ci_high": 0.8222222222222222,
242
  "score_name": "accuracy",
243
+ "score": 0.7444444444444445,
244
+ "score_ci_high": 0.8222222222222222,
245
+ "score_ci_low": 0.6444444444444445,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.7444444444444445,
250
+ "accuracy_ci_low": 0.6444444444444445,
251
+ "accuracy_ci_high": 0.8333333333333334,
252
  "score_name": "accuracy",
253
+ "score": 0.7444444444444445,
254
+ "score_ci_high": 0.8333333333333334,
255
+ "score_ci_low": 0.6444444444444445,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.7444444444444445,
260
+ "accuracy_ci_low": 0.6444444444444445,
261
+ "accuracy_ci_high": 0.8222222222222222,
262
  "score_name": "accuracy",
263
+ "score": 0.7444444444444445,
264
+ "score_ci_high": 0.8222222222222222,
265
+ "score_ci_low": 0.6444444444444445,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.7777777777777778,
270
+ "accuracy_ci_low": 0.6790372940698232,
271
+ "accuracy_ci_high": 0.8555555555555555,
272
  "score_name": "accuracy",
273
+ "score": 0.7777777777777778,
274
+ "score_ci_high": 0.8555555555555555,
275
+ "score_ci_low": 0.6790372940698232,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.7888888888888889,
280
+ "accuracy_ci_low": 0.6888888888888889,
281
+ "accuracy_ci_high": 0.8555555555555555,
282
  "score_name": "accuracy",
283
+ "score": 0.7888888888888889,
284
+ "score_ci_high": 0.8555555555555555,
285
+ "score_ci_low": 0.6888888888888889,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.7444444444444445,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.052083333333333336,
296
+ "score": 0.052083333333333336,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.052083333333333336,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.4769647696476965,
307
+ "f1_Organization": 0.2893890675241158,
308
+ "f1_Location": 0.30894308943089427,
309
+ "f1_macro": 0.3584323088675689,
310
+ "recall_macro": 0.31476418018843305,
311
+ "precision_macro": 0.4193267050409908,
312
+ "in_classes_support": 0.7786407766990291,
313
+ "f1_micro": 0.32884615384615384,
314
+ "recall_micro": 0.32571428571428573,
315
+ "precision_micro": 0.3320388349514563,
316
+ "score": 0.32884615384615384,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.2827095328153393,
319
+ "score_ci_high": 0.3761779704134513,
320
+ "f1_micro_ci_low": 0.2827095328153393,
321
+ "f1_micro_ci_high": 0.3761779704134513
322
  },
323
+ "score": 0.32884615384615384,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.49295774647887325,
330
+ "accuracy_ci_low": 0.38028169014084506,
331
+ "accuracy_ci_high": 0.6056338028169014,
332
  "score_name": "accuracy",
333
+ "score": 0.49295774647887325,
334
+ "score_ci_high": 0.6056338028169014,
335
+ "score_ci_low": 0.38028169014084506,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.323943661971831,
340
+ "accuracy_ci_low": 0.22535211267605634,
341
+ "accuracy_ci_high": 0.43661971830985913,
342
  "score_name": "accuracy",
343
+ "score": 0.323943661971831,
344
+ "score_ci_high": 0.43661971830985913,
345
+ "score_ci_low": 0.22535211267605634,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.22535211267605634,
350
+ "accuracy_ci_low": 0.14084507042253522,
351
+ "accuracy_ci_high": 0.3380281690140845,
352
  "score_name": "accuracy",
353
+ "score": 0.22535211267605634,
354
+ "score_ci_high": 0.3380281690140845,
355
+ "score_ci_low": 0.14084507042253522,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
  "accuracy": 0.29577464788732394,
360
  "accuracy_ci_low": 0.19718309859154928,
361
+ "accuracy_ci_high": 0.4084507042253521,
362
  "score_name": "accuracy",
363
  "score": 0.29577464788732394,
364
+ "score_ci_high": 0.4084507042253521,
365
  "score_ci_low": 0.19718309859154928,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.5070422535211268,
370
+ "accuracy_ci_low": 0.38028169014084506,
371
+ "accuracy_ci_high": 0.6197183098591549,
372
  "score_name": "accuracy",
373
+ "score": 0.5070422535211268,
374
+ "score_ci_high": 0.6197183098591549,
375
+ "score_ci_low": 0.38028169014084506,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.18309859154929578,
380
+ "accuracy_ci_low": 0.09859154929577464,
381
+ "accuracy_ci_high": 0.2676056338028169,
382
  "score_name": "accuracy",
383
+ "score": 0.18309859154929578,
384
+ "score_ci_high": 0.2676056338028169,
385
+ "score_ci_low": 0.09859154929577464,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.323943661971831,
390
+ "accuracy_ci_low": 0.2112676056338028,
391
+ "accuracy_ci_high": 0.43661971830985913,
392
  "score_name": "accuracy",
393
+ "score": 0.323943661971831,
394
+ "score_ci_high": 0.43661971830985913,
395
+ "score_ci_low": 0.2112676056338028,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.36619718309859156,
400
+ "accuracy_ci_low": 0.2535211267605634,
401
+ "accuracy_ci_high": 0.4788732394366197,
402
  "score_name": "accuracy",
403
+ "score": 0.36619718309859156,
404
+ "score_ci_high": 0.4788732394366197,
405
+ "score_ci_low": 0.2535211267605634,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.28169014084507044,
410
+ "accuracy_ci_low": 0.18309859154929578,
411
+ "accuracy_ci_high": 0.39436619718309857,
412
  "score_name": "accuracy",
413
+ "score": 0.28169014084507044,
414
+ "score_ci_high": 0.39436619718309857,
415
+ "score_ci_low": 0.18309859154929578,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.15492957746478872,
420
+ "accuracy_ci_low": 0.08450704225352113,
421
+ "accuracy_ci_high": 0.26564872868691924,
422
  "score_name": "accuracy",
423
+ "score": 0.15492957746478872,
424
+ "score_ci_high": 0.26564872868691924,
425
+ "score_ci_low": 0.08450704225352113,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.28169014084507044,
430
+ "accuracy_ci_low": 0.18309859154929578,
431
+ "accuracy_ci_high": 0.39436619718309857,
432
  "score_name": "accuracy",
433
+ "score": 0.28169014084507044,
434
+ "score_ci_high": 0.39436619718309857,
435
+ "score_ci_low": 0.18309859154929578,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.29577464788732394,
440
+ "accuracy_ci_low": 0.19718309859154928,
441
+ "accuracy_ci_high": 0.4084507042253521,
442
  "score_name": "accuracy",
443
+ "score": 0.29577464788732394,
444
+ "score_ci_high": 0.4084507042253521,
445
+ "score_ci_low": 0.19718309859154928,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.16901408450704225,
450
+ "accuracy_ci_low": 0.09859154929577464,
451
+ "accuracy_ci_high": 0.2676056338028169,
452
  "score_name": "accuracy",
453
+ "score": 0.16901408450704225,
454
+ "score_ci_high": 0.2676056338028169,
455
+ "score_ci_low": 0.09859154929577464,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.4788732394366197,
460
+ "accuracy_ci_low": 0.36619718309859156,
461
+ "accuracy_ci_high": 0.6012345324644585,
462
  "score_name": "accuracy",
463
+ "score": 0.4788732394366197,
464
+ "score_ci_high": 0.6012345324644585,
465
+ "score_ci_low": 0.36619718309859156,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.31287726358148893,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.37558008658008657,
475
+ "f1_suggestive": 0.24242424242424243,
476
+ "f1_generic": 0.38095238095238093,
477
+ "f1_descriptive": 0.4583333333333333,
478
+ "f1_fanciful": 0.32,
479
+ "f1_arbitrary": 0.47619047619047616,
480
+ "f1_macro_ci_low": 0.2831225773147394,
481
+ "f1_macro_ci_high": 0.5002662505279254,
482
  "score_name": "f1_micro",
483
+ "score": 0.3905325443786982,
484
+ "score_ci_high": 0.4970414201183432,
485
+ "score_ci_low": 0.2850635959228859,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.38823529411764707,
488
+ "accuracy_ci_low": 0.2823529411764706,
489
+ "accuracy_ci_high": 0.49411764705882355,
490
+ "f1_micro": 0.3905325443786982,
491
+ "f1_micro_ci_low": 0.2850635959228859,
492
+ "f1_micro_ci_high": 0.4970414201183432
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.4401501318725908,
496
+ "f1_no": 0.40860215053763443,
497
+ "f1_yes": 0.4716981132075472,
498
+ "f1_macro_ci_low": 0.37497026604570943,
499
+ "f1_macro_ci_high": 0.5074058340630839,
500
  "score_name": "f1_micro",
501
+ "score": 0.44221105527638194,
502
+ "score_ci_high": 0.507537688442211,
503
+ "score_ci_low": 0.3763531585733561,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.44,
506
+ "accuracy_ci_low": 0.375,
507
+ "accuracy_ci_high": 0.505,
508
+ "f1_micro": 0.44221105527638194,
509
+ "f1_micro_ci_low": 0.3763531585733561,
510
+ "f1_micro_ci_high": 0.507537688442211
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.1916316363119106,
514
+ "f1_conclusion": 0.15584415584415584,
515
+ "f1_analysis": 0.3333333333333333,
516
  "f1_decree": 0.07692307692307693,
517
+ "f1_issue": 0.23076923076923078,
518
+ "f1_facts": 0.12903225806451613,
519
+ "f1_procedural history": 0.11764705882352941,
520
+ "f1_rule": 0.2978723404255319,
521
+ "f1_macro_ci_low": 0.143695452040772,
522
+ "f1_macro_ci_high": 0.25550408682657144,
523
  "score_name": "f1_micro",
524
+ "score": 0.22278481012658227,
525
+ "score_ci_high": 0.2864321608040201,
526
+ "score_ci_low": 0.16660296570964608,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.22,
529
+ "accuracy_ci_low": 0.165,
530
+ "accuracy_ci_high": 0.28021087258250593,
531
+ "f1_micro": 0.22278481012658227,
532
+ "f1_micro_ci_low": 0.16660296570964608,
533
+ "f1_micro_ci_high": 0.2864321608040201
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5259978425026969,
537
+ "f1_yes": 0.5631067961165048,
538
+ "f1_no": 0.4888888888888889,
539
+ "f1_macro_ci_low": 0.4618026481617566,
540
+ "f1_macro_ci_high": 0.5997495353215635,
541
  "score_name": "f1_micro",
542
+ "score": 0.5284974093264249,
543
+ "score_ci_high": 0.5989912778302698,
544
+ "score_ci_low": 0.46113989637305697,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.51,
547
+ "accuracy_ci_low": 0.445,
548
+ "accuracy_ci_high": 0.58,
549
+ "f1_micro": 0.5284974093264249,
550
+ "f1_micro_ci_low": 0.46113989637305697,
551
+ "f1_micro_ci_high": 0.5989912778302698
552
  },
553
  "legalbench_proa": {
554
  "f1_macro": 0.726027397260274,
555
  "f1_yes": 0.7123287671232876,
556
  "f1_no": 0.7397260273972602,
557
+ "f1_macro_ci_low": 0.618628457335439,
558
+ "f1_macro_ci_high": 0.8122702152748204,
559
  "score_name": "f1_micro",
560
  "score": 0.726027397260274,
561
+ "score_ci_high": 0.8104575163398693,
562
+ "score_ci_low": 0.6186406698987806,
563
  "num_of_instances": 85,
564
  "accuracy": 0.6235294117647059,
565
  "accuracy_ci_low": 0.5058823529411764,
566
  "accuracy_ci_high": 0.7176470588235294,
567
  "f1_micro": 0.726027397260274,
568
+ "f1_micro_ci_low": 0.6186406698987806,
569
+ "f1_micro_ci_high": 0.8104575163398693
570
  },
571
+ "score": 0.46201064327367225,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.3891021150890982,
578
+ "f1_cars": 0.7346938775510204,
579
+ "f1_windows x": 0.0,
580
+ "f1_atheism": 0.425531914893617,
581
+ "f1_christianity": 0.4444444444444444,
582
+ "f1_religion": 0.15873015873015872,
583
  "f1_medicine": 0.6376811594202898,
584
+ "f1_computer graphics": 0.2755102040816326,
585
+ "f1_microsoft windows": 0.29850746268656714,
586
+ "f1_middle east": 0.19607843137254902,
587
+ "f1_politics": 0.3387096774193548,
588
+ "f1_motorcycles": 0.43902439024390244,
589
+ "f1_mac hardware": 0.2,
590
+ "f1_pc hardware": 0.34545454545454546,
591
+ "f1_for sale": 0.33962264150943394,
592
+ "f1_guns": 0.26666666666666666,
593
+ "f1_baseball": 0.7368421052631579,
594
  "f1_space": 0.5194805194805194,
595
+ "f1_cryptography": 0.4358974358974359,
596
+ "f1_hockey": 0.5625,
597
+ "f1_electronics": 0.4266666666666667,
598
+ "f1_macro_ci_low": 0.35853792537669554,
599
+ "f1_macro_ci_high": 0.4194279498018566,
600
  "score_name": "f1_micro",
601
+ "score": 0.4063792085056113,
602
+ "score_ci_high": 0.4367348562601148,
603
+ "score_ci_low": 0.3741790131164338,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.344,
606
+ "accuracy_ci_low": 0.314,
607
+ "accuracy_ci_high": 0.371,
608
+ "f1_micro": 0.4063792085056113,
609
+ "f1_micro_ci_low": 0.3741790131164338,
610
+ "f1_micro_ci_high": 0.4367348562601148
611
  },
612
+ "score": 0.4063792085056113,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.5522255415970777,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9099656357388316,
620
+ "f1_checking or savings account": 0.5301204819277109,
621
+ "f1_debt collection": 0.3576158940397351,
622
+ "f1_credit card or prepaid card": 0.37777777777777777,
623
+ "f1_mortgage": 0.7017543859649122,
 
  "f1_student loan": 0.75,
625
+ "f1_money transfer or virtual currency or money service": 0.6666666666666666,
626
+ "f1_vehicle loan or lease": 0.5161290322580645,
627
+ "f1_payday loan or title loan or personal loan": 0.16,
628
+ "f1_macro_ci_low": 0.5008792423568225,
629
+ "f1_macro_ci_high": 0.6059191922507057,
630
  "score_name": "f1_micro",
631
+ "score": 0.7975522692503825,
632
+ "score_ci_high": 0.8230092874186598,
633
+ "score_ci_low": 0.7734015345268542,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.782,
636
+ "accuracy_ci_low": 0.758,
637
+ "accuracy_ci_high": 0.808771349424543,
638
+ "f1_micro": 0.7975522692503825,
639
+ "f1_micro_ci_low": 0.7734015345268542,
640
+ "f1_micro_ci_high": 0.8230092874186598
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.5976480822045338,
644
+ "f1_mortgages and loans": 0.7428571428571429,
645
+ "f1_credit card": 0.6767676767676768,
646
+ "f1_debt collection": 0.5729166666666666,
647
+ "f1_retail banking": 0.26666666666666666,
648
+ "f1_credit reporting": 0.7290322580645161,
649
+ "f1_macro_ci_low": 0.5536640329621239,
650
+ "f1_macro_ci_high": 0.6426754598088634,
651
  "score_name": "f1_micro",
652
+ "score": 0.6408163265306123,
653
+ "score_ci_high": 0.683589397051309,
654
+ "score_ci_low": 0.5968250791908158,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.628,
657
+ "accuracy_ci_low": 0.584,
658
+ "accuracy_ci_high": 0.67,
659
+ "f1_micro": 0.6408163265306123,
660
+ "f1_micro_ci_low": 0.5968250791908158,
661
+ "f1_micro_ci_high": 0.683589397051309
662
  },
663
+ "score": 0.7191842978904974,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "program_accuracy": 0.046,
671
+ "score": 0.046,
672
  "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.04,
674
+ "program_accuracy_ci_low": 0.035,
675
+ "program_accuracy_ci_high": 0.06,
676
+ "score_ci_low": 0.035,
677
+ "score_ci_high": 0.06,
678
+ "execution_accuracy_ci_low": 0.029,
679
+ "execution_accuracy_ci_high": 0.053
680
  },
681
+ "score": 0.046,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.3263731921396224,
688
+ "recall": 0.5005136281930357,
689
+ "f1": 0.3238306562243135,
690
+ "precision_ci_low": 0.3062305395351377,
691
+ "precision_ci_high": 0.3471267358926223,
692
+ "recall_ci_low": 0.48406708194912995,
693
+ "recall_ci_high": 0.5170051754306556,
694
+ "f1_ci_low": 0.30817001709054165,
695
+ "f1_ci_high": 0.33859378817920405,
696
  "score_name": "f1",
697
+ "score": 0.3238306562243135,
698
+ "score_ci_high": 0.33859378817920405,
699
+ "score_ci_low": 0.30817001709054165,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5865038787325223,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6549727993210157,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5550703954448303,
704
+ "faithfullness_f1_token_overlap": 0.26017056639729325,
705
+ "faithfullness_recall_token_overlap": 0.19094108989263792,
706
+ "faithfullness_precision_token_overlap": 0.5662514000838377,
707
+ "correctness_f1_token_overlap": 0.3238306562243135,
708
+ "correctness_recall_token_overlap": 0.5005136281930357,
709
+ "correctness_precision_token_overlap": 0.3263731921396224
710
  },
711
+ "score": 0.3238306562243135,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.425,
718
+ "accuracy_ci_low": 0.394,
719
+ "accuracy_ci_high": 0.4557921663139014,
720
  "score_name": "accuracy",
721
+ "score": 0.425,
722
+ "score_ci_high": 0.4557921663139014,
723
+ "score_ci_low": 0.394,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.736,
728
+ "accuracy_ci_low": 0.7,
729
+ "accuracy_ci_high": 0.776,
730
  "score_name": "accuracy",
731
+ "score": 0.736,
732
+ "score_ci_high": 0.776,
733
+ "score_ci_low": 0.7,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.5805,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
 
743
  "severity_high": 0.0,
744
  "severity_medium": 0.0,
745
  "severity_low": 100.0,
746
+ "category_harmful_info": 0.8295307880589028,
747
+ "category_deception": 0.8032731126354199,
748
+ "category_explicit_content": 0.8484950255263936,
749
+ "category_violence": 0.8168619035846657,
750
+ "category_discrimination": 0.8478557457999577,
751
+ "category_substance_abuse": 0.8705138153500027,
752
+ "category_pii": 0.829884376514841,
753
+ "safety": 0.8276369063778677,
754
+ "safety_ci_low": 0.8109914147202264,
755
+ "safety_ci_high": 0.8430729937190028,
756
  "score_name": "safety",
757
+ "score": 0.8276369063778677,
758
+ "score_ci_high": 0.8430729937190028,
759
+ "score_ci_low": 0.8109914147202264,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.8276369063778677,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rougeL": 0.284383588545129,
770
+ "score": 0.284383588545129,
771
  "score_name": "rougeL",
772
+ "rouge2": 0.20245431329782115,
773
+ "rouge1": 0.41045505876440336,
774
+ "rougeLsum": 0.3502025548591709,
775
+ "rougeL_ci_low": 0.2769343751140693,
776
+ "rougeL_ci_high": 0.29127408884195716,
777
+ "score_ci_low": 0.2769343751140693,
778
+ "score_ci_high": 0.29127408884195716,
779
+ "rouge2_ci_low": 0.1950015427241588,
780
+ "rouge2_ci_high": 0.210011723499623,
781
+ "rouge1_ci_low": 0.4008962815661577,
782
+ "rouge1_ci_high": 0.41982792488499465,
783
+ "rougeLsum_ci_low": 0.3416385040140321,
784
+ "rougeLsum_ci_high": 0.3588894507334038
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rougeL": 0.0903457635776036,
789
+ "score": 0.0903457635776036,
790
  "score_name": "rougeL",
791
+ "rouge2": 0.018003187802161934,
792
+ "rouge1": 0.12438028754478446,
793
+ "rougeLsum": 0.10277785443605283,
794
+ "rougeL_ci_low": 0.08651302258172923,
795
+ "rougeL_ci_high": 0.09388371145028165,
796
+ "score_ci_low": 0.08651302258172923,
797
+ "score_ci_high": 0.09388371145028165,
798
+ "rouge2_ci_low": 0.016237543973207882,
799
+ "rouge2_ci_high": 0.01999767687426406,
800
+ "rouge1_ci_low": 0.11882142858616347,
801
+ "rouge1_ci_high": 0.12948246473507513,
802
+ "rougeLsum_ci_low": 0.0979381342979595,
803
+ "rougeLsum_ci_high": 0.10690023059691123
804
  },
805
+ "score": 0.1873646760613663,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1135,
814
+ 646,
815
+ 410,
816
+ 275
817
  ],
818
  "totals": [
819
+ 1820,
820
+ 1754,
821
+ 1688,
822
+ 1622
823
  ],
824
  "precisions": [
825
+ 0.6236263736263736,
826
+ 0.36830102622576966,
827
+ 0.24289099526066352,
828
+ 0.16954377311960542
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 1820,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.31185676193781753,
834
+ "score": 0.31185676193781753,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.26078843081913794,
837
+ "score_ci_high": 0.35262811190937277,
838
+ "sacrebleu_ci_low": 0.26078843081913794,
839
+ "sacrebleu_ci_high": 0.35262811190937277
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1238,
845
+ 750,
846
+ 499,
847
+ 339
848
  ],
849
  "totals": [
850
+ 1796,
851
+ 1730,
852
+ 1664,
853
+ 1598
854
  ],
855
  "precisions": [
856
+ 0.6893095768374166,
857
+ 0.4335260115606936,
858
+ 0.2998798076923077,
859
+ 0.21214017521902379
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 1796,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.3713213364431593,
865
+ "score": 0.3713213364431593,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.3292670063116335,
868
+ "score_ci_high": 0.4181859347073083,
869
+ "sacrebleu_ci_low": 0.3292670063116335,
870
+ "sacrebleu_ci_high": 0.4181859347073083
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
  613,
876
+ 190,
877
+ 79,
878
+ 26
879
  ],
880
  "totals": [
881
+ 1656,
882
+ 1590,
883
+ 1524,
884
+ 1458
885
  ],
886
  "precisions": [
887
+ 0.3701690821256039,
888
+ 0.11949685534591195,
889
+ 0.05183727034120735,
890
+ 0.01783264746227709
891
  ],
892
  "bp": 1.0,
893
+ "sys_len": 1656,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.07996568130005909,
896
+ "score": 0.07996568130005909,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.06042930465467444,
899
+ "score_ci_high": 0.09652810994564934,
900
+ "sacrebleu_ci_low": 0.06042930465467444,
901
+ "sacrebleu_ci_high": 0.09652810994564934
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 1029,
907
+ 509,
908
+ 282,
909
+ 168
910
  ],
911
  "totals": [
912
+ 1810,
913
+ 1744,
914
+ 1678,
915
+ 1612
916
  ],
917
  "precisions": [
918
+ 0.5685082872928177,
919
+ 0.2918577981651376,
920
+ 0.16805721096543505,
921
+ 0.10421836228287841
922
  ],
923
+ "bp": 0.9862827954544454,
924
+ "sys_len": 1810,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.22899649328289487,
927
+ "score": 0.22899649328289487,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.19262787967326042,
930
+ "score_ci_high": 0.280077803172394,
931
+ "sacrebleu_ci_low": 0.19262787967326042,
932
+ "sacrebleu_ci_high": 0.280077803172394
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1359,
938
+ 902,
939
+ 653,
940
+ 485
941
  ],
942
  "totals": [
943
+ 1997,
944
+ 1931,
945
+ 1865,
946
+ 1799
947
  ],
948
  "precisions": [
949
+ 0.6805207811717576,
950
+ 0.4671154842050751,
951
+ 0.3501340482573727,
952
+ 0.2695942190105614
953
  ],
954
+ "bp": 0.9650712656118398,
955
+ "sys_len": 1997,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.40166318618755137,
958
+ "score": 0.40166318618755137,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.3617396828186495,
961
+ "score_ci_high": 0.4488753587822201,
962
+ "sacrebleu_ci_low": 0.3617396828186495,
963
+ "sacrebleu_ci_high": 0.4488753587822201
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 1121,
969
  512,
970
+ 259,
971
+ 142
972
  ],
973
  "totals": [
974
+ 2523,
975
+ 2457,
976
+ 2391,
977
+ 2325
978
  ],
979
  "precisions": [
980
+ 0.44431232659532305,
981
+ 0.20838420838420837,
982
+ 0.1083228774571309,
983
+ 0.0610752688172043
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 2523,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.1573202708440978,
989
+ "score": 0.1573202708440978,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.1330679575324248,
992
+ "score_ci_high": 0.17785519831459645,
993
+ "sacrebleu_ci_low": 0.1330679575324248,
994
+ "sacrebleu_ci_high": 0.17785519831459645
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1319,
1000
+ 872,
1001
+ 612,
1002
+ 444
1003
  ],
1004
  "totals": [
1005
+ 1868,
1006
+ 1802,
1007
+ 1736,
1008
+ 1670
1009
  ],
1010
  "precisions": [
1011
+ 0.7061027837259101,
1012
+ 0.4839067702552719,
1013
+ 0.35253456221198154,
1014
+ 0.2658682634730539
1015
  ],
1016
+ "bp": 0.974631399286791,
1017
+ "sys_len": 1868,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.41230144255258333,
1020
+ "score": 0.41230144255258333,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.3754985997111881,
1023
+ "score_ci_high": 0.4595286683207052,
1024
+ "sacrebleu_ci_low": 0.3754985997111881,
1025
+ "sacrebleu_ci_high": 0.4595286683207052
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 1192,
1031
  710,
1032
+ 457,
1033
+ 302
1034
  ],
1035
  "totals": [
1036
+ 1928,
1037
+ 1862,
1038
+ 1796,
1039
+ 1730
1040
  ],
1041
  "precisions": [
1042
+ 0.6182572614107884,
1043
+ 0.38131041890440387,
1044
+ 0.2544543429844098,
1045
+ 0.1745664739884393
1046
  ],
1047
+ "bp": 0.9891669881299116,
1048
+ "sys_len": 1928,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.31642753307552074,
1051
+ "score": 0.31642753307552074,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.2854187560212565,
1054
+ "score_ci_high": 0.3701086569937762,
1055
+ "sacrebleu_ci_low": 0.2854187560212565,
1056
+ "sacrebleu_ci_high": 0.3701086569937762
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1208,
1062
+ 659,
1063
+ 389,
1064
+ 236
1065
  ],
1066
  "totals": [
1067
+ 1983,
1068
+ 1917,
1069
+ 1851,
1070
+ 1785
1071
  ],
1072
  "precisions": [
1073
+ 0.6091780131114473,
1074
+ 0.34376630151278037,
1075
+ 0.2101566720691518,
1076
+ 0.13221288515406163
1077
  ],
1078
+ "bp": 0.9436566096384625,
1079
+ "sys_len": 1983,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.26062742180685816,
1082
+ "score": 0.26062742180685816,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.22914767064053682,
1085
+ "score_ci_high": 0.2844965463617,
1086
+ "sacrebleu_ci_low": 0.22914767064053682,
1087
+ "sacrebleu_ci_high": 0.2844965463617
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1276,
1093
  807,
1094
+ 545,
1095
+ 375
1096
  ],
1097
  "totals": [
1098
+ 1818,
1099
+ 1752,
1100
+ 1686,
1101
+ 1620
1102
  ],
1103
  "precisions": [
1104
+ 0.7018701870187019,
1105
+ 0.4606164383561644,
1106
+ 0.3232502965599051,
1107
+ 0.23148148148148148
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 1818,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.39437815723424946,
1113
+ "score": 0.39437815723424946,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.3589675802199702,
1116
+ "score_ci_high": 0.4463980849713465,
1117
+ "sacrebleu_ci_low": 0.3589675802199702,
1118
+ "sacrebleu_ci_high": 0.4463980849713465
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 1027,
1124
+ 468,
1125
+ 248,
1126
+ 130
1127
  ],
1128
  "totals": [
1129
+ 1824,
1130
+ 1758,
1131
+ 1692,
1132
+ 1626
1133
  ],
1134
  "precisions": [
1135
+ 0.5630482456140351,
1136
+ 0.26621160409556316,
1137
+ 0.14657210401891255,
1138
+ 0.07995079950799508
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 1824,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.20472066389963584,
1144
+ "score": 0.20472066389963584,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.1789860929189351,
1147
+ "score_ci_high": 0.24365182830296692,
1148
+ "sacrebleu_ci_low": 0.1789860929189351,
1149
+ "sacrebleu_ci_high": 0.24365182830296692
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 963,
1155
+ 428,
1156
+ 229,
1157
+ 133
1158
  ],
1159
  "totals": [
1160
+ 1783,
1161
+ 1717,
1162
+ 1651,
1163
+ 1585
1164
  ],
1165
  "precisions": [
1166
+ 0.5401009534492429,
1167
+ 0.24927198602213163,
1168
+ 0.1387038158691702,
1169
+ 0.08391167192429022
1170
  ],
1171
  "bp": 1.0,
1172
+ "sys_len": 1783,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.19895955473357632,
1175
+ "score": 0.19895955473357632,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.1719456080924031,
1178
+ "score_ci_high": 0.2466087231408179,
1179
+ "sacrebleu_ci_low": 0.1719456080924031,
1180
+ "sacrebleu_ci_high": 0.2466087231408179
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1283,
1186
+ 836,
1187
+ 589,
1188
+ 428
1189
  ],
1190
  "totals": [
1191
+ 1803,
1192
+ 1737,
1193
+ 1671,
1194
+ 1605
1195
  ],
1196
  "precisions": [
1197
+ 0.7115917914586799,
1198
+ 0.48128957973517555,
1199
+ 0.35248354278874927,
1200
+ 0.26666666666666666
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 1803,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.4235807758108321,
1206
+ "score": 0.4235807758108321,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.37601558928179885,
1209
+ "score_ci_high": 0.47290261446153176,
1210
+ "sacrebleu_ci_low": 0.37601558928179885,
1211
+ "sacrebleu_ci_high": 0.47290261446153176
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1297,
1217
+ 833,
1218
+ 566,
1219
+ 384
1220
  ],
1221
  "totals": [
1222
+ 1841,
1223
+ 1775,
1224
+ 1709,
1225
+ 1643
1226
  ],
1227
  "precisions": [
1228
+ 0.7045084193373167,
1229
+ 0.46929577464788735,
1230
+ 0.3311878291398479,
1231
+ 0.23371880706025563
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 1841,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.3999679713298994,
1237
+ "score": 0.3999679713298994,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.3551473917196145,
1240
+ "score_ci_high": 0.4324084862016873,
1241
+ "sacrebleu_ci_low": 0.3551473917196145,
1242
+ "sacrebleu_ci_high": 0.4324084862016873
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1126,
1248
+ 594,
1249
+ 349,
1250
+ 214
1251
  ],
1252
  "totals": [
1253
+ 1834,
1254
+ 1768,
1255
+ 1702,
1256
+ 1636
1257
  ],
1258
  "precisions": [
1259
+ 0.6139585605234461,
1260
+ 0.335972850678733,
1261
+ 0.20505287896592242,
1262
+ 0.13080684596577016
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 1834,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.2727312463583288,
1268
+ "score": 0.2727312463583288,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.23703993855927166,
1271
+ "score_ci_high": 0.3146036672452131,
1272
+ "sacrebleu_ci_low": 0.23703993855927166,
1273
+ "sacrebleu_ci_high": 0.3146036672452131
1274
  },
1275
+ "score": 0.2956545664531376,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.4066778576916836,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/{2025-06-19T20-10-50_evaluation_results.json → 2025-06-23T06-18-33_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-20T00:10:45.998753Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -256,13 +256,13 @@
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.8888888888888888,
260
- "accuracy_ci_low": 0.8111111111111111,
261
- "accuracy_ci_high": 0.9444444444444444,
262
  "score_name": "accuracy",
263
- "score": 0.8888888888888888,
264
- "score_ci_high": 0.9444444444444444,
265
- "score_ci_low": 0.8111111111111111,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
@@ -285,74 +285,74 @@
285
  "score_ci_low": 0.7888888888888889,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.9626262626262626,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.5662337662337662,
307
- "f1_Organization": 0.34810126582278483,
308
- "f1_Location": 0.4031620553359684,
309
- "f1_macro": 0.4391656957975065,
310
- "recall_macro": 0.39723251248500296,
311
- "precision_macro": 0.49375910707000065,
312
- "in_classes_support": 0.5212636695018227,
313
- "f1_micro": 0.31899109792284863,
314
- "recall_micro": 0.4095238095238095,
315
- "precision_micro": 0.26123936816524906,
316
- "score": 0.31899109792284863,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.277439612598807,
319
- "score_ci_high": 0.3678125973620034,
320
- "f1_micro_ci_low": 0.277439612598807,
321
- "f1_micro_ci_high": 0.3678125973620034
322
  },
323
- "score": 0.31899109792284863,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.5915492957746479,
330
- "accuracy_ci_low": 0.4788732394366197,
331
- "accuracy_ci_high": 0.704225352112676,
332
  "score_name": "accuracy",
333
- "score": 0.5915492957746479,
334
- "score_ci_high": 0.704225352112676,
335
- "score_ci_low": 0.4788732394366197,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.2676056338028169,
340
- "accuracy_ci_low": 0.16901408450704225,
341
- "accuracy_ci_high": 0.38028169014084506,
342
  "score_name": "accuracy",
343
- "score": 0.2676056338028169,
344
- "score_ci_high": 0.38028169014084506,
345
- "score_ci_low": 0.16901408450704225,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.23943661971830985,
350
- "accuracy_ci_low": 0.15492957746478872,
351
- "accuracy_ci_high": 0.352112676056338,
352
  "score_name": "accuracy",
353
- "score": 0.23943661971830985,
354
- "score_ci_high": 0.352112676056338,
355
- "score_ci_low": 0.15492957746478872,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
@@ -366,189 +366,189 @@
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.6901408450704225,
370
- "accuracy_ci_low": 0.5633802816901409,
371
- "accuracy_ci_high": 0.7887323943661971,
372
  "score_name": "accuracy",
373
- "score": 0.6901408450704225,
374
- "score_ci_high": 0.7887323943661971,
375
- "score_ci_low": 0.5633802816901409,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.39436619718309857,
380
- "accuracy_ci_low": 0.29577464788732394,
381
- "accuracy_ci_high": 0.5070422535211268,
382
  "score_name": "accuracy",
383
- "score": 0.39436619718309857,
384
- "score_ci_high": 0.5070422535211268,
385
- "score_ci_low": 0.29577464788732394,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.5211267605633803,
390
- "accuracy_ci_low": 0.39436619718309857,
391
- "accuracy_ci_high": 0.6197183098591549,
392
  "score_name": "accuracy",
393
- "score": 0.5211267605633803,
394
- "score_ci_high": 0.6197183098591549,
395
- "score_ci_low": 0.39436619718309857,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.7746478873239436,
400
- "accuracy_ci_low": 0.6619718309859155,
401
- "accuracy_ci_high": 0.8591549295774648,
402
  "score_name": "accuracy",
403
- "score": 0.7746478873239436,
404
- "score_ci_high": 0.8591549295774648,
405
- "score_ci_low": 0.6619718309859155,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.5774647887323944,
410
- "accuracy_ci_low": 0.4631453652997223,
411
- "accuracy_ci_high": 0.6901408450704225,
412
  "score_name": "accuracy",
413
- "score": 0.5774647887323944,
414
- "score_ci_high": 0.6901408450704225,
415
- "score_ci_low": 0.4631453652997223,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.2112676056338028,
420
- "accuracy_ci_low": 0.1267605633802817,
421
- "accuracy_ci_high": 0.323943661971831,
422
  "score_name": "accuracy",
423
- "score": 0.2112676056338028,
424
- "score_ci_high": 0.323943661971831,
425
- "score_ci_low": 0.1267605633802817,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.5633802816901409,
430
- "accuracy_ci_low": 0.4507042253521127,
431
- "accuracy_ci_high": 0.668060546470624,
432
  "score_name": "accuracy",
433
- "score": 0.5633802816901409,
434
- "score_ci_high": 0.668060546470624,
435
- "score_ci_low": 0.4507042253521127,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
  "accuracy": 0.6901408450704225,
440
- "accuracy_ci_low": 0.5774647887323944,
441
  "accuracy_ci_high": 0.7887323943661971,
442
  "score_name": "accuracy",
443
  "score": 0.6901408450704225,
444
  "score_ci_high": 0.7887323943661971,
445
- "score_ci_low": 0.5774647887323944,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.4084507042253521,
450
- "accuracy_ci_low": 0.29577464788732394,
451
- "accuracy_ci_high": 0.5211267605633803,
452
  "score_name": "accuracy",
453
- "score": 0.4084507042253521,
454
- "score_ci_high": 0.5211267605633803,
455
- "score_ci_low": 0.29577464788732394,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.647887323943662,
460
- "accuracy_ci_low": 0.5352112676056338,
461
  "accuracy_ci_high": 0.7605633802816901,
462
  "score_name": "accuracy",
463
- "score": 0.647887323943662,
464
  "score_ci_high": 0.7605633802816901,
465
- "score_ci_low": 0.5352112676056338,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.506036217303823,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.5831370899915895,
475
- "f1_suggestive": 0.4827586206896552,
476
  "f1_generic": 0.6666666666666666,
477
- "f1_descriptive": 0.6666666666666666,
478
  "f1_fanciful": 0.4166666666666667,
479
- "f1_arbitrary": 0.6829268292682927,
480
- "f1_macro_ci_low": 0.47939113487694995,
481
- "f1_macro_ci_high": 0.6897999117090845,
482
  "score_name": "f1_micro",
483
- "score": 0.5987261146496815,
484
- "score_ci_high": 0.6962025316455697,
485
- "score_ci_low": 0.4807376602538022,
486
  "num_of_instances": 85,
487
- "accuracy": 0.5529411764705883,
488
- "accuracy_ci_low": 0.43529411764705883,
489
  "accuracy_ci_high": 0.6588235294117647,
490
- "f1_micro": 0.5987261146496815,
491
- "f1_micro_ci_low": 0.4807376602538022,
492
- "f1_micro_ci_high": 0.6962025316455697
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.6763636363636364,
496
- "f1_no": 0.7927272727272727,
497
- "f1_yes": 0.56,
498
- "f1_macro_ci_low": 0.6036339063806182,
499
- "f1_macro_ci_high": 0.7468677315003386,
500
  "score_name": "f1_micro",
501
- "score": 0.7306666666666667,
502
- "score_ci_high": 0.7853403141361257,
503
- "score_ci_low": 0.6630296211830374,
504
  "num_of_instances": 200,
505
- "accuracy": 0.685,
506
- "accuracy_ci_low": 0.62,
507
- "accuracy_ci_high": 0.745,
508
- "f1_micro": 0.7306666666666667,
509
- "f1_micro_ci_low": 0.6630296211830374,
510
- "f1_micro_ci_high": 0.7853403141361257
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.2642730181773706,
514
  "f1_conclusion": 0.0625,
515
- "f1_issue": 0.16326530612244897,
516
  "f1_decree": 0.2,
517
- "f1_analysis": 0.4375,
518
- "f1_facts": 0.32558139534883723,
519
- "f1_procedural history": 0.19047619047619047,
520
- "f1_rule": 0.47058823529411764,
521
- "f1_macro_ci_low": 0.20968219014642994,
522
- "f1_macro_ci_high": 0.33253527853885895,
 
523
  "score_name": "f1_micro",
524
- "score": 0.28938906752411575,
525
- "score_ci_high": 0.3618842117391186,
526
- "score_ci_low": 0.22364217252396165,
527
  "num_of_instances": 200,
528
- "accuracy": 0.225,
529
- "accuracy_ci_low": 0.175,
530
- "accuracy_ci_high": 0.29,
531
- "f1_micro": 0.28938906752411575,
532
- "f1_micro_ci_low": 0.22364217252396165,
533
- "f1_micro_ci_high": 0.3618842117391186
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5830978545264259,
537
- "f1_yes": 0.5918367346938775,
538
- "f1_no": 0.5743589743589743,
539
- "f1_macro_ci_low": 0.5111003138485096,
540
- "f1_macro_ci_high": 0.6506689237239318,
541
  "score_name": "f1_micro",
542
- "score": 0.5831202046035806,
543
- "score_ci_high": 0.649616368286445,
544
- "score_ci_low": 0.5114249450573659,
545
  "num_of_instances": 200,
546
- "accuracy": 0.57,
547
- "accuracy_ci_low": 0.4968446470094224,
548
- "accuracy_ci_high": 0.635,
549
- "f1_micro": 0.5831202046035806,
550
- "f1_micro_ci_low": 0.5114249450573659,
551
- "f1_micro_ci_high": 0.649616368286445
552
  },
553
  "legalbench_proa": {
554
  "f1_macro": 0.7776061776061776,
@@ -568,241 +568,241 @@
568
  "f1_micro_ci_low": 0.6950354609929078,
569
  "f1_micro_ci_high": 0.8435374149659864
570
  },
571
- "score": 0.5959359662443645,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.6167905620006249,
578
  "f1_cars": 0.8089887640449438,
579
  "f1_windows x": 0.06153846153846154,
580
- "f1_computer graphics": 0.5510204081632653,
581
  "f1_atheism": 0.1951219512195122,
582
- "f1_christianity": 0.8288288288288288,
583
- "f1_religion": 0.1568627450980392,
584
- "f1_medicine": 0.8505747126436781,
585
- "f1_microsoft windows": 0.75,
586
  "f1_middle east": 0.6666666666666666,
587
  "f1_motorcycles": 0.7619047619047619,
588
- "f1_politics": 0.359375,
589
- "f1_pc hardware": 0.6619718309859155,
590
- "f1_mac hardware": 0.7358490566037735,
591
- "f1_for sale": 0.5806451612903226,
592
- "f1_guns": 0.3561643835616438,
 
593
  "f1_space": 0.82,
594
- "f1_cryptography": 0.6666666666666666,
595
- "f1_baseball": 0.9166666666666666,
596
- "f1_hockey": 0.9402985074626866,
597
- "f1_electronics": 0.6666666666666666,
598
- "f1_macro_ci_low": 0.5929180247135345,
599
- "f1_macro_ci_high": 0.6464945617502024,
600
  "score_name": "f1_micro",
601
- "score": 0.661588683351469,
602
- "score_ci_high": 0.6918918918918919,
603
- "score_ci_low": 0.6351762173413632,
604
  "num_of_instances": 1000,
605
  "accuracy": 0.608,
606
- "accuracy_ci_low": 0.582,
607
- "accuracy_ci_high": 0.6398246959343236,
608
- "f1_micro": 0.661588683351469,
609
- "f1_micro_ci_low": 0.6351762173413632,
610
- "f1_micro_ci_high": 0.6918918918918919
611
  },
612
- "score": 0.661588683351469,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.7156339434074247,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.924907063197026,
620
- "f1_credit card or prepaid card": 0.8,
621
- "f1_debt collection": 0.659217877094972,
622
- "f1_checking or savings account": 0.8070175438596491,
623
- "f1_money transfer or virtual currency or money service": 0.6896551724137931,
624
- "f1_student loan": 0.7741935483870968,
625
- "f1_vehicle loan or lease": 0.625,
626
- "f1_mortgage": 0.875,
627
- "f1_payday loan or title loan or personal loan": 0.2857142857142857,
628
- "f1_macro_ci_low": 0.6660333456490072,
629
- "f1_macro_ci_high": 0.7763587756574478,
630
  "score_name": "f1_micro",
631
- "score": 0.863659401926001,
632
- "score_ci_high": 0.8836978702477332,
633
- "score_ci_low": 0.8417078870760507,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.852,
636
- "accuracy_ci_low": 0.8286876874270778,
637
- "accuracy_ci_high": 0.871,
638
- "f1_micro": 0.863659401926001,
639
- "f1_micro_ci_low": 0.8417078870760507,
640
- "f1_micro_ci_high": 0.8836978702477332
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.7760995406721664,
644
- "f1_mortgages and loans": 0.8539325842696629,
645
- "f1_credit card": 0.8444444444444444,
646
- "f1_debt collection": 0.7117117117117117,
647
  "f1_credit reporting": 0.752851711026616,
648
- "f1_retail banking": 0.7175572519083969,
649
- "f1_macro_ci_low": 0.7391680922929513,
650
- "f1_macro_ci_high": 0.8135480304798374,
651
  "score_name": "f1_micro",
652
  "score": 0.7741273100616016,
653
- "score_ci_high": 0.809811768563787,
654
- "score_ci_low": 0.7373612854039264,
655
  "num_of_instances": 500,
656
  "accuracy": 0.754,
657
  "accuracy_ci_low": 0.716,
658
- "accuracy_ci_high": 0.794,
659
  "f1_micro": 0.7741273100616016,
660
- "f1_micro_ci_low": 0.7373612854039264,
661
- "f1_micro_ci_high": 0.809811768563787
662
  },
663
- "score": 0.8188933559938013,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "execution_accuracy": 0.162,
671
- "program_accuracy": 0.182,
672
- "score": 0.182,
673
  "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.13950521872118443,
675
- "execution_accuracy_ci_high": 0.186,
676
- "program_accuracy_ci_low": 0.159,
677
- "program_accuracy_ci_high": 0.207,
678
- "score_ci_low": 0.159,
679
- "score_ci_high": 0.207
 
680
  },
681
- "score": 0.182,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.3472291622071149,
688
- "recall": 0.5550279207444034,
689
- "f1": 0.36009972180369076,
690
- "precision_ci_low": 0.3242384129530926,
691
- "precision_ci_high": 0.37066398547692364,
692
- "recall_ci_low": 0.5394838751494561,
693
- "recall_ci_high": 0.5723924699685504,
694
- "f1_ci_low": 0.3409334568421404,
695
- "f1_ci_high": 0.379665380653375,
696
  "score_name": "f1",
697
- "score": 0.36009972180369076,
698
- "score_ci_high": 0.379665380653375,
699
- "score_ci_low": 0.3409334568421404,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.6113383284211159,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6853377008934816,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5746090712646643,
704
- "faithfullness_f1_token_overlap": 0.2924701621871153,
705
- "faithfullness_recall_token_overlap": 0.21371625625934337,
706
- "faithfullness_precision_token_overlap": 0.5963665736789522,
707
- "correctness_f1_token_overlap": 0.36009972180369076,
708
- "correctness_recall_token_overlap": 0.5550279207444034,
709
- "correctness_precision_token_overlap": 0.3472291622071149
710
  },
711
- "score": 0.36009972180369076,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.56,
718
- "accuracy_ci_low": 0.531,
719
- "accuracy_ci_high": 0.591,
720
  "score_name": "accuracy",
721
- "score": 0.56,
722
- "score_ci_high": 0.591,
723
- "score_ci_low": 0.531,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.792,
728
- "accuracy_ci_low": 0.754,
729
- "accuracy_ci_high": 0.824,
730
  "score_name": "accuracy",
731
- "score": 0.792,
732
- "score_ci_high": 0.824,
733
- "score_ci_low": 0.754,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.676,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
  "severity_critical": 0.0,
743
- "severity_high": 1.0,
744
- "severity_medium": 3.0,
745
  "severity_low": 96.0,
746
- "category_harmful_info": 0.8329531024875386,
747
- "category_deception": 0.8794551490608491,
748
- "category_explicit_content": 0.8125141191512649,
749
- "category_violence": 0.802003665895216,
750
- "category_discrimination": 0.7836076390175591,
751
- "category_substance_abuse": 0.7930756719575989,
752
- "category_pii": 0.8714520589069084,
753
- "safety": 0.8206946513594852,
754
- "safety_ci_low": 0.7989320712853799,
755
- "safety_ci_high": 0.8426819113033088,
756
  "score_name": "safety",
757
- "score": 0.8206946513594852,
758
- "score_ci_high": 0.8426819113033088,
759
- "score_ci_low": 0.7989320712853799,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8206946513594852,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rouge1": 0.4312754109443129,
770
- "rougeL": 0.3112998958854518,
771
- "score": 0.3112998958854518,
772
  "score_name": "rougeL",
773
- "rouge2": 0.2266838399222002,
774
- "rougeLsum": 0.37538852494202224,
775
- "rouge1_ci_low": 0.421001593558283,
776
- "rouge1_ci_high": 0.4405064760072156,
777
- "rougeL_ci_low": 0.3032282310196722,
778
- "rougeL_ci_high": 0.3196260335861935,
779
- "score_ci_low": 0.3032282310196722,
780
- "score_ci_high": 0.3196260335861935,
781
- "rouge2_ci_low": 0.21901058073811183,
782
- "rouge2_ci_high": 0.23539084125213852,
783
- "rougeLsum_ci_low": 0.36595005735640374,
784
- "rougeLsum_ci_high": 0.3843319255959285
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rouge1": 0.12832603897211398,
789
- "rougeL": 0.09174745116852957,
790
- "score": 0.09174745116852957,
791
  "score_name": "rougeL",
792
- "rouge2": 0.018830792500723063,
793
- "rougeLsum": 0.1046960824796928,
794
- "rouge1_ci_low": 0.1226586252529639,
795
- "rouge1_ci_high": 0.13355174728344751,
796
- "rougeL_ci_low": 0.08777783885967977,
797
- "rougeL_ci_high": 0.09513486417451719,
798
- "score_ci_low": 0.08777783885967977,
799
- "score_ci_high": 0.09513486417451719,
800
- "rouge2_ci_low": 0.01683460814089518,
801
- "rouge2_ci_high": 0.020698285797084142,
802
- "rougeLsum_ci_low": 0.10022616921218402,
803
- "rougeLsum_ci_high": 0.10887661841550207
804
  },
805
- "score": 0.2015236735269907,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,473 +810,473 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 1275,
814
- 830,
815
- 586,
816
- 417
817
  ],
818
  "totals": [
819
- 1807,
820
- 1741,
821
- 1675,
822
- 1609
823
  ],
824
  "precisions": [
825
- 0.7055893746541229,
826
- 0.47673750717978175,
827
- 0.34985074626865675,
828
- 0.2591671845866998
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1807,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.41790112689604164,
834
- "score": 0.41790112689604164,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.37467076425053164,
837
- "score_ci_high": 0.46212421285570143,
838
- "sacrebleu_ci_low": 0.37467076425053164,
839
- "sacrebleu_ci_high": 0.46212421285570143
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1296,
845
- 865,
846
- 615,
847
- 440
848
  ],
849
  "totals": [
850
- 1821,
851
- 1755,
852
- 1689,
853
- 1623
854
  ],
855
  "precisions": [
856
- 0.7116968698517299,
857
- 0.49287749287749283,
858
- 0.36412078152753113,
859
- 0.2711028958718423
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1821,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.4313734544798424,
865
- "score": 0.4313734544798424,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.39011475848676946,
868
- "score_ci_high": 0.46975242344141427,
869
- "sacrebleu_ci_low": 0.39011475848676946,
870
- "sacrebleu_ci_high": 0.46975242344141427
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
- 963,
876
- 569,
877
- 359,
878
- 232
879
  ],
880
  "totals": [
881
- 1592,
882
- 1526,
883
- 1460,
884
- 1394
885
  ],
886
  "precisions": [
887
- 0.6048994974874372,
888
- 0.372870249017038,
889
- 0.24589041095890413,
890
- 0.16642754662840747
891
  ],
892
- "bp": 1.0,
893
- "sys_len": 1592,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.3099573506400797,
896
- "score": 0.3099573506400797,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.2802812834383563,
899
- "score_ci_high": 0.34144542695693814,
900
- "sacrebleu_ci_low": 0.2802812834383563,
901
- "sacrebleu_ci_high": 0.34144542695693814
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 1260,
907
- 822,
908
- 589,
909
- 442
910
  ],
911
  "totals": [
912
- 1834,
913
- 1768,
914
- 1702,
915
- 1636
916
  ],
917
  "precisions": [
918
- 0.6870229007633588,
919
- 0.4649321266968326,
920
- 0.34606345475910694,
921
- 0.2701711491442543
922
  ],
923
- "bp": 0.9994548923547389,
924
- "sys_len": 1834,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.41548186092135636,
927
- "score": 0.41548186092135636,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.3641424966826166,
930
- "score_ci_high": 0.46838012385794353,
931
- "sacrebleu_ci_low": 0.3641424966826166,
932
- "sacrebleu_ci_high": 0.46838012385794353
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1594,
938
- 1232,
939
- 984,
940
- 810
941
  ],
942
  "totals": [
943
- 2012,
944
- 1946,
945
- 1880,
946
- 1814
947
  ],
948
  "precisions": [
949
- 0.7922465208747514,
950
- 0.6330935251798561,
951
- 0.5234042553191489,
952
- 0.44652701212789414
953
  ],
954
- "bp": 0.9725507672852267,
955
- "sys_len": 2012,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.5690698546093208,
958
- "score": 0.5690698546093208,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.52882560047152,
961
- "score_ci_high": 0.6139798297379135,
962
- "sacrebleu_ci_low": 0.52882560047152,
963
- "sacrebleu_ci_high": 0.6139798297379135
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 1376,
969
- 781,
970
- 495,
971
- 326
972
  ],
973
  "totals": [
974
- 2361,
975
- 2295,
976
- 2229,
977
- 2163
978
  ],
979
  "precisions": [
980
- 0.5828038966539602,
981
- 0.34030501089324616,
982
- 0.22207267833109018,
983
- 0.15071659731853906
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2361,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.28543797421890393,
989
- "score": 0.28543797421890393,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.2567799199475619,
992
- "score_ci_high": 0.31814664829371603,
993
- "sacrebleu_ci_low": 0.2567799199475619,
994
- "sacrebleu_ci_high": 0.31814664829371603
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1451,
1000
- 1053,
1001
- 814,
1002
- 640
1003
  ],
1004
  "totals": [
1005
- 1898,
1006
- 1832,
1007
- 1766,
1008
- 1700
1009
  ],
1010
  "precisions": [
1011
- 0.7644889357218125,
1012
- 0.5747816593886462,
1013
- 0.46092865232163077,
1014
- 0.3764705882352941
1015
  ],
1016
- "bp": 0.9905611611284771,
1017
- "sys_len": 1898,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.52052430367519,
1020
- "score": 0.52052430367519,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.46893066617087675,
1023
- "score_ci_high": 0.5524273546032454,
1024
- "sacrebleu_ci_low": 0.46893066617087675,
1025
- "sacrebleu_ci_high": 0.5524273546032454
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 1416,
1031
- 1021,
1032
- 771,
1033
- 581
1034
  ],
1035
  "totals": [
1036
- 1945,
1037
- 1879,
1038
- 1813,
1039
- 1747
1040
  ],
1041
  "precisions": [
1042
- 0.7280205655526992,
1043
- 0.5433741351782863,
1044
- 0.42526199669056813,
1045
- 0.3325701202060675
1046
  ],
1047
- "bp": 0.9979455579909386,
1048
- "sys_len": 1945,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.4853471146871033,
1051
- "score": 0.4853471146871033,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.45737023291404577,
1054
- "score_ci_high": 0.5267894003889202,
1055
- "sacrebleu_ci_low": 0.45737023291404577,
1056
- "sacrebleu_ci_high": 0.5267894003889202
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
- 1269,
1062
- 719,
1063
- 439,
1064
- 270
1065
  ],
1066
  "totals": [
1067
- 1972,
1068
- 1906,
1069
- 1840,
1070
- 1774
1071
  ],
1072
  "precisions": [
1073
- 0.6435091277890467,
1074
- 0.3772298006295907,
1075
- 0.23858695652173914,
1076
- 0.15219842164599773
1077
  ],
1078
- "bp": 0.9381039423957293,
1079
- "sys_len": 1972,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.28744539070468517,
1082
- "score": 0.28744539070468517,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.2566627625544366,
1085
- "score_ci_high": 0.3146920359001326,
1086
- "sacrebleu_ci_low": 0.2566627625544366,
1087
- "sacrebleu_ci_high": 0.3146920359001326
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1324,
1093
- 920,
1094
- 666,
1095
- 477
1096
  ],
1097
  "totals": [
1098
- 1893,
1099
- 1827,
1100
- 1761,
1101
- 1695
1102
  ],
1103
  "precisions": [
1104
- 0.6994189117802431,
1105
- 0.5035577449370553,
1106
- 0.3781942078364565,
1107
- 0.2814159292035398
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1893,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.44001000355221576,
1113
- "score": 0.44001000355221576,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.40032693502671896,
1116
- "score_ci_high": 0.4786325473299439,
1117
- "sacrebleu_ci_low": 0.40032693502671896,
1118
- "sacrebleu_ci_high": 0.4786325473299439
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 1135,
1124
- 622,
1125
- 371,
1126
- 231
1127
  ],
1128
  "totals": [
1129
- 2011,
1130
- 1945,
1131
- 1879,
1132
- 1813
1133
  ],
1134
  "precisions": [
1135
- 0.564395822973645,
1136
- 0.3197943444730077,
1137
- 0.19744544970729114,
1138
- 0.1274131274131274
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 2011,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.259584626709523,
1144
- "score": 0.259584626709523,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.22697300814650573,
1147
- "score_ci_high": 0.2908889327642379,
1148
- "sacrebleu_ci_low": 0.22697300814650573,
1149
- "sacrebleu_ci_high": 0.2908889327642379
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 1120,
1155
- 635,
1156
- 400,
1157
- 262
1158
  ],
1159
  "totals": [
1160
- 1861,
1161
- 1795,
1162
- 1729,
1163
- 1663
1164
  ],
1165
  "precisions": [
1166
- 0.6018269747447609,
1167
- 0.3537604456824513,
1168
- 0.2313475997686524,
1169
- 0.15754660252555622
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 1861,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.29679989332755485,
1175
- "score": 0.29679989332755485,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.2606890146882507,
1178
- "score_ci_high": 0.34577889084499225,
1179
- "sacrebleu_ci_low": 0.2606890146882507,
1180
- "sacrebleu_ci_high": 0.34577889084499225
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1324,
1186
- 947,
1187
- 723,
1188
- 555
1189
  ],
1190
  "totals": [
1191
- 1810,
1192
- 1744,
1193
- 1678,
1194
- 1612
1195
  ],
1196
  "precisions": [
1197
- 0.7314917127071823,
1198
- 0.5430045871559632,
1199
- 0.4308700834326579,
1200
- 0.3442928039702233
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1810,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.49268778913543754,
1206
- "score": 0.49268778913543754,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.40671829932229786,
1209
- "score_ci_high": 0.5323333720792753,
1210
- "sacrebleu_ci_low": 0.40671829932229786,
1211
- "sacrebleu_ci_high": 0.5323333720792753
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
  1324,
1217
- 922,
1218
- 663,
1219
- 481
1220
  ],
1221
  "totals": [
1222
- 1831,
1223
- 1765,
1224
- 1699,
1225
- 1633
1226
  ],
1227
  "precisions": [
1228
- 0.7231021299836154,
1229
- 0.5223796033994335,
1230
- 0.3902295467922307,
1231
- 0.2945499081445193
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1831,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.4564741865050132,
1237
- "score": 0.4564741865050132,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.38825961035825357,
1240
- "score_ci_high": 0.5021079213683431,
1241
- "sacrebleu_ci_low": 0.38825961035825357,
1242
- "sacrebleu_ci_high": 0.5021079213683431
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1203,
1248
- 678,
1249
- 414,
1250
- 248
1251
  ],
1252
  "totals": [
1253
- 1924,
1254
- 1858,
1255
- 1792,
1256
- 1726
1257
  ],
1258
  "precisions": [
1259
- 0.6252598752598753,
1260
- 0.36490850376749195,
1261
- 0.23102678571428573,
1262
- 0.1436848203939745
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1924,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.2950050444470296,
1268
- "score": 0.2950050444470296,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.25511324454078954,
1271
- "score_ci_high": 0.33543848071805,
1272
- "sacrebleu_ci_low": 0.25511324454078954,
1273
- "sacrebleu_ci_high": 0.33543848071805
1274
  },
1275
- "score": 0.3975399983006198,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.5386099714179504,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-23T10:18:29.800050Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.9,
260
+ "accuracy_ci_low": 0.8333333333333334,
261
+ "accuracy_ci_high": 0.9555555555555556,
262
  "score_name": "accuracy",
263
+ "score": 0.9,
264
+ "score_ci_high": 0.9555555555555556,
265
+ "score_ci_low": 0.8333333333333334,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
 
285
  "score_ci_low": 0.7888888888888889,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.9636363636363636,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.14444444444444443,
296
+ "score": 0.14444444444444443,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.14444444444444443,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.5653333333333334,
307
+ "f1_Organization": 0.33757961783439494,
308
+ "f1_Location": 0.3529411764705882,
309
+ "f1_macro": 0.4186180425461055,
310
+ "recall_macro": 0.3749591226403319,
311
+ "precision_macro": 0.47607168955040186,
312
+ "in_classes_support": 0.4988095238095238,
313
+ "f1_micro": 0.2989010989010989,
314
+ "recall_micro": 0.38857142857142857,
315
+ "precision_micro": 0.24285714285714285,
316
+ "score": 0.2989010989010989,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.25706421958601,
319
+ "score_ci_high": 0.34884085688698435,
320
+ "f1_micro_ci_low": 0.25706421958601,
321
+ "f1_micro_ci_high": 0.34884085688698435
322
  },
323
+ "score": 0.2989010989010989,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.5633802816901409,
330
+ "accuracy_ci_low": 0.4647887323943662,
331
+ "accuracy_ci_high": 0.6855024917261459,
332
  "score_name": "accuracy",
333
+ "score": 0.5633802816901409,
334
+ "score_ci_high": 0.6855024917261459,
335
+ "score_ci_low": 0.4647887323943662,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.29577464788732394,
340
+ "accuracy_ci_low": 0.19718309859154928,
341
+ "accuracy_ci_high": 0.4225352112676056,
342
  "score_name": "accuracy",
343
+ "score": 0.29577464788732394,
344
+ "score_ci_high": 0.4225352112676056,
345
+ "score_ci_low": 0.19718309859154928,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.22535211267605634,
350
+ "accuracy_ci_low": 0.14084507042253522,
351
+ "accuracy_ci_high": 0.323943661971831,
352
  "score_name": "accuracy",
353
+ "score": 0.22535211267605634,
354
+ "score_ci_high": 0.323943661971831,
355
+ "score_ci_low": 0.14084507042253522,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
 
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.676056338028169,
370
+ "accuracy_ci_low": 0.5596886617559699,
371
+ "accuracy_ci_high": 0.7746478873239436,
372
  "score_name": "accuracy",
373
+ "score": 0.676056338028169,
374
+ "score_ci_high": 0.7746478873239436,
375
+ "score_ci_low": 0.5596886617559699,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.4225352112676056,
380
+ "accuracy_ci_low": 0.30985915492957744,
381
+ "accuracy_ci_high": 0.5488120473991023,
382
  "score_name": "accuracy",
383
+ "score": 0.4225352112676056,
384
+ "score_ci_high": 0.5488120473991023,
385
+ "score_ci_low": 0.30985915492957744,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.49295774647887325,
390
+ "accuracy_ci_low": 0.36619718309859156,
391
+ "accuracy_ci_high": 0.5915492957746479,
392
  "score_name": "accuracy",
393
+ "score": 0.49295774647887325,
394
+ "score_ci_high": 0.5915492957746479,
395
+ "score_ci_low": 0.36619718309859156,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.7605633802816901,
400
+ "accuracy_ci_low": 0.647887323943662,
401
+ "accuracy_ci_high": 0.8450704225352113,
402
  "score_name": "accuracy",
403
+ "score": 0.7605633802816901,
404
+ "score_ci_high": 0.8450704225352113,
405
+ "score_ci_low": 0.647887323943662,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.6056338028169014,
410
+ "accuracy_ci_low": 0.49295774647887325,
411
+ "accuracy_ci_high": 0.7183098591549296,
412
  "score_name": "accuracy",
413
+ "score": 0.6056338028169014,
414
+ "score_ci_high": 0.7183098591549296,
415
+ "score_ci_low": 0.49295774647887325,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.2535211267605634,
420
+ "accuracy_ci_low": 0.15492957746478872,
421
+ "accuracy_ci_high": 0.36619718309859156,
422
  "score_name": "accuracy",
423
+ "score": 0.2535211267605634,
424
+ "score_ci_high": 0.36619718309859156,
425
+ "score_ci_low": 0.15492957746478872,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.5211267605633803,
430
+ "accuracy_ci_low": 0.39436619718309857,
431
+ "accuracy_ci_high": 0.6338028169014085,
432
  "score_name": "accuracy",
433
+ "score": 0.5211267605633803,
434
+ "score_ci_high": 0.6338028169014085,
435
+ "score_ci_low": 0.39436619718309857,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
  "accuracy": 0.6901408450704225,
440
+ "accuracy_ci_low": 0.5633802816901409,
441
  "accuracy_ci_high": 0.7887323943661971,
442
  "score_name": "accuracy",
443
  "score": 0.6901408450704225,
444
  "score_ci_high": 0.7887323943661971,
445
+ "score_ci_low": 0.5633802816901409,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.39436619718309857,
450
+ "accuracy_ci_low": 0.2742524569401369,
451
+ "accuracy_ci_high": 0.5070422535211268,
452
  "score_name": "accuracy",
453
+ "score": 0.39436619718309857,
454
+ "score_ci_high": 0.5070422535211268,
455
+ "score_ci_low": 0.2742524569401369,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.6619718309859155,
460
+ "accuracy_ci_low": 0.5492957746478874,
461
  "accuracy_ci_high": 0.7605633802816901,
462
  "score_name": "accuracy",
463
+ "score": 0.6619718309859155,
464
  "score_ci_high": 0.7605633802816901,
465
+ "score_ci_low": 0.5492957746478874,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.5050301810865191,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.5900679117147707,
475
+ "f1_suggestive": 0.5161290322580645,
476
  "f1_generic": 0.6666666666666666,
477
+ "f1_descriptive": 0.6842105263157895,
478
  "f1_fanciful": 0.4166666666666667,
479
+ "f1_arbitrary": 0.6666666666666666,
480
+ "f1_macro_ci_low": 0.48866628515797084,
481
+ "f1_macro_ci_high": 0.6952557983585582,
482
  "score_name": "f1_micro",
483
+ "score": 0.6037735849056604,
484
+ "score_ci_high": 0.6980886219395492,
485
+ "score_ci_low": 0.4810734018080045,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.5647058823529412,
488
+ "accuracy_ci_low": 0.4470588235294118,
489
  "accuracy_ci_high": 0.6588235294117647,
490
+ "f1_micro": 0.6037735849056604,
491
+ "f1_micro_ci_low": 0.4810734018080045,
492
+ "f1_micro_ci_high": 0.6980886219395492
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.6789943663051392,
496
+ "f1_no": 0.7806691449814126,
497
+ "f1_yes": 0.5773195876288659,
498
+ "f1_macro_ci_low": 0.6027203961421103,
499
+ "f1_macro_ci_high": 0.7510411869611957,
500
  "score_name": "f1_micro",
501
+ "score": 0.726775956284153,
502
+ "score_ci_high": 0.7809156964912598,
503
+ "score_ci_low": 0.6593055710063558,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.665,
506
+ "accuracy_ci_low": 0.595,
507
+ "accuracy_ci_high": 0.725,
508
+ "f1_micro": 0.726775956284153,
509
+ "f1_micro_ci_low": 0.6593055710063558,
510
+ "f1_micro_ci_high": 0.7809156964912598
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.2756818181818182,
514
  "f1_conclusion": 0.0625,
 
515
  "f1_decree": 0.2,
516
+ "f1_issue": 0.16666666666666666,
517
+ "f1_analysis": 0.5,
518
+ "f1_facts": 0.3333333333333333,
519
+ "f1_procedural history": 0.22727272727272727,
520
+ "f1_rule": 0.44,
521
+ "f1_macro_ci_low": 0.21996524889806546,
522
+ "f1_macro_ci_high": 0.34934856402818654,
523
  "score_name": "f1_micro",
524
+ "score": 0.3032258064516129,
525
+ "score_ci_high": 0.37934863351152043,
526
+ "score_ci_low": 0.23767600886432785,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.235,
529
+ "accuracy_ci_low": 0.18,
530
+ "accuracy_ci_high": 0.3,
531
+ "f1_micro": 0.3032258064516129,
532
+ "f1_micro_ci_low": 0.23767600886432785,
533
+ "f1_micro_ci_high": 0.37934863351152043
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5552471583399419,
537
+ "f1_yes": 0.5463917525773195,
538
+ "f1_no": 0.5641025641025641,
539
+ "f1_macro_ci_low": 0.4896587694098791,
540
+ "f1_macro_ci_high": 0.6253621455216213,
541
  "score_name": "f1_micro",
542
+ "score": 0.5552699228791774,
543
+ "score_ci_high": 0.6232020193247749,
544
+ "score_ci_low": 0.48717948717948717,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.54,
547
+ "accuracy_ci_low": 0.47,
548
+ "accuracy_ci_high": 0.61,
549
+ "f1_micro": 0.5552699228791774,
550
+ "f1_micro_ci_low": 0.48717948717948717,
551
+ "f1_micro_ci_high": 0.6232020193247749
552
  },
553
  "legalbench_proa": {
554
  "f1_macro": 0.7776061776061776,
 
568
  "f1_micro_ci_low": 0.6950354609929078,
569
  "f1_micro_ci_high": 0.8435374149659864
570
  },
571
+ "score": 0.5933646096596763,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.6207188447832841,
578
  "f1_cars": 0.8089887640449438,
579
  "f1_windows x": 0.06153846153846154,
580
+ "f1_computer graphics": 0.5591397849462365,
581
  "f1_atheism": 0.1951219512195122,
582
+ "f1_christianity": 0.8148148148148148,
583
+ "f1_religion": 0.16666666666666666,
584
+ "f1_medicine": 0.8409090909090909,
585
+ "f1_microsoft windows": 0.7115384615384616,
586
  "f1_middle east": 0.6666666666666666,
587
  "f1_motorcycles": 0.7619047619047619,
588
+ "f1_politics": 0.3709677419354839,
589
+ "f1_pc hardware": 0.6524822695035462,
590
+ "f1_mac hardware": 0.7169811320754716,
591
+ "f1_electronics": 0.6746987951807228,
592
+ "f1_for sale": 0.6451612903225806,
593
+ "f1_guns": 0.40540540540540543,
594
  "f1_space": 0.82,
595
+ "f1_cryptography": 0.684931506849315,
596
+ "f1_baseball": 0.9090909090909091,
597
+ "f1_hockey": 0.9473684210526315,
598
+ "f1_macro_ci_low": 0.5972493284306833,
599
+ "f1_macro_ci_high": 0.6520732498423311,
 
600
  "score_name": "f1_micro",
601
+ "score": 0.6644808743169399,
602
+ "score_ci_high": 0.6954593267547653,
603
+ "score_ci_low": 0.6374402731127434,
604
  "num_of_instances": 1000,
605
  "accuracy": 0.608,
606
+ "accuracy_ci_low": 0.58,
607
+ "accuracy_ci_high": 0.64,
608
+ "f1_micro": 0.6644808743169399,
609
+ "f1_micro_ci_low": 0.6374402731127434,
610
+ "f1_micro_ci_high": 0.6954593267547653
611
  },
612
+ "score": 0.6644808743169399,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.707429477184356,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9267935578330894,
620
+ "f1_checking or savings account": 0.7964601769911505,
621
+ "f1_debt collection": 0.5952380952380952,
622
+ "f1_credit card or prepaid card": 0.7777777777777778,
623
+ "f1_mortgage": 0.8611111111111112,
624
+ "f1_student loan": 0.8125,
625
+ "f1_money transfer or virtual currency or money service": 0.851063829787234,
626
+ "f1_vehicle loan or lease": 0.5641025641025641,
627
+ "f1_payday loan or title loan or personal loan": 0.18181818181818182,
628
+ "f1_macro_ci_low": 0.6648851525959504,
629
+ "f1_macro_ci_high": 0.7723728512116876,
630
  "score_name": "f1_micro",
631
+ "score": 0.8642350557244174,
632
+ "score_ci_high": 0.8836251312776043,
633
+ "score_ci_low": 0.843700754195778,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.853,
636
+ "accuracy_ci_low": 0.83,
637
+ "accuracy_ci_high": 0.873,
638
+ "f1_micro": 0.8642350557244174,
639
+ "f1_micro_ci_low": 0.843700754195778,
640
+ "f1_micro_ci_high": 0.8836251312776043
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.7776035677272286,
644
+ "f1_mortgages and loans": 0.8491620111731844,
645
+ "f1_credit card": 0.8491620111731844,
646
+ "f1_debt collection": 0.7,
647
  "f1_credit reporting": 0.752851711026616,
648
+ "f1_retail banking": 0.7368421052631579,
649
+ "f1_macro_ci_low": 0.7421964970208773,
650
+ "f1_macro_ci_high": 0.8128714170953505,
651
  "score_name": "f1_micro",
652
  "score": 0.7741273100616016,
653
+ "score_ci_high": 0.808137127901691,
654
+ "score_ci_low": 0.7371050801783955,
655
  "num_of_instances": 500,
656
  "accuracy": 0.754,
657
  "accuracy_ci_low": 0.716,
658
+ "accuracy_ci_high": 0.79,
659
  "f1_micro": 0.7741273100616016,
660
+ "f1_micro_ci_low": 0.7371050801783955,
661
+ "f1_micro_ci_high": 0.808137127901691
662
  },
663
+ "score": 0.8191811828930096,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "program_accuracy": 0.178,
671
+ "score": 0.178,
 
672
  "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.158,
674
+ "program_accuracy_ci_low": 0.156,
675
+ "program_accuracy_ci_high": 0.201,
676
+ "score_ci_low": 0.156,
677
+ "score_ci_high": 0.201,
678
+ "execution_accuracy_ci_low": 0.136,
679
+ "execution_accuracy_ci_high": 0.18
680
  },
681
+ "score": 0.178,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.3443634549583665,
688
+ "recall": 0.5541949627099935,
689
+ "f1": 0.3584271235061802,
690
+ "precision_ci_low": 0.3209963473848064,
691
+ "precision_ci_high": 0.36733937026281865,
692
+ "recall_ci_low": 0.5380844738077261,
693
+ "recall_ci_high": 0.5700354927476148,
694
+ "f1_ci_low": 0.3384679590296986,
695
+ "f1_ci_high": 0.3789111960225323,
696
  "score_name": "f1",
697
+ "score": 0.3584271235061802,
698
+ "score_ci_high": 0.3789111960225323,
699
+ "score_ci_low": 0.3384679590296986,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6108398385842642,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6847132598857085,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.573952042584618,
704
+ "faithfullness_f1_token_overlap": 0.293473712088948,
705
+ "faithfullness_recall_token_overlap": 0.2148085102518536,
706
+ "faithfullness_precision_token_overlap": 0.5936738226108105,
707
+ "correctness_f1_token_overlap": 0.3584271235061802,
708
+ "correctness_recall_token_overlap": 0.5541949627099935,
709
+ "correctness_precision_token_overlap": 0.3443634549583665
710
  },
711
+ "score": 0.3584271235061802,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.554,
718
+ "accuracy_ci_low": 0.5224818891638672,
719
+ "accuracy_ci_high": 0.585,
720
  "score_name": "accuracy",
721
+ "score": 0.554,
722
+ "score_ci_high": 0.585,
723
+ "score_ci_low": 0.5224818891638672,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.786,
728
+ "accuracy_ci_low": 0.752,
729
+ "accuracy_ci_high": 0.8217351190918832,
730
  "score_name": "accuracy",
731
+ "score": 0.786,
732
+ "score_ci_high": 0.8217351190918832,
733
+ "score_ci_low": 0.752,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.67,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
  "severity_critical": 0.0,
743
+ "severity_high": 0.0,
744
+ "severity_medium": 4.0,
745
  "severity_low": 96.0,
746
+ "category_harmful_info": 0.8561736831500655,
747
+ "category_deception": 0.9073135512645798,
748
+ "category_explicit_content": 0.7910330542410263,
749
+ "category_violence": 0.80168207176621,
750
+ "category_discrimination": 0.8618445271581766,
751
+ "category_substance_abuse": 0.8289476109875572,
752
+ "category_pii": 0.8585629948863277,
753
+ "safety": 0.8346688703028485,
754
+ "safety_ci_low": 0.8100697875896787,
755
+ "safety_ci_high": 0.8557748119272056,
756
  "score_name": "safety",
757
+ "score": 0.8346688703028485,
758
+ "score_ci_high": 0.8557748119272056,
759
+ "score_ci_low": 0.8100697875896787,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.8346688703028485,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rougeLsum": 0.37773185785838354,
770
+ "rougeL": 0.3129919661485272,
771
+ "score": 0.3129919661485272,
772
  "score_name": "rougeL",
773
+ "rouge1": 0.43240920102765396,
774
+ "rouge2": 0.22703955569027665,
775
+ "rougeLsum_ci_low": 0.36830039067564885,
776
+ "rougeLsum_ci_high": 0.3873179684384486,
777
+ "rougeL_ci_low": 0.3049039054154354,
778
+ "rougeL_ci_high": 0.32111736301049143,
779
+ "score_ci_low": 0.3049039054154354,
780
+ "score_ci_high": 0.32111736301049143,
781
+ "rouge1_ci_low": 0.4218532055678653,
782
+ "rouge1_ci_high": 0.4422878104066809,
783
+ "rouge2_ci_low": 0.21929685324826398,
784
+ "rouge2_ci_high": 0.23594489559189138
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rougeLsum": 0.10580614433716286,
789
+ "rougeL": 0.09291382159031186,
790
+ "score": 0.09291382159031186,
791
  "score_name": "rougeL",
792
+ "rouge1": 0.1292682365835658,
793
+ "rouge2": 0.01895410897411973,
794
+ "rougeLsum_ci_low": 0.10084383699373096,
795
+ "rougeLsum_ci_high": 0.10979495591939617,
796
+ "rougeL_ci_low": 0.0886493583320966,
797
+ "rougeL_ci_high": 0.09629333556794349,
798
+ "score_ci_low": 0.0886493583320966,
799
+ "score_ci_high": 0.09629333556794349,
800
+ "rouge1_ci_low": 0.1233156289283472,
801
+ "rouge1_ci_high": 0.13423125610698836,
802
+ "rouge2_ci_low": 0.017054021331647896,
803
+ "rouge2_ci_high": 0.02104099399428594
804
  },
805
+ "score": 0.20295289386941953,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1256,
814
+ 809,
815
+ 561,
816
+ 393
817
  ],
818
  "totals": [
819
+ 1822,
820
+ 1756,
821
+ 1690,
822
+ 1624
823
  ],
824
  "precisions": [
825
+ 0.6893523600439078,
826
+ 0.4607061503416856,
827
+ 0.3319526627218935,
828
+ 0.2419950738916256
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 1822,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.39965660032074374,
834
+ "score": 0.39965660032074374,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.34477468044834286,
837
+ "score_ci_high": 0.4416384677344608,
838
+ "sacrebleu_ci_low": 0.34477468044834286,
839
+ "sacrebleu_ci_high": 0.4416384677344608
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1282,
845
+ 858,
846
+ 611,
847
+ 439
848
  ],
849
  "totals": [
850
+ 1827,
851
+ 1761,
852
+ 1695,
853
+ 1629
854
  ],
855
  "precisions": [
856
+ 0.7016967706622879,
857
+ 0.48722316865417375,
858
+ 0.36047197640117995,
859
+ 0.26949048496009825
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 1827,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.42689698575484597,
865
+ "score": 0.42689698575484597,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.3808995454883676,
868
+ "score_ci_high": 0.47419332084644833,
869
+ "sacrebleu_ci_low": 0.3808995454883676,
870
+ "sacrebleu_ci_high": 0.47419332084644833
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
+ 974,
876
+ 591,
877
+ 379,
878
+ 251
879
  ],
880
  "totals": [
881
+ 1588,
882
+ 1522,
883
+ 1456,
884
+ 1390
885
  ],
886
  "precisions": [
887
+ 0.6133501259445844,
888
+ 0.38830486202365305,
889
+ 0.2603021978021978,
890
+ 0.18057553956834532
891
  ],
892
+ "bp": 0.9993704753119519,
893
+ "sys_len": 1588,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.3250730946308182,
896
+ "score": 0.3250730946308182,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.28933963986645983,
899
+ "score_ci_high": 0.36869689361591035,
900
+ "sacrebleu_ci_low": 0.28933963986645983,
901
+ "sacrebleu_ci_high": 0.36869689361591035
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 1257,
907
+ 811,
908
+ 576,
909
+ 437
910
  ],
911
  "totals": [
912
+ 1815,
913
+ 1749,
914
+ 1683,
915
+ 1617
916
  ],
917
  "precisions": [
918
+ 0.6925619834710744,
919
+ 0.46369353916523726,
920
+ 0.34224598930481287,
921
+ 0.2702535559678417
922
  ],
923
+ "bp": 0.98904120617152,
924
+ "sys_len": 1815,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.41059556612028536,
927
+ "score": 0.41059556612028536,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.35457040262235134,
930
+ "score_ci_high": 0.459990812104818,
931
+ "sacrebleu_ci_low": 0.35457040262235134,
932
+ "sacrebleu_ci_high": 0.459990812104818
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1601,
938
+ 1237,
939
+ 986,
940
+ 807
941
  ],
942
  "totals": [
943
+ 2017,
944
+ 1951,
945
+ 1885,
946
+ 1819
947
  ],
948
  "precisions": [
949
+ 0.7937530986613782,
950
+ 0.6340338288057407,
951
+ 0.5230769230769231,
952
+ 0.44365035733919733
953
  ],
954
+ "bp": 0.9750319133813282,
955
+ "sys_len": 2017,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.5699934901400187,
958
+ "score": 0.5699934901400187,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.5326539789859114,
961
+ "score_ci_high": 0.6234642421169655,
962
+ "sacrebleu_ci_low": 0.5326539789859114,
963
+ "sacrebleu_ci_high": 0.6234642421169655
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 1367,
969
+ 786,
970
+ 494,
971
+ 321
972
  ],
973
  "totals": [
974
+ 2312,
975
+ 2246,
976
+ 2180,
977
+ 2114
978
  ],
979
  "precisions": [
980
+ 0.5912629757785467,
981
+ 0.34995547640249336,
982
+ 0.22660550458715598,
983
+ 0.15184484389782404
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 2312,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.2904798394859776,
989
+ "score": 0.2904798394859776,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.25673627398166365,
992
+ "score_ci_high": 0.3237414974447857,
993
+ "sacrebleu_ci_low": 0.25673627398166365,
994
+ "sacrebleu_ci_high": 0.3237414974447857
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1458,
1000
+ 1061,
1001
+ 822,
1002
+ 653
1003
  ],
1004
  "totals": [
1005
+ 1897,
1006
+ 1831,
1007
+ 1765,
1008
+ 1699
1009
  ],
1010
  "precisions": [
1011
+ 0.768581971534001,
1012
+ 0.5794647733478974,
1013
+ 0.4657223796033994,
1014
+ 0.38434373160682755
1015
  ],
1016
+ "bp": 0.9900341767854584,
1017
+ "sys_len": 1897,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.5260671977764972,
1020
+ "score": 0.5260671977764972,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.475872231232826,
1023
+ "score_ci_high": 0.5725271086507513,
1024
+ "sacrebleu_ci_low": 0.475872231232826,
1025
+ "sacrebleu_ci_high": 0.5725271086507513
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 1404,
1031
+ 1006,
1032
+ 750,
1033
+ 558
1034
  ],
1035
  "totals": [
1036
+ 1938,
1037
+ 1872,
1038
+ 1806,
1039
+ 1740
1040
  ],
1041
  "precisions": [
1042
+ 0.7244582043343654,
1043
+ 0.5373931623931624,
1044
+ 0.4152823920265781,
1045
+ 0.3206896551724138
1046
  ],
1047
+ "bp": 0.994340123204573,
1048
+ "sys_len": 1938,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.47448058787166153,
1051
+ "score": 0.47448058787166153,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.44387053770037466,
1054
+ "score_ci_high": 0.51824756405881,
1055
+ "sacrebleu_ci_low": 0.44387053770037466,
1056
+ "sacrebleu_ci_high": 0.51824756405881
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1267,
1062
+ 721,
1063
+ 443,
1064
+ 269
1065
  ],
1066
  "totals": [
1067
+ 1960,
1068
+ 1894,
1069
+ 1828,
1070
+ 1762
1071
  ],
1072
  "precisions": [
1073
+ 0.6464285714285714,
1074
+ 0.38067581837381204,
1075
+ 0.24234135667396062,
1076
+ 0.15266742338251987
1077
  ],
1078
+ "bp": 0.932013328656422,
1079
+ "sys_len": 1960,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.28789528964668276,
1082
+ "score": 0.28789528964668276,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.2550898701517229,
1085
+ "score_ci_high": 0.32315043281050926,
1086
+ "sacrebleu_ci_low": 0.2550898701517229,
1087
+ "sacrebleu_ci_high": 0.32315043281050926
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1342,
1093
+ 940,
1094
+ 684,
1095
+ 495
1096
  ],
1097
  "totals": [
1098
+ 1861,
1099
+ 1795,
1100
+ 1729,
1101
+ 1663
1102
  ],
1103
  "precisions": [
1104
+ 0.7211176786673832,
1105
+ 0.5236768802228412,
1106
+ 0.39560439560439564,
1107
+ 0.29765484064942876
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 1861,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.45920953842389034,
1113
+ "score": 0.45920953842389034,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.4189257677908612,
1116
+ "score_ci_high": 0.5012078702279882,
1117
+ "sacrebleu_ci_low": 0.4189257677908612,
1118
+ "sacrebleu_ci_high": 0.5012078702279882
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 1124,
1124
+ 603,
1125
+ 358,
1126
+ 209
1127
  ],
1128
  "totals": [
1129
+ 1910,
1130
+ 1844,
1131
+ 1778,
1132
+ 1712
1133
  ],
1134
  "precisions": [
1135
+ 0.5884816753926702,
1136
+ 0.32700650759219085,
1137
+ 0.20134983127109113,
1138
+ 0.12207943925233644
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 1910,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.26225319254842555,
1144
+ "score": 0.26225319254842555,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.2289709766377336,
1147
+ "score_ci_high": 0.30157417779677487,
1148
+ "sacrebleu_ci_low": 0.2289709766377336,
1149
+ "sacrebleu_ci_high": 0.30157417779677487
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 1117,
1155
+ 625,
1156
+ 396,
1157
+ 257
1158
  ],
1159
  "totals": [
1160
+ 1801,
1161
+ 1735,
1162
+ 1669,
1163
+ 1603
1164
  ],
1165
  "precisions": [
1166
+ 0.6202109938922821,
1167
+ 0.36023054755043227,
1168
+ 0.2372678250449371,
1169
+ 0.1603243917654398
1170
  ],
1171
  "bp": 1.0,
1172
+ "sys_len": 1801,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.3036264572339376,
1175
+ "score": 0.3036264572339376,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.2714407091986394,
1178
+ "score_ci_high": 0.36938210174908737,
1179
+ "sacrebleu_ci_low": 0.2714407091986394,
1180
+ "sacrebleu_ci_high": 0.36938210174908737
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1283,
1186
+ 897,
1187
+ 669,
1188
+ 501
1189
  ],
1190
  "totals": [
1191
+ 2053,
1192
+ 1987,
1193
+ 1921,
1194
+ 1855
1195
  ],
1196
  "precisions": [
1197
+ 0.6249391134924501,
1198
+ 0.451434323100151,
1199
+ 0.34825611660593436,
1200
+ 0.27008086253369273
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 2053,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.40360469717634034,
1206
+ "score": 0.40360469717634034,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.22825303083161255,
1209
+ "score_ci_high": 0.4880479254926776,
1210
+ "sacrebleu_ci_low": 0.22825303083161255,
1211
+ "sacrebleu_ci_high": 0.4880479254926776
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
  1324,
1217
+ 929,
1218
+ 672,
1219
+ 489
1220
  ],
1221
  "totals": [
1222
+ 2052,
1223
+ 1986,
1224
+ 1920,
1225
+ 1854
1226
  ],
1227
  "precisions": [
1228
+ 0.645224171539961,
1229
+ 0.4677744209466264,
1230
+ 0.35,
1231
+ 0.2637540453074434
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 2052,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.4085578581131045,
1237
+ "score": 0.4085578581131045,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.30163750041734844,
1240
+ "score_ci_high": 0.4725760173281261,
1241
+ "sacrebleu_ci_low": 0.30163750041734844,
1242
+ "sacrebleu_ci_high": 0.4725760173281261
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1196,
1248
+ 676,
1249
+ 416,
1250
+ 253
1251
  ],
1252
  "totals": [
1253
+ 1927,
1254
+ 1861,
1255
+ 1795,
1256
+ 1729
1257
  ],
1258
  "precisions": [
1259
+ 0.6206538661131292,
1260
+ 0.3632455668995164,
1261
+ 0.23175487465181058,
1262
+ 0.14632735685367262
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 1927,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.2956998119625713,
1268
+ "score": 0.2956998119625713,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.25139539620137036,
1271
+ "score_ci_high": 0.32686978566283265,
1272
+ "sacrebleu_ci_low": 0.25139539620137036,
1273
+ "sacrebleu_ci_high": 0.32686978566283265
1274
  },
1275
+ "score": 0.38960601381372006,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.5094379735715554,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/{2025-06-21T08-38-27_evaluation_results.json → 2025-06-23T08-43-46_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-21T12:38:23.605149Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -8,7 +8,7 @@
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=watsonx/meta-llama/llama-3-2-11b-vision-instruct,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -26,7 +26,7 @@
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
- "model": "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
30
  "model_args": {
31
  "max_tokens": 256
32
  },
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,23 +176,23 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.7555555555555555,
180
- "accuracy_ci_low": 0.6666666666666666,
181
- "accuracy_ci_high": 0.8333333333333334,
182
  "score_name": "accuracy",
183
- "score": 0.7555555555555555,
184
- "score_ci_high": 0.8333333333333334,
185
- "score_ci_low": 0.6666666666666666,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.8,
190
- "accuracy_ci_low": 0.7111111111111111,
191
- "accuracy_ci_high": 0.8666666666666667,
192
  "score_name": "accuracy",
193
- "score": 0.8,
194
- "score_ci_high": 0.8666666666666667,
195
- "score_ci_low": 0.7111111111111111,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
@@ -206,166 +206,186 @@
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.8,
210
- "accuracy_ci_low": 0.7222222222222222,
211
- "accuracy_ci_high": 0.8777777777777778,
212
  "score_name": "accuracy",
213
- "score": 0.8,
214
- "score_ci_high": 0.8777777777777778,
215
- "score_ci_low": 0.7222222222222222,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.8333333333333334,
220
- "accuracy_ci_low": 0.7444444444444445,
221
- "accuracy_ci_high": 0.9,
222
  "score_name": "accuracy",
223
- "score": 0.8333333333333334,
224
- "score_ci_high": 0.9,
225
- "score_ci_low": 0.7444444444444445,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9,
230
- "accuracy_ci_low": 0.8277783447952625,
231
- "accuracy_ci_high": 0.9448695638574703,
232
  "score_name": "accuracy",
233
- "score": 0.9,
234
- "score_ci_high": 0.9448695638574703,
235
- "score_ci_low": 0.8277783447952625,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.9777777777777777,
240
- "accuracy_ci_low": 0.9222222222222223,
241
- "accuracy_ci_high": 1.0,
242
  "score_name": "accuracy",
243
- "score": 0.9777777777777777,
244
- "score_ci_high": 1.0,
245
- "score_ci_low": 0.9222222222222223,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 0.8555555555555555,
250
- "accuracy_ci_low": 0.7777777777777778,
251
- "accuracy_ci_high": 0.9222222222222223,
252
  "score_name": "accuracy",
253
- "score": 0.8555555555555555,
254
- "score_ci_high": 0.9222222222222223,
255
- "score_ci_low": 0.7777777777777778,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.9,
260
- "accuracy_ci_low": 0.822517354780987,
261
- "accuracy_ci_high": 0.9444444444444444,
262
  "score_name": "accuracy",
263
- "score": 0.9,
264
- "score_ci_high": 0.9444444444444444,
265
- "score_ci_low": 0.822517354780987,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.8,
270
- "accuracy_ci_low": 0.7111111111111111,
271
- "accuracy_ci_high": 0.8777777777777778,
272
  "score_name": "accuracy",
273
- "score": 0.8,
274
- "score_ci_high": 0.8777777777777778,
275
- "score_ci_low": 0.7111111111111111,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8888888888888888,
280
- "accuracy_ci_low": 0.8111111111111111,
281
- "accuracy_ci_high": 0.9444444444444444,
282
  "score_name": "accuracy",
283
- "score": 0.8888888888888888,
284
- "score_ci_high": 0.9444444444444444,
285
- "score_ci_low": 0.8111111111111111,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.8555555555555555,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.5,
307
- "f1_Organization": 0.3540372670807453,
308
- "f1_Location": 0.35390946502057613,
309
- "f1_macro": 0.40264891070044045,
310
- "recall_macro": 0.3613703019405527,
311
- "precision_macro": 0.46075111602245217,
312
- "in_classes_support": 0.7734806629834254,
313
- "f1_micro": 0.3651685393258427,
314
- "recall_micro": 0.37142857142857144,
315
- "precision_micro": 0.35911602209944754,
316
- "score": 0.3651685393258427,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.3095577857641209,
319
- "score_ci_high": 0.42280766665241554,
320
- "f1_micro_ci_low": 0.3095577857641209,
321
- "f1_micro_ci_high": 0.42280766665241554
322
  },
323
- "score": 0.3651685393258427,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.5492957746478874,
330
- "accuracy_ci_low": 0.4445856612942033,
331
- "accuracy_ci_high": 0.6619718309859155,
332
  "score_name": "accuracy",
333
- "score": 0.5492957746478874,
334
- "score_ci_high": 0.6619718309859155,
335
- "score_ci_low": 0.4445856612942033,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.2676056338028169,
340
- "accuracy_ci_low": 0.16901408450704225,
341
- "accuracy_ci_high": 0.38028169014084506,
342
  "score_name": "accuracy",
343
- "score": 0.2676056338028169,
344
- "score_ci_high": 0.38028169014084506,
345
- "score_ci_low": 0.16901408450704225,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.28169014084507044,
350
- "accuracy_ci_low": 0.18309859154929578,
351
- "accuracy_ci_high": 0.39436619718309857,
352
  "score_name": "accuracy",
353
- "score": 0.28169014084507044,
354
- "score_ci_high": 0.39436619718309857,
355
- "score_ci_low": 0.18309859154929578,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.39436619718309857,
360
- "accuracy_ci_low": 0.28169014084507044,
361
- "accuracy_ci_high": 0.5070422535211268,
362
  "score_name": "accuracy",
363
- "score": 0.39436619718309857,
364
- "score_ci_high": 0.5070422535211268,
365
- "score_ci_low": 0.28169014084507044,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
  "accuracy": 0.4788732394366197,
370
  "accuracy_ci_low": 0.36619718309859156,
371
  "accuracy_ci_high": 0.5915492957746479,
@@ -375,17 +395,17 @@
375
  "score_ci_low": 0.36619718309859156,
376
  "num_of_instances": 71
377
  },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.23943661971830985,
380
- "accuracy_ci_low": 0.14084507042253522,
381
- "accuracy_ci_high": 0.3380281690140845,
382
  "score_name": "accuracy",
383
- "score": 0.23943661971830985,
384
- "score_ci_high": 0.3380281690140845,
385
- "score_ci_low": 0.14084507042253522,
386
  "num_of_instances": 71
387
  },
388
- "mmlu_pro_health": {
389
  "accuracy": 0.43661971830985913,
390
  "accuracy_ci_low": 0.323943661971831,
391
  "accuracy_ci_high": 0.5492957746478874,
@@ -395,414 +415,394 @@
395
  "score_ci_low": 0.323943661971831,
396
  "num_of_instances": 71
397
  },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.5070422535211268,
400
- "accuracy_ci_low": 0.39436619718309857,
401
- "accuracy_ci_high": 0.6197183098591549,
402
- "score_name": "accuracy",
403
- "score": 0.5070422535211268,
404
- "score_ci_high": 0.6197183098591549,
405
- "score_ci_low": 0.39436619718309857,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.28169014084507044,
410
- "accuracy_ci_low": 0.18309859154929578,
411
- "accuracy_ci_high": 0.39436619718309857,
412
- "score_name": "accuracy",
413
- "score": 0.28169014084507044,
414
- "score_ci_high": 0.39436619718309857,
415
- "score_ci_low": 0.18309859154929578,
416
- "num_of_instances": 71
417
- },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.14084507042253522,
420
- "accuracy_ci_low": 0.07042253521126761,
421
- "accuracy_ci_high": 0.25119134125976145,
422
  "score_name": "accuracy",
423
- "score": 0.14084507042253522,
424
- "score_ci_high": 0.25119134125976145,
425
- "score_ci_low": 0.07042253521126761,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.2112676056338028,
430
- "accuracy_ci_low": 0.1267605633802817,
431
- "accuracy_ci_high": 0.323943661971831,
432
  "score_name": "accuracy",
433
- "score": 0.2112676056338028,
434
- "score_ci_high": 0.323943661971831,
435
- "score_ci_low": 0.1267605633802817,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.43661971830985913,
440
- "accuracy_ci_low": 0.323943661971831,
441
- "accuracy_ci_high": 0.5633802816901409,
442
  "score_name": "accuracy",
443
- "score": 0.43661971830985913,
444
- "score_ci_high": 0.5633802816901409,
445
- "score_ci_low": 0.323943661971831,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.28169014084507044,
450
- "accuracy_ci_low": 0.18309859154929578,
451
- "accuracy_ci_high": 0.39436619718309857,
452
  "score_name": "accuracy",
453
- "score": 0.28169014084507044,
454
- "score_ci_high": 0.39436619718309857,
455
- "score_ci_low": 0.18309859154929578,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.5633802816901409,
460
- "accuracy_ci_low": 0.43661971830985913,
461
- "accuracy_ci_high": 0.676056338028169,
462
  "score_name": "accuracy",
463
- "score": 0.5633802816901409,
464
- "score_ci_high": 0.676056338028169,
465
- "score_ci_low": 0.43661971830985913,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.36217303822937624,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.5714968851810958,
475
- "f1_suggestive": 0.38461538461538464,
476
- "f1_fanciful": 0.4666666666666667,
477
- "f1_generic": 0.8148148148148148,
478
- "f1_arbitrary": 0.45454545454545453,
479
- "f1_descriptive": 0.7368421052631579,
480
- "f1_macro_ci_low": 0.477080408933172,
481
- "f1_macro_ci_high": 0.6804913151553064,
482
  "score_name": "f1_micro",
483
- "score": 0.5696969696969697,
484
- "score_ci_high": 0.6707317073170732,
485
- "score_ci_low": 0.4662576687116564,
486
  "num_of_instances": 85,
487
- "accuracy": 0.5529411764705883,
488
- "accuracy_ci_low": 0.4470588235294118,
489
- "accuracy_ci_high": 0.6588235294117647,
490
- "f1_micro": 0.5696969696969697,
491
- "f1_micro_ci_low": 0.4662576687116564,
492
- "f1_micro_ci_high": 0.6707317073170732
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.6068255091609656,
496
- "f1_no": 0.691358024691358,
497
- "f1_yes": 0.5222929936305732,
498
- "f1_macro_ci_low": 0.5410272293620183,
499
- "f1_macro_ci_high": 0.6785168220322721,
500
  "score_name": "f1_micro",
501
- "score": 0.625,
502
- "score_ci_high": 0.695,
503
- "score_ci_low": 0.56,
504
  "num_of_instances": 200,
505
- "accuracy": 0.625,
506
- "accuracy_ci_low": 0.56,
507
- "accuracy_ci_high": 0.695,
508
- "f1_micro": 0.625,
509
- "f1_micro_ci_low": 0.56,
510
- "f1_micro_ci_high": 0.695
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.341120203240043,
514
- "f1_conclusion": 0.09523809523809523,
515
- "f1_analysis": 0.5046728971962616,
516
- "f1_decree": 0.29411764705882354,
517
- "f1_issue": 0.21818181818181817,
518
- "f1_procedural history": 0.34782608695652173,
519
- "f1_facts": 0.4878048780487805,
520
- "f1_rule": 0.44,
521
- "f1_macro_ci_low": 0.28339415146501146,
522
- "f1_macro_ci_high": 0.41104677447147997,
523
  "score_name": "f1_micro",
524
- "score": 0.36683417085427134,
525
- "score_ci_high": 0.43609022556390975,
526
- "score_ci_low": 0.3057644110275689,
527
  "num_of_instances": 200,
528
- "accuracy": 0.365,
529
- "accuracy_ci_low": 0.305,
530
- "accuracy_ci_high": 0.435,
531
- "f1_micro": 0.36683417085427134,
532
- "f1_micro_ci_low": 0.3057644110275689,
533
- "f1_micro_ci_high": 0.43609022556390975
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.5542857142857143,
537
- "f1_yes": 0.42857142857142855,
538
- "f1_no": 0.68,
539
- "f1_macro_ci_low": 0.48355051877957367,
540
- "f1_macro_ci_high": 0.6294200433350639,
541
  "score_name": "f1_micro",
542
- "score": 0.5897435897435898,
543
- "score_ci_high": 0.662674821169658,
544
- "score_ci_low": 0.5188702892801219,
545
  "num_of_instances": 200,
546
- "accuracy": 0.575,
547
- "accuracy_ci_low": 0.505,
548
- "accuracy_ci_high": 0.65,
549
- "f1_micro": 0.5897435897435898,
550
- "f1_micro_ci_low": 0.5188702892801219,
551
- "f1_micro_ci_high": 0.662674821169658
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.9155052264808363,
555
- "f1_yes": 0.9024390243902439,
556
- "f1_no": 0.9285714285714286,
557
- "f1_macro_ci_low": 0.8363111281337707,
558
- "f1_macro_ci_high": 0.9587803500846979,
559
  "score_name": "f1_micro",
560
- "score": 0.9156626506024096,
561
- "score_ci_high": 0.9585798816568047,
562
- "score_ci_low": 0.8372630005674523,
563
  "num_of_instances": 85,
564
- "accuracy": 0.8941176470588236,
565
- "accuracy_ci_low": 0.8,
566
- "accuracy_ci_high": 0.9466354743635627,
567
- "f1_micro": 0.9156626506024096,
568
- "f1_micro_ci_low": 0.8372630005674523,
569
- "f1_micro_ci_high": 0.9585798816568047
570
  },
571
- "score": 0.6133874761794481,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.5088753805047725,
578
- "f1_cars": 0.7380952380952381,
579
- "f1_windows x": 0.08333333333333333,
580
- "f1_computer graphics": 0.5434782608695652,
581
- "f1_atheism": 0.30434782608695654,
582
- "f1_religion": 0.1388888888888889,
583
- "f1_medicine": 0.7733333333333333,
584
- "f1_christianity": 0.6597938144329897,
585
- "f1_microsoft windows": 0.39436619718309857,
586
- "f1_middle east": 0.22641509433962265,
587
- "f1_politics": 0.3418803418803419,
588
- "f1_motorcycles": 0.673469387755102,
589
- "f1_pc hardware": 0.4968944099378882,
590
- "f1_mac hardware": 0.5617977528089888,
591
- "f1_electronics": 0.4580152671755725,
592
- "f1_for sale": 0.4406779661016949,
593
- "f1_guns": 0.28125,
594
- "f1_space": 0.7111111111111111,
595
- "f1_cryptography": 0.6197183098591549,
596
- "f1_baseball": 0.8617886178861789,
597
- "f1_hockey": 0.8688524590163934,
598
- "f1_macro_ci_low": 0.4834761861756795,
599
- "f1_macro_ci_high": 0.5399453530006973,
600
  "score_name": "f1_micro",
601
- "score": 0.5394515948517068,
602
- "score_ci_high": 0.571442163952847,
603
- "score_ci_low": 0.5099380125013132,
604
  "num_of_instances": 1000,
605
- "accuracy": 0.482,
606
- "accuracy_ci_low": 0.45265982508763236,
607
- "accuracy_ci_high": 0.514,
608
- "f1_micro": 0.5394515948517068,
609
- "f1_micro_ci_low": 0.5099380125013132,
610
- "f1_micro_ci_high": 0.571442163952847
611
  },
612
- "score": 0.5394515948517068,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.6253924964112395,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9158878504672897,
620
- "f1_credit card or prepaid card": 0.625,
621
- "f1_debt collection": 0.5882352941176471,
622
- "f1_checking or savings account": 0.75,
623
- "f1_money transfer or virtual currency or money service": 0.5517241379310345,
624
- "f1_mortgage": 0.7407407407407407,
625
- "f1_vehicle loan or lease": 0.5333333333333333,
626
- "f1_payday loan or title loan or personal loan": 0.1111111111111111,
627
- "f1_student loan": 0.8125,
628
- "f1_macro_ci_low": 0.5783727985774465,
629
- "f1_macro_ci_high": 0.684670784163184,
630
  "score_name": "f1_micro",
631
- "score": 0.8322284548699643,
632
- "score_ci_high": 0.8548513405559257,
633
- "score_ci_low": 0.809253238376061,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.816,
636
- "accuracy_ci_low": 0.794,
637
- "accuracy_ci_high": 0.84,
638
- "f1_micro": 0.8322284548699643,
639
- "f1_micro_ci_low": 0.809253238376061,
640
- "f1_micro_ci_high": 0.8548513405559257
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.6741537589904952,
644
- "f1_mortgages and loans": 0.7560975609756098,
645
- "f1_credit card": 0.7589743589743589,
646
- "f1_debt collection": 0.6829268292682927,
647
- "f1_credit reporting": 0.7687296416938111,
648
- "f1_retail banking": 0.40404040404040403,
649
- "f1_macro_ci_low": 0.6305914704936404,
650
- "f1_macro_ci_high": 0.7190040244929556,
651
  "score_name": "f1_micro",
652
- "score": 0.709278350515464,
653
- "score_ci_high": 0.7473953353743584,
654
- "score_ci_low": 0.6666666666666666,
655
  "num_of_instances": 500,
656
- "accuracy": 0.688,
657
- "accuracy_ci_low": 0.644,
658
- "accuracy_ci_high": 0.726,
659
- "f1_micro": 0.709278350515464,
660
- "f1_micro_ci_low": 0.6666666666666666,
661
- "f1_micro_ci_high": 0.7473953353743584
662
  },
663
- "score": 0.7707534026927141,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "execution_accuracy": 0.045,
671
- "program_accuracy": 0.055,
672
- "score": 0.055,
673
  "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.033,
675
- "execution_accuracy_ci_high": 0.058,
676
- "program_accuracy_ci_low": 0.042,
677
- "program_accuracy_ci_high": 0.0702228905446176,
678
- "score_ci_low": 0.042,
679
- "score_ci_high": 0.0702228905446176
680
  },
681
- "score": 0.055,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.3126916927096655,
688
- "recall": 0.5297013169878433,
689
- "f1": 0.3210773896996459,
690
- "precision_ci_low": 0.2920307369068035,
691
- "precision_ci_high": 0.33339870629057006,
692
- "recall_ci_low": 0.512017589843317,
693
- "recall_ci_high": 0.5445397559503604,
694
- "f1_ci_low": 0.3051687944542956,
695
- "f1_ci_high": 0.33801833203554277,
696
  "score_name": "f1",
697
- "score": 0.3210773896996459,
698
- "score_ci_high": 0.33801833203554277,
699
- "score_ci_low": 0.3051687944542956,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5835817578931649,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6631206824382146,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5450189588715633,
704
- "faithfullness_f1_token_overlap": 0.26865346120635386,
705
- "faithfullness_recall_token_overlap": 0.20704283031640539,
706
- "faithfullness_precision_token_overlap": 0.5397055221971085,
707
- "correctness_f1_token_overlap": 0.3210773896996459,
708
- "correctness_recall_token_overlap": 0.5297013169878433,
709
- "correctness_precision_token_overlap": 0.3126916927096655
710
  },
711
- "score": 0.3210773896996459,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.475,
718
- "accuracy_ci_low": 0.4436611532620239,
719
- "accuracy_ci_high": 0.504,
720
  "score_name": "accuracy",
721
- "score": 0.475,
722
- "score_ci_high": 0.504,
723
- "score_ci_low": 0.4436611532620239,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.798,
728
- "accuracy_ci_low": 0.7633491592715252,
729
- "accuracy_ci_high": 0.832,
730
  "score_name": "accuracy",
731
- "score": 0.798,
732
- "score_ci_high": 0.832,
733
- "score_ci_low": 0.7633491592715252,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.6365000000000001,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
- "severity_critical": 0.0,
743
  "severity_high": 1.0,
744
- "severity_medium": 0.0,
745
- "severity_low": 99.0,
746
- "category_harmful_info": 0.8553694002921619,
747
- "category_deception": 0.835255232046951,
748
- "category_explicit_content": 0.8441415034761331,
749
- "category_violence": 0.8099433737673931,
750
- "category_discrimination": 0.770528504891055,
751
- "category_substance_abuse": 0.8680479208628336,
752
- "category_pii": 0.8295589322569192,
753
- "safety": 0.8299434131611555,
754
- "safety_ci_low": 0.8055372622125345,
755
- "safety_ci_high": 0.8483225977870763,
756
  "score_name": "safety",
757
- "score": 0.8299434131611555,
758
- "score_ci_high": 0.8483225977870763,
759
- "score_ci_low": 0.8055372622125345,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8299434131611555,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rougeLsum": 0.37058702847435154,
770
- "rouge2": 0.22434285890237585,
771
- "rougeL": 0.30351270566286986,
772
- "score": 0.30351270566286986,
773
  "score_name": "rougeL",
774
- "rouge1": 0.4255147200867186,
775
- "rougeLsum_ci_low": 0.36115259966566504,
776
- "rougeLsum_ci_high": 0.379873984149421,
777
- "rouge2_ci_low": 0.21612712173196832,
778
- "rouge2_ci_high": 0.23293503766644144,
779
- "rougeL_ci_low": 0.2950395615436246,
780
- "rougeL_ci_high": 0.3118499963672728,
781
- "score_ci_low": 0.2950395615436246,
782
- "score_ci_high": 0.3118499963672728,
783
- "rouge1_ci_low": 0.4147565222299168,
784
- "rouge1_ci_high": 0.43557795557970264
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rougeLsum": 0.10643447818455329,
789
- "rouge2": 0.018171690879646322,
790
- "rougeL": 0.09269669467501682,
791
- "score": 0.09269669467501682,
792
  "score_name": "rougeL",
793
- "rouge1": 0.12731717955128613,
794
- "rougeLsum_ci_low": 0.10150863047194304,
795
- "rougeLsum_ci_high": 0.11036755862721943,
796
- "rouge2_ci_low": 0.016122174542425746,
797
- "rouge2_ci_high": 0.02016619357964368,
798
- "rougeL_ci_low": 0.08847971390285901,
799
- "rougeL_ci_high": 0.09604727671370761,
800
- "score_ci_low": 0.08847971390285901,
801
- "score_ci_high": 0.09604727671370761,
802
- "rouge1_ci_low": 0.1208888004165388,
803
- "rouge1_ci_high": 0.13193160708995647
804
  },
805
- "score": 0.19810470016894333,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,473 +810,473 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 1212,
814
- 730,
815
- 491,
816
- 347
817
  ],
818
  "totals": [
819
- 1843,
820
- 1777,
821
- 1711,
822
- 1645
823
  ],
824
  "precisions": [
825
- 0.6576234400434074,
826
- 0.41080472706809235,
827
- 0.2869666861484512,
828
- 0.2109422492401216
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1843,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.3576036500711557,
834
- "score": 0.3576036500711557,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.30044978362076574,
837
- "score_ci_high": 0.4017272348853817,
838
- "sacrebleu_ci_low": 0.30044978362076574,
839
- "sacrebleu_ci_high": 0.4017272348853817
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1262,
845
- 782,
846
- 520,
847
- 359
848
  ],
849
  "totals": [
850
- 1809,
851
- 1743,
852
- 1677,
853
- 1611
854
  ],
855
  "precisions": [
856
- 0.6976229961304589,
857
- 0.44865174985656914,
858
- 0.31007751937984496,
859
- 0.2228429546865301
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1809,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.3834862844151344,
865
- "score": 0.3834862844151344,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.3416515114541037,
868
- "score_ci_high": 0.42052556141750186,
869
- "sacrebleu_ci_low": 0.3416515114541037,
870
- "sacrebleu_ci_high": 0.42052556141750186
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
- 803,
876
- 382,
877
- 189,
878
- 88
879
  ],
880
  "totals": [
881
- 1643,
882
- 1577,
883
- 1511,
884
- 1445
885
  ],
886
  "precisions": [
887
- 0.48874010955569086,
888
- 0.2422320862396956,
889
- 0.12508272667107875,
890
- 0.06089965397923876
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 1643,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.17329277411747399,
896
- "score": 0.17329277411747399,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.1458100600445785,
899
- "score_ci_high": 0.20408033058998093,
900
- "sacrebleu_ci_low": 0.1458100600445785,
901
- "sacrebleu_ci_high": 0.20408033058998093
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 1126,
907
- 615,
908
- 367,
909
- 220
910
  ],
911
  "totals": [
912
- 1842,
913
- 1776,
914
- 1710,
915
- 1644
916
  ],
917
  "precisions": [
918
- 0.6112920738327905,
919
- 0.34628378378378377,
920
- 0.21461988304093566,
921
- 0.13381995133819952
922
  ],
923
  "bp": 1.0,
924
- "sys_len": 1842,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.27923376694078983,
927
- "score": 0.27923376694078983,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.24552911017678328,
930
- "score_ci_high": 0.3170919141844367,
931
- "sacrebleu_ci_low": 0.24552911017678328,
932
- "sacrebleu_ci_high": 0.3170919141844367
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1462,
938
- 1038,
939
- 798,
940
- 621
941
  ],
942
  "totals": [
943
- 2034,
944
- 1968,
945
- 1902,
946
- 1836
947
  ],
948
  "precisions": [
949
- 0.7187807276302851,
950
- 0.5274390243902439,
951
- 0.4195583596214511,
952
- 0.338235294117647
953
  ],
954
- "bp": 0.9834231034146155,
955
- "sys_len": 2034,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.47362585607622854,
958
- "score": 0.47362585607622854,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.4382806139359095,
961
- "score_ci_high": 0.5139442284621641,
962
- "sacrebleu_ci_low": 0.4382806139359095,
963
- "sacrebleu_ci_high": 0.5139442284621641
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 1295,
969
- 636,
970
- 344,
971
- 187
972
  ],
973
  "totals": [
974
- 2472,
975
- 2406,
976
- 2340,
977
- 2274
978
  ],
979
  "precisions": [
980
- 0.5238673139158576,
981
- 0.2643391521197007,
982
- 0.147008547008547,
983
- 0.0822339489885664
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2472,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.20227589502234433,
989
- "score": 0.20227589502234433,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.18147140512741464,
992
- "score_ci_high": 0.2242208833347133,
993
- "sacrebleu_ci_low": 0.18147140512741464,
994
- "sacrebleu_ci_high": 0.2242208833347133
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1388,
1000
- 960,
1001
- 703,
1002
- 527
1003
  ],
1004
  "totals": [
1005
- 1933,
1006
- 1867,
1007
- 1801,
1008
- 1735
1009
  ],
1010
  "precisions": [
1011
- 0.7180548370408691,
1012
- 0.5141938939475094,
1013
- 0.3903387007218212,
1014
- 0.3037463976945245
1015
  ],
1016
  "bp": 1.0,
1017
- "sys_len": 1933,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.4574138588750708,
1020
- "score": 0.4574138588750708,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.4084699042815335,
1023
- "score_ci_high": 0.497105169359069,
1024
- "sacrebleu_ci_low": 0.4084699042815335,
1025
- "sacrebleu_ci_high": 0.497105169359069
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 1308,
1031
- 869,
1032
- 605,
1033
- 418
1034
  ],
1035
  "totals": [
1036
- 1942,
1037
- 1876,
1038
- 1810,
1039
- 1744
1040
  ],
1041
  "precisions": [
1042
- 0.6735324407826982,
1043
- 0.4632196162046908,
1044
- 0.3342541436464089,
1045
- 0.23967889908256879
1046
  ],
1047
- "bp": 0.9964019571140578,
1048
- "sys_len": 1942,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.3961845974220761,
1051
- "score": 0.3961845974220761,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.35334192757696886,
1054
- "score_ci_high": 0.44028759492619396,
1055
- "sacrebleu_ci_low": 0.35334192757696886,
1056
- "sacrebleu_ci_high": 0.44028759492619396
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
- 1251,
1062
- 677,
1063
- 390,
1064
- 223
1065
  ],
1066
  "totals": [
1067
- 2028,
1068
- 1962,
1069
- 1896,
1070
- 1830
1071
  ],
1072
  "precisions": [
1073
- 0.6168639053254438,
1074
- 0.34505606523955146,
1075
- 0.20569620253164558,
1076
- 0.12185792349726776
1077
  ],
1078
- "bp": 0.96607214307495,
1079
- "sys_len": 2028,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.2610954642389454,
1082
- "score": 0.2610954642389454,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.22712931414581697,
1085
- "score_ci_high": 0.288932465685729,
1086
- "sacrebleu_ci_low": 0.22712931414581697,
1087
- "sacrebleu_ci_high": 0.288932465685729
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1304,
1093
- 839,
1094
- 567,
1095
- 393
1096
  ],
1097
  "totals": [
1098
- 1854,
1099
- 1788,
1100
- 1722,
1101
- 1656
1102
  ],
1103
  "precisions": [
1104
- 0.703344120819849,
1105
- 0.4692393736017897,
1106
- 0.32926829268292684,
1107
- 0.23731884057971014
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1854,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.4007385379854121,
1113
- "score": 0.4007385379854121,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.36457462981738414,
1116
- "score_ci_high": 0.4478003676387348,
1117
- "sacrebleu_ci_low": 0.36457462981738414,
1118
- "sacrebleu_ci_high": 0.4478003676387348
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 1070,
1124
- 538,
1125
- 304,
1126
- 175
1127
  ],
1128
  "totals": [
1129
- 1815,
1130
- 1749,
1131
- 1683,
1132
- 1617
1133
  ],
1134
  "precisions": [
1135
- 0.5895316804407714,
1136
- 0.3076043453401944,
1137
- 0.1806298276886512,
1138
- 0.10822510822510824
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1815,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.24400811937037023,
1144
- "score": 0.24400811937037023,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.2127585740477001,
1147
- "score_ci_high": 0.2823993625998216,
1148
- "sacrebleu_ci_low": 0.2127585740477001,
1149
- "sacrebleu_ci_high": 0.2823993625998216
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 1045,
1155
- 498,
1156
- 280,
1157
- 169
1158
  ],
1159
  "totals": [
1160
- 1787,
1161
- 1721,
1162
- 1655,
1163
- 1589
1164
  ],
1165
  "precisions": [
1166
- 0.5847789591494125,
1167
- 0.2893666472980825,
1168
- 0.1691842900302115,
1169
- 0.10635619886721209
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 1787,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.23490418965892076,
1175
- "score": 0.23490418965892076,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.1985487418164707,
1178
- "score_ci_high": 0.2846511928700541,
1179
- "sacrebleu_ci_low": 0.1985487418164707,
1180
- "sacrebleu_ci_high": 0.2846511928700541
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1313,
1186
- 888,
1187
- 630,
1188
- 458
1189
  ],
1190
  "totals": [
1191
- 1824,
1192
- 1758,
1193
- 1692,
1194
- 1626
1195
  ],
1196
  "precisions": [
1197
- 0.7198464912280701,
1198
- 0.5051194539249146,
1199
- 0.3723404255319149,
1200
- 0.2816728167281673
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1824,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.4419058783059329,
1206
- "score": 0.4419058783059329,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.39279370658531876,
1209
- "score_ci_high": 0.49550322461749036,
1210
- "sacrebleu_ci_low": 0.39279370658531876,
1211
- "sacrebleu_ci_high": 0.49550322461749036
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
- 1260,
1217
- 841,
1218
- 596,
1219
- 426
1220
  ],
1221
  "totals": [
1222
- 1797,
1223
- 1731,
1224
- 1665,
1225
- 1599
1226
  ],
1227
  "precisions": [
1228
- 0.7011686143572621,
1229
- 0.4858463316002311,
1230
- 0.357957957957958,
1231
- 0.26641651031894936
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1797,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.4245497977223498,
1237
- "score": 0.4245497977223498,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.377958140884616,
1240
- "score_ci_high": 0.4821428710051954,
1241
- "sacrebleu_ci_low": 0.377958140884616,
1242
- "sacrebleu_ci_high": 0.4821428710051954
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1163,
1248
- 621,
1249
- 368,
1250
- 225
1251
  ],
1252
  "totals": [
1253
- 1900,
1254
- 1834,
1255
- 1768,
1256
- 1702
1257
  ],
1258
  "precisions": [
1259
- 0.6121052631578947,
1260
- 0.3386041439476554,
1261
- 0.2081447963800905,
1262
- 0.13219741480611044
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1900,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.27480632590419196,
1268
- "score": 0.27480632590419196,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.24866198798867725,
1271
- "score_ci_high": 0.33121893225397175,
1272
- "sacrebleu_ci_low": 0.24866198798867725,
1273
- "sacrebleu_ci_high": 0.33121893225397175
1274
  },
1275
- "score": 0.3336749997417598,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.49083000843124214,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-23T12:43:42.752885Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/mistralai/mistral-small-3-1-24b-instruct-2503,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
+ "model": "watsonx/mistralai/mistral-small-3-1-24b-instruct-2503",
30
  "model_args": {
31
  "max_tokens": 256
32
  },
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.6666666666666666,
180
+ "accuracy_ci_low": 0.5666666666666667,
181
+ "accuracy_ci_high": 0.7555555555555555,
182
  "score_name": "accuracy",
183
+ "score": 0.6666666666666666,
184
+ "score_ci_high": 0.7555555555555555,
185
+ "score_ci_low": 0.5666666666666667,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.7666666666666667,
190
+ "accuracy_ci_low": 0.6777777777777778,
191
+ "accuracy_ci_high": 0.8444444444444444,
192
  "score_name": "accuracy",
193
+ "score": 0.7666666666666667,
194
+ "score_ci_high": 0.8444444444444444,
195
+ "score_ci_low": 0.6777777777777778,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
 
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.7555555555555555,
210
+ "accuracy_ci_low": 0.6555555555555556,
211
+ "accuracy_ci_high": 0.8333333333333334,
212
  "score_name": "accuracy",
213
+ "score": 0.7555555555555555,
214
+ "score_ci_high": 0.8333333333333334,
215
+ "score_ci_low": 0.6555555555555556,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.8111111111111111,
220
+ "accuracy_ci_low": 0.7111111111111111,
221
+ "accuracy_ci_high": 0.8777777777777778,
222
  "score_name": "accuracy",
223
+ "score": 0.8111111111111111,
224
+ "score_ci_high": 0.8777777777777778,
225
+ "score_ci_low": 0.7111111111111111,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.8111111111111111,
230
+ "accuracy_ci_low": 0.7111111111111111,
231
+ "accuracy_ci_high": 0.8777777777777778,
232
  "score_name": "accuracy",
233
+ "score": 0.8111111111111111,
234
+ "score_ci_high": 0.8777777777777778,
235
+ "score_ci_low": 0.7111111111111111,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.8222222222222222,
240
+ "accuracy_ci_low": 0.7333333333333333,
241
+ "accuracy_ci_high": 0.8888888888888888,
242
  "score_name": "accuracy",
243
+ "score": 0.8222222222222222,
244
+ "score_ci_high": 0.8888888888888888,
245
+ "score_ci_low": 0.7333333333333333,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.8444444444444444,
250
+ "accuracy_ci_low": 0.7555555555555555,
251
+ "accuracy_ci_high": 0.9,
252
  "score_name": "accuracy",
253
+ "score": 0.8444444444444444,
254
+ "score_ci_high": 0.9,
255
+ "score_ci_low": 0.7555555555555555,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.8222222222222222,
260
+ "accuracy_ci_low": 0.7333333333333333,
261
+ "accuracy_ci_high": 0.9,
262
  "score_name": "accuracy",
263
+ "score": 0.8222222222222222,
264
+ "score_ci_high": 0.9,
265
+ "score_ci_low": 0.7333333333333333,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.8111111111111111,
270
+ "accuracy_ci_low": 0.7222222222222222,
271
+ "accuracy_ci_high": 0.8888888888888888,
272
  "score_name": "accuracy",
273
+ "score": 0.8111111111111111,
274
+ "score_ci_high": 0.8888888888888888,
275
+ "score_ci_low": 0.7222222222222222,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8333333333333334,
280
+ "accuracy_ci_low": 0.7555555555555555,
281
+ "accuracy_ci_high": 0.9111111111111111,
282
  "score_name": "accuracy",
283
+ "score": 0.8333333333333334,
284
+ "score_ci_high": 0.9111111111111111,
285
+ "score_ci_low": 0.7555555555555555,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.804040404040404,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.10666666666666667,
296
+ "score": 0.10666666666666667,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.10666666666666667,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.34806629834254144,
307
+ "f1_Organization": 0.28125,
308
+ "f1_Location": 0.2272727272727273,
309
+ "f1_macro": 0.2855296752050896,
310
+ "recall_macro": 0.2576225314974886,
311
+ "precision_macro": 0.32330034002100655,
312
+ "in_classes_support": 0.4646799116997793,
313
+ "f1_micro": 0.1928721174004193,
314
+ "recall_micro": 0.26285714285714284,
315
+ "precision_micro": 0.152317880794702,
316
+ "score": 0.1928721174004193,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.16179580146707578,
319
+ "score_ci_high": 0.2232377264615503,
320
+ "f1_micro_ci_low": 0.16179580146707578,
321
+ "f1_micro_ci_high": 0.2232377264615503
322
  },
323
+ "score": 0.1928721174004193,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.647887323943662,
330
+ "accuracy_ci_low": 0.5352112676056338,
331
+ "accuracy_ci_high": 0.7605633802816901,
332
  "score_name": "accuracy",
333
+ "score": 0.647887323943662,
334
+ "score_ci_high": 0.7605633802816901,
335
+ "score_ci_low": 0.5352112676056338,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.30985915492957744,
340
+ "accuracy_ci_low": 0.2112676056338028,
341
+ "accuracy_ci_high": 0.428782341390215,
342
  "score_name": "accuracy",
343
+ "score": 0.30985915492957744,
344
+ "score_ci_high": 0.428782341390215,
345
+ "score_ci_low": 0.2112676056338028,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.22535211267605634,
350
+ "accuracy_ci_low": 0.14084507042253522,
351
+ "accuracy_ci_high": 0.3380281690140845,
352
  "score_name": "accuracy",
353
+ "score": 0.22535211267605634,
354
+ "score_ci_high": 0.3380281690140845,
355
+ "score_ci_low": 0.14084507042253522,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.5633802816901409,
360
+ "accuracy_ci_low": 0.4507042253521127,
361
+ "accuracy_ci_high": 0.676056338028169,
362
  "score_name": "accuracy",
363
+ "score": 0.5633802816901409,
364
+ "score_ci_high": 0.676056338028169,
365
+ "score_ci_low": 0.4507042253521127,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.6197183098591549,
370
+ "accuracy_ci_low": 0.49295774647887325,
371
+ "accuracy_ci_high": 0.7323943661971831,
372
+ "score_name": "accuracy",
373
+ "score": 0.6197183098591549,
374
+ "score_ci_high": 0.7323943661971831,
375
+ "score_ci_low": 0.49295774647887325,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.18309859154929578,
380
+ "accuracy_ci_low": 0.11267605633802817,
381
+ "accuracy_ci_high": 0.28169014084507044,
382
+ "score_name": "accuracy",
383
+ "score": 0.18309859154929578,
384
+ "score_ci_high": 0.28169014084507044,
385
+ "score_ci_low": 0.11267605633802817,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
  "accuracy": 0.4788732394366197,
390
  "accuracy_ci_low": 0.36619718309859156,
391
  "accuracy_ci_high": 0.5915492957746479,
 
395
  "score_ci_low": 0.36619718309859156,
396
  "num_of_instances": 71
397
  },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.6619718309859155,
400
+ "accuracy_ci_low": 0.5352112676056338,
401
+ "accuracy_ci_high": 0.7714646829428065,
402
  "score_name": "accuracy",
403
+ "score": 0.6619718309859155,
404
+ "score_ci_high": 0.7714646829428065,
405
+ "score_ci_low": 0.5352112676056338,
406
  "num_of_instances": 71
407
  },
408
+ "mmlu_pro_law": {
409
  "accuracy": 0.43661971830985913,
410
  "accuracy_ci_low": 0.323943661971831,
411
  "accuracy_ci_high": 0.5492957746478874,
 
415
  "score_ci_low": 0.323943661971831,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.2676056338028169,
420
+ "accuracy_ci_low": 0.16901408450704225,
421
+ "accuracy_ci_high": 0.38028169014084506,
422
  "score_name": "accuracy",
423
+ "score": 0.2676056338028169,
424
+ "score_ci_high": 0.38028169014084506,
425
+ "score_ci_low": 0.16901408450704225,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.4647887323943662,
430
+ "accuracy_ci_low": 0.352112676056338,
431
+ "accuracy_ci_high": 0.5774647887323944,
432
  "score_name": "accuracy",
433
+ "score": 0.4647887323943662,
434
+ "score_ci_high": 0.5774647887323944,
435
+ "score_ci_low": 0.352112676056338,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.49295774647887325,
440
+ "accuracy_ci_low": 0.38028169014084506,
441
+ "accuracy_ci_high": 0.6197183098591549,
442
  "score_name": "accuracy",
443
+ "score": 0.49295774647887325,
444
+ "score_ci_high": 0.6197183098591549,
445
+ "score_ci_low": 0.38028169014084506,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.4507042253521127,
450
+ "accuracy_ci_low": 0.3380281690140845,
451
+ "accuracy_ci_high": 0.5633802816901409,
452
  "score_name": "accuracy",
453
+ "score": 0.4507042253521127,
454
+ "score_ci_high": 0.5633802816901409,
455
+ "score_ci_low": 0.3380281690140845,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.6338028169014085,
460
+ "accuracy_ci_low": 0.5211267605633803,
461
+ "accuracy_ci_high": 0.7464788732394366,
462
  "score_name": "accuracy",
463
+ "score": 0.6338028169014085,
464
+ "score_ci_high": 0.7464788732394366,
465
+ "score_ci_low": 0.5211267605633803,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.4597585513078471,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.3274661835748792,
475
+ "f1_suggestive": 0.3125,
476
+ "f1_generic": 0.2222222222222222,
477
+ "f1_arbitrary": 0.32,
478
+ "f1_fanciful": 0.43478260869565216,
479
+ "f1_descriptive": 0.34782608695652173,
480
+ "f1_macro_ci_low": 0.2279048726954935,
481
+ "f1_macro_ci_high": 0.4521754269782871,
482
  "score_name": "f1_micro",
483
+ "score": 0.3305785123966942,
484
+ "score_ci_high": 0.4462631095061656,
485
+ "score_ci_low": 0.22608695652173913,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.23529411764705882,
488
+ "accuracy_ci_low": 0.15294117647058825,
489
+ "accuracy_ci_high": 0.3411764705882353,
490
+ "f1_micro": 0.3305785123966942,
491
+ "f1_micro_ci_low": 0.22608695652173913,
492
+ "f1_micro_ci_high": 0.4462631095061656
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.3665132336018412,
496
+ "f1_no": 0.45454545454545453,
497
+ "f1_yes": 0.27848101265822783,
498
+ "f1_macro_ci_low": 0.29058993290093893,
499
+ "f1_macro_ci_high": 0.44397782794437635,
500
  "score_name": "f1_micro",
501
+ "score": 0.4043321299638989,
502
+ "score_ci_high": 0.4848050604545447,
503
+ "score_ci_low": 0.33210332103321033,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.28,
506
+ "accuracy_ci_low": 0.22435516148422335,
507
+ "accuracy_ci_high": 0.35,
508
+ "f1_micro": 0.4043321299638989,
509
+ "f1_micro_ci_low": 0.33210332103321033,
510
+ "f1_micro_ci_high": 0.4848050604545447
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.2994201157541979,
514
+ "f1_conclusion": 0.16216216216216217,
515
+ "f1_decree": 0.22857142857142856,
516
+ "f1_analysis": 0.43478260869565216,
517
+ "f1_issue": 0.23809523809523808,
518
+ "f1_procedural history": 0.36363636363636365,
519
+ "f1_facts": 0.2857142857142857,
520
+ "f1_rule": 0.3829787234042553,
521
+ "f1_macro_ci_low": 0.23455524899226265,
522
+ "f1_macro_ci_high": 0.3757847883095156,
523
  "score_name": "f1_micro",
524
+ "score": 0.3111111111111111,
525
+ "score_ci_high": 0.38492614857203733,
526
+ "score_ci_low": 0.23835139550418585,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.245,
529
+ "accuracy_ci_low": 0.185,
530
+ "accuracy_ci_high": 0.31,
531
+ "f1_micro": 0.3111111111111111,
532
+ "f1_micro_ci_low": 0.23835139550418585,
533
+ "f1_micro_ci_high": 0.38492614857203733
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.20520718738540522,
537
+ "f1_yes": 0.21782178217821782,
538
+ "f1_no": 0.1925925925925926,
539
+ "f1_macro_ci_low": 0.1400419836844058,
540
+ "f1_macro_ci_high": 0.2885398882645068,
541
  "score_name": "f1_micro",
542
+ "score": 0.2033898305084746,
543
+ "score_ci_high": 0.2857142857142857,
544
+ "score_ci_low": 0.1391304347826087,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.12,
547
+ "accuracy_ci_low": 0.08,
548
+ "accuracy_ci_high": 0.175,
549
+ "f1_micro": 0.2033898305084746,
550
+ "f1_micro_ci_low": 0.1391304347826087,
551
+ "f1_micro_ci_high": 0.2857142857142857
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.8035124326204138,
555
+ "f1_yes": 0.8169014084507042,
556
+ "f1_no": 0.7901234567901234,
557
+ "f1_macro_ci_low": 0.7176915883069268,
558
+ "f1_macro_ci_high": 0.872867714407109,
559
  "score_name": "f1_micro",
560
+ "score": 0.8026315789473685,
561
+ "score_ci_high": 0.871520027126433,
562
+ "score_ci_low": 0.7086398695460123,
563
  "num_of_instances": 85,
564
+ "accuracy": 0.7176470588235294,
565
+ "accuracy_ci_low": 0.611764705882353,
566
+ "accuracy_ci_high": 0.8,
567
+ "f1_micro": 0.8026315789473685,
568
+ "f1_micro_ci_low": 0.7086398695460123,
569
+ "f1_micro_ci_high": 0.871520027126433
570
  },
571
+ "score": 0.41040863258550947,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.51306152129787,
578
+ "f1_cars": 0.6585365853658537,
579
+ "f1_windows x": 0.08571428571428572,
580
+ "f1_cryptography": 0.5641025641025641,
581
+ "f1_atheism": 0.09302325581395349,
582
+ "f1_religion": 0.15873015873015872,
583
+ "f1_medicine": 0.810126582278481,
584
+ "f1_christianity": 0.36619718309859156,
585
+ "f1_computer graphics": 0.43243243243243246,
586
+ "f1_microsoft windows": 0.5569620253164557,
587
+ "f1_middle east": 0.625,
588
+ "f1_motorcycles": 0.64,
589
+ "f1_mac hardware": 0.49411764705882355,
590
+ "f1_pc hardware": 0.5309734513274337,
591
+ "f1_electronics": 0.6292134831460674,
592
+ "f1_for sale": 0.5538461538461539,
593
+ "f1_guns": 0.22580645161290322,
594
+ "f1_space": 0.7872340425531915,
595
+ "f1_baseball": 0.8598130841121495,
596
+ "f1_hockey": 0.859504132231405,
597
+ "f1_politics": 0.32989690721649484,
598
+ "f1_macro_ci_low": 0.489675643451468,
599
+ "f1_macro_ci_high": 0.5444343504387604,
600
  "score_name": "f1_micro",
601
+ "score": 0.5470692717584369,
602
+ "score_ci_high": 0.5787418375694315,
603
+ "score_ci_low": 0.5152956292250616,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.462,
606
+ "accuracy_ci_low": 0.431,
607
+ "accuracy_ci_high": 0.492,
608
+ "f1_micro": 0.5470692717584369,
609
+ "f1_micro_ci_low": 0.5152956292250616,
610
+ "f1_micro_ci_high": 0.5787418375694315
611
  },
612
+ "score": 0.5470692717584369,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.5522388523080553,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.7597955706984668,
620
+ "f1_checking or savings account": 0.717391304347826,
621
+ "f1_debt collection": 0.5234899328859061,
622
+ "f1_credit card or prepaid card": 0.379746835443038,
623
+ "f1_mortgage": 0.7397260273972602,
624
+ "f1_payday loan or title loan or personal loan": 0.0,
625
+ "f1_student loan": 0.75,
626
+ "f1_money transfer or virtual currency or money service": 0.6,
627
+ "f1_vehicle loan or lease": 0.5,
628
+ "f1_macro_ci_low": 0.5048624166012413,
629
+ "f1_macro_ci_high": 0.5933228524869341,
630
  "score_name": "f1_micro",
631
+ "score": 0.70260663507109,
632
+ "score_ci_high": 0.7287187189345051,
633
+ "score_ci_low": 0.6715109552099726,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.593,
636
+ "accuracy_ci_low": 0.562,
637
+ "accuracy_ci_high": 0.6222733612177318,
638
+ "f1_micro": 0.70260663507109,
639
+ "f1_micro_ci_low": 0.6715109552099726,
640
+ "f1_micro_ci_high": 0.7287187189345051
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.6459298867611312,
644
+ "f1_mortgages and loans": 0.7544910179640718,
645
+ "f1_credit card": 0.775,
646
+ "f1_debt collection": 0.5841584158415841,
647
+ "f1_credit reporting": 0.696,
648
+ "f1_retail banking": 0.42,
649
+ "f1_macro_ci_low": 0.6077839677010852,
650
+ "f1_macro_ci_high": 0.6913500191807291,
651
  "score_name": "f1_micro",
652
+ "score": 0.664391353811149,
653
+ "score_ci_high": 0.7063133644876816,
654
+ "score_ci_low": 0.625027055082327,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.584,
657
+ "accuracy_ci_low": 0.542,
658
+ "accuracy_ci_high": 0.63,
659
+ "f1_micro": 0.664391353811149,
660
+ "f1_micro_ci_low": 0.625027055082327,
661
+ "f1_micro_ci_high": 0.7063133644876816
662
  },
663
+ "score": 0.6834989944411195,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "execution_accuracy": 0.12,
671
+ "program_accuracy": 0.132,
672
+ "score": 0.132,
673
  "score_name": "program_accuracy",
674
+ "execution_accuracy_ci_low": 0.102,
675
+ "execution_accuracy_ci_high": 0.144,
676
+ "program_accuracy_ci_low": 0.114,
677
+ "program_accuracy_ci_high": 0.15666145199397988,
678
+ "score_ci_low": 0.114,
679
+ "score_ci_high": 0.15666145199397988
680
  },
681
+ "score": 0.132,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.2643477592234184,
688
+ "recall": 0.6272650833490617,
689
+ "f1": 0.3140296380054316,
690
+ "precision_ci_low": 0.24819332961110369,
691
+ "precision_ci_high": 0.283633312750057,
692
+ "recall_ci_low": 0.6126841693716314,
693
+ "recall_ci_high": 0.6430975049953365,
694
+ "f1_ci_low": 0.29901530950331745,
695
+ "f1_ci_high": 0.3315112630081704,
696
  "score_name": "f1",
697
+ "score": 0.3140296380054316,
698
+ "score_ci_high": 0.3315112630081704,
699
+ "score_ci_low": 0.29901530950331745,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.570422876526912,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6787074640889963,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5102018377184868,
704
+ "faithfullness_f1_token_overlap": 0.3137695189458044,
705
+ "faithfullness_recall_token_overlap": 0.2691044465057621,
706
+ "faithfullness_precision_token_overlap": 0.4828180045337674,
707
+ "correctness_f1_token_overlap": 0.3140296380054316,
708
+ "correctness_recall_token_overlap": 0.6272650833490617,
709
+ "correctness_precision_token_overlap": 0.2643477592234184
710
  },
711
+ "score": 0.3140296380054316,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.525,
718
+ "accuracy_ci_low": 0.491,
719
+ "accuracy_ci_high": 0.555231575004668,
720
  "score_name": "accuracy",
721
+ "score": 0.525,
722
+ "score_ci_high": 0.555231575004668,
723
+ "score_ci_low": 0.491,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.882,
728
+ "accuracy_ci_low": 0.852,
729
+ "accuracy_ci_high": 0.908,
730
  "score_name": "accuracy",
731
+ "score": 0.882,
732
+ "score_ci_high": 0.908,
733
+ "score_ci_low": 0.852,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.7035,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
+ "severity_critical": 1.0,
743
  "severity_high": 1.0,
744
+ "severity_medium": 2.0,
745
+ "severity_low": 96.0,
746
+ "category_harmful_info": 0.7837495406986109,
747
+ "category_deception": 0.9044034529213953,
748
+ "category_explicit_content": 0.7690657642152575,
749
+ "category_violence": 0.8265839901102323,
750
+ "category_discrimination": 0.8356099165384733,
751
+ "category_substance_abuse": 0.9050387458668814,
752
+ "category_pii": 0.8721841452298341,
753
+ "safety": 0.8227460376271561,
754
+ "safety_ci_low": 0.7932925916009834,
755
+ "safety_ci_high": 0.8490041469302126,
756
  "score_name": "safety",
757
+ "score": 0.8227460376271561,
758
+ "score_ci_high": 0.8490041469302126,
759
+ "score_ci_low": 0.7932925916009834,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.8227460376271561,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rougeLsum": 0.3325543768677942,
770
+ "rouge2": 0.18239431854825322,
771
+ "rougeL": 0.26408113485691037,
772
+ "score": 0.26408113485691037,
773
  "score_name": "rougeL",
774
+ "rouge1": 0.38686146798899773,
775
+ "rougeLsum_ci_low": 0.3235124735051795,
776
+ "rougeLsum_ci_high": 0.34150244672120345,
777
+ "rouge2_ci_low": 0.17567630755234162,
778
+ "rouge2_ci_high": 0.18909024839808478,
779
+ "rougeL_ci_low": 0.2574903672199645,
780
+ "rougeL_ci_high": 0.2719482303789339,
781
+ "score_ci_low": 0.2574903672199645,
782
+ "score_ci_high": 0.2719482303789339,
783
+ "rouge1_ci_low": 0.3767025738240639,
784
+ "rouge1_ci_high": 0.39656622600699587
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rougeLsum": 0.0952773923780629,
789
+ "rouge2": 0.016247281897600302,
790
+ "rougeL": 0.08205748236085915,
791
+ "score": 0.08205748236085915,
792
  "score_name": "rougeL",
793
+ "rouge1": 0.11292356630727837,
794
+ "rougeLsum_ci_low": 0.09101488756580381,
795
+ "rougeLsum_ci_high": 0.09940774438641894,
796
+ "rouge2_ci_low": 0.014565784948631207,
797
+ "rouge2_ci_high": 0.018245592224480585,
798
+ "rougeL_ci_low": 0.07819027253097927,
799
+ "rougeL_ci_high": 0.08550070178637435,
800
+ "score_ci_low": 0.07819027253097927,
801
+ "score_ci_high": 0.08550070178637435,
802
+ "rouge1_ci_low": 0.1075630132488861,
803
+ "rouge1_ci_high": 0.11796959016801192
804
  },
805
+ "score": 0.17306930860888475,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1348,
814
+ 805,
815
+ 529,
816
+ 359
817
  ],
818
  "totals": [
819
+ 6219,
820
+ 6153,
821
+ 6087,
822
+ 6021
823
  ],
824
  "precisions": [
825
+ 0.21675510532239908,
826
+ 0.13083048919226395,
827
+ 0.08690652209627076,
828
+ 0.05962464706859325
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 6219,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.11010045736869918,
834
+ "score": 0.11010045736869918,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.09292825544039175,
837
+ "score_ci_high": 0.12825546813076855,
838
+ "sacrebleu_ci_low": 0.09292825544039175,
839
+ "sacrebleu_ci_high": 0.12825546813076855
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1365,
845
+ 860,
846
+ 583,
847
+ 397
848
  ],
849
  "totals": [
850
+ 5747,
851
+ 5681,
852
+ 5615,
853
+ 5549
854
  ],
855
  "precisions": [
856
+ 0.2375152253349574,
857
+ 0.151381798979053,
858
+ 0.10382902938557435,
859
+ 0.07154442241845378
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 5747,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.12783945870094363,
865
+ "score": 0.12783945870094363,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.11183867040924762,
868
+ "score_ci_high": 0.14785567428124632,
869
+ "sacrebleu_ci_low": 0.11183867040924762,
870
+ "sacrebleu_ci_high": 0.14785567428124632
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
+ 739,
876
+ 294,
877
+ 146,
878
+ 72
879
  ],
880
  "totals": [
881
+ 7684,
882
+ 7618,
883
+ 7552,
884
+ 7486
885
  ],
886
  "precisions": [
887
+ 0.09617386777719938,
888
+ 0.03859280651089525,
889
+ 0.01933262711864407,
890
+ 0.009617953513224687
891
  ],
892
  "bp": 1.0,
893
+ "sys_len": 7684,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.028822672569638247,
896
+ "score": 0.028822672569638247,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.021109593731645938,
899
+ "score_ci_high": 0.037834395667653335,
900
+ "sacrebleu_ci_low": 0.021109593731645938,
901
+ "sacrebleu_ci_high": 0.037834395667653335
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 1296,
907
+ 767,
908
+ 491,
909
+ 329
910
  ],
911
  "totals": [
912
+ 5968,
913
+ 5902,
914
+ 5836,
915
+ 5770
916
  ],
917
  "precisions": [
918
+ 0.21715817694369974,
919
+ 0.1299559471365639,
920
+ 0.0841329677861549,
921
+ 0.05701906412478336
922
  ],
923
  "bp": 1.0,
924
+ "sys_len": 5968,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.10786726317561303,
927
+ "score": 0.10786726317561303,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.09202539159854502,
930
+ "score_ci_high": 0.1283307736534194,
931
+ "sacrebleu_ci_low": 0.09202539159854502,
932
+ "sacrebleu_ci_high": 0.1283307736534194
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1536,
938
+ 1081,
939
+ 816,
940
+ 632
941
  ],
942
  "totals": [
943
+ 4782,
944
+ 4716,
945
+ 4650,
946
+ 4584
947
  ],
948
  "precisions": [
949
+ 0.3212045169385195,
950
+ 0.22921967769296014,
951
+ 0.17548387096774193,
952
+ 0.13787085514834208
953
  ],
954
+ "bp": 1.0,
955
+ "sys_len": 4782,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.20544037737967952,
958
+ "score": 0.20544037737967952,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.170377943119898,
961
+ "score_ci_high": 0.24159259713787848,
962
+ "sacrebleu_ci_low": 0.170377943119898,
963
+ "sacrebleu_ci_high": 0.24159259713787848
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 1429,
969
+ 687,
970
+ 382,
971
+ 223
972
  ],
973
  "totals": [
974
+ 8796,
975
+ 8730,
976
+ 8664,
977
+ 8598
978
  ],
979
  "precisions": [
980
+ 0.16246020918599363,
981
+ 0.07869415807560137,
982
+ 0.04409048938134811,
983
+ 0.025936264247499417
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 8796,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.061835155996391195,
989
+ "score": 0.061835155996391195,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.04783880582354916,
992
+ "score_ci_high": 0.0770855832203236,
993
+ "sacrebleu_ci_low": 0.04783880582354916,
994
+ "sacrebleu_ci_high": 0.0770855832203236
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1491,
1000
+ 1023,
1001
+ 759,
1002
+ 578
1003
  ],
1004
  "totals": [
1005
+ 6280,
1006
+ 6214,
1007
+ 6148,
1008
+ 6082
1009
  ],
1010
  "precisions": [
1011
+ 0.2374203821656051,
1012
+ 0.1646282587705182,
1013
+ 0.12345478204294079,
1014
+ 0.09503452811575139
1015
  ],
1016
  "bp": 1.0,
1017
+ "sys_len": 6280,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.14633659011937655,
1020
+ "score": 0.14633659011937655,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.12499780954581644,
1023
+ "score_ci_high": 0.16988788016668988,
1024
+ "sacrebleu_ci_low": 0.12499780954581644,
1025
+ "sacrebleu_ci_high": 0.16988788016668988
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 1335,
1031
+ 862,
1032
+ 585,
1033
+ 402
1034
  ],
1035
  "totals": [
1036
+ 5113,
1037
+ 5047,
1038
+ 4981,
1039
+ 4915
1040
  ],
1041
  "precisions": [
1042
+ 0.26109915900645414,
1043
+ 0.17079453140479495,
1044
+ 0.11744629592451314,
1045
+ 0.08179043743641913
1046
  ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 5113,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.14386505707163663,
1051
+ "score": 0.14386505707163663,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.11590777522441527,
1054
+ "score_ci_high": 0.19034631649860798,
1055
+ "sacrebleu_ci_low": 0.11590777522441527,
1056
+ "sacrebleu_ci_high": 0.19034631649860798
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1380,
1062
+ 726,
1063
+ 418,
1064
+ 245
1065
  ],
1066
  "totals": [
1067
+ 6698,
1068
+ 6632,
1069
+ 6566,
1070
+ 6500
1071
  ],
1072
  "precisions": [
1073
+ 0.20603165123917588,
1074
+ 0.10946924004825091,
1075
+ 0.06366128540968626,
1076
+ 0.03769230769230769
1077
  ],
1078
+ "bp": 1.0,
1079
+ "sys_len": 6698,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.08577061900111178,
1082
+ "score": 0.08577061900111178,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.06866769965121916,
1085
+ "score_ci_high": 0.10264171230800344,
1086
+ "sacrebleu_ci_low": 0.06866769965121916,
1087
+ "sacrebleu_ci_high": 0.10264171230800344
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1411,
1093
+ 934,
1094
+ 656,
1095
+ 466
1096
  ],
1097
  "totals": [
1098
+ 5734,
1099
+ 5668,
1100
+ 5602,
1101
+ 5536
1102
  ],
1103
  "precisions": [
1104
+ 0.24607603767003838,
1105
+ 0.1647847565278758,
1106
+ 0.11710103534451982,
1107
+ 0.08417630057803467
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 5734,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.14139505868159252,
1113
+ "score": 0.14139505868159252,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.12079576015139526,
1116
+ "score_ci_high": 0.16682278724108202,
1117
+ "sacrebleu_ci_low": 0.12079576015139526,
1118
+ "sacrebleu_ci_high": 0.16682278724108202
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 1223,
1124
+ 624,
1125
+ 368,
1126
+ 228
1127
  ],
1128
  "totals": [
1129
+ 5589,
1130
+ 5523,
1131
+ 5457,
1132
+ 5391
1133
  ],
1134
  "precisions": [
1135
+ 0.21882268742172123,
1136
+ 0.11298207495926127,
1137
+ 0.06743632032252153,
1138
+ 0.042292710072342796
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 5589,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.09163583013027359,
1144
+ "score": 0.09163583013027359,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.07856240720421398,
1147
+ "score_ci_high": 0.107979299756867,
1148
+ "sacrebleu_ci_low": 0.07856240720421398,
1149
+ "sacrebleu_ci_high": 0.107979299756867
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 1181,
1155
+ 575,
1156
+ 330,
1157
+ 198
1158
  ],
1159
  "totals": [
1160
+ 5759,
1161
+ 5693,
1162
+ 5627,
1163
+ 5561
1164
  ],
1165
  "precisions": [
1166
+ 0.20507032470915088,
1167
+ 0.1010012295801862,
1168
+ 0.05864581482139684,
1169
+ 0.03560510699514476
1170
  ],
1171
  "bp": 1.0,
1172
+ "sys_len": 5759,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.08109511611273765,
1175
+ "score": 0.08109511611273765,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.06705573259915326,
1178
+ "score_ci_high": 0.09497541131553666,
1179
+ "sacrebleu_ci_low": 0.06705573259915326,
1180
+ "sacrebleu_ci_high": 0.09497541131553666
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1407,
1186
+ 940,
1187
+ 672,
1188
+ 483
1189
  ],
1190
  "totals": [
1191
+ 6123,
1192
+ 6057,
1193
+ 5991,
1194
+ 5925
1195
  ],
1196
  "precisions": [
1197
+ 0.2297893189612935,
1198
+ 0.15519233944196797,
1199
+ 0.11216825237856785,
1200
+ 0.08151898734177215
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 6123,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.13437924969030768,
1206
+ "score": 0.13437924969030768,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.11400957037603383,
1209
+ "score_ci_high": 0.15945809207216216,
1210
+ "sacrebleu_ci_low": 0.11400957037603383,
1211
+ "sacrebleu_ci_high": 0.15945809207216216
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1432,
1217
+ 975,
1218
+ 700,
1219
+ 506
1220
  ],
1221
  "totals": [
1222
+ 6485,
1223
+ 6419,
1224
+ 6353,
1225
+ 6287
1226
  ],
1227
  "precisions": [
1228
+ 0.2208172706245181,
1229
+ 0.1518928181959807,
1230
+ 0.11018416496143554,
1231
+ 0.08048353745824718
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 6485,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.13132552016969748,
1237
+ "score": 0.13132552016969748,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.1085828237330662,
1240
+ "score_ci_high": 0.1582258643543896,
1241
+ "sacrebleu_ci_low": 0.1085828237330662,
1242
+ "sacrebleu_ci_high": 0.1582258643543896
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1293,
1248
+ 715,
1249
+ 441,
1250
+ 272
1251
  ],
1252
  "totals": [
1253
+ 6041,
1254
+ 5975,
1255
+ 5909,
1256
+ 5843
1257
  ],
1258
  "precisions": [
1259
+ 0.2140374110246648,
1260
+ 0.1196652719665272,
1261
+ 0.07463191741411407,
1262
+ 0.046551429060414165
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 6041,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.09712451421953877,
1268
+ "score": 0.09712451421953877,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.08132271336696732,
1271
+ "score_ci_high": 0.11265264204635472,
1272
+ "sacrebleu_ci_low": 0.08132271336696732,
1273
+ "sacrebleu_ci_high": 0.11265264204635472
1274
  },
1275
+ "score": 0.1129888626924825,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.42020372962571984,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/{2025-06-19T21-59-04_evaluation_results.json → 2025-06-23T09-36-33_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-20T01:59:00.198687Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,151 +176,151 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.5333333333333333,
180
- "accuracy_ci_low": 0.43333333333333335,
181
- "accuracy_ci_high": 0.6444444444444445,
182
  "score_name": "accuracy",
183
- "score": 0.5333333333333333,
184
- "score_ci_high": 0.6444444444444445,
185
- "score_ci_low": 0.43333333333333335,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
  "accuracy": 0.5555555555555556,
190
- "accuracy_ci_low": 0.45555555555555555,
191
  "accuracy_ci_high": 0.6555555555555556,
192
  "score_name": "accuracy",
193
  "score": 0.5555555555555556,
194
  "score_ci_high": 0.6555555555555556,
195
- "score_ci_low": 0.45555555555555555,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.8777777777777778,
200
- "accuracy_ci_low": 0.8,
201
  "accuracy_ci_high": 0.9333333333333333,
202
  "score_name": "accuracy",
203
- "score": 0.8777777777777778,
204
  "score_ci_high": 0.9333333333333333,
205
- "score_ci_low": 0.8,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
  "accuracy": 0.5777777777777777,
210
- "accuracy_ci_low": 0.4666666666666667,
211
  "accuracy_ci_high": 0.6777777777777778,
212
  "score_name": "accuracy",
213
  "score": 0.5777777777777777,
214
  "score_ci_high": 0.6777777777777778,
215
- "score_ci_low": 0.4666666666666667,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.6111111111111112,
220
- "accuracy_ci_low": 0.5111111111111111,
221
- "accuracy_ci_high": 0.7,
222
  "score_name": "accuracy",
223
- "score": 0.6111111111111112,
224
- "score_ci_high": 0.7,
225
- "score_ci_low": 0.5111111111111111,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9888888888888889,
230
- "accuracy_ci_low": 0.9407763312346947,
231
- "accuracy_ci_high": 1.0,
232
  "score_name": "accuracy",
233
- "score": 0.9888888888888889,
234
- "score_ci_high": 1.0,
235
- "score_ci_low": 0.9407763312346947,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.8,
240
- "accuracy_ci_low": 0.7111111111111111,
241
- "accuracy_ci_high": 0.8666666666666667,
242
  "score_name": "accuracy",
243
- "score": 0.8,
244
- "score_ci_high": 0.8666666666666667,
245
- "score_ci_low": 0.7111111111111111,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 0.6333333333333333,
250
- "accuracy_ci_low": 0.5222222222222223,
251
- "accuracy_ci_high": 0.7222222222222222,
252
  "score_name": "accuracy",
253
- "score": 0.6333333333333333,
254
- "score_ci_high": 0.7222222222222222,
255
- "score_ci_low": 0.5222222222222223,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
  "accuracy": 0.6444444444444445,
260
- "accuracy_ci_low": 0.5333333333333333,
261
  "accuracy_ci_high": 0.7444444444444445,
262
  "score_name": "accuracy",
263
  "score": 0.6444444444444445,
264
  "score_ci_high": 0.7444444444444445,
265
- "score_ci_low": 0.5333333333333333,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
- "accuracy": 0.6555555555555556,
270
- "accuracy_ci_low": 0.5555555555555556,
271
- "accuracy_ci_high": 0.7505365189670177,
272
  "score_name": "accuracy",
273
- "score": 0.6555555555555556,
274
- "score_ci_high": 0.7505365189670177,
275
- "score_ci_low": 0.5555555555555556,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8666666666666667,
280
- "accuracy_ci_low": 0.7888888888888889,
281
  "accuracy_ci_high": 0.9333333333333333,
282
  "score_name": "accuracy",
283
- "score": 0.8666666666666667,
284
  "score_ci_high": 0.9333333333333333,
285
- "score_ci_low": 0.7888888888888889,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.704040404040404,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.2839506172839506,
307
- "f1_Organization": 0.260586319218241,
308
- "f1_Location": 0.19917012448132781,
309
- "f1_macro": 0.24790235366117316,
310
- "recall_macro": 0.2073423475003688,
311
- "precision_macro": 0.3173997367545755,
312
- "in_classes_support": 0.47339699863574347,
313
- "f1_micro": 0.17488076311605724,
314
- "recall_micro": 0.20952380952380953,
315
- "precision_micro": 0.15006821282401092,
316
- "score": 0.17488076311605724,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.14029219675059787,
319
- "score_ci_high": 0.2240183994224699,
320
- "f1_micro_ci_low": 0.14029219675059787,
321
- "f1_micro_ci_high": 0.2240183994224699
322
  },
323
- "score": 0.17488076311605724,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
@@ -336,43 +336,43 @@
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.15492957746478872,
340
- "accuracy_ci_low": 0.08450704225352113,
341
- "accuracy_ci_high": 0.2535211267605634,
342
  "score_name": "accuracy",
343
- "score": 0.15492957746478872,
344
- "score_ci_high": 0.2535211267605634,
345
- "score_ci_low": 0.08450704225352113,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
  "accuracy": 0.08450704225352113,
350
- "accuracy_ci_low": 0.028169014084507043,
351
- "accuracy_ci_high": 0.18309859154929578,
352
  "score_name": "accuracy",
353
  "score": 0.08450704225352113,
354
- "score_ci_high": 0.18309859154929578,
355
- "score_ci_low": 0.028169014084507043,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.3380281690140845,
360
- "accuracy_ci_low": 0.22535211267605634,
361
- "accuracy_ci_high": 0.4507042253521127,
362
  "score_name": "accuracy",
363
- "score": 0.3380281690140845,
364
- "score_ci_high": 0.4507042253521127,
365
- "score_ci_low": 0.22535211267605634,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.5633802816901409,
370
- "accuracy_ci_low": 0.43661971830985913,
371
- "accuracy_ci_high": 0.676056338028169,
372
  "score_name": "accuracy",
373
- "score": 0.5633802816901409,
374
- "score_ci_high": 0.676056338028169,
375
- "score_ci_low": 0.43661971830985913,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
@@ -386,423 +386,423 @@
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.38028169014084506,
390
- "accuracy_ci_low": 0.2676056338028169,
391
- "accuracy_ci_high": 0.49295774647887325,
392
  "score_name": "accuracy",
393
- "score": 0.38028169014084506,
394
- "score_ci_high": 0.49295774647887325,
395
- "score_ci_low": 0.2676056338028169,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.49295774647887325,
400
  "accuracy_ci_low": 0.36619718309859156,
401
- "accuracy_ci_high": 0.6056338028169014,
402
  "score_name": "accuracy",
403
- "score": 0.49295774647887325,
404
- "score_ci_high": 0.6056338028169014,
405
  "score_ci_low": 0.36619718309859156,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.28169014084507044,
410
- "accuracy_ci_low": 0.18309859154929578,
411
- "accuracy_ci_high": 0.39436619718309857,
412
  "score_name": "accuracy",
413
- "score": 0.28169014084507044,
414
- "score_ci_high": 0.39436619718309857,
415
- "score_ci_low": 0.18309859154929578,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.056338028169014086,
420
- "accuracy_ci_low": 0.014084507042253521,
421
- "accuracy_ci_high": 0.1267605633802817,
422
  "score_name": "accuracy",
423
- "score": 0.056338028169014086,
424
- "score_ci_high": 0.1267605633802817,
425
- "score_ci_low": 0.014084507042253521,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.28169014084507044,
430
- "accuracy_ci_low": 0.19718309859154928,
431
- "accuracy_ci_high": 0.39436619718309857,
432
  "score_name": "accuracy",
433
- "score": 0.28169014084507044,
434
- "score_ci_high": 0.39436619718309857,
435
- "score_ci_low": 0.19718309859154928,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.38028169014084506,
440
- "accuracy_ci_low": 0.2676056338028169,
441
- "accuracy_ci_high": 0.5095143645267136,
442
  "score_name": "accuracy",
443
- "score": 0.38028169014084506,
444
- "score_ci_high": 0.5095143645267136,
445
- "score_ci_low": 0.2676056338028169,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
  "accuracy": 0.18309859154929578,
450
  "accuracy_ci_low": 0.09859154929577464,
451
- "accuracy_ci_high": 0.28169014084507044,
452
  "score_name": "accuracy",
453
  "score": 0.18309859154929578,
454
- "score_ci_high": 0.28169014084507044,
455
  "score_ci_low": 0.09859154929577464,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.5915492957746479,
460
- "accuracy_ci_low": 0.4647887323943662,
461
- "accuracy_ci_high": 0.704225352112676,
462
  "score_name": "accuracy",
463
- "score": 0.5915492957746479,
464
- "score_ci_high": 0.704225352112676,
465
- "score_ci_low": 0.4647887323943662,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.3209255533199195,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.07819548872180451,
475
  "f1_suggestive": 0.0,
476
  "f1_generic": 0.0,
 
477
  "f1_fanciful": 0.10526315789473684,
478
- "f1_descriptive": 0.2857142857142857,
479
  "f1_arbitrary": 0.0,
480
- "f1_macro_ci_low": 0.029629629629629627,
481
- "f1_macro_ci_high": 0.1446114401751038,
482
  "score_name": "f1_micro",
483
- "score": 0.1,
484
- "score_ci_high": 0.1976990689891533,
485
- "score_ci_low": 0.0392156862745098,
486
  "num_of_instances": 85,
487
- "accuracy": 0.058823529411764705,
488
- "accuracy_ci_low": 0.023529411764705882,
489
- "accuracy_ci_high": 0.1261289751719794,
490
- "f1_micro": 0.1,
491
- "f1_micro_ci_low": 0.0392156862745098,
492
- "f1_micro_ci_high": 0.1976990689891533
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.4318713450292398,
496
- "f1_no": 0.5526315789473685,
497
- "f1_yes": 0.3111111111111111,
498
- "f1_macro_ci_low": 0.3568758383648559,
499
- "f1_macro_ci_high": 0.5105344080350164,
500
  "score_name": "f1_micro",
501
- "score": 0.48427672955974843,
502
- "score_ci_high": 0.553538495446083,
503
- "score_ci_low": 0.40855056637270504,
504
  "num_of_instances": 200,
505
- "accuracy": 0.385,
506
- "accuracy_ci_low": 0.32,
507
- "accuracy_ci_high": 0.45,
508
- "f1_micro": 0.48427672955974843,
509
- "f1_micro_ci_low": 0.40855056637270504,
510
- "f1_micro_ci_high": 0.553538495446083
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.07867608581894296,
514
  "f1_conclusion": 0.0,
515
- "f1_decree": 0.07692307692307693,
516
- "f1_issue": 0.0,
517
- "f1_analysis": 0.4166666666666667,
518
- "f1_facts": 0.0,
519
  "f1_procedural history": 0.0,
520
- "f1_rule": 0.05714285714285714,
521
- "f1_macro_ci_low": 0.04893392684609269,
522
- "f1_macro_ci_high": 0.12399894886334993,
523
  "score_name": "f1_micro",
524
- "score": 0.10619469026548672,
525
- "score_ci_high": 0.17107924198886906,
526
- "score_ci_low": 0.06000153614696125,
527
  "num_of_instances": 200,
528
- "accuracy": 0.06,
529
- "accuracy_ci_low": 0.035,
530
- "accuracy_ci_high": 0.1,
531
- "f1_micro": 0.10619469026548672,
532
- "f1_micro_ci_low": 0.06000153614696125,
533
- "f1_micro_ci_high": 0.17107924198886906
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.31866564807741277,
537
- "f1_yes": 0.20952380952380953,
538
- "f1_no": 0.42780748663101603,
539
- "f1_macro_ci_low": 0.2502995617982514,
540
- "f1_macro_ci_high": 0.39353002888833133,
541
  "score_name": "f1_micro",
542
- "score": 0.3493150684931507,
543
- "score_ci_high": 0.42,
544
- "score_ci_low": 0.2701836639419085,
545
  "num_of_instances": 200,
546
- "accuracy": 0.255,
547
- "accuracy_ci_low": 0.195,
548
  "accuracy_ci_high": 0.315,
549
- "f1_micro": 0.3493150684931507,
550
- "f1_micro_ci_low": 0.2701836639419085,
551
- "f1_micro_ci_high": 0.42
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.8489843979982338,
555
- "f1_yes": 0.8607594936708861,
556
- "f1_no": 0.8372093023255814,
557
- "f1_macro_ci_low": 0.762371922413286,
558
- "f1_macro_ci_high": 0.9100766335383308,
559
  "score_name": "f1_micro",
560
- "score": 0.8484848484848485,
561
- "score_ci_high": 0.9090909090909091,
562
- "score_ci_low": 0.7590361445783133,
563
  "num_of_instances": 85,
564
- "accuracy": 0.8235294117647058,
565
- "accuracy_ci_low": 0.7294117647058823,
566
- "accuracy_ci_high": 0.8941176470588236,
567
- "f1_micro": 0.8484848484848485,
568
- "f1_micro_ci_low": 0.7590361445783133,
569
- "f1_micro_ci_high": 0.9090909090909091
570
  },
571
- "score": 0.3776542673606469,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.2527665164224348,
578
- "f1_cars": 0.417910447761194,
579
  "f1_windows x": 0.0,
580
- "f1_atheism": 0.05128205128205128,
581
- "f1_religion": 0.15584415584415584,
582
- "f1_politics": 0.15789473684210525,
583
- "f1_medicine": 0.5,
584
- "f1_christianity": 0.07142857142857142,
585
- "f1_computer graphics": 0.2702702702702703,
586
- "f1_microsoft windows": 0.16393442622950818,
587
- "f1_middle east": 0.12,
588
- "f1_motorcycles": 0.375,
589
- "f1_pc hardware": 0.3157894736842105,
590
  "f1_mac hardware": 0.14285714285714285,
591
- "f1_for sale": 0.2222222222222222,
592
- "f1_guns": 0.11538461538461539,
593
- "f1_space": 0.34782608695652173,
594
- "f1_cryptography": 0.4126984126984127,
595
- "f1_baseball": 0.35135135135135137,
596
- "f1_hockey": 0.5,
597
- "f1_electronics": 0.36363636363636365,
598
- "f1_macro_ci_low": 0.22757237570678052,
599
- "f1_macro_ci_high": 0.28553448529743153,
 
600
  "score_name": "f1_micro",
601
- "score": 0.26766917293233083,
602
- "score_ci_high": 0.29941755760789834,
603
- "score_ci_low": 0.23630344400865025,
604
  "num_of_instances": 1000,
605
- "accuracy": 0.178,
606
- "accuracy_ci_low": 0.155,
607
- "accuracy_ci_high": 0.201,
608
- "f1_micro": 0.26766917293233083,
609
- "f1_micro_ci_low": 0.23630344400865025,
610
- "f1_micro_ci_high": 0.29941755760789834
611
  },
612
- "score": 0.26766917293233083,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.5296897956484732,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.6554621848739496,
620
- "f1_credit card or prepaid card": 0.4943820224719101,
621
- "f1_debt collection": 0.4297520661157025,
622
- "f1_payday loan or title loan or personal loan": 0.375,
623
- "f1_checking or savings account": 0.611764705882353,
624
- "f1_money transfer or virtual currency or money service": 0.5555555555555556,
625
- "f1_mortgage": 0.509090909090909,
626
- "f1_student loan": 0.5555555555555556,
627
- "f1_vehicle loan or lease": 0.5806451612903226,
628
- "f1_macro_ci_low": 0.469057355767322,
629
- "f1_macro_ci_high": 0.598469091946616,
630
  "score_name": "f1_micro",
631
- "score": 0.6116883116883117,
632
- "score_ci_high": 0.6423076923076924,
633
- "score_ci_low": 0.5828077788480714,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.471,
636
- "accuracy_ci_low": 0.443,
637
- "accuracy_ci_high": 0.503,
638
- "f1_micro": 0.6116883116883117,
639
- "f1_micro_ci_low": 0.5828077788480714,
640
- "f1_micro_ci_high": 0.6423076923076924
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.5903065912163163,
644
- "f1_mortgages and loans": 0.6282051282051282,
645
- "f1_credit card": 0.6842105263157895,
646
- "f1_debt collection": 0.49504950495049505,
647
- "f1_credit reporting": 0.5847457627118644,
648
- "f1_retail banking": 0.559322033898305,
649
- "f1_macro_ci_low": 0.5491833392518851,
650
- "f1_macro_ci_high": 0.6385093727045127,
651
  "score_name": "f1_micro",
652
- "score": 0.5856481481481481,
653
- "score_ci_high": 0.632183908045977,
654
- "score_ci_low": 0.54416153401534,
655
  "num_of_instances": 500,
656
- "accuracy": 0.506,
657
- "accuracy_ci_low": 0.466,
658
- "accuracy_ci_high": 0.552,
659
- "f1_micro": 0.5856481481481481,
660
- "f1_micro_ci_low": 0.54416153401534,
661
- "f1_micro_ci_high": 0.632183908045977
662
  },
663
- "score": 0.59866822991823,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "execution_accuracy": 0.038,
671
- "program_accuracy": 0.036,
672
- "score": 0.036,
673
  "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.028,
675
- "execution_accuracy_ci_high": 0.051,
676
- "program_accuracy_ci_low": 0.026,
677
- "program_accuracy_ci_high": 0.048,
678
- "score_ci_low": 0.026,
679
- "score_ci_high": 0.048
 
680
  },
681
- "score": 0.036,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.28525760105453485,
688
- "recall": 0.5602956481447632,
689
- "f1": 0.3115553425716912,
690
- "precision_ci_low": 0.26514229625282976,
691
- "precision_ci_high": 0.3063459879682981,
692
- "recall_ci_low": 0.5448358892557127,
693
- "recall_ci_high": 0.5784430830027775,
694
- "f1_ci_low": 0.29470532845103153,
695
- "f1_ci_high": 0.33107953368338633,
696
  "score_name": "f1",
697
- "score": 0.3115553425716912,
698
- "score_ci_high": 0.33107953368338633,
699
- "score_ci_low": 0.29470532845103153,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5722805594901244,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6597029569248358,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5267530478537082,
704
- "faithfullness_f1_token_overlap": 0.30842516813067145,
705
- "faithfullness_recall_token_overlap": 0.25295558254033107,
706
- "faithfullness_precision_token_overlap": 0.5261232421481096,
707
- "correctness_f1_token_overlap": 0.3115553425716912,
708
- "correctness_recall_token_overlap": 0.5602956481447632,
709
- "correctness_precision_token_overlap": 0.28525760105453485
710
  },
711
- "score": 0.3115553425716912,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.574,
718
- "accuracy_ci_low": 0.543,
719
- "accuracy_ci_high": 0.606,
720
  "score_name": "accuracy",
721
- "score": 0.574,
722
- "score_ci_high": 0.606,
723
- "score_ci_low": 0.543,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.87,
728
- "accuracy_ci_low": 0.84,
729
- "accuracy_ci_high": 0.898,
730
  "score_name": "accuracy",
731
- "score": 0.87,
732
- "score_ci_high": 0.898,
733
- "score_ci_low": 0.84,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.722,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
- "severity_critical": 3.0,
743
- "severity_high": 3.0,
744
- "severity_medium": 3.0,
745
- "severity_low": 91.0,
746
- "category_harmful_info": 0.7363050992171916,
747
- "category_deception": 0.7075176269117028,
748
- "category_explicit_content": 0.7862109684883944,
749
- "category_violence": 0.8046535633740918,
750
- "category_discrimination": 0.7672554519441394,
751
- "category_substance_abuse": 0.9027199372959634,
752
- "category_pii": 0.8223799069722494,
753
- "safety": 0.7756195961658118,
754
- "safety_ci_low": 0.7397039726251121,
755
- "safety_ci_high": 0.8122983590465129,
756
  "score_name": "safety",
757
- "score": 0.7756195961658118,
758
- "score_ci_high": 0.8122983590465129,
759
- "score_ci_low": 0.7397039726251121,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.7756195961658118,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rougeL": 0.2623368936139326,
770
- "score": 0.2623368936139326,
771
  "score_name": "rougeL",
772
- "rougeLsum": 0.3246861800674231,
773
- "rouge1": 0.37875353275553886,
774
- "rouge2": 0.17634540475082816,
775
- "rougeL_ci_low": 0.25607817865790744,
776
- "rougeL_ci_high": 0.2699573234237949,
777
- "score_ci_low": 0.25607817865790744,
778
- "score_ci_high": 0.2699573234237949,
779
- "rougeLsum_ci_low": 0.31633678336605703,
780
- "rougeLsum_ci_high": 0.3342014722776119,
781
- "rouge1_ci_low": 0.36953845902369503,
782
- "rouge1_ci_high": 0.38917434338527224,
783
- "rouge2_ci_low": 0.17007943381657917,
784
- "rouge2_ci_high": 0.18480657837175643
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rougeL": 0.09126672648981966,
789
- "score": 0.09126672648981966,
790
  "score_name": "rougeL",
791
- "rougeLsum": 0.10371587819726082,
792
- "rouge1": 0.12400577725256573,
793
- "rouge2": 0.017067409640734738,
794
- "rougeL_ci_low": 0.08642017587563405,
795
- "rougeL_ci_high": 0.09509880053104167,
796
- "score_ci_low": 0.08642017587563405,
797
- "score_ci_high": 0.09509880053104167,
798
- "rougeLsum_ci_low": 0.09869637123218795,
799
- "rougeLsum_ci_high": 0.10797131588828912,
800
- "rouge1_ci_low": 0.11835018459961112,
801
- "rouge1_ci_high": 0.1293417832607229,
802
- "rouge2_ci_low": 0.015170057207432563,
803
- "rouge2_ci_high": 0.019497436133181917
804
  },
805
- "score": 0.17680181005187612,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,473 +810,473 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 1175,
814
- 692,
815
- 449,
816
- 309
817
  ],
818
  "totals": [
819
- 1795,
820
- 1729,
821
- 1663,
822
- 1597
823
  ],
824
  "precisions": [
825
- 0.6545961002785515,
826
- 0.40023134759976864,
827
- 0.269993986770896,
828
- 0.19348778960551033
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1795,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.34203696369018,
834
- "score": 0.34203696369018,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.30827619341319407,
837
- "score_ci_high": 0.3934585756605337,
838
- "sacrebleu_ci_low": 0.30827619341319407,
839
- "sacrebleu_ci_high": 0.3934585756605337
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1238,
845
- 753,
846
- 502,
847
- 333
848
  ],
849
  "totals": [
850
- 1807,
851
- 1741,
852
- 1675,
853
- 1609
854
  ],
855
  "precisions": [
856
- 0.6851134477033757,
857
- 0.43251005169442847,
858
- 0.29970149253731343,
859
- 0.2069608452454941
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1807,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.36820013898054776,
865
- "score": 0.36820013898054776,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.3262271352518154,
868
- "score_ci_high": 0.41519158941270023,
869
- "sacrebleu_ci_low": 0.3262271352518154,
870
- "sacrebleu_ci_high": 0.41519158941270023
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
- 247,
876
- 51,
877
- 24,
878
- 13
879
  ],
880
  "totals": [
881
- 3575,
882
- 3509,
883
- 3443,
884
- 3377
885
  ],
886
  "precisions": [
887
- 0.06909090909090909,
888
- 0.014534055286406384,
889
- 0.0069706651176299735,
890
- 0.0038495706248149247
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 3575,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.012812195485921624,
896
- "score": 0.012812195485921624,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.006078646745883528,
899
- "score_ci_high": 0.020936322376328288,
900
- "sacrebleu_ci_low": 0.006078646745883528,
901
- "sacrebleu_ci_high": 0.020936322376328288
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 1113,
907
- 599,
908
- 371,
909
- 238
910
  ],
911
  "totals": [
912
- 1788,
913
- 1722,
914
- 1656,
915
- 1590
916
  ],
917
  "precisions": [
918
- 0.6224832214765101,
919
- 0.34785133565621373,
920
- 0.22403381642512077,
921
- 0.14968553459119496
922
  ],
923
- "bp": 0.9740561253203749,
924
- "sys_len": 1788,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.2843398077874606,
927
- "score": 0.2843398077874606,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.23626058332848057,
930
- "score_ci_high": 0.31993924524942646,
931
- "sacrebleu_ci_low": 0.23626058332848057,
932
- "sacrebleu_ci_high": 0.31993924524942646
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1420,
938
- 985,
939
- 727,
940
- 545
941
  ],
942
  "totals": [
943
- 2032,
944
- 1966,
945
- 1900,
946
- 1834
947
  ],
948
  "precisions": [
949
- 0.6988188976377953,
950
- 0.5010172939979654,
951
- 0.38263157894736843,
952
- 0.29716466739367503
953
  ],
954
- "bp": 0.9824394796731021,
955
- "sys_len": 2032,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.4388384183123361,
958
- "score": 0.4388384183123361,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.39345852629040573,
961
- "score_ci_high": 0.4779013774696765,
962
- "sacrebleu_ci_low": 0.39345852629040573,
963
- "sacrebleu_ci_high": 0.4779013774696765
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 1094,
969
- 480,
970
- 264,
971
- 149
972
  ],
973
  "totals": [
974
- 2582,
975
- 2516,
976
- 2450,
977
- 2384
978
  ],
979
  "precisions": [
980
- 0.42370255615801705,
981
- 0.1907790143084261,
982
- 0.10775510204081633,
983
- 0.0625
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2582,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.15274865193932807,
989
- "score": 0.15274865193932807,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.11545139318026426,
992
- "score_ci_high": 0.18226843784214114,
993
- "sacrebleu_ci_low": 0.11545139318026426,
994
- "sacrebleu_ci_high": 0.18226843784214114
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1376,
1000
- 942,
1001
- 686,
1002
- 504
1003
  ],
1004
  "totals": [
1005
- 1895,
1006
- 1829,
1007
- 1763,
1008
- 1697
1009
  ],
1010
  "precisions": [
1011
- 0.7261213720316623,
1012
- 0.5150355385456534,
1013
- 0.3891094724900737,
1014
- 0.29699469652327637
1015
  ],
1016
- "bp": 0.988979382694272,
1017
- "sys_len": 1895,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.4509246392883171,
1020
- "score": 0.4509246392883171,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.3969123124838706,
1023
- "score_ci_high": 0.49488969802829147,
1024
- "sacrebleu_ci_low": 0.3969123124838706,
1025
- "sacrebleu_ci_high": 0.49488969802829147
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 1007,
1031
- 563,
1032
- 348,
1033
- 213
1034
  ],
1035
  "totals": [
1036
- 1937,
1037
- 1871,
1038
- 1805,
1039
- 1739
1040
  ],
1041
  "precisions": [
1042
- 0.5198760970573051,
1043
- 0.3009086050240513,
1044
- 0.192797783933518,
1045
- 0.12248418631397355
1046
  ],
1047
- "bp": 0.9938240032224314,
1048
- "sys_len": 1937,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.24501270828054722,
1051
- "score": 0.24501270828054722,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.20771746826159335,
1054
- "score_ci_high": 0.2893120776468884,
1055
- "sacrebleu_ci_low": 0.20771746826159335,
1056
- "sacrebleu_ci_high": 0.2893120776468884
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
- 1254,
1062
- 697,
1063
- 407,
1064
- 241
1065
  ],
1066
  "totals": [
1067
- 1994,
1068
- 1928,
1069
- 1862,
1070
- 1796
1071
  ],
1072
  "precisions": [
1073
- 0.6288866599799399,
1074
- 0.36151452282157676,
1075
- 0.21858216970998925,
1076
- 0.13418708240534521
1077
  ],
1078
- "bp": 0.9491803375373334,
1079
- "sys_len": 1994,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.27124055641744416,
1082
- "score": 0.27124055641744416,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.238444782766854,
1085
- "score_ci_high": 0.29802738323060723,
1086
- "sacrebleu_ci_low": 0.238444782766854,
1087
- "sacrebleu_ci_high": 0.29802738323060723
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1272,
1093
- 821,
1094
  564,
1095
- 393
1096
  ],
1097
  "totals": [
1098
- 1814,
1099
- 1748,
1100
- 1682,
1101
- 1616
1102
  ],
1103
  "precisions": [
1104
- 0.701212789415656,
1105
- 0.4696796338672769,
1106
- 0.33531510107015455,
1107
- 0.24319306930693071
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1814,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.4048218693289096,
1113
- "score": 0.4048218693289096,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.3701816768681021,
1116
- "score_ci_high": 0.4500189391713141,
1117
- "sacrebleu_ci_low": 0.3701816768681021,
1118
- "sacrebleu_ci_high": 0.4500189391713141
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 1013,
1124
- 461,
1125
- 263,
1126
- 158
1127
  ],
1128
  "totals": [
1129
- 1820,
1130
- 1754,
1131
- 1688,
1132
- 1622
1133
  ],
1134
  "precisions": [
1135
- 0.5565934065934066,
1136
- 0.2628278221208666,
1137
- 0.15580568720379145,
1138
- 0.09741060419235512
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1820,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.2170699640137964,
1144
- "score": 0.2170699640137964,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.18647053560514687,
1147
- "score_ci_high": 0.26290605520041826,
1148
- "sacrebleu_ci_low": 0.18647053560514687,
1149
- "sacrebleu_ci_high": 0.26290605520041826
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 993,
1155
- 465,
1156
- 267,
1157
- 159
1158
  ],
1159
  "totals": [
1160
- 1895,
1161
- 1829,
1162
- 1763,
1163
- 1697
1164
  ],
1165
  "precisions": [
1166
- 0.5240105540897098,
1167
- 0.2542372881355932,
1168
- 0.1514463981849121,
1169
- 0.09369475545079553
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 1895,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.20851551650550446,
1175
- "score": 0.20851551650550446,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.17957414369928407,
1178
- "score_ci_high": 0.2554257407203763,
1179
- "sacrebleu_ci_low": 0.17957414369928407,
1180
- "sacrebleu_ci_high": 0.2554257407203763
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1283,
1186
  864,
1187
- 623,
1188
  463
1189
  ],
1190
  "totals": [
1191
- 1786,
1192
- 1720,
1193
- 1654,
1194
- 1588
1195
  ],
1196
  "precisions": [
1197
- 0.7183650615901456,
1198
- 0.5023255813953488,
1199
- 0.3766626360338573,
1200
- 0.2915617128463476
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1786,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.4461731000368479,
1206
- "score": 0.4461731000368479,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.3937462177900075,
1209
- "score_ci_high": 0.5074742125274534,
1210
- "sacrebleu_ci_low": 0.3937462177900075,
1211
- "sacrebleu_ci_high": 0.5074742125274534
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
- 1272,
1217
- 815,
1218
- 567,
1219
- 394
1220
  ],
1221
  "totals": [
1222
- 1883,
1223
- 1817,
1224
- 1751,
1225
- 1685
1226
  ],
1227
  "precisions": [
1228
- 0.6755177907594265,
1229
- 0.4485415520088057,
1230
- 0.32381496287835526,
1231
- 0.23382789317507416
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1883,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.3891868659594801,
1237
- "score": 0.3891868659594801,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.3320885601116078,
1240
- "score_ci_high": 0.43155018915229526,
1241
- "sacrebleu_ci_low": 0.3320885601116078,
1242
- "sacrebleu_ci_high": 0.43155018915229526
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1163,
1248
- 647,
1249
- 405,
1250
- 255
1251
  ],
1252
  "totals": [
1253
- 1850,
1254
- 1784,
1255
- 1718,
1256
- 1652
1257
  ],
1258
  "precisions": [
1259
- 0.6286486486486487,
1260
- 0.36266816143497754,
1261
- 0.23573923166472643,
1262
- 0.15435835351089588
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1850,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.30180043021927255,
1268
- "score": 0.30180043021927255,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.2688254114664501,
1271
- "score_ci_high": 0.3447493011880723,
1272
- "sacrebleu_ci_low": 0.2688254114664501,
1273
- "sacrebleu_ci_high": 0.3447493011880723
1274
  },
1275
- "score": 0.30224812174972626,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.40523563547897645,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-23T13:36:29.058411Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.5222222222222223,
180
+ "accuracy_ci_low": 0.4222222222222222,
181
+ "accuracy_ci_high": 0.6333333333333333,
182
  "score_name": "accuracy",
183
+ "score": 0.5222222222222223,
184
+ "score_ci_high": 0.6333333333333333,
185
+ "score_ci_low": 0.4222222222222222,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
  "accuracy": 0.5555555555555556,
190
+ "accuracy_ci_low": 0.44726747907364484,
191
  "accuracy_ci_high": 0.6555555555555556,
192
  "score_name": "accuracy",
193
  "score": 0.5555555555555556,
194
  "score_ci_high": 0.6555555555555556,
195
+ "score_ci_low": 0.44726747907364484,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8666666666666667,
200
+ "accuracy_ci_low": 0.7781253622132644,
201
  "accuracy_ci_high": 0.9333333333333333,
202
  "score_name": "accuracy",
203
+ "score": 0.8666666666666667,
204
  "score_ci_high": 0.9333333333333333,
205
+ "score_ci_low": 0.7781253622132644,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
  "accuracy": 0.5777777777777777,
210
+ "accuracy_ci_low": 0.457520776596763,
211
  "accuracy_ci_high": 0.6777777777777778,
212
  "score_name": "accuracy",
213
  "score": 0.5777777777777777,
214
  "score_ci_high": 0.6777777777777778,
215
+ "score_ci_low": 0.457520776596763,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.5888888888888889,
220
+ "accuracy_ci_low": 0.4888888888888889,
221
+ "accuracy_ci_high": 0.6804301831819051,
222
  "score_name": "accuracy",
223
+ "score": 0.5888888888888889,
224
+ "score_ci_high": 0.6804301831819051,
225
+ "score_ci_low": 0.4888888888888889,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9666666666666667,
230
+ "accuracy_ci_low": 0.9111111111111111,
231
+ "accuracy_ci_high": 0.9888888888888889,
232
  "score_name": "accuracy",
233
+ "score": 0.9666666666666667,
234
+ "score_ci_high": 0.9888888888888889,
235
+ "score_ci_low": 0.9111111111111111,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.8555555555555555,
240
+ "accuracy_ci_low": 0.7725017589399771,
241
+ "accuracy_ci_high": 0.9222222222222223,
242
  "score_name": "accuracy",
243
+ "score": 0.8555555555555555,
244
+ "score_ci_high": 0.9222222222222223,
245
+ "score_ci_low": 0.7725017589399771,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.6666666666666666,
250
+ "accuracy_ci_low": 0.5666666666666667,
251
+ "accuracy_ci_high": 0.7555555555555555,
252
  "score_name": "accuracy",
253
+ "score": 0.6666666666666666,
254
+ "score_ci_high": 0.7555555555555555,
255
+ "score_ci_low": 0.5666666666666667,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
  "accuracy": 0.6444444444444445,
260
+ "accuracy_ci_low": 0.5444444444444444,
261
  "accuracy_ci_high": 0.7444444444444445,
262
  "score_name": "accuracy",
263
  "score": 0.6444444444444445,
264
  "score_ci_high": 0.7444444444444445,
265
+ "score_ci_low": 0.5444444444444444,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.6444444444444445,
270
+ "accuracy_ci_low": 0.5444444444444444,
271
+ "accuracy_ci_high": 0.7444444444444445,
272
  "score_name": "accuracy",
273
+ "score": 0.6444444444444445,
274
+ "score_ci_high": 0.7444444444444445,
275
+ "score_ci_low": 0.5444444444444444,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8777777777777778,
280
+ "accuracy_ci_low": 0.8,
281
  "accuracy_ci_high": 0.9333333333333333,
282
  "score_name": "accuracy",
283
+ "score": 0.8777777777777778,
284
  "score_ci_high": 0.9333333333333333,
285
+ "score_ci_low": 0.8,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.706060606060606,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.07977207977207977,
296
+ "score": 0.07977207977207977,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.07977207977207977,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.3180428134556575,
307
+ "f1_Organization": 0.2747603833865815,
308
+ "f1_Location": 0.22950819672131148,
309
+ "f1_macro": 0.27410379785451683,
310
+ "recall_macro": 0.23183858884648992,
311
+ "precision_macro": 0.3454658738569018,
312
+ "in_classes_support": 0.5302806499261448,
313
+ "f1_micro": 0.20465890183028285,
314
+ "recall_micro": 0.2342857142857143,
315
+ "precision_micro": 0.18168389955686853,
316
+ "score": 0.20465890183028285,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.1623107942071267,
319
+ "score_ci_high": 0.24302164655950423,
320
+ "f1_micro_ci_low": 0.1623107942071267,
321
+ "f1_micro_ci_high": 0.24302164655950423
322
  },
323
+ "score": 0.20465890183028285,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
 
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.1267605633802817,
340
+ "accuracy_ci_low": 0.056338028169014086,
341
+ "accuracy_ci_high": 0.2112676056338028,
342
  "score_name": "accuracy",
343
+ "score": 0.1267605633802817,
344
+ "score_ci_high": 0.2112676056338028,
345
+ "score_ci_low": 0.056338028169014086,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
  "accuracy": 0.08450704225352113,
350
+ "accuracy_ci_low": 0.04225352112676056,
351
+ "accuracy_ci_high": 0.16901408450704225,
352
  "score_name": "accuracy",
353
  "score": 0.08450704225352113,
354
+ "score_ci_high": 0.16901408450704225,
355
+ "score_ci_low": 0.04225352112676056,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.4084507042253521,
360
+ "accuracy_ci_low": 0.30985915492957744,
361
+ "accuracy_ci_high": 0.5211267605633803,
362
  "score_name": "accuracy",
363
+ "score": 0.4084507042253521,
364
+ "score_ci_high": 0.5211267605633803,
365
+ "score_ci_low": 0.30985915492957744,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.5211267605633803,
370
+ "accuracy_ci_low": 0.39436619718309857,
371
+ "accuracy_ci_high": 0.6338028169014085,
372
  "score_name": "accuracy",
373
+ "score": 0.5211267605633803,
374
+ "score_ci_high": 0.6338028169014085,
375
+ "score_ci_low": 0.39436619718309857,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
 
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.3380281690140845,
390
+ "accuracy_ci_low": 0.22535211267605634,
391
+ "accuracy_ci_high": 0.4507042253521127,
392
  "score_name": "accuracy",
393
+ "score": 0.3380281690140845,
394
+ "score_ci_high": 0.4507042253521127,
395
+ "score_ci_low": 0.22535211267605634,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.4788732394366197,
400
  "accuracy_ci_low": 0.36619718309859156,
401
+ "accuracy_ci_high": 0.5915492957746479,
402
  "score_name": "accuracy",
403
+ "score": 0.4788732394366197,
404
+ "score_ci_high": 0.5915492957746479,
405
  "score_ci_low": 0.36619718309859156,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.36619718309859156,
410
+ "accuracy_ci_low": 0.2676056338028169,
411
+ "accuracy_ci_high": 0.4788732394366197,
412
  "score_name": "accuracy",
413
+ "score": 0.36619718309859156,
414
+ "score_ci_high": 0.4788732394366197,
415
+ "score_ci_low": 0.2676056338028169,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.08450704225352113,
420
+ "accuracy_ci_low": 0.028169014084507043,
421
+ "accuracy_ci_high": 0.16901408450704225,
422
  "score_name": "accuracy",
423
+ "score": 0.08450704225352113,
424
+ "score_ci_high": 0.16901408450704225,
425
+ "score_ci_low": 0.028169014084507043,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.323943661971831,
430
+ "accuracy_ci_low": 0.22535211267605634,
431
+ "accuracy_ci_high": 0.4397440034897243,
432
  "score_name": "accuracy",
433
+ "score": 0.323943661971831,
434
+ "score_ci_high": 0.4397440034897243,
435
+ "score_ci_low": 0.22535211267605634,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.4225352112676056,
440
+ "accuracy_ci_low": 0.30985915492957744,
441
+ "accuracy_ci_high": 0.5492957746478874,
442
  "score_name": "accuracy",
443
+ "score": 0.4225352112676056,
444
+ "score_ci_high": 0.5492957746478874,
445
+ "score_ci_low": 0.30985915492957744,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
  "accuracy": 0.18309859154929578,
450
  "accuracy_ci_low": 0.09859154929577464,
451
+ "accuracy_ci_high": 0.28910654360361887,
452
  "score_name": "accuracy",
453
  "score": 0.18309859154929578,
454
+ "score_ci_high": 0.28910654360361887,
455
  "score_ci_low": 0.09859154929577464,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.5633802816901409,
460
+ "accuracy_ci_low": 0.4507042253521127,
461
+ "accuracy_ci_high": 0.676056338028169,
462
  "score_name": "accuracy",
463
+ "score": 0.5633802816901409,
464
+ "score_ci_high": 0.676056338028169,
465
+ "score_ci_low": 0.4507042253521127,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.3289738430583501,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.09605263157894736,
475
  "f1_suggestive": 0.0,
476
  "f1_generic": 0.0,
477
+ "f1_descriptive": 0.375,
478
  "f1_fanciful": 0.10526315789473684,
 
479
  "f1_arbitrary": 0.0,
480
+ "f1_macro_ci_low": 0.048484848484848485,
481
+ "f1_macro_ci_high": 0.1610036081002675,
482
  "score_name": "f1_micro",
483
+ "score": 0.13333333333333333,
484
+ "score_ci_high": 0.24299065420560748,
485
+ "score_ci_low": 0.05825242718446602,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.08235294117647059,
488
+ "accuracy_ci_low": 0.03529411764705882,
489
+ "accuracy_ci_high": 0.15294117647058825,
490
+ "f1_micro": 0.13333333333333333,
491
+ "f1_micro_ci_low": 0.05825242718446602,
492
+ "f1_micro_ci_high": 0.24299065420560748
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.46628060735944554,
496
+ "f1_no": 0.5809128630705395,
497
+ "f1_yes": 0.3516483516483517,
498
+ "f1_macro_ci_low": 0.3928270444081002,
499
+ "f1_macro_ci_high": 0.5415825326375369,
500
  "score_name": "f1_micro",
501
+ "score": 0.5180722891566265,
502
+ "score_ci_high": 0.58253132966529,
503
+ "score_ci_low": 0.44652531947540486,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.43,
506
+ "accuracy_ci_low": 0.365,
507
+ "accuracy_ci_high": 0.495,
508
+ "f1_micro": 0.5180722891566265,
509
+ "f1_micro_ci_low": 0.44652531947540486,
510
+ "f1_micro_ci_high": 0.58253132966529
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.09994226986403168,
514
  "f1_conclusion": 0.0,
515
+ "f1_decree": 0.14814814814814814,
516
+ "f1_issue": 0.05714285714285714,
517
+ "f1_analysis": 0.3076923076923077,
518
+ "f1_facts": 0.06896551724137931,
519
  "f1_procedural history": 0.0,
520
+ "f1_rule": 0.11764705882352941,
521
+ "f1_macro_ci_low": 0.06467882036635723,
522
+ "f1_macro_ci_high": 0.16271896227970067,
523
  "score_name": "f1_micro",
524
+ "score": 0.1222707423580786,
525
+ "score_ci_high": 0.1896551724137931,
526
+ "score_ci_low": 0.07144817486457739,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.07,
529
+ "accuracy_ci_low": 0.04,
530
+ "accuracy_ci_high": 0.11032816661500704,
531
+ "f1_micro": 0.1222707423580786,
532
+ "f1_micro_ci_low": 0.07144817486457739,
533
+ "f1_micro_ci_high": 0.1896551724137931
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.3036750483558994,
537
+ "f1_yes": 0.18181818181818182,
538
+ "f1_no": 0.425531914893617,
539
+ "f1_macro_ci_low": 0.24346443663418135,
540
+ "f1_macro_ci_high": 0.37985983397043105,
541
  "score_name": "f1_micro",
542
+ "score": 0.33557046979865773,
543
+ "score_ci_high": 0.41208424597764826,
544
+ "score_ci_low": 0.26697141622873294,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.25,
547
+ "accuracy_ci_low": 0.2,
548
  "accuracy_ci_high": 0.315,
549
+ "f1_micro": 0.33557046979865773,
550
+ "f1_micro_ci_low": 0.26697141622873294,
551
+ "f1_micro_ci_high": 0.41208424597764826
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.8285413216920066,
555
+ "f1_yes": 0.821917808219178,
556
+ "f1_no": 0.8351648351648352,
557
+ "f1_macro_ci_low": 0.7370610449522892,
558
+ "f1_macro_ci_high": 0.8925462752093225,
559
  "score_name": "f1_micro",
560
+ "score": 0.8292682926829268,
561
+ "score_ci_high": 0.891566265060241,
562
+ "score_ci_low": 0.7393939393939394,
563
  "num_of_instances": 85,
564
+ "accuracy": 0.8,
565
+ "accuracy_ci_low": 0.7058823529411765,
566
+ "accuracy_ci_high": 0.8705882352941177,
567
+ "f1_micro": 0.8292682926829268,
568
+ "f1_micro_ci_low": 0.7393939393939394,
569
+ "f1_micro_ci_high": 0.891566265060241
570
  },
571
+ "score": 0.3877030254659246,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.2883227310464808,
578
+ "f1_cars": 0.45454545454545453,
579
  "f1_windows x": 0.0,
580
+ "f1_atheism": 0.1,
581
+ "f1_religion": 0.11594202898550725,
582
+ "f1_medicine": 0.5161290322580645,
583
+ "f1_christianity": 0.2,
584
+ "f1_computer graphics": 0.24615384615384617,
585
+ "f1_microsoft windows": 0.2,
586
+ "f1_middle east": 0.23529411764705882,
587
+ "f1_motorcycles": 0.43902439024390244,
588
+ "f1_pc hardware": 0.38095238095238093,
 
589
  "f1_mac hardware": 0.14285714285714285,
590
+ "f1_for sale": 0.2127659574468085,
591
+ "f1_guns": 0.04,
592
+ "f1_space": 0.4166666666666667,
593
+ "f1_cryptography": 0.36065573770491804,
594
+ "f1_baseball": 0.37333333333333335,
595
+ "f1_hockey": 0.6060606060606061,
596
+ "f1_politics": 0.23376623376623376,
597
+ "f1_electronics": 0.49230769230769234,
598
+ "f1_macro_ci_low": 0.2617609426183389,
599
+ "f1_macro_ci_high": 0.31709395962356884,
600
  "score_name": "f1_micro",
601
+ "score": 0.30918595967139656,
602
+ "score_ci_high": 0.3396825281712499,
603
+ "score_ci_low": 0.2773193184164219,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.207,
606
+ "accuracy_ci_low": 0.183,
607
+ "accuracy_ci_high": 0.231,
608
+ "f1_micro": 0.30918595967139656,
609
+ "f1_micro_ci_low": 0.2773193184164219,
610
+ "f1_micro_ci_high": 0.3396825281712499
611
  },
612
+ "score": 0.30918595967139656,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.4658504084132011,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.6103646833013435,
620
+ "f1_checking or savings account": 0.43243243243243246,
621
+ "f1_debt collection": 0.35294117647058826,
622
+ "f1_credit card or prepaid card": 0.4883720930232558,
623
+ "f1_money transfer or virtual currency or money service": 0.48,
624
+ "f1_mortgage": 0.6885245901639344,
625
+ "f1_payday loan or title loan or personal loan": 0.16666666666666666,
626
+ "f1_student loan": 0.5217391304347826,
627
+ "f1_vehicle loan or lease": 0.45161290322580644,
628
+ "f1_macro_ci_low": 0.4151030067042806,
629
+ "f1_macro_ci_high": 0.5374380194007881,
630
  "score_name": "f1_micro",
631
+ "score": 0.5647530040053405,
632
+ "score_ci_high": 0.5946299934512115,
633
+ "score_ci_low": 0.5324708819498815,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.423,
636
+ "accuracy_ci_low": 0.392,
637
+ "accuracy_ci_high": 0.4538616190423828,
638
+ "f1_micro": 0.5647530040053405,
639
+ "f1_micro_ci_low": 0.5324708819498815,
640
+ "f1_micro_ci_high": 0.5946299934512115
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.6154980967666208,
644
+ "f1_mortgages and loans": 0.6794871794871795,
645
+ "f1_credit card": 0.6394557823129252,
646
+ "f1_debt collection": 0.5405405405405406,
647
+ "f1_credit reporting": 0.6307053941908713,
648
+ "f1_retail banking": 0.5873015873015873,
649
+ "f1_macro_ci_low": 0.5757444877146287,
650
+ "f1_macro_ci_high": 0.6632621576134956,
651
  "score_name": "f1_micro",
652
+ "score": 0.6152046783625731,
653
+ "score_ci_high": 0.6592588246755606,
654
+ "score_ci_low": 0.5758293838862559,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.526,
657
+ "accuracy_ci_low": 0.488,
658
+ "accuracy_ci_high": 0.572065074842346,
659
+ "f1_micro": 0.6152046783625731,
660
+ "f1_micro_ci_low": 0.5758293838862559,
661
+ "f1_micro_ci_high": 0.6592588246755606
662
  },
663
+ "score": 0.5899788411839568,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "program_accuracy": 0.03,
671
+ "score": 0.03,
 
672
  "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.03,
674
+ "program_accuracy_ci_low": 0.02,
675
+ "program_accuracy_ci_high": 0.041,
676
+ "score_ci_low": 0.02,
677
+ "score_ci_high": 0.041,
678
+ "execution_accuracy_ci_low": 0.02,
679
+ "execution_accuracy_ci_high": 0.042
680
  },
681
+ "score": 0.03,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.27959949741292506,
688
+ "recall": 0.5616895325224012,
689
+ "f1": 0.3093936462410963,
690
+ "precision_ci_low": 0.26037333098993004,
691
+ "precision_ci_high": 0.2998496234035471,
692
+ "recall_ci_low": 0.545189324623663,
693
+ "recall_ci_high": 0.5772522419606227,
694
+ "f1_ci_low": 0.29233863350762596,
695
+ "f1_ci_high": 0.3260584980677657,
696
  "score_name": "f1",
697
+ "score": 0.3093936462410963,
698
+ "score_ci_high": 0.3260584980677657,
699
+ "score_ci_low": 0.29233863350762596,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5708662017683188,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6597496373951435,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5238148674865564,
704
+ "faithfullness_f1_token_overlap": 0.3063464322583528,
705
+ "faithfullness_recall_token_overlap": 0.2515040891461165,
706
+ "faithfullness_precision_token_overlap": 0.5201254248503554,
707
+ "correctness_f1_token_overlap": 0.3093936462410963,
708
+ "correctness_recall_token_overlap": 0.5616895325224012,
709
+ "correctness_precision_token_overlap": 0.27959949741292506
710
  },
711
+ "score": 0.3093936462410963,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.572,
718
+ "accuracy_ci_low": 0.5408695080796895,
719
+ "accuracy_ci_high": 0.602,
720
  "score_name": "accuracy",
721
+ "score": 0.572,
722
+ "score_ci_high": 0.602,
723
+ "score_ci_low": 0.5408695080796895,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.878,
728
+ "accuracy_ci_low": 0.846,
729
+ "accuracy_ci_high": 0.9048842512733182,
730
  "score_name": "accuracy",
731
+ "score": 0.878,
732
+ "score_ci_high": 0.9048842512733182,
733
+ "score_ci_low": 0.846,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.725,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
+ "severity_critical": 2.0,
743
+ "severity_high": 2.0,
744
+ "severity_medium": 1.0,
745
+ "severity_low": 95.0,
746
+ "category_harmful_info": 0.8059234746569206,
747
+ "category_deception": 0.7821050891370483,
748
+ "category_explicit_content": 0.8131378136499965,
749
+ "category_violence": 0.7824397089225905,
750
+ "category_discrimination": 0.7646755935653806,
751
+ "category_substance_abuse": 0.7601792050732505,
752
+ "category_pii": 0.7583640239856861,
753
+ "safety": 0.7892763628562292,
754
+ "safety_ci_low": 0.7568372385512269,
755
+ "safety_ci_high": 0.8199271972816702,
756
  "score_name": "safety",
757
+ "score": 0.7892763628562292,
758
+ "score_ci_high": 0.8199271972816702,
759
+ "score_ci_low": 0.7568372385512269,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.7892763628562292,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rougeL": 0.26029440810817217,
770
+ "score": 0.26029440810817217,
771
  "score_name": "rougeL",
772
+ "rouge1": 0.37812164542305304,
773
+ "rougeLsum": 0.3224592121117338,
774
+ "rouge2": 0.1735255109874066,
775
+ "rougeL_ci_low": 0.25326019704193026,
776
+ "rougeL_ci_high": 0.2676771085873056,
777
+ "score_ci_low": 0.25326019704193026,
778
+ "score_ci_high": 0.2676771085873056,
779
+ "rouge1_ci_low": 0.36799150913690026,
780
+ "rouge1_ci_high": 0.3873907236511595,
781
+ "rougeLsum_ci_low": 0.3130244535911371,
782
+ "rougeLsum_ci_high": 0.33132370273160416,
783
+ "rouge2_ci_low": 0.16695272546253506,
784
+ "rouge2_ci_high": 0.18091296094174728
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rougeL": 0.09390373141967649,
789
+ "score": 0.09390373141967649,
790
  "score_name": "rougeL",
791
+ "rouge1": 0.12667295475773943,
792
+ "rougeLsum": 0.10750099464975159,
793
+ "rouge2": 0.01898752670157484,
794
+ "rougeL_ci_low": 0.08937719185127607,
795
+ "rougeL_ci_high": 0.09832477865928776,
796
+ "score_ci_low": 0.08937719185127607,
797
+ "score_ci_high": 0.09832477865928776,
798
+ "rouge1_ci_low": 0.12055496978215834,
799
+ "rouge1_ci_high": 0.13195192205981862,
800
+ "rougeLsum_ci_low": 0.10257751488972287,
801
+ "rougeLsum_ci_high": 0.11236007191539416,
802
+ "rouge2_ci_low": 0.01673170907001085,
803
+ "rouge2_ci_high": 0.02151839340904419
804
  },
805
+ "score": 0.17709906976392434,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1176,
814
+ 681,
815
+ 439,
816
+ 289
817
  ],
818
  "totals": [
819
+ 1805,
820
+ 1739,
821
+ 1673,
822
+ 1607
823
  ],
824
  "precisions": [
825
+ 0.6515235457063712,
826
+ 0.39160437032777456,
827
+ 0.26240286909742977,
828
+ 0.17983820784069696
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 1805,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.33125088532286445,
834
+ "score": 0.33125088532286445,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.28827207319430564,
837
+ "score_ci_high": 0.3750069277361591,
838
+ "sacrebleu_ci_low": 0.28827207319430564,
839
+ "sacrebleu_ci_high": 0.3750069277361591
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1224,
845
+ 746,
846
+ 495,
847
+ 332
848
  ],
849
  "totals": [
850
+ 1783,
851
+ 1717,
852
+ 1651,
853
+ 1585
854
  ],
855
  "precisions": [
856
+ 0.6864834548513741,
857
+ 0.4344787419918463,
858
+ 0.29981829194427617,
859
+ 0.20946372239747635
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 1783,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.36994859939150276,
865
+ "score": 0.36994859939150276,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.3148788426694038,
868
+ "score_ci_high": 0.4105592207294174,
869
+ "sacrebleu_ci_low": 0.3148788426694038,
870
+ "sacrebleu_ci_high": 0.4105592207294174
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
+ 215,
876
+ 39,
877
+ 18,
878
+ 6
879
  ],
880
  "totals": [
881
+ 3272,
882
+ 3206,
883
+ 3140,
884
+ 3074
885
  ],
886
  "precisions": [
887
+ 0.06570904645476773,
888
+ 0.012164691203992516,
889
+ 0.005732484076433121,
890
+ 0.001951854261548471
891
  ],
892
  "bp": 1.0,
893
+ "sys_len": 3272,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.009724765205835872,
896
+ "score": 0.009724765205835872,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.00598430838144436,
899
+ "score_ci_high": 0.015966807436499916,
900
+ "sacrebleu_ci_low": 0.00598430838144436,
901
+ "sacrebleu_ci_high": 0.015966807436499916
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 1112,
907
+ 617,
908
+ 379,
909
+ 249
910
  ],
911
  "totals": [
912
+ 1879,
913
+ 1813,
914
+ 1747,
915
+ 1681
916
  ],
917
  "precisions": [
918
+ 0.5918041511442257,
919
+ 0.34031991174848314,
920
+ 0.21694333142530053,
921
+ 0.14812611540749554
922
  ],
923
+ "bp": 1.0,
924
+ "sys_len": 1879,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.283635656706225,
927
+ "score": 0.283635656706225,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.25483093745296115,
930
+ "score_ci_high": 0.3284369566535284,
931
+ "sacrebleu_ci_low": 0.25483093745296115,
932
+ "sacrebleu_ci_high": 0.3284369566535284
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1422,
938
+ 990,
939
+ 740,
940
+ 568
941
  ],
942
  "totals": [
943
+ 2012,
944
+ 1946,
945
+ 1880,
946
+ 1814
947
  ],
948
  "precisions": [
949
+ 0.7067594433399603,
950
+ 0.5087358684480986,
951
+ 0.39361702127659576,
952
+ 0.3131201764057332
953
  ],
954
+ "bp": 0.9725507672852267,
955
+ "sys_len": 2012,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.44622048751232035,
958
+ "score": 0.44622048751232035,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.4016486544135287,
961
+ "score_ci_high": 0.492468721152314,
962
+ "sacrebleu_ci_low": 0.4016486544135287,
963
+ "sacrebleu_ci_high": 0.492468721152314
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 983,
969
+ 445,
970
+ 221,
971
+ 116
972
  ],
973
  "totals": [
974
+ 2522,
975
+ 2456,
976
+ 2390,
977
+ 2324
978
  ],
979
  "precisions": [
980
+ 0.3897700237906423,
981
+ 0.18118892508143322,
982
+ 0.09246861924686192,
983
+ 0.04991394148020654
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 2522,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.13436590468744522,
989
+ "score": 0.13436590468744522,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.10668449271679148,
992
+ "score_ci_high": 0.16331801147285638,
993
+ "sacrebleu_ci_low": 0.10668449271679148,
994
+ "sacrebleu_ci_high": 0.16331801147285638
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1397,
1000
+ 976,
1001
+ 718,
1002
+ 534
1003
  ],
1004
  "totals": [
1005
+ 1885,
1006
+ 1819,
1007
+ 1753,
1008
+ 1687
1009
  ],
1010
  "precisions": [
1011
+ 0.7411140583554376,
1012
+ 0.536558548653106,
1013
+ 0.40958357102110665,
1014
+ 0.31653823355068167
1015
  ],
1016
+ "bp": 0.9836888676493653,
1017
+ "sys_len": 1885,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.4687329402986153,
1020
+ "score": 0.4687329402986153,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.4090027394886287,
1023
+ "score_ci_high": 0.5072542898902915,
1024
+ "sacrebleu_ci_low": 0.4090027394886287,
1025
+ "sacrebleu_ci_high": 0.5072542898902915
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 1075,
1031
+ 589,
1032
+ 357,
1033
+ 218
1034
  ],
1035
  "totals": [
1036
+ 2037,
1037
+ 1971,
1038
+ 1905,
1039
+ 1839
1040
  ],
1041
  "precisions": [
1042
+ 0.5277368679430535,
1043
+ 0.29883307965499745,
1044
+ 0.1874015748031496,
1045
+ 0.11854268624252312
1046
  ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 2037,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.24328959002445089,
1051
+ "score": 0.24328959002445089,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.2108781721763864,
1054
+ "score_ci_high": 0.2866413776406142,
1055
+ "sacrebleu_ci_low": 0.2108781721763864,
1056
+ "sacrebleu_ci_high": 0.2866413776406142
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1223,
1062
+ 645,
1063
+ 371,
1064
+ 219
1065
  ],
1066
  "totals": [
1067
+ 2012,
1068
+ 1946,
1069
+ 1880,
1070
+ 1814
1071
  ],
1072
  "precisions": [
1073
+ 0.6078528827037774,
1074
+ 0.3314491264131552,
1075
+ 0.1973404255319149,
1076
+ 0.12072767364939362
1077
  ],
1078
+ "bp": 0.9581570887075945,
1079
+ "sys_len": 2012,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.25220069551672647,
1082
+ "score": 0.25220069551672647,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.21455240851266189,
1085
+ "score_ci_high": 0.28127381859222883,
1086
+ "sacrebleu_ci_low": 0.21455240851266189,
1087
+ "sacrebleu_ci_high": 0.28127381859222883
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1265,
1093
+ 811,
1094
  564,
1095
+ 403
1096
  ],
1097
  "totals": [
1098
+ 1799,
1099
+ 1733,
1100
+ 1667,
1101
+ 1601
1102
  ],
1103
  "precisions": [
1104
+ 0.7031684269038355,
1105
+ 0.4679746105020196,
1106
+ 0.33833233353329334,
1107
+ 0.2517176764522173
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 1799,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.40915203730030597,
1113
+ "score": 0.40915203730030597,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.37740759275198177,
1116
+ "score_ci_high": 0.45339812071532676,
1117
+ "sacrebleu_ci_low": 0.37740759275198177,
1118
+ "sacrebleu_ci_high": 0.45339812071532676
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 1032,
1124
+ 484,
1125
+ 292,
1126
+ 181
1127
  ],
1128
  "totals": [
1129
+ 1873,
1130
+ 1807,
1131
+ 1741,
1132
+ 1675
1133
  ],
1134
  "precisions": [
1135
+ 0.5509877202349173,
1136
+ 0.26784726065301606,
1137
+ 0.16771970132107986,
1138
+ 0.10805970149253731
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 1873,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.22741507142824863,
1144
+ "score": 0.22741507142824863,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.19114102819495998,
1147
+ "score_ci_high": 0.26211626317047076,
1148
+ "sacrebleu_ci_low": 0.19114102819495998,
1149
+ "sacrebleu_ci_high": 0.26211626317047076
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 1019,
1155
+ 503,
1156
+ 310,
1157
+ 204
1158
  ],
1159
  "totals": [
1160
+ 1801,
1161
+ 1735,
1162
+ 1669,
1163
+ 1603
1164
  ],
1165
  "precisions": [
1166
+ 0.5657967795669073,
1167
+ 0.28991354466858793,
1168
+ 0.18573996405032955,
1169
+ 0.1272613849033063
1170
  ],
1171
  "bp": 1.0,
1172
+ "sys_len": 1801,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.24953573320283845,
1175
+ "score": 0.24953573320283845,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.20393561832427076,
1178
+ "score_ci_high": 0.27875367055825584,
1179
+ "sacrebleu_ci_low": 0.20393561832427076,
1180
+ "sacrebleu_ci_high": 0.27875367055825584
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1285,
1186
  864,
1187
+ 625,
1188
  463
1189
  ],
1190
  "totals": [
1191
+ 1899,
1192
+ 1833,
1193
+ 1767,
1194
+ 1701
1195
  ],
1196
  "precisions": [
1197
+ 0.6766719325961031,
1198
+ 0.4713584288052373,
1199
+ 0.35370684776457273,
1200
+ 0.2721928277483833
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 1899,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.4186126965889709,
1206
+ "score": 0.4186126965889709,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.3580478494686714,
1209
+ "score_ci_high": 0.47721854835474,
1210
+ "sacrebleu_ci_low": 0.3580478494686714,
1211
+ "sacrebleu_ci_high": 0.47721854835474
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1286,
1217
+ 859,
1218
+ 598,
1219
+ 417
1220
  ],
1221
  "totals": [
1222
+ 1799,
1223
+ 1733,
1224
+ 1667,
1225
+ 1601
1226
  ],
1227
  "precisions": [
1228
+ 0.7148415786548082,
1229
+ 0.4956722446624351,
1230
+ 0.3587282543491302,
1231
+ 0.26046221111805123
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 1799,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.42655857647405626,
1237
+ "score": 0.42655857647405626,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.37843554301324095,
1240
+ "score_ci_high": 0.4676738192216758,
1241
+ "sacrebleu_ci_low": 0.37843554301324095,
1242
+ "sacrebleu_ci_high": 0.4676738192216758
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1178,
1248
+ 657,
1249
+ 409,
1250
+ 266
1251
  ],
1252
  "totals": [
1253
+ 1874,
1254
+ 1808,
1255
+ 1742,
1256
+ 1676
1257
  ],
1258
  "precisions": [
1259
+ 0.6286019210245464,
1260
+ 0.36338495575221236,
1261
+ 0.23478760045924227,
1262
+ 0.15871121718377088
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 1874,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.3037430550349969,
1268
+ "score": 0.3037430550349969,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.26128349825222114,
1271
+ "score_ci_high": 0.3531214265467947,
1272
+ "sacrebleu_ci_low": 0.26128349825222114,
1273
+ "sacrebleu_ci_high": 0.3531214265467947
1274
  },
1275
+ "score": 0.3049591129796936,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.38015857299104155,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/{2025-06-21T09-36-54_evaluation_results.json → 2025-06-23T14-18-29_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-21T13:36:49.808623Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,13 +176,13 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.7777777777777778,
180
- "accuracy_ci_low": 0.6777777777777778,
181
  "accuracy_ci_high": 0.8555555555555555,
182
  "score_name": "accuracy",
183
- "score": 0.7777777777777778,
184
  "score_ci_high": 0.8555555555555555,
185
- "score_ci_low": 0.6777777777777778,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
@@ -216,13 +216,13 @@
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 1.0,
220
- "accuracy_ci_low": 1.0,
221
  "accuracy_ci_high": 1.0,
222
  "score_name": "accuracy",
223
- "score": 1.0,
224
  "score_ci_high": 1.0,
225
- "score_ci_low": 1.0,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
@@ -276,51 +276,51 @@
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.8666666666666667,
280
- "accuracy_ci_low": 0.7888888888888889,
281
- "accuracy_ci_high": 0.9333333333333333,
282
  "score_name": "accuracy",
283
- "score": 0.8666666666666667,
284
- "score_ci_high": 0.9333333333333333,
285
- "score_ci_low": 0.7888888888888889,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.9636363636363636,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.5744680851063829,
307
- "f1_Organization": 0.3625,
308
- "f1_Location": 0.3764705882352941,
309
- "f1_macro": 0.4378128911138923,
310
- "recall_macro": 0.39500837147693896,
311
- "precision_macro": 0.49432322991751926,
312
- "in_classes_support": 0.6292466765140325,
313
- "f1_micro": 0.3560732113144759,
314
- "recall_micro": 0.4076190476190476,
315
- "precision_micro": 0.31610044313146235,
316
- "score": 0.3560732113144759,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.31585267626167035,
319
- "score_ci_high": 0.4082105948099384,
320
- "f1_micro_ci_low": 0.31585267626167035,
321
- "f1_micro_ci_high": 0.4082105948099384
322
  },
323
- "score": 0.3560732113144759,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
@@ -336,23 +336,23 @@
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.19718309859154928,
340
  "accuracy_ci_low": 0.11267605633802817,
341
- "accuracy_ci_high": 0.30985915492957744,
342
  "score_name": "accuracy",
343
- "score": 0.19718309859154928,
344
- "score_ci_high": 0.30985915492957744,
345
  "score_ci_low": 0.11267605633802817,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
- "accuracy": 0.1267605633802817,
350
- "accuracy_ci_low": 0.056338028169014086,
351
- "accuracy_ci_high": 0.22535211267605634,
352
  "score_name": "accuracy",
353
- "score": 0.1267605633802817,
354
- "score_ci_high": 0.22535211267605634,
355
- "score_ci_low": 0.056338028169014086,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
@@ -377,52 +377,52 @@
377
  },
378
  "mmlu_pro_engineering": {
379
  "accuracy": 0.323943661971831,
380
- "accuracy_ci_low": 0.2112676056338028,
381
  "accuracy_ci_high": 0.43661971830985913,
382
  "score_name": "accuracy",
383
  "score": 0.323943661971831,
384
  "score_ci_high": 0.43661971830985913,
385
- "score_ci_low": 0.2112676056338028,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.5492957746478874,
390
- "accuracy_ci_low": 0.43661971830985913,
391
- "accuracy_ci_high": 0.6619718309859155,
392
  "score_name": "accuracy",
393
- "score": 0.5492957746478874,
394
- "score_ci_high": 0.6619718309859155,
395
- "score_ci_low": 0.43661971830985913,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.6901408450704225,
400
- "accuracy_ci_low": 0.5774647887323944,
401
- "accuracy_ci_high": 0.7887323943661971,
402
  "score_name": "accuracy",
403
- "score": 0.6901408450704225,
404
- "score_ci_high": 0.7887323943661971,
405
- "score_ci_low": 0.5774647887323944,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.5774647887323944,
410
- "accuracy_ci_low": 0.4647887323943662,
411
  "accuracy_ci_high": 0.6901408450704225,
412
  "score_name": "accuracy",
413
- "score": 0.5774647887323944,
414
  "score_ci_high": 0.6901408450704225,
415
- "score_ci_low": 0.4647887323943662,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
  "accuracy": 0.29577464788732394,
420
- "accuracy_ci_low": 0.2112676056338028,
421
- "accuracy_ci_high": 0.415768498221757,
422
  "score_name": "accuracy",
423
  "score": 0.29577464788732394,
424
- "score_ci_high": 0.415768498221757,
425
- "score_ci_low": 0.2112676056338028,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
@@ -438,21 +438,21 @@
438
  "mmlu_pro_philosophy": {
439
  "accuracy": 0.5915492957746479,
440
  "accuracy_ci_low": 0.4647887323943662,
441
- "accuracy_ci_high": 0.6901408450704225,
442
  "score_name": "accuracy",
443
  "score": 0.5915492957746479,
444
- "score_ci_high": 0.6901408450704225,
445
  "score_ci_low": 0.4647887323943662,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.3380281690140845,
450
- "accuracy_ci_low": 0.22535211267605634,
451
- "accuracy_ci_high": 0.4507042253521127,
452
  "score_name": "accuracy",
453
- "score": 0.3380281690140845,
454
- "score_ci_high": 0.4507042253521127,
455
- "score_ci_low": 0.22535211267605634,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
@@ -465,90 +465,90 @@
465
  "score_ci_low": 0.5915492957746479,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.48893360160965793,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.6576598179941833,
475
  "f1_suggestive": 0.5882352941176471,
476
  "f1_generic": 0.72,
477
  "f1_descriptive": 0.6818181818181818,
478
- "f1_fanciful": 0.6666666666666666,
479
- "f1_arbitrary": 0.631578947368421,
480
- "f1_macro_ci_low": 0.5539989867710009,
481
- "f1_macro_ci_high": 0.7630121059181559,
482
  "score_name": "f1_micro",
483
  "score": 0.6547619047619048,
484
- "score_ci_high": 0.75,
485
- "score_ci_low": 0.5476190476190477,
486
  "num_of_instances": 85,
487
  "accuracy": 0.6470588235294118,
488
  "accuracy_ci_low": 0.5411764705882353,
489
  "accuracy_ci_high": 0.7411764705882353,
490
  "f1_micro": 0.6547619047619048,
491
- "f1_micro_ci_low": 0.5476190476190477,
492
- "f1_micro_ci_high": 0.75
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.5644255319148936,
496
- "f1_no": 0.6808510638297872,
497
- "f1_yes": 0.448,
498
- "f1_macro_ci_low": 0.4955233440030641,
499
- "f1_macro_ci_high": 0.637195719539535,
500
  "score_name": "f1_micro",
501
- "score": 0.6,
502
- "score_ci_high": 0.6666666666666666,
503
- "score_ci_low": 0.5292645910446814,
504
  "num_of_instances": 200,
505
- "accuracy": 0.54,
506
- "accuracy_ci_low": 0.47,
507
  "accuracy_ci_high": 0.605,
508
- "f1_micro": 0.6,
509
- "f1_micro_ci_low": 0.5292645910446814,
510
- "f1_micro_ci_high": 0.6666666666666666
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.26417086507814297,
514
- "f1_conclusion": 0.15384615384615385,
515
- "f1_decree": 0.13333333333333333,
516
  "f1_issue": 0.18181818181818182,
517
- "f1_analysis": 0.5633802816901409,
518
- "f1_facts": 0.13793103448275862,
519
- "f1_procedural history": 0.3384615384615385,
520
- "f1_rule": 0.3404255319148936,
521
- "f1_macro_ci_low": 0.2096003414946908,
522
- "f1_macro_ci_high": 0.3335047284451988,
523
  "score_name": "f1_micro",
524
- "score": 0.3076923076923077,
525
- "score_ci_high": 0.37785141927318056,
526
- "score_ci_low": 0.24264738787974433,
527
  "num_of_instances": 200,
528
- "accuracy": 0.25,
529
- "accuracy_ci_low": 0.195,
530
- "accuracy_ci_high": 0.315,
531
- "f1_micro": 0.3076923076923077,
532
- "f1_micro_ci_low": 0.24264738787974433,
533
- "f1_micro_ci_high": 0.37785141927318056
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.463898065125059,
537
- "f1_yes": 0.4662576687116564,
538
- "f1_no": 0.46153846153846156,
539
- "f1_macro_ci_low": 0.39561838155337925,
540
- "f1_macro_ci_high": 0.5401290279855109,
541
  "score_name": "f1_micro",
542
- "score": 0.463855421686747,
543
- "score_ci_high": 0.5388200914855517,
544
- "score_ci_low": 0.3937587086040863,
545
  "num_of_instances": 200,
546
- "accuracy": 0.385,
547
- "accuracy_ci_low": 0.32,
548
- "accuracy_ci_high": 0.455,
549
- "f1_micro": 0.463855421686747,
550
- "f1_micro_ci_low": 0.3937587086040863,
551
- "f1_micro_ci_high": 0.5388200914855517
552
  },
553
  "legalbench_proa": {
554
  "f1_macro": 0.75,
@@ -568,172 +568,172 @@
568
  "f1_micro_ci_low": 0.6573326079878734,
569
  "f1_micro_ci_high": 0.8234882632928148
570
  },
571
- "score": 0.5552619268281919,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.6134758195135697,
578
- "f1_cars": 0.8297872340425532,
579
- "f1_windows x": 0.14492753623188406,
580
- "f1_computer graphics": 0.41379310344827586,
581
- "f1_atheism": 0.48148148148148145,
582
- "f1_religion": 0.24561403508771928,
583
- "f1_medicine": 0.8372093023255814,
584
- "f1_christianity": 0.7878787878787878,
585
- "f1_microsoft windows": 0.611764705882353,
586
- "f1_middle east": 0.5970149253731343,
587
- "f1_motorcycles": 0.7128712871287128,
588
- "f1_pc hardware": 0.5736434108527132,
589
- "f1_mac hardware": 0.6382978723404256,
590
- "f1_electronics": 0.6436781609195402,
591
- "f1_for sale": 0.7027027027027027,
592
- "f1_guns": 0.34375,
593
- "f1_space": 0.8269230769230769,
594
- "f1_cryptography": 0.6486486486486487,
595
- "f1_baseball": 0.9122807017543859,
596
  "f1_politics": 0.3787878787878788,
597
- "f1_hockey": 0.9384615384615385,
598
- "f1_macro_ci_low": 0.5870232871618599,
599
- "f1_macro_ci_high": 0.6399401092520309,
600
  "score_name": "f1_micro",
601
- "score": 0.6360655737704918,
602
- "score_ci_high": 0.6652078896587189,
603
- "score_ci_low": 0.606009312496833,
604
  "num_of_instances": 1000,
605
- "accuracy": 0.582,
606
- "accuracy_ci_low": 0.549,
607
- "accuracy_ci_high": 0.61,
608
- "f1_micro": 0.6360655737704918,
609
- "f1_micro_ci_low": 0.606009312496833,
610
- "f1_micro_ci_high": 0.6652078896587189
611
  },
612
- "score": 0.6360655737704918,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.7310250832696968,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9301994301994302,
620
- "f1_payday loan or title loan or personal loan": 0.2222222222222222,
621
- "f1_mortgage": 0.8450704225352113,
622
- "f1_credit card or prepaid card": 0.7596899224806202,
623
- "f1_debt collection": 0.6887417218543046,
624
- "f1_vehicle loan or lease": 0.7906976744186046,
625
- "f1_checking or savings account": 0.8210526315789474,
626
- "f1_money transfer or virtual currency or money service": 0.625,
627
- "f1_student loan": 0.896551724137931,
628
- "f1_macro_ci_low": 0.6859846742091775,
629
- "f1_macro_ci_high": 0.7782039487360795,
630
  "score_name": "f1_micro",
631
- "score": 0.8752515090543259,
632
- "score_ci_high": 0.8950697745652848,
633
- "score_ci_low": 0.8535042875641251,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.87,
636
- "accuracy_ci_low": 0.848,
637
- "accuracy_ci_high": 0.8900650893159325,
638
- "f1_micro": 0.8752515090543259,
639
- "f1_micro_ci_low": 0.8535042875641251,
640
- "f1_micro_ci_high": 0.8950697745652848
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.7446539429987555,
644
- "f1_mortgages and loans": 0.8,
645
  "f1_credit card": 0.7865168539325843,
646
- "f1_debt collection": 0.6972477064220184,
647
- "f1_credit reporting": 0.7835051546391752,
648
- "f1_retail banking": 0.656,
649
- "f1_macro_ci_low": 0.7076662872810797,
650
- "f1_macro_ci_high": 0.7839689190392729,
651
  "score_name": "f1_micro",
652
- "score": 0.75177304964539,
653
- "score_ci_high": 0.7885153719683515,
654
- "score_ci_low": 0.7144890867826504,
655
  "num_of_instances": 500,
656
- "accuracy": 0.742,
657
- "accuracy_ci_low": 0.7044918597766052,
658
- "accuracy_ci_high": 0.78,
659
- "f1_micro": 0.75177304964539,
660
- "f1_micro_ci_low": 0.7144890867826504,
661
- "f1_micro_ci_high": 0.7885153719683515
662
  },
663
- "score": 0.8135122793498579,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "program_accuracy": 0.206,
671
- "score": 0.206,
672
  "score_name": "program_accuracy",
673
- "execution_accuracy": 0.183,
674
- "program_accuracy_ci_low": 0.18192581131748398,
675
- "program_accuracy_ci_high": 0.233,
676
- "score_ci_low": 0.18192581131748398,
677
- "score_ci_high": 0.233,
678
- "execution_accuracy_ci_low": 0.16,
679
- "execution_accuracy_ci_high": 0.206
680
  },
681
- "score": 0.206,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.2829195176672266,
688
- "recall": 0.5895069651787405,
689
- "f1": 0.3247642449571121,
690
- "precision_ci_low": 0.2632314020521836,
691
- "precision_ci_high": 0.3018747287495355,
692
- "recall_ci_low": 0.5757048544050242,
693
- "recall_ci_high": 0.605461197768975,
694
- "f1_ci_low": 0.3070149800069332,
695
- "f1_ci_high": 0.3415261837814938,
696
  "score_name": "f1",
697
- "score": 0.3247642449571121,
698
- "score_ci_high": 0.3415261837814938,
699
- "score_ci_low": 0.3070149800069332,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5840552796920141,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6764627607663473,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5322982901086409,
704
- "faithfullness_f1_token_overlap": 0.3161988054051969,
705
- "faithfullness_recall_token_overlap": 0.2527973404819663,
706
- "faithfullness_precision_token_overlap": 0.5297932448406707,
707
- "correctness_f1_token_overlap": 0.3247642449571121,
708
- "correctness_recall_token_overlap": 0.5895069651787405,
709
- "correctness_precision_token_overlap": 0.2829195176672266
710
  },
711
- "score": 0.3247642449571121,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.568,
718
- "accuracy_ci_low": 0.536,
719
- "accuracy_ci_high": 0.597,
720
  "score_name": "accuracy",
721
- "score": 0.568,
722
- "score_ci_high": 0.597,
723
- "score_ci_low": 0.536,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.922,
728
- "accuracy_ci_low": 0.8937961961124387,
729
- "accuracy_ci_high": 0.9425849640401368,
730
  "score_name": "accuracy",
731
- "score": 0.922,
732
- "score_ci_high": 0.9425849640401368,
733
- "score_ci_low": 0.8937961961124387,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.745,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
@@ -741,68 +741,68 @@
741
  "attaq_500": {
742
  "severity_critical": 0.0,
743
  "severity_high": 0.0,
744
- "severity_medium": 1.0,
745
- "severity_low": 99.0,
746
- "category_harmful_info": 0.8334540272453089,
747
- "category_deception": 0.8512123076602666,
748
- "category_explicit_content": 0.8684844600431846,
749
- "category_violence": 0.8207998569862591,
750
- "category_discrimination": 0.7722197345324925,
751
- "category_substance_abuse": 0.8600103222661548,
752
- "category_pii": 0.8025329068855003,
753
- "safety": 0.8309456067584041,
754
- "safety_ci_low": 0.812541088262446,
755
- "safety_ci_high": 0.8500828095785149,
756
  "score_name": "safety",
757
- "score": 0.8309456067584041,
758
- "score_ci_high": 0.8500828095785149,
759
- "score_ci_low": 0.812541088262446,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.8309456067584041,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rouge1": 0.4363286082688575,
770
- "rouge2": 0.22072608165229177,
771
- "rougeL": 0.3067201488969288,
772
- "score": 0.3067201488969288,
 
773
  "score_name": "rougeL",
774
- "rougeLsum": 0.37348424474826925,
775
- "rouge1_ci_low": 0.42622540172251744,
776
- "rouge1_ci_high": 0.445479768051287,
777
- "rouge2_ci_low": 0.2133469390728694,
778
- "rouge2_ci_high": 0.22852486335907615,
779
- "rougeL_ci_low": 0.2990472341490495,
780
- "rougeL_ci_high": 0.31469167779345575,
781
- "score_ci_low": 0.2990472341490495,
782
- "score_ci_high": 0.31469167779345575,
783
- "rougeLsum_ci_low": 0.36439724635298765,
784
- "rougeLsum_ci_high": 0.3821672398567385
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rouge1": 0.13281253260941045,
789
- "rouge2": 0.020216836126571446,
790
- "rougeL": 0.0953008337162679,
791
- "score": 0.0953008337162679,
 
792
  "score_name": "rougeL",
793
- "rougeLsum": 0.10871871026938495,
794
- "rouge1_ci_low": 0.1270654167753335,
795
- "rouge1_ci_high": 0.13798234251944885,
796
- "rouge2_ci_low": 0.01830191124480353,
797
- "rouge2_ci_high": 0.022113898574922495,
798
- "rougeL_ci_low": 0.09124510500356316,
799
- "rougeL_ci_high": 0.098681444438937,
800
- "score_ci_low": 0.09124510500356316,
801
- "score_ci_high": 0.098681444438937,
802
- "rougeLsum_ci_low": 0.10412203835097511,
803
- "rougeLsum_ci_high": 0.11266943648159043
804
  },
805
- "score": 0.20101049130659834,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,473 +810,473 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 1297,
814
- 853,
815
- 602,
816
- 432
817
  ],
818
  "totals": [
819
- 1773,
820
- 1707,
821
- 1641,
822
- 1575
823
  ],
824
  "precisions": [
825
- 0.7315284827975184,
826
- 0.4997070884592853,
827
- 0.3668494820231566,
828
- 0.27428571428571424
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 1773,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.4379348051104114,
834
- "score": 0.4379348051104114,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.3839572779473718,
837
- "score_ci_high": 0.47542268740294724,
838
- "sacrebleu_ci_low": 0.3839572779473718,
839
- "sacrebleu_ci_high": 0.47542268740294724
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1336,
845
- 889,
846
- 624,
847
- 446
848
  ],
849
  "totals": [
850
- 1793,
851
- 1727,
852
- 1661,
853
- 1595
854
  ],
855
  "precisions": [
856
- 0.7451199107640825,
857
- 0.5147654892877823,
858
- 0.37567730282962075,
859
- 0.27962382445141065
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 1793,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.4480290559468694,
865
- "score": 0.4480290559468694,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.40758551741171895,
868
- "score_ci_high": 0.49424014474329564,
869
- "sacrebleu_ci_low": 0.40758551741171895,
870
- "sacrebleu_ci_high": 0.49424014474329564
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
- 908,
876
- 498,
877
- 301,
878
- 181
879
  ],
880
  "totals": [
881
- 1600,
882
- 1534,
883
- 1468,
884
- 1402
885
  ],
886
  "precisions": [
887
- 0.5675,
888
- 0.3246414602346806,
889
- 0.20504087193460488,
890
- 0.1291012838801712
891
  ],
892
- "bp": 1.0,
893
- "sys_len": 1600,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.26426230123240096,
896
- "score": 0.26426230123240096,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.2206088677339497,
899
- "score_ci_high": 0.3043911445742578,
900
- "sacrebleu_ci_low": 0.2206088677339497,
901
- "sacrebleu_ci_high": 0.3043911445742578
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 1252,
907
- 762,
908
- 513,
909
- 371
910
  ],
911
  "totals": [
912
- 1850,
913
- 1784,
914
- 1718,
915
- 1652
916
  ],
917
  "precisions": [
918
- 0.6767567567567567,
919
- 0.42713004484304934,
920
- 0.29860302677532014,
921
- 0.2245762711864407
922
  ],
923
  "bp": 1.0,
924
- "sys_len": 1850,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.37313217401540816,
927
- "score": 0.37313217401540816,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.3292059035458714,
930
- "score_ci_high": 0.4247888842695035,
931
- "sacrebleu_ci_low": 0.3292059035458714,
932
- "sacrebleu_ci_high": 0.4247888842695035
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1553,
938
- 1160,
939
- 918,
940
- 734
941
  ],
942
  "totals": [
943
- 2035,
944
- 1969,
945
- 1903,
946
- 1837
947
  ],
948
  "precisions": [
949
- 0.763144963144963,
950
- 0.5891315388522093,
951
- 0.4823962165002627,
952
- 0.3995645073489385
953
  ],
954
- "bp": 0.9839145587719164,
955
- "sys_len": 2035,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.5338385284906572,
958
- "score": 0.5338385284906572,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.4960508343377122,
961
- "score_ci_high": 0.5778992367091751,
962
- "sacrebleu_ci_low": 0.4960508343377122,
963
- "sacrebleu_ci_high": 0.5778992367091751
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 1322,
969
- 700,
970
- 421,
971
- 259
972
  ],
973
  "totals": [
974
- 2388,
975
- 2322,
976
- 2256,
977
- 2190
978
  ],
979
  "precisions": [
980
- 0.5536013400335008,
981
- 0.301464254952627,
982
- 0.18661347517730498,
983
- 0.1182648401826484
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 2388,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.2463530263425343,
989
- "score": 0.2463530263425343,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.21902218576525906,
992
- "score_ci_high": 0.28268262467368716,
993
- "sacrebleu_ci_low": 0.21902218576525906,
994
- "sacrebleu_ci_high": 0.28268262467368716
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1463,
1000
- 1077,
1001
- 833,
1002
- 645
1003
  ],
1004
  "totals": [
1005
- 1890,
1006
- 1824,
1007
- 1758,
1008
- 1692
1009
  ],
1010
  "precisions": [
1011
- 0.774074074074074,
1012
- 0.5904605263157895,
1013
- 0.47383390216154725,
1014
- 0.3812056737588652
1015
  ],
1016
- "bp": 0.9863375760488048,
1017
- "sys_len": 1890,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.5287072892340219,
1020
- "score": 0.5287072892340219,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.47933696238952217,
1023
- "score_ci_high": 0.5714963999565859,
1024
- "sacrebleu_ci_low": 0.47933696238952217,
1025
- "sacrebleu_ci_high": 0.5714963999565859
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 1402,
1031
- 981,
1032
- 716,
1033
- 523
1034
  ],
1035
  "totals": [
1036
- 1969,
1037
- 1903,
1038
- 1837,
1039
- 1771
1040
  ],
1041
  "precisions": [
1042
- 0.71203656678517,
1043
- 0.5155018392012611,
1044
- 0.3897659227000544,
1045
- 0.295313382269904
1046
  ],
1047
  "bp": 1.0,
1048
- "sys_len": 1969,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.4533721907576224,
1051
- "score": 0.4533721907576224,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.41581094731566015,
1054
- "score_ci_high": 0.498033544687375,
1055
- "sacrebleu_ci_low": 0.41581094731566015,
1056
- "sacrebleu_ci_high": 0.498033544687375
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
- 1285,
1062
- 735,
1063
- 453,
1064
- 283
1065
  ],
1066
  "totals": [
1067
- 1994,
1068
- 1928,
1069
- 1862,
1070
- 1796
1071
  ],
1072
  "precisions": [
1073
- 0.6444332998996991,
1074
- 0.38122406639004147,
1075
- 0.24328678839957035,
1076
- 0.15757238307349666
1077
  ],
1078
- "bp": 0.9491803375373334,
1079
- "sys_len": 1994,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.2956909013134059,
1082
- "score": 0.2956909013134059,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.2691627285024834,
1085
- "score_ci_high": 0.32834717984418194,
1086
- "sacrebleu_ci_low": 0.2691627285024834,
1087
- "sacrebleu_ci_high": 0.32834717984418194
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1366,
1093
  964,
1094
- 696,
1095
- 499
1096
  ],
1097
  "totals": [
1098
- 1844,
1099
- 1778,
1100
- 1712,
1101
- 1646
1102
  ],
1103
  "precisions": [
1104
- 0.7407809110629067,
1105
- 0.5421822272215973,
1106
- 0.40654205607476634,
1107
- 0.30315917375455653
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 1844,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.47168581235777585,
1113
- "score": 0.47168581235777585,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.41832464742975967,
1116
- "score_ci_high": 0.5113720236775223,
1117
- "sacrebleu_ci_low": 0.41832464742975967,
1118
- "sacrebleu_ci_high": 0.5113720236775223
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 1121,
1124
- 627,
1125
- 398,
1126
- 260
1127
  ],
1128
  "totals": [
1129
- 1831,
1130
- 1765,
1131
- 1699,
1132
- 1633
1133
  ],
1134
  "precisions": [
1135
- 0.6122337520480612,
1136
- 0.35524079320113316,
1137
- 0.23425544437904647,
1138
- 0.15921616656460502
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 1831,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.30010915149877077,
1144
- "score": 0.30010915149877077,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.25220636468972635,
1147
- "score_ci_high": 0.34253906716484656,
1148
- "sacrebleu_ci_low": 0.25220636468972635,
1149
- "sacrebleu_ci_high": 0.34253906716484656
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 1108,
1155
- 607,
1156
- 365,
1157
- 233
1158
  ],
1159
  "totals": [
1160
- 1751,
1161
- 1685,
1162
- 1619,
1163
- 1553
1164
  ],
1165
  "precisions": [
1166
- 0.6327812678469446,
1167
- 0.3602373887240356,
1168
- 0.22544780728844965,
1169
- 0.150032195750161
1170
  ],
1171
- "bp": 1.0,
1172
- "sys_len": 1751,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.2963249503420515,
1175
- "score": 0.2963249503420515,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.25949260863076823,
1178
- "score_ci_high": 0.3478140976036152,
1179
- "sacrebleu_ci_low": 0.25949260863076823,
1180
- "sacrebleu_ci_high": 0.3478140976036152
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1380,
1186
- 1014,
1187
- 776,
1188
- 594
1189
  ],
1190
  "totals": [
1191
- 1815,
1192
- 1749,
1193
- 1683,
1194
- 1617
1195
  ],
1196
  "precisions": [
1197
- 0.7603305785123967,
1198
- 0.5797598627787307,
1199
- 0.46108140225787286,
1200
- 0.3673469387755102
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 1815,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.5227284747817351,
1206
- "score": 0.5227284747817351,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.46746182790145246,
1209
- "score_ci_high": 0.5686400938537622,
1210
- "sacrebleu_ci_low": 0.46746182790145246,
1211
- "sacrebleu_ci_high": 0.5686400938537622
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
- 1375,
1217
- 972,
1218
- 711,
1219
- 529
1220
  ],
1221
  "totals": [
1222
- 1800,
1223
- 1734,
1224
- 1668,
1225
- 1602
1226
  ],
1227
  "precisions": [
1228
- 0.7638888888888888,
1229
- 0.560553633217993,
1230
- 0.4262589928057554,
1231
- 0.33021223470661676
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 1800,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.49548272782749336,
1237
- "score": 0.49548272782749336,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.45134581590558903,
1240
- "score_ci_high": 0.5288737168538972,
1241
- "sacrebleu_ci_low": 0.45134581590558903,
1242
- "sacrebleu_ci_high": 0.5288737168538972
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1218,
1248
- 705,
1249
- 454,
1250
  300
1251
  ],
1252
  "totals": [
1253
- 1885,
1254
- 1819,
1255
- 1753,
1256
- 1687
1257
  ],
1258
  "precisions": [
1259
- 0.6461538461538461,
1260
- 0.38757559098405714,
1261
- 0.2589845978322875,
1262
- 0.17783046828689983
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 1885,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.32771258154747107,
1268
- "score": 0.32771258154747107,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.3009156083594827,
1271
- "score_ci_high": 0.3820347897282282,
1272
- "sacrebleu_ci_low": 0.3009156083594827,
1273
- "sacrebleu_ci_high": 0.3820347897282282
1274
  },
1275
- "score": 0.3996909313865753,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.5400687869936714,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-23T18:18:25.502854Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.7888888888888889,
180
+ "accuracy_ci_low": 0.7,
181
  "accuracy_ci_high": 0.8555555555555555,
182
  "score_name": "accuracy",
183
+ "score": 0.7888888888888889,
184
  "score_ci_high": 0.8555555555555555,
185
+ "score_ci_low": 0.7,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
 
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.9888888888888889,
220
+ "accuracy_ci_low": 0.9555555555555556,
221
  "accuracy_ci_high": 1.0,
222
  "score_name": "accuracy",
223
+ "score": 0.9888888888888889,
224
  "score_ci_high": 1.0,
225
+ "score_ci_low": 0.9555555555555556,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
 
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8555555555555555,
280
+ "accuracy_ci_low": 0.7777777777777778,
281
+ "accuracy_ci_high": 0.9222222222222223,
282
  "score_name": "accuracy",
283
+ "score": 0.8555555555555555,
284
+ "score_ci_high": 0.9222222222222223,
285
+ "score_ci_low": 0.7777777777777778,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.9626262626262626,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.14026236125126135,
296
+ "score": 0.14026236125126135,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.14026236125126135,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.5721784776902887,
307
+ "f1_Organization": 0.37837837837837834,
308
+ "f1_Location": 0.3692307692307692,
309
+ "f1_macro": 0.4399292084331454,
310
+ "recall_macro": 0.40673591832987227,
311
+ "precision_macro": 0.48338733915656995,
312
+ "in_classes_support": 0.6414285714285715,
313
+ "f1_micro": 0.35918367346938773,
314
+ "recall_micro": 0.41904761904761906,
315
+ "precision_micro": 0.3142857142857143,
316
+ "score": 0.35918367346938773,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.311485746926114,
319
+ "score_ci_high": 0.40594536569316764,
320
+ "f1_micro_ci_low": 0.311485746926114,
321
+ "f1_micro_ci_high": 0.40594536569316764
322
  },
323
+ "score": 0.35918367346938773,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
 
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.18309859154929578,
340
  "accuracy_ci_low": 0.11267605633802817,
341
+ "accuracy_ci_high": 0.28169014084507044,
342
  "score_name": "accuracy",
343
+ "score": 0.18309859154929578,
344
+ "score_ci_high": 0.28169014084507044,
345
  "score_ci_low": 0.11267605633802817,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.15492957746478872,
350
+ "accuracy_ci_low": 0.08450704225352113,
351
+ "accuracy_ci_high": 0.2535211267605634,
352
  "score_name": "accuracy",
353
+ "score": 0.15492957746478872,
354
+ "score_ci_high": 0.2535211267605634,
355
+ "score_ci_low": 0.08450704225352113,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
 
377
  },
378
  "mmlu_pro_engineering": {
379
  "accuracy": 0.323943661971831,
380
+ "accuracy_ci_low": 0.22535211267605634,
381
  "accuracy_ci_high": 0.43661971830985913,
382
  "score_name": "accuracy",
383
  "score": 0.323943661971831,
384
  "score_ci_high": 0.43661971830985913,
385
+ "score_ci_low": 0.22535211267605634,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.5352112676056338,
390
+ "accuracy_ci_low": 0.4225352112676056,
391
+ "accuracy_ci_high": 0.647887323943662,
392
  "score_name": "accuracy",
393
+ "score": 0.5352112676056338,
394
+ "score_ci_high": 0.647887323943662,
395
+ "score_ci_low": 0.4225352112676056,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.7323943661971831,
400
+ "accuracy_ci_low": 0.6310963819783834,
401
+ "accuracy_ci_high": 0.8309859154929577,
402
  "score_name": "accuracy",
403
+ "score": 0.7323943661971831,
404
+ "score_ci_high": 0.8309859154929577,
405
+ "score_ci_low": 0.6310963819783834,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.5915492957746479,
410
+ "accuracy_ci_low": 0.4788732394366197,
411
  "accuracy_ci_high": 0.6901408450704225,
412
  "score_name": "accuracy",
413
+ "score": 0.5915492957746479,
414
  "score_ci_high": 0.6901408450704225,
415
+ "score_ci_low": 0.4788732394366197,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
  "accuracy": 0.29577464788732394,
420
+ "accuracy_ci_low": 0.19718309859154928,
421
+ "accuracy_ci_high": 0.41750158298380896,
422
  "score_name": "accuracy",
423
  "score": 0.29577464788732394,
424
+ "score_ci_high": 0.41750158298380896,
425
+ "score_ci_low": 0.19718309859154928,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
 
438
  "mmlu_pro_philosophy": {
439
  "accuracy": 0.5915492957746479,
440
  "accuracy_ci_low": 0.4647887323943662,
441
+ "accuracy_ci_high": 0.704225352112676,
442
  "score_name": "accuracy",
443
  "score": 0.5915492957746479,
444
+ "score_ci_high": 0.704225352112676,
445
  "score_ci_low": 0.4647887323943662,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.30985915492957744,
450
+ "accuracy_ci_low": 0.19718309859154928,
451
+ "accuracy_ci_high": 0.4084507042253521,
452
  "score_name": "accuracy",
453
+ "score": 0.30985915492957744,
454
+ "score_ci_high": 0.4084507042253521,
455
+ "score_ci_low": 0.19718309859154928,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
 
465
  "score_ci_low": 0.5915492957746479,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.4909456740442656,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.6597867569632275,
475
  "f1_suggestive": 0.5882352941176471,
476
  "f1_generic": 0.72,
477
  "f1_descriptive": 0.6818181818181818,
478
+ "f1_fanciful": 0.7142857142857143,
479
+ "f1_arbitrary": 0.5945945945945946,
480
+ "f1_macro_ci_low": 0.548464495780585,
481
+ "f1_macro_ci_high": 0.7575557118629758,
482
  "score_name": "f1_micro",
483
  "score": 0.6547619047619048,
484
+ "score_ci_high": 0.7425149700598802,
485
+ "score_ci_low": 0.5437048440428358,
486
  "num_of_instances": 85,
487
  "accuracy": 0.6470588235294118,
488
  "accuracy_ci_low": 0.5411764705882353,
489
  "accuracy_ci_high": 0.7411764705882353,
490
  "f1_micro": 0.6547619047619048,
491
+ "f1_micro_ci_low": 0.5437048440428358,
492
+ "f1_micro_ci_high": 0.7425149700598802
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5899229232086782,
496
+ "f1_no": 0.6637168141592921,
497
+ "f1_yes": 0.5161290322580645,
498
+ "f1_macro_ci_low": 0.5156319338532668,
499
+ "f1_macro_ci_high": 0.6621360959437094,
500
  "score_name": "f1_micro",
501
+ "score": 0.6114285714285714,
502
+ "score_ci_high": 0.6798739003144315,
503
+ "score_ci_low": 0.5364733968179762,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.535,
506
+ "accuracy_ci_low": 0.465,
507
  "accuracy_ci_high": 0.605,
508
+ "f1_micro": 0.6114285714285714,
509
+ "f1_micro_ci_low": 0.5364733968179762,
510
+ "f1_micro_ci_high": 0.6798739003144315
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.24082948201110896,
514
+ "f1_conclusion": 0.10810810810810811,
515
+ "f1_decree": 0.12903225806451613,
516
  "f1_issue": 0.18181818181818182,
517
+ "f1_analysis": 0.5833333333333334,
518
+ "f1_facts": 0.06666666666666667,
519
+ "f1_procedural history": 0.3125,
520
+ "f1_rule": 0.30434782608695654,
521
+ "f1_macro_ci_low": 0.19135126191537805,
522
+ "f1_macro_ci_high": 0.31037088994163425,
523
  "score_name": "f1_micro",
524
+ "score": 0.29012345679012347,
525
+ "score_ci_high": 0.3634815160611135,
526
+ "score_ci_low": 0.22855500349415586,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.235,
529
+ "accuracy_ci_low": 0.185,
530
+ "accuracy_ci_high": 0.3,
531
+ "f1_micro": 0.29012345679012347,
532
+ "f1_micro_ci_low": 0.22855500349415586,
533
+ "f1_micro_ci_high": 0.3634815160611135
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.4444123800591588,
537
+ "f1_yes": 0.4550898203592814,
538
+ "f1_no": 0.43373493975903615,
539
+ "f1_macro_ci_low": 0.3748865543059881,
540
+ "f1_macro_ci_high": 0.5178711641402558,
541
  "score_name": "f1_micro",
542
+ "score": 0.4444444444444444,
543
+ "score_ci_high": 0.5162242117942616,
544
+ "score_ci_low": 0.37379019448718637,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.37,
547
+ "accuracy_ci_low": 0.305,
548
+ "accuracy_ci_high": 0.435,
549
+ "f1_micro": 0.4444444444444444,
550
+ "f1_micro_ci_low": 0.37379019448718637,
551
+ "f1_micro_ci_high": 0.5162242117942616
552
  },
553
  "legalbench_proa": {
554
  "f1_macro": 0.75,
 
568
  "f1_micro_ci_low": 0.6573326079878734,
569
  "f1_micro_ci_high": 0.8234882632928148
570
  },
571
+ "score": 0.5501516754850089,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.625283027988775,
578
+ "f1_cars": 0.8541666666666666,
579
+ "f1_windows x": 0.11940298507462686,
580
+ "f1_computer graphics": 0.4297520661157025,
581
+ "f1_atheism": 0.509090909090909,
582
+ "f1_religion": 0.2222222222222222,
583
+ "f1_medicine": 0.8705882352941177,
584
+ "f1_christianity": 0.7755102040816326,
585
+ "f1_microsoft windows": 0.6436781609195402,
586
+ "f1_middle east": 0.6666666666666666,
587
+ "f1_motorcycles": 0.7326732673267327,
588
+ "f1_pc hardware": 0.5846153846153846,
589
+ "f1_mac hardware": 0.6458333333333334,
590
+ "f1_electronics": 0.6666666666666666,
591
+ "f1_for sale": 0.6944444444444444,
592
+ "f1_guns": 0.36923076923076925,
593
+ "f1_space": 0.8235294117647058,
594
+ "f1_cryptography": 0.6575342465753424,
595
+ "f1_baseball": 0.9310344827586207,
596
  "f1_politics": 0.3787878787878788,
597
+ "f1_hockey": 0.9302325581395349,
598
+ "f1_macro_ci_low": 0.5984293289041998,
599
+ "f1_macro_ci_high": 0.65230217299566,
600
  "score_name": "f1_micro",
601
+ "score": 0.6474114441416894,
602
+ "score_ci_high": 0.6749803309845304,
603
+ "score_ci_low": 0.6158904109589041,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.594,
606
+ "accuracy_ci_low": 0.561,
607
+ "accuracy_ci_high": 0.622,
608
+ "f1_micro": 0.6474114441416894,
609
+ "f1_micro_ci_low": 0.6158904109589041,
610
+ "f1_micro_ci_high": 0.6749803309845304
611
  },
612
+ "score": 0.6474114441416894,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.7879467396802322,
619
+ "f1_student loan": 0.8888888888888888,
620
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9461756373937678,
621
+ "f1_debt collection": 0.6428571428571429,
622
+ "f1_checking or savings account": 0.8222222222222222,
623
+ "f1_mortgage": 0.9705882352941176,
624
+ "f1_payday loan or title loan or personal loan": 0.5333333333333333,
625
+ "f1_credit card or prepaid card": 0.8666666666666667,
626
+ "f1_money transfer or virtual currency or money service": 0.7111111111111111,
627
+ "f1_vehicle loan or lease": 0.7096774193548387,
628
+ "f1_macro_ci_low": 0.7309747205142173,
629
+ "f1_macro_ci_high": 0.8394377629013812,
630
  "score_name": "f1_micro",
631
+ "score": 0.8977732793522267,
632
+ "score_ci_high": 0.9148163850441952,
633
+ "score_ci_low": 0.8791739655658123,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.887,
636
+ "accuracy_ci_low": 0.8679599560953464,
637
+ "accuracy_ci_high": 0.906,
638
+ "f1_micro": 0.8977732793522267,
639
+ "f1_micro_ci_low": 0.8791739655658123,
640
+ "f1_micro_ci_high": 0.9148163850441952
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.755452767113785,
644
+ "f1_mortgages and loans": 0.8135593220338984,
645
  "f1_credit card": 0.7865168539325843,
646
+ "f1_debt collection": 0.7069767441860465,
647
+ "f1_credit reporting": 0.7876712328767124,
648
+ "f1_retail banking": 0.6825396825396826,
649
+ "f1_macro_ci_low": 0.7186153400597911,
650
+ "f1_macro_ci_high": 0.7954872943230671,
651
  "score_name": "f1_micro",
652
+ "score": 0.7611336032388664,
653
+ "score_ci_high": 0.797979797979798,
654
+ "score_ci_low": 0.7233852933885262,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.752,
657
+ "accuracy_ci_low": 0.712,
658
+ "accuracy_ci_high": 0.79,
659
+ "f1_micro": 0.7611336032388664,
660
+ "f1_micro_ci_low": 0.7233852933885262,
661
+ "f1_micro_ci_high": 0.797979797979798
662
  },
663
+ "score": 0.8294534412955465,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "program_accuracy": 0.21,
671
+ "score": 0.21,
672
  "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.19,
674
+ "program_accuracy_ci_low": 0.1850718210152138,
675
+ "program_accuracy_ci_high": 0.236,
676
+ "score_ci_low": 0.1850718210152138,
677
+ "score_ci_high": 0.236,
678
+ "execution_accuracy_ci_low": 0.167,
679
+ "execution_accuracy_ci_high": 0.214
680
  },
681
+ "score": 0.21,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.2793294516338928,
688
+ "recall": 0.5853975586330095,
689
+ "f1": 0.32244875528474853,
690
+ "precision_ci_low": 0.2608509633355691,
691
+ "precision_ci_high": 0.2992012655344498,
692
+ "recall_ci_low": 0.5709386747947787,
693
+ "recall_ci_high": 0.6004552716919522,
694
+ "f1_ci_low": 0.3057656195014431,
695
+ "f1_ci_high": 0.34039188672012827,
696
  "score_name": "f1",
697
+ "score": 0.32244875528474853,
698
+ "score_ci_high": 0.34039188672012827,
699
+ "score_ci_low": 0.3057656195014431,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.583492674678564,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6757716050744057,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5312159793823957,
704
+ "faithfullness_f1_token_overlap": 0.31789392117250886,
705
+ "faithfullness_recall_token_overlap": 0.25491265786006334,
706
+ "faithfullness_precision_token_overlap": 0.5284813664748244,
707
+ "correctness_f1_token_overlap": 0.32244875528474853,
708
+ "correctness_recall_token_overlap": 0.5853975586330095,
709
+ "correctness_precision_token_overlap": 0.2793294516338928
710
  },
711
+ "score": 0.32244875528474853,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.565,
718
+ "accuracy_ci_low": 0.5335774990883203,
719
+ "accuracy_ci_high": 0.5932386235436204,
720
  "score_name": "accuracy",
721
+ "score": 0.565,
722
+ "score_ci_high": 0.5932386235436204,
723
+ "score_ci_low": 0.5335774990883203,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.92,
728
+ "accuracy_ci_low": 0.892,
729
+ "accuracy_ci_high": 0.94,
730
  "score_name": "accuracy",
731
+ "score": 0.92,
732
+ "score_ci_high": 0.94,
733
+ "score_ci_low": 0.892,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.7424999999999999,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
 
741
  "attaq_500": {
742
  "severity_critical": 0.0,
743
  "severity_high": 0.0,
744
+ "severity_medium": 0.0,
745
+ "severity_low": 100.0,
746
+ "category_harmful_info": 0.8275590381670063,
747
+ "category_deception": 0.870389111565821,
748
+ "category_explicit_content": 0.8619736799537534,
749
+ "category_violence": 0.839932223764204,
750
+ "category_discrimination": 0.7697427272796631,
751
+ "category_substance_abuse": 0.8608841189907658,
752
+ "category_pii": 0.7655655101493553,
753
+ "safety": 0.8361458782349815,
754
+ "safety_ci_low": 0.8192866587424859,
755
+ "safety_ci_high": 0.8545105900812325,
756
  "score_name": "safety",
757
+ "score": 0.8361458782349815,
758
+ "score_ci_high": 0.8545105900812325,
759
+ "score_ci_low": 0.8192866587424859,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.8361458782349815,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rougeLsum": 0.3756142003700213,
770
+ "rouge1": 0.43798911333105917,
771
+ "rouge2": 0.22301544369181786,
772
+ "rougeL": 0.30881611781607615,
773
+ "score": 0.30881611781607615,
774
  "score_name": "rougeL",
775
+ "rougeLsum_ci_low": 0.3665764288194776,
776
+ "rougeLsum_ci_high": 0.3845640981112231,
777
+ "rouge1_ci_low": 0.42803373903414665,
778
+ "rouge1_ci_high": 0.44745782528977346,
779
+ "rouge2_ci_low": 0.21565182499600158,
780
+ "rouge2_ci_high": 0.23166608743475037,
781
+ "rougeL_ci_low": 0.3011644909175205,
782
+ "rougeL_ci_high": 0.31722072238042875,
783
+ "score_ci_low": 0.3011644909175205,
784
+ "score_ci_high": 0.31722072238042875
 
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rougeLsum": 0.1094469715392792,
789
+ "rouge1": 0.13301752816755213,
790
+ "rouge2": 0.020621633401068214,
791
+ "rougeL": 0.09635378924374519,
792
+ "score": 0.09635378924374519,
793
  "score_name": "rougeL",
794
+ "rougeLsum_ci_low": 0.10482036952689613,
795
+ "rougeLsum_ci_high": 0.11359992785988014,
796
+ "rouge1_ci_low": 0.12738784135782572,
797
+ "rouge1_ci_high": 0.13845012766033873,
798
+ "rouge2_ci_low": 0.01856074580818113,
799
+ "rouge2_ci_high": 0.02259991124480518,
800
+ "rougeL_ci_low": 0.09230760346929477,
801
+ "rougeL_ci_high": 0.09984237535822288,
802
+ "score_ci_low": 0.09230760346929477,
803
+ "score_ci_high": 0.09984237535822288
 
804
  },
805
+ "score": 0.20258495352991068,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1310,
814
+ 862,
815
+ 608,
816
+ 433
817
  ],
818
  "totals": [
819
+ 1791,
820
+ 1725,
821
+ 1659,
822
+ 1593
823
  ],
824
  "precisions": [
825
+ 0.7314349525404802,
826
+ 0.49971014492753624,
827
+ 0.36648583484026526,
828
+ 0.27181418706842436
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 1791,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.43682330208953546,
834
+ "score": 0.43682330208953546,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.3952567851744898,
837
+ "score_ci_high": 0.4779782047825724,
838
+ "sacrebleu_ci_low": 0.3952567851744898,
839
+ "sacrebleu_ci_high": 0.4779782047825724
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1330,
845
+ 885,
846
+ 621,
847
+ 444
848
  ],
849
  "totals": [
850
+ 1803,
851
+ 1737,
852
+ 1671,
853
+ 1605
854
  ],
855
  "precisions": [
856
+ 0.7376594564614531,
857
+ 0.5094991364421416,
858
+ 0.37163375224416517,
859
+ 0.2766355140186916
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 1803,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.44335908539973706,
865
+ "score": 0.44335908539973706,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.407676677665113,
868
+ "score_ci_high": 0.49444892387735484,
869
+ "sacrebleu_ci_low": 0.407676677665113,
870
+ "sacrebleu_ci_high": 0.49444892387735484
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
+ 904,
876
+ 502,
877
+ 300,
878
+ 175
879
  ],
880
  "totals": [
881
+ 1585,
882
+ 1519,
883
+ 1453,
884
+ 1387
885
  ],
886
  "precisions": [
887
+ 0.5703470031545741,
888
+ 0.3304805793285056,
889
+ 0.20646937370956642,
890
+ 0.12617159336697908
891
  ],
892
+ "bp": 0.9974795224450381,
893
+ "sys_len": 1585,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.26404598765634385,
896
+ "score": 0.26404598765634385,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.23497301086217232,
899
+ "score_ci_high": 0.29259086123320976,
900
+ "sacrebleu_ci_low": 0.23497301086217232,
901
+ "sacrebleu_ci_high": 0.29259086123320976
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 1246,
907
+ 765,
908
+ 517,
909
+ 376
910
  ],
911
  "totals": [
912
+ 1853,
913
+ 1787,
914
+ 1721,
915
+ 1655
916
  ],
917
  "precisions": [
918
+ 0.6724230976794388,
919
+ 0.42809177392277564,
920
+ 0.3004067402672865,
921
+ 0.22719033232628397
922
  ],
923
  "bp": 1.0,
924
+ "sys_len": 1853,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.3743861346447394,
927
+ "score": 0.3743861346447394,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.3331443738925502,
930
+ "score_ci_high": 0.4167892583826109,
931
+ "sacrebleu_ci_low": 0.3331443738925502,
932
+ "sacrebleu_ci_high": 0.4167892583826109
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1562,
938
+ 1176,
939
+ 936,
940
+ 755
941
  ],
942
  "totals": [
943
+ 2040,
944
+ 1974,
945
+ 1908,
946
+ 1842
947
  ],
948
  "precisions": [
949
+ 0.7656862745098039,
950
+ 0.5957446808510638,
951
+ 0.49056603773584906,
952
+ 0.40988056460369166
953
  ],
954
+ "bp": 0.9863682748637871,
955
+ "sys_len": 2040,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.5428196432331734,
958
+ "score": 0.5428196432331734,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.509892185181122,
961
+ "score_ci_high": 0.5871694828358561,
962
+ "sacrebleu_ci_low": 0.509892185181122,
963
+ "sacrebleu_ci_high": 0.5871694828358561
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 1356,
969
+ 727,
970
+ 432,
971
+ 274
972
  ],
973
  "totals": [
974
+ 2382,
975
+ 2316,
976
+ 2250,
977
+ 2184
978
  ],
979
  "precisions": [
980
+ 0.5692695214105793,
981
+ 0.3139032815198618,
982
+ 0.192,
983
+ 0.12545787545787546
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 2382,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.25614049024804236,
989
+ "score": 0.25614049024804236,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.2240052628669271,
992
+ "score_ci_high": 0.2935558382274424,
993
+ "sacrebleu_ci_low": 0.2240052628669271,
994
+ "sacrebleu_ci_high": 0.2935558382274424
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1455,
1000
+ 1063,
1001
+ 821,
1002
+ 637
1003
  ],
1004
  "totals": [
1005
+ 1900,
1006
+ 1834,
1007
+ 1768,
1008
+ 1702
1009
  ],
1010
  "precisions": [
1011
+ 0.7657894736842106,
1012
+ 0.579607415485278,
1013
+ 0.4643665158371041,
1014
+ 0.37426556991774385
1015
  ],
1016
+ "bp": 0.9916143051127146,
1017
+ "sys_len": 1900,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.5225932644775685,
1020
+ "score": 0.5225932644775685,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.4737364435189365,
1023
+ "score_ci_high": 0.5631623567289689,
1024
+ "sacrebleu_ci_low": 0.4737364435189365,
1025
+ "sacrebleu_ci_high": 0.5631623567289689
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 1404,
1031
+ 989,
1032
+ 719,
1033
+ 525
1034
  ],
1035
  "totals": [
1036
+ 1962,
1037
+ 1896,
1038
+ 1830,
1039
+ 1764
1040
  ],
1041
  "precisions": [
1042
+ 0.7155963302752294,
1043
+ 0.5216244725738397,
1044
+ 0.39289617486338796,
1045
+ 0.2976190476190476
1046
  ],
1047
  "bp": 1.0,
1048
+ "sys_len": 1962,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.45707887169863065,
1051
+ "score": 0.45707887169863065,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.4211360031313033,
1054
+ "score_ci_high": 0.5197096344136953,
1055
+ "sacrebleu_ci_low": 0.4211360031313033,
1056
+ "sacrebleu_ci_high": 0.5197096344136953
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1297,
1062
+ 753,
1063
+ 472,
1064
+ 301
1065
  ],
1066
  "totals": [
1067
+ 2014,
1068
+ 1948,
1069
+ 1882,
1070
+ 1816
1071
  ],
1072
  "precisions": [
1073
+ 0.6439920556107249,
1074
+ 0.38655030800821355,
1075
+ 0.2507970244420829,
1076
+ 0.1657488986784141
1077
  ],
1078
+ "bp": 0.9591497695217011,
1079
+ "sys_len": 2014,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.3059153842651481,
1082
+ "score": 0.3059153842651481,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.2738471419962368,
1085
+ "score_ci_high": 0.33744567062204633,
1086
+ "sacrebleu_ci_low": 0.2738471419962368,
1087
+ "sacrebleu_ci_high": 0.33744567062204633
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1371,
1093
  964,
1094
+ 693,
1095
+ 491
1096
  ],
1097
  "totals": [
1098
+ 1839,
1099
+ 1773,
1100
+ 1707,
1101
+ 1641
1102
  ],
1103
  "precisions": [
1104
+ 0.7455138662316476,
1105
+ 0.5437112239142696,
1106
+ 0.40597539543058,
1107
+ 0.2992078001218769
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 1839,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.4710577594991048,
1113
+ "score": 0.4710577594991048,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.4268855553367512,
1116
+ "score_ci_high": 0.5049624748525423,
1117
+ "sacrebleu_ci_low": 0.4268855553367512,
1118
+ "sacrebleu_ci_high": 0.5049624748525423
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 1130,
1124
+ 638,
1125
+ 409,
1126
+ 268
1127
  ],
1128
  "totals": [
1129
+ 1794,
1130
+ 1728,
1131
+ 1662,
1132
+ 1596
1133
  ],
1134
  "precisions": [
1135
+ 0.6298773690078038,
1136
+ 0.36921296296296297,
1137
+ 0.24608904933814682,
1138
+ 0.16791979949874686
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 1794,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.31309907547937593,
1144
+ "score": 0.31309907547937593,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.26572456707025455,
1147
+ "score_ci_high": 0.3514896703047057,
1148
+ "sacrebleu_ci_low": 0.26572456707025455,
1149
+ "sacrebleu_ci_high": 0.3514896703047057
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 1112,
1155
+ 613,
1156
+ 383,
1157
+ 250
1158
  ],
1159
  "totals": [
1160
+ 1725,
1161
+ 1659,
1162
+ 1593,
1163
+ 1527
1164
  ],
1165
  "precisions": [
1166
+ 0.6446376811594203,
1167
+ 0.3694996986136227,
1168
+ 0.24042686754551162,
1169
+ 0.16371971185330714
1170
  ],
1171
+ "bp": 0.9947961956419216,
1172
+ "sys_len": 1725,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.3095548068732939,
1175
+ "score": 0.3095548068732939,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.27129838546648216,
1178
+ "score_ci_high": 0.3652127158245783,
1179
+ "sacrebleu_ci_low": 0.27129838546648216,
1180
+ "sacrebleu_ci_high": 0.3652127158245783
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1382,
1186
+ 1013,
1187
+ 771,
1188
+ 586
1189
  ],
1190
  "totals": [
1191
+ 1816,
1192
+ 1750,
1193
+ 1684,
1194
+ 1618
1195
  ],
1196
  "precisions": [
1197
+ 0.7610132158590308,
1198
+ 0.5788571428571428,
1199
+ 0.4578384798099763,
1200
+ 0.3621755253399258
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 1816,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.5198747760500348,
1206
+ "score": 0.5198747760500348,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.470424249351864,
1209
+ "score_ci_high": 0.5665291609523828,
1210
+ "sacrebleu_ci_low": 0.470424249351864,
1211
+ "sacrebleu_ci_high": 0.5665291609523828
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1376,
1217
+ 980,
1218
+ 722,
1219
+ 541
1220
  ],
1221
  "totals": [
1222
+ 1801,
1223
+ 1735,
1224
+ 1669,
1225
+ 1603
1226
  ],
1227
  "precisions": [
1228
+ 0.7640199888950583,
1229
+ 0.5648414985590778,
1230
+ 0.43259436788496103,
1231
+ 0.3374922021210231
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 1801,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.5010072151606186,
1237
+ "score": 0.5010072151606186,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.4614513028457302,
1240
+ "score_ci_high": 0.5398623866578321,
1241
+ "sacrebleu_ci_low": 0.4614513028457302,
1242
+ "sacrebleu_ci_high": 0.5398623866578321
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1219,
1248
+ 713,
1249
+ 457,
1250
  300
1251
  ],
1252
  "totals": [
1253
+ 1888,
1254
+ 1822,
1255
+ 1756,
1256
+ 1690
1257
  ],
1258
  "precisions": [
1259
+ 0.6456567796610169,
1260
+ 0.3913282107574094,
1261
+ 0.260250569476082,
1262
+ 0.17751479289940827
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 1888,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.32869437944703067,
1268
+ "score": 0.32869437944703067,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.30148825657210887,
1271
+ "score_ci_high": 0.3740478120880907,
1272
+ "sacrebleu_ci_low": 0.30148825657210887,
1273
+ "sacrebleu_ci_high": 0.3740478120880907
1274
  },
1275
+ "score": 0.40309667841482516,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.5151392921367606,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/{2025-06-21T11-34-24_evaluation_results.json → 2025-06-23T15-33-11_evaluation_results.json} RENAMED
@@ -1,6 +1,6 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-21T15:34:20.869271Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
@@ -8,7 +8,7 @@
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=watsonx/mistralai/mistral-small-3-1-24b-instruct-2503,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -26,7 +26,7 @@
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
- "model": "watsonx/mistralai/mistral-small-3-1-24b-instruct-2503",
30
  "model_args": {
31
  "max_tokens": 256
32
  },
@@ -42,7 +42,7 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,633 +176,633 @@
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
- "accuracy": 0.6555555555555556,
180
- "accuracy_ci_low": 0.5444444444444444,
181
- "accuracy_ci_high": 0.7333333333333333,
182
  "score_name": "accuracy",
183
- "score": 0.6555555555555556,
184
- "score_ci_high": 0.7333333333333333,
185
- "score_ci_low": 0.5444444444444444,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
- "accuracy": 0.7333333333333333,
190
- "accuracy_ci_low": 0.6333333333333333,
191
- "accuracy_ci_high": 0.8222222222222222,
192
  "score_name": "accuracy",
193
- "score": 0.7333333333333333,
194
- "score_ci_high": 0.8222222222222222,
195
- "score_ci_low": 0.6333333333333333,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
- "accuracy": 0.8666666666666667,
200
- "accuracy_ci_low": 0.7666666666666667,
201
- "accuracy_ci_high": 0.9222222222222223,
202
  "score_name": "accuracy",
203
- "score": 0.8666666666666667,
204
- "score_ci_high": 0.9222222222222223,
205
- "score_ci_low": 0.7666666666666667,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
- "accuracy": 0.7777777777777778,
210
- "accuracy_ci_low": 0.6777777777777778,
211
- "accuracy_ci_high": 0.8555555555555555,
212
  "score_name": "accuracy",
213
- "score": 0.7777777777777778,
214
- "score_ci_high": 0.8555555555555555,
215
- "score_ci_low": 0.6777777777777778,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
- "accuracy": 0.8666666666666667,
220
- "accuracy_ci_low": 0.7777777777777778,
221
- "accuracy_ci_high": 0.9222222222222223,
222
  "score_name": "accuracy",
223
- "score": 0.8666666666666667,
224
- "score_ci_high": 0.9222222222222223,
225
- "score_ci_low": 0.7777777777777778,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9111111111111111,
230
- "accuracy_ci_low": 0.8444444444444444,
231
- "accuracy_ci_high": 0.9555555555555556,
232
  "score_name": "accuracy",
233
- "score": 0.9111111111111111,
234
- "score_ci_high": 0.9555555555555556,
235
- "score_ci_low": 0.8444444444444444,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
- "accuracy": 0.7666666666666667,
240
- "accuracy_ci_low": 0.6666666666666666,
241
- "accuracy_ci_high": 0.8444444444444444,
242
  "score_name": "accuracy",
243
- "score": 0.7666666666666667,
244
- "score_ci_high": 0.8444444444444444,
245
- "score_ci_low": 0.6666666666666666,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
- "accuracy": 0.8888888888888888,
250
- "accuracy_ci_low": 0.8222222222222222,
251
- "accuracy_ci_high": 0.9444444444444444,
252
  "score_name": "accuracy",
253
- "score": 0.8888888888888888,
254
- "score_ci_high": 0.9444444444444444,
255
- "score_ci_low": 0.8222222222222222,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
- "accuracy": 0.7333333333333333,
260
- "accuracy_ci_low": 0.6333333333333333,
261
- "accuracy_ci_high": 0.8111111111111111,
262
  "score_name": "accuracy",
263
- "score": 0.7333333333333333,
264
- "score_ci_high": 0.8111111111111111,
265
- "score_ci_low": 0.6333333333333333,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
  "accuracy": 0.8888888888888888,
270
- "accuracy_ci_low": 0.8041323028207193,
271
  "accuracy_ci_high": 0.9444444444444444,
272
  "score_name": "accuracy",
273
  "score": 0.8888888888888888,
274
  "score_ci_high": 0.9444444444444444,
275
- "score_ci_low": 0.8041323028207193,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.7777777777777778,
280
- "accuracy_ci_low": 0.6888888888888889,
281
- "accuracy_ci_high": 0.8555555555555555,
282
  "score_name": "accuracy",
283
- "score": 0.7777777777777778,
284
- "score_ci_high": 0.8555555555555555,
285
- "score_ci_low": 0.6888888888888889,
286
  "num_of_instances": 90
287
  },
288
- "score": 0.806060606060606,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
- "score": 0.5,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
- "f1_Person": 0.3625730994152047,
307
- "f1_Organization": 0.2893081761006289,
308
- "f1_Location": 0.33070866141732286,
309
- "f1_macro": 0.32752997897771885,
310
- "recall_macro": 0.2851171336228018,
311
- "precision_macro": 0.39184928961146603,
312
- "in_classes_support": 0.433184855233853,
313
- "f1_micro": 0.21082220660576245,
314
- "recall_micro": 0.2857142857142857,
315
- "precision_micro": 0.16703786191536749,
316
- "score": 0.21082220660576245,
317
  "score_name": "f1_micro",
318
- "score_ci_low": 0.18138158087338846,
319
- "score_ci_high": 0.2505537736745896,
320
- "f1_micro_ci_low": 0.18138158087338846,
321
- "f1_micro_ci_high": 0.2505537736745896
322
  },
323
- "score": 0.21082220660576245,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
- "accuracy": 0.676056338028169,
330
- "accuracy_ci_low": 0.5492957746478874,
331
- "accuracy_ci_high": 0.7746478873239436,
332
  "score_name": "accuracy",
333
- "score": 0.676056338028169,
334
- "score_ci_high": 0.7746478873239436,
335
- "score_ci_low": 0.5492957746478874,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
- "accuracy": 0.22535211267605634,
340
- "accuracy_ci_low": 0.14084507042253522,
341
- "accuracy_ci_high": 0.3380281690140845,
342
  "score_name": "accuracy",
343
- "score": 0.22535211267605634,
344
- "score_ci_high": 0.3380281690140845,
345
- "score_ci_low": 0.14084507042253522,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
  "accuracy": 0.23943661971830985,
350
  "accuracy_ci_low": 0.14084507042253522,
351
- "accuracy_ci_high": 0.352112676056338,
352
  "score_name": "accuracy",
353
  "score": 0.23943661971830985,
354
- "score_ci_high": 0.352112676056338,
355
  "score_ci_low": 0.14084507042253522,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
- "accuracy": 0.5352112676056338,
360
- "accuracy_ci_low": 0.4225352112676056,
361
- "accuracy_ci_high": 0.647887323943662,
362
  "score_name": "accuracy",
363
- "score": 0.5352112676056338,
364
- "score_ci_high": 0.647887323943662,
365
- "score_ci_low": 0.4225352112676056,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
- "accuracy": 0.676056338028169,
370
- "accuracy_ci_low": 0.5492957746478874,
371
- "accuracy_ci_high": 0.7746478873239436,
372
  "score_name": "accuracy",
373
- "score": 0.676056338028169,
374
- "score_ci_high": 0.7746478873239436,
375
- "score_ci_low": 0.5492957746478874,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
- "accuracy": 0.2112676056338028,
380
- "accuracy_ci_low": 0.1267605633802817,
381
- "accuracy_ci_high": 0.30985915492957744,
382
  "score_name": "accuracy",
383
- "score": 0.2112676056338028,
384
- "score_ci_high": 0.30985915492957744,
385
- "score_ci_low": 0.1267605633802817,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
- "accuracy": 0.5070422535211268,
390
- "accuracy_ci_low": 0.39436619718309857,
391
- "accuracy_ci_high": 0.6197183098591549,
392
  "score_name": "accuracy",
393
- "score": 0.5070422535211268,
394
- "score_ci_high": 0.6197183098591549,
395
- "score_ci_low": 0.39436619718309857,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
- "accuracy": 0.6197183098591549,
400
- "accuracy_ci_low": 0.49295774647887325,
401
- "accuracy_ci_high": 0.7323943661971831,
402
  "score_name": "accuracy",
403
- "score": 0.6197183098591549,
404
- "score_ci_high": 0.7323943661971831,
405
- "score_ci_low": 0.49295774647887325,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
- "accuracy": 0.43661971830985913,
410
- "accuracy_ci_low": 0.323943661971831,
411
- "accuracy_ci_high": 0.5492957746478874,
412
  "score_name": "accuracy",
413
- "score": 0.43661971830985913,
414
- "score_ci_high": 0.5492957746478874,
415
- "score_ci_low": 0.323943661971831,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
- "accuracy": 0.352112676056338,
420
- "accuracy_ci_low": 0.23943661971830985,
421
- "accuracy_ci_high": 0.4647887323943662,
422
  "score_name": "accuracy",
423
- "score": 0.352112676056338,
424
- "score_ci_high": 0.4647887323943662,
425
- "score_ci_low": 0.23943661971830985,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
- "accuracy": 0.5070422535211268,
430
- "accuracy_ci_low": 0.39436619718309857,
431
- "accuracy_ci_high": 0.6056338028169014,
432
  "score_name": "accuracy",
433
- "score": 0.5070422535211268,
434
- "score_ci_high": 0.6056338028169014,
435
- "score_ci_low": 0.39436619718309857,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
- "accuracy": 0.49295774647887325,
440
- "accuracy_ci_low": 0.38028169014084506,
441
- "accuracy_ci_high": 0.6197183098591549,
442
  "score_name": "accuracy",
443
- "score": 0.49295774647887325,
444
- "score_ci_high": 0.6197183098591549,
445
- "score_ci_low": 0.38028169014084506,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
- "accuracy": 0.323943661971831,
450
- "accuracy_ci_low": 0.22535211267605634,
451
- "accuracy_ci_high": 0.4225352112676056,
452
  "score_name": "accuracy",
453
- "score": 0.323943661971831,
454
- "score_ci_high": 0.4225352112676056,
455
- "score_ci_low": 0.22535211267605634,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
- "accuracy": 0.6197183098591549,
460
- "accuracy_ci_low": 0.49295774647887325,
461
- "accuracy_ci_high": 0.7183098591549296,
462
  "score_name": "accuracy",
463
- "score": 0.6197183098591549,
464
- "score_ci_high": 0.7183098591549296,
465
- "score_ci_low": 0.49295774647887325,
466
  "num_of_instances": 71
467
  },
468
- "score": 0.45875251509054327,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
- "f1_macro": 0.3580404378230465,
475
- "f1_suggestive": 0.375,
476
- "f1_generic": 0.4,
477
- "f1_arbitrary": 0.3076923076923077,
478
- "f1_fanciful": 0.43478260869565216,
479
- "f1_descriptive": 0.2727272727272727,
480
- "f1_macro_ci_low": 0.24962639354166832,
481
- "f1_macro_ci_high": 0.48946907875285006,
482
  "score_name": "f1_micro",
483
- "score": 0.35772357723577236,
484
- "score_ci_high": 0.47244094488188976,
485
- "score_ci_low": 0.25002795229813274,
486
  "num_of_instances": 85,
487
- "accuracy": 0.25882352941176473,
488
- "accuracy_ci_low": 0.1716911469146947,
489
- "accuracy_ci_high": 0.35294117647058826,
490
- "f1_micro": 0.35772357723577236,
491
- "f1_micro_ci_low": 0.25002795229813274,
492
- "f1_micro_ci_high": 0.47244094488188976
493
  },
494
  "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.28985125707516124,
496
- "f1_no": 0.5233644859813084,
497
- "f1_yes": 0.056338028169014086,
498
- "f1_macro_ci_low": 0.23940411227724898,
499
- "f1_macro_ci_high": 0.3484173679194581,
500
  "score_name": "f1_micro",
501
- "score": 0.4070175438596491,
502
- "score_ci_high": 0.4836620089714397,
503
- "score_ci_low": 0.32857142857142857,
504
  "num_of_instances": 200,
505
- "accuracy": 0.29,
506
- "accuracy_ci_low": 0.225,
507
- "accuracy_ci_high": 0.355,
508
- "f1_micro": 0.4070175438596491,
509
- "f1_micro_ci_low": 0.32857142857142857,
510
- "f1_micro_ci_high": 0.4836620089714397
511
  },
512
  "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.2830193138674671,
514
- "f1_conclusion": 0.2222222222222222,
515
- "f1_decree": 0.25,
516
- "f1_issue": 0.18604651162790697,
517
- "f1_analysis": 0.32558139534883723,
518
- "f1_facts": 0.26666666666666666,
519
- "f1_procedural history": 0.3384615384615385,
520
- "f1_rule": 0.39215686274509803,
521
- "f1_macro_ci_low": 0.22077760516552375,
522
- "f1_macro_ci_high": 0.3674671451575924,
523
  "score_name": "f1_micro",
524
- "score": 0.2920634920634921,
525
- "score_ci_high": 0.37441470472532123,
526
- "score_ci_low": 0.2264866423312643,
527
  "num_of_instances": 200,
528
- "accuracy": 0.23,
529
- "accuracy_ci_low": 0.175,
530
- "accuracy_ci_high": 0.3,
531
- "f1_micro": 0.2920634920634921,
532
- "f1_micro_ci_low": 0.2264866423312643,
533
- "f1_micro_ci_high": 0.37441470472532123
534
  },
535
  "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.20266040688575898,
537
- "f1_yes": 0.2222222222222222,
538
- "f1_no": 0.18309859154929578,
539
- "f1_macro_ci_low": 0.13797716111770406,
540
- "f1_macro_ci_high": 0.27925168740954265,
541
  "score_name": "f1_micro",
542
- "score": 0.1991701244813278,
543
- "score_ci_high": 0.27571125309001354,
544
- "score_ci_low": 0.13617021276595745,
545
  "num_of_instances": 200,
546
- "accuracy": 0.12,
547
- "accuracy_ci_low": 0.08,
548
- "accuracy_ci_high": 0.175,
549
- "f1_micro": 0.1991701244813278,
550
- "f1_micro_ci_low": 0.13617021276595745,
551
- "f1_micro_ci_high": 0.27571125309001354
552
  },
553
  "legalbench_proa": {
554
- "f1_macro": 0.8320072332730561,
555
- "f1_yes": 0.8285714285714286,
556
- "f1_no": 0.8354430379746836,
557
- "f1_macro_ci_low": 0.749025367448282,
558
- "f1_macro_ci_high": 0.890690718451174,
559
  "score_name": "f1_micro",
560
- "score": 0.8322147651006712,
561
- "score_ci_high": 0.8903225806451613,
562
- "score_ci_low": 0.7482933733536249,
563
  "num_of_instances": 85,
564
- "accuracy": 0.7294117647058823,
565
- "accuracy_ci_low": 0.6235294117647059,
566
- "accuracy_ci_high": 0.8117647058823529,
567
- "f1_micro": 0.8322147651006712,
568
- "f1_micro_ci_low": 0.7482933733536249,
569
- "f1_micro_ci_high": 0.8903225806451613
570
  },
571
- "score": 0.4176379005481825,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
- "f1_macro": 0.5251289790109961,
578
- "f1_cars": 0.735632183908046,
579
- "f1_windows x": 0.11594202898550725,
580
- "f1_atheism": 0.18181818181818182,
581
- "f1_religion": 0.14285714285714285,
582
- "f1_medicine": 0.825,
583
- "f1_christianity": 0.43243243243243246,
584
- "f1_for sale": 0.7142857142857143,
585
- "f1_computer graphics": 0.5,
586
- "f1_microsoft windows": 0.4507042253521127,
587
- "f1_middle east": 0.5641025641025641,
588
- "f1_motorcycles": 0.66,
589
- "f1_pc hardware": 0.4528301886792453,
590
- "f1_mac hardware": 0.5494505494505495,
591
- "f1_electronics": 0.574468085106383,
592
- "f1_guns": 0.2647058823529412,
593
- "f1_space": 0.7333333333333333,
594
- "f1_cryptography": 0.5897435897435898,
595
- "f1_baseball": 0.8380952380952381,
596
- "f1_hockey": 0.8403361344537815,
597
- "f1_politics": 0.3368421052631579,
598
- "f1_macro_ci_low": 0.4982372111894923,
599
- "f1_macro_ci_high": 0.5536034958738426,
600
  "score_name": "f1_micro",
601
- "score": 0.5513196480938416,
602
- "score_ci_high": 0.580897965540632,
603
- "score_ci_low": 0.519239825515335,
604
  "num_of_instances": 1000,
605
- "accuracy": 0.47,
606
- "accuracy_ci_low": 0.439,
607
- "accuracy_ci_high": 0.5,
608
- "f1_micro": 0.5513196480938416,
609
- "f1_micro_ci_low": 0.519239825515335,
610
- "f1_micro_ci_high": 0.580897965540632
611
  },
612
- "score": 0.5513196480938416,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
- "f1_macro": 0.598937790062507,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.800990916597853,
620
- "f1_money transfer or virtual currency or money service": 0.6530612244897959,
621
- "f1_student loan": 0.7741935483870968,
622
- "f1_credit card or prepaid card": 0.5154639175257731,
623
- "f1_debt collection": 0.5394736842105263,
624
- "f1_payday loan or title loan or personal loan": 0.0,
625
- "f1_vehicle loan or lease": 0.6206896551724138,
626
- "f1_mortgage": 0.8,
627
- "f1_checking or savings account": 0.6865671641791045,
628
- "f1_macro_ci_low": 0.5590827549101115,
629
- "f1_macro_ci_high": 0.6436069882866612,
630
  "score_name": "f1_micro",
631
- "score": 0.7434435575826682,
632
- "score_ci_high": 0.771404250423866,
633
- "score_ci_low": 0.719193914944442,
634
  "num_of_instances": 1000,
635
- "accuracy": 0.652,
636
- "accuracy_ci_low": 0.623,
637
- "accuracy_ci_high": 0.684,
638
- "f1_micro": 0.7434435575826682,
639
- "f1_micro_ci_low": 0.719193914944442,
640
- "f1_micro_ci_high": 0.771404250423866
641
  },
642
  "cfpb_product_watsonx": {
643
- "f1_macro": 0.645506649922517,
644
- "f1_mortgages and loans": 0.7393939393939394,
645
- "f1_credit card": 0.7088607594936709,
646
- "f1_debt collection": 0.6268656716417911,
647
- "f1_retail banking": 0.4752475247524752,
648
- "f1_credit reporting": 0.6771653543307087,
649
- "f1_macro_ci_low": 0.6022236233880557,
650
- "f1_macro_ci_high": 0.6875999406252342,
651
  "score_name": "f1_micro",
652
- "score": 0.6598407281001137,
653
- "score_ci_high": 0.6994441194697146,
654
- "score_ci_low": 0.6181710190063682,
655
  "num_of_instances": 500,
656
- "accuracy": 0.58,
657
- "accuracy_ci_low": 0.5369658568845046,
658
- "accuracy_ci_high": 0.624,
659
- "f1_micro": 0.6598407281001137,
660
- "f1_micro_ci_low": 0.6181710190063682,
661
- "f1_micro_ci_high": 0.6994441194697146
662
  },
663
- "score": 0.701642142841391,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
- "execution_accuracy": 0.111,
671
- "program_accuracy": 0.124,
672
- "score": 0.124,
673
  "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.092,
675
- "execution_accuracy_ci_high": 0.132,
676
- "program_accuracy_ci_low": 0.104,
677
- "program_accuracy_ci_high": 0.145,
678
- "score_ci_low": 0.104,
679
- "score_ci_high": 0.145
680
  },
681
- "score": 0.124,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
- "precision": 0.25881567594573107,
688
- "recall": 0.634151229634738,
689
- "f1": 0.31118943159428636,
690
- "precision_ci_low": 0.2427503435734933,
691
- "precision_ci_high": 0.2766316249638964,
692
- "recall_ci_low": 0.6185798223025679,
693
- "recall_ci_high": 0.6499187341273587,
694
- "f1_ci_low": 0.2955441533424316,
695
- "f1_ci_high": 0.32732829081701914,
696
  "score_name": "f1",
697
- "score": 0.31118943159428636,
698
- "score_ci_high": 0.32732829081701914,
699
- "score_ci_low": 0.2955441533424316,
700
  "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.5709441083172957,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6812374199926853,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5089003006368875,
704
- "faithfullness_f1_token_overlap": 0.31344518499715596,
705
- "faithfullness_recall_token_overlap": 0.2668468802457335,
706
- "faithfullness_precision_token_overlap": 0.47679019585643934,
707
- "correctness_f1_token_overlap": 0.31118943159428636,
708
- "correctness_recall_token_overlap": 0.634151229634738,
709
- "correctness_precision_token_overlap": 0.25881567594573107
710
  },
711
- "score": 0.31118943159428636,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
- "accuracy": 0.538,
718
- "accuracy_ci_low": 0.505,
719
- "accuracy_ci_high": 0.57,
720
  "score_name": "accuracy",
721
- "score": 0.538,
722
- "score_ci_high": 0.57,
723
- "score_ci_low": 0.505,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
- "accuracy": 0.87,
728
- "accuracy_ci_low": 0.84,
729
- "accuracy_ci_high": 0.898,
730
  "score_name": "accuracy",
731
- "score": 0.87,
732
- "score_ci_high": 0.898,
733
- "score_ci_low": 0.84,
734
  "num_of_instances": 500
735
  },
736
- "score": 0.704,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
  "severity_critical": 3.0,
743
- "severity_high": 0.0,
744
- "severity_medium": 2.0,
745
- "severity_low": 95.0,
746
- "category_harmful_info": 0.8016373084946758,
747
- "category_deception": 0.8822153681757474,
748
- "category_explicit_content": 0.832710394654611,
749
- "category_violence": 0.7825738151986448,
750
- "category_discrimination": 0.8220807708918102,
751
- "category_substance_abuse": 0.826748459537824,
752
- "category_pii": 0.7723372398565213,
753
- "safety": 0.809054309744388,
754
- "safety_ci_low": 0.7772938002229317,
755
- "safety_ci_high": 0.8361201240729302,
756
  "score_name": "safety",
757
- "score": 0.809054309744388,
758
- "score_ci_high": 0.8361201240729302,
759
- "score_ci_low": 0.7772938002229317,
760
  "num_of_instances": 100
761
  },
762
- "score": 0.809054309744388,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
- "rouge1": 0.389351160643424,
770
- "rougeL": 0.26506148266380164,
771
- "score": 0.26506148266380164,
 
772
  "score_name": "rougeL",
773
- "rougeLsum": 0.3329771315874926,
774
- "rouge2": 0.18480811116641105,
775
- "rouge1_ci_low": 0.3793704942651626,
776
- "rouge1_ci_high": 0.3991145148139802,
777
- "rougeL_ci_low": 0.2574910713981229,
778
- "rougeL_ci_high": 0.2723394364118134,
779
- "score_ci_low": 0.2574910713981229,
780
- "score_ci_high": 0.2723394364118134,
781
- "rougeLsum_ci_low": 0.32368402136971663,
782
- "rougeLsum_ci_high": 0.3419744690065713,
783
- "rouge2_ci_low": 0.17799828534417894,
784
- "rouge2_ci_high": 0.19189115183254815
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
- "rouge1": 0.11298250338252126,
789
- "rougeL": 0.08177611720945774,
790
- "score": 0.08177611720945774,
 
791
  "score_name": "rougeL",
792
- "rougeLsum": 0.0950863205219803,
793
- "rouge2": 0.015717793961971215,
794
- "rouge1_ci_low": 0.10712881957329778,
795
- "rouge1_ci_high": 0.11773128033523193,
796
- "rougeL_ci_low": 0.07786560263494875,
797
- "rougeL_ci_high": 0.08512540575510288,
798
- "score_ci_low": 0.07786560263494875,
799
- "score_ci_high": 0.08512540575510288,
800
- "rougeLsum_ci_low": 0.09078789141905777,
801
- "rougeLsum_ci_high": 0.09900306122637068,
802
- "rouge2_ci_low": 0.013982816599457718,
803
- "rouge2_ci_high": 0.0176417953678172
804
  },
805
- "score": 0.17341879993662968,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
@@ -810,473 +810,473 @@
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
- 1335,
814
- 792,
815
- 517,
816
- 343
817
  ],
818
  "totals": [
819
- 5784,
820
- 5718,
821
- 5652,
822
- 5586
823
  ],
824
  "precisions": [
825
- 0.2308091286307054,
826
- 0.1385099685204617,
827
- 0.09147204529370134,
828
- 0.06140350877192983
829
  ],
830
  "bp": 1.0,
831
- "sys_len": 5784,
832
  "ref_len": 1734,
833
- "sacrebleu": 0.11575876843383869,
834
- "score": 0.11575876843383869,
835
  "score_name": "sacrebleu",
836
- "score_ci_low": 0.09677258086487982,
837
- "score_ci_high": 0.14198802557654883,
838
- "sacrebleu_ci_low": 0.09677258086487982,
839
- "sacrebleu_ci_high": 0.14198802557654883
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
- 1356,
845
- 783,
846
- 496,
847
- 322
848
  ],
849
  "totals": [
850
- 6913,
851
- 6847,
852
- 6781,
853
- 6715
854
  ],
855
  "precisions": [
856
- 0.19615217705771734,
857
- 0.11435665254856142,
858
- 0.07314555375313375,
859
- 0.047952345495160094
860
  ],
861
  "bp": 1.0,
862
- "sys_len": 6913,
863
  "ref_len": 1734,
864
- "sacrebleu": 0.09418095467735833,
865
- "score": 0.09418095467735833,
866
  "score_name": "sacrebleu",
867
- "score_ci_low": 0.07855318506180965,
868
- "score_ci_high": 0.11463100599546205,
869
- "sacrebleu_ci_low": 0.07855318506180965,
870
- "sacrebleu_ci_high": 0.11463100599546205
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
- 767,
876
- 320,
877
- 153,
878
- 74
879
  ],
880
  "totals": [
881
- 8433,
882
- 8367,
883
- 8301,
884
- 8235
885
  ],
886
  "precisions": [
887
- 0.09095221154986363,
888
- 0.03824548822756066,
889
- 0.018431514275388507,
890
- 0.008986035215543413
891
  ],
892
  "bp": 1.0,
893
- "sys_len": 8433,
894
  "ref_len": 1589,
895
- "sacrebleu": 0.027550573998889032,
896
- "score": 0.027550573998889032,
897
  "score_name": "sacrebleu",
898
- "score_ci_low": 0.021208555129186346,
899
- "score_ci_high": 0.03586774827340232,
900
- "sacrebleu_ci_low": 0.021208555129186346,
901
- "sacrebleu_ci_high": 0.03586774827340232
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
- 1257,
907
- 721,
908
- 459,
909
- 305
910
  ],
911
  "totals": [
912
- 5818,
913
- 5752,
914
- 5686,
915
- 5620
916
  ],
917
  "precisions": [
918
- 0.2160536266758336,
919
- 0.12534770514603616,
920
- 0.08072458670418571,
921
- 0.05427046263345196
922
  ],
923
- "bp": 1.0,
924
- "sys_len": 5818,
925
  "ref_len": 1835,
926
- "sacrebleu": 0.10436666970008357,
927
- "score": 0.10436666970008357,
928
  "score_name": "sacrebleu",
929
- "score_ci_low": 0.08345328311475483,
930
- "score_ci_high": 0.13560613265358804,
931
- "sacrebleu_ci_low": 0.08345328311475483,
932
- "sacrebleu_ci_high": 0.13560613265358804
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
- 1584,
938
- 1138,
939
- 882,
940
- 702
941
  ],
942
  "totals": [
943
- 5212,
944
- 5146,
945
- 5080,
946
- 5014
947
  ],
948
  "precisions": [
949
- 0.3039140445126631,
950
- 0.22114263505635445,
951
- 0.17362204724409447,
952
- 0.14000797766254489
953
  ],
954
  "bp": 1.0,
955
- "sys_len": 5212,
956
  "ref_len": 2068,
957
- "sacrebleu": 0.20104590260613545,
958
- "score": 0.20104590260613545,
959
  "score_name": "sacrebleu",
960
- "score_ci_low": 0.15816587969091414,
961
- "score_ci_high": 0.23752616478719799,
962
- "sacrebleu_ci_low": 0.15816587969091414,
963
- "sacrebleu_ci_high": 0.23752616478719799
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
- 1408,
969
- 697,
970
- 390,
971
- 222
972
  ],
973
  "totals": [
974
- 7618,
975
- 7552,
976
- 7486,
977
- 7420
978
  ],
979
  "precisions": [
980
- 0.18482541349435547,
981
- 0.09229343220338983,
982
- 0.052097248196633715,
983
- 0.029919137466307276
984
  ],
985
  "bp": 1.0,
986
- "sys_len": 7618,
987
  "ref_len": 2235,
988
- "sacrebleu": 0.071808207221539,
989
- "score": 0.071808207221539,
990
  "score_name": "sacrebleu",
991
- "score_ci_low": 0.06067958452990026,
992
- "score_ci_high": 0.08476974354932754,
993
- "sacrebleu_ci_low": 0.06067958452990026,
994
- "sacrebleu_ci_high": 0.08476974354932754
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
- 1475,
1000
- 1028,
1001
- 769,
1002
- 585
1003
  ],
1004
  "totals": [
1005
- 5487,
1006
- 5421,
1007
- 5355,
1008
- 5289
1009
  ],
1010
  "precisions": [
1011
- 0.26881720430107525,
1012
- 0.1896329090573695,
1013
- 0.14360410830999068,
1014
- 0.1106069200226886
1015
  ],
1016
  "bp": 1.0,
1017
- "sys_len": 5487,
1018
  "ref_len": 1916,
1019
- "sacrebleu": 0.16868636617394397,
1020
- "score": 0.16868636617394397,
1021
  "score_name": "sacrebleu",
1022
- "score_ci_low": 0.1449996381584022,
1023
- "score_ci_high": 0.19615430198730424,
1024
- "sacrebleu_ci_low": 0.1449996381584022,
1025
- "sacrebleu_ci_high": 0.19615430198730424
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
- 1326,
1031
- 841,
1032
- 567,
1033
- 389
1034
  ],
1035
  "totals": [
1036
- 4837,
1037
- 4771,
1038
- 4705,
1039
- 4639
1040
  ],
1041
  "precisions": [
1042
- 0.2741368616911309,
1043
- 0.17627331796269124,
1044
- 0.12051009564293304,
1045
- 0.0838542789394266
1046
  ],
1047
  "bp": 1.0,
1048
- "sys_len": 4837,
1049
  "ref_len": 1949,
1050
- "sacrebleu": 0.14865368140396532,
1051
- "score": 0.14865368140396532,
1052
  "score_name": "sacrebleu",
1053
- "score_ci_low": 0.12285617051153769,
1054
- "score_ci_high": 0.20722793764019112,
1055
- "sacrebleu_ci_low": 0.12285617051153769,
1056
- "sacrebleu_ci_high": 0.20722793764019112
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
- 1381,
1062
- 759,
1063
- 459,
1064
- 281
1065
  ],
1066
  "totals": [
1067
- 6405,
1068
- 6339,
1069
- 6273,
1070
- 6207
1071
  ],
1072
  "precisions": [
1073
- 0.21561280249804843,
1074
- 0.11973497397065783,
1075
- 0.07317073170731708,
1076
- 0.04527146769776059
1077
  ],
1078
- "bp": 1.0,
1079
- "sys_len": 6405,
1080
  "ref_len": 2098,
1081
- "sacrebleu": 0.09616441307249027,
1082
- "score": 0.09616441307249027,
1083
  "score_name": "sacrebleu",
1084
- "score_ci_low": 0.07930542472032699,
1085
- "score_ci_high": 0.11825351259228321,
1086
- "sacrebleu_ci_low": 0.07930542472032699,
1087
- "sacrebleu_ci_high": 0.11825351259228321
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
- 1391,
1093
- 930,
1094
- 644,
1095
- 448
1096
  ],
1097
  "totals": [
1098
- 5836,
1099
- 5770,
1100
- 5704,
1101
- 5638
1102
  ],
1103
  "precisions": [
1104
- 0.23834818368745714,
1105
- 0.1611785095320624,
1106
- 0.11290322580645162,
1107
- 0.07946080170273147
1108
  ],
1109
  "bp": 1.0,
1110
- "sys_len": 5836,
1111
  "ref_len": 1734,
1112
- "sacrebleu": 0.13625252798661136,
1113
- "score": 0.13625252798661136,
1114
  "score_name": "sacrebleu",
1115
- "score_ci_low": 0.11732338312550004,
1116
- "score_ci_high": 0.15953611321490402,
1117
- "sacrebleu_ci_low": 0.11732338312550004,
1118
- "sacrebleu_ci_high": 0.15953611321490402
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
- 1250,
1124
- 617,
1125
- 343,
1126
- 187
1127
  ],
1128
  "totals": [
1129
- 6443,
1130
- 6377,
1131
- 6311,
1132
- 6245
1133
  ],
1134
  "precisions": [
1135
- 0.1940090020176936,
1136
- 0.09675395954210445,
1137
- 0.054349548407542386,
1138
- 0.029943955164131304
1139
  ],
1140
  "bp": 1.0,
1141
- "sys_len": 6443,
1142
  "ref_len": 1734,
1143
- "sacrebleu": 0.07434451508550546,
1144
- "score": 0.07434451508550546,
1145
  "score_name": "sacrebleu",
1146
- "score_ci_low": 0.0644058696100964,
1147
- "score_ci_high": 0.08685819903559298,
1148
- "sacrebleu_ci_low": 0.0644058696100964,
1149
- "sacrebleu_ci_high": 0.08685819903559298
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
- 1162,
1155
- 546,
1156
- 295,
1157
- 168
1158
  ],
1159
  "totals": [
1160
- 5914,
1161
- 5848,
1162
- 5782,
1163
- 5716
1164
  ],
1165
  "precisions": [
1166
- 0.19648292188028407,
1167
- 0.09336525307797537,
1168
- 0.05102040816326531,
1169
- 0.02939118264520644
1170
  ],
1171
  "bp": 1.0,
1172
- "sys_len": 5914,
1173
  "ref_len": 1734,
1174
- "sacrebleu": 0.07242154227292893,
1175
- "score": 0.07242154227292893,
1176
  "score_name": "sacrebleu",
1177
- "score_ci_low": 0.06291349695976677,
1178
- "score_ci_high": 0.08531939881489306,
1179
- "sacrebleu_ci_low": 0.06291349695976677,
1180
- "sacrebleu_ci_high": 0.08531939881489306
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
- 1417,
1186
- 974,
1187
- 723,
1188
- 547
1189
  ],
1190
  "totals": [
1191
- 6392,
1192
- 6326,
1193
- 6260,
1194
- 6194
1195
  ],
1196
  "precisions": [
1197
- 0.22168335419274093,
1198
- 0.15396775213404995,
1199
- 0.11549520766773164,
1200
- 0.08831126896997095
1201
  ],
1202
  "bp": 1.0,
1203
- "sys_len": 6392,
1204
  "ref_len": 1734,
1205
- "sacrebleu": 0.13659529350611513,
1206
- "score": 0.13659529350611513,
1207
  "score_name": "sacrebleu",
1208
- "score_ci_low": 0.11528113587288888,
1209
- "score_ci_high": 0.15836557569065857,
1210
- "sacrebleu_ci_low": 0.11528113587288888,
1211
- "sacrebleu_ci_high": 0.15836557569065857
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
- 1395,
1217
- 948,
1218
- 686,
1219
- 506
1220
  ],
1221
  "totals": [
1222
- 5680,
1223
- 5614,
1224
- 5548,
1225
- 5482
1226
  ],
1227
  "precisions": [
1228
- 0.24559859154929575,
1229
- 0.16886355539722125,
1230
- 0.12364816149963952,
1231
- 0.09230207953301715
1232
  ],
1233
  "bp": 1.0,
1234
- "sys_len": 5680,
1235
  "ref_len": 1734,
1236
- "sacrebleu": 0.14749939449155924,
1237
- "score": 0.14749939449155924,
1238
  "score_name": "sacrebleu",
1239
- "score_ci_low": 0.1292139766827155,
1240
- "score_ci_high": 0.1819243976706418,
1241
- "sacrebleu_ci_low": 0.1292139766827155,
1242
- "sacrebleu_ci_high": 0.1819243976706418
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
- 1300,
1248
- 712,
1249
- 450,
1250
- 290
1251
  ],
1252
  "totals": [
1253
- 6166,
1254
- 6100,
1255
- 6034,
1256
- 5968
1257
  ],
1258
  "precisions": [
1259
- 0.21083360363282516,
1260
- 0.11672131147540984,
1261
- 0.07457739476300962,
1262
- 0.04859249329758713
1263
  ],
1264
  "bp": 1.0,
1265
- "sys_len": 6166,
1266
  "ref_len": 1734,
1267
- "sacrebleu": 0.09717769541368253,
1268
- "score": 0.09717769541368253,
1269
  "score_name": "sacrebleu",
1270
- "score_ci_low": 0.08585452687395417,
1271
- "score_ci_high": 0.11046953561336194,
1272
- "sacrebleu_ci_low": 0.08585452687395417,
1273
- "sacrebleu_ci_high": 0.11046953561336194
1274
  },
1275
- "score": 0.11283376706964308,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
- "score": 0.4523639482757903,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
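Note on the per-language translation entries above: each mt_flores_101_* block stores the BLEU sufficient statistics (matching n-gram "counts", "totals", the derived "precisions", and the brevity penalty "bp"), and the reported "sacrebleu"/"score" is the brevity penalty times the geometric mean of the four n-gram precisions. A minimal sketch, assuming the standard corpus-BLEU composition used by sacrebleu; the numbers are copied from the "mt_flores_101_spa_eng" block above, and the snippet itself is illustrative, not part of the results files or of unitxt:

import math

# 1-gram to 4-gram precisions and brevity penalty, copied from the
# "mt_flores_101_spa_eng" entry of the results file above.
precisions = [0.21083360363282516, 0.11672131147540984,
              0.07457739476300962, 0.04859249329758713]
bp = 1.0  # brevity penalty (sys_len >= ref_len here, so no penalty)

# BLEU = bp * exp(mean(log p_n)) over the four n-gram orders.
bleu = bp * math.exp(sum(math.log(p) for p in precisions) / len(precisions))
print(round(bleu, 6))  # ~0.097178, matching the stored "sacrebleu"/"score" value
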
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-23T19:33:07.872441Z",
4
  "command_line_invocation": [
5
  "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
 
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/mistralai/mistral-large,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
26
  "num_fewshots": null,
27
  "limit": null,
28
  "batch_size": 8,
29
+ "model": "watsonx/mistralai/mistral-large",
30
  "model_args": {
31
  "max_tokens": 256
32
  },
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
46
  "python_version": "3.10.18",
47
  "system": "Linux",
48
  "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
 
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.8666666666666667,
180
+ "accuracy_ci_low": 0.7888888888888889,
181
+ "accuracy_ci_high": 0.9222222222222223,
182
  "score_name": "accuracy",
183
+ "score": 0.8666666666666667,
184
+ "score_ci_high": 0.9222222222222223,
185
+ "score_ci_low": 0.7888888888888889,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.8888888888888888,
190
+ "accuracy_ci_low": 0.8111111111111111,
191
+ "accuracy_ci_high": 0.9444444444444444,
192
  "score_name": "accuracy",
193
+ "score": 0.8888888888888888,
194
+ "score_ci_high": 0.9444444444444444,
195
+ "score_ci_low": 0.8111111111111111,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
199
+ "accuracy": 0.9888888888888889,
200
+ "accuracy_ci_low": 0.9283857779145438,
201
+ "accuracy_ci_high": 1.0,
202
  "score_name": "accuracy",
203
+ "score": 0.9888888888888889,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 0.9283857779145438,
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.8666666666666667,
210
+ "accuracy_ci_low": 0.7858277377703305,
211
+ "accuracy_ci_high": 0.9333333333333333,
212
  "score_name": "accuracy",
213
+ "score": 0.8666666666666667,
214
+ "score_ci_high": 0.9333333333333333,
215
+ "score_ci_low": 0.7858277377703305,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.9333333333333333,
220
+ "accuracy_ci_low": 0.8666666666666667,
221
+ "accuracy_ci_high": 0.9777777777777777,
222
  "score_name": "accuracy",
223
+ "score": 0.9333333333333333,
224
+ "score_ci_high": 0.9777777777777777,
225
+ "score_ci_low": 0.8666666666666667,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9555555555555556,
230
+ "accuracy_ci_low": 0.9,
231
+ "accuracy_ci_high": 0.9888888888888889,
232
  "score_name": "accuracy",
233
+ "score": 0.9555555555555556,
234
+ "score_ci_high": 0.9888888888888889,
235
+ "score_ci_low": 0.9,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.9111111111111111,
240
+ "accuracy_ci_low": 0.8333333333333334,
241
+ "accuracy_ci_high": 0.9555555555555556,
242
  "score_name": "accuracy",
243
+ "score": 0.9111111111111111,
244
+ "score_ci_high": 0.9555555555555556,
245
+ "score_ci_low": 0.8333333333333334,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.8666666666666667,
250
+ "accuracy_ci_low": 0.7888888888888889,
251
+ "accuracy_ci_high": 0.9222222222222223,
252
  "score_name": "accuracy",
253
+ "score": 0.8666666666666667,
254
+ "score_ci_high": 0.9222222222222223,
255
+ "score_ci_low": 0.7888888888888889,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.8666666666666667,
260
+ "accuracy_ci_low": 0.788388746882511,
261
+ "accuracy_ci_high": 0.9222222222222223,
262
  "score_name": "accuracy",
263
+ "score": 0.8666666666666667,
264
+ "score_ci_high": 0.9222222222222223,
265
+ "score_ci_low": 0.788388746882511,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
  "accuracy": 0.8888888888888888,
270
+ "accuracy_ci_low": 0.8111111111111111,
271
  "accuracy_ci_high": 0.9444444444444444,
272
  "score_name": "accuracy",
273
  "score": 0.8888888888888888,
274
  "score_ci_high": 0.9444444444444444,
275
+ "score_ci_low": 0.8111111111111111,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.9222222222222223,
280
+ "accuracy_ci_low": 0.8555555555555555,
281
+ "accuracy_ci_high": 0.9666666666666667,
282
  "score_name": "accuracy",
283
+ "score": 0.9222222222222223,
284
+ "score_ci_high": 0.9666666666666667,
285
+ "score_ci_low": 0.8555555555555555,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.9050505050505051,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
292
  "chatbot_abilities": {
293
  "arena_hard_generation_english_gpt_4_0314_reference": {
294
  "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.09158878504672897,
296
+ "score": 0.09158878504672897,
297
  "score_name": "llama_3_70b_instruct_template_arena_hard"
298
  },
299
+ "score": 0.09158878504672897,
300
  "score_name": "subsets_mean",
301
  "num_of_instances": 500
302
  },
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.16666666666666666,
307
+ "f1_Organization": 0.03252032520325203,
308
+ "f1_Location": 0.06666666666666667,
309
+ "f1_macro": 0.08861788617886178,
310
+ "recall_macro": 0.061454532512588166,
311
+ "precision_macro": 0.1605612378704432,
312
+ "in_classes_support": 0.2943548387096774,
313
+ "f1_micro": 0.055161544523246654,
314
+ "recall_micro": 0.06666666666666667,
315
+ "precision_micro": 0.04704301075268817,
316
+ "score": 0.055161544523246654,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.04056157487156359,
319
+ "score_ci_high": 0.07432669896850874,
320
+ "f1_micro_ci_low": 0.04056157487156359,
321
+ "f1_micro_ci_high": 0.07432669896850874
322
  },
323
+ "score": 0.055161544523246654,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.7323943661971831,
330
+ "accuracy_ci_low": 0.6197183098591549,
331
+ "accuracy_ci_high": 0.8309859154929577,
332
  "score_name": "accuracy",
333
+ "score": 0.7323943661971831,
334
+ "score_ci_high": 0.8309859154929577,
335
+ "score_ci_low": 0.6197183098591549,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
+ "accuracy": 0.39436619718309857,
340
+ "accuracy_ci_low": 0.28169014084507044,
341
+ "accuracy_ci_high": 0.5070422535211268,
342
  "score_name": "accuracy",
343
+ "score": 0.39436619718309857,
344
+ "score_ci_high": 0.5070422535211268,
345
+ "score_ci_low": 0.28169014084507044,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
  "accuracy": 0.23943661971830985,
350
  "accuracy_ci_low": 0.14084507042253522,
351
+ "accuracy_ci_high": 0.3380281690140845,
352
  "score_name": "accuracy",
353
  "score": 0.23943661971830985,
354
+ "score_ci_high": 0.3380281690140845,
355
  "score_ci_low": 0.14084507042253522,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.647887323943662,
360
+ "accuracy_ci_low": 0.5352112676056338,
361
+ "accuracy_ci_high": 0.7464788732394366,
362
  "score_name": "accuracy",
363
+ "score": 0.647887323943662,
364
+ "score_ci_high": 0.7464788732394366,
365
+ "score_ci_low": 0.5352112676056338,
366
  "num_of_instances": 71
367
  },
368
  "mmlu_pro_economics": {
369
+ "accuracy": 0.7323943661971831,
370
+ "accuracy_ci_low": 0.6056338028169014,
371
+ "accuracy_ci_high": 0.8309859154929577,
372
  "score_name": "accuracy",
373
+ "score": 0.7323943661971831,
374
+ "score_ci_high": 0.8309859154929577,
375
+ "score_ci_low": 0.6056338028169014,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.2676056338028169,
380
+ "accuracy_ci_low": 0.16901408450704225,
381
+ "accuracy_ci_high": 0.38028169014084506,
382
  "score_name": "accuracy",
383
+ "score": 0.2676056338028169,
384
+ "score_ci_high": 0.38028169014084506,
385
+ "score_ci_low": 0.16901408450704225,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.6056338028169014,
390
+ "accuracy_ci_low": 0.4788732394366197,
391
+ "accuracy_ci_high": 0.704225352112676,
392
  "score_name": "accuracy",
393
+ "score": 0.6056338028169014,
394
+ "score_ci_high": 0.704225352112676,
395
+ "score_ci_low": 0.4788732394366197,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.647887323943662,
400
+ "accuracy_ci_low": 0.5211267605633803,
401
+ "accuracy_ci_high": 0.7464788732394366,
402
  "score_name": "accuracy",
403
+ "score": 0.647887323943662,
404
+ "score_ci_high": 0.7464788732394366,
405
+ "score_ci_low": 0.5211267605633803,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.5211267605633803,
410
+ "accuracy_ci_low": 0.40913735882879854,
411
+ "accuracy_ci_high": 0.6338028169014085,
412
  "score_name": "accuracy",
413
+ "score": 0.5211267605633803,
414
+ "score_ci_high": 0.6338028169014085,
415
+ "score_ci_low": 0.40913735882879854,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.4225352112676056,
420
+ "accuracy_ci_low": 0.30985915492957744,
421
+ "accuracy_ci_high": 0.5275288557194965,
422
  "score_name": "accuracy",
423
+ "score": 0.4225352112676056,
424
+ "score_ci_high": 0.5275288557194965,
425
+ "score_ci_low": 0.30985915492957744,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.647887323943662,
430
+ "accuracy_ci_low": 0.5211267605633803,
431
+ "accuracy_ci_high": 0.7464788732394366,
432
  "score_name": "accuracy",
433
+ "score": 0.647887323943662,
434
+ "score_ci_high": 0.7464788732394366,
435
+ "score_ci_low": 0.5211267605633803,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.6619718309859155,
440
+ "accuracy_ci_low": 0.5492957746478874,
441
+ "accuracy_ci_high": 0.7746478873239436,
442
  "score_name": "accuracy",
443
+ "score": 0.6619718309859155,
444
+ "score_ci_high": 0.7746478873239436,
445
+ "score_ci_low": 0.5492957746478874,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.39436619718309857,
450
+ "accuracy_ci_low": 0.28169014084507044,
451
+ "accuracy_ci_high": 0.5070422535211268,
452
  "score_name": "accuracy",
453
+ "score": 0.39436619718309857,
454
+ "score_ci_high": 0.5070422535211268,
455
+ "score_ci_low": 0.28169014084507044,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.8028169014084507,
460
+ "accuracy_ci_low": 0.704225352112676,
461
+ "accuracy_ci_high": 0.8873239436619719,
462
  "score_name": "accuracy",
463
+ "score": 0.8028169014084507,
464
+ "score_ci_high": 0.8873239436619719,
465
+ "score_ci_low": 0.704225352112676,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.5513078470824949,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.30082491488530444,
475
+ "f1_suggestive": 0.2926829268292683,
476
+ "f1_generic": 0.3157894736842105,
477
+ "f1_fanciful": 0.2,
478
+ "f1_descriptive": 0.2608695652173913,
479
+ "f1_arbitrary": 0.43478260869565216,
480
+ "f1_macro_ci_low": 0.21165741181075054,
481
+ "f1_macro_ci_high": 0.4315719879768282,
482
  "score_name": "f1_micro",
483
+ "score": 0.30158730158730157,
484
+ "score_ci_high": 0.421875,
485
+ "score_ci_low": 0.2033898305084746,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.2235294117647059,
488
+ "accuracy_ci_low": 0.15294117647058825,
489
+ "accuracy_ci_high": 0.32721667655979375,
490
+ "f1_micro": 0.30158730158730157,
491
+ "f1_micro_ci_low": 0.2033898305084746,
492
+ "f1_micro_ci_high": 0.421875
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.2087664168882443,
496
+ "f1_no": 0.38578680203045684,
497
+ "f1_yes": 0.031746031746031744,
498
+ "f1_macro_ci_low": 0.16304347826086957,
499
+ "f1_macro_ci_high": 0.27128054977534694,
500
  "score_name": "f1_micro",
501
+ "score": 0.3,
502
+ "score_ci_high": 0.37342833232881084,
503
+ "score_ci_low": 0.22950819672131148,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.195,
506
+ "accuracy_ci_low": 0.145,
507
+ "accuracy_ci_high": 0.2511829758893259,
508
+ "f1_micro": 0.3,
509
+ "f1_micro_ci_low": 0.22950819672131148,
510
+ "f1_micro_ci_high": 0.37342833232881084
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.1326239897668469,
514
+ "f1_conclusion": 0.0,
515
+ "f1_decree": 0.14814814814814814,
516
+ "f1_issue": 0.05714285714285714,
517
+ "f1_analysis": 0.15,
518
+ "f1_facts": 0.06666666666666667,
519
+ "f1_procedural history": 0.25,
520
+ "f1_rule": 0.2564102564102564,
521
+ "f1_macro_ci_low": 0.0850100965627728,
522
+ "f1_macro_ci_high": 0.1996422423627835,
523
  "score_name": "f1_micro",
524
+ "score": 0.14345991561181434,
525
+ "score_ci_high": 0.2175732217573222,
526
+ "score_ci_low": 0.08928571428571429,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.085,
529
+ "accuracy_ci_low": 0.055,
530
+ "accuracy_ci_high": 0.135,
531
+ "f1_micro": 0.14345991561181434,
532
+ "f1_micro_ci_low": 0.08928571428571429,
533
+ "f1_micro_ci_high": 0.2175732217573222
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.27590718171176754,
537
+ "f1_yes": 0.3106796116504854,
538
+ "f1_no": 0.24113475177304963,
539
+ "f1_macro_ci_low": 0.2059942090622298,
540
+ "f1_macro_ci_high": 0.35594480291914055,
541
  "score_name": "f1_micro",
542
+ "score": 0.27049180327868855,
543
+ "score_ci_high": 0.3511987633583538,
544
+ "score_ci_low": 0.20259771606756347,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.165,
547
+ "accuracy_ci_low": 0.12,
548
+ "accuracy_ci_high": 0.225,
549
+ "f1_micro": 0.27049180327868855,
550
+ "f1_micro_ci_low": 0.20259771606756347,
551
+ "f1_micro_ci_high": 0.3511987633583538
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.7697368421052632,
555
+ "f1_yes": 0.75,
556
+ "f1_no": 0.7894736842105263,
557
+ "f1_macro_ci_low": 0.6817007087256215,
558
+ "f1_macro_ci_high": 0.8427704260296438,
559
  "score_name": "f1_micro",
560
+ "score": 0.7714285714285715,
561
+ "score_ci_high": 0.8435374149659864,
562
+ "score_ci_low": 0.6821705426356589,
563
  "num_of_instances": 85,
564
+ "accuracy": 0.6352941176470588,
565
+ "accuracy_ci_low": 0.5176470588235295,
566
+ "accuracy_ci_high": 0.7294117647058823,
567
+ "f1_micro": 0.7714285714285715,
568
+ "f1_micro_ci_low": 0.6821705426356589,
569
+ "f1_micro_ci_high": 0.8435374149659864
570
  },
571
+ "score": 0.3573935183812752,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.31185856604704804,
578
+ "f1_cars": 0.5294117647058824,
579
+ "f1_windows x": 0.0,
580
+ "f1_computer graphics": 0.21875,
581
+ "f1_atheism": 0.0,
582
+ "f1_religion": 0.1935483870967742,
583
+ "f1_medicine": 0.4444444444444444,
584
+ "f1_christianity": 0.07272727272727272,
585
+ "f1_microsoft windows": 0.13793103448275862,
586
+ "f1_middle east": 0.2692307692307692,
587
+ "f1_motorcycles": 0.41975308641975306,
588
+ "f1_pc hardware": 0.47191011235955055,
589
+ "f1_mac hardware": 0.4943820224719101,
590
+ "f1_for sale": 0.2608695652173913,
591
+ "f1_guns": 0.0784313725490196,
592
+ "f1_space": 0.4594594594594595,
593
+ "f1_cryptography": 0.3389830508474576,
594
+ "f1_baseball": 0.46153846153846156,
595
+ "f1_hockey": 0.5918367346938775,
596
+ "f1_politics": 0.37142857142857144,
597
+ "f1_electronics": 0.4225352112676056,
598
+ "f1_macro_ci_low": 0.2856078509025084,
599
+ "f1_macro_ci_high": 0.3426020462142722,
600
  "score_name": "f1_micro",
601
+ "score": 0.3433734939759036,
602
+ "score_ci_high": 0.3755990938286412,
603
+ "score_ci_low": 0.3096235116477192,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.228,
606
+ "accuracy_ci_low": 0.202,
607
+ "accuracy_ci_high": 0.253,
608
+ "f1_micro": 0.3433734939759036,
609
+ "f1_micro_ci_low": 0.3096235116477192,
610
+ "f1_micro_ci_high": 0.3755990938286412
611
  },
612
+ "score": 0.3433734939759036,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.6519421467580517,
619
+ "f1_student loan": 0.75,
620
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.7592592592592593,
621
+ "f1_debt collection": 0.5138888888888888,
622
+ "f1_checking or savings account": 0.7073170731707317,
623
+ "f1_mortgage": 0.7931034482758621,
624
+ "f1_payday loan or title loan or personal loan": 0.4444444444444444,
625
+ "f1_credit card or prepaid card": 0.6727272727272727,
626
+ "f1_money transfer or virtual currency or money service": 0.6341463414634146,
627
+ "f1_vehicle loan or lease": 0.5925925925925926,
628
+ "f1_macro_ci_low": 0.5871374382389457,
629
+ "f1_macro_ci_high": 0.7213495817777442,
630
  "score_name": "f1_micro",
631
+ "score": 0.7237076648841355,
632
+ "score_ci_high": 0.7514570299103845,
633
+ "score_ci_low": 0.6955584945084361,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.609,
636
+ "accuracy_ci_low": 0.577,
637
+ "accuracy_ci_high": 0.641,
638
+ "f1_micro": 0.7237076648841355,
639
+ "f1_micro_ci_low": 0.6955584945084361,
640
+ "f1_micro_ci_high": 0.7514570299103845
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.6498825672130026,
644
+ "f1_mortgages and loans": 0.6711409395973155,
645
+ "f1_credit card": 0.6853146853146853,
646
+ "f1_debt collection": 0.56,
647
+ "f1_credit reporting": 0.7279151943462897,
648
+ "f1_retail banking": 0.6050420168067226,
649
+ "f1_macro_ci_low": 0.6099650997428573,
650
+ "f1_macro_ci_high": 0.6950683737354554,
651
  "score_name": "f1_micro",
652
+ "score": 0.6605293440736478,
653
+ "score_ci_high": 0.7005417538024762,
654
+ "score_ci_low": 0.6186622377558174,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.574,
657
+ "accuracy_ci_low": 0.534,
658
+ "accuracy_ci_high": 0.618,
659
+ "f1_micro": 0.6605293440736478,
660
+ "f1_micro_ci_low": 0.6186622377558174,
661
+ "f1_micro_ci_high": 0.7005417538024762
662
  },
663
+ "score": 0.6921185044788917,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "execution_accuracy": 0.113,
671
+ "program_accuracy": 0.114,
672
+ "score": 0.114,
673
  "score_name": "program_accuracy",
674
+ "execution_accuracy_ci_low": 0.095,
675
+ "execution_accuracy_ci_high": 0.135,
676
+ "program_accuracy_ci_low": 0.096,
677
+ "program_accuracy_ci_high": 0.135,
678
+ "score_ci_low": 0.096,
679
+ "score_ci_high": 0.135
680
  },
681
+ "score": 0.114,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.3193234609846695,
688
+ "recall": 0.6156352660927287,
689
+ "f1": 0.3579933019872818,
690
+ "precision_ci_low": 0.2995158081508201,
691
+ "precision_ci_high": 0.34026768128986357,
692
+ "recall_ci_low": 0.599899034156362,
693
+ "recall_ci_high": 0.6313695528855681,
694
+ "f1_ci_low": 0.34026079062391323,
695
+ "f1_ci_high": 0.37577002560346623,
696
  "score_name": "f1",
697
+ "score": 0.3579933019872818,
698
+ "score_ci_high": 0.37577002560346623,
699
+ "score_ci_low": 0.34026079062391323,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6041086231172085,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6908169340590636,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5559258947273096,
704
+ "faithfullness_f1_token_overlap": 0.3085026884811217,
705
+ "faithfullness_recall_token_overlap": 0.24647156332743023,
706
+ "faithfullness_precision_token_overlap": 0.536789234125785,
707
+ "correctness_f1_token_overlap": 0.3579933019872818,
708
+ "correctness_recall_token_overlap": 0.6156352660927287,
709
+ "correctness_precision_token_overlap": 0.3193234609846695
710
  },
711
+ "score": 0.3579933019872818,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.57,
718
+ "accuracy_ci_low": 0.537,
719
+ "accuracy_ci_high": 0.599,
720
  "score_name": "accuracy",
721
+ "score": 0.57,
722
+ "score_ci_high": 0.599,
723
+ "score_ci_low": 0.537,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.924,
728
+ "accuracy_ci_low": 0.8978500821223476,
729
+ "accuracy_ci_high": 0.944,
730
  "score_name": "accuracy",
731
+ "score": 0.924,
732
+ "score_ci_high": 0.944,
733
+ "score_ci_low": 0.8978500821223476,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.747,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
740
  "safety": {
741
  "attaq_500": {
742
  "severity_critical": 3.0,
743
+ "severity_high": 1.0,
744
+ "severity_medium": 3.0,
745
+ "severity_low": 93.0,
746
+ "category_harmful_info": 0.7688285382612356,
747
+ "category_deception": 0.8504795281033058,
748
+ "category_explicit_content": 0.8499560091230604,
749
+ "category_violence": 0.8008088633891136,
750
+ "category_discrimination": 0.7961234071898081,
751
+ "category_substance_abuse": 0.8387069291538662,
752
+ "category_pii": 0.8280106396586807,
753
+ "safety": 0.8051073061449763,
754
+ "safety_ci_low": 0.7662176976956901,
755
+ "safety_ci_high": 0.8348793957598559,
756
  "score_name": "safety",
757
+ "score": 0.8051073061449763,
758
+ "score_ci_high": 0.8348793957598559,
759
+ "score_ci_low": 0.7662176976956901,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.8051073061449763,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rouge1": 0.4079437392329112,
770
+ "rougeLsum": 0.352934731899625,
771
+ "rougeL": 0.2881192661001498,
772
+ "score": 0.2881192661001498,
773
  "score_name": "rougeL",
774
+ "rouge2": 0.20309268400418845,
775
+ "rouge1_ci_low": 0.39815436123221776,
776
+ "rouge1_ci_high": 0.41769481108434164,
777
+ "rougeLsum_ci_low": 0.3428414762936942,
778
+ "rougeLsum_ci_high": 0.36147234302505843,
779
+ "rougeL_ci_low": 0.2810508281662768,
780
+ "rougeL_ci_high": 0.2958753092187963,
781
+ "score_ci_low": 0.2810508281662768,
782
+ "score_ci_high": 0.2958753092187963,
783
+ "rouge2_ci_low": 0.19601529597372655,
784
+ "rouge2_ci_high": 0.21114250528228737
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rouge1": 0.12777796269592456,
789
+ "rougeLsum": 0.10603826262531557,
790
+ "rougeL": 0.09179328413499686,
791
+ "score": 0.09179328413499686,
792
  "score_name": "rougeL",
793
+ "rouge2": 0.018851084572764187,
794
+ "rouge1_ci_low": 0.12168881499661288,
795
+ "rouge1_ci_high": 0.1332671550509936,
796
+ "rougeLsum_ci_low": 0.10116850286523955,
797
+ "rougeLsum_ci_high": 0.1106709838674053,
798
+ "rougeL_ci_low": 0.08725116318521317,
799
+ "rougeL_ci_high": 0.09575800533068254,
800
+ "score_ci_low": 0.08725116318521317,
801
+ "score_ci_high": 0.09575800533068254,
802
+ "rouge2_ci_low": 0.01662623690565808,
803
+ "rouge2_ci_high": 0.020972697895929062
804
  },
805
+ "score": 0.18995627511757335,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1289,
814
+ 858,
815
+ 605,
816
+ 439
817
  ],
818
  "totals": [
819
+ 1947,
820
+ 1881,
821
+ 1815,
822
+ 1749
823
  ],
824
  "precisions": [
825
+ 0.6620441705187469,
826
+ 0.456140350877193,
827
+ 0.33333333333333337,
828
+ 0.2510005717552887
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 1947,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.39868943613707586,
834
+ "score": 0.39868943613707586,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.34684429082723056,
837
+ "score_ci_high": 0.4568678775074209,
838
+ "sacrebleu_ci_low": 0.34684429082723056,
839
+ "sacrebleu_ci_high": 0.4568678775074209
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1323,
845
+ 887,
846
+ 629,
847
+ 444
848
  ],
849
  "totals": [
850
+ 2483,
851
+ 2417,
852
+ 2351,
853
+ 2285
854
  ],
855
  "precisions": [
856
+ 0.5328231977446637,
857
+ 0.36698386429458,
858
+ 0.26754572522330927,
859
+ 0.19431072210065647
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 2483,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.3175274085826544,
865
+ "score": 0.3175274085826544,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.271260768895482,
868
+ "score_ci_high": 0.365445678313604,
869
+ "sacrebleu_ci_low": 0.271260768895482,
870
+ "sacrebleu_ci_high": 0.365445678313604
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
+ 940,
876
+ 521,
877
+ 315,
878
+ 191
879
  ],
880
  "totals": [
881
+ 1688,
882
+ 1622,
883
+ 1556,
884
+ 1490
885
  ],
886
  "precisions": [
887
+ 0.556872037914692,
888
+ 0.3212083847102343,
889
+ 0.20244215938303342,
890
+ 0.12818791946308725
891
  ],
892
  "bp": 1.0,
893
+ "sys_len": 1688,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.2610192792824636,
896
+ "score": 0.2610192792824636,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.20817526367502337,
899
+ "score_ci_high": 0.29958193407273404,
900
+ "sacrebleu_ci_low": 0.20817526367502337,
901
+ "sacrebleu_ci_high": 0.29958193407273404
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 1254,
907
+ 784,
908
+ 532,
909
+ 376
910
  ],
911
  "totals": [
912
+ 1815,
913
+ 1749,
914
+ 1683,
915
+ 1617
916
  ],
917
  "precisions": [
918
+ 0.6909090909090909,
919
+ 0.4482561463693539,
920
+ 0.31610219845513965,
921
+ 0.23252937538651824
922
  ],
923
+ "bp": 0.98904120617152,
924
+ "sys_len": 1815,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.38417359468716306,
927
+ "score": 0.38417359468716306,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.32789485731980855,
930
+ "score_ci_high": 0.4187305224203214,
931
+ "sacrebleu_ci_low": 0.32789485731980855,
932
+ "sacrebleu_ci_high": 0.4187305224203214
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1572,
938
+ 1194,
939
+ 949,
940
+ 766
941
  ],
942
  "totals": [
943
+ 2097,
944
+ 2031,
945
+ 1965,
946
+ 1899
947
  ],
948
  "precisions": [
949
+ 0.7496423462088698,
950
+ 0.5878877400295421,
951
+ 0.48295165394402034,
952
+ 0.4033701948393892
953
  ],
954
  "bp": 1.0,
955
+ "sys_len": 2097,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.5413012055320727,
958
+ "score": 0.5413012055320727,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.5011833767453445,
961
+ "score_ci_high": 0.591093351022506,
962
+ "sacrebleu_ci_low": 0.5011833767453445,
963
+ "sacrebleu_ci_high": 0.591093351022506
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 1382,
969
+ 762,
970
+ 450,
971
+ 277
972
  ],
973
  "totals": [
974
+ 2304,
975
+ 2238,
976
+ 2172,
977
+ 2106
978
  ],
979
  "precisions": [
980
+ 0.5998263888888888,
981
+ 0.34048257372654156,
982
+ 0.20718232044198895,
983
+ 0.1315289648622982
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 2304,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.27313266242858875,
989
+ "score": 0.27313266242858875,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.24101006670532885,
992
+ "score_ci_high": 0.2985709120047681,
993
+ "sacrebleu_ci_low": 0.24101006670532885,
994
+ "sacrebleu_ci_high": 0.2985709120047681
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1454,
1000
+ 1049,
1001
+ 810,
1002
+ 633
1003
  ],
1004
  "totals": [
1005
+ 2019,
1006
+ 1953,
1007
+ 1887,
1008
+ 1821
1009
  ],
1010
  "precisions": [
1011
+ 0.7201584943041109,
1012
+ 0.5371223758320532,
1013
+ 0.4292527821939586,
1014
+ 0.3476112026359144
1015
  ],
1016
  "bp": 1.0,
1017
+ "sys_len": 2019,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.49014779569163686,
1020
+ "score": 0.49014779569163686,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.447345907278528,
1023
+ "score_ci_high": 0.5368115765817915,
1024
+ "sacrebleu_ci_low": 0.447345907278528,
1025
+ "sacrebleu_ci_high": 0.5368115765817915
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 1390,
1031
+ 967,
1032
+ 688,
1033
+ 489
1034
  ],
1035
  "totals": [
1036
+ 1962,
1037
+ 1896,
1038
+ 1830,
1039
+ 1764
1040
  ],
1041
  "precisions": [
1042
+ 0.7084607543323139,
1043
+ 0.5100210970464135,
1044
+ 0.37595628415300547,
1045
+ 0.27721088435374147
1046
  ],
1047
  "bp": 1.0,
1048
+ "sys_len": 1962,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.4405172214713006,
1051
+ "score": 0.4405172214713006,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.40615604089781865,
1054
+ "score_ci_high": 0.516381897740174,
1055
+ "sacrebleu_ci_low": 0.40615604089781865,
1056
+ "sacrebleu_ci_high": 0.516381897740174
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1287,
1062
+ 732,
1063
+ 440,
1064
+ 265
1065
  ],
1066
  "totals": [
1067
+ 2008,
1068
+ 1942,
1069
+ 1876,
1070
+ 1810
1071
  ],
1072
  "precisions": [
1073
+ 0.6409362549800797,
1074
+ 0.3769309989701339,
1075
+ 0.2345415778251599,
1076
+ 0.1464088397790055
1077
  ],
1078
+ "bp": 0.956168891168866,
1079
+ "sys_len": 2008,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.28856959420154726,
1082
+ "score": 0.28856959420154726,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.2645022427610819,
1085
+ "score_ci_high": 0.3240217870629309,
1086
+ "sacrebleu_ci_low": 0.2645022427610819,
1087
+ "sacrebleu_ci_high": 0.3240217870629309
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1361,
1093
+ 951,
1094
+ 694,
1095
+ 510
1096
  ],
1097
  "totals": [
1098
+ 2543,
1099
+ 2477,
1100
+ 2411,
1101
+ 2345
1102
  ],
1103
  "precisions": [
1104
+ 0.5351946519858435,
1105
+ 0.3839321760193783,
1106
+ 0.2878473662380755,
1107
+ 0.21748400852878466
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 2543,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.33677431862395624,
1113
+ "score": 0.33677431862395624,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.2922714206661574,
1116
+ "score_ci_high": 0.38804905185639504,
1117
+ "sacrebleu_ci_low": 0.2922714206661574,
1118
+ "sacrebleu_ci_high": 0.38804905185639504
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 1134,
1124
+ 627,
1125
+ 390,
1126
+ 248
1127
  ],
1128
  "totals": [
1129
+ 1999,
1130
+ 1933,
1131
+ 1867,
1132
+ 1801
1133
  ],
1134
  "precisions": [
1135
+ 0.5672836418209105,
1136
+ 0.32436627004655977,
1137
+ 0.2088912694161757,
1138
+ 0.13770127706829538
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 1999,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.2697264590787591,
1144
+ "score": 0.2697264590787591,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.23374708119412382,
1147
+ "score_ci_high": 0.32852575808051926,
1148
+ "sacrebleu_ci_low": 0.23374708119412382,
1149
+ "sacrebleu_ci_high": 0.32852575808051926
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 1102,
1155
+ 602,
1156
+ 369,
1157
+ 241
1158
  ],
1159
  "totals": [
1160
+ 1925,
1161
+ 1859,
1162
+ 1793,
1163
+ 1727
1164
  ],
1165
  "precisions": [
1166
+ 0.5724675324675325,
1167
+ 0.32383001613770845,
1168
+ 0.20580033463469047,
1169
+ 0.13954834973943256
1170
  ],
1171
  "bp": 1.0,
1172
+ "sys_len": 1925,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.27012183165344417,
1175
+ "score": 0.27012183165344417,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.23833827580048095,
1178
+ "score_ci_high": 0.320793499898261,
1179
+ "sacrebleu_ci_low": 0.23833827580048095,
1180
+ "sacrebleu_ci_high": 0.320793499898261
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1392,
1186
+ 1000,
1187
+ 752,
1188
+ 574
1189
  ],
1190
  "totals": [
1191
+ 2564,
1192
+ 2498,
1193
+ 2432,
1194
+ 2366
1195
  ],
1196
  "precisions": [
1197
+ 0.5429017160686428,
1198
+ 0.400320256204964,
1199
+ 0.3092105263157895,
1200
+ 0.242603550295858
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 2564,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.35733047002149876,
1206
+ "score": 0.35733047002149876,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.2995890270688701,
1209
+ "score_ci_high": 0.40598076558127455,
1210
+ "sacrebleu_ci_low": 0.2995890270688701,
1211
+ "sacrebleu_ci_high": 0.40598076558127455
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1376,
1217
+ 983,
1218
+ 722,
1219
+ 538
1220
  ],
1221
  "totals": [
1222
+ 2343,
1223
+ 2277,
1224
+ 2211,
1225
+ 2145
1226
  ],
1227
  "precisions": [
1228
+ 0.5872812633376013,
1229
+ 0.43170838823012736,
1230
+ 0.32654907281772955,
1231
+ 0.25081585081585084
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 2343,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.3796077031006635,
1237
+ "score": 0.3796077031006635,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.3432297777298846,
1240
+ "score_ci_high": 0.4214166231989559,
1241
+ "sacrebleu_ci_low": 0.3432297777298846,
1242
+ "sacrebleu_ci_high": 0.4214166231989559
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1238,
1248
+ 705,
1249
+ 452,
1250
+ 287
1251
  ],
1252
  "totals": [
1253
+ 2444,
1254
+ 2378,
1255
+ 2312,
1256
+ 2246
1257
  ],
1258
  "precisions": [
1259
+ 0.5065466448445172,
1260
+ 0.2964676198486123,
1261
+ 0.19550173010380623,
1262
+ 0.12778272484416742
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 2444,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.24748840965516117,
1268
+ "score": 0.24748840965516117,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.21183776036339763,
1271
+ "score_ci_high": 0.2806697118543708,
1272
+ "sacrebleu_ci_low": 0.21183776036339763,
1273
+ "sacrebleu_ci_high": 0.2806697118543708
1274
  },
1275
+ "score": 0.3504084926765324,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.4277276595742623,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }