Updated results files

Signed-off-by: Jonathan Bnayahu <[email protected]>
- results/bluebench/2025-06-22T14-01-49_evaluation_results.json +1283 -0
- results/bluebench/{2025-06-19T17-18-35_evaluation_results.json → 2025-06-22T15-05-33_evaluation_results.json} +679 -679
- results/bluebench/2025-06-22T17-10-54_evaluation_results.json +1283 -0
- results/bluebench/2025-06-22T19-25-42_evaluation_results.json +1283 -0
- results/bluebench/{2025-06-19T15-57-45_evaluation_results.json → 2025-06-23T02-53-05_evaluation_results.json} +675 -675
- results/bluebench/{2025-06-19T16-09-06_evaluation_results.json → 2025-06-23T03-17-57_evaluation_results.json} +681 -681
- results/bluebench/{2025-06-19T16-21-09_evaluation_results.json → 2025-06-23T04-06-37_evaluation_results.json} +674 -674
- results/bluebench/2025-06-23T04-42-35_evaluation_results.json +1283 -0
- results/bluebench/{2025-06-19T18-10-05_evaluation_results.json → 2025-06-23T05-36-33_evaluation_results.json} +686 -686
- results/bluebench/{2025-06-19T20-10-50_evaluation_results.json → 2025-06-23T06-18-33_evaluation_results.json} +605 -605
- results/bluebench/{2025-06-21T08-38-27_evaluation_results.json → 2025-06-23T08-43-46_evaluation_results.json} +701 -701
- results/bluebench/{2025-06-19T21-59-04_evaluation_results.json → 2025-06-23T09-36-33_evaluation_results.json} +662 -662
- results/bluebench/{2025-06-21T09-36-54_evaluation_results.json → 2025-06-23T14-18-29_evaluation_results.json} +584 -584
- results/bluebench/{2025-06-21T11-34-24_evaluation_results.json → 2025-06-23T15-33-11_evaluation_results.json} +700 -700
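Each new results file records its own invocation under environment_info.command_line_invocation (visible in the added file's diff below). As a reference only, here is a minimal Python sketch for re-running that evaluation; the arguments are copied verbatim from the added 2025-06-22T14-01-49 file, and it assumes `unitxt-evaluate` (unitxt 1.24.0) is on PATH and that watsonx credentials for the cross_provider backend are already configured in your environment.

```python
# Hypothetical repro sketch: re-run the bluebench evaluation whose argv is recorded
# in environment_info.command_line_invocation of the added results file.
# Assumes `unitxt-evaluate` is installed and provider credentials are set up.
import subprocess

argv = [
    "unitxt-evaluate",
    "--tasks", "benchmarks.bluebench",
    "--model", "cross_provider",
    "--model_args", "model_name=watsonx/meta-llama/llama-3-2-11b-vision-instruct,max_tokens=256",
    "--output_path", "./results/bluebench",
    "--log_samples",
    "--trust_remote_code",
    "--batch_size", "8",
    "--verbosity", "ERROR",
]

# Writes a timestamped *_evaluation_results.json into ./results/bluebench
subprocess.run(argv, check=True)
```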
results/bluebench/2025-06-22T14-01-49_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
1 |
+
{
|
2 |
+
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-22T18:01:46.346556Z",
|
4 |
+
"command_line_invocation": [
|
5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
+
"--tasks",
|
7 |
+
"benchmarks.bluebench",
|
8 |
+
"--model",
|
9 |
+
"cross_provider",
|
10 |
+
"--model_args",
|
11 |
+
"model_name=watsonx/meta-llama/llama-3-2-11b-vision-instruct,max_tokens=256",
|
12 |
+
"--output_path",
|
13 |
+
"./results/bluebench",
|
14 |
+
"--log_samples",
|
15 |
+
"--trust_remote_code",
|
16 |
+
"--batch_size",
|
17 |
+
"8",
|
18 |
+
"--verbosity",
|
19 |
+
"ERROR"
|
20 |
+
],
|
21 |
+
"parsed_arguments": {
|
22 |
+
"tasks": [
|
23 |
+
"benchmarks.bluebench"
|
24 |
+
],
|
25 |
+
"split": "test",
|
26 |
+
"num_fewshots": null,
|
27 |
+
"limit": null,
|
28 |
+
"batch_size": 8,
|
29 |
+
"model": "watsonx/meta-llama/llama-3-2-11b-vision-instruct",
|
30 |
+
"model_args": {
|
31 |
+
"max_tokens": 256
|
32 |
+
},
|
33 |
+
"gen_kwargs": null,
|
34 |
+
"chat_template_kwargs": null,
|
35 |
+
"output_path": "./results/bluebench",
|
36 |
+
"output_file_prefix": "evaluation_results",
|
37 |
+
"log_samples": true,
|
38 |
+
"verbosity": "ERROR",
|
39 |
+
"apply_chat_template": false,
|
40 |
+
"trust_remote_code": true,
|
41 |
+
"disable_hf_cache": false,
|
42 |
+
"cache_dir": null
|
43 |
+
},
|
44 |
+
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
+
"python_version": "3.10.18",
|
47 |
+
"system": "Linux",
|
48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
49 |
+
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
+
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
63 |
+
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
78 |
+
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
+
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
+
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
+
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
+
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
108 |
+
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
112 |
+
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
+
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
+
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
137 |
+
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
+
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
+
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
+
"jaraco.text": "3.12.1",
|
170 |
+
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"results": {
|
177 |
+
"bias": {
|
178 |
+
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.7444444444444445,
|
180 |
+
"accuracy_ci_low": 0.6555555555555556,
|
181 |
+
"accuracy_ci_high": 0.8333333333333334,
|
182 |
+
"score_name": "accuracy",
|
183 |
+
"score": 0.7444444444444445,
|
184 |
+
"score_ci_high": 0.8333333333333334,
|
185 |
+
"score_ci_low": 0.6555555555555556,
|
186 |
+
"num_of_instances": 90
|
187 |
+
},
|
188 |
+
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.7888888888888889,
|
190 |
+
"accuracy_ci_low": 0.7,
|
191 |
+
"accuracy_ci_high": 0.8666666666666667,
|
192 |
+
"score_name": "accuracy",
|
193 |
+
"score": 0.7888888888888889,
|
194 |
+
"score_ci_high": 0.8666666666666667,
|
195 |
+
"score_ci_low": 0.7,
|
196 |
+
"num_of_instances": 90
|
197 |
+
},
|
198 |
+
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.9111111111111111,
|
200 |
+
"accuracy_ci_low": 0.8444444444444444,
|
201 |
+
"accuracy_ci_high": 0.9555555555555556,
|
202 |
+
"score_name": "accuracy",
|
203 |
+
"score": 0.9111111111111111,
|
204 |
+
"score_ci_high": 0.9555555555555556,
|
205 |
+
"score_ci_low": 0.8444444444444444,
|
206 |
+
"num_of_instances": 90
|
207 |
+
},
|
208 |
+
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.7888888888888889,
|
210 |
+
"accuracy_ci_low": 0.7,
|
211 |
+
"accuracy_ci_high": 0.8666666666666667,
|
212 |
+
"score_name": "accuracy",
|
213 |
+
"score": 0.7888888888888889,
|
214 |
+
"score_ci_high": 0.8666666666666667,
|
215 |
+
"score_ci_low": 0.7,
|
216 |
+
"num_of_instances": 90
|
217 |
+
},
|
218 |
+
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.8111111111111111,
|
220 |
+
"accuracy_ci_low": 0.7222222222222222,
|
221 |
+
"accuracy_ci_high": 0.8888888888888888,
|
222 |
+
"score_name": "accuracy",
|
223 |
+
"score": 0.8111111111111111,
|
224 |
+
"score_ci_high": 0.8888888888888888,
|
225 |
+
"score_ci_low": 0.7222222222222222,
|
226 |
+
"num_of_instances": 90
|
227 |
+
},
|
228 |
+
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.9,
|
230 |
+
"accuracy_ci_low": 0.8222222222222222,
|
231 |
+
"accuracy_ci_high": 0.9555555555555556,
|
232 |
+
"score_name": "accuracy",
|
233 |
+
"score": 0.9,
|
234 |
+
"score_ci_high": 0.9555555555555556,
|
235 |
+
"score_ci_low": 0.8222222222222222,
|
236 |
+
"num_of_instances": 90
|
237 |
+
},
|
238 |
+
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.9666666666666667,
|
240 |
+
"accuracy_ci_low": 0.9111111111111111,
|
241 |
+
"accuracy_ci_high": 0.9888888888888889,
|
242 |
+
"score_name": "accuracy",
|
243 |
+
"score": 0.9666666666666667,
|
244 |
+
"score_ci_high": 0.9888888888888889,
|
245 |
+
"score_ci_low": 0.9111111111111111,
|
246 |
+
"num_of_instances": 90
|
247 |
+
},
|
248 |
+
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.8666666666666667,
|
250 |
+
"accuracy_ci_low": 0.7888888888888889,
|
251 |
+
"accuracy_ci_high": 0.9333333333333333,
|
252 |
+
"score_name": "accuracy",
|
253 |
+
"score": 0.8666666666666667,
|
254 |
+
"score_ci_high": 0.9333333333333333,
|
255 |
+
"score_ci_low": 0.7888888888888889,
|
256 |
+
"num_of_instances": 90
|
257 |
+
},
|
258 |
+
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.9111111111111111,
|
260 |
+
"accuracy_ci_low": 0.8412016500028439,
|
261 |
+
"accuracy_ci_high": 0.9555555555555556,
|
262 |
+
"score_name": "accuracy",
|
263 |
+
"score": 0.9111111111111111,
|
264 |
+
"score_ci_high": 0.9555555555555556,
|
265 |
+
"score_ci_low": 0.8412016500028439,
|
266 |
+
"num_of_instances": 90
|
267 |
+
},
|
268 |
+
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.8444444444444444,
|
270 |
+
"accuracy_ci_low": 0.7666666666666667,
|
271 |
+
"accuracy_ci_high": 0.9111111111111111,
|
272 |
+
"score_name": "accuracy",
|
273 |
+
"score": 0.8444444444444444,
|
274 |
+
"score_ci_high": 0.9111111111111111,
|
275 |
+
"score_ci_low": 0.7666666666666667,
|
276 |
+
"num_of_instances": 90
|
277 |
+
},
|
278 |
+
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.9111111111111111,
|
280 |
+
"accuracy_ci_low": 0.8333333333333334,
|
281 |
+
"accuracy_ci_high": 0.9555555555555556,
|
282 |
+
"score_name": "accuracy",
|
283 |
+
"score": 0.9111111111111111,
|
284 |
+
"score_ci_high": 0.9555555555555556,
|
285 |
+
"score_ci_low": 0.8333333333333334,
|
286 |
+
"num_of_instances": 90
|
287 |
+
},
|
288 |
+
"score": 0.8585858585858586,
|
289 |
+
"score_name": "subsets_mean",
|
290 |
+
"num_of_instances": 990
|
291 |
+
},
|
292 |
+
"chatbot_abilities": {
|
293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
+
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.07721639656816015,
|
296 |
+
"score": 0.07721639656816015,
|
297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
+
},
|
299 |
+
"score": 0.07721639656816015,
|
300 |
+
"score_name": "subsets_mean",
|
301 |
+
"num_of_instances": 500
|
302 |
+
},
|
303 |
+
"entity_extraction": {
|
304 |
+
"universal_ner_en_ewt": {
|
305 |
+
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.48831168831168825,
|
307 |
+
"f1_Organization": 0.35220125786163525,
|
308 |
+
"f1_Location": 0.3775100401606426,
|
309 |
+
"f1_macro": 0.406007662111322,
|
310 |
+
"recall_macro": 0.3667818453974414,
|
311 |
+
"precision_macro": 0.4584981753989352,
|
312 |
+
"in_classes_support": 0.7834862385321101,
|
313 |
+
"f1_micro": 0.3682242990654206,
|
314 |
+
"recall_micro": 0.37523809523809526,
|
315 |
+
"precision_micro": 0.3614678899082569,
|
316 |
+
"score": 0.3682242990654206,
|
317 |
+
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.32532095178317566,
|
319 |
+
"score_ci_high": 0.4180775144242145,
|
320 |
+
"f1_micro_ci_low": 0.32532095178317566,
|
321 |
+
"f1_micro_ci_high": 0.4180775144242145
|
322 |
+
},
|
323 |
+
"score": 0.3682242990654206,
|
324 |
+
"score_name": "subsets_mean",
|
325 |
+
"num_of_instances": 1000
|
326 |
+
},
|
327 |
+
"knowledge": {
|
328 |
+
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.5633802816901409,
|
330 |
+
"accuracy_ci_low": 0.4507042253521127,
|
331 |
+
"accuracy_ci_high": 0.676056338028169,
|
332 |
+
"score_name": "accuracy",
|
333 |
+
"score": 0.5633802816901409,
|
334 |
+
"score_ci_high": 0.676056338028169,
|
335 |
+
"score_ci_low": 0.4507042253521127,
|
336 |
+
"num_of_instances": 71
|
337 |
+
},
|
338 |
+
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.2535211267605634,
|
340 |
+
"accuracy_ci_low": 0.15492957746478872,
|
341 |
+
"accuracy_ci_high": 0.36619718309859156,
|
342 |
+
"score_name": "accuracy",
|
343 |
+
"score": 0.2535211267605634,
|
344 |
+
"score_ci_high": 0.36619718309859156,
|
345 |
+
"score_ci_low": 0.15492957746478872,
|
346 |
+
"num_of_instances": 71
|
347 |
+
},
|
348 |
+
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.23943661971830985,
|
350 |
+
"accuracy_ci_low": 0.15492957746478872,
|
351 |
+
"accuracy_ci_high": 0.352112676056338,
|
352 |
+
"score_name": "accuracy",
|
353 |
+
"score": 0.23943661971830985,
|
354 |
+
"score_ci_high": 0.352112676056338,
|
355 |
+
"score_ci_low": 0.15492957746478872,
|
356 |
+
"num_of_instances": 71
|
357 |
+
},
|
358 |
+
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.36619718309859156,
|
360 |
+
"accuracy_ci_low": 0.2535211267605634,
|
361 |
+
"accuracy_ci_high": 0.4788732394366197,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.36619718309859156,
|
364 |
+
"score_ci_high": 0.4788732394366197,
|
365 |
+
"score_ci_low": 0.2535211267605634,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.5492957746478874,
|
370 |
+
"accuracy_ci_low": 0.43661971830985913,
|
371 |
+
"accuracy_ci_high": 0.6619718309859155,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.5492957746478874,
|
374 |
+
"score_ci_high": 0.6619718309859155,
|
375 |
+
"score_ci_low": 0.43661971830985913,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.23943661971830985,
|
380 |
+
"accuracy_ci_low": 0.15492957746478872,
|
381 |
+
"accuracy_ci_high": 0.352112676056338,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.23943661971830985,
|
384 |
+
"score_ci_high": 0.352112676056338,
|
385 |
+
"score_ci_low": 0.15492957746478872,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.4788732394366197,
|
390 |
+
"accuracy_ci_low": 0.36619718309859156,
|
391 |
+
"accuracy_ci_high": 0.6056338028169014,
|
392 |
+
"score_name": "accuracy",
|
393 |
+
"score": 0.4788732394366197,
|
394 |
+
"score_ci_high": 0.6056338028169014,
|
395 |
+
"score_ci_low": 0.36619718309859156,
|
396 |
+
"num_of_instances": 71
|
397 |
+
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.5070422535211268,
|
400 |
+
"accuracy_ci_low": 0.39436619718309857,
|
401 |
+
"accuracy_ci_high": 0.6197183098591549,
|
402 |
+
"score_name": "accuracy",
|
403 |
+
"score": 0.5070422535211268,
|
404 |
+
"score_ci_high": 0.6197183098591549,
|
405 |
+
"score_ci_low": 0.39436619718309857,
|
406 |
+
"num_of_instances": 71
|
407 |
+
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.30985915492957744,
|
410 |
+
"accuracy_ci_low": 0.2112676056338028,
|
411 |
+
"accuracy_ci_high": 0.42820969566908634,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.30985915492957744,
|
414 |
+
"score_ci_high": 0.42820969566908634,
|
415 |
+
"score_ci_low": 0.2112676056338028,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.14084507042253522,
|
420 |
+
"accuracy_ci_low": 0.07042253521126761,
|
421 |
+
"accuracy_ci_high": 0.22535211267605634,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.14084507042253522,
|
424 |
+
"score_ci_high": 0.22535211267605634,
|
425 |
+
"score_ci_low": 0.07042253521126761,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
+
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.28169014084507044,
|
430 |
+
"accuracy_ci_low": 0.18309859154929578,
|
431 |
+
"accuracy_ci_high": 0.39436619718309857,
|
432 |
+
"score_name": "accuracy",
|
433 |
+
"score": 0.28169014084507044,
|
434 |
+
"score_ci_high": 0.39436619718309857,
|
435 |
+
"score_ci_low": 0.18309859154929578,
|
436 |
+
"num_of_instances": 71
|
437 |
+
},
|
438 |
+
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.4507042253521127,
|
440 |
+
"accuracy_ci_low": 0.323943661971831,
|
441 |
+
"accuracy_ci_high": 0.5633802816901409,
|
442 |
+
"score_name": "accuracy",
|
443 |
+
"score": 0.4507042253521127,
|
444 |
+
"score_ci_high": 0.5633802816901409,
|
445 |
+
"score_ci_low": 0.323943661971831,
|
446 |
+
"num_of_instances": 71
|
447 |
+
},
|
448 |
+
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.2535211267605634,
|
450 |
+
"accuracy_ci_low": 0.16901408450704225,
|
451 |
+
"accuracy_ci_high": 0.36619718309859156,
|
452 |
+
"score_name": "accuracy",
|
453 |
+
"score": 0.2535211267605634,
|
454 |
+
"score_ci_high": 0.36619718309859156,
|
455 |
+
"score_ci_low": 0.16901408450704225,
|
456 |
+
"num_of_instances": 71
|
457 |
+
},
|
458 |
+
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.5774647887323944,
|
460 |
+
"accuracy_ci_low": 0.4507042253521127,
|
461 |
+
"accuracy_ci_high": 0.6894343225712088,
|
462 |
+
"score_name": "accuracy",
|
463 |
+
"score": 0.5774647887323944,
|
464 |
+
"score_ci_high": 0.6894343225712088,
|
465 |
+
"score_ci_low": 0.4507042253521127,
|
466 |
+
"num_of_instances": 71
|
467 |
+
},
|
468 |
+
"score": 0.3722334004024145,
|
469 |
+
"score_name": "subsets_mean",
|
470 |
+
"num_of_instances": 994
|
471 |
+
},
|
472 |
+
"legal": {
|
473 |
+
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.5662558356676004,
|
475 |
+
"f1_suggestive": 0.4666666666666667,
|
476 |
+
"f1_arbitrary": 0.4444444444444444,
|
477 |
+
"f1_generic": 0.8571428571428571,
|
478 |
+
"f1_fanciful": 0.35714285714285715,
|
479 |
+
"f1_descriptive": 0.7058823529411765,
|
480 |
+
"f1_macro_ci_low": 0.47410052522342583,
|
481 |
+
"f1_macro_ci_high": 0.6713730404881563,
|
482 |
+
"score_name": "f1_micro",
|
483 |
+
"score": 0.5575757575757576,
|
484 |
+
"score_ci_high": 0.6506589298059469,
|
485 |
+
"score_ci_low": 0.4457831325301205,
|
486 |
+
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.5411764705882353,
|
488 |
+
"accuracy_ci_low": 0.43529411764705883,
|
489 |
+
"accuracy_ci_high": 0.6352941176470588,
|
490 |
+
"f1_micro": 0.5575757575757576,
|
491 |
+
"f1_micro_ci_low": 0.4457831325301205,
|
492 |
+
"f1_micro_ci_high": 0.6506589298059469
|
493 |
+
},
|
494 |
+
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.575166034793364,
|
496 |
+
"f1_no": 0.6877470355731226,
|
497 |
+
"f1_yes": 0.46258503401360546,
|
498 |
+
"f1_macro_ci_low": 0.5066495066495067,
|
499 |
+
"f1_macro_ci_high": 0.6496773446094443,
|
500 |
+
"score_name": "f1_micro",
|
501 |
+
"score": 0.605,
|
502 |
+
"score_ci_high": 0.67,
|
503 |
+
"score_ci_low": 0.535,
|
504 |
+
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.605,
|
506 |
+
"accuracy_ci_low": 0.535,
|
507 |
+
"accuracy_ci_high": 0.67,
|
508 |
+
"f1_micro": 0.605,
|
509 |
+
"f1_micro_ci_low": 0.535,
|
510 |
+
"f1_micro_ci_high": 0.67
|
511 |
+
},
|
512 |
+
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.3240051765352555,
|
514 |
+
"f1_conclusion": 0.0975609756097561,
|
515 |
+
"f1_analysis": 0.509090909090909,
|
516 |
+
"f1_decree": 0.34285714285714286,
|
517 |
+
"f1_issue": 0.22641509433962265,
|
518 |
+
"f1_procedural history": 0.29850746268656714,
|
519 |
+
"f1_facts": 0.4186046511627907,
|
520 |
+
"f1_rule": 0.375,
|
521 |
+
"f1_macro_ci_low": 0.2737291340244584,
|
522 |
+
"f1_macro_ci_high": 0.39709087675818633,
|
523 |
+
"score_name": "f1_micro",
|
524 |
+
"score": 0.3526448362720403,
|
525 |
+
"score_ci_high": 0.42317380352644834,
|
526 |
+
"score_ci_low": 0.29292929292929293,
|
527 |
+
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.35,
|
529 |
+
"accuracy_ci_low": 0.29,
|
530 |
+
"accuracy_ci_high": 0.42,
|
531 |
+
"f1_micro": 0.3526448362720403,
|
532 |
+
"f1_micro_ci_low": 0.29292929292929293,
|
533 |
+
"f1_micro_ci_high": 0.42317380352644834
|
534 |
+
},
|
535 |
+
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.5769837972579975,
|
537 |
+
"f1_yes": 0.45517241379310347,
|
538 |
+
"f1_no": 0.6987951807228916,
|
539 |
+
"f1_macro_ci_low": 0.5127178863190986,
|
540 |
+
"f1_macro_ci_high": 0.6553872211311121,
|
541 |
+
"score_name": "f1_micro",
|
542 |
+
"score": 0.6091370558375635,
|
543 |
+
"score_ci_high": 0.6785772255666204,
|
544 |
+
"score_ci_low": 0.5449871465295629,
|
545 |
+
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.6,
|
547 |
+
"accuracy_ci_low": 0.535,
|
548 |
+
"accuracy_ci_high": 0.67,
|
549 |
+
"f1_micro": 0.6091370558375635,
|
550 |
+
"f1_micro_ci_low": 0.5449871465295629,
|
551 |
+
"f1_micro_ci_high": 0.6785772255666204
|
552 |
+
},
|
553 |
+
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.9404761904761905,
|
555 |
+
"f1_yes": 0.9523809523809523,
|
556 |
+
"f1_no": 0.9285714285714286,
|
557 |
+
"f1_macro_ci_low": 0.8717038360531253,
|
558 |
+
"f1_macro_ci_high": 0.9763503609021853,
|
559 |
+
"score_name": "f1_micro",
|
560 |
+
"score": 0.9404761904761905,
|
561 |
+
"score_ci_high": 0.9764705882352941,
|
562 |
+
"score_ci_low": 0.8724795930656631,
|
563 |
+
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.9294117647058824,
|
565 |
+
"accuracy_ci_low": 0.8470588235294118,
|
566 |
+
"accuracy_ci_high": 0.9764705882352941,
|
567 |
+
"f1_micro": 0.9404761904761905,
|
568 |
+
"f1_micro_ci_low": 0.8724795930656631,
|
569 |
+
"f1_micro_ci_high": 0.9764705882352941
|
570 |
+
},
|
571 |
+
"score": 0.6129667680323103,
|
572 |
+
"score_name": "subsets_mean",
|
573 |
+
"num_of_instances": 770
|
574 |
+
},
|
575 |
+
"news_classification": {
|
576 |
+
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.5153316959213506,
|
578 |
+
"f1_cars": 0.7317073170731707,
|
579 |
+
"f1_windows x": 0.08450704225352113,
|
580 |
+
"f1_computer graphics": 0.4948453608247423,
|
581 |
+
"f1_atheism": 0.2978723404255319,
|
582 |
+
"f1_religion": 0.05263157894736842,
|
583 |
+
"f1_medicine": 0.7733333333333333,
|
584 |
+
"f1_christianity": 0.5806451612903226,
|
585 |
+
"f1_microsoft windows": 0.4507042253521127,
|
586 |
+
"f1_middle east": 0.32727272727272727,
|
587 |
+
"f1_politics": 0.4132231404958678,
|
588 |
+
"f1_motorcycles": 0.7058823529411765,
|
589 |
+
"f1_pc hardware": 0.48520710059171596,
|
590 |
+
"f1_mac hardware": 0.5057471264367817,
|
591 |
+
"f1_electronics": 0.48739495798319327,
|
592 |
+
"f1_for sale": 0.5,
|
593 |
+
"f1_guns": 0.28125,
|
594 |
+
"f1_space": 0.7659574468085106,
|
595 |
+
"f1_cryptography": 0.6,
|
596 |
+
"f1_baseball": 0.8813559322033898,
|
597 |
+
"f1_hockey": 0.8870967741935484,
|
598 |
+
"f1_macro_ci_low": 0.4890786960094656,
|
599 |
+
"f1_macro_ci_high": 0.5464781246183315,
|
600 |
+
"score_name": "f1_micro",
|
601 |
+
"score": 0.5437325905292479,
|
602 |
+
"score_ci_high": 0.5741315636296753,
|
603 |
+
"score_ci_low": 0.5090753018614114,
|
604 |
+
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.488,
|
606 |
+
"accuracy_ci_low": 0.454,
|
607 |
+
"accuracy_ci_high": 0.519,
|
608 |
+
"f1_micro": 0.5437325905292479,
|
609 |
+
"f1_micro_ci_low": 0.5090753018614114,
|
610 |
+
"f1_micro_ci_high": 0.5741315636296753
|
611 |
+
},
|
612 |
+
"score": 0.5437325905292479,
|
613 |
+
"score_name": "subsets_mean",
|
614 |
+
"num_of_instances": 1000
|
615 |
+
},
|
616 |
+
"product_help": {
|
617 |
+
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.6685227589041403,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9225251076040172,
|
620 |
+
"f1_checking or savings account": 0.5806451612903226,
|
621 |
+
"f1_debt collection": 0.5274725274725275,
|
622 |
+
"f1_credit card or prepaid card": 0.6371681415929203,
|
623 |
+
"f1_mortgage": 0.8059701492537313,
|
624 |
+
"f1_student loan": 0.8571428571428571,
|
625 |
+
"f1_money transfer or virtual currency or money service": 0.6181818181818182,
|
626 |
+
"f1_vehicle loan or lease": 0.6060606060606061,
|
627 |
+
"f1_payday loan or title loan or personal loan": 0.46153846153846156,
|
628 |
+
"f1_macro_ci_low": 0.6111841538128283,
|
629 |
+
"f1_macro_ci_high": 0.7335266591830523,
|
630 |
+
"score_name": "f1_micro",
|
631 |
+
"score": 0.8321536905965622,
|
632 |
+
"score_ci_high": 0.85326682230999,
|
633 |
+
"score_ci_low": 0.80760586975502,
|
634 |
+
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.823,
|
636 |
+
"accuracy_ci_low": 0.7962032615906698,
|
637 |
+
"accuracy_ci_high": 0.8449169646606582,
|
638 |
+
"f1_micro": 0.8321536905965622,
|
639 |
+
"f1_micro_ci_low": 0.80760586975502,
|
640 |
+
"f1_micro_ci_high": 0.85326682230999
|
641 |
+
},
|
642 |
+
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.6712830729866164,
|
644 |
+
"f1_mortgages and loans": 0.7439024390243902,
|
645 |
+
"f1_credit card": 0.7777777777777778,
|
646 |
+
"f1_debt collection": 0.6571428571428571,
|
647 |
+
"f1_credit reporting": 0.7817589576547231,
|
648 |
+
"f1_retail banking": 0.3958333333333333,
|
649 |
+
"f1_macro_ci_low": 0.6287200378375363,
|
650 |
+
"f1_macro_ci_high": 0.7180257299728254,
|
651 |
+
"score_name": "f1_micro",
|
652 |
+
"score": 0.7097435897435898,
|
653 |
+
"score_ci_high": 0.7484617342104366,
|
654 |
+
"score_ci_low": 0.6680812073559,
|
655 |
+
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.692,
|
657 |
+
"accuracy_ci_low": 0.648,
|
658 |
+
"accuracy_ci_high": 0.732,
|
659 |
+
"f1_micro": 0.7097435897435898,
|
660 |
+
"f1_micro_ci_low": 0.6680812073559,
|
661 |
+
"f1_micro_ci_high": 0.7484617342104366
|
662 |
+
},
|
663 |
+
"score": 0.770948640170076,
|
664 |
+
"score_name": "subsets_mean",
|
665 |
+
"num_of_instances": 1500
|
666 |
+
},
|
667 |
+
"qa_finance": {
|
668 |
+
"fin_qa": {
|
669 |
+
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.063,
|
671 |
+
"score": 0.063,
|
672 |
+
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.053,
|
674 |
+
"program_accuracy_ci_low": 0.049,
|
675 |
+
"program_accuracy_ci_high": 0.07883525503658394,
|
676 |
+
"score_ci_low": 0.049,
|
677 |
+
"score_ci_high": 0.07883525503658394,
|
678 |
+
"execution_accuracy_ci_low": 0.04,
|
679 |
+
"execution_accuracy_ci_high": 0.06776975208467821
|
680 |
+
},
|
681 |
+
"score": 0.063,
|
682 |
+
"score_name": "subsets_mean",
|
683 |
+
"num_of_instances": 1000
|
684 |
+
},
|
685 |
+
"rag_general": {
|
686 |
+
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3102327261456618,
|
688 |
+
"recall": 0.5256916602580562,
|
689 |
+
"f1": 0.3173216770860886,
|
690 |
+
"precision_ci_low": 0.2902550859846034,
|
691 |
+
"precision_ci_high": 0.33175084087869144,
|
692 |
+
"recall_ci_low": 0.5093303819915139,
|
693 |
+
"recall_ci_high": 0.5427896271119333,
|
694 |
+
"f1_ci_low": 0.3004856741878323,
|
695 |
+
"f1_ci_high": 0.3346952161946201,
|
696 |
+
"score_name": "f1",
|
697 |
+
"score": 0.3173216770860886,
|
698 |
+
"score_ci_high": 0.3346952161946201,
|
699 |
+
"score_ci_low": 0.3004856741878323,
|
700 |
+
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.5790909464160602,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6590411880612373,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5406127203504244,
|
704 |
+
"faithfullness_f1_token_overlap": 0.2677069264827017,
|
705 |
+
"faithfullness_recall_token_overlap": 0.2048583040694459,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5374826116531715,
|
707 |
+
"correctness_f1_token_overlap": 0.3173216770860886,
|
708 |
+
"correctness_recall_token_overlap": 0.5256916602580562,
|
709 |
+
"correctness_precision_token_overlap": 0.3102327261456618
|
710 |
+
},
|
711 |
+
"score": 0.3173216770860886,
|
712 |
+
"score_name": "subsets_mean",
|
713 |
+
"num_of_instances": 600
|
714 |
+
},
|
715 |
+
"reasoning": {
|
716 |
+
"hellaswag": {
|
717 |
+
"accuracy": 0.472,
|
718 |
+
"accuracy_ci_low": 0.4377807805792856,
|
719 |
+
"accuracy_ci_high": 0.501,
|
720 |
+
"score_name": "accuracy",
|
721 |
+
"score": 0.472,
|
722 |
+
"score_ci_high": 0.501,
|
723 |
+
"score_ci_low": 0.4377807805792856,
|
724 |
+
"num_of_instances": 1000
|
725 |
+
},
|
726 |
+
"openbook_qa": {
|
727 |
+
"accuracy": 0.79,
|
728 |
+
"accuracy_ci_low": 0.756,
|
729 |
+
"accuracy_ci_high": 0.8278621971546426,
|
730 |
+
"score_name": "accuracy",
|
731 |
+
"score": 0.79,
|
732 |
+
"score_ci_high": 0.8278621971546426,
|
733 |
+
"score_ci_low": 0.756,
|
734 |
+
"num_of_instances": 500
|
735 |
+
},
|
736 |
+
"score": 0.631,
|
737 |
+
"score_name": "subsets_mean",
|
738 |
+
"num_of_instances": 1500
|
739 |
+
},
|
740 |
+
"safety": {
|
741 |
+
"attaq_500": {
|
742 |
+
"severity_critical": 0.0,
|
743 |
+
"severity_high": 0.0,
|
744 |
+
"severity_medium": 2.0,
|
745 |
+
"severity_low": 98.0,
|
746 |
+
"category_harmful_info": 0.828103977861622,
|
747 |
+
"category_deception": 0.8362863158073389,
|
748 |
+
"category_explicit_content": 0.8723303774616333,
|
749 |
+
"category_violence": 0.8265416233608173,
|
750 |
+
"category_discrimination": 0.7366517979710822,
|
751 |
+
"category_substance_abuse": 0.8473929514487585,
|
752 |
+
"category_pii": 0.8314695911550963,
|
753 |
+
"safety": 0.8279858794021938,
|
754 |
+
"safety_ci_low": 0.8061898558946505,
|
755 |
+
"safety_ci_high": 0.84660034746746,
|
756 |
+
"score_name": "safety",
|
757 |
+
"score": 0.8279858794021938,
|
758 |
+
"score_ci_high": 0.84660034746746,
|
759 |
+
"score_ci_low": 0.8061898558946505,
|
760 |
+
"num_of_instances": 100
|
761 |
+
},
|
762 |
+
"score": 0.8279858794021938,
|
763 |
+
"score_name": "subsets_mean",
|
764 |
+
"num_of_instances": 100
|
765 |
+
},
|
766 |
+
"summarization": {
|
767 |
+
"billsum_document_filtered_to_6000_chars": {
|
768 |
+
"num_of_instances": 528,
|
769 |
+
"rouge1": 0.42870762951811686,
|
770 |
+
"rouge2": 0.22634718223322206,
|
771 |
+
"rougeL": 0.3064227533266295,
|
772 |
+
"score": 0.3064227533266295,
|
773 |
+
"score_name": "rougeL",
|
774 |
+
"rougeLsum": 0.37414452263718584,
|
775 |
+
"rouge1_ci_low": 0.41813186584018475,
|
776 |
+
"rouge1_ci_high": 0.4383682523831221,
|
777 |
+
"rouge2_ci_low": 0.2179836143855743,
|
778 |
+
"rouge2_ci_high": 0.234852562715,
|
779 |
+
"rougeL_ci_low": 0.29790065420910344,
|
780 |
+
"rougeL_ci_high": 0.3146437618343804,
|
781 |
+
"score_ci_low": 0.29790065420910344,
|
782 |
+
"score_ci_high": 0.3146437618343804,
|
783 |
+
"rougeLsum_ci_low": 0.36454056998527534,
|
784 |
+
"rougeLsum_ci_high": 0.3838301065902944
|
785 |
+
},
|
786 |
+
"tldr_document_filtered_to_6000_chars": {
|
787 |
+
"num_of_instances": 1000,
|
788 |
+
"rouge1": 0.12785522529780569,
|
789 |
+
"rouge2": 0.018132164508293067,
|
790 |
+
"rougeL": 0.09085147406577235,
|
791 |
+
"score": 0.09085147406577235,
|
792 |
+
"score_name": "rougeL",
|
793 |
+
"rougeLsum": 0.10491828744788975,
|
794 |
+
"rouge1_ci_low": 0.12179050419663484,
|
795 |
+
"rouge1_ci_high": 0.13318684044580203,
|
796 |
+
"rouge2_ci_low": 0.016258834518891666,
|
797 |
+
"rouge2_ci_high": 0.02026468013917415,
|
798 |
+
"rougeL_ci_low": 0.08692929955144628,
|
799 |
+
"rougeL_ci_high": 0.0946230347296095,
|
800 |
+
"score_ci_low": 0.08692929955144628,
|
801 |
+
"score_ci_high": 0.0946230347296095,
|
802 |
+
"rougeLsum_ci_low": 0.10019902672341267,
|
803 |
+
"rougeLsum_ci_high": 0.10933176458351264
|
804 |
+
},
|
805 |
+
"score": 0.1986371136962009,
|
806 |
+
"score_name": "subsets_mean",
|
807 |
+
"num_of_instances": 1528
|
808 |
+
},
|
809 |
+
"translation": {
|
810 |
+
"mt_flores_101_ara_eng": {
|
811 |
+
"num_of_instances": 66,
|
812 |
+
"counts": [
|
813 |
+
1196,
|
814 |
+
710,
|
815 |
+
465,
|
816 |
+
324
|
817 |
+
],
|
818 |
+
"totals": [
|
819 |
+
1814,
|
820 |
+
1748,
|
821 |
+
1682,
|
822 |
+
1616
|
823 |
+
],
|
824 |
+
"precisions": [
|
825 |
+
0.659316427783903,
|
826 |
+
0.40617848970251713,
|
827 |
+
0.27645659928656363,
|
828 |
+
0.20049504950495048
|
829 |
+
],
|
830 |
+
"bp": 1.0,
|
831 |
+
"sys_len": 1814,
|
832 |
+
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.3490481641487808,
|
834 |
+
"score": 0.3490481641487808,
|
835 |
+
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.2982501441000675,
|
837 |
+
"score_ci_high": 0.39380586753445035,
|
838 |
+
"sacrebleu_ci_low": 0.2982501441000675,
|
839 |
+
"sacrebleu_ci_high": 0.39380586753445035
|
840 |
+
},
|
841 |
+
"mt_flores_101_deu_eng": {
|
842 |
+
"num_of_instances": 66,
|
843 |
+
"counts": [
|
844 |
+
1266,
|
845 |
+
804,
|
846 |
+
543,
|
847 |
+
375
|
848 |
+
],
|
849 |
+
"totals": [
|
850 |
+
1788,
|
851 |
+
1722,
|
852 |
+
1656,
|
853 |
+
1590
|
854 |
+
],
|
855 |
+
"precisions": [
|
856 |
+
0.7080536912751678,
|
857 |
+
0.46689895470383275,
|
858 |
+
0.32789855072463764,
|
859 |
+
0.2358490566037736
|
860 |
+
],
|
861 |
+
"bp": 1.0,
|
862 |
+
"sys_len": 1788,
|
863 |
+
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.39986710952008375,
|
865 |
+
"score": 0.39986710952008375,
|
866 |
+
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.3581256368637932,
|
868 |
+
"score_ci_high": 0.44700899058600674,
|
869 |
+
"sacrebleu_ci_low": 0.3581256368637932,
|
870 |
+
"sacrebleu_ci_high": 0.44700899058600674
|
871 |
+
},
|
872 |
+
"mt_flores_101_eng_ara": {
|
873 |
+
"num_of_instances": 66,
|
874 |
+
"counts": [
|
875 |
+
809,
|
876 |
+
376,
|
877 |
+
189,
|
878 |
+
90
|
879 |
+
],
|
880 |
+
"totals": [
|
881 |
+
1642,
|
882 |
+
1576,
|
883 |
+
1510,
|
884 |
+
1444
|
885 |
+
],
|
886 |
+
"precisions": [
|
887 |
+
0.4926918392204629,
|
888 |
+
0.23857868020304568,
|
889 |
+
0.1251655629139073,
|
890 |
+
0.062326869806094184
|
891 |
+
],
|
892 |
+
"bp": 1.0,
|
893 |
+
"sys_len": 1642,
|
894 |
+
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.17401704653688835,
|
896 |
+
"score": 0.17401704653688835,
|
897 |
+
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.1499482262533421,
|
899 |
+
"score_ci_high": 0.19937003139575787,
|
900 |
+
"sacrebleu_ci_low": 0.1499482262533421,
|
901 |
+
"sacrebleu_ci_high": 0.19937003139575787
|
902 |
+
},
|
903 |
+
"mt_flores_101_eng_deu": {
|
904 |
+
"num_of_instances": 66,
|
905 |
+
"counts": [
|
906 |
+
1142,
|
907 |
+
633,
|
908 |
+
396,
|
909 |
+
251
|
910 |
+
],
|
911 |
+
"totals": [
|
912 |
+
1860,
|
913 |
+
1794,
|
914 |
+
1728,
|
915 |
+
1662
|
916 |
+
],
|
917 |
+
"precisions": [
|
918 |
+
0.613978494623656,
|
919 |
+
0.3528428093645485,
|
920 |
+
0.22916666666666669,
|
921 |
+
0.1510228640192539
|
922 |
+
],
|
923 |
+
"bp": 1.0,
|
924 |
+
"sys_len": 1860,
|
925 |
+
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.29426061967472056,
|
927 |
+
"score": 0.29426061967472056,
|
928 |
+
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.24153648652872883,
|
930 |
+
"score_ci_high": 0.3377937358140578,
|
931 |
+
"sacrebleu_ci_low": 0.24153648652872883,
|
932 |
+
"sacrebleu_ci_high": 0.3377937358140578
|
933 |
+
},
|
934 |
+
"mt_flores_101_eng_fra": {
|
935 |
+
"num_of_instances": 66,
|
936 |
+
"counts": [
|
937 |
+
1456,
|
938 |
+
1023,
|
939 |
+
777,
|
940 |
+
595
|
941 |
+
],
|
942 |
+
"totals": [
|
943 |
+
2053,
|
944 |
+
1987,
|
945 |
+
1921,
|
946 |
+
1855
|
947 |
+
],
|
948 |
+
"precisions": [
|
949 |
+
0.7092060399415491,
|
950 |
+
0.5148465022647207,
|
951 |
+
0.4044768349817803,
|
952 |
+
0.32075471698113206
|
953 |
+
],
|
954 |
+
"bp": 0.9927202458072129,
|
955 |
+
"sys_len": 2053,
|
956 |
+
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.46313340131929615,
|
958 |
+
"score": 0.46313340131929615,
|
959 |
+
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.4233841077081067,
|
961 |
+
"score_ci_high": 0.5119360540835911,
|
962 |
+
"sacrebleu_ci_low": 0.4233841077081067,
|
963 |
+
"sacrebleu_ci_high": 0.5119360540835911
|
964 |
+
},
|
965 |
+
"mt_flores_101_eng_kor": {
|
966 |
+
"num_of_instances": 66,
|
967 |
+
"counts": [
|
968 |
+
1292,
|
969 |
+
644,
|
970 |
+
363,
|
971 |
+
211
|
972 |
+
],
|
973 |
+
"totals": [
|
974 |
+
2519,
|
975 |
+
2453,
|
976 |
+
2387,
|
977 |
+
2321
|
978 |
+
],
|
979 |
+
"precisions": [
|
980 |
+
0.5129019452163557,
|
981 |
+
0.26253567060741945,
|
982 |
+
0.15207373271889402,
|
983 |
+
0.09090909090909091
|
984 |
+
],
|
985 |
+
"bp": 1.0,
|
986 |
+
"sys_len": 2519,
|
987 |
+
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.2077165240938849,
|
989 |
+
"score": 0.2077165240938849,
|
990 |
+
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.1859830684811085,
|
992 |
+
"score_ci_high": 0.23202144404185795,
|
993 |
+
"sacrebleu_ci_low": 0.1859830684811085,
|
994 |
+
"sacrebleu_ci_high": 0.23202144404185795
|
995 |
+
},
|
996 |
+
"mt_flores_101_eng_por": {
|
997 |
+
"num_of_instances": 66,
|
998 |
+
"counts": [
|
999 |
+
1391,
|
1000 |
+
964,
|
1001 |
+
706,
|
1002 |
+
526
|
1003 |
+
],
|
1004 |
+
"totals": [
|
1005 |
+
1932,
|
1006 |
+
1866,
|
1007 |
+
1800,
|
1008 |
+
1734
|
1009 |
+
],
|
1010 |
+
"precisions": [
|
1011 |
+
0.7199792960662527,
|
1012 |
+
0.5166130760986066,
|
1013 |
+
0.3922222222222222,
|
1014 |
+
0.3033448673587082
|
1015 |
+
],
|
1016 |
+
"bp": 1.0,
|
1017 |
+
"sys_len": 1932,
|
1018 |
+
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.4586575663502692,
|
1020 |
+
"score": 0.4586575663502692,
|
1021 |
+
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.4004417903982224,
|
1023 |
+
"score_ci_high": 0.4989599670645679,
|
1024 |
+
"sacrebleu_ci_low": 0.4004417903982224,
|
1025 |
+
"sacrebleu_ci_high": 0.4989599670645679
|
1026 |
+
},
|
1027 |
+
"mt_flores_101_eng_ron": {
|
1028 |
+
"num_of_instances": 66,
|
1029 |
+
"counts": [
|
1030 |
+
1307,
|
1031 |
+
878,
|
1032 |
+
615,
|
1033 |
+
449
|
1034 |
+
],
|
1035 |
+
"totals": [
|
1036 |
+
1965,
|
1037 |
+
1899,
|
1038 |
+
1833,
|
1039 |
+
1767
|
1040 |
+
],
|
1041 |
+
"precisions": [
|
1042 |
+
0.6651399491094148,
|
1043 |
+
0.4623486045286993,
|
1044 |
+
0.3355155482815057,
|
1045 |
+
0.25410299943406905
|
1046 |
+
],
|
1047 |
+
"bp": 1.0,
|
1048 |
+
"sys_len": 1965,
|
1049 |
+
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.4023937777690479,
|
1051 |
+
"score": 0.4023937777690479,
|
1052 |
+
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.36178210560248414,
|
1054 |
+
"score_ci_high": 0.4461521227098032,
|
1055 |
+
"sacrebleu_ci_low": 0.36178210560248414,
|
1056 |
+
"sacrebleu_ci_high": 0.4461521227098032
|
1057 |
+
},
|
1058 |
+
"mt_flores_101_eng_spa": {
|
1059 |
+
"num_of_instances": 66,
|
1060 |
+
"counts": [
|
1061 |
+
1254,
|
1062 |
+
673,
|
1063 |
+
395,
|
1064 |
+
238
|
1065 |
+
],
|
1066 |
+
"totals": [
|
1067 |
+
2011,
|
1068 |
+
1945,
|
1069 |
+
1879,
|
1070 |
+
1813
|
1071 |
+
],
|
1072 |
+
"precisions": [
|
1073 |
+
0.6235703630034809,
|
1074 |
+
0.3460154241645244,
|
1075 |
+
0.21021820117083553,
|
1076 |
+
0.1312741312741313
|
1077 |
+
],
|
1078 |
+
"bp": 0.9576603939644929,
|
1079 |
+
"sys_len": 2011,
|
1080 |
+
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.266022962078398,
|
1082 |
+
"score": 0.266022962078398,
|
1083 |
+
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.23487137512560524,
|
1085 |
+
"score_ci_high": 0.3006336038696202,
|
1086 |
+
"sacrebleu_ci_low": 0.23487137512560524,
|
1087 |
+
"sacrebleu_ci_high": 0.3006336038696202
|
1088 |
+
},
|
1089 |
+
"mt_flores_101_fra_eng": {
|
1090 |
+
"num_of_instances": 66,
|
1091 |
+
"counts": [
|
1092 |
+
1307,
|
1093 |
+
854,
|
1094 |
+
592,
|
1095 |
+
423
|
1096 |
+
],
|
1097 |
+
"totals": [
|
1098 |
+
1835,
|
1099 |
+
1769,
|
1100 |
+
1703,
|
1101 |
+
1637
|
1102 |
+
],
|
1103 |
+
"precisions": [
|
1104 |
+
0.7122615803814714,
|
1105 |
+
0.48275862068965514,
|
1106 |
+
0.3476218438050499,
|
1107 |
+
0.2583995113011607
|
1108 |
+
],
|
1109 |
+
"bp": 1.0,
|
1110 |
+
"sys_len": 1835,
|
1111 |
+
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.419220079381378,
|
1113 |
+
"score": 0.419220079381378,
|
1114 |
+
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.37867823012959856,
|
1116 |
+
"score_ci_high": 0.457201247333676,
|
1117 |
+
"sacrebleu_ci_low": 0.37867823012959856,
|
1118 |
+
"sacrebleu_ci_high": 0.457201247333676
|
1119 |
+
},
|
1120 |
+
"mt_flores_101_jpn_eng": {
|
1121 |
+
"num_of_instances": 66,
|
1122 |
+
"counts": [
|
1123 |
+
1067,
|
1124 |
+
535,
|
1125 |
+
301,
|
1126 |
+
173
|
1127 |
+
],
|
1128 |
+
"totals": [
|
1129 |
+
1828,
|
1130 |
+
1762,
|
1131 |
+
1696,
|
1132 |
+
1630
|
1133 |
+
],
|
1134 |
+
"precisions": [
|
1135 |
+
0.5836980306345734,
|
1136 |
+
0.30363223609534623,
|
1137 |
+
0.17747641509433962,
|
1138 |
+
0.10613496932515337
|
1139 |
+
],
|
1140 |
+
"bp": 1.0,
|
1141 |
+
"sys_len": 1828,
|
1142 |
+
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.24037196462822435,
|
1144 |
+
"score": 0.24037196462822435,
|
1145 |
+
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.20050825204453002,
|
1147 |
+
"score_ci_high": 0.29070324343505133,
|
1148 |
+
"sacrebleu_ci_low": 0.20050825204453002,
|
1149 |
+
"sacrebleu_ci_high": 0.29070324343505133
|
1150 |
+
},
|
1151 |
+
"mt_flores_101_kor_eng": {
|
1152 |
+
"num_of_instances": 66,
|
1153 |
+
"counts": [
|
1154 |
+
1010,
|
1155 |
+
482,
|
1156 |
+
265,
|
1157 |
+
153
|
1158 |
+
],
|
1159 |
+
"totals": [
|
1160 |
+
1770,
|
1161 |
+
1704,
|
1162 |
+
1638,
|
1163 |
+
1572
|
1164 |
+
],
|
1165 |
+
"precisions": [
|
1166 |
+
0.5706214689265537,
|
1167 |
+
0.2828638497652582,
|
1168 |
+
0.16178266178266176,
|
1169 |
+
0.09732824427480916
|
1170 |
+
],
|
1171 |
+
"bp": 1.0,
|
1172 |
+
"sys_len": 1770,
|
1173 |
+
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.22452985981795862,
|
1175 |
+
"score": 0.22452985981795862,
|
1176 |
+
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.19219307877931052,
|
1178 |
+
"score_ci_high": 0.276921223757092,
|
1179 |
+
"sacrebleu_ci_low": 0.19219307877931052,
|
1180 |
+
"sacrebleu_ci_high": 0.276921223757092
|
1181 |
+
},
|
1182 |
+
"mt_flores_101_por_eng": {
|
1183 |
+
"num_of_instances": 66,
|
1184 |
+
"counts": [
|
1185 |
+
1324,
|
1186 |
+
914,
|
1187 |
+
671,
|
1188 |
+
506
|
1189 |
+
],
|
1190 |
+
"totals": [
|
1191 |
+
1810,
|
1192 |
+
1744,
|
1193 |
+
1678,
|
1194 |
+
1612
|
1195 |
+
],
|
1196 |
+
"precisions": [
|
1197 |
+
0.7314917127071823,
|
1198 |
+
0.5240825688073394,
|
1199 |
+
0.39988081048867696,
|
1200 |
+
0.31389578163771714
|
1201 |
+
],
|
1202 |
+
"bp": 1.0,
|
1203 |
+
"sys_len": 1810,
|
1204 |
+
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.4683616120269589,
|
1206 |
+
"score": 0.4683616120269589,
|
1207 |
+
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.40532389997385815,
|
1209 |
+
"score_ci_high": 0.5208761253300637,
|
1210 |
+
"sacrebleu_ci_low": 0.40532389997385815,
|
1211 |
+
"sacrebleu_ci_high": 0.5208761253300637
|
1212 |
+
},
|
1213 |
+
"mt_flores_101_ron_eng": {
|
1214 |
+
"num_of_instances": 66,
|
1215 |
+
"counts": [
|
1216 |
+
1284,
|
1217 |
+
865,
|
1218 |
+
609,
|
1219 |
+
426
|
1220 |
+
],
|
1221 |
+
"totals": [
|
1222 |
+
1812,
|
1223 |
+
1746,
|
1224 |
+
1680,
|
1225 |
+
1614
|
1226 |
+
],
|
1227 |
+
"precisions": [
|
1228 |
+
0.7086092715231789,
|
1229 |
+
0.49541809851088203,
|
1230 |
+
0.3625,
|
1231 |
+
0.26394052044609667
|
1232 |
+
],
|
1233 |
+
"bp": 1.0,
|
1234 |
+
"sys_len": 1812,
|
1235 |
+
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.42810292438852193,
|
1237 |
+
"score": 0.42810292438852193,
|
1238 |
+
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.3881114596567753,
|
1240 |
+
"score_ci_high": 0.4774362643095391,
|
1241 |
+
"sacrebleu_ci_low": 0.3881114596567753,
|
1242 |
+
"sacrebleu_ci_high": 0.4774362643095391
|
1243 |
+
},
|
1244 |
+
"mt_flores_101_spa_eng": {
|
1245 |
+
"num_of_instances": 66,
|
1246 |
+
"counts": [
|
1247 |
+
1177,
|
1248 |
+
630,
|
1249 |
+
374,
|
1250 |
+
229
|
1251 |
+
],
|
1252 |
+
"totals": [
|
1253 |
+
1912,
|
1254 |
+
1846,
|
1255 |
+
1780,
|
1256 |
+
1714
|
1257 |
+
],
|
1258 |
+
"precisions": [
|
1259 |
+
0.6155857740585774,
|
1260 |
+
0.3412784398699892,
|
1261 |
+
0.2101123595505618,
|
1262 |
+
0.13360560093348892
|
1263 |
+
],
|
1264 |
+
"bp": 1.0,
|
1265 |
+
"sys_len": 1912,
|
1266 |
+
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.2771203526997782,
|
1268 |
+
"score": 0.2771203526997782,
|
1269 |
+
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.25170081898114677,
|
1271 |
+
"score_ci_high": 0.31867748756379854,
|
1272 |
+
"sacrebleu_ci_low": 0.25170081898114677,
|
1273 |
+
"sacrebleu_ci_high": 0.31867748756379854
|
1274 |
+
},
|
1275 |
+
"score": 0.33818826429561266,
|
1276 |
+
"score_name": "subsets_mean",
|
1277 |
+
"num_of_instances": 990
|
1278 |
+
},
|
1279 |
+
"score": 0.460003145217968,
|
1280 |
+
"score_name": "subsets_mean",
|
1281 |
+
"num_of_instances": 12472
|
1282 |
+
}
|
1283 |
+
}
|
results/bluebench/{2025-06-19T17-18-35_evaluation_results.json → 2025-06-22T15-05-33_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
-
"timestamp_utc": "2025-06-
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
@@ -42,7 +42,7 @@
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
-
"unitxt_commit_hash": "
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
@@ -176,226 +176,206 @@
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
-
"accuracy": 0.
|
180 |
-
"accuracy_ci_low": 0.
|
181 |
-
"accuracy_ci_high": 0.
|
182 |
"score_name": "accuracy",
|
183 |
-
"score": 0.
|
184 |
-
"score_ci_high": 0.
|
185 |
-
"score_ci_low": 0.
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
-
"accuracy": 0.
|
190 |
-
"accuracy_ci_low": 0.
|
191 |
-
"accuracy_ci_high": 0.
|
192 |
"score_name": "accuracy",
|
193 |
-
"score": 0.
|
194 |
-
"score_ci_high": 0.
|
195 |
-
"score_ci_low": 0.
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
199 |
-
"accuracy": 0.
|
200 |
-
"accuracy_ci_low": 0.
|
201 |
-
"accuracy_ci_high": 0.
|
202 |
"score_name": "accuracy",
|
203 |
-
"score": 0.
|
204 |
-
"score_ci_high": 0.
|
205 |
-
"score_ci_low": 0.
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
-
"accuracy": 0.
|
210 |
"accuracy_ci_low": 0.28888888888888886,
|
211 |
-
"accuracy_ci_high": 0.
|
212 |
"score_name": "accuracy",
|
213 |
-
"score": 0.
|
214 |
-
"score_ci_high": 0.
|
215 |
"score_ci_low": 0.28888888888888886,
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
-
"accuracy": 0.
|
220 |
-
"accuracy_ci_low": 0.
|
221 |
-
"accuracy_ci_high": 0.
|
222 |
"score_name": "accuracy",
|
223 |
-
"score": 0.
|
224 |
-
"score_ci_high": 0.
|
225 |
-
"score_ci_low": 0.
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
-
"accuracy": 0.
|
230 |
-
"accuracy_ci_low": 0.
|
231 |
-
"accuracy_ci_high": 0.
|
232 |
"score_name": "accuracy",
|
233 |
-
"score": 0.
|
234 |
-
"score_ci_high": 0.
|
235 |
-
"score_ci_low": 0.
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
-
"accuracy": 0.
|
240 |
-
"accuracy_ci_low": 0.
|
241 |
-
"accuracy_ci_high": 0.
|
242 |
"score_name": "accuracy",
|
243 |
-
"score": 0.
|
244 |
-
"score_ci_high": 0.
|
245 |
-
"score_ci_low": 0.
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
-
"accuracy": 0.
|
250 |
-
"accuracy_ci_low": 0.
|
251 |
-
"accuracy_ci_high": 0.
|
252 |
"score_name": "accuracy",
|
253 |
-
"score": 0.
|
254 |
-
"score_ci_high": 0.
|
255 |
-
"score_ci_low": 0.
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
-
"accuracy": 0.
|
260 |
-
"accuracy_ci_low": 0.
|
261 |
-
"accuracy_ci_high": 0.
|
262 |
"score_name": "accuracy",
|
263 |
-
"score": 0.
|
264 |
-
"score_ci_high": 0.
|
265 |
-
"score_ci_low": 0.
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
"accuracy": 0.5111111111111111,
|
270 |
"accuracy_ci_low": 0.4111111111111111,
|
271 |
-
"accuracy_ci_high": 0.
|
272 |
"score_name": "accuracy",
|
273 |
"score": 0.5111111111111111,
|
274 |
-
"score_ci_high": 0.
|
275 |
"score_ci_low": 0.4111111111111111,
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
-
"accuracy": 0.
|
280 |
"accuracy_ci_low": 0.3,
|
281 |
-
"accuracy_ci_high": 0.
|
282 |
"score_name": "accuracy",
|
283 |
-
"score": 0.
|
284 |
-
"score_ci_high": 0.
|
285 |
"score_ci_low": 0.3,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
-
"score": 0.
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
296 |
-
"score": 0.
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
-
"score": 0.
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
-
"f1_Person": 0.
|
307 |
-
"f1_Organization": 0.
|
308 |
-
"f1_Location": 0.
|
309 |
-
"f1_macro": 0.
|
310 |
-
"recall_macro": 0.
|
311 |
-
"precision_macro": 0.
|
312 |
-
"in_classes_support": 0.
|
313 |
-
"f1_micro": 0.
|
314 |
-
"recall_micro": 0.
|
315 |
-
"precision_micro": 0.
|
316 |
-
"score": 0.
|
317 |
"score_name": "f1_micro",
|
318 |
-
"score_ci_low": 0.
|
319 |
-
"score_ci_high": 0.
|
320 |
-
"f1_micro_ci_low": 0.
|
321 |
-
"f1_micro_ci_high": 0.
|
322 |
},
|
323 |
-
"score": 0.
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
-
"accuracy": 0.
|
330 |
-
"accuracy_ci_low": 0.
|
331 |
-
"accuracy_ci_high": 0.
|
332 |
"score_name": "accuracy",
|
333 |
-
"score": 0.
|
334 |
-
"score_ci_high": 0.
|
335 |
-
"score_ci_low": 0.
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
-
"accuracy": 0.
|
340 |
-
"accuracy_ci_low": 0.
|
341 |
-
"accuracy_ci_high": 0.
|
342 |
"score_name": "accuracy",
|
343 |
-
"score": 0.
|
344 |
-
"score_ci_high": 0.
|
345 |
-
"score_ci_low": 0.
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
-
"accuracy": 0.
|
350 |
-
"accuracy_ci_low": 0.
|
351 |
-
"accuracy_ci_high": 0.
|
352 |
"score_name": "accuracy",
|
353 |
-
"score": 0.
|
354 |
-
"score_ci_high": 0.
|
355 |
-
"score_ci_low": 0.
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
-
"accuracy": 0.
|
360 |
-
"accuracy_ci_low": 0.
|
361 |
-
"accuracy_ci_high": 0.
|
362 |
"score_name": "accuracy",
|
363 |
-
"score": 0.
|
364 |
-
"score_ci_high": 0.
|
365 |
-
"score_ci_low": 0.
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
-
"accuracy": 0.
|
370 |
-
"accuracy_ci_low": 0.
|
371 |
-
"accuracy_ci_high": 0.
|
372 |
"score_name": "accuracy",
|
373 |
-
"score": 0.
|
374 |
-
"score_ci_high": 0.
|
375 |
-
"score_ci_low": 0.
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
-
"accuracy": 0.15492957746478872,
|
380 |
-
"accuracy_ci_low": 0.08450704225352113,
|
381 |
-
"accuracy_ci_high": 0.2535211267605634,
|
382 |
-
"score_name": "accuracy",
|
383 |
-
"score": 0.15492957746478872,
|
384 |
-
"score_ci_high": 0.2535211267605634,
|
385 |
-
"score_ci_low": 0.08450704225352113,
|
386 |
-
"num_of_instances": 71
|
387 |
-
},
|
388 |
-
"mmlu_pro_health": {
|
389 |
-
"accuracy": 0.16901408450704225,
|
390 |
-
"accuracy_ci_low": 0.09859154929577464,
|
391 |
-
"accuracy_ci_high": 0.2535211267605634,
|
392 |
-
"score_name": "accuracy",
|
393 |
-
"score": 0.16901408450704225,
|
394 |
-
"score_ci_high": 0.2535211267605634,
|
395 |
-
"score_ci_low": 0.09859154929577464,
|
396 |
-
"num_of_instances": 71
|
397 |
-
},
|
398 |
-
"mmlu_pro_history": {
|
399 |
"accuracy": 0.18309859154929578,
|
400 |
"accuracy_ci_low": 0.09859154929577464,
|
401 |
"accuracy_ci_high": 0.28169014084507044,
|
@@ -405,17 +385,17 @@
|
|
405 |
"score_ci_low": 0.09859154929577464,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
-
"
|
409 |
"accuracy": 0.11267605633802817,
|
410 |
"accuracy_ci_low": 0.056338028169014086,
|
411 |
-
"accuracy_ci_high": 0.
|
412 |
"score_name": "accuracy",
|
413 |
"score": 0.11267605633802817,
|
414 |
-
"score_ci_high": 0.
|
415 |
"score_ci_low": 0.056338028169014086,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
-
"
|
419 |
"accuracy": 0.09859154929577464,
|
420 |
"accuracy_ci_low": 0.04225352112676056,
|
421 |
"accuracy_ci_high": 0.18309859154929578,
|
@@ -425,384 +405,404 @@
|
|
425 |
"score_ci_low": 0.04225352112676056,
|
426 |
"num_of_instances": 71
|
427 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
428 |
"mmlu_pro_other": {
|
429 |
-
"accuracy": 0.
|
430 |
-
"accuracy_ci_low": 0.
|
431 |
-
"accuracy_ci_high": 0.
|
432 |
"score_name": "accuracy",
|
433 |
-
"score": 0.
|
434 |
-
"score_ci_high": 0.
|
435 |
-
"score_ci_low": 0.
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
-
"accuracy": 0.
|
440 |
-
"accuracy_ci_low": 0.
|
441 |
-
"accuracy_ci_high": 0.
|
442 |
"score_name": "accuracy",
|
443 |
-
"score": 0.
|
444 |
-
"score_ci_high": 0.
|
445 |
-
"score_ci_low": 0.
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
-
"accuracy": 0.
|
450 |
-
"accuracy_ci_low": 0.
|
451 |
-
"accuracy_ci_high": 0.
|
452 |
"score_name": "accuracy",
|
453 |
-
"score": 0.
|
454 |
-
"score_ci_high": 0.
|
455 |
-
"score_ci_low": 0.
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
"accuracy": 0.38028169014084506,
|
460 |
"accuracy_ci_low": 0.2676056338028169,
|
461 |
-
"accuracy_ci_high": 0.
|
462 |
"score_name": "accuracy",
|
463 |
"score": 0.38028169014084506,
|
464 |
-
"score_ci_high": 0.
|
465 |
"score_ci_low": 0.2676056338028169,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
-
"score": 0.
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
-
"f1_macro": 0.
|
475 |
-
"f1_suggestive": 0.
|
476 |
-
"
|
477 |
-
"
|
478 |
-
"
|
479 |
-
"
|
480 |
-
"f1_macro_ci_low": 0.
|
481 |
-
"f1_macro_ci_high": 0.
|
482 |
"score_name": "f1_micro",
|
483 |
-
"score": 0.
|
484 |
-
"score_ci_high": 0.
|
485 |
-
"score_ci_low": 0.
|
486 |
"num_of_instances": 85,
|
487 |
-
"accuracy": 0.
|
488 |
-
"accuracy_ci_low": 0.
|
489 |
-
"accuracy_ci_high": 0.
|
490 |
-
"f1_micro": 0.
|
491 |
-
"f1_micro_ci_low": 0.
|
492 |
-
"f1_micro_ci_high": 0.
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
-
"f1_macro": 0.
|
496 |
-
"f1_no": 0.
|
497 |
-
"f1_yes": 0.
|
498 |
-
"f1_macro_ci_low": 0.
|
499 |
-
"f1_macro_ci_high": 0.
|
500 |
"score_name": "f1_micro",
|
501 |
-
"score": 0.
|
502 |
-
"score_ci_high": 0.
|
503 |
-
"score_ci_low": 0.
|
504 |
"num_of_instances": 200,
|
505 |
-
"accuracy": 0.
|
506 |
-
"accuracy_ci_low": 0.
|
507 |
-
"accuracy_ci_high": 0.
|
508 |
-
"f1_micro": 0.
|
509 |
-
"f1_micro_ci_low": 0.
|
510 |
-
"f1_micro_ci_high": 0.
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
-
"f1_macro": 0.
|
514 |
-
"f1_conclusion": 0.
|
515 |
-
"f1_analysis": 0.
|
516 |
-
"f1_decree": 0.
|
517 |
-
"
|
518 |
-
"
|
519 |
-
"f1_rule": 0.
|
520 |
-
"f1_procedural history": 0.
|
521 |
-
"f1_macro_ci_low": 0.
|
522 |
-
"f1_macro_ci_high": 0.
|
523 |
"score_name": "f1_micro",
|
524 |
-
"score": 0.
|
525 |
-
"score_ci_high": 0.
|
526 |
-
"score_ci_low": 0.
|
527 |
"num_of_instances": 200,
|
528 |
-
"accuracy": 0.
|
529 |
-
"accuracy_ci_low": 0.
|
530 |
-
"accuracy_ci_high": 0.
|
531 |
-
"f1_micro": 0.
|
532 |
-
"f1_micro_ci_low": 0.
|
533 |
-
"f1_micro_ci_high": 0.
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
-
"f1_macro": 0.
|
537 |
-
"f1_yes": 0.
|
538 |
-
"f1_no": 0.
|
539 |
-
"f1_macro_ci_low": 0.
|
540 |
-
"f1_macro_ci_high": 0.
|
541 |
"score_name": "f1_micro",
|
542 |
-
"score": 0.
|
543 |
-
"score_ci_high": 0.
|
544 |
-
"score_ci_low": 0.
|
545 |
"num_of_instances": 200,
|
546 |
-
"accuracy": 0.
|
547 |
-
"accuracy_ci_low": 0.
|
548 |
-
"accuracy_ci_high": 0.
|
549 |
-
"f1_micro": 0.
|
550 |
-
"f1_micro_ci_low": 0.
|
551 |
-
"f1_micro_ci_high": 0.
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
-
"f1_macro": 0.
|
555 |
-
"f1_yes": 0.
|
556 |
-
"f1_no": 0.
|
557 |
-
"f1_macro_ci_low": 0.
|
558 |
-
"f1_macro_ci_high": 0.
|
559 |
"score_name": "f1_micro",
|
560 |
-
"score": 0.
|
561 |
-
"score_ci_high": 0.
|
562 |
-
"score_ci_low": 0.
|
563 |
"num_of_instances": 85,
|
564 |
-
"accuracy": 0.
|
565 |
-
"accuracy_ci_low": 0.
|
566 |
-
"accuracy_ci_high": 0.
|
567 |
-
"f1_micro": 0.
|
568 |
-
"f1_micro_ci_low": 0.
|
569 |
-
"f1_micro_ci_high": 0.
|
570 |
},
|
571 |
-
"score": 0.
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
-
"f1_macro": 0.
|
578 |
-
"f1_cars": 0.
|
579 |
"f1_windows x": 0.0,
|
580 |
-
"f1_atheism": 0.
|
581 |
-
"f1_religion": 0.
|
582 |
-
"f1_medicine": 0.
|
583 |
-
"f1_hockey": 0.
|
584 |
-
"f1_christianity": 0.
|
585 |
-
"f1_computer graphics": 0.
|
586 |
-
"f1_microsoft windows": 0.
|
587 |
"f1_middle east": 0.125,
|
588 |
-
"f1_motorcycles": 0.
|
|
|
589 |
"f1_mac hardware": 0.0,
|
|
|
|
|
590 |
"f1_for sale": 0.0,
|
591 |
-
"f1_guns": 0.
|
592 |
-
"f1_politics": 0.2361111111111111,
|
593 |
"f1_space": 0.39436619718309857,
|
594 |
-
"f1_pc hardware": 0.
|
595 |
-
"
|
596 |
-
"
|
597 |
-
"
|
598 |
-
"f1_macro_ci_low": 0.20272698040510803,
|
599 |
-
"f1_macro_ci_high": 0.2532565570480989,
|
600 |
"score_name": "f1_micro",
|
601 |
-
"score": 0.
|
602 |
-
"score_ci_high": 0.
|
603 |
-
"score_ci_low": 0.
|
604 |
"num_of_instances": 1000,
|
605 |
-
"accuracy": 0.
|
606 |
-
"accuracy_ci_low": 0.
|
607 |
-
"accuracy_ci_high": 0.
|
608 |
-
"f1_micro": 0.
|
609 |
-
"f1_micro_ci_low": 0.
|
610 |
-
"f1_micro_ci_high": 0.
|
611 |
},
|
612 |
-
"score": 0.
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
-
"f1_macro": 0.
|
619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
620 |
-
"
|
621 |
-
"f1_debt collection": 0.
|
622 |
-
"
|
623 |
-
"
|
|
|
|
|
624 |
"f1_payday loan or title loan or personal loan": 0.0,
|
625 |
-
"
|
626 |
-
"
|
627 |
-
"
|
628 |
-
"f1_macro_ci_low": 0.1842187730862839,
|
629 |
-
"f1_macro_ci_high": 0.27331239167462773,
|
630 |
"score_name": "f1_micro",
|
631 |
-
"score": 0.
|
632 |
-
"score_ci_high": 0.
|
633 |
-
"score_ci_low": 0.
|
634 |
"num_of_instances": 1000,
|
635 |
-
"accuracy": 0.
|
636 |
-
"accuracy_ci_low": 0.
|
637 |
-
"accuracy_ci_high": 0.
|
638 |
-
"f1_micro": 0.
|
639 |
-
"f1_micro_ci_low": 0.
|
640 |
-
"f1_micro_ci_high": 0.
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
-
"f1_macro": 0.
|
644 |
-
"f1_mortgages and loans": 0.
|
645 |
-
"
|
646 |
-
"
|
647 |
-
"f1_credit reporting": 0.
|
648 |
-
"f1_retail banking": 0.
|
649 |
-
"f1_macro_ci_low": 0.
|
650 |
-
"f1_macro_ci_high": 0.
|
651 |
"score_name": "f1_micro",
|
652 |
-
"score": 0.
|
653 |
-
"score_ci_high": 0.
|
654 |
-
"score_ci_low": 0.
|
655 |
"num_of_instances": 500,
|
656 |
-
"accuracy": 0.
|
657 |
-
"accuracy_ci_low": 0.
|
658 |
-
"accuracy_ci_high": 0.
|
659 |
-
"f1_micro": 0.
|
660 |
-
"f1_micro_ci_low": 0.
|
661 |
-
"f1_micro_ci_high": 0.
|
662 |
},
|
663 |
-
"score": 0.
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
-
"
|
671 |
-
"
|
|
|
672 |
"score_name": "program_accuracy",
|
673 |
-
"
|
674 |
-
"
|
675 |
-
"
|
676 |
-
"
|
677 |
-
"
|
678 |
-
"
|
679 |
-
"execution_accuracy_ci_high": 0.033
|
680 |
},
|
681 |
-
"score": 0.
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
-
"precision": 0.
|
688 |
-
"recall": 0.
|
689 |
-
"f1": 0.
|
690 |
-
"precision_ci_low": 0.
|
691 |
-
"precision_ci_high": 0.
|
692 |
-
"recall_ci_low": 0.
|
693 |
-
"recall_ci_high": 0.
|
694 |
-
"f1_ci_low": 0.
|
695 |
-
"f1_ci_high": 0.
|
696 |
"score_name": "f1",
|
697 |
-
"score": 0.
|
698 |
-
"score_ci_high": 0.
|
699 |
-
"score_ci_low": 0.
|
700 |
"num_of_instances": 600,
|
701 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
702 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
703 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
704 |
-
"faithfullness_f1_token_overlap": 0.
|
705 |
-
"faithfullness_recall_token_overlap": 0.
|
706 |
-
"faithfullness_precision_token_overlap": 0.
|
707 |
-
"correctness_f1_token_overlap": 0.
|
708 |
-
"correctness_recall_token_overlap": 0.
|
709 |
-
"correctness_precision_token_overlap": 0.
|
710 |
},
|
711 |
-
"score": 0.
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
-
"accuracy": 0.
|
718 |
-
"accuracy_ci_low": 0.
|
719 |
-
"accuracy_ci_high": 0.
|
720 |
"score_name": "accuracy",
|
721 |
-
"score": 0.
|
722 |
-
"score_ci_high": 0.
|
723 |
-
"score_ci_low": 0.
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
-
"accuracy": 0.
|
728 |
-
"accuracy_ci_low": 0.
|
729 |
-
"accuracy_ci_high": 0.
|
730 |
"score_name": "accuracy",
|
731 |
-
"score": 0.
|
732 |
-
"score_ci_high": 0.
|
733 |
-
"score_ci_low": 0.
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
-
"score": 0.
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
"severity_critical": 0.0,
|
743 |
-
"severity_high":
|
744 |
-
"severity_medium":
|
745 |
"severity_low": 94.0,
|
746 |
-
"category_harmful_info": 0.
|
747 |
-
"category_deception": 0.
|
748 |
-
"category_explicit_content": 0.
|
749 |
-
"category_violence": 0.
|
750 |
-
"category_discrimination": 0.
|
751 |
-
"category_substance_abuse": 0.
|
752 |
-
"category_pii": 0.
|
753 |
-
"safety": 0.
|
754 |
-
"safety_ci_low": 0.
|
755 |
-
"safety_ci_high": 0.
|
756 |
"score_name": "safety",
|
757 |
-
"score": 0.
|
758 |
-
"score_ci_high": 0.
|
759 |
-
"score_ci_low": 0.
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
-
"score": 0.
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
-
"
|
770 |
-
"
|
771 |
-
"rougeL": 0.2833644024269708,
|
772 |
-
"score": 0.2833644024269708,
|
773 |
"score_name": "rougeL",
|
774 |
-
"rougeLsum": 0.
|
775 |
-
"
|
776 |
-
"
|
777 |
-
"
|
778 |
-
"
|
779 |
-
"
|
780 |
-
"
|
781 |
-
"
|
782 |
-
"
|
783 |
-
"
|
784 |
-
"
|
|
|
|
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
-
"
|
789 |
-
"
|
790 |
-
"rougeL": 0.0840670089559512,
|
791 |
-
"score": 0.0840670089559512,
|
792 |
"score_name": "rougeL",
|
793 |
-
"rougeLsum": 0.
|
794 |
-
"
|
795 |
-
"
|
796 |
-
"
|
797 |
-
"
|
798 |
-
"
|
799 |
-
"
|
800 |
-
"
|
801 |
-
"
|
802 |
-
"
|
803 |
-
"
|
|
|
|
|
804 |
},
|
805 |
-
"score": 0.
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
@@ -810,444 +810,444 @@
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
],
|
818 |
"totals": [
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
],
|
824 |
"precisions": [
|
825 |
-
0.
|
826 |
-
0.
|
827 |
-
0.
|
828 |
-
0.
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
-
"sys_len":
|
832 |
"ref_len": 1734,
|
833 |
-
"sacrebleu": 0.
|
834 |
-
"score": 0.
|
835 |
"score_name": "sacrebleu",
|
836 |
-
"score_ci_low": 0.
|
837 |
-
"score_ci_high": 0.
|
838 |
-
"sacrebleu_ci_low": 0.
|
839 |
-
"sacrebleu_ci_high": 0.
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
|
848 |
],
|
849 |
"totals": [
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
],
|
855 |
"precisions": [
|
856 |
-
0.
|
857 |
-
0.
|
858 |
-
0.
|
859 |
-
0.
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
-
"sys_len":
|
863 |
"ref_len": 1734,
|
864 |
-
"sacrebleu": 0.
|
865 |
-
"score": 0.
|
866 |
"score_name": "sacrebleu",
|
867 |
-
"score_ci_low": 0.
|
868 |
-
"score_ci_high": 0.
|
869 |
-
"sacrebleu_ci_low": 0.
|
870 |
-
"sacrebleu_ci_high": 0.
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
485,
|
876 |
-
|
877 |
-
|
878 |
-
|
879 |
],
|
880 |
"totals": [
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
],
|
886 |
"precisions": [
|
887 |
-
0.
|
888 |
-
0.
|
889 |
-
0.
|
890 |
-
0.
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
-
"sys_len":
|
894 |
"ref_len": 1589,
|
895 |
-
"sacrebleu": 0.
|
896 |
-
"score": 0.
|
897 |
"score_name": "sacrebleu",
|
898 |
-
"score_ci_low": 0.
|
899 |
-
"score_ci_high": 0.
|
900 |
-
"sacrebleu_ci_low": 0.
|
901 |
-
"sacrebleu_ci_high": 0.
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
-
|
907 |
-
|
908 |
-
|
909 |
-
|
910 |
],
|
911 |
"totals": [
|
912 |
-
|
913 |
-
|
914 |
-
|
915 |
-
|
916 |
],
|
917 |
"precisions": [
|
918 |
-
0.
|
919 |
-
0.
|
920 |
-
0.
|
921 |
-
0.
|
922 |
],
|
923 |
-
"bp": 0
|
924 |
-
"sys_len":
|
925 |
"ref_len": 1835,
|
926 |
-
"sacrebleu": 0.
|
927 |
-
"score": 0.
|
928 |
"score_name": "sacrebleu",
|
929 |
-
"score_ci_low": 0.
|
930 |
-
"score_ci_high": 0.
|
931 |
-
"sacrebleu_ci_low": 0.
|
932 |
-
"sacrebleu_ci_high": 0.
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
],
|
942 |
"totals": [
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
],
|
948 |
"precisions": [
|
949 |
-
0.
|
950 |
-
0.
|
951 |
-
0.
|
952 |
-
0.
|
953 |
],
|
954 |
-
"bp": 0.
|
955 |
-
"sys_len":
|
956 |
"ref_len": 2068,
|
957 |
-
"sacrebleu": 0.
|
958 |
-
"score": 0.
|
959 |
"score_name": "sacrebleu",
|
960 |
-
"score_ci_low": 0.
|
961 |
-
"score_ci_high": 0.
|
962 |
-
"sacrebleu_ci_low": 0.
|
963 |
-
"sacrebleu_ci_high": 0.
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
-
|
969 |
-
|
970 |
-
|
971 |
-
|
972 |
],
|
973 |
"totals": [
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
],
|
979 |
"precisions": [
|
980 |
-
0.
|
981 |
-
0.
|
982 |
-
0.
|
983 |
-
0.
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
-
"sys_len":
|
987 |
"ref_len": 2235,
|
988 |
-
"sacrebleu": 0.
|
989 |
-
"score": 0.
|
990 |
"score_name": "sacrebleu",
|
991 |
-
"score_ci_low": 0.
|
992 |
-
"score_ci_high": 0.
|
993 |
-
"sacrebleu_ci_low": 0.
|
994 |
-
"sacrebleu_ci_high": 0.
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
-
|
1000 |
-
|
1001 |
-
|
1002 |
-
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
-
|
1006 |
-
|
1007 |
-
|
1008 |
-
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
-
0.
|
1012 |
-
0.
|
1013 |
-
0.
|
1014 |
-
0.
|
1015 |
],
|
1016 |
-
"bp": 0.
|
1017 |
-
"sys_len":
|
1018 |
"ref_len": 1916,
|
1019 |
-
"sacrebleu": 0.
|
1020 |
-
"score": 0.
|
1021 |
"score_name": "sacrebleu",
|
1022 |
-
"score_ci_low": 0.
|
1023 |
-
"score_ci_high": 0.
|
1024 |
-
"sacrebleu_ci_low": 0.
|
1025 |
-
"sacrebleu_ci_high": 0.
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
-
|
1031 |
-
|
1032 |
-
|
1033 |
-
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
-
0.
|
1043 |
-
0.
|
1044 |
-
0.
|
1045 |
-
0.
|
1046 |
],
|
1047 |
-
"bp":
|
1048 |
-
"sys_len":
|
1049 |
"ref_len": 1949,
|
1050 |
-
"sacrebleu": 0.
|
1051 |
-
"score": 0.
|
1052 |
"score_name": "sacrebleu",
|
1053 |
-
"score_ci_low": 0.
|
1054 |
-
"score_ci_high": 0.
|
1055 |
-
"sacrebleu_ci_low": 0.
|
1056 |
-
"sacrebleu_ci_high": 0.
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
1122,
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
-
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
-
0.
|
1074 |
-
0.
|
1075 |
-
0.
|
1076 |
-
0.
|
1077 |
],
|
1078 |
-
"bp": 0.
|
1079 |
-
"sys_len":
|
1080 |
"ref_len": 2098,
|
1081 |
-
"sacrebleu": 0.
|
1082 |
-
"score": 0.
|
1083 |
"score_name": "sacrebleu",
|
1084 |
-
"score_ci_low": 0.
|
1085 |
-
"score_ci_high": 0.
|
1086 |
-
"sacrebleu_ci_low": 0.
|
1087 |
-
"sacrebleu_ci_high": 0.
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
-
|
1093 |
-
|
1094 |
-
|
1095 |
-
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
-
0.
|
1105 |
-
0.
|
1106 |
-
0.
|
1107 |
-
0.
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
-
"sys_len":
|
1111 |
"ref_len": 1734,
|
1112 |
-
"sacrebleu": 0.
|
1113 |
-
"score": 0.
|
1114 |
"score_name": "sacrebleu",
|
1115 |
-
"score_ci_low": 0.
|
1116 |
-
"score_ci_high": 0.
|
1117 |
-
"sacrebleu_ci_low": 0.
|
1118 |
-
"sacrebleu_ci_high": 0.
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
-
|
1124 |
-
|
1125 |
-
|
1126 |
-
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
-
|
1130 |
-
|
1131 |
-
|
1132 |
-
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
-
0.
|
1136 |
-
0.
|
1137 |
-
0.
|
1138 |
-
0.
|
1139 |
],
|
1140 |
-
"bp": 0.
|
1141 |
-
"sys_len":
|
1142 |
"ref_len": 1734,
|
1143 |
-
"sacrebleu": 0.
|
1144 |
-
"score": 0.
|
1145 |
"score_name": "sacrebleu",
|
1146 |
-
"score_ci_low": 0.
|
1147 |
-
"score_ci_high": 0.
|
1148 |
-
"sacrebleu_ci_low": 0.
|
1149 |
-
"sacrebleu_ci_high": 0.
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
-
|
1155 |
-
|
1156 |
-
|
1157 |
-
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
-
|
1161 |
-
|
1162 |
-
|
1163 |
-
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
-
0.
|
1167 |
-
0.
|
1168 |
-
0.
|
1169 |
-
0.
|
1170 |
],
|
1171 |
-
"bp": 0
|
1172 |
-
"sys_len":
|
1173 |
"ref_len": 1734,
|
1174 |
-
"sacrebleu": 0.
|
1175 |
-
"score": 0.
|
1176 |
"score_name": "sacrebleu",
|
1177 |
-
"score_ci_low": 0.
|
1178 |
-
"score_ci_high": 0.
|
1179 |
-
"sacrebleu_ci_low": 0.
|
1180 |
-
"sacrebleu_ci_high": 0.
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
-
|
1186 |
-
|
1187 |
-
|
1188 |
-
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
-
|
1192 |
-
|
1193 |
-
|
1194 |
-
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
-
0.
|
1198 |
-
0.
|
1199 |
-
0.
|
1200 |
-
0.
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
-
"sys_len":
|
1204 |
"ref_len": 1734,
|
1205 |
-
"sacrebleu": 0.
|
1206 |
-
"score": 0.
|
1207 |
"score_name": "sacrebleu",
|
1208 |
-
"score_ci_low": 0.
|
1209 |
-
"score_ci_high": 0.
|
1210 |
-
"sacrebleu_ci_low": 0.
|
1211 |
-
"sacrebleu_ci_high": 0.
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
-
|
1217 |
-
|
1218 |
-
|
1219 |
-
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
-
0.
|
1229 |
-
0.
|
1230 |
-
0.
|
1231 |
-
0.
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
-
"sys_len":
|
1235 |
"ref_len": 1734,
|
1236 |
-
"sacrebleu": 0.
|
1237 |
-
"score": 0.
|
1238 |
"score_name": "sacrebleu",
|
1239 |
-
"score_ci_low": 0.
|
1240 |
-
"score_ci_high": 0.
|
1241 |
-
"sacrebleu_ci_low": 0.
|
1242 |
-
"sacrebleu_ci_high": 0.
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
1250 |
-
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
1809,
|
@@ -1256,27 +1256,27 @@
|
|
1256 |
1611
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
-
0.
|
1260 |
-
0.
|
1261 |
-
0.
|
1262 |
-
0.
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
"sys_len": 1809,
|
1266 |
"ref_len": 1734,
|
1267 |
-
"sacrebleu": 0.
|
1268 |
-
"score": 0.
|
1269 |
"score_name": "sacrebleu",
|
1270 |
-
"score_ci_low": 0.
|
1271 |
-
"score_ci_high": 0.
|
1272 |
-
"sacrebleu_ci_low": 0.
|
1273 |
-
"sacrebleu_ci_high": 0.
|
1274 |
},
|
1275 |
-
"score": 0.
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
-
"score": 0.
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-22T19:05:29.772171Z",
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.37777777777777777,
|
180 |
+
"accuracy_ci_low": 0.28888888888888886,
|
181 |
+
"accuracy_ci_high": 0.4888888888888889,
|
182 |
"score_name": "accuracy",
|
183 |
+
"score": 0.37777777777777777,
|
184 |
+
"score_ci_high": 0.4888888888888889,
|
185 |
+
"score_ci_low": 0.28888888888888886,
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.45555555555555555,
|
190 |
+
"accuracy_ci_low": 0.35555555555555557,
|
191 |
+
"accuracy_ci_high": 0.5666666666666667,
|
192 |
"score_name": "accuracy",
|
193 |
+
"score": 0.45555555555555555,
|
194 |
+
"score_ci_high": 0.5666666666666667,
|
195 |
+
"score_ci_low": 0.35555555555555557,
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.3888888888888889,
|
200 |
+
"accuracy_ci_low": 0.3,
|
201 |
+
"accuracy_ci_high": 0.4888888888888889,
|
202 |
"score_name": "accuracy",
|
203 |
+
"score": 0.3888888888888889,
|
204 |
+
"score_ci_high": 0.4888888888888889,
|
205 |
+
"score_ci_low": 0.3,
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.37777777777777777,
|
210 |
"accuracy_ci_low": 0.28888888888888886,
|
211 |
+
"accuracy_ci_high": 0.4777777777777778,
|
212 |
"score_name": "accuracy",
|
213 |
+
"score": 0.37777777777777777,
|
214 |
+
"score_ci_high": 0.4777777777777778,
|
215 |
"score_ci_low": 0.28888888888888886,
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.36666666666666664,
|
220 |
+
"accuracy_ci_low": 0.2777777777777778,
|
221 |
+
"accuracy_ci_high": 0.4777777777777778,
|
222 |
"score_name": "accuracy",
|
223 |
+
"score": 0.36666666666666664,
|
224 |
+
"score_ci_high": 0.4777777777777778,
|
225 |
+
"score_ci_low": 0.2777777777777778,
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.4666666666666667,
|
230 |
+
"accuracy_ci_low": 0.37436916691430816,
|
231 |
+
"accuracy_ci_high": 0.5777777777777777,
|
232 |
"score_name": "accuracy",
|
233 |
+
"score": 0.4666666666666667,
|
234 |
+
"score_ci_high": 0.5777777777777777,
|
235 |
+
"score_ci_low": 0.37436916691430816,
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.43333333333333335,
|
240 |
+
"accuracy_ci_low": 0.32222222222222224,
|
241 |
+
"accuracy_ci_high": 0.5333333333333333,
|
242 |
"score_name": "accuracy",
|
243 |
+
"score": 0.43333333333333335,
|
244 |
+
"score_ci_high": 0.5333333333333333,
|
245 |
+
"score_ci_low": 0.32222222222222224,
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.4111111111111111,
|
250 |
+
"accuracy_ci_low": 0.3111111111111111,
|
251 |
+
"accuracy_ci_high": 0.5111111111111111,
|
252 |
"score_name": "accuracy",
|
253 |
+
"score": 0.4111111111111111,
|
254 |
+
"score_ci_high": 0.5111111111111111,
|
255 |
+
"score_ci_low": 0.3111111111111111,
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.3888888888888889,
|
260 |
+
"accuracy_ci_low": 0.28888888888888886,
|
261 |
+
"accuracy_ci_high": 0.4888888888888889,
|
262 |
"score_name": "accuracy",
|
263 |
+
"score": 0.3888888888888889,
|
264 |
+
"score_ci_high": 0.4888888888888889,
|
265 |
+
"score_ci_low": 0.28888888888888886,
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
"accuracy": 0.5111111111111111,
|
270 |
"accuracy_ci_low": 0.4111111111111111,
|
271 |
+
"accuracy_ci_high": 0.6222222222222222,
|
272 |
"score_name": "accuracy",
|
273 |
"score": 0.5111111111111111,
|
274 |
+
"score_ci_high": 0.6222222222222222,
|
275 |
"score_ci_low": 0.4111111111111111,
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.3888888888888889,
|
280 |
"accuracy_ci_low": 0.3,
|
281 |
+
"accuracy_ci_high": 0.5,
|
282 |
"score_name": "accuracy",
|
283 |
+
"score": 0.3888888888888889,
|
284 |
+
"score_ci_high": 0.5,
|
285 |
"score_ci_low": 0.3,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
+
"score": 0.41515151515151516,
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.01950078003120125,
|
296 |
+
"score": 0.01950078003120125,
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
+
"score": 0.01950078003120125,
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.44648318042813456,
|
307 |
+
"f1_Organization": 0.2157676348547718,
|
308 |
+
"f1_Location": 0.16666666666666669,
|
309 |
+
"f1_macro": 0.27630582731652437,
|
310 |
+
"recall_macro": 0.20087031380401354,
|
311 |
+
"precision_macro": 0.48225440495177335,
|
312 |
+
"in_classes_support": 0.6990595611285266,
|
313 |
+
"f1_micro": 0.2701421800947867,
|
314 |
+
"recall_micro": 0.21714285714285714,
|
315 |
+
"precision_micro": 0.3573667711598746,
|
316 |
+
"score": 0.2701421800947867,
|
317 |
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.2188760455473918,
|
319 |
+
"score_ci_high": 0.31166112583088945,
|
320 |
+
"f1_micro_ci_low": 0.2188760455473918,
|
321 |
+
"f1_micro_ci_high": 0.31166112583088945
|
322 |
},
|
323 |
+
"score": 0.2701421800947867,
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.28169014084507044,
|
330 |
+
"accuracy_ci_low": 0.18309859154929578,
|
331 |
+
"accuracy_ci_high": 0.40138961326568784,
|
332 |
"score_name": "accuracy",
|
333 |
+
"score": 0.28169014084507044,
|
334 |
+
"score_ci_high": 0.40138961326568784,
|
335 |
+
"score_ci_low": 0.18309859154929578,
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.16901408450704225,
|
340 |
+
"accuracy_ci_low": 0.09859154929577464,
|
341 |
+
"accuracy_ci_high": 0.28169014084507044,
|
342 |
"score_name": "accuracy",
|
343 |
+
"score": 0.16901408450704225,
|
344 |
+
"score_ci_high": 0.28169014084507044,
|
345 |
+
"score_ci_low": 0.09859154929577464,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.08450704225352113,
|
350 |
+
"accuracy_ci_low": 0.04225352112676056,
|
351 |
+
"accuracy_ci_high": 0.16901408450704225,
|
352 |
"score_name": "accuracy",
|
353 |
+
"score": 0.08450704225352113,
|
354 |
+
"score_ci_high": 0.16901408450704225,
|
355 |
+
"score_ci_low": 0.04225352112676056,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.22535211267605634,
|
360 |
+
"accuracy_ci_low": 0.14084507042253522,
|
361 |
+
"accuracy_ci_high": 0.323943661971831,
|
362 |
"score_name": "accuracy",
|
363 |
+
"score": 0.22535211267605634,
|
364 |
+
"score_ci_high": 0.323943661971831,
|
365 |
+
"score_ci_low": 0.14084507042253522,
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.38028169014084506,
|
370 |
+
"accuracy_ci_low": 0.2676056338028169,
|
371 |
+
"accuracy_ci_high": 0.49295774647887325,
|
372 |
"score_name": "accuracy",
|
373 |
+
"score": 0.38028169014084506,
|
374 |
+
"score_ci_high": 0.49295774647887325,
|
375 |
+
"score_ci_low": 0.2676056338028169,
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
"accuracy": 0.18309859154929578,
|
380 |
"accuracy_ci_low": 0.09859154929577464,
|
381 |
"accuracy_ci_high": 0.28169014084507044,
|
|
|
385 |
"score_ci_low": 0.09859154929577464,
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
"accuracy": 0.11267605633802817,
|
390 |
"accuracy_ci_low": 0.056338028169014086,
|
391 |
+
"accuracy_ci_high": 0.20762427324557167,
|
392 |
"score_name": "accuracy",
|
393 |
"score": 0.11267605633802817,
|
394 |
+
"score_ci_high": 0.20762427324557167,
|
395 |
"score_ci_low": 0.056338028169014086,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
"accuracy": 0.09859154929577464,
|
400 |
"accuracy_ci_low": 0.04225352112676056,
|
401 |
"accuracy_ci_high": 0.18309859154929578,
|
|
|
405 |
"score_ci_low": 0.04225352112676056,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.16901408450704225,
|
410 |
+
"accuracy_ci_low": 0.09859154929577464,
|
411 |
+
"accuracy_ci_high": 0.2676056338028169,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.16901408450704225,
|
414 |
+
"score_ci_high": 0.2676056338028169,
|
415 |
+
"score_ci_low": 0.09859154929577464,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.14084507042253522,
|
420 |
+
"accuracy_ci_low": 0.07042253521126761,
|
421 |
+
"accuracy_ci_high": 0.2535211267605634,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.14084507042253522,
|
424 |
+
"score_ci_high": 0.2535211267605634,
|
425 |
+
"score_ci_low": 0.07042253521126761,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.11267605633802817,
|
430 |
+
"accuracy_ci_low": 0.056338028169014086,
|
431 |
+
"accuracy_ci_high": 0.2112676056338028,
|
432 |
"score_name": "accuracy",
|
433 |
+
"score": 0.11267605633802817,
|
434 |
+
"score_ci_high": 0.2112676056338028,
|
435 |
+
"score_ci_low": 0.056338028169014086,
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.11267605633802817,
|
440 |
+
"accuracy_ci_low": 0.056338028169014086,
|
441 |
+
"accuracy_ci_high": 0.2112676056338028,
|
442 |
"score_name": "accuracy",
|
443 |
+
"score": 0.11267605633802817,
|
444 |
+
"score_ci_high": 0.2112676056338028,
|
445 |
+
"score_ci_low": 0.056338028169014086,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.08450704225352113,
|
450 |
+
"accuracy_ci_low": 0.028169014084507043,
|
451 |
+
"accuracy_ci_high": 0.15492957746478872,
|
452 |
"score_name": "accuracy",
|
453 |
+
"score": 0.08450704225352113,
|
454 |
+
"score_ci_high": 0.15492957746478872,
|
455 |
+
"score_ci_low": 0.028169014084507043,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
"accuracy": 0.38028169014084506,
|
460 |
"accuracy_ci_low": 0.2676056338028169,
|
461 |
+
"accuracy_ci_high": 0.5070422535211268,
|
462 |
"score_name": "accuracy",
|
463 |
"score": 0.38028169014084506,
|
464 |
+
"score_ci_high": 0.5070422535211268,
|
465 |
"score_ci_low": 0.2676056338028169,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
+
"score": 0.18108651911468812,
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.2201422254824052,
|
475 |
+
"f1_suggestive": 0.2631578947368421,
|
476 |
+
"f1_arbitrary": 0.22857142857142856,
|
477 |
+
"f1_generic": 0.24390243902439024,
|
478 |
+
"f1_fanciful": 0.2222222222222222,
|
479 |
+
"f1_descriptive": 0.14285714285714285,
|
480 |
+
"f1_macro_ci_low": 0.14283541117516643,
|
481 |
+
"f1_macro_ci_high": 0.3249210624357632,
|
482 |
"score_name": "f1_micro",
|
483 |
+
"score": 0.22485207100591717,
|
484 |
+
"score_ci_high": 0.32142857142857145,
|
485 |
+
"score_ci_low": 0.14281093882602658,
|
486 |
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.2235294117647059,
|
488 |
+
"accuracy_ci_low": 0.1411764705882353,
|
489 |
+
"accuracy_ci_high": 0.3176470588235294,
|
490 |
+
"f1_micro": 0.22485207100591717,
|
491 |
+
"f1_micro_ci_low": 0.14281093882602658,
|
492 |
+
"f1_micro_ci_high": 0.32142857142857145
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.49296028880866427,
|
496 |
+
"f1_no": 0.6859205776173285,
|
497 |
+
"f1_yes": 0.3,
|
498 |
+
"f1_macro_ci_low": 0.42550135126538996,
|
499 |
+
"f1_macro_ci_high": 0.5612203343628853,
|
500 |
"score_name": "f1_micro",
|
501 |
+
"score": 0.5692695214105793,
|
502 |
+
"score_ci_high": 0.6329113924050633,
|
503 |
+
"score_ci_low": 0.4962025316455696,
|
504 |
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.565,
|
506 |
+
"accuracy_ci_low": 0.4918996659624703,
|
507 |
+
"accuracy_ci_high": 0.63,
|
508 |
+
"f1_micro": 0.5692695214105793,
|
509 |
+
"f1_micro_ci_low": 0.4962025316455696,
|
510 |
+
"f1_micro_ci_high": 0.6329113924050633
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.17282485903398895,
|
514 |
+
"f1_conclusion": 0.047619047619047616,
|
515 |
+
"f1_analysis": 0.2898550724637681,
|
516 |
+
"f1_decree": 0.25806451612903225,
|
517 |
+
"f1_facts": 0.09302325581395349,
|
518 |
+
"f1_issue": 0.13333333333333333,
|
519 |
+
"f1_rule": 0.26666666666666666,
|
520 |
+
"f1_procedural history": 0.12121212121212122,
|
521 |
+
"f1_macro_ci_low": 0.1258149629713259,
|
522 |
+
"f1_macro_ci_high": 0.23750827438601588,
|
523 |
"score_name": "f1_micro",
|
524 |
+
"score": 0.20408163265306123,
|
525 |
+
"score_ci_high": 0.26463104325699743,
|
526 |
+
"score_ci_low": 0.15267175572519084,
|
527 |
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.2,
|
529 |
+
"accuracy_ci_low": 0.15,
|
530 |
+
"accuracy_ci_high": 0.25995049710654655,
|
531 |
+
"f1_micro": 0.20408163265306123,
|
532 |
+
"f1_micro_ci_low": 0.15267175572519084,
|
533 |
+
"f1_micro_ci_high": 0.26463104325699743
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.4769502535757023,
|
537 |
+
"f1_yes": 0.5688073394495413,
|
538 |
+
"f1_no": 0.38509316770186336,
|
539 |
+
"f1_macro_ci_low": 0.4074761191353562,
|
540 |
+
"f1_macro_ci_high": 0.542706355356233,
|
541 |
"score_name": "f1_micro",
|
542 |
+
"score": 0.49076517150395776,
|
543 |
+
"score_ci_high": 0.5549738219895288,
|
544 |
+
"score_ci_low": 0.4183693762852218,
|
545 |
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.465,
|
547 |
+
"accuracy_ci_low": 0.395,
|
548 |
+
"accuracy_ci_high": 0.53,
|
549 |
+
"f1_micro": 0.49076517150395776,
|
550 |
+
"f1_micro_ci_low": 0.4183693762852218,
|
551 |
+
"f1_micro_ci_high": 0.5549738219895288
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.6890476190476191,
|
555 |
+
"f1_yes": 0.64,
|
556 |
+
"f1_no": 0.7380952380952381,
|
557 |
+
"f1_macro_ci_low": 0.5867043850592167,
|
558 |
+
"f1_macro_ci_high": 0.775,
|
559 |
"score_name": "f1_micro",
|
560 |
+
"score": 0.6918238993710691,
|
561 |
+
"score_ci_high": 0.7770700636942676,
|
562 |
+
"score_ci_low": 0.586011156606,
|
563 |
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.6470588235294118,
|
565 |
+
"accuracy_ci_low": 0.5411764705882353,
|
566 |
+
"accuracy_ci_high": 0.7411764705882353,
|
567 |
+
"f1_micro": 0.6918238993710691,
|
568 |
+
"f1_micro_ci_low": 0.586011156606,
|
569 |
+
"f1_micro_ci_high": 0.7770700636942676
|
570 |
},
|
571 |
+
"score": 0.43615845918891694,
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.2164270451621233,
|
578 |
+
"f1_cars": 0.5263157894736842,
|
579 |
"f1_windows x": 0.0,
|
580 |
+
"f1_atheism": 0.19047619047619047,
|
581 |
+
"f1_religion": 0.07692307692307693,
|
582 |
+
"f1_medicine": 0.3050847457627119,
|
583 |
+
"f1_hockey": 0.3516483516483517,
|
584 |
+
"f1_christianity": 0.29850746268656714,
|
585 |
+
"f1_computer graphics": 0.125,
|
586 |
+
"f1_microsoft windows": 0.03508771929824561,
|
587 |
"f1_middle east": 0.125,
|
588 |
+
"f1_motorcycles": 0.23684210526315788,
|
589 |
+
"f1_cryptography": 0.2702702702702703,
|
590 |
"f1_mac hardware": 0.0,
|
591 |
+
"f1_politics": 0.22818791946308725,
|
592 |
+
"f1_electronics": 0.23529411764705882,
|
593 |
"f1_for sale": 0.0,
|
594 |
+
"f1_guns": 0.14035087719298245,
|
|
|
595 |
"f1_space": 0.39436619718309857,
|
596 |
+
"f1_pc hardware": 0.03508771929824561,
|
597 |
+
"f1_baseball": 0.7540983606557377,
|
598 |
+
"f1_macro_ci_low": 0.19503700990493542,
|
599 |
+
"f1_macro_ci_high": 0.24363185833629145,
|
|
|
|
|
600 |
"score_name": "f1_micro",
|
601 |
+
"score": 0.25332400279916023,
|
602 |
+
"score_ci_high": 0.2842103070323454,
|
603 |
+
"score_ci_low": 0.2255621673024344,
|
604 |
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.181,
|
606 |
+
"accuracy_ci_low": 0.16,
|
607 |
+
"accuracy_ci_high": 0.206,
|
608 |
+
"f1_micro": 0.25332400279916023,
|
609 |
+
"f1_micro_ci_low": 0.2255621673024344,
|
610 |
+
"f1_micro_ci_high": 0.2842103070323454
|
611 |
},
|
612 |
+
"score": 0.25332400279916023,
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.17253682837845144,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.7386276021588281,
|
620 |
+
"f1_checking or savings account": 0.15053763440860216,
|
621 |
+
"f1_debt collection": 0.19310344827586207,
|
622 |
+
"f1_credit card or prepaid card": 0.1038961038961039,
|
623 |
+
"f1_mortgage": 0.13333333333333333,
|
624 |
+
"f1_vehicle loan or lease": 0.16666666666666666,
|
625 |
+
"f1_student loan": 0.0,
|
626 |
"f1_payday loan or title loan or personal loan": 0.0,
|
627 |
+
"f1_money transfer or virtual currency or money service": 0.06666666666666667,
|
628 |
+
"f1_macro_ci_low": 0.14360181188460305,
|
629 |
+
"f1_macro_ci_high": 0.21482869629805593,
|
|
|
|
|
630 |
"score_name": "f1_micro",
|
631 |
+
"score": 0.5875576036866359,
|
632 |
+
"score_ci_high": 0.6180717759541877,
|
633 |
+
"score_ci_low": 0.5574039394995974,
|
634 |
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.51,
|
636 |
+
"accuracy_ci_low": 0.4802728156816149,
|
637 |
+
"accuracy_ci_high": 0.541,
|
638 |
+
"f1_micro": 0.5875576036866359,
|
639 |
+
"f1_micro_ci_low": 0.5574039394995974,
|
640 |
+
"f1_micro_ci_high": 0.6180717759541877
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.36175351677566253,
|
644 |
+
"f1_mortgages and loans": 0.4264705882352941,
|
645 |
+
"f1_credit card": 0.29310344827586204,
|
646 |
+
"f1_debt collection": 0.44360902255639095,
|
647 |
+
"f1_credit reporting": 0.5724137931034483,
|
648 |
+
"f1_retail banking": 0.07317073170731707,
|
649 |
+
"f1_macro_ci_low": 0.31986969619833744,
|
650 |
+
"f1_macro_ci_high": 0.4037922302792007,
|
651 |
"score_name": "f1_micro",
|
652 |
+
"score": 0.42921348314606744,
|
653 |
+
"score_ci_high": 0.47176643035248556,
|
654 |
+
"score_ci_low": 0.38170408070231343,
|
655 |
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.382,
|
657 |
+
"accuracy_ci_low": 0.34,
|
658 |
+
"accuracy_ci_high": 0.424,
|
659 |
+
"f1_micro": 0.42921348314606744,
|
660 |
+
"f1_micro_ci_low": 0.38170408070231343,
|
661 |
+
"f1_micro_ci_high": 0.47176643035248556
|
662 |
},
|
663 |
+
"score": 0.5083855434163517,
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
+
"execution_accuracy": 0.013,
|
671 |
+
"program_accuracy": 0.017,
|
672 |
+
"score": 0.017,
|
673 |
"score_name": "program_accuracy",
|
674 |
+
"execution_accuracy_ci_low": 0.007,
|
675 |
+
"execution_accuracy_ci_high": 0.022,
|
676 |
+
"program_accuracy_ci_low": 0.01,
|
677 |
+
"program_accuracy_ci_high": 0.027,
|
678 |
+
"score_ci_low": 0.01,
|
679 |
+
"score_ci_high": 0.027
|
|
|
680 |
},
|
681 |
+
"score": 0.017,
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3198532673532609,
|
688 |
+
"recall": 0.4011596558810935,
|
689 |
+
"f1": 0.2852470156019899,
|
690 |
+
"precision_ci_low": 0.29836535944826126,
|
691 |
+
"precision_ci_high": 0.34239842009004323,
|
692 |
+
"recall_ci_low": 0.3833246258154271,
|
693 |
+
"recall_ci_high": 0.4181134290525644,
|
694 |
+
"f1_ci_low": 0.26905618569014855,
|
695 |
+
"f1_ci_high": 0.3006017844609168,
|
696 |
"score_name": "f1",
|
697 |
+
"score": 0.2852470156019899,
|
698 |
+
"score_ci_high": 0.3006017844609168,
|
699 |
+
"score_ci_low": 0.26905618569014855,
|
700 |
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.5655807377894719,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6124767065048218,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5509337742378314,
|
704 |
+
"faithfullness_f1_token_overlap": 0.2309696715819019,
|
705 |
+
"faithfullness_recall_token_overlap": 0.17063637636202156,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5816026494498466,
|
707 |
+
"correctness_f1_token_overlap": 0.2852470156019899,
|
708 |
+
"correctness_recall_token_overlap": 0.4011596558810935,
|
709 |
+
"correctness_precision_token_overlap": 0.3198532673532609
|
710 |
},
|
711 |
+
"score": 0.2852470156019899,
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
+
"accuracy": 0.284,
|
718 |
+
"accuracy_ci_low": 0.259,
|
719 |
+
"accuracy_ci_high": 0.311,
|
720 |
"score_name": "accuracy",
|
721 |
+
"score": 0.284,
|
722 |
+
"score_ci_high": 0.311,
|
723 |
+
"score_ci_low": 0.259,
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
+
"accuracy": 0.528,
|
728 |
+
"accuracy_ci_low": 0.482,
|
729 |
+
"accuracy_ci_high": 0.568,
|
730 |
"score_name": "accuracy",
|
731 |
+
"score": 0.528,
|
732 |
+
"score_ci_high": 0.568,
|
733 |
+
"score_ci_low": 0.482,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
+
"score": 0.406,
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
"severity_critical": 0.0,
|
743 |
+
"severity_high": 1.0,
|
744 |
+
"severity_medium": 5.0,
|
745 |
"severity_low": 94.0,
|
746 |
+
"category_harmful_info": 0.8219608657416844,
|
747 |
+
"category_deception": 0.8234998996962202,
|
748 |
+
"category_explicit_content": 0.8439478427548,
|
749 |
+
"category_violence": 0.8219440165138435,
|
750 |
+
"category_discrimination": 0.6564966686188228,
|
751 |
+
"category_substance_abuse": 0.8870111008485158,
|
752 |
+
"category_pii": 0.8831345179566631,
|
753 |
+
"safety": 0.8180480572415724,
|
754 |
+
"safety_ci_low": 0.7930656252748522,
|
755 |
+
"safety_ci_high": 0.838062627359591,
|
756 |
"score_name": "safety",
|
757 |
+
"score": 0.8180480572415724,
|
758 |
+
"score_ci_high": 0.838062627359591,
|
759 |
+
"score_ci_low": 0.7930656252748522,
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
+
"score": 0.8180480572415724,
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
+
"rougeL": 0.2815736998609776,
|
770 |
+
"score": 0.2815736998609776,
|
|
|
|
|
771 |
"score_name": "rougeL",
|
772 |
+
"rougeLsum": 0.34760450741431803,
|
773 |
+
"rouge2": 0.19849457046306532,
|
774 |
+
"rouge1": 0.4046054880676319,
|
775 |
+
"rougeL_ci_low": 0.2735435644090114,
|
776 |
+
"rougeL_ci_high": 0.29011999834027047,
|
777 |
+
"score_ci_low": 0.2735435644090114,
|
778 |
+
"score_ci_high": 0.29011999834027047,
|
779 |
+
"rougeLsum_ci_low": 0.33767767468970084,
|
780 |
+
"rougeLsum_ci_high": 0.35710807691804086,
|
781 |
+
"rouge2_ci_low": 0.19109481393979616,
|
782 |
+
"rouge2_ci_high": 0.20664772893408026,
|
783 |
+
"rouge1_ci_low": 0.3944251425211918,
|
784 |
+
"rouge1_ci_high": 0.4147568240146707
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
+
"rougeL": 0.08474301186053737,
|
789 |
+
"score": 0.08474301186053737,
|
|
|
|
|
790 |
"score_name": "rougeL",
|
791 |
+
"rougeLsum": 0.09636762322033209,
|
792 |
+
"rouge2": 0.015597888373505451,
|
793 |
+
"rouge1": 0.11636905030749585,
|
794 |
+
"rougeL_ci_low": 0.08126431007647106,
|
795 |
+
"rougeL_ci_high": 0.08828857560838864,
|
796 |
+
"score_ci_low": 0.08126431007647106,
|
797 |
+
"score_ci_high": 0.08828857560838864,
|
798 |
+
"rougeLsum_ci_low": 0.09229796806654987,
|
799 |
+
"rougeLsum_ci_high": 0.10047301966535477,
|
800 |
+
"rouge2_ci_low": 0.013877999787076423,
|
801 |
+
"rouge2_ci_high": 0.017527052297316516,
|
802 |
+
"rouge1_ci_low": 0.11092613898018398,
|
803 |
+
"rouge1_ci_high": 0.12141188563840967
|
804 |
},
|
805 |
+
"score": 0.18315835586075746,
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
+
965,
|
814 |
+
453,
|
815 |
+
254,
|
816 |
+
143
|
817 |
],
|
818 |
"totals": [
|
819 |
+
1792,
|
820 |
+
1726,
|
821 |
+
1660,
|
822 |
+
1594
|
823 |
],
|
824 |
"precisions": [
|
825 |
+
0.5385044642857143,
|
826 |
+
0.26245654692931636,
|
827 |
+
0.1530120481927711,
|
828 |
+
0.08971141781681306
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
+
"sys_len": 1792,
|
832 |
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.20987224921574224,
|
834 |
+
"score": 0.20987224921574224,
|
835 |
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.18278483808284093,
|
837 |
+
"score_ci_high": 0.24710116888154685,
|
838 |
+
"sacrebleu_ci_low": 0.18278483808284093,
|
839 |
+
"sacrebleu_ci_high": 0.24710116888154685
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
+
1117,
|
845 |
+
629,
|
846 |
+
389,
|
847 |
+
250
|
848 |
],
|
849 |
"totals": [
|
850 |
+
1750,
|
851 |
+
1684,
|
852 |
+
1618,
|
853 |
+
1552
|
854 |
],
|
855 |
"precisions": [
|
856 |
+
0.6382857142857143,
|
857 |
+
0.37351543942992876,
|
858 |
+
0.24042027194066748,
|
859 |
+
0.16108247422680413
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
+
"sys_len": 1750,
|
863 |
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.30998149224900357,
|
865 |
+
"score": 0.30998149224900357,
|
866 |
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.27669479922631884,
|
868 |
+
"score_ci_high": 0.35743043699935445,
|
869 |
+
"sacrebleu_ci_low": 0.27669479922631884,
|
870 |
+
"sacrebleu_ci_high": 0.35743043699935445
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
485,
|
876 |
+
128,
|
877 |
+
45,
|
878 |
+
13
|
879 |
],
|
880 |
"totals": [
|
881 |
+
1633,
|
882 |
+
1567,
|
883 |
+
1501,
|
884 |
+
1435
|
885 |
],
|
886 |
"precisions": [
|
887 |
+
0.2969993876301286,
|
888 |
+
0.08168474792597319,
|
889 |
+
0.02998001332445037,
|
890 |
+
0.009059233449477351
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
+
"sys_len": 1633,
|
894 |
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.05066463869983458,
|
896 |
+
"score": 0.05066463869983458,
|
897 |
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.033695533177424505,
|
899 |
+
"score_ci_high": 0.07329875078984167,
|
900 |
+
"sacrebleu_ci_low": 0.033695533177424505,
|
901 |
+
"sacrebleu_ci_high": 0.07329875078984167
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
+
953,
|
907 |
+
451,
|
908 |
+
252,
|
909 |
+
149
|
910 |
],
|
911 |
"totals": [
|
912 |
+
1838,
|
913 |
+
1772,
|
914 |
+
1706,
|
915 |
+
1640
|
916 |
],
|
917 |
"precisions": [
|
918 |
+
0.5184983677910773,
|
919 |
+
0.25451467268623024,
|
920 |
+
0.1477139507620164,
|
921 |
+
0.09085365853658538
|
922 |
],
|
923 |
+
"bp": 1.0,
|
924 |
+
"sys_len": 1838,
|
925 |
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.20514268622714965,
|
927 |
+
"score": 0.20514268622714965,
|
928 |
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.1759309759364383,
|
930 |
+
"score_ci_high": 0.2610907281316971,
|
931 |
+
"sacrebleu_ci_low": 0.1759309759364383,
|
932 |
+
"sacrebleu_ci_high": 0.2610907281316971
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
+
1253,
|
938 |
+
748,
|
939 |
+
501,
|
940 |
+
343
|
941 |
],
|
942 |
"totals": [
|
943 |
+
1957,
|
944 |
+
1891,
|
945 |
+
1825,
|
946 |
+
1759
|
947 |
],
|
948 |
"precisions": [
|
949 |
+
0.6402657128257537,
|
950 |
+
0.3955579058699101,
|
951 |
+
0.2745205479452055,
|
952 |
+
0.19499715747583854
|
953 |
],
|
954 |
+
"bp": 0.9448590948597164,
|
955 |
+
"sys_len": 1957,
|
956 |
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.32241141585526967,
|
958 |
+
"score": 0.32241141585526967,
|
959 |
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.28766013219666603,
|
961 |
+
"score_ci_high": 0.37380260101110974,
|
962 |
+
"sacrebleu_ci_low": 0.28766013219666603,
|
963 |
+
"sacrebleu_ci_high": 0.37380260101110974
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
+
885,
|
969 |
+
311,
|
970 |
+
127,
|
971 |
+
61
|
972 |
],
|
973 |
"totals": [
|
974 |
+
2604,
|
975 |
+
2538,
|
976 |
+
2472,
|
977 |
+
2406
|
978 |
],
|
979 |
"precisions": [
|
980 |
+
0.33986175115207373,
|
981 |
+
0.12253743104806934,
|
982 |
+
0.05137540453074434,
|
983 |
+
0.025353283458021614
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
+
"sys_len": 2604,
|
987 |
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.08582032051210414,
|
989 |
+
"score": 0.08582032051210414,
|
990 |
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.06950389399091811,
|
992 |
+
"score_ci_high": 0.10763503120611631,
|
993 |
+
"sacrebleu_ci_low": 0.06950389399091811,
|
994 |
+
"sacrebleu_ci_high": 0.10763503120611631
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
+
1206,
|
1000 |
+
704,
|
1001 |
+
461,
|
1002 |
+
305
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
+
1897,
|
1006 |
+
1831,
|
1007 |
+
1765,
|
1008 |
+
1699
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
+
0.6357406431207169,
|
1012 |
+
0.3844893500819224,
|
1013 |
+
0.26118980169971673,
|
1014 |
+
0.17951736315479694
|
1015 |
],
|
1016 |
+
"bp": 0.9900341767854584,
|
1017 |
+
"sys_len": 1897,
|
1018 |
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.32393429479632424,
|
1020 |
+
"score": 0.32393429479632424,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.27986805538812254,
|
1023 |
+
"score_ci_high": 0.35639693336265377,
|
1024 |
+
"sacrebleu_ci_low": 0.27986805538812254,
|
1025 |
+
"sacrebleu_ci_high": 0.35639693336265377
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
1028,
|
1031 |
+
493,
|
1032 |
+
278,
|
1033 |
+
158
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
+
1947,
|
1037 |
+
1037 |           1881,
1038 | +         1815,
1039 | +         1749
1040 |         ],
1041 |         "precisions": [
1042 | +         0.5279917822290704,
1043 | +         0.26209463051568316,
1044 | +         0.15316804407713497,
1045 | +         0.0903373356203545
1046 |         ],
1047 | +       "bp": 0.9989733060450584,
1048 | +       "sys_len": 1947,
1049 |         "ref_len": 1949,
1050 | +       "sacrebleu": 0.20897005939347885,
1051 | +       "score": 0.20897005939347885,
1052 |         "score_name": "sacrebleu",
1053 | +       "score_ci_low": 0.18451953543688365,
1054 | +       "score_ci_high": 0.24162363180192453,
1055 | +       "sacrebleu_ci_low": 0.18451953543688365,
1056 | +       "sacrebleu_ci_high": 0.24162363180192453
1057 |       },
1058 |       "mt_flores_101_eng_spa": {
1059 |         "num_of_instances": 66,
1060 |         "counts": [
1061 |           1122,
1062 | +         520,
1063 | +         265,
1064 | +         132
1065 |         ],
1066 |         "totals": [
1067 | +         1976,
1068 | +         1910,
1069 | +         1844,
1070 | +         1778
1071 |         ],
1072 |         "precisions": [
1073 | +         0.5678137651821862,
1074 | +         0.27225130890052357,
1075 | +         0.14370932754880694,
1076 | +         0.07424071991001126
1077 |         ],
1078 | +       "bp": 0.940126450752485,
1079 | +       "sys_len": 1976,
1080 |         "ref_len": 2098,
1081 | +       "sacrebleu": 0.18945759851867444,
1082 | +       "score": 0.18945759851867444,
1083 |         "score_name": "sacrebleu",
1084 | +       "score_ci_low": 0.1622061427536013,
1085 | +       "score_ci_high": 0.21752219857602634,
1086 | +       "sacrebleu_ci_low": 0.1622061427536013,
1087 | +       "sacrebleu_ci_high": 0.21752219857602634
1088 |       },
1089 |       "mt_flores_101_fra_eng": {
1090 |         "num_of_instances": 66,
1091 |         "counts": [
1092 | +         1203,
1093 | +         711,
1094 | +         461,
1095 | +         311
1096 |         ],
1097 |         "totals": [
1098 | +         1781,
1099 | +         1715,
1100 | +         1649,
1101 | +         1583
1102 |         ],
1103 |         "precisions": [
1104 | +         0.6754632229084784,
1105 | +         0.4145772594752187,
1106 | +         0.27956337174044876,
1107 | +         0.19646241313960833
1108 |         ],
1109 |         "bp": 1.0,
1110 | +       "sys_len": 1781,
1111 |         "ref_len": 1734,
1112 | +       "sacrebleu": 0.3521613840302072,
1113 | +       "score": 0.3521613840302072,
1114 |         "score_name": "sacrebleu",
1115 | +       "score_ci_low": 0.3167971458716246,
1116 | +       "score_ci_high": 0.3951489450129151,
1117 | +       "sacrebleu_ci_low": 0.3167971458716246,
1118 | +       "sacrebleu_ci_high": 0.3951489450129151
1119 |       },
1120 |       "mt_flores_101_jpn_eng": {
1121 |         "num_of_instances": 66,
1122 |         "counts": [
1123 | +         850,
1124 | +         315,
1125 | +         147,
1126 | +         72
1127 |         ],
1128 |         "totals": [
1129 | +         1724,
1130 | +         1658,
1131 | +         1592,
1132 | +         1526
1133 |         ],
1134 |         "precisions": [
1135 | +         0.49303944315545245,
1136 | +         0.18998793727382388,
1137 | +         0.09233668341708542,
1138 | +         0.047182175622542594
1139 |         ],
1140 | +       "bp": 0.9942163261750401,
1141 | +       "sys_len": 1724,
1142 |         "ref_len": 1734,
1143 | +       "sacrebleu": 0.14130934129693265,
1144 | +       "score": 0.14130934129693265,
1145 |         "score_name": "sacrebleu",
1146 | +       "score_ci_low": 0.12273663080433993,
1147 | +       "score_ci_high": 0.18194995219240426,
1148 | +       "sacrebleu_ci_low": 0.12273663080433993,
1149 | +       "sacrebleu_ci_high": 0.18194995219240426
1150 |       },
1151 |       "mt_flores_101_kor_eng": {
1152 |         "num_of_instances": 66,
1153 |         "counts": [
1154 | +         843,
1155 | +         293,
1156 | +         128,
1157 | +         64
1158 |         ],
1159 |         "totals": [
1160 | +         1778,
1161 | +         1712,
1162 | +         1646,
1163 | +         1580
1164 |         ],
1165 |         "precisions": [
1166 | +         0.47412823397075365,
1167 | +         0.17114485981308414,
1168 | +         0.07776427703523693,
1169 | +         0.04050632911392405
1170 |         ],
1171 | +       "bp": 1.0,
1172 | +       "sys_len": 1778,
1173 |         "ref_len": 1734,
1174 | +       "sacrebleu": 0.12644180180823753,
1175 | +       "score": 0.12644180180823753,
1176 |         "score_name": "sacrebleu",
1177 | +       "score_ci_low": 0.09927741599851922,
1178 | +       "score_ci_high": 0.18042643788312576,
1179 | +       "sacrebleu_ci_low": 0.09927741599851922,
1180 | +       "sacrebleu_ci_high": 0.18042643788312576
1181 |       },
1182 |       "mt_flores_101_por_eng": {
1183 |         "num_of_instances": 66,
1184 |         "counts": [
1185 | +         1181,
1186 | +         709,
1187 | +         467,
1188 | +         312
1189 |         ],
1190 |         "totals": [
1191 | +         1738,
1192 | +         1672,
1193 | +         1606,
1194 | +         1540
1195 |         ],
1196 |         "precisions": [
1197 | +         0.6795166858457997,
1198 | +         0.4240430622009569,
1199 | +         0.2907845579078456,
1200 | +         0.20259740259740258
1201 |         ],
1202 |         "bp": 1.0,
1203 | +       "sys_len": 1738,
1204 |         "ref_len": 1734,
1205 | +       "sacrebleu": 0.3609556341496431,
1206 | +       "score": 0.3609556341496431,
1207 |         "score_name": "sacrebleu",
1208 | +       "score_ci_low": 0.3241550044831716,
1209 | +       "score_ci_high": 0.41619397652312556,
1210 | +       "sacrebleu_ci_low": 0.3241550044831716,
1211 | +       "sacrebleu_ci_high": 0.41619397652312556
1212 |       },
1213 |       "mt_flores_101_ron_eng": {
1214 |         "num_of_instances": 66,
1215 |         "counts": [
1216 | +         1191,
1217 | +         698,
1218 | +         453,
1219 | +         298
1220 |         ],
1221 |         "totals": [
1222 | +         1820,
1223 | +         1754,
1224 | +         1688,
1225 | +         1622
1226 |         ],
1227 |         "precisions": [
1228 | +         0.6543956043956044,
1229 | +         0.3979475484606613,
1230 | +         0.2683649289099526,
1231 | +         0.18372379778051787
1232 |         ],
1233 |         "bp": 1.0,
1234 | +       "sys_len": 1820,
1235 |         "ref_len": 1734,
1236 | +       "sacrebleu": 0.3366195578842849,
1237 | +       "score": 0.3366195578842849,
1238 |         "score_name": "sacrebleu",
1239 | +       "score_ci_low": 0.29838967696947455,
1240 | +       "score_ci_high": 0.3695962539518517,
1241 | +       "sacrebleu_ci_low": 0.29838967696947455,
1242 | +       "sacrebleu_ci_high": 0.3695962539518517
1243 |       },
1244 |       "mt_flores_101_spa_eng": {
1245 |         "num_of_instances": 66,
1246 |         "counts": [
1247 | +         1057,
1248 | +         509,
1249 | +         284,
1250 | +         154
1251 |         ],
1252 |         "totals": [
1253 |           1809,
1256 |           1611
1257 |         ],
1258 |         "precisions": [
1259 | +         0.5843007186290768,
1260 | +         0.29202524383247275,
1261 | +         0.1693500298151461,
1262 | +         0.09559279950341402
1263 |         ],
1264 |         "bp": 1.0,
1265 |         "sys_len": 1809,
1266 |         "ref_len": 1734,
1267 | +       "sacrebleu": 0.229253945338322,
1268 | +       "score": 0.229253945338322,
1269 |         "score_name": "sacrebleu",
1270 | +       "score_ci_low": 0.2057022441776137,
1271 | +       "score_ci_high": 0.2587331578660881,
1272 | +       "sacrebleu_ci_low": 0.2057022441776137,
1273 | +       "sacrebleu_ci_high": 0.2587331578660881
1274 |       },
1275 | +     "score": 0.23019976133168057,
1276 |       "score_name": "subsets_mean",
1277 |       "num_of_instances": 990
1278 |     },
1279 | +   "score": 0.3094924761409708,
1280 |     "score_name": "subsets_mean",
1281 |     "num_of_instances": 12472
1282 |   }
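Each translation record in these files logs the standard sacreBLEU components (clipped n-gram "counts", "totals", the derived "precisions", and the brevity penalty "bp") next to the final "sacrebleu" value. As a quick consistency check, that value can be recomputed from the logged components. A minimal sketch in Python, using the numbers from the record directly above; the only assumption is that the reported figure is plain corpus BLEU, i.e. the brevity penalty times the geometric mean of the four n-gram precisions:

```python
import math

# Components copied from the translation record shown above (assumed to be the
# standard corpus-BLEU ingredients that sacreBLEU reports).
precisions = [0.5279917822290704, 0.26209463051568316,
              0.15316804407713497, 0.0903373356203545]
bp = 0.9989733060450584

# BLEU = brevity penalty * geometric mean of the 1- to 4-gram precisions.
bleu = bp * math.exp(sum(math.log(p) for p in precisions) / len(precisions))
print(round(bleu, 6))  # ~0.208970, matching the logged "sacrebleu" value
```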
results/bluebench/2025-06-22T17-10-54_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
1 |
+
{
|
2 |
+
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-22T21:10:50.634203Z",
|
4 |
+
"command_line_invocation": [
|
5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
+
"--tasks",
|
7 |
+
"benchmarks.bluebench",
|
8 |
+
"--model",
|
9 |
+
"cross_provider",
|
10 |
+
"--model_args",
|
11 |
+
"model_name=watsonx/meta-llama/llama-3-2-90b-vision-instruct,max_tokens=256",
|
12 |
+
"--output_path",
|
13 |
+
"./results/bluebench",
|
14 |
+
"--log_samples",
|
15 |
+
"--trust_remote_code",
|
16 |
+
"--batch_size",
|
17 |
+
"8",
|
18 |
+
"--verbosity",
|
19 |
+
"ERROR"
|
20 |
+
],
|
21 |
+
"parsed_arguments": {
|
22 |
+
"tasks": [
|
23 |
+
"benchmarks.bluebench"
|
24 |
+
],
|
25 |
+
"split": "test",
|
26 |
+
"num_fewshots": null,
|
27 |
+
"limit": null,
|
28 |
+
"batch_size": 8,
|
29 |
+
"model": "watsonx/meta-llama/llama-3-2-90b-vision-instruct",
|
30 |
+
"model_args": {
|
31 |
+
"max_tokens": 256
|
32 |
+
},
|
33 |
+
"gen_kwargs": null,
|
34 |
+
"chat_template_kwargs": null,
|
35 |
+
"output_path": "./results/bluebench",
|
36 |
+
"output_file_prefix": "evaluation_results",
|
37 |
+
"log_samples": true,
|
38 |
+
"verbosity": "ERROR",
|
39 |
+
"apply_chat_template": false,
|
40 |
+
"trust_remote_code": true,
|
41 |
+
"disable_hf_cache": false,
|
42 |
+
"cache_dir": null
|
43 |
+
},
|
44 |
+
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
+
"python_version": "3.10.18",
|
47 |
+
"system": "Linux",
|
48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
49 |
+
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
+
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
63 |
+
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
78 |
+
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
+
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
+
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
+
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
+
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
108 |
+
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
112 |
+
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
+
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
+
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
137 |
+
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
+
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
+
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
+
"jaraco.text": "3.12.1",
|
170 |
+
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"results": {
|
177 |
+
"bias": {
|
178 |
+
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.8222222222222222,
|
180 |
+
"accuracy_ci_low": 0.7333333333333333,
|
181 |
+
"accuracy_ci_high": 0.8888888888888888,
|
182 |
+
"score_name": "accuracy",
|
183 |
+
"score": 0.8222222222222222,
|
184 |
+
"score_ci_high": 0.8888888888888888,
|
185 |
+
"score_ci_low": 0.7333333333333333,
|
186 |
+
"num_of_instances": 90
|
187 |
+
},
|
188 |
+
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 1.0,
|
190 |
+
"accuracy_ci_low": 1.0,
|
191 |
+
"accuracy_ci_high": 1.0,
|
192 |
+
"score_name": "accuracy",
|
193 |
+
"score": 1.0,
|
194 |
+
"score_ci_high": 1.0,
|
195 |
+
"score_ci_low": 1.0,
|
196 |
+
"num_of_instances": 90
|
197 |
+
},
|
198 |
+
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.9888888888888889,
|
200 |
+
"accuracy_ci_low": 0.9366915726689814,
|
201 |
+
"accuracy_ci_high": 1.0,
|
202 |
+
"score_name": "accuracy",
|
203 |
+
"score": 0.9888888888888889,
|
204 |
+
"score_ci_high": 1.0,
|
205 |
+
"score_ci_low": 0.9366915726689814,
|
206 |
+
"num_of_instances": 90
|
207 |
+
},
|
208 |
+
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 1.0,
|
210 |
+
"accuracy_ci_low": 1.0,
|
211 |
+
"accuracy_ci_high": 1.0,
|
212 |
+
"score_name": "accuracy",
|
213 |
+
"score": 1.0,
|
214 |
+
"score_ci_high": 1.0,
|
215 |
+
"score_ci_low": 1.0,
|
216 |
+
"num_of_instances": 90
|
217 |
+
},
|
218 |
+
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 1.0,
|
220 |
+
"accuracy_ci_low": 1.0,
|
221 |
+
"accuracy_ci_high": 1.0,
|
222 |
+
"score_name": "accuracy",
|
223 |
+
"score": 1.0,
|
224 |
+
"score_ci_high": 1.0,
|
225 |
+
"score_ci_low": 1.0,
|
226 |
+
"num_of_instances": 90
|
227 |
+
},
|
228 |
+
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 1.0,
|
230 |
+
"accuracy_ci_low": 1.0,
|
231 |
+
"accuracy_ci_high": 1.0,
|
232 |
+
"score_name": "accuracy",
|
233 |
+
"score": 1.0,
|
234 |
+
"score_ci_high": 1.0,
|
235 |
+
"score_ci_low": 1.0,
|
236 |
+
"num_of_instances": 90
|
237 |
+
},
|
238 |
+
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 1.0,
|
240 |
+
"accuracy_ci_low": 1.0,
|
241 |
+
"accuracy_ci_high": 1.0,
|
242 |
+
"score_name": "accuracy",
|
243 |
+
"score": 1.0,
|
244 |
+
"score_ci_high": 1.0,
|
245 |
+
"score_ci_low": 1.0,
|
246 |
+
"num_of_instances": 90
|
247 |
+
},
|
248 |
+
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 1.0,
|
250 |
+
"accuracy_ci_low": 1.0,
|
251 |
+
"accuracy_ci_high": 1.0,
|
252 |
+
"score_name": "accuracy",
|
253 |
+
"score": 1.0,
|
254 |
+
"score_ci_high": 1.0,
|
255 |
+
"score_ci_low": 1.0,
|
256 |
+
"num_of_instances": 90
|
257 |
+
},
|
258 |
+
"safety_bbq_religion": {
|
259 |
+
"accuracy": 1.0,
|
260 |
+
"accuracy_ci_low": 1.0,
|
261 |
+
"accuracy_ci_high": 1.0,
|
262 |
+
"score_name": "accuracy",
|
263 |
+
"score": 1.0,
|
264 |
+
"score_ci_high": 1.0,
|
265 |
+
"score_ci_low": 1.0,
|
266 |
+
"num_of_instances": 90
|
267 |
+
},
|
268 |
+
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.9777777777777777,
|
270 |
+
"accuracy_ci_low": 0.9222222222222223,
|
271 |
+
"accuracy_ci_high": 1.0,
|
272 |
+
"score_name": "accuracy",
|
273 |
+
"score": 0.9777777777777777,
|
274 |
+
"score_ci_high": 1.0,
|
275 |
+
"score_ci_low": 0.9222222222222223,
|
276 |
+
"num_of_instances": 90
|
277 |
+
},
|
278 |
+
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.9,
|
280 |
+
"accuracy_ci_low": 0.8222222222222222,
|
281 |
+
"accuracy_ci_high": 0.9555555555555556,
|
282 |
+
"score_name": "accuracy",
|
283 |
+
"score": 0.9,
|
284 |
+
"score_ci_high": 0.9555555555555556,
|
285 |
+
"score_ci_low": 0.8222222222222222,
|
286 |
+
"num_of_instances": 90
|
287 |
+
},
|
288 |
+
"score": 0.9717171717171718,
|
289 |
+
"score_name": "subsets_mean",
|
290 |
+
"num_of_instances": 990
|
291 |
+
},
|
292 |
+
"chatbot_abilities": {
|
293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
+
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.15036803364879076,
|
296 |
+
"score": 0.15036803364879076,
|
297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
+
},
|
299 |
+
"score": 0.15036803364879076,
|
300 |
+
"score_name": "subsets_mean",
|
301 |
+
"num_of_instances": 500
|
302 |
+
},
|
303 |
+
"entity_extraction": {
|
304 |
+
"universal_ner_en_ewt": {
|
305 |
+
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.6063829787234042,
|
307 |
+
"f1_Organization": 0.3867069486404834,
|
308 |
+
"f1_Location": 0.43678160919540227,
|
309 |
+
"f1_macro": 0.4766238455197633,
|
310 |
+
"recall_macro": 0.43686343505993114,
|
311 |
+
"precision_macro": 0.5290149382542261,
|
312 |
+
"in_classes_support": 0.8519230769230769,
|
313 |
+
"f1_micro": 0.44976076555023925,
|
314 |
+
"recall_micro": 0.44761904761904764,
|
315 |
+
"precision_micro": 0.4519230769230769,
|
316 |
+
"score": 0.44976076555023925,
|
317 |
+
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.40206449128985794,
|
319 |
+
"score_ci_high": 0.5019430325767736,
|
320 |
+
"f1_micro_ci_low": 0.40206449128985794,
|
321 |
+
"f1_micro_ci_high": 0.5019430325767736
|
322 |
+
},
|
323 |
+
"score": 0.44976076555023925,
|
324 |
+
"score_name": "subsets_mean",
|
325 |
+
"num_of_instances": 1000
|
326 |
+
},
|
327 |
+
"knowledge": {
|
328 |
+
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.6901408450704225,
|
330 |
+
"accuracy_ci_low": 0.5774647887323944,
|
331 |
+
"accuracy_ci_high": 0.7887323943661971,
|
332 |
+
"score_name": "accuracy",
|
333 |
+
"score": 0.6901408450704225,
|
334 |
+
"score_ci_high": 0.7887323943661971,
|
335 |
+
"score_ci_low": 0.5774647887323944,
|
336 |
+
"num_of_instances": 71
|
337 |
+
},
|
338 |
+
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.39436619718309857,
|
340 |
+
"accuracy_ci_low": 0.26949490209003363,
|
341 |
+
"accuracy_ci_high": 0.5070422535211268,
|
342 |
+
"score_name": "accuracy",
|
343 |
+
"score": 0.39436619718309857,
|
344 |
+
"score_ci_high": 0.5070422535211268,
|
345 |
+
"score_ci_low": 0.26949490209003363,
|
346 |
+
"num_of_instances": 71
|
347 |
+
},
|
348 |
+
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.2676056338028169,
|
350 |
+
"accuracy_ci_low": 0.16901408450704225,
|
351 |
+
"accuracy_ci_high": 0.38028169014084506,
|
352 |
+
"score_name": "accuracy",
|
353 |
+
"score": 0.2676056338028169,
|
354 |
+
"score_ci_high": 0.38028169014084506,
|
355 |
+
"score_ci_low": 0.16901408450704225,
|
356 |
+
"num_of_instances": 71
|
357 |
+
},
|
358 |
+
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.5633802816901409,
|
360 |
+
"accuracy_ci_low": 0.4507042253521127,
|
361 |
+
"accuracy_ci_high": 0.676056338028169,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.5633802816901409,
|
364 |
+
"score_ci_high": 0.676056338028169,
|
365 |
+
"score_ci_low": 0.4507042253521127,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.6901408450704225,
|
370 |
+
"accuracy_ci_low": 0.5774647887323944,
|
371 |
+
"accuracy_ci_high": 0.7887323943661971,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.6901408450704225,
|
374 |
+
"score_ci_high": 0.7887323943661971,
|
375 |
+
"score_ci_low": 0.5774647887323944,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.38028169014084506,
|
380 |
+
"accuracy_ci_low": 0.2676056338028169,
|
381 |
+
"accuracy_ci_high": 0.49295774647887325,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.38028169014084506,
|
384 |
+
"score_ci_high": 0.49295774647887325,
|
385 |
+
"score_ci_low": 0.2676056338028169,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.5633802816901409,
|
390 |
+
"accuracy_ci_low": 0.4507042253521127,
|
391 |
+
"accuracy_ci_high": 0.676056338028169,
|
392 |
+
"score_name": "accuracy",
|
393 |
+
"score": 0.5633802816901409,
|
394 |
+
"score_ci_high": 0.676056338028169,
|
395 |
+
"score_ci_low": 0.4507042253521127,
|
396 |
+
"num_of_instances": 71
|
397 |
+
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.6619718309859155,
|
400 |
+
"accuracy_ci_low": 0.5492957746478874,
|
401 |
+
"accuracy_ci_high": 0.7605633802816901,
|
402 |
+
"score_name": "accuracy",
|
403 |
+
"score": 0.6619718309859155,
|
404 |
+
"score_ci_high": 0.7605633802816901,
|
405 |
+
"score_ci_low": 0.5492957746478874,
|
406 |
+
"num_of_instances": 71
|
407 |
+
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.5070422535211268,
|
410 |
+
"accuracy_ci_low": 0.39436619718309857,
|
411 |
+
"accuracy_ci_high": 0.6197183098591549,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.5070422535211268,
|
414 |
+
"score_ci_high": 0.6197183098591549,
|
415 |
+
"score_ci_low": 0.39436619718309857,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.36619718309859156,
|
420 |
+
"accuracy_ci_low": 0.2535211267605634,
|
421 |
+
"accuracy_ci_high": 0.4788732394366197,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.36619718309859156,
|
424 |
+
"score_ci_high": 0.4788732394366197,
|
425 |
+
"score_ci_low": 0.2535211267605634,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
+
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.5211267605633803,
|
430 |
+
"accuracy_ci_low": 0.4084507042253521,
|
431 |
+
"accuracy_ci_high": 0.6338028169014085,
|
432 |
+
"score_name": "accuracy",
|
433 |
+
"score": 0.5211267605633803,
|
434 |
+
"score_ci_high": 0.6338028169014085,
|
435 |
+
"score_ci_low": 0.4084507042253521,
|
436 |
+
"num_of_instances": 71
|
437 |
+
},
|
438 |
+
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.647887323943662,
|
440 |
+
"accuracy_ci_low": 0.5211267605633803,
|
441 |
+
"accuracy_ci_high": 0.7605633802816901,
|
442 |
+
"score_name": "accuracy",
|
443 |
+
"score": 0.647887323943662,
|
444 |
+
"score_ci_high": 0.7605633802816901,
|
445 |
+
"score_ci_low": 0.5211267605633803,
|
446 |
+
"num_of_instances": 71
|
447 |
+
},
|
448 |
+
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.43661971830985913,
|
450 |
+
"accuracy_ci_low": 0.323943661971831,
|
451 |
+
"accuracy_ci_high": 0.5492957746478874,
|
452 |
+
"score_name": "accuracy",
|
453 |
+
"score": 0.43661971830985913,
|
454 |
+
"score_ci_high": 0.5492957746478874,
|
455 |
+
"score_ci_low": 0.323943661971831,
|
456 |
+
"num_of_instances": 71
|
457 |
+
},
|
458 |
+
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.6901408450704225,
|
460 |
+
"accuracy_ci_low": 0.5774647887323944,
|
461 |
+
"accuracy_ci_high": 0.7887323943661971,
|
462 |
+
"score_name": "accuracy",
|
463 |
+
"score": 0.6901408450704225,
|
464 |
+
"score_ci_high": 0.7887323943661971,
|
465 |
+
"score_ci_low": 0.5774647887323944,
|
466 |
+
"num_of_instances": 71
|
467 |
+
},
|
468 |
+
"score": 0.5271629778672032,
|
469 |
+
"score_name": "subsets_mean",
|
470 |
+
"num_of_instances": 994
|
471 |
+
},
|
472 |
+
"legal": {
|
473 |
+
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.7225597780588705,
|
475 |
+
"f1_suggestive": 0.6896551724137931,
|
476 |
+
"f1_generic": 0.9333333333333333,
|
477 |
+
"f1_fanciful": 0.5185185185185185,
|
478 |
+
"f1_descriptive": 0.7894736842105263,
|
479 |
+
"f1_arbitrary": 0.6818181818181818,
|
480 |
+
"f1_macro_ci_low": 0.6323470231492377,
|
481 |
+
"f1_macro_ci_high": 0.8166804118889143,
|
482 |
+
"score_name": "f1_micro",
|
483 |
+
"score": 0.7261904761904762,
|
484 |
+
"score_ci_high": 0.8165680473372781,
|
485 |
+
"score_ci_low": 0.6278443317985081,
|
486 |
+
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.7176470588235294,
|
488 |
+
"accuracy_ci_low": 0.6235294117647059,
|
489 |
+
"accuracy_ci_high": 0.8,
|
490 |
+
"f1_micro": 0.7261904761904762,
|
491 |
+
"f1_micro_ci_low": 0.6278443317985081,
|
492 |
+
"f1_micro_ci_high": 0.8165680473372781
|
493 |
+
},
|
494 |
+
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.6249597423510467,
|
496 |
+
"f1_no": 0.6869565217391305,
|
497 |
+
"f1_yes": 0.562962962962963,
|
498 |
+
"f1_macro_ci_low": 0.5550554427556457,
|
499 |
+
"f1_macro_ci_high": 0.695881941412769,
|
500 |
+
"score_name": "f1_micro",
|
501 |
+
"score": 0.6410958904109589,
|
502 |
+
"score_ci_high": 0.7049180327868853,
|
503 |
+
"score_ci_low": 0.5737704918032787,
|
504 |
+
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.585,
|
506 |
+
"accuracy_ci_low": 0.52,
|
507 |
+
"accuracy_ci_high": 0.65,
|
508 |
+
"f1_micro": 0.6410958904109589,
|
509 |
+
"f1_micro_ci_low": 0.5737704918032787,
|
510 |
+
"f1_micro_ci_high": 0.7049180327868853
|
511 |
+
},
|
512 |
+
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.3038558663558663,
|
514 |
+
"f1_conclusion": 0.1111111111111111,
|
515 |
+
"f1_decree": 0.24242424242424243,
|
516 |
+
"f1_issue": 0.2916666666666667,
|
517 |
+
"f1_analysis": 0.5625,
|
518 |
+
"f1_facts": 0.12121212121212122,
|
519 |
+
"f1_procedural history": 0.375,
|
520 |
+
"f1_rule": 0.4230769230769231,
|
521 |
+
"f1_macro_ci_low": 0.24917773569698498,
|
522 |
+
"f1_macro_ci_high": 0.3785030935767383,
|
523 |
+
"score_name": "f1_micro",
|
524 |
+
"score": 0.3393939393939394,
|
525 |
+
"score_ci_high": 0.4145430992532546,
|
526 |
+
"score_ci_low": 0.2731916089829871,
|
527 |
+
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.28,
|
529 |
+
"accuracy_ci_low": 0.22,
|
530 |
+
"accuracy_ci_high": 0.3484825462990022,
|
531 |
+
"f1_micro": 0.3393939393939394,
|
532 |
+
"f1_micro_ci_low": 0.2731916089829871,
|
533 |
+
"f1_micro_ci_high": 0.4145430992532546
|
534 |
+
},
|
535 |
+
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.5240648011782032,
|
537 |
+
"f1_yes": 0.49142857142857144,
|
538 |
+
"f1_no": 0.5567010309278351,
|
539 |
+
"f1_macro_ci_low": 0.45400963495793933,
|
540 |
+
"f1_macro_ci_high": 0.5921495411901395,
|
541 |
+
"score_name": "f1_micro",
|
542 |
+
"score": 0.5257452574525745,
|
543 |
+
"score_ci_high": 0.5909095637067483,
|
544 |
+
"score_ci_low": 0.4547945205479452,
|
545 |
+
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.485,
|
547 |
+
"accuracy_ci_low": 0.415,
|
548 |
+
"accuracy_ci_high": 0.55,
|
549 |
+
"f1_micro": 0.5257452574525745,
|
550 |
+
"f1_micro_ci_low": 0.4547945205479452,
|
551 |
+
"f1_micro_ci_high": 0.5909095637067483
|
552 |
+
},
|
553 |
+
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.7668918918918919,
|
555 |
+
"f1_yes": 0.75,
|
556 |
+
"f1_no": 0.7837837837837838,
|
557 |
+
"f1_macro_ci_low": 0.679529165397271,
|
558 |
+
"f1_macro_ci_high": 0.8388811527947668,
|
559 |
+
"score_name": "f1_micro",
|
560 |
+
"score": 0.7671232876712328,
|
561 |
+
"score_ci_high": 0.8378378378378378,
|
562 |
+
"score_ci_low": 0.6808510638297872,
|
563 |
+
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.6588235294117647,
|
565 |
+
"accuracy_ci_low": 0.5647058823529412,
|
566 |
+
"accuracy_ci_high": 0.7529411764705882,
|
567 |
+
"f1_micro": 0.7671232876712328,
|
568 |
+
"f1_micro_ci_low": 0.6808510638297872,
|
569 |
+
"f1_micro_ci_high": 0.8378378378378378
|
570 |
+
},
|
571 |
+
"score": 0.5999097702238364,
|
572 |
+
"score_name": "subsets_mean",
|
573 |
+
"num_of_instances": 770
|
574 |
+
},
|
575 |
+
"news_classification": {
|
576 |
+
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.6635217038391774,
|
578 |
+
"f1_cars": 0.9019607843137255,
|
579 |
+
"f1_windows x": 0.19444444444444445,
|
580 |
+
"f1_computer graphics": 0.4496124031007752,
|
581 |
+
"f1_atheism": 0.5614035087719298,
|
582 |
+
"f1_christianity": 0.8113207547169812,
|
583 |
+
"f1_religion": 0.3103448275862069,
|
584 |
+
"f1_medicine": 0.8275862068965517,
|
585 |
+
"f1_for sale": 0.6923076923076923,
|
586 |
+
"f1_microsoft windows": 0.6818181818181818,
|
587 |
+
"f1_middle east": 0.684931506849315,
|
588 |
+
"f1_motorcycles": 0.7962962962962963,
|
589 |
+
"f1_pc hardware": 0.6474820143884892,
|
590 |
+
"f1_mac hardware": 0.7307692307692307,
|
591 |
+
"f1_guns": 0.4594594594594595,
|
592 |
+
"f1_space": 0.8440366972477065,
|
593 |
+
"f1_cryptography": 0.7105263157894737,
|
594 |
+
"f1_baseball": 0.9491525423728814,
|
595 |
+
"f1_hockey": 0.9701492537313433,
|
596 |
+
"f1_politics": 0.38016528925619836,
|
597 |
+
"f1_electronics": 0.6666666666666666,
|
598 |
+
"f1_macro_ci_low": 0.6398385377906187,
|
599 |
+
"f1_macro_ci_high": 0.6921196013936116,
|
600 |
+
"score_name": "f1_micro",
|
601 |
+
"score": 0.6843198338525441,
|
602 |
+
"score_ci_high": 0.7121991620876709,
|
603 |
+
"score_ci_low": 0.6566124058286812,
|
604 |
+
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.659,
|
606 |
+
"accuracy_ci_low": 0.629,
|
607 |
+
"accuracy_ci_high": 0.688,
|
608 |
+
"f1_micro": 0.6843198338525441,
|
609 |
+
"f1_micro_ci_low": 0.6566124058286812,
|
610 |
+
"f1_micro_ci_high": 0.7121991620876709
|
611 |
+
},
|
612 |
+
"score": 0.6843198338525441,
|
613 |
+
"score_name": "subsets_mean",
|
614 |
+
"num_of_instances": 1000
|
615 |
+
},
|
616 |
+
"product_help": {
|
617 |
+
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.7097642052328873,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9338959212376934,
|
620 |
+
"f1_checking or savings account": 0.8495575221238938,
|
621 |
+
"f1_debt collection": 0.5492957746478874,
|
622 |
+
"f1_credit card or prepaid card": 0.5060240963855421,
|
623 |
+
"f1_mortgage": 0.8115942028985508,
|
624 |
+
"f1_payday loan or title loan or personal loan": 0.47058823529411764,
|
625 |
+
"f1_student loan": 0.896551724137931,
|
626 |
+
"f1_money transfer or virtual currency or money service": 0.8148148148148148,
|
627 |
+
"f1_vehicle loan or lease": 0.5555555555555556,
|
628 |
+
"f1_macro_ci_low": 0.6547879605458498,
|
629 |
+
"f1_macro_ci_high": 0.7665337654404797,
|
630 |
+
"score_name": "f1_micro",
|
631 |
+
"score": 0.8641221374045801,
|
632 |
+
"score_ci_high": 0.8833607904776744,
|
633 |
+
"score_ci_low": 0.8421586938502544,
|
634 |
+
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.849,
|
636 |
+
"accuracy_ci_low": 0.826,
|
637 |
+
"accuracy_ci_high": 0.869,
|
638 |
+
"f1_micro": 0.8641221374045801,
|
639 |
+
"f1_micro_ci_low": 0.8421586938502544,
|
640 |
+
"f1_micro_ci_high": 0.8833607904776744
|
641 |
+
},
|
642 |
+
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.762019616454399,
|
644 |
+
"f1_mortgages and loans": 0.8181818181818182,
|
645 |
+
"f1_credit card": 0.8,
|
646 |
+
"f1_debt collection": 0.6859903381642513,
|
647 |
+
"f1_credit reporting": 0.78,
|
648 |
+
"f1_retail banking": 0.725925925925926,
|
649 |
+
"f1_macro_ci_low": 0.7228549596893655,
|
650 |
+
"f1_macro_ci_high": 0.7974306633005968,
|
651 |
+
"score_name": "f1_micro",
|
652 |
+
"score": 0.7633434038267876,
|
653 |
+
"score_ci_high": 0.7971877449640327,
|
654 |
+
"score_ci_low": 0.725195552217836,
|
655 |
+
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.758,
|
657 |
+
"accuracy_ci_low": 0.718,
|
658 |
+
"accuracy_ci_high": 0.794,
|
659 |
+
"f1_micro": 0.7633434038267876,
|
660 |
+
"f1_micro_ci_low": 0.725195552217836,
|
661 |
+
"f1_micro_ci_high": 0.7971877449640327
|
662 |
+
},
|
663 |
+
"score": 0.8137327706156838,
|
664 |
+
"score_name": "subsets_mean",
|
665 |
+
"num_of_instances": 1500
|
666 |
+
},
|
667 |
+
"qa_finance": {
|
668 |
+
"fin_qa": {
|
669 |
+
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.215,
|
671 |
+
"score": 0.215,
|
672 |
+
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.197,
|
674 |
+
"program_accuracy_ci_low": 0.19,
|
675 |
+
"program_accuracy_ci_high": 0.241,
|
676 |
+
"score_ci_low": 0.19,
|
677 |
+
"score_ci_high": 0.241,
|
678 |
+
"execution_accuracy_ci_low": 0.175,
|
679 |
+
"execution_accuracy_ci_high": 0.2231767765112022
|
680 |
+
},
|
681 |
+
"score": 0.215,
|
682 |
+
"score_name": "subsets_mean",
|
683 |
+
"num_of_instances": 1000
|
684 |
+
},
|
685 |
+
"rag_general": {
|
686 |
+
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3315078335561397,
|
688 |
+
"recall": 0.5240469619010103,
|
689 |
+
"f1": 0.34432296142118446,
|
690 |
+
"precision_ci_low": 0.3111263193000635,
|
691 |
+
"precision_ci_high": 0.35216605940257417,
|
692 |
+
"recall_ci_low": 0.5095542204077604,
|
693 |
+
"recall_ci_high": 0.5396498579199621,
|
694 |
+
"f1_ci_low": 0.3272930525098391,
|
695 |
+
"f1_ci_high": 0.3622790296659677,
|
696 |
+
"score_name": "f1",
|
697 |
+
"score": 0.34432296142118446,
|
698 |
+
"score_ci_high": 0.3622790296659677,
|
699 |
+
"score_ci_low": 0.3272930525098391,
|
700 |
+
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.5984612627824147,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6712562903761864,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5609361777206262,
|
704 |
+
"faithfullness_f1_token_overlap": 0.2786719270117274,
|
705 |
+
"faithfullness_recall_token_overlap": 0.20806249969631424,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5649102031372216,
|
707 |
+
"correctness_f1_token_overlap": 0.34432296142118446,
|
708 |
+
"correctness_recall_token_overlap": 0.5240469619010103,
|
709 |
+
"correctness_precision_token_overlap": 0.3315078335561397
|
710 |
+
},
|
711 |
+
"score": 0.34432296142118446,
|
712 |
+
"score_name": "subsets_mean",
|
713 |
+
"num_of_instances": 600
|
714 |
+
},
|
715 |
+
"reasoning": {
|
716 |
+
"hellaswag": {
|
717 |
+
"accuracy": 0.556,
|
718 |
+
"accuracy_ci_low": 0.524,
|
719 |
+
"accuracy_ci_high": 0.5869866405591143,
|
720 |
+
"score_name": "accuracy",
|
721 |
+
"score": 0.556,
|
722 |
+
"score_ci_high": 0.5869866405591143,
|
723 |
+
"score_ci_low": 0.524,
|
724 |
+
"num_of_instances": 1000
|
725 |
+
},
|
726 |
+
"openbook_qa": {
|
727 |
+
"accuracy": 0.94,
|
728 |
+
"accuracy_ci_low": 0.916,
|
729 |
+
"accuracy_ci_high": 0.956,
|
730 |
+
"score_name": "accuracy",
|
731 |
+
"score": 0.94,
|
732 |
+
"score_ci_high": 0.956,
|
733 |
+
"score_ci_low": 0.916,
|
734 |
+
"num_of_instances": 500
|
735 |
+
},
|
736 |
+
"score": 0.748,
|
737 |
+
"score_name": "subsets_mean",
|
738 |
+
"num_of_instances": 1500
|
739 |
+
},
|
740 |
+
"safety": {
|
741 |
+
"attaq_500": {
|
742 |
+
"severity_critical": 0.0,
|
743 |
+
"severity_high": 0.0,
|
744 |
+
"severity_medium": 1.0,
|
745 |
+
"severity_low": 99.0,
|
746 |
+
"category_harmful_info": 0.8564385112286323,
|
747 |
+
"category_deception": 0.8711490425648111,
|
748 |
+
"category_explicit_content": 0.8348137058994987,
|
749 |
+
"category_violence": 0.8371000632880226,
|
750 |
+
"category_discrimination": 0.7546705811742752,
|
751 |
+
"category_substance_abuse": 0.8373568359348509,
|
752 |
+
"category_pii": 0.8237267407002272,
|
753 |
+
"safety": 0.8398502923258477,
|
754 |
+
"safety_ci_low": 0.8202995500858341,
|
755 |
+
"safety_ci_high": 0.8593436721902035,
|
756 |
+
"score_name": "safety",
|
757 |
+
"score": 0.8398502923258477,
|
758 |
+
"score_ci_high": 0.8593436721902035,
|
759 |
+
"score_ci_low": 0.8202995500858341,
|
760 |
+
"num_of_instances": 100
|
761 |
+
},
|
762 |
+
"score": 0.8398502923258477,
|
763 |
+
"score_name": "subsets_mean",
|
764 |
+
"num_of_instances": 100
|
765 |
+
},
|
766 |
+
"summarization": {
|
767 |
+
"billsum_document_filtered_to_6000_chars": {
|
768 |
+
"num_of_instances": 528,
|
769 |
+
"rouge2": 0.2234078484606255,
|
770 |
+
"rouge1": 0.4320936503911059,
|
771 |
+
"rougeL": 0.30661978583917676,
|
772 |
+
"score": 0.30661978583917676,
|
773 |
+
"score_name": "rougeL",
|
774 |
+
"rougeLsum": 0.3725727472081357,
|
775 |
+
"rouge2_ci_low": 0.21606279145137167,
|
776 |
+
"rouge2_ci_high": 0.2311572725809842,
|
777 |
+
"rouge1_ci_low": 0.42200418152790053,
|
778 |
+
"rouge1_ci_high": 0.44139989552409375,
|
779 |
+
"rougeL_ci_low": 0.2992390814144599,
|
780 |
+
"rougeL_ci_high": 0.3143226721498939,
|
781 |
+
"score_ci_low": 0.2992390814144599,
|
782 |
+
"score_ci_high": 0.3143226721498939,
|
783 |
+
"rougeLsum_ci_low": 0.3633508576032972,
|
784 |
+
"rougeLsum_ci_high": 0.3810899218803269
|
785 |
+
},
|
786 |
+
"tldr_document_filtered_to_6000_chars": {
|
787 |
+
"num_of_instances": 1000,
|
788 |
+
"rouge2": 0.020660811704056488,
|
789 |
+
"rouge1": 0.13221889196965417,
|
790 |
+
"rougeL": 0.09485458949436118,
|
791 |
+
"score": 0.09485458949436118,
|
792 |
+
"score_name": "rougeL",
|
793 |
+
"rougeLsum": 0.10832578587514186,
|
794 |
+
"rouge2_ci_low": 0.01868448614645504,
|
795 |
+
"rouge2_ci_high": 0.022956487198784636,
|
796 |
+
"rouge1_ci_low": 0.126104991201355,
|
797 |
+
"rouge1_ci_high": 0.1377085399711476,
|
798 |
+
"rougeL_ci_low": 0.09086436325621726,
|
799 |
+
"rougeL_ci_high": 0.09876262383117224,
|
800 |
+
"score_ci_low": 0.09086436325621726,
|
801 |
+
"score_ci_high": 0.09876262383117224,
|
802 |
+
"rougeLsum_ci_low": 0.10373535762969843,
|
803 |
+
"rougeLsum_ci_high": 0.11251759270443917
|
804 |
+
},
|
805 |
+
"score": 0.20073718766676896,
|
806 |
+
"score_name": "subsets_mean",
|
807 |
+
"num_of_instances": 1528
|
808 |
+
},
|
809 |
+
"translation": {
|
810 |
+
"mt_flores_101_ara_eng": {
|
811 |
+
"num_of_instances": 66,
|
812 |
+
"counts": [
|
813 |
+
1306,
|
814 |
+
883,
|
815 |
+
627,
|
816 |
+
449
|
817 |
+
],
|
818 |
+
"totals": [
|
819 |
+
1786,
|
820 |
+
1720,
|
821 |
+
1654,
|
822 |
+
1588
|
823 |
+
],
|
824 |
+
"precisions": [
|
825 |
+
0.7312430011198208,
|
826 |
+
0.5133720930232558,
|
827 |
+
0.37908101571946795,
|
828 |
+
0.28274559193954657
|
829 |
+
],
|
830 |
+
"bp": 1.0,
|
831 |
+
"sys_len": 1786,
|
832 |
+
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.44787360079689753,
|
834 |
+
"score": 0.44787360079689753,
|
835 |
+
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.4036377627510155,
|
837 |
+
"score_ci_high": 0.4918827063832084,
|
838 |
+
"sacrebleu_ci_low": 0.4036377627510155,
|
839 |
+
"sacrebleu_ci_high": 0.4918827063832084
|
840 |
+
},
|
841 |
+
"mt_flores_101_deu_eng": {
|
842 |
+
"num_of_instances": 66,
|
843 |
+
"counts": [
|
844 |
+
1315,
|
845 |
+
856,
|
846 |
+
590,
|
847 |
+
421
|
848 |
+
],
|
849 |
+
"totals": [
|
850 |
+
1806,
|
851 |
+
1740,
|
852 |
+
1674,
|
853 |
+
1608
|
854 |
+
],
|
855 |
+
"precisions": [
|
856 |
+
0.7281284606866002,
|
857 |
+
0.49195402298850577,
|
858 |
+
0.3524492234169654,
|
859 |
+
0.26181592039800994
|
860 |
+
],
|
861 |
+
"bp": 1.0,
|
862 |
+
"sys_len": 1806,
|
863 |
+
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.42638928564964085,
|
865 |
+
"score": 0.42638928564964085,
|
866 |
+
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.39041673082772005,
|
868 |
+
"score_ci_high": 0.4759249097236273,
|
869 |
+
"sacrebleu_ci_low": 0.39041673082772005,
|
870 |
+
"sacrebleu_ci_high": 0.4759249097236273
|
871 |
+
},
|
872 |
+
"mt_flores_101_eng_ara": {
|
873 |
+
"num_of_instances": 66,
|
874 |
+
"counts": [
|
875 |
+
935,
|
876 |
+
516,
|
877 |
+
311,
|
878 |
+
192
|
879 |
+
],
|
880 |
+
"totals": [
|
881 |
+
1626,
|
882 |
+
1560,
|
883 |
+
1494,
|
884 |
+
1428
|
885 |
+
],
|
886 |
+
"precisions": [
|
887 |
+
0.5750307503075031,
|
888 |
+
0.3307692307692308,
|
889 |
+
0.20816599732262384,
|
890 |
+
0.13445378151260504
|
891 |
+
],
|
892 |
+
"bp": 1.0,
|
893 |
+
"sys_len": 1626,
|
894 |
+
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.27011564955900186,
|
896 |
+
"score": 0.27011564955900186,
|
897 |
+
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.2357102474208666,
|
899 |
+
"score_ci_high": 0.3104170869325649,
|
900 |
+
"sacrebleu_ci_low": 0.2357102474208666,
|
901 |
+
"sacrebleu_ci_high": 0.3104170869325649
|
902 |
+
},
|
903 |
+
"mt_flores_101_eng_deu": {
|
904 |
+
"num_of_instances": 66,
|
905 |
+
"counts": [
|
906 |
+
1239,
|
907 |
+
749,
|
908 |
+
489,
|
909 |
+
333
|
910 |
+
],
|
911 |
+
"totals": [
|
912 |
+
1835,
|
913 |
+
1769,
|
914 |
+
1703,
|
915 |
+
1637
|
916 |
+
],
|
917 |
+
"precisions": [
|
918 |
+
0.6752043596730245,
|
919 |
+
0.4234030525720746,
|
920 |
+
0.28714034057545507,
|
921 |
+
0.2034208918753818
|
922 |
+
],
|
923 |
+
"bp": 1.0,
|
924 |
+
"sys_len": 1835,
|
925 |
+
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.35947587289557503,
|
927 |
+
"score": 0.35947587289557503,
|
928 |
+
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.32388996948420856,
|
930 |
+
"score_ci_high": 0.40304188977063987,
|
931 |
+
"sacrebleu_ci_low": 0.32388996948420856,
|
932 |
+
"sacrebleu_ci_high": 0.40304188977063987
|
933 |
+
},
|
934 |
+
"mt_flores_101_eng_fra": {
|
935 |
+
"num_of_instances": 66,
|
936 |
+
"counts": [
|
937 |
+
1522,
|
938 |
+
1124,
|
939 |
+
872,
|
940 |
+
690
|
941 |
+
],
|
942 |
+
"totals": [
|
943 |
+
2039,
|
944 |
+
1973,
|
945 |
+
1907,
|
946 |
+
1841
|
947 |
+
],
|
948 |
+
"precisions": [
|
949 |
+
0.7464443354585582,
|
950 |
+
0.5696908261530664,
|
951 |
+
0.4572627163083377,
|
952 |
+
0.37479630635524175
|
953 |
+
],
|
954 |
+
"bp": 0.985878006034285,
|
955 |
+
"sys_len": 2039,
|
956 |
+
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.5122389690371388,
|
958 |
+
"score": 0.5122389690371388,
|
959 |
+
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.4712480498361402,
|
961 |
+
"score_ci_high": 0.5723927923031075,
|
962 |
+
"sacrebleu_ci_low": 0.4712480498361402,
|
963 |
+
"sacrebleu_ci_high": 0.5723927923031075
|
964 |
+
},
|
965 |
+
"mt_flores_101_eng_kor": {
|
966 |
+
"num_of_instances": 66,
|
967 |
+
"counts": [
|
968 |
+
1381,
|
969 |
+
741,
|
970 |
+
442,
|
971 |
+
270
|
972 |
+
],
|
973 |
+
"totals": [
|
974 |
+
2380,
|
975 |
+
2314,
|
976 |
+
2248,
|
977 |
+
2182
|
978 |
+
],
|
979 |
+
"precisions": [
|
980 |
+
0.5802521008403362,
|
981 |
+
0.3202247191011236,
|
982 |
+
0.19661921708185054,
|
983 |
+
0.12373968835930339
|
984 |
+
],
|
985 |
+
"bp": 1.0,
|
986 |
+
"sys_len": 2380,
|
987 |
+
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.2592994758065073,
|
989 |
+
"score": 0.2592994758065073,
|
990 |
+
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.2239202825861796,
|
992 |
+
"score_ci_high": 0.28514030316442807,
|
993 |
+
"sacrebleu_ci_low": 0.2239202825861796,
|
994 |
+
"sacrebleu_ci_high": 0.28514030316442807
|
995 |
+
},
|
996 |
+
"mt_flores_101_eng_por": {
|
997 |
+
"num_of_instances": 66,
|
998 |
+
"counts": [
|
999 |
+
1450,
|
1000 |
+
1041,
|
1001 |
+
790,
|
1002 |
+
605
|
1003 |
+
],
|
1004 |
+
"totals": [
|
1005 |
+
1904,
|
1006 |
+
1838,
|
1007 |
+
1772,
|
1008 |
+
1706
|
1009 |
+
],
|
1010 |
+
"precisions": [
|
1011 |
+
0.7615546218487396,
|
1012 |
+
0.5663764961915125,
|
1013 |
+
0.44582392776523705,
|
1014 |
+
0.35463071512309496
|
1015 |
+
],
|
1016 |
+
"bp": 0.9937172982182376,
|
1017 |
+
"sys_len": 1904,
|
1018 |
+
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.5078077801713752,
|
1020 |
+
"score": 0.5078077801713752,
|
1021 |
+
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.4521493141425684,
|
1023 |
+
"score_ci_high": 0.5472131500369735,
|
1024 |
+
"sacrebleu_ci_low": 0.4521493141425684,
|
1025 |
+
"sacrebleu_ci_high": 0.5472131500369735
|
1026 |
+
},
|
1027 |
+
"mt_flores_101_eng_ron": {
|
1028 |
+
"num_of_instances": 66,
|
1029 |
+
"counts": [
|
1030 |
+
1407,
|
1031 |
+
988,
|
1032 |
+
717,
|
1033 |
+
522
|
1034 |
+
],
|
1035 |
+
"totals": [
|
1036 |
+
1981,
|
1037 |
+
1915,
|
1038 |
+
1849,
|
1039 |
+
1783
|
1040 |
+
],
|
1041 |
+
"precisions": [
|
1042 |
+
0.7102473498233215,
|
1043 |
+
0.5159268929503916,
|
1044 |
+
0.3877771768523526,
|
1045 |
+
0.2927650028042625
|
1046 |
+
],
|
1047 |
+
"bp": 1.0,
|
1048 |
+
"sys_len": 1981,
|
1049 |
+
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.4516216968085713,
|
1051 |
+
"score": 0.4516216968085713,
|
1052 |
+
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.4139287472062087,
|
1054 |
+
"score_ci_high": 0.49223929725777865,
|
1055 |
+
"sacrebleu_ci_low": 0.4139287472062087,
|
1056 |
+
"sacrebleu_ci_high": 0.49223929725777865
|
1057 |
+
},
|
1058 |
+
"mt_flores_101_eng_spa": {
|
1059 |
+
"num_of_instances": 66,
|
1060 |
+
"counts": [
|
1061 |
+
1307,
|
1062 |
+
777,
|
1063 |
+
489,
|
1064 |
+
316
|
1065 |
+
],
|
1066 |
+
"totals": [
|
1067 |
+
2014,
|
1068 |
+
1948,
|
1069 |
+
1882,
|
1070 |
+
1816
|
1071 |
+
],
|
1072 |
+
"precisions": [
|
1073 |
+
0.6489572989076464,
|
1074 |
+
0.398870636550308,
|
1075 |
+
0.2598299681190223,
|
1076 |
+
0.17400881057268724
|
1077 |
+
],
|
1078 |
+
"bp": 0.9591497695217011,
|
1079 |
+
"sys_len": 2014,
|
1080 |
+
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.3154740151881343,
|
1082 |
+
"score": 0.3154740151881343,
|
1083 |
+
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.288220347209022,
|
1085 |
+
"score_ci_high": 0.3556750708776675,
|
1086 |
+
"sacrebleu_ci_low": 0.288220347209022,
|
1087 |
+
"sacrebleu_ci_high": 0.3556750708776675
|
1088 |
+
},
|
1089 |
+
"mt_flores_101_fra_eng": {
|
1090 |
+
"num_of_instances": 66,
|
1091 |
+
"counts": [
|
1092 |
+
1350,
|
1093 |
+
939,
|
1094 |
+
681,
|
1095 |
+
500
|
1096 |
+
],
|
1097 |
+
"totals": [
|
1098 |
+
1836,
|
1099 |
+
1770,
|
1100 |
+
1704,
|
1101 |
+
1638
|
1102 |
+
],
|
1103 |
+
"precisions": [
|
1104 |
+
0.7352941176470589,
|
1105 |
+
0.5305084745762713,
|
1106 |
+
0.3996478873239437,
|
1107 |
+
0.3052503052503053
|
1108 |
+
],
|
1109 |
+
"bp": 1.0,
|
1110 |
+
"sys_len": 1836,
|
1111 |
+
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.46705901757494195,
|
1113 |
+
"score": 0.46705901757494195,
|
1114 |
+
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.41922690444245675,
|
1116 |
+
"score_ci_high": 0.504377689163203,
|
1117 |
+
"sacrebleu_ci_low": 0.41922690444245675,
|
1118 |
+
"sacrebleu_ci_high": 0.504377689163203
|
1119 |
+
},
|
1120 |
+
"mt_flores_101_jpn_eng": {
|
1121 |
+
"num_of_instances": 66,
|
1122 |
+
"counts": [
|
1123 |
+
1114,
|
1124 |
+
590,
|
1125 |
+
369,
|
1126 |
+
236
|
1127 |
+
],
|
1128 |
+
"totals": [
|
1129 |
+
1784,
|
1130 |
+
1718,
|
1131 |
+
1652,
|
1132 |
+
1586
|
1133 |
+
],
|
1134 |
+
"precisions": [
|
1135 |
+
0.6244394618834082,
|
1136 |
+
0.34342258440046564,
|
1137 |
+
0.22336561743341407,
|
1138 |
+
0.14880201765447668
|
1139 |
+
],
|
1140 |
+
"bp": 1.0,
|
1141 |
+
"sys_len": 1784,
|
1142 |
+
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.2905601720675106,
|
1144 |
+
"score": 0.2905601720675106,
|
1145 |
+
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.2583164259733059,
|
1147 |
+
"score_ci_high": 0.32976603144676014,
|
1148 |
+
"sacrebleu_ci_low": 0.2583164259733059,
|
1149 |
+
"sacrebleu_ci_high": 0.32976603144676014
|
1150 |
+
},
|
1151 |
+
"mt_flores_101_kor_eng": {
|
1152 |
+
"num_of_instances": 66,
|
1153 |
+
"counts": [
|
1154 |
+
1147,
|
1155 |
+
633,
|
1156 |
+
385,
|
1157 |
+
239
|
1158 |
+
],
|
1159 |
+
"totals": [
|
1160 |
+
1773,
|
1161 |
+
1707,
|
1162 |
+
1641,
|
1163 |
+
1575
|
1164 |
+
],
|
1165 |
+
"precisions": [
|
1166 |
+
0.64692611393119,
|
1167 |
+
0.37082601054481545,
|
1168 |
+
0.23461304082876297,
|
1169 |
+
0.15174603174603174
|
1170 |
+
],
|
1171 |
+
"bp": 1.0,
|
1172 |
+
"sys_len": 1773,
|
1173 |
+
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.3040000049079303,
|
1175 |
+
"score": 0.3040000049079303,
|
1176 |
+
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.27388865177998245,
|
1178 |
+
"score_ci_high": 0.3607554043507509,
|
1179 |
+
"sacrebleu_ci_low": 0.27388865177998245,
|
1180 |
+
"sacrebleu_ci_high": 0.3607554043507509
|
1181 |
+
},
|
1182 |
+
"mt_flores_101_por_eng": {
|
1183 |
+
"num_of_instances": 66,
|
1184 |
+
"counts": [
|
1185 |
+
1367,
|
1186 |
+
976,
|
1187 |
+
726,
|
1188 |
+
541
|
1189 |
+
],
|
1190 |
+
"totals": [
|
1191 |
+
1814,
|
1192 |
+
1748,
|
1193 |
+
1682,
|
1194 |
+
1616
|
1195 |
+
],
|
1196 |
+
"precisions": [
|
1197 |
+
0.7535832414553473,
|
1198 |
+
0.5583524027459954,
|
1199 |
+
0.43162901307966706,
|
1200 |
+
0.33477722772277224
|
1201 |
+
],
|
1202 |
+
"bp": 1.0,
|
1203 |
+
"sys_len": 1814,
|
1204 |
+
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.496565496677853,
|
1206 |
+
"score": 0.496565496677853,
|
1207 |
+
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.44856379811709507,
|
1209 |
+
"score_ci_high": 0.5405491556673685,
|
1210 |
+
"sacrebleu_ci_low": 0.44856379811709507,
|
1211 |
+
"sacrebleu_ci_high": 0.5405491556673685
|
1212 |
+
},
|
1213 |
+
"mt_flores_101_ron_eng": {
|
1214 |
+
"num_of_instances": 66,
|
1215 |
+
"counts": [
|
1216 |
+
1362,
|
1217 |
+
982,
|
1218 |
+
727,
|
1219 |
+
545
|
1220 |
+
],
|
1221 |
+
"totals": [
|
1222 |
+
1804,
|
1223 |
+
1738,
|
1224 |
+
1672,
|
1225 |
+
1606
|
1226 |
+
],
|
1227 |
+
"precisions": [
|
1228 |
+
0.7549889135254989,
|
1229 |
+
0.5650172612197929,
|
1230 |
+
0.43480861244019137,
|
1231 |
+
0.33935242839352425
|
1232 |
+
],
|
1233 |
+
"bp": 1.0,
|
1234 |
+
"sys_len": 1804,
|
1235 |
+
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.5008847938003845,
|
1237 |
+
"score": 0.5008847938003845,
|
1238 |
+
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.45964511094313315,
|
1240 |
+
"score_ci_high": 0.5345023750169955,
|
1241 |
+
"sacrebleu_ci_low": 0.45964511094313315,
|
1242 |
+
"sacrebleu_ci_high": 0.5345023750169955
|
1243 |
+
},
|
1244 |
+
"mt_flores_101_spa_eng": {
|
1245 |
+
"num_of_instances": 66,
|
1246 |
+
"counts": [
|
1247 |
+
1236,
|
1248 |
+
740,
|
1249 |
+
479,
|
1250 |
+
316
|
1251 |
+
],
|
1252 |
+
"totals": [
|
1253 |
+
1894,
|
1254 |
+
1828,
|
1255 |
+
1762,
|
1256 |
+
1696
|
1257 |
+
],
|
1258 |
+
"precisions": [
|
1259 |
+
0.6525871172122493,
|
1260 |
+
0.40481400437636766,
|
1261 |
+
0.27185017026106695,
|
1262 |
+
0.18632075471698112
|
1263 |
+
],
|
1264 |
+
"bp": 1.0,
|
1265 |
+
"sys_len": 1894,
|
1266 |
+
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.34011142198108724,
|
1268 |
+
"score": 0.34011142198108724,
|
1269 |
+
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.30697541693140973,
|
1271 |
+
"score_ci_high": 0.38727309802034554,
|
1272 |
+
"sacrebleu_ci_low": 0.30697541693140973,
|
1273 |
+
"sacrebleu_ci_high": 0.38727309802034554
|
1274 |
+
},
|
1275 |
+
"score": 0.39663181686150334,
|
1276 |
+
"score_name": "subsets_mean",
|
1277 |
+
"num_of_instances": 990
|
1278 |
+
},
|
1279 |
+
"score": 0.533962583211598,
|
1280 |
+
"score_name": "subsets_mean",
|
1281 |
+
"num_of_instances": 12472
|
1282 |
+
}
|
1283 |
+
}
|
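Each added results file follows the same layout: an "environment_info" block followed by a "results" object whose keys are the benchmark subsets, each carrying its own "score" and "score_name", with an overall "subsets_mean" score alongside them. A minimal sketch for listing the per-subset scores of the file above (the path is taken from the file listing in this commit; the nested structure is as shown in the JSON):

```python
import json

# Path taken from the file names added in this commit.
path = "results/bluebench/2025-06-22T17-10-54_evaluation_results.json"

with open(path) as f:
    results = json.load(f)["results"]

# Subset entries are dicts; the top-level "score", "score_name" and
# "num_of_instances" keys sit next to them, so filter on the value type.
for subset, block in results.items():
    if isinstance(block, dict):
        print(f"{subset}: {block['score']:.4f} ({block['score_name']})")

print("overall:", results["score"], results["score_name"])
```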
results/bluebench/2025-06-22T19-25-42_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
+{
+"environment_info": {
+"timestamp_utc": "2025-06-22T23:25:38.430519Z",
+"command_line_invocation": [
+"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
+"--tasks",
+"benchmarks.bluebench",
+"--model",
+"cross_provider",
+"--model_args",
+"model_name=watsonx/meta-llama/llama-3-405b-instruct,max_tokens=256",
+"--output_path",
+"./results/bluebench",
+"--log_samples",
+"--trust_remote_code",
+"--batch_size",
+"8",
+"--verbosity",
+"ERROR"
+],
+"parsed_arguments": {
+"tasks": [
+"benchmarks.bluebench"
+],
+"split": "test",
+"num_fewshots": null,
+"limit": null,
+"batch_size": 8,
+"model": "watsonx/meta-llama/llama-3-405b-instruct",
+"model_args": {
+"max_tokens": 256
+},
+"gen_kwargs": null,
+"chat_template_kwargs": null,
+"output_path": "./results/bluebench",
+"output_file_prefix": "evaluation_results",
+"log_samples": true,
+"verbosity": "ERROR",
+"apply_chat_template": false,
+"trust_remote_code": true,
+"disable_hf_cache": false,
+"cache_dir": null
+},
+"unitxt_version": "1.24.0",
+"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
+"python_version": "3.10.18",
+"system": "Linux",
+"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
+"installed_packages": {
+"nvidia-cufile-cu12": "1.11.1.6",
+"triton": "3.3.1",
+"nltk": "3.9.1",
+"anyio": "4.9.0",
+"absl-py": "2.3.0",
+"tiktoken": "0.9.0",
+"charset-normalizer": "3.4.2",
+"nvidia-cuda-runtime-cu12": "12.6.77",
+"sympy": "1.14.0",
+"mecab-ko": "1.0.1",
+"litellm": "1.72.6.post1",
+"httpcore": "1.0.9",
+"Jinja2": "3.1.6",
+"jsonschema-specifications": "2025.4.1",
+"pydantic_core": "2.33.2",
+"nvidia-cusparse-cu12": "12.5.4.2",
+"yarl": "1.20.1",
+"openai": "1.88.0",
+"portalocker": "3.2.0",
+"pandas": "2.3.0",
+"multiprocess": "0.70.16",
+"jsonschema": "4.24.0",
+"unitxt": "1.24.0",
+"nvidia-nvjitlink-cu12": "12.6.85",
+"nvidia-cublas-cu12": "12.6.4.1",
+"pydantic": "2.11.7",
+"async-timeout": "5.0.1",
+"annotated-types": "0.7.0",
+"rouge_score": "0.1.2",
+"contourpy": "1.3.2",
+"aiosignal": "1.3.2",
+"nvidia-cuda-cupti-cu12": "12.6.80",
+"pillow": "11.2.1",
+"six": "1.17.0",
+"diskcache": "5.6.3",
+"tqdm": "4.67.1",
+"pyarrow": "20.0.0",
+"h11": "0.16.0",
+"zipp": "3.19.2",
+"tzdata": "2025.2",
+"bert-score": "0.3.13",
+"setuptools": "80.9.0",
+"referencing": "0.36.2",
+"sacrebleu": "2.5.1",
+"filelock": "3.18.0",
+"urllib3": "2.5.0",
+"scipy": "1.15.3",
+"nvidia-nccl-cu12": "2.26.2",
+"kiwisolver": "1.4.8",
+"networkx": "3.4.2",
+"typing-inspection": "0.4.1",
+"lxml": "5.4.0",
+"sniffio": "1.3.1",
+"scikit-learn": "1.7.0",
+"nvidia-curand-cu12": "10.3.7.77",
+"pip": "25.1.1",
+"fonttools": "4.58.4",
+"transformers": "4.52.4",
+"datasets": "3.6.0",
+"nvidia-cusolver-cu12": "11.7.1.2",
+"cycler": "0.12.1",
+"evaluate": "0.4.3",
+"distro": "1.9.0",
+"idna": "3.10",
+"MarkupSafe": "3.0.2",
+"frozenlist": "1.7.0",
+"pyparsing": "3.2.3",
+"jiter": "0.10.0",
+"importlib_metadata": "8.0.0",
+"packaging": "24.2",
+"psutil": "7.0.0",
+"mecab-ko-dic": "1.0.0",
+"joblib": "1.5.1",
+"fsspec": "2025.3.0",
+"dill": "0.3.8",
+"tokenizers": "0.21.1",
+"wheel": "0.45.1",
+"nvidia-nvtx-cu12": "12.6.77",
+"nvidia-cusparselt-cu12": "0.6.3",
+"hf-xet": "1.1.4",
+"propcache": "0.3.2",
+"numpy": "2.2.6",
+"mpmath": "1.3.0",
+"multidict": "6.5.0",
+"conllu": "6.0.0",
+"safetensors": "0.5.3",
+"requests": "2.32.4",
+"regex": "2024.11.6",
+"aiohttp": "3.12.13",
+"tabulate": "0.9.0",
+"certifi": "2025.6.15",
+"accelerate": "1.8.0",
+"nvidia-cufft-cu12": "11.3.0.4",
+"nvidia-cuda-nvrtc-cu12": "12.6.77",
+"click": "8.2.1",
+"typing_extensions": "4.12.2",
+"attrs": "25.3.0",
+"exceptiongroup": "1.3.0",
+"tenacity": "9.1.2",
+"pytz": "2025.2",
+"aiohappyeyeballs": "2.6.1",
+"python-dateutil": "2.9.0.post0",
+"torch": "2.7.1",
+"python-dotenv": "1.1.0",
+"httpx": "0.28.1",
+"matplotlib": "3.10.3",
+"xxhash": "3.5.0",
+"PyYAML": "6.0.2",
+"huggingface-hub": "0.33.0",
+"colorama": "0.4.6",
+"rpds-py": "0.25.1",
+"threadpoolctl": "3.6.0",
+"nvidia-cudnn-cu12": "9.5.1.17",
+"jaraco.collections": "5.1.0",
+"tomli": "2.0.1",
+"backports.tarfile": "1.2.0",
+"jaraco.context": "5.3.0",
+"typeguard": "4.3.0",
+"autocommand": "2.2.2",
+"jaraco.text": "3.12.1",
+"more-itertools": "10.3.0",
+"platformdirs": "4.2.2",
+"inflect": "7.3.1",
+"jaraco.functools": "4.0.1"
+}
+},
+"results": {
+"bias": {
+"safety_bbq_age": {
+"accuracy": 0.9777777777777777,
+"accuracy_ci_low": 0.9190234736102009,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 0.9777777777777777,
+"score_ci_high": 1.0,
+"score_ci_low": 0.9190234736102009,
+"num_of_instances": 90
+},
+"safety_bbq_disability_status": {
+"accuracy": 0.9888888888888889,
+"accuracy_ci_low": 0.9444444444444444,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 0.9888888888888889,
+"score_ci_high": 1.0,
+"score_ci_low": 0.9444444444444444,
+"num_of_instances": 90
+},
+"safety_bbq_gender_identity": {
+"accuracy": 0.9888888888888889,
+"accuracy_ci_low": 0.9333333333333333,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 0.9888888888888889,
+"score_ci_high": 1.0,
+"score_ci_low": 0.9333333333333333,
+"num_of_instances": 90
+},
+"safety_bbq_nationality": {
+"accuracy": 1.0,
+"accuracy_ci_low": 1.0,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 1.0,
+"score_ci_high": 1.0,
+"score_ci_low": 1.0,
+"num_of_instances": 90
+},
+"safety_bbq_physical_appearance": {
+"accuracy": 0.9888888888888889,
+"accuracy_ci_low": 0.9444444444444444,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 0.9888888888888889,
+"score_ci_high": 1.0,
+"score_ci_low": 0.9444444444444444,
+"num_of_instances": 90
+},
+"safety_bbq_race_ethnicity": {
+"accuracy": 1.0,
+"accuracy_ci_low": 1.0,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 1.0,
+"score_ci_high": 1.0,
+"score_ci_low": 1.0,
+"num_of_instances": 90
+},
+"safety_bbq_race_x_gender": {
+"accuracy": 1.0,
+"accuracy_ci_low": 1.0,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 1.0,
+"score_ci_high": 1.0,
+"score_ci_low": 1.0,
+"num_of_instances": 90
+},
+"safety_bbq_race_x_ses": {
+"accuracy": 1.0,
+"accuracy_ci_low": 1.0,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 1.0,
+"score_ci_high": 1.0,
+"score_ci_low": 1.0,
+"num_of_instances": 90
+},
+"safety_bbq_religion": {
+"accuracy": 0.9888888888888889,
+"accuracy_ci_low": 0.9444444444444444,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 0.9888888888888889,
+"score_ci_high": 1.0,
+"score_ci_low": 0.9444444444444444,
+"num_of_instances": 90
+},
+"safety_bbq_ses": {
+"accuracy": 1.0,
+"accuracy_ci_low": 1.0,
+"accuracy_ci_high": 1.0,
+"score_name": "accuracy",
+"score": 1.0,
+"score_ci_high": 1.0,
+"score_ci_low": 1.0,
+"num_of_instances": 90
+},
+"safety_bbq_sexual_orientation": {
+"accuracy": 0.8777777777777778,
+"accuracy_ci_low": 0.8,
+"accuracy_ci_high": 0.9333333333333333,
+"score_name": "accuracy",
+"score": 0.8777777777777778,
+"score_ci_high": 0.9333333333333333,
+"score_ci_low": 0.8,
+"num_of_instances": 90
+},
+"score": 0.9828282828282828,
+"score_name": "subsets_mean",
+"num_of_instances": 990
+},
+"chatbot_abilities": {
+"arena_hard_generation_english_gpt_4_0314_reference": {
+"num_of_instances": 500,
+"llama_3_70b_instruct_template_arena_hard": 0.12794268167860798,
+"score": 0.12794268167860798,
+"score_name": "llama_3_70b_instruct_template_arena_hard"
+},
+"score": 0.12794268167860798,
+"score_name": "subsets_mean",
+"num_of_instances": 500
+},
+"entity_extraction": {
+"universal_ner_en_ewt": {
+"num_of_instances": 1000,
+"f1_Person": 0.6233062330623307,
+"f1_Organization": 0.4037267080745342,
+"f1_Location": 0.441860465116279,
+"f1_macro": 0.4896311354177146,
+"recall_macro": 0.44046090061205145,
+"precision_macro": 0.5555845701415322,
+"in_classes_support": 0.8122605363984674,
+"f1_micro": 0.45272206303724927,
+"recall_micro": 0.4514285714285714,
+"precision_micro": 0.4540229885057471,
+"score": 0.45272206303724927,
+"score_name": "f1_micro",
+"score_ci_low": 0.3966894448307716,
+"score_ci_high": 0.5096302708354611,
+"f1_micro_ci_low": 0.3966894448307716,
+"f1_micro_ci_high": 0.5096302708354611
+},
+"score": 0.45272206303724927,
+"score_name": "subsets_mean",
+"num_of_instances": 1000
+},
+"knowledge": {
+"mmlu_pro_biology": {
+"accuracy": 0.7605633802816901,
+"accuracy_ci_low": 0.6619718309859155,
+"accuracy_ci_high": 0.8450704225352113,
+"score_name": "accuracy",
+"score": 0.7605633802816901,
+"score_ci_high": 0.8450704225352113,
+"score_ci_low": 0.6619718309859155,
+"num_of_instances": 71
+},
+"mmlu_pro_business": {
+"accuracy": 0.49295774647887325,
+"accuracy_ci_low": 0.38028169014084506,
+"accuracy_ci_high": 0.6056338028169014,
+"score_name": "accuracy",
+"score": 0.49295774647887325,
+"score_ci_high": 0.6056338028169014,
+"score_ci_low": 0.38028169014084506,
+"num_of_instances": 71
+},
+"mmlu_pro_chemistry": {
+"accuracy": 0.43661971830985913,
+"accuracy_ci_low": 0.31179550598679995,
+"accuracy_ci_high": 0.5633802816901409,
+"score_name": "accuracy",
+"score": 0.43661971830985913,
+"score_ci_high": 0.5633802816901409,
+"score_ci_low": 0.31179550598679995,
+"num_of_instances": 71
+},
+"mmlu_pro_computer_science": {
+"accuracy": 0.7323943661971831,
+"accuracy_ci_low": 0.6056338028169014,
+"accuracy_ci_high": 0.8309859154929577,
+"score_name": "accuracy",
+"score": 0.7323943661971831,
+"score_ci_high": 0.8309859154929577,
+"score_ci_low": 0.6056338028169014,
+"num_of_instances": 71
+},
+"mmlu_pro_economics": {
+"accuracy": 0.7464788732394366,
+"accuracy_ci_low": 0.6338028169014085,
+"accuracy_ci_high": 0.8320697555200512,
+"score_name": "accuracy",
+"score": 0.7464788732394366,
+"score_ci_high": 0.8320697555200512,
+"score_ci_low": 0.6338028169014085,
+"num_of_instances": 71
+},
+"mmlu_pro_engineering": {
+"accuracy": 0.5633802816901409,
+"accuracy_ci_low": 0.4507042253521127,
+"accuracy_ci_high": 0.6901408450704225,
+"score_name": "accuracy",
+"score": 0.5633802816901409,
+"score_ci_high": 0.6901408450704225,
+"score_ci_low": 0.4507042253521127,
+"num_of_instances": 71
+},
+"mmlu_pro_health": {
+"accuracy": 0.5774647887323944,
+"accuracy_ci_low": 0.4647887323943662,
+"accuracy_ci_high": 0.6901408450704225,
+"score_name": "accuracy",
+"score": 0.5774647887323944,
+"score_ci_high": 0.6901408450704225,
+"score_ci_low": 0.4647887323943662,
+"num_of_instances": 71
+},
+"mmlu_pro_history": {
+"accuracy": 0.6901408450704225,
+"accuracy_ci_low": 0.5633802816901409,
+"accuracy_ci_high": 0.7887323943661971,
+"score_name": "accuracy",
+"score": 0.6901408450704225,
+"score_ci_high": 0.7887323943661971,
+"score_ci_low": 0.5633802816901409,
+"num_of_instances": 71
+},
+"mmlu_pro_law": {
+"accuracy": 0.7323943661971831,
+"accuracy_ci_low": 0.6056338028169014,
+"accuracy_ci_high": 0.8309859154929577,
+"score_name": "accuracy",
+"score": 0.7323943661971831,
+"score_ci_high": 0.8309859154929577,
+"score_ci_low": 0.6056338028169014,
+"num_of_instances": 71
+},
+"mmlu_pro_math": {
+"accuracy": 0.5633802816901409,
+"accuracy_ci_low": 0.43661971830985913,
+"accuracy_ci_high": 0.676056338028169,
+"score_name": "accuracy",
+"score": 0.5633802816901409,
+"score_ci_high": 0.676056338028169,
+"score_ci_low": 0.43661971830985913,
+"num_of_instances": 71
+},
+"mmlu_pro_other": {
+"accuracy": 0.6619718309859155,
+"accuracy_ci_low": 0.5370780611967093,
+"accuracy_ci_high": 0.7605633802816901,
+"score_name": "accuracy",
+"score": 0.6619718309859155,
+"score_ci_high": 0.7605633802816901,
+"score_ci_low": 0.5370780611967093,
+"num_of_instances": 71
+},
+"mmlu_pro_philosophy": {
+"accuracy": 0.8028169014084507,
+"accuracy_ci_low": 0.6981095742502579,
+"accuracy_ci_high": 0.8873239436619719,
+"score_name": "accuracy",
+"score": 0.8028169014084507,
+"score_ci_high": 0.8873239436619719,
+"score_ci_low": 0.6981095742502579,
+"num_of_instances": 71
+},
+"mmlu_pro_physics": {
+"accuracy": 0.6338028169014085,
+"accuracy_ci_low": 0.5211267605633803,
+"accuracy_ci_high": 0.7323943661971831,
+"score_name": "accuracy",
+"score": 0.6338028169014085,
+"score_ci_high": 0.7323943661971831,
+"score_ci_low": 0.5211267605633803,
+"num_of_instances": 71
+},
+"mmlu_pro_psychology": {
+"accuracy": 0.7323943661971831,
+"accuracy_ci_low": 0.6197183098591549,
+"accuracy_ci_high": 0.8309859154929577,
+"score_name": "accuracy",
+"score": 0.7323943661971831,
+"score_ci_high": 0.8309859154929577,
+"score_ci_low": 0.6197183098591549,
+"num_of_instances": 71
+},
+"score": 0.6519114688128773,
+"score_name": "subsets_mean",
+"num_of_instances": 994
+},
+"legal": {
+"legalbench_abercrombie": {
+"f1_macro": 0.7710561497326203,
+"f1_suggestive": 0.5882352941176471,
+"f1_generic": 1.0,
+"f1_fanciful": 0.8125,
+"f1_descriptive": 0.7878787878787878,
+"f1_arbitrary": 0.6666666666666666,
+"f1_macro_ci_low": 0.6815687852174904,
+"f1_macro_ci_high": 0.8487975830625909,
+"score_name": "f1_micro",
+"score": 0.7682926829268293,
+"score_ci_high": 0.845238531816244,
+"score_ci_low": 0.682034648754911,
+"num_of_instances": 85,
+"accuracy": 0.7411764705882353,
+"accuracy_ci_low": 0.6470588235294118,
+"accuracy_ci_high": 0.8235294117647058,
+"f1_micro": 0.7682926829268293,
+"f1_micro_ci_low": 0.682034648754911,
+"f1_micro_ci_high": 0.845238531816244
+},
+"legalbench_corporate_lobbying": {
+"f1_macro": 0.7216529635538103,
+"f1_no": 0.8398576512455516,
+"f1_yes": 0.603448275862069,
+"f1_macro_ci_low": 0.6520262757220233,
+"f1_macro_ci_high": 0.7890273988307265,
+"score_name": "f1_micro",
+"score": 0.7707808564231738,
+"score_ci_high": 0.8225396492391672,
+"score_ci_low": 0.7085427135678392,
+"num_of_instances": 200,
+"accuracy": 0.765,
+"accuracy_ci_low": 0.7,
+"accuracy_ci_high": 0.815,
+"f1_micro": 0.7707808564231738,
+"f1_micro_ci_low": 0.7085427135678392,
+"f1_micro_ci_high": 0.8225396492391672
+},
+"legalbench_function_of_decision_section": {
+"f1_macro": 0.30206082783981175,
+"f1_conclusion": 0.13953488372093023,
+"f1_issue": 0.13333333333333333,
+"f1_decree": 0.3783783783783784,
+"f1_rule": 0.4482758620689655,
+"f1_analysis": 0.5647058823529412,
+"f1_facts": 0.18604651162790697,
+"f1_procedural history": 0.2641509433962264,
+"f1_macro_ci_low": 0.2513517165690192,
+"f1_macro_ci_high": 0.3775685968384507,
+"score_name": "f1_micro",
+"score": 0.33516483516483514,
+"score_ci_high": 0.4075146671820192,
+"score_ci_low": 0.272347535123403,
+"num_of_instances": 200,
+"accuracy": 0.305,
+"accuracy_ci_low": 0.245,
+"accuracy_ci_high": 0.375,
+"f1_micro": 0.33516483516483514,
+"f1_micro_ci_low": 0.272347535123403,
+"f1_micro_ci_high": 0.4075146671820192
+},
+"legalbench_international_citizenship_questions": {
+"f1_macro": 0.5047062641999351,
+"f1_yes": 0.5991561181434599,
+"f1_no": 0.41025641025641024,
+"f1_macro_ci_low": 0.4417647306569189,
+"f1_macro_ci_high": 0.5795532826410552,
+"score_name": "f1_micro",
+"score": 0.5241730279898219,
+"score_ci_high": 0.5950630270095045,
+"score_ci_low": 0.459552667145485,
+"num_of_instances": 200,
+"accuracy": 0.515,
+"accuracy_ci_low": 0.45,
+"accuracy_ci_high": 0.585,
+"f1_micro": 0.5241730279898219,
+"f1_micro_ci_low": 0.459552667145485,
+"f1_micro_ci_high": 0.5950630270095045
+},
+"legalbench_proa": {
+"f1_macro": 0.9284195605953225,
+"f1_yes": 0.9156626506024096,
+"f1_no": 0.9411764705882353,
+"f1_macro_ci_low": 0.8686025850356507,
+"f1_macro_ci_high": 0.9706771979676585,
+"score_name": "f1_micro",
+"score": 0.9285714285714286,
+"score_ci_high": 0.9704142011834319,
+"score_ci_low": 0.8690476190476191,
+"num_of_instances": 85,
+"accuracy": 0.9176470588235294,
+"accuracy_ci_low": 0.8588235294117647,
+"accuracy_ci_high": 0.9647058823529412,
+"f1_micro": 0.9285714285714286,
+"f1_micro_ci_low": 0.8690476190476191,
+"f1_micro_ci_high": 0.9704142011834319
+},
+"score": 0.6653965662152177,
+"score_name": "subsets_mean",
+"num_of_instances": 770
+},
+"news_classification": {
+"20_newsgroups_short": {
+"f1_macro": 0.6486110220528901,
+"f1_cars": 0.8958333333333334,
+"f1_windows x": 0.09090909090909091,
+"f1_computer graphics": 0.4793388429752066,
+"f1_atheism": 0.5245901639344263,
+"f1_religion": 0.044444444444444446,
+"f1_medicine": 0.813953488372093,
+"f1_christianity": 0.8727272727272727,
+"f1_for sale": 0.7777777777777778,
+"f1_microsoft windows": 0.7708333333333334,
+"f1_middle east": 0.5671641791044776,
+"f1_motorcycles": 0.7692307692307693,
+"f1_pc hardware": 0.6046511627906976,
+"f1_mac hardware": 0.7924528301886793,
+"f1_electronics": 0.7291666666666666,
+"f1_guns": 0.410958904109589,
+"f1_space": 0.8846153846153846,
+"f1_cryptography": 0.72,
+"f1_baseball": 0.9391304347826087,
+"f1_hockey": 0.9545454545454546,
+"f1_politics": 0.32989690721649484,
+"f1_macro_ci_low": 0.6245539233827745,
+"f1_macro_ci_high": 0.6735741716064018,
+"score_name": "f1_micro",
+"score": 0.6871961102106969,
+"score_ci_high": 0.7158552998269507,
+"score_ci_low": 0.6568978311145116,
+"num_of_instances": 1000,
+"accuracy": 0.636,
+"accuracy_ci_low": 0.6068817918985229,
+"accuracy_ci_high": 0.666,
+"f1_micro": 0.6871961102106969,
+"f1_micro_ci_low": 0.6568978311145116,
+"f1_micro_ci_high": 0.7158552998269507
+},
+"score": 0.6871961102106969,
+"score_name": "subsets_mean",
+"num_of_instances": 1000
+},
+"product_help": {
+"cfpb_product_2023": {
+"f1_macro": 0.7442070442021397,
+"f1_credit reporting or credit repair services or other personal consumer reports": 0.9329529243937232,
+"f1_checking or savings account": 0.8269230769230769,
+"f1_debt collection": 0.5324675324675324,
+"f1_credit card or prepaid card": 0.7933884297520661,
+"f1_mortgage": 0.8405797101449275,
+"f1_student loan": 0.896551724137931,
+"f1_money transfer or virtual currency or money service": 0.875,
+"f1_vehicle loan or lease": 0.6666666666666666,
+"f1_payday loan or title loan or personal loan": 0.3333333333333333,
+"f1_macro_ci_low": 0.6958471577865166,
+"f1_macro_ci_high": 0.8045739980351424,
+"score_name": "f1_micro",
+"score": 0.8742393509127789,
+"score_ci_high": 0.8929169783856484,
+"score_ci_low": 0.852776904397444,
+"num_of_instances": 1000,
+"accuracy": 0.862,
+"accuracy_ci_low": 0.839,
+"accuracy_ci_high": 0.882,
+"f1_micro": 0.8742393509127789,
+"f1_micro_ci_low": 0.852776904397444,
+"f1_micro_ci_high": 0.8929169783856484
+},
+"cfpb_product_watsonx": {
+"f1_macro": 0.747877693509548,
+"f1_mortgages and loans": 0.8248587570621468,
+"f1_credit card": 0.7912087912087912,
+"f1_debt collection": 0.683982683982684,
+"f1_credit reporting": 0.7205882352941176,
+"f1_retail banking": 0.71875,
+"f1_macro_ci_low": 0.7065255253955101,
+"f1_macro_ci_high": 0.7849687727329339,
+"score_name": "f1_micro",
+"score": 0.7434343434343434,
+"score_ci_high": 0.7787863123983747,
+"score_ci_low": 0.7018885821645714,
+"num_of_instances": 500,
+"accuracy": 0.736,
+"accuracy_ci_low": 0.694,
+"accuracy_ci_high": 0.772,
+"f1_micro": 0.7434343434343434,
+"f1_micro_ci_low": 0.7018885821645714,
+"f1_micro_ci_high": 0.7787863123983747
+},
+"score": 0.8088368471735612,
+"score_name": "subsets_mean",
+"num_of_instances": 1500
+},
+"qa_finance": {
+"fin_qa": {
+"num_of_instances": 1000,
+"program_accuracy": 0.212,
+"score": 0.212,
+"score_name": "program_accuracy",
+"execution_accuracy": 0.197,
+"program_accuracy_ci_low": 0.185,
+"program_accuracy_ci_high": 0.236,
+"score_ci_low": 0.185,
+"score_ci_high": 0.236,
+"execution_accuracy_ci_low": 0.17257528462439894,
+"execution_accuracy_ci_high": 0.222
+},
+"score": 0.212,
+"score_name": "subsets_mean",
+"num_of_instances": 1000
+},
+"rag_general": {
+"rag_response_generation_clapnq": {
+"precision": 0.3245919340419664,
+"recall": 0.5553697849897254,
+"f1": 0.34463972021312694,
+"precision_ci_low": 0.30294812146038264,
+"precision_ci_high": 0.34672435235830834,
+"recall_ci_low": 0.5393969536805812,
+"recall_ci_high": 0.5708437647062446,
+"f1_ci_low": 0.32625746187015825,
+"f1_ci_high": 0.3632802184669128,
+"score_name": "f1",
+"score": 0.34463972021312694,
+"score_ci_high": 0.3632802184669128,
+"score_ci_low": 0.32625746187015825,
+"num_of_instances": 600,
+"correctness_f1_bert_score.deberta_large_mnli": 0.5976956130564213,
+"correctness_recall_bert_score.deberta_large_mnli": 0.678848825097084,
+"correctness_precision_bert_score.deberta_large_mnli": 0.5559072712808848,
+"faithfullness_f1_token_overlap": 0.2867915721228578,
+"faithfullness_recall_token_overlap": 0.21812174721257188,
+"faithfullness_precision_token_overlap": 0.546883220093946,
+"correctness_f1_token_overlap": 0.34463972021312694,
+"correctness_recall_token_overlap": 0.5553697849897254,
+"correctness_precision_token_overlap": 0.3245919340419664
+},
+"score": 0.34463972021312694,
+"score_name": "subsets_mean",
+"num_of_instances": 600
+},
+"reasoning": {
+"hellaswag": {
+"accuracy": 0.57,
+"accuracy_ci_low": 0.537,
+"accuracy_ci_high": 0.598,
+"score_name": "accuracy",
+"score": 0.57,
+"score_ci_high": 0.598,
+"score_ci_low": 0.537,
+"num_of_instances": 1000
+},
+"openbook_qa": {
+"accuracy": 0.914,
+"accuracy_ci_low": 0.886,
+"accuracy_ci_high": 0.936,
+"score_name": "accuracy",
+"score": 0.914,
+"score_ci_high": 0.936,
+"score_ci_low": 0.886,
+"num_of_instances": 500
+},
+"score": 0.742,
+"score_name": "subsets_mean",
+"num_of_instances": 1500
+},
+"safety": {
+"attaq_500": {
+"severity_critical": 1.0,
+"severity_high": 0.0,
+"severity_medium": 0.0,
+"severity_low": 99.0,
+"category_harmful_info": 0.8239067156488699,
+"category_deception": 0.8416350447017737,
+"category_explicit_content": 0.8482034478073168,
+"category_violence": 0.8124119411385249,
+"category_discrimination": 0.7214018125382681,
+"category_substance_abuse": 0.8382871117856767,
+"category_pii": 0.838832758643009,
+"safety": 0.8184977591824201,
+"safety_ci_low": 0.799251726518489,
+"safety_ci_high": 0.83564691259826,
+"score_name": "safety",
+"score": 0.8184977591824201,
+"score_ci_high": 0.83564691259826,
+"score_ci_low": 0.799251726518489,
+"num_of_instances": 100
+},
+"score": 0.8184977591824201,
+"score_name": "subsets_mean",
+"num_of_instances": 100
+},
+"summarization": {
+"billsum_document_filtered_to_6000_chars": {
+"num_of_instances": 528,
+"rouge2": 0.21878117525659171,
+"rouge1": 0.42882781698282896,
+"rougeLsum": 0.36923160950814665,
+"rougeL": 0.3012206645101065,
+"score": 0.3012206645101065,
+"score_name": "rougeL",
+"rouge2_ci_low": 0.2116465564648193,
+"rouge2_ci_high": 0.22611002289776966,
+"rouge1_ci_low": 0.4181854383996789,
+"rouge1_ci_high": 0.4380350294107447,
+"rougeLsum_ci_low": 0.3593867077962995,
+"rougeLsum_ci_high": 0.377981997024725,
+"rougeL_ci_low": 0.2938004078438361,
+"rougeL_ci_high": 0.30862765298917266,
+"score_ci_low": 0.2938004078438361,
+"score_ci_high": 0.30862765298917266
+},
+"tldr_document_filtered_to_6000_chars": {
+"num_of_instances": 1000,
+"rouge2": 0.01897199585186187,
+"rouge1": 0.12886717133874953,
+"rougeLsum": 0.10635879624885117,
+"rougeL": 0.09266060514104649,
+"score": 0.09266060514104649,
+"score_name": "rougeL",
+"rouge2_ci_low": 0.0169285768359538,
+"rouge2_ci_high": 0.020965979315931374,
+"rouge1_ci_low": 0.1230681468561615,
+"rouge1_ci_high": 0.13427737204069826,
+"rougeLsum_ci_low": 0.10189392757917064,
+"rougeLsum_ci_high": 0.11089749692460946,
+"rougeL_ci_low": 0.08868339152088286,
+"rougeL_ci_high": 0.0963878314649574,
+"score_ci_low": 0.08868339152088286,
+"score_ci_high": 0.0963878314649574
+},
+"score": 0.1969406348255765,
+"score_name": "subsets_mean",
+"num_of_instances": 1528
+},
+"translation": {
+"mt_flores_101_ara_eng": {
+"num_of_instances": 66,
+"counts": [
+1296,
+846,
+593,
+421
+],
+"totals": [
+1768,
+1702,
+1636,
+1570
+],
+"precisions": [
+0.7330316742081447,
+0.4970622796709753,
+0.36246943765281175,
+0.2681528662420382
+],
+"bp": 1.0,
+"sys_len": 1768,
+"ref_len": 1734,
+"sacrebleu": 0.4338072904737007,
+"score": 0.4338072904737007,
+"score_name": "sacrebleu",
+"score_ci_low": 0.3840033634108722,
+"score_ci_high": 0.4745374005130659,
+"sacrebleu_ci_low": 0.3840033634108722,
+"sacrebleu_ci_high": 0.4745374005130659
+},
+"mt_flores_101_deu_eng": {
+"num_of_instances": 66,
+"counts": [
+1324,
+883,
+626,
+442
+],
+"totals": [
+1796,
+1730,
+1664,
+1598
+],
+"precisions": [
+0.7371937639198218,
+0.5104046242774567,
+0.3762019230769231,
+0.2765957446808511
+],
+"bp": 1.0,
+"sys_len": 1796,
+"ref_len": 1734,
+"sacrebleu": 0.44482653871620387,
+"score": 0.44482653871620387,
+"score_name": "sacrebleu",
+"score_ci_low": 0.4081818656109621,
+"score_ci_high": 0.4860589069348345,
+"sacrebleu_ci_low": 0.4081818656109621,
+"sacrebleu_ci_high": 0.4860589069348345
+},
+"mt_flores_101_eng_ara": {
+"num_of_instances": 66,
+"counts": [
+898,
+497,
+303,
+187
+],
+"totals": [
+1785,
+1719,
+1653,
+1587
+],
+"precisions": [
+0.5030812324929972,
+0.28912158231529955,
+0.18330308529945555,
+0.1178323881537492
+],
+"bp": 1.0,
+"sys_len": 1785,
+"ref_len": 1589,
+"sacrebleu": 0.23674906403928667,
+"score": 0.23674906403928667,
+"score_name": "sacrebleu",
+"score_ci_low": 0.18135183356554332,
+"score_ci_high": 0.28960778605368953,
+"sacrebleu_ci_low": 0.18135183356554332,
+"sacrebleu_ci_high": 0.28960778605368953
+},
+"mt_flores_101_eng_deu": {
+"num_of_instances": 66,
+"counts": [
+1240,
+761,
+513,
+362
+],
+"totals": [
+1823,
+1757,
+1691,
+1625
+],
+"precisions": [
+0.6801974766867801,
+0.4331246442800228,
+0.3033707865168539,
+0.22276923076923075
+],
+"bp": 0.9934390613382812,
+"sys_len": 1823,
+"ref_len": 1835,
+"sacrebleu": 0.3731732035283488,
+"score": 0.3731732035283488,
+"score_name": "sacrebleu",
+"score_ci_low": 0.3128221603250469,
+"score_ci_high": 0.4084245255594999,
+"sacrebleu_ci_low": 0.3128221603250469,
+"sacrebleu_ci_high": 0.4084245255594999
+},
+"mt_flores_101_eng_fra": {
+"num_of_instances": 66,
+"counts": [
+1532,
+1143,
+898,
+722
+],
+"totals": [
+2028,
+1962,
+1896,
+1830
+],
+"precisions": [
+0.7554240631163708,
+0.5825688073394495,
+0.4736286919831224,
+0.3945355191256831
+],
+"bp": 0.9804693769806172,
+"sys_len": 2028,
+"ref_len": 2068,
+"sacrebleu": 0.5250486815503393,
+"score": 0.5250486815503393,
+"score_name": "sacrebleu",
+"score_ci_low": 0.4776077157588871,
+"score_ci_high": 0.57891652183332,
+"sacrebleu_ci_low": 0.4776077157588871,
+"sacrebleu_ci_high": 0.57891652183332
+},
+"mt_flores_101_eng_kor": {
+"num_of_instances": 66,
+"counts": [
+1351,
+728,
+447,
+279
+],
+"totals": [
+2735,
+2669,
+2603,
+2537
+],
+"precisions": [
+0.4939670932358318,
+0.27276133383289625,
+0.17172493276988093,
+0.10997240835632636
+],
+"bp": 1.0,
+"sys_len": 2735,
+"ref_len": 2235,
+"sacrebleu": 0.22459468717502307,
+"score": 0.22459468717502307,
+"score_name": "sacrebleu",
+"score_ci_low": 0.1881220082315981,
+"score_ci_high": 0.25342316010757016,
+"sacrebleu_ci_low": 0.1881220082315981,
+"sacrebleu_ci_high": 0.25342316010757016
+},
+"mt_flores_101_eng_por": {
+"num_of_instances": 66,
+"counts": [
+1429,
+1021,
+760,
+570
+],
+"totals": [
+1901,
+1835,
+1769,
+1703
+],
+"precisions": [
+0.751709626512362,
+0.5564032697547684,
+0.4296212549462973,
+0.33470346447445687
+],
+"bp": 0.9921404650355355,
+"sys_len": 1901,
+"ref_len": 1916,
+"sacrebleu": 0.49132583520106116,
+"score": 0.49132583520106116,
+"score_name": "sacrebleu",
+"score_ci_low": 0.4478372466403723,
+"score_ci_high": 0.5303109141597054,
+"sacrebleu_ci_low": 0.4478372466403723,
+"sacrebleu_ci_high": 0.5303109141597054
+},
+"mt_flores_101_eng_ron": {
+"num_of_instances": 66,
+"counts": [
+1387,
+979,
+711,
+518
+],
+"totals": [
+1967,
+1901,
+1835,
+1769
+],
+"precisions": [
+0.7051347229283172,
+0.5149921094160969,
+0.38746594005449597,
+0.2928208027133974
+],
+"bp": 1.0,
+"sys_len": 1967,
+"ref_len": 1949,
+"sacrebleu": 0.450533442657802,
+"score": 0.450533442657802,
+"score_name": "sacrebleu",
+"score_ci_low": 0.41119874078427415,
+"score_ci_high": 0.4996205132749857,
+"sacrebleu_ci_low": 0.41119874078427415,
+"sacrebleu_ci_high": 0.4996205132749857
+},
+"mt_flores_101_eng_spa": {
+"num_of_instances": 66,
+"counts": [
+1300,
+749,
+458,
+283
+],
+"totals": [
+2033,
+1967,
+1901,
+1835
+],
+"precisions": [
+0.6394490900147565,
+0.3807829181494662,
+0.24092582851130984,
+0.1542234332425068
+],
+"bp": 0.9685332604439724,
+"sys_len": 2033,
+"ref_len": 2098,
+"sacrebleu": 0.29870591960318976,
+"score": 0.29870591960318976,
+"score_name": "sacrebleu",
+"score_ci_low": 0.2686987876011016,
+"score_ci_high": 0.3321355366475583,
+"sacrebleu_ci_low": 0.2686987876011016,
+"sacrebleu_ci_high": 0.3321355366475583
+},
+"mt_flores_101_fra_eng": {
+"num_of_instances": 66,
+"counts": [
+1340,
+946,
+692,
+509
+],
+"totals": [
+1799,
+1733,
+1667,
+1601
+],
+"precisions": [
+0.7448582545858811,
+0.5458742065781881,
+0.4151169766046791,
+0.31792629606495937
+],
+"bp": 1.0,
+"sys_len": 1799,
+"ref_len": 1734,
+"sacrebleu": 0.4812999188340168,
+"score": 0.4812999188340168,
+"score_name": "sacrebleu",
+"score_ci_low": 0.43585100595842746,
+"score_ci_high": 0.5287499225865158,
+"sacrebleu_ci_low": 0.43585100595842746,
+"sacrebleu_ci_high": 0.5287499225865158
+},
+"mt_flores_101_jpn_eng": {
+"num_of_instances": 66,
+"counts": [
+1139,
+642,
+413,
+271
+],
+"totals": [
+1798,
+1732,
+1666,
+1600
+],
+"precisions": [
+0.6334816462736373,
+0.37066974595842955,
+0.24789915966386555,
+0.169375
+],
+"bp": 1.0,
+"sys_len": 1798,
+"ref_len": 1734,
+"sacrebleu": 0.3151094190111042,
+"score": 0.3151094190111042,
+"score_name": "sacrebleu",
+"score_ci_low": 0.27523820425365936,
+"score_ci_high": 0.37080224288423497,
+"sacrebleu_ci_low": 0.27523820425365936,
+"sacrebleu_ci_high": 0.37080224288423497
+},
+"mt_flores_101_kor_eng": {
+"num_of_instances": 66,
+"counts": [
+1096,
+603,
+369,
+231
+],
+"totals": [
+1757,
+1691,
+1625,
+1559
+],
+"precisions": [
+0.6237905520774046,
+0.3565937315198108,
+0.2270769230769231,
+0.14817190506735087
+],
+"bp": 1.0,
+"sys_len": 1757,
+"ref_len": 1734,
+"sacrebleu": 0.29412899612915067,
+"score": 0.29412899612915067,
+"score_name": "sacrebleu",
+"score_ci_low": 0.2630441728848743,
+"score_ci_high": 0.3478998363728344,
+"sacrebleu_ci_low": 0.2630441728848743,
+"sacrebleu_ci_high": 0.3478998363728344
+},
+"mt_flores_101_por_eng": {
+"num_of_instances": 66,
+"counts": [
+1366,
+978,
+742,
+559
+],
+"totals": [
+1794,
+1728,
+1662,
+1596
+],
+"precisions": [
+0.7614269788182831,
+0.5659722222222222,
+0.4464500601684717,
+0.35025062656641603
+],
+"bp": 1.0,
+"sys_len": 1794,
+"ref_len": 1734,
+"sacrebleu": 0.5094995397125037,
+"score": 0.5094995397125037,
+"score_name": "sacrebleu",
+"score_ci_low": 0.45392787953470803,
+"score_ci_high": 0.5666406845959309,
+"sacrebleu_ci_low": 0.45392787953470803,
+"sacrebleu_ci_high": 0.5666406845959309
+},
+"mt_flores_101_ron_eng": {
+"num_of_instances": 66,
+"counts": [
+1357,
+953,
+689,
+517
+],
+"totals": [
+1780,
+1714,
+1648,
+1582
+],
+"precisions": [
+0.7623595505617977,
+0.5560093348891482,
+0.4180825242718446,
+0.3268015170670038
+],
+"bp": 1.0,
+"sys_len": 1780,
+"ref_len": 1734,
+"sacrebleu": 0.49056549932677673,
+"score": 0.49056549932677673,
+"score_name": "sacrebleu",
+"score_ci_low": 0.4449725178153994,
+"score_ci_high": 0.5499905859714643,
+"sacrebleu_ci_low": 0.4449725178153994,
+"sacrebleu_ci_high": 0.5499905859714643
+},
+"mt_flores_101_spa_eng": {
+"num_of_instances": 66,
+"counts": [
+1196,
+684,
+436,
+274
+],
+"totals": [
+1839,
+1773,
+1707,
+1641
+],
+"precisions": [
+0.6503534529635671,
+0.38578680203045684,
+0.255418863503222,
+0.16697135892748324
+],
+"bp": 1.0,
+"sys_len": 1839,
+"ref_len": 1734,
+"sacrebleu": 0.3216236243185879,
+"score": 0.3216236243185879,
+"score_name": "sacrebleu",
+"score_ci_low": 0.29765430133685095,
+"score_ci_high": 0.36660487766328476,
+"sacrebleu_ci_low": 0.29765430133685095,
+"sacrebleu_ci_high": 0.36660487766328476
+},
+"score": 0.39273277735180634,
+"score_name": "subsets_mean",
+"num_of_instances": 990
+},
+"score": 0.5448957624253402,
+"score_name": "subsets_mean",
+"num_of_instances": 12472
+}
+}
results/bluebench/{2025-06-19T15-57-45_evaluation_results.json → 2025-06-23T02-53-05_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
-
"timestamp_utc": "2025-06-
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
@@ -42,7 +42,7 @@
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
-
"unitxt_commit_hash": "
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
@@ -178,61 +178,61 @@
|
|
178 |
"safety_bbq_age": {
|
179 |
"accuracy": 0.6444444444444445,
|
180 |
"accuracy_ci_low": 0.5444444444444444,
|
181 |
-
"accuracy_ci_high": 0.
|
182 |
"score_name": "accuracy",
|
183 |
"score": 0.6444444444444445,
|
184 |
-
"score_ci_high": 0.
|
185 |
"score_ci_low": 0.5444444444444444,
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
-
"accuracy": 0.
|
190 |
-
"accuracy_ci_low": 0.
|
191 |
"accuracy_ci_high": 0.8,
|
192 |
"score_name": "accuracy",
|
193 |
-
"score": 0.
|
194 |
"score_ci_high": 0.8,
|
195 |
-
"score_ci_low": 0.
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
199 |
"accuracy": 0.9111111111111111,
|
200 |
-
"accuracy_ci_low": 0.
|
201 |
"accuracy_ci_high": 0.9555555555555556,
|
202 |
"score_name": "accuracy",
|
203 |
"score": 0.9111111111111111,
|
204 |
"score_ci_high": 0.9555555555555556,
|
205 |
-
"score_ci_low": 0.
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
-
"accuracy": 0.
|
210 |
-
"accuracy_ci_low": 0.
|
211 |
-
"accuracy_ci_high": 0.
|
212 |
"score_name": "accuracy",
|
213 |
-
"score": 0.
|
214 |
-
"score_ci_high": 0.
|
215 |
-
"score_ci_low": 0.
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
-
"accuracy": 0.
|
220 |
-
"accuracy_ci_low": 0.
|
221 |
-
"accuracy_ci_high": 0.
|
222 |
"score_name": "accuracy",
|
223 |
-
"score": 0.
|
224 |
-
"score_ci_high": 0.
|
225 |
-
"score_ci_low": 0.
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
-
"accuracy": 0.
|
230 |
-
"accuracy_ci_low": 0.
|
231 |
-
"accuracy_ci_high":
|
232 |
"score_name": "accuracy",
|
233 |
-
"score": 0.
|
234 |
-
"score_ci_high":
|
235 |
-
"score_ci_low": 0.
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
@@ -256,553 +256,553 @@
|
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
-
"accuracy": 0.
|
260 |
-
"accuracy_ci_low": 0.
|
261 |
-
"accuracy_ci_high": 0.
|
262 |
"score_name": "accuracy",
|
263 |
-
"score": 0.
|
264 |
-
"score_ci_high": 0.
|
265 |
-
"score_ci_low": 0.
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
-
"accuracy": 0.
|
270 |
-
"accuracy_ci_low": 0.
|
271 |
-
"accuracy_ci_high": 0.
|
272 |
"score_name": "accuracy",
|
273 |
-
"score": 0.
|
274 |
-
"score_ci_high": 0.
|
275 |
-
"score_ci_low": 0.
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
-
"accuracy": 0.
|
280 |
-
"accuracy_ci_low": 0.
|
281 |
-
"accuracy_ci_high": 0.
|
282 |
"score_name": "accuracy",
|
283 |
-
"score": 0.
|
284 |
-
"score_ci_high": 0.
|
285 |
-
"score_ci_low": 0.
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
-
"score": 0.
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
296 |
-
"score": 0.
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
-
"score": 0.
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
-
"f1_Person": 0.
|
307 |
-
"
|
308 |
-
"
|
309 |
-
"f1_macro": 0.
|
310 |
-
"recall_macro": 0.
|
311 |
-
"precision_macro": 0.
|
312 |
-
"in_classes_support": 0.
|
313 |
-
"f1_micro": 0.
|
314 |
-
"recall_micro": 0.
|
315 |
-
"precision_micro": 0.
|
316 |
-
"score": 0.
|
317 |
"score_name": "f1_micro",
|
318 |
-
"score_ci_low": 0.
|
319 |
-
"score_ci_high": 0.
|
320 |
-
"f1_micro_ci_low": 0.
|
321 |
-
"f1_micro_ci_high": 0.
|
322 |
},
|
323 |
-
"score": 0.
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
-
"accuracy": 0.
|
330 |
"accuracy_ci_low": 0.43661971830985913,
|
331 |
-
"accuracy_ci_high": 0.
|
332 |
"score_name": "accuracy",
|
333 |
-
"score": 0.
|
334 |
-
"score_ci_high": 0.
|
335 |
"score_ci_low": 0.43661971830985913,
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
-
"accuracy": 0.
|
340 |
-
"accuracy_ci_low": 0.
|
341 |
-
"accuracy_ci_high": 0.
|
342 |
"score_name": "accuracy",
|
343 |
-
"score": 0.
|
344 |
-
"score_ci_high": 0.
|
345 |
-
"score_ci_low": 0.
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
-
"accuracy": 0.
|
350 |
-
"accuracy_ci_low": 0.
|
351 |
-
"accuracy_ci_high": 0.
|
352 |
"score_name": "accuracy",
|
353 |
-
"score": 0.
|
354 |
-
"score_ci_high": 0.
|
355 |
-
"score_ci_low": 0.
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
-
"accuracy": 0.
|
360 |
"accuracy_ci_low": 0.2112676056338028,
|
361 |
-
"accuracy_ci_high": 0.
|
362 |
"score_name": "accuracy",
|
363 |
-
"score": 0.
|
364 |
-
"score_ci_high": 0.
|
365 |
"score_ci_low": 0.2112676056338028,
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
-
"accuracy": 0.
|
370 |
-
"accuracy_ci_low": 0.
|
371 |
-
"accuracy_ci_high": 0.
|
372 |
"score_name": "accuracy",
|
373 |
-
"score": 0.
|
374 |
-
"score_ci_high": 0.
|
375 |
-
"score_ci_low": 0.
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
-
"accuracy": 0.
|
380 |
-
"accuracy_ci_low": 0.
|
381 |
-
"accuracy_ci_high": 0.
|
382 |
"score_name": "accuracy",
|
383 |
-
"score": 0.
|
384 |
-
"score_ci_high": 0.
|
385 |
-
"score_ci_low": 0.
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
-
"accuracy": 0.
|
390 |
-
"accuracy_ci_low": 0.
|
391 |
-
"accuracy_ci_high": 0.
|
392 |
"score_name": "accuracy",
|
393 |
-
"score": 0.
|
394 |
-
"score_ci_high": 0.
|
395 |
-
"score_ci_low": 0.
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
-
"accuracy": 0.
|
400 |
-
"accuracy_ci_low": 0.
|
401 |
-
"accuracy_ci_high": 0.
|
402 |
"score_name": "accuracy",
|
403 |
-
"score": 0.
|
404 |
-
"score_ci_high": 0.
|
405 |
-
"score_ci_low": 0.
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
"accuracy": 0.3380281690140845,
|
410 |
"accuracy_ci_low": 0.23943661971830985,
|
411 |
-
"accuracy_ci_high": 0.
|
412 |
"score_name": "accuracy",
|
413 |
"score": 0.3380281690140845,
|
414 |
-
"score_ci_high": 0.
|
415 |
"score_ci_low": 0.23943661971830985,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
-
"accuracy": 0.
|
420 |
-
"accuracy_ci_low": 0.
|
421 |
-
"accuracy_ci_high": 0.
|
422 |
"score_name": "accuracy",
|
423 |
-
"score": 0.
|
424 |
-
"score_ci_high": 0.
|
425 |
-
"score_ci_low": 0.
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
-
"accuracy": 0.
|
430 |
-
"accuracy_ci_low": 0.
|
431 |
-
"accuracy_ci_high": 0.
|
432 |
"score_name": "accuracy",
|
433 |
-
"score": 0.
|
434 |
-
"score_ci_high": 0.
|
435 |
-
"score_ci_low": 0.
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
-
"accuracy": 0.
|
440 |
-
"accuracy_ci_low": 0.
|
441 |
-
"accuracy_ci_high": 0.
|
442 |
"score_name": "accuracy",
|
443 |
-
"score": 0.
|
444 |
-
"score_ci_high": 0.
|
445 |
-
"score_ci_low": 0.
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
-
"accuracy": 0.
|
450 |
-
"accuracy_ci_low": 0.
|
451 |
-
"accuracy_ci_high": 0.
|
452 |
"score_name": "accuracy",
|
453 |
-
"score": 0.
|
454 |
-
"score_ci_high": 0.
|
455 |
-
"score_ci_low": 0.
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
-
"accuracy": 0.
|
460 |
-
"accuracy_ci_low": 0.
|
461 |
-
"accuracy_ci_high": 0.
|
462 |
"score_name": "accuracy",
|
463 |
-
"score": 0.
|
464 |
-
"score_ci_high": 0.
|
465 |
-
"score_ci_low": 0.
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
-
"score": 0.
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
-
"f1_macro": 0.
|
475 |
-
"f1_suggestive": 0.
|
476 |
-
"f1_descriptive": 0.
|
477 |
"f1_generic": 0.11764705882352941,
|
478 |
-
"f1_fanciful": 0.
|
479 |
-
"f1_arbitrary": 0.
|
480 |
-
"f1_macro_ci_low": 0.
|
481 |
-
"f1_macro_ci_high": 0.
|
482 |
"score_name": "f1_micro",
|
483 |
-
"score": 0.
|
484 |
-
"score_ci_high": 0.
|
485 |
-
"score_ci_low": 0.
|
486 |
"num_of_instances": 85,
|
487 |
-
"accuracy": 0.
|
488 |
-
"accuracy_ci_low": 0.
|
489 |
-
"accuracy_ci_high": 0.
|
490 |
-
"f1_micro": 0.
|
491 |
-
"f1_micro_ci_low": 0.
|
492 |
-
"f1_micro_ci_high": 0.
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
-
"f1_macro": 0.
|
496 |
-
"f1_no": 0.
|
497 |
-
"f1_yes": 0.
|
498 |
-
"f1_macro_ci_low": 0.
|
499 |
-
"f1_macro_ci_high": 0.
|
500 |
"score_name": "f1_micro",
|
501 |
-
"score": 0.
|
502 |
-
"score_ci_high": 0.
|
503 |
-
"score_ci_low": 0.
|
504 |
"num_of_instances": 200,
[Removed (old-file) lines truncated by the diff viewer: the legal (legalbench_*), news_classification, product_help, qa_finance, rag_general, reasoning, safety and summarization subsets show only cut-off numbers such as "accuracy": 0. (one surviving value: "execution_accuracy_ci_high": 0.132 under fin_qa). The full replacement values appear in the added lines further below.]
@@ -810,473 +810,473 @@
[Removed (old-file) lines truncated by the diff viewer: the mt_flores_101_* translation subsets and the overall subsets_mean scores show only cut-off numbers. The full replacement values appear in the added lines below.]
1 |
{
|
2 |
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-23T06:53:01.281933Z",
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
178 |
"safety_bbq_age": {
|
179 |
"accuracy": 0.6444444444444445,
|
180 |
"accuracy_ci_low": 0.5444444444444444,
|
181 |
+
"accuracy_ci_high": 0.7384996290160605,
|
182 |
"score_name": "accuracy",
|
183 |
"score": 0.6444444444444445,
|
184 |
+
"score_ci_high": 0.7384996290160605,
|
185 |
"score_ci_low": 0.5444444444444444,
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.7111111111111111,
|
190 |
+
"accuracy_ci_low": 0.6,
|
191 |
"accuracy_ci_high": 0.8,
|
192 |
"score_name": "accuracy",
|
193 |
+
"score": 0.7111111111111111,
|
194 |
"score_ci_high": 0.8,
|
195 |
+
"score_ci_low": 0.6,
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
199 |
"accuracy": 0.9111111111111111,
|
200 |
+
"accuracy_ci_low": 0.8444444444444444,
|
201 |
"accuracy_ci_high": 0.9555555555555556,
|
202 |
"score_name": "accuracy",
|
203 |
"score": 0.9111111111111111,
|
204 |
"score_ci_high": 0.9555555555555556,
|
205 |
+
"score_ci_low": 0.8444444444444444,
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.7888888888888889,
|
210 |
+
"accuracy_ci_low": 0.7,
|
211 |
+
"accuracy_ci_high": 0.8666666666666667,
|
212 |
"score_name": "accuracy",
|
213 |
+
"score": 0.7888888888888889,
|
214 |
+
"score_ci_high": 0.8666666666666667,
|
215 |
+
"score_ci_low": 0.7,
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.7888888888888889,
|
220 |
+
"accuracy_ci_low": 0.6888888888888889,
|
221 |
+
"accuracy_ci_high": 0.8666666666666667,
|
222 |
"score_name": "accuracy",
|
223 |
+
"score": 0.7888888888888889,
|
224 |
+
"score_ci_high": 0.8666666666666667,
|
225 |
+
"score_ci_low": 0.6888888888888889,
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.9666666666666667,
|
230 |
+
"accuracy_ci_low": 0.9222222222222223,
|
231 |
+
"accuracy_ci_high": 0.9888888888888889,
|
232 |
"score_name": "accuracy",
|
233 |
+
"score": 0.9666666666666667,
|
234 |
+
"score_ci_high": 0.9888888888888889,
|
235 |
+
"score_ci_low": 0.9222222222222223,
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
|
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.8333333333333334,
|
260 |
+
"accuracy_ci_low": 0.7333333333333333,
|
261 |
+
"accuracy_ci_high": 0.9,
|
262 |
"score_name": "accuracy",
|
263 |
+
"score": 0.8333333333333334,
|
264 |
+
"score_ci_high": 0.9,
|
265 |
+
"score_ci_low": 0.7333333333333333,
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.7,
|
270 |
+
"accuracy_ci_low": 0.6,
|
271 |
+
"accuracy_ci_high": 0.7888888888888889,
|
272 |
"score_name": "accuracy",
|
273 |
+
"score": 0.7,
|
274 |
+
"score_ci_high": 0.7888888888888889,
|
275 |
+
"score_ci_low": 0.6,
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.8222222222222222,
|
280 |
+
"accuracy_ci_low": 0.7444444444444445,
|
281 |
+
"accuracy_ci_high": 0.8888888888888888,
|
282 |
"score_name": "accuracy",
|
283 |
+
"score": 0.8222222222222222,
|
284 |
+
"score_ci_high": 0.8888888888888888,
|
285 |
+
"score_ci_low": 0.7444444444444445,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
+
"score": 0.8222222222222222,
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.0625,
|
296 |
+
"score": 0.0625,
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
+
"score": 0.0625,
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.5798816568047337,
|
307 |
+
"f1_Organization": 0.42592592592592593,
|
308 |
+
"f1_Location": 0.40336134453781514,
|
309 |
+
"f1_macro": 0.46972297575615823,
|
310 |
+
"recall_macro": 0.40171664278500413,
|
311 |
+
"precision_macro": 0.5828611111111112,
|
312 |
+
"in_classes_support": 0.8064516129032258,
|
313 |
+
"f1_micro": 0.4343434343434343,
|
314 |
+
"recall_micro": 0.4095238095238095,
|
315 |
+
"precision_micro": 0.46236559139784944,
|
316 |
+
"score": 0.4343434343434343,
|
317 |
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.3767195996125236,
|
319 |
+
"score_ci_high": 0.4839388766009964,
|
320 |
+
"f1_micro_ci_low": 0.3767195996125236,
|
321 |
+
"f1_micro_ci_high": 0.4839388766009964
|
322 |
},
|
323 |
+
"score": 0.4343434343434343,
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.5492957746478874,
|
330 |
"accuracy_ci_low": 0.43661971830985913,
|
331 |
+
"accuracy_ci_high": 0.6619718309859155,
|
332 |
"score_name": "accuracy",
|
333 |
+
"score": 0.5492957746478874,
|
334 |
+
"score_ci_high": 0.6619718309859155,
|
335 |
"score_ci_low": 0.43661971830985913,
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.2112676056338028,
|
340 |
+
"accuracy_ci_low": 0.1267605633802817,
|
341 |
+
"accuracy_ci_high": 0.30985915492957744,
|
342 |
"score_name": "accuracy",
|
343 |
+
"score": 0.2112676056338028,
|
344 |
+
"score_ci_high": 0.30985915492957744,
|
345 |
+
"score_ci_low": 0.1267605633802817,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.2112676056338028,
|
350 |
+
"accuracy_ci_low": 0.1267605633802817,
|
351 |
+
"accuracy_ci_high": 0.323943661971831,
|
352 |
"score_name": "accuracy",
|
353 |
+
"score": 0.2112676056338028,
|
354 |
+
"score_ci_high": 0.323943661971831,
|
355 |
+
"score_ci_low": 0.1267605633802817,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.30985915492957744,
|
360 |
"accuracy_ci_low": 0.2112676056338028,
|
361 |
+
"accuracy_ci_high": 0.4225352112676056,
|
362 |
"score_name": "accuracy",
|
363 |
+
"score": 0.30985915492957744,
|
364 |
+
"score_ci_high": 0.4225352112676056,
|
365 |
"score_ci_low": 0.2112676056338028,
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.4225352112676056,
|
370 |
+
"accuracy_ci_low": 0.29577464788732394,
|
371 |
+
"accuracy_ci_high": 0.5352112676056338,
|
372 |
"score_name": "accuracy",
|
373 |
+
"score": 0.4225352112676056,
|
374 |
+
"score_ci_high": 0.5352112676056338,
|
375 |
+
"score_ci_low": 0.29577464788732394,
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.23943661971830985,
|
380 |
+
"accuracy_ci_low": 0.14084507042253522,
|
381 |
+
"accuracy_ci_high": 0.352112676056338,
|
382 |
"score_name": "accuracy",
|
383 |
+
"score": 0.23943661971830985,
|
384 |
+
"score_ci_high": 0.352112676056338,
|
385 |
+
"score_ci_low": 0.14084507042253522,
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.323943661971831,
|
390 |
+
"accuracy_ci_low": 0.2112676056338028,
|
391 |
+
"accuracy_ci_high": 0.43661971830985913,
|
392 |
"score_name": "accuracy",
|
393 |
+
"score": 0.323943661971831,
|
394 |
+
"score_ci_high": 0.43661971830985913,
|
395 |
+
"score_ci_low": 0.2112676056338028,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.4507042253521127,
|
400 |
+
"accuracy_ci_low": 0.3380281690140845,
|
401 |
+
"accuracy_ci_high": 0.5633802816901409,
|
402 |
"score_name": "accuracy",
|
403 |
+
"score": 0.4507042253521127,
|
404 |
+
"score_ci_high": 0.5633802816901409,
|
405 |
+
"score_ci_low": 0.3380281690140845,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
"accuracy": 0.3380281690140845,
|
410 |
"accuracy_ci_low": 0.23943661971830985,
|
411 |
+
"accuracy_ci_high": 0.4393434853289757,
|
412 |
"score_name": "accuracy",
|
413 |
"score": 0.3380281690140845,
|
414 |
+
"score_ci_high": 0.4393434853289757,
|
415 |
"score_ci_low": 0.23943661971830985,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.2112676056338028,
|
420 |
+
"accuracy_ci_low": 0.1267605633802817,
|
421 |
+
"accuracy_ci_high": 0.30985915492957744,
|
422 |
"score_name": "accuracy",
|
423 |
+
"score": 0.2112676056338028,
|
424 |
+
"score_ci_high": 0.30985915492957744,
|
425 |
+
"score_ci_low": 0.1267605633802817,
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.29577464788732394,
|
430 |
+
"accuracy_ci_low": 0.19718309859154928,
|
431 |
+
"accuracy_ci_high": 0.4084507042253521,
|
432 |
"score_name": "accuracy",
|
433 |
+
"score": 0.29577464788732394,
|
434 |
+
"score_ci_high": 0.4084507042253521,
|
435 |
+
"score_ci_low": 0.19718309859154928,
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.4084507042253521,
|
440 |
+
"accuracy_ci_low": 0.2885703240152898,
|
441 |
+
"accuracy_ci_high": 0.5211267605633803,
|
442 |
"score_name": "accuracy",
|
443 |
+
"score": 0.4084507042253521,
|
444 |
+
"score_ci_high": 0.5211267605633803,
|
445 |
+
"score_ci_low": 0.2885703240152898,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.2112676056338028,
|
450 |
+
"accuracy_ci_low": 0.1267605633802817,
|
451 |
+
"accuracy_ci_high": 0.30985915492957744,
|
452 |
"score_name": "accuracy",
|
453 |
+
"score": 0.2112676056338028,
|
454 |
+
"score_ci_high": 0.30985915492957744,
|
455 |
+
"score_ci_low": 0.1267605633802817,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.5492957746478874,
|
460 |
+
"accuracy_ci_low": 0.43661971830985913,
|
461 |
+
"accuracy_ci_high": 0.672415960906933,
|
462 |
"score_name": "accuracy",
|
463 |
+
"score": 0.5492957746478874,
|
464 |
+
"score_ci_high": 0.672415960906933,
|
465 |
+
"score_ci_low": 0.43661971830985913,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
+
"score": 0.3380281690140845,
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.29746332099273276,
|
475 |
+
"f1_suggestive": 0.0,
|
476 |
+
"f1_descriptive": 0.36363636363636365,
|
477 |
"f1_generic": 0.11764705882352941,
|
478 |
+
"f1_fanciful": 0.6470588235294118,
|
479 |
+
"f1_arbitrary": 0.358974358974359,
|
480 |
+
"f1_macro_ci_low": 0.2234746436877424,
|
481 |
+
"f1_macro_ci_high": 0.3820540135751509,
|
482 |
"score_name": "f1_micro",
|
483 |
+
"score": 0.3493975903614458,
|
484 |
+
"score_ci_high": 0.45121951219512196,
|
485 |
+
"score_ci_low": 0.25149700598802394,
|
486 |
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.3411764705882353,
|
488 |
+
"accuracy_ci_low": 0.24705882352941178,
|
489 |
+
"accuracy_ci_high": 0.4470588235294118,
|
490 |
+
"f1_micro": 0.3493975903614458,
|
491 |
+
"f1_micro_ci_low": 0.25149700598802394,
|
492 |
+
"f1_micro_ci_high": 0.45121951219512196
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.6415770609318996,
|
496 |
+
"f1_no": 0.8387096774193549,
|
497 |
+
"f1_yes": 0.4444444444444444,
|
498 |
+
"f1_macro_ci_low": 0.5666801252929456,
|
499 |
+
"f1_macro_ci_high": 0.7176297030965157,
|
500 |
"score_name": "f1_micro",
|
501 |
+
"score": 0.75,
|
502 |
+
"score_ci_high": 0.805,
|
503 |
+
"score_ci_low": 0.685,
|
504 |
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.75,
|
506 |
+
"accuracy_ci_low": 0.685,
|
507 |
+
"accuracy_ci_high": 0.805,
|
508 |
+
"f1_micro": 0.75,
|
509 |
+
"f1_micro_ci_low": 0.685,
|
510 |
+
"f1_micro_ci_high": 0.805
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.2937287351505673,
|
514 |
+
"f1_conclusion": 0.16,
|
515 |
+
"f1_issue": 0.3291139240506329,
|
516 |
+
"f1_decree": 0.24242424242424243,
|
517 |
+
"f1_rule": 0.4931506849315068,
|
518 |
+
"f1_analysis": 0.2916666666666667,
|
519 |
+
"f1_facts": 0.21621621621621623,
|
520 |
+
"f1_procedural history": 0.3235294117647059,
|
521 |
+
"f1_macro_ci_low": 0.2356167023599295,
|
522 |
+
"f1_macro_ci_high": 0.3627174769966993,
|
523 |
"score_name": "f1_micro",
|
524 |
+
"score": 0.31443298969072164,
|
525 |
+
"score_ci_high": 0.37945181171815084,
|
526 |
+
"score_ci_low": 0.24415584415584415,
|
527 |
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.305,
|
529 |
+
"accuracy_ci_low": 0.23726030718429333,
|
530 |
+
"accuracy_ci_high": 0.37,
|
531 |
+
"f1_micro": 0.31443298969072164,
|
532 |
+
"f1_micro_ci_low": 0.24415584415584415,
|
533 |
+
"f1_micro_ci_high": 0.37945181171815084
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.44871725481706,
|
537 |
+
"f1_yes": 0.5887096774193549,
|
538 |
+
"f1_no": 0.3087248322147651,
|
539 |
+
"f1_macro_ci_low": 0.3839275870787324,
|
540 |
+
"f1_macro_ci_high": 0.5191612607559799,
|
541 |
"score_name": "f1_micro",
|
542 |
+
"score": 0.4836272040302267,
|
543 |
+
"score_ci_high": 0.5505050505050505,
|
544 |
+
"score_ci_low": 0.4120603015075377,
|
545 |
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.48,
|
547 |
+
"accuracy_ci_low": 0.4083713252748318,
|
548 |
+
"accuracy_ci_high": 0.545,
|
549 |
+
"f1_micro": 0.4836272040302267,
|
550 |
+
"f1_micro_ci_low": 0.4120603015075377,
|
551 |
+
"f1_micro_ci_high": 0.5505050505050505
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.8044662309368191,
|
555 |
+
"f1_yes": 0.7941176470588235,
|
556 |
+
"f1_no": 0.8148148148148148,
|
557 |
+
"f1_macro_ci_low": 0.7223270079060395,
|
558 |
+
"f1_macro_ci_high": 0.87627946340442,
|
559 |
"score_name": "f1_micro",
|
560 |
+
"score": 0.8053691275167785,
|
561 |
+
"score_ci_high": 0.8717948717948718,
|
562 |
+
"score_ci_low": 0.7140882327681733,
|
563 |
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.7058823529411765,
|
565 |
+
"accuracy_ci_low": 0.6,
|
566 |
+
"accuracy_ci_high": 0.8,
|
567 |
+
"f1_micro": 0.8053691275167785,
|
568 |
+
"f1_micro_ci_low": 0.7140882327681733,
|
569 |
+
"f1_micro_ci_high": 0.8717948717948718
|
570 |
},
|
571 |
+
"score": 0.5405653823198345,
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.4772436601851055,
|
578 |
+
"f1_cars": 0.7640449438202247,
|
579 |
+
"f1_pc hardware": 0.3557692307692308,
|
580 |
+
"f1_windows x": 0.028985507246376812,
|
581 |
+
"f1_electronics": 0.48717948717948717,
|
582 |
+
"f1_atheism": 0.20408163265306123,
|
583 |
+
"f1_politics": 0.34355828220858897,
|
584 |
+
"f1_religion": 0.2708333333333333,
|
585 |
+
"f1_medicine": 0.7948717948717948,
|
586 |
+
"f1_christianity": 0.4166666666666667,
|
587 |
+
"f1_for sale": 0.6067415730337079,
|
588 |
+
"f1_computer graphics": 0.42016806722689076,
|
589 |
+
"f1_microsoft windows": 0.25806451612903225,
|
590 |
+
"f1_middle east": 0.49382716049382713,
|
591 |
+
"f1_motorcycles": 0.6666666666666666,
|
592 |
+
"f1_mac hardware": 0.25,
|
593 |
+
"f1_guns": 0.23728813559322035,
|
594 |
+
"f1_space": 0.717391304347826,
|
595 |
+
"f1_cryptography": 0.5230769230769231,
|
596 |
+
"f1_baseball": 0.8461538461538461,
|
597 |
+
"f1_hockey": 0.859504132231405,
|
598 |
+
"f1_macro_ci_low": 0.45194761799386507,
|
599 |
+
"f1_macro_ci_high": 0.5063130462647102,
|
600 |
"score_name": "f1_micro",
|
601 |
+
"score": 0.49115281501340485,
|
602 |
+
"score_ci_high": 0.5196912105086561,
|
603 |
+
"score_ci_low": 0.4585932126016045,
|
604 |
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.458,
|
606 |
+
"accuracy_ci_low": 0.427,
|
607 |
+
"accuracy_ci_high": 0.4864735442740007,
|
608 |
+
"f1_micro": 0.49115281501340485,
|
609 |
+
"f1_micro_ci_low": 0.4585932126016045,
|
610 |
+
"f1_micro_ci_high": 0.5196912105086561
|
611 |
},
|
612 |
+
"score": 0.49115281501340485,
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.5988009590549132,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9113372093023255,
|
620 |
+
"f1_checking or savings account": 0.5542168674698795,
|
621 |
+
"f1_debt collection": 0.4779874213836478,
|
622 |
+
"f1_credit card or prepaid card": 0.6370370370370371,
|
623 |
+
"f1_mortgage": 0.7397260273972602,
|
624 |
+
"f1_student loan": 0.8461538461538461,
|
625 |
+
"f1_money transfer or virtual currency or money service": 0.4864864864864865,
|
626 |
+
"f1_vehicle loan or lease": 0.42857142857142855,
|
627 |
+
"f1_payday loan or title loan or personal loan": 0.3076923076923077,
|
628 |
+
"f1_macro_ci_low": 0.550125163696031,
|
629 |
+
"f1_macro_ci_high": 0.6692920824665255,
|
630 |
"score_name": "f1_micro",
|
631 |
+
"score": 0.8145077720207254,
|
632 |
+
"score_ci_high": 0.8367924066551193,
|
633 |
+
"score_ci_low": 0.7900784551279257,
|
634 |
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.786,
|
636 |
+
"accuracy_ci_low": 0.76,
|
637 |
+
"accuracy_ci_high": 0.81,
|
638 |
+
"f1_micro": 0.8145077720207254,
|
639 |
+
"f1_micro_ci_low": 0.7900784551279257,
|
640 |
+
"f1_micro_ci_high": 0.8367924066551193
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.6991553168902936,
|
644 |
+
"f1_mortgages and loans": 0.8228571428571428,
|
645 |
+
"f1_credit card": 0.7428571428571429,
|
646 |
+
"f1_debt collection": 0.6116504854368932,
|
647 |
+
"f1_credit reporting": 0.7211895910780669,
|
648 |
+
"f1_retail banking": 0.5972222222222222,
|
649 |
+
"f1_macro_ci_low": 0.6611649815931737,
|
650 |
+
"f1_macro_ci_high": 0.7441131702771507,
|
651 |
"score_name": "f1_micro",
|
652 |
+
"score": 0.7017543859649122,
|
653 |
+
"score_ci_high": 0.7444878377150386,
|
654 |
+
"score_ci_low": 0.6639049566735055,
|
655 |
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.68,
|
657 |
+
"accuracy_ci_low": 0.6415834821537145,
|
658 |
+
"accuracy_ci_high": 0.7250870857804175,
|
659 |
+
"f1_micro": 0.7017543859649122,
|
660 |
+
"f1_micro_ci_low": 0.6639049566735055,
|
661 |
+
"f1_micro_ci_high": 0.7444878377150386
|
662 |
},
|
663 |
+
"score": 0.7581310789928188,
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
+
"execution_accuracy": 0.113,
|
671 |
+
"program_accuracy": 0.137,
|
672 |
+
"score": 0.137,
|
673 |
"score_name": "program_accuracy",
|
674 |
+
"execution_accuracy_ci_low": 0.094,
|
675 |
+
"execution_accuracy_ci_high": 0.133,
|
676 |
+
"program_accuracy_ci_low": 0.115,
|
677 |
+
"program_accuracy_ci_high": 0.159,
|
678 |
+
"score_ci_low": 0.115,
|
679 |
+
"score_ci_high": 0.159
|
|
|
680 |
},
|
681 |
+
"score": 0.137,
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3510702505438354,
|
688 |
+
"recall": 0.5809699679510664,
|
689 |
+
"f1": 0.3824309995878018,
|
690 |
+
"precision_ci_low": 0.33249065458630267,
|
691 |
+
"precision_ci_high": 0.37203127312002804,
|
692 |
+
"recall_ci_low": 0.5641121374917414,
|
693 |
+
"recall_ci_high": 0.5988884760579221,
|
694 |
+
"f1_ci_low": 0.3661582243989197,
|
695 |
+
"f1_ci_high": 0.4007102293320155,
|
696 |
"score_name": "f1",
|
697 |
+
"score": 0.3824309995878018,
|
698 |
+
"score_ci_high": 0.4007102293320155,
|
699 |
+
"score_ci_low": 0.3661582243989197,
|
700 |
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6284164202213287,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.7022818158070246,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5862240339815616,
|
704 |
+
"faithfullness_f1_token_overlap": 0.32867960701983034,
|
705 |
+
"faithfullness_recall_token_overlap": 0.26185727010665827,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5811151625996505,
|
707 |
+
"correctness_f1_token_overlap": 0.3824309995878018,
|
708 |
+
"correctness_recall_token_overlap": 0.5809699679510664,
|
709 |
+
"correctness_precision_token_overlap": 0.3510702505438354
|
710 |
},
|
711 |
+
"score": 0.3824309995878018,
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
+
"accuracy": 0.479,
|
718 |
+
"accuracy_ci_low": 0.448,
|
719 |
+
"accuracy_ci_high": 0.5103198452743031,
|
720 |
"score_name": "accuracy",
|
721 |
+
"score": 0.479,
|
722 |
+
"score_ci_high": 0.5103198452743031,
|
723 |
+
"score_ci_low": 0.448,
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
+
"accuracy": 0.818,
|
728 |
+
"accuracy_ci_low": 0.7804131917525572,
|
729 |
+
"accuracy_ci_high": 0.85,
|
730 |
"score_name": "accuracy",
|
731 |
+
"score": 0.818,
|
732 |
+
"score_ci_high": 0.85,
|
733 |
+
"score_ci_low": 0.7804131917525572,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
+
"score": 0.6485,
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
+
"severity_critical": 0.0,
|
743 |
+
"severity_high": 0.0,
|
744 |
+
"severity_medium": 0.0,
|
745 |
+
"severity_low": 100.0,
|
746 |
+
"category_harmful_info": 0.8486059237981127,
|
747 |
+
"category_deception": 0.8698270569997605,
|
748 |
+
"category_explicit_content": 0.8145937094032161,
|
749 |
+
"category_violence": 0.8388658185134685,
|
750 |
+
"category_discrimination": 0.8823170511732025,
|
751 |
+
"category_substance_abuse": 0.9462624288267559,
|
752 |
+
"category_pii": 0.8776307304700216,
|
753 |
+
"safety": 0.8519032163850758,
|
754 |
+
"safety_ci_low": 0.8310657418019435,
|
755 |
+
"safety_ci_high": 0.8712351615904984,
|
756 |
"score_name": "safety",
|
757 |
+
"score": 0.8519032163850758,
|
758 |
+
"score_ci_high": 0.8712351615904984,
|
759 |
+
"score_ci_low": 0.8310657418019435,
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
+
"score": 0.8519032163850758,
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
+
"rougeLsum": 0.3610679948661998,
|
770 |
+
"rouge1": 0.4233619373045491,
|
771 |
+
"rougeL": 0.29420815957168495,
|
772 |
+
"score": 0.29420815957168495,
|
773 |
"score_name": "rougeL",
|
774 |
+
"rouge2": 0.20840228792560456,
|
775 |
+
"rougeLsum_ci_low": 0.35255104965168327,
|
776 |
+
"rougeLsum_ci_high": 0.36915628751369906,
|
777 |
+
"rouge1_ci_low": 0.4141157245136807,
|
778 |
+
"rouge1_ci_high": 0.43171361247266377,
|
779 |
+
"rougeL_ci_low": 0.28665554382409086,
|
780 |
+
"rougeL_ci_high": 0.30100667120780134,
|
781 |
+
"score_ci_low": 0.28665554382409086,
|
782 |
+
"score_ci_high": 0.30100667120780134,
|
783 |
+
"rouge2_ci_low": 0.20121549945064432,
|
784 |
+
"rouge2_ci_high": 0.21553750893087562
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
+
"rougeLsum": 0.09608294803448246,
|
789 |
+
"rouge1": 0.11601113348984893,
|
790 |
+
"rougeL": 0.08329710711031496,
|
791 |
+
"score": 0.08329710711031496,
|
792 |
"score_name": "rougeL",
|
793 |
+
"rouge2": 0.01614281525612853,
|
794 |
+
"rougeLsum_ci_low": 0.09161955792928417,
|
795 |
+
"rougeLsum_ci_high": 0.10006888471086645,
|
796 |
+
"rouge1_ci_low": 0.1103956147665806,
|
797 |
+
"rouge1_ci_high": 0.12113815092736294,
|
798 |
+
"rougeL_ci_low": 0.07939906960390719,
|
799 |
+
"rougeL_ci_high": 0.08668886729552314,
|
800 |
+
"score_ci_low": 0.07939906960390719,
|
801 |
+
"score_ci_high": 0.08668886729552314,
|
802 |
+
"rouge2_ci_low": 0.014303052357088147,
|
803 |
+
"rouge2_ci_high": 0.01823150788885683
|
804 |
},
|
805 |
+
"score": 0.18875263334099995,
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
+
1127,
|
814 |
+
611,
|
815 |
+
358,
|
816 |
+
217
|
817 |
],
|
818 |
"totals": [
|
819 |
+
1857,
|
820 |
+
1791,
|
821 |
+
1725,
|
822 |
+
1659
|
823 |
],
|
824 |
"precisions": [
|
825 |
+
0.6068928379106086,
|
826 |
+
0.3411501954215522,
|
827 |
+
0.20753623188405798,
|
828 |
+
0.13080168776371306
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
+
"sys_len": 1857,
|
832 |
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.27380490753896447,
|
834 |
+
"score": 0.27380490753896447,
|
835 |
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.24139134438987545,
|
837 |
+
"score_ci_high": 0.3114086803649994,
|
838 |
+
"sacrebleu_ci_low": 0.24139134438987545,
|
839 |
+
"sacrebleu_ci_high": 0.3114086803649994
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
+
1217,
|
845 |
+
742,
|
846 |
+
486,
|
847 |
+
324
|
848 |
],
|
849 |
"totals": [
|
850 |
+
1805,
|
851 |
+
1739,
|
852 |
+
1673,
|
853 |
+
1607
|
854 |
],
|
855 |
"precisions": [
|
856 |
+
0.6742382271468144,
|
857 |
+
0.42668200115008625,
|
858 |
+
0.2904961147638972,
|
859 |
+
0.20161792159303052
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
+
"sys_len": 1805,
|
863 |
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.36028550442407303,
|
865 |
+
"score": 0.36028550442407303,
|
866 |
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.3120172829600809,
|
868 |
+
"score_ci_high": 0.4057887928505002,
|
869 |
+
"sacrebleu_ci_low": 0.3120172829600809,
|
870 |
+
"sacrebleu_ci_high": 0.4057887928505002
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
+
672,
|
876 |
+
256,
|
877 |
+
123,
|
878 |
+
56
|
879 |
],
|
880 |
"totals": [
|
881 |
+
1845,
|
882 |
+
1779,
|
883 |
+
1713,
|
884 |
+
1647
|
885 |
],
|
886 |
"precisions": [
|
887 |
+
0.36422764227642274,
|
888 |
+
0.14390106801573918,
|
889 |
+
0.07180385288966726,
|
890 |
+
0.03400121432908318
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
+
"sys_len": 1845,
|
894 |
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.10635790496521375,
|
896 |
+
"score": 0.10635790496521375,
|
897 |
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.0652406937888321,
|
899 |
+
"score_ci_high": 0.1319350359039831,
|
900 |
+
"sacrebleu_ci_low": 0.0652406937888321,
|
901 |
+
"sacrebleu_ci_high": 0.1319350359039831
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
+
1077,
|
907 |
+
569,
|
908 |
+
326,
|
909 |
+
185
|
910 |
],
|
911 |
"totals": [
|
912 |
+
1845,
|
913 |
+
1779,
|
914 |
+
1713,
|
915 |
+
1647
|
916 |
],
|
917 |
"precisions": [
|
918 |
+
0.583739837398374,
|
919 |
+
0.3198426082068578,
|
920 |
+
0.19030939871570343,
|
921 |
+
0.11232544019429266
|
922 |
],
|
923 |
"bp": 1.0,
|
924 |
+
"sys_len": 1845,
|
925 |
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.25134688330821237,
|
927 |
+
"score": 0.25134688330821237,
|
928 |
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.2195832990093629,
|
930 |
+
"score_ci_high": 0.2837968314094506,
|
931 |
+
"sacrebleu_ci_low": 0.2195832990093629,
|
932 |
+
"sacrebleu_ci_high": 0.2837968314094506
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
+
1424,
|
938 |
+
986,
|
939 |
+
714,
|
940 |
+
526
|
941 |
],
|
942 |
"totals": [
|
943 |
+
1999,
|
944 |
+
1933,
|
945 |
+
1867,
|
946 |
+
1801
|
947 |
],
|
948 |
"precisions": [
|
949 |
+
0.7123561780890445,
|
950 |
+
0.5100879461976203,
|
951 |
+
0.3824317086234601,
|
952 |
+
0.29205996668517487
|
953 |
],
|
954 |
+
"bp": 0.9660716664698304,
|
955 |
+
"sys_len": 1999,
|
956 |
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.4336120976233934,
|
958 |
+
"score": 0.4336120976233934,
|
959 |
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.39952659045667155,
|
961 |
+
"score_ci_high": 0.47193262493606236,
|
962 |
+
"sacrebleu_ci_low": 0.39952659045667155,
|
963 |
+
"sacrebleu_ci_high": 0.47193262493606236
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
+
1019,
|
969 |
+
419,
|
970 |
+
213,
|
971 |
+
113
|
972 |
],
|
973 |
"totals": [
|
974 |
+
3749,
|
975 |
+
3683,
|
976 |
+
3617,
|
977 |
+
3552
|
978 |
],
|
979 |
"precisions": [
|
980 |
+
0.27180581488396904,
|
981 |
+
0.11376595166983437,
|
982 |
+
0.0588885816975394,
|
983 |
+
0.031813063063063064
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
+
"sys_len": 3749,
|
987 |
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.08724225995783678,
|
989 |
+
"score": 0.08724225995783678,
|
990 |
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.06811801818711148,
|
992 |
+
"score_ci_high": 0.11195945404539422,
|
993 |
+
"sacrebleu_ci_low": 0.06811801818711148,
|
994 |
+
"sacrebleu_ci_high": 0.11195945404539422
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
+
1341,
|
1000 |
+
889,
|
1001 |
+
634,
|
1002 |
+
457
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
+
1879,
|
1006 |
+
1813,
|
1007 |
+
1747,
|
1008 |
+
1681
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
+
0.7136774880255454,
|
1012 |
+
0.4903474903474903,
|
1013 |
+
0.36290784201488263,
|
1014 |
+
0.2718619869125521
|
1015 |
],
|
1016 |
+
"bp": 0.9805012826642417,
|
1017 |
+
"sys_len": 1879,
|
1018 |
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.4226548575605273,
|
1020 |
+
"score": 0.4226548575605273,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.3804303884338436,
|
1023 |
+
"score_ci_high": 0.46259367891162306,
|
1024 |
+
"sacrebleu_ci_low": 0.3804303884338436,
|
1025 |
+
"sacrebleu_ci_high": 0.46259367891162306
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
966,
|
1031 |
+
465,
|
1032 |
+
258,
|
1033 |
+
149
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
+
2330,
|
1037 |
+
2264,
|
1038 |
+
2198,
|
1039 |
+
2132
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
+
0.4145922746781116,
|
1043 |
+
0.20538869257950532,
|
1044 |
+
0.11737943585077343,
|
1045 |
+
0.0698874296435272
|
1046 |
],
|
1047 |
"bp": 1.0,
|
1048 |
+
"sys_len": 2330,
|
1049 |
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.1625725453572352,
|
1051 |
+
"score": 0.1625725453572352,
|
1052 |
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.11669140677951717,
|
1054 |
+
"score_ci_high": 0.20412983752636973,
|
1055 |
+
"sacrebleu_ci_low": 0.11669140677951717,
|
1056 |
+
"sacrebleu_ci_high": 0.20412983752636973
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
+
1199,
|
1062 |
+
637,
|
1063 |
+
360,
|
1064 |
+
199
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
+
1973,
|
1068 |
+
1907,
|
1069 |
+
1841,
|
1070 |
+
1775
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
+
0.607704004054739,
|
1074 |
+
0.3340325117986366,
|
1075 |
+
0.19554589896795221,
|
1076 |
+
0.11211267605633803
|
1077 |
],
|
1078 |
+
"bp": 0.9386099296136466,
|
1079 |
+
"sys_len": 1973,
|
1080 |
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.2424271251773898,
|
1082 |
+
"score": 0.2424271251773898,
|
1083 |
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.21260065622080154,
|
1085 |
+
"score_ci_high": 0.26696534058145066,
|
1086 |
+
"sacrebleu_ci_low": 0.21260065622080154,
|
1087 |
+
"sacrebleu_ci_high": 0.26696534058145066
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
+
1270,
|
1093 |
+
795,
|
1094 |
+
520,
|
1095 |
+
348
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
+
1847,
|
1099 |
+
1781,
|
1100 |
+
1715,
|
1101 |
+
1649
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
+
0.6876015159718462,
|
1105 |
+
0.446378439079169,
|
1106 |
+
0.3032069970845481,
|
1107 |
+
0.2110369921164342
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
+
"sys_len": 1847,
|
1111 |
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.37435570897744036,
|
1113 |
+
"score": 0.37435570897744036,
|
1114 |
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.33656443265221864,
|
1116 |
+
"score_ci_high": 0.4099554772696377,
|
1117 |
+
"sacrebleu_ci_low": 0.33656443265221864,
|
1118 |
+
"sacrebleu_ci_high": 0.4099554772696377
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
+
988,
|
1124 |
+
434,
|
1125 |
+
239,
|
1126 |
140
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
+
1874,
|
1130 |
+
1808,
|
1131 |
+
1742,
|
1132 |
+
1676
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
+
0.5272145144076841,
|
1136 |
+
0.24004424778761063,
|
1137 |
+
0.13719862227324914,
|
1138 |
+
0.08353221957040573
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
+
"sys_len": 1874,
|
1142 |
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.19515092235944087,
|
1144 |
+
"score": 0.19515092235944087,
|
1145 |
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.16674417267909872,
|
1147 |
+
"score_ci_high": 0.231444565320084,
|
1148 |
+
"sacrebleu_ci_low": 0.16674417267909872,
|
1149 |
+
"sacrebleu_ci_high": 0.231444565320084
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
+
976,
|
1155 |
+
440,
|
1156 |
+
232,
|
1157 |
+
128
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
+
1841,
|
1161 |
+
1775,
|
1162 |
+
1709,
|
1163 |
+
1643
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
+
0.530146659424226,
|
1167 |
+
0.24788732394366197,
|
1168 |
+
0.13575190169689877,
|
1169 |
+
0.07790626902008521
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
+
"sys_len": 1841,
|
1173 |
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.19308216913928786,
|
1175 |
+
"score": 0.19308216913928786,
|
1176 |
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.1691283364865516,
|
1178 |
+
"score_ci_high": 0.22228985058810502,
|
1179 |
+
"sacrebleu_ci_low": 0.1691283364865516,
|
1180 |
+
"sacrebleu_ci_high": 0.22228985058810502
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
+
1260,
|
1186 |
+
815,
|
1187 |
+
565,
|
1188 |
+
399
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
+
1793,
|
1192 |
+
1727,
|
1193 |
+
1661,
|
1194 |
+
1595
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
+
0.7027328499721137,
|
1198 |
+
0.4719166184134337,
|
1199 |
+
0.34015653220951236,
|
1200 |
+
0.2501567398119122
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
+
"sys_len": 1793,
|
1204 |
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.4098610398858089,
|
1206 |
+
"score": 0.4098610398858089,
|
1207 |
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.3653913364652719,
|
1209 |
+
"score_ci_high": 0.46316065930620326,
|
1210 |
+
"sacrebleu_ci_low": 0.3653913364652719,
|
1211 |
+
"sacrebleu_ci_high": 0.46316065930620326
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
+
1227,
|
1217 |
729,
|
1218 |
+
472,
|
1219 |
+
294
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
+
1830,
|
1223 |
+
1764,
|
1224 |
+
1698,
|
1225 |
+
1632
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
+
0.6704918032786885,
|
1229 |
+
0.41326530612244894,
|
1230 |
+
0.2779740871613663,
|
1231 |
+
0.1801470588235294
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
+
"sys_len": 1830,
|
1235 |
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.343212800768137,
|
1237 |
+
"score": 0.343212800768137,
|
1238 |
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.3002026232607091,
|
1240 |
+
"score_ci_high": 0.3993584334850746,
|
1241 |
+
"sacrebleu_ci_low": 0.3002026232607091,
|
1242 |
+
"sacrebleu_ci_high": 0.3993584334850746
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
+
1100,
|
1248 |
+
577,
|
1249 |
+
339,
|
1250 |
+
205
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
+
1824,
|
1254 |
+
1758,
|
1255 |
+
1692,
|
1256 |
+
1626
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
+
0.6030701754385965,
|
1260 |
+
0.3282138794084187,
|
1261 |
+
0.200354609929078,
|
1262 |
+
0.12607626076260764
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
+
"sys_len": 1824,
|
1266 |
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.2659128735671552,
|
1268 |
+
"score": 0.2659128735671552,
|
1269 |
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.23124038640104372,
|
1271 |
+
"score_ci_high": 0.30957401763446213,
|
1272 |
+
"sacrebleu_ci_low": 0.23124038640104372,
|
1273 |
+
"sacrebleu_ci_high": 0.30957401763446213
|
1274 |
},
|
1275 |
+
"score": 0.27479197337400774,
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
+
"score": 0.4561786095841296,
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
results/bluebench/{2025-06-19T16-09-06_evaluation_results.json → 2025-06-23T03-17-57_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
-
"timestamp_utc": "2025-06-
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
@@ -42,7 +42,7 @@
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
-
"unitxt_commit_hash": "
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
@@ -176,633 +176,633 @@
|
|
[Removed (old-file) lines truncated by the diff viewer: the bias (safety_bbq_*), chatbot_abilities, entity_extraction and knowledge (mmlu_pro_*) subsets show only cut-off numbers; the corresponding full values for this file appear in its added lines.]
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
-
"f1_macro": 0.
|
475 |
-
"f1_suggestive": 0.
|
476 |
-
"
|
477 |
-
"
|
478 |
-
"
|
479 |
-
"
|
480 |
-
"f1_macro_ci_low": 0.
|
481 |
-
"f1_macro_ci_high": 0.
|
482 |
"score_name": "f1_micro",
|
483 |
-
"score": 0.
|
484 |
-
"score_ci_high": 0.
|
485 |
-
"score_ci_low": 0.
|
486 |
"num_of_instances": 85,
|
487 |
-
"accuracy": 0.
|
488 |
-
"accuracy_ci_low": 0.
|
489 |
-
"accuracy_ci_high": 0.
|
490 |
-
"f1_micro": 0.
|
491 |
-
"f1_micro_ci_low": 0.
|
492 |
-
"f1_micro_ci_high": 0.
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
-
"f1_macro": 0.
|
496 |
-
"f1_no": 0.
|
497 |
-
"f1_yes": 0.
|
498 |
-
"f1_macro_ci_low": 0.
|
499 |
-
"f1_macro_ci_high": 0.
|
500 |
"score_name": "f1_micro",
|
501 |
"score": 0.6445012787723785,
|
502 |
-
"score_ci_high": 0.
|
503 |
-
"score_ci_low": 0.
|
504 |
"num_of_instances": 200,
|
505 |
"accuracy": 0.63,
|
506 |
-
"accuracy_ci_low": 0.
|
507 |
"accuracy_ci_high": 0.69,
|
508 |
"f1_micro": 0.6445012787723785,
|
509 |
-
"f1_micro_ci_low": 0.
|
510 |
-
"f1_micro_ci_high": 0.
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
-
"f1_macro": 0.
|
514 |
-
"f1_conclusion": 0.
|
515 |
-
"
|
516 |
-
"
|
517 |
-
"
|
518 |
-
"
|
519 |
-
"
|
520 |
-
"f1_rule": 0.
|
521 |
-
"f1_macro_ci_low": 0.
|
522 |
-
"f1_macro_ci_high": 0.
|
523 |
"score_name": "f1_micro",
|
524 |
-
"score": 0.
|
525 |
-
"score_ci_high": 0.
|
526 |
-
"score_ci_low": 0.
|
527 |
"num_of_instances": 200,
|
528 |
-
"accuracy": 0.
|
529 |
-
"accuracy_ci_low": 0.
|
530 |
-
"accuracy_ci_high": 0.
|
531 |
-
"f1_micro": 0.
|
532 |
-
"f1_micro_ci_low": 0.
|
533 |
-
"f1_micro_ci_high": 0.
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
-
"f1_macro": 0.
|
537 |
-
"f1_yes": 0.
|
538 |
-
"f1_no": 0.
|
539 |
-
"f1_macro_ci_low": 0.
|
540 |
-
"f1_macro_ci_high": 0.
|
541 |
"score_name": "f1_micro",
|
542 |
-
"score": 0.
|
543 |
-
"score_ci_high": 0.
|
544 |
-
"score_ci_low": 0.
|
545 |
"num_of_instances": 200,
|
546 |
-
"accuracy": 0.
|
547 |
-
"accuracy_ci_low": 0.
|
548 |
-
"accuracy_ci_high": 0.
|
549 |
-
"f1_micro": 0.
|
550 |
-
"f1_micro_ci_low": 0.
|
551 |
-
"f1_micro_ci_high": 0.
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
-
"f1_macro": 0.
|
555 |
-
"f1_yes": 0.
|
556 |
-
"f1_no": 0.
|
557 |
-
"f1_macro_ci_low": 0.
|
558 |
-
"f1_macro_ci_high": 0.
|
559 |
"score_name": "f1_micro",
|
560 |
-
"score": 0.
|
561 |
-
"score_ci_high": 0.
|
562 |
-
"score_ci_low": 0.
|
563 |
"num_of_instances": 85,
|
564 |
-
"accuracy": 0.
|
565 |
-
"accuracy_ci_low": 0.
|
566 |
-
"accuracy_ci_high": 0.
|
567 |
-
"f1_micro": 0.
|
568 |
-
"f1_micro_ci_low": 0.
|
569 |
-
"f1_micro_ci_high": 0.
|
570 |
},
|
571 |
-
"score": 0.
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
-
"f1_macro": 0.
|
578 |
-
"f1_cars": 0.
|
579 |
-
"f1_windows x": 0.
|
580 |
-
"f1_atheism": 0.
|
581 |
-
"f1_christianity": 0.
|
582 |
-
"f1_religion": 0.
|
583 |
-
"f1_medicine": 0.
|
584 |
-
"f1_computer graphics": 0.
|
585 |
-
"f1_microsoft windows": 0.
|
586 |
-
"f1_middle east": 0.
|
587 |
-
"f1_politics": 0.
|
588 |
-
"f1_motorcycles": 0.
|
589 |
-
"
|
590 |
-
"
|
591 |
-
"
|
592 |
-
"f1_for sale": 0.
|
593 |
-
"f1_guns": 0.
|
594 |
-
"f1_space": 0.
|
595 |
-
"f1_cryptography": 0.
|
596 |
-
"
|
597 |
-
"
|
598 |
-
"f1_macro_ci_low": 0.
|
599 |
-
"f1_macro_ci_high": 0.
|
600 |
"score_name": "f1_micro",
|
601 |
-
"score": 0.
|
602 |
-
"score_ci_high": 0.
|
603 |
-
"score_ci_low": 0.
|
604 |
"num_of_instances": 1000,
|
605 |
-
"accuracy": 0.
|
606 |
-
"accuracy_ci_low": 0.
|
607 |
-
"accuracy_ci_high": 0.
|
608 |
-
"f1_micro": 0.
|
609 |
-
"f1_micro_ci_low": 0.
|
610 |
-
"f1_micro_ci_high": 0.
|
611 |
},
|
612 |
-
"score": 0.
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
-
"f1_macro": 0.
|
619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
620 |
-
"
|
621 |
-
"
|
622 |
-
"
|
623 |
-
"
|
624 |
-
"f1_vehicle loan or lease": 0.30303030303030304,
|
625 |
-
"f1_mortgage": 0.6909090909090909,
|
626 |
-
"f1_money transfer or virtual currency or money service": 0.34285714285714286,
|
627 |
"f1_student loan": 0.6666666666666666,
|
628 |
-
"
|
629 |
-
"
|
|
|
630 |
"score_name": "f1_micro",
|
631 |
-
"score": 0.
|
632 |
-
"score_ci_high": 0.
|
633 |
-
"score_ci_low": 0.
|
634 |
"num_of_instances": 1000,
|
635 |
-
"accuracy": 0.
|
636 |
-
"accuracy_ci_low": 0.
|
637 |
-
"accuracy_ci_high": 0.
|
638 |
-
"f1_micro": 0.
|
639 |
-
"f1_micro_ci_low": 0.
|
640 |
-
"f1_micro_ci_high": 0.
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
-
"f1_macro": 0.
|
644 |
-
"f1_mortgages and loans": 0.
|
645 |
-
"f1_credit card": 0.
|
646 |
-
"f1_debt collection": 0.
|
647 |
-
"f1_credit reporting": 0.
|
648 |
-
"f1_retail banking": 0.
|
649 |
-
"f1_macro_ci_low": 0.
|
650 |
-
"f1_macro_ci_high": 0.
|
651 |
"score_name": "f1_micro",
|
652 |
-
"score": 0.
|
653 |
-
"score_ci_high": 0.
|
654 |
-
"score_ci_low": 0.
|
655 |
"num_of_instances": 500,
|
656 |
-
"accuracy": 0.
|
657 |
-
"accuracy_ci_low": 0.
|
658 |
-
"accuracy_ci_high": 0.
|
659 |
-
"f1_micro": 0.
|
660 |
-
"f1_micro_ci_low": 0.
|
661 |
-
"f1_micro_ci_high": 0.
|
662 |
},
|
663 |
-
"score": 0.
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
-
"program_accuracy": 0.
|
671 |
-
"score": 0.
|
672 |
"score_name": "program_accuracy",
|
673 |
-
"execution_accuracy": 0.
|
674 |
-
"program_accuracy_ci_low": 0.
|
675 |
-
"program_accuracy_ci_high": 0.
|
676 |
-
"score_ci_low": 0.
|
677 |
-
"score_ci_high": 0.
|
678 |
-
"execution_accuracy_ci_low": 0.
|
679 |
-
"execution_accuracy_ci_high": 0.
|
680 |
},
|
681 |
-
"score": 0.
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
-
"precision": 0.
|
688 |
-
"recall": 0.
|
689 |
-
"f1": 0.
|
690 |
-
"precision_ci_low": 0.
|
691 |
-
"precision_ci_high": 0.
|
692 |
-
"recall_ci_low": 0.
|
693 |
-
"recall_ci_high": 0.
|
694 |
-
"f1_ci_low": 0.
|
695 |
-
"f1_ci_high": 0.
|
696 |
"score_name": "f1",
|
697 |
-
"score": 0.
|
698 |
-
"score_ci_high": 0.
|
699 |
-
"score_ci_low": 0.
|
700 |
"num_of_instances": 600,
|
701 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
702 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
703 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
704 |
-
"faithfullness_f1_token_overlap": 0.
|
705 |
-
"faithfullness_recall_token_overlap": 0.
|
706 |
-
"faithfullness_precision_token_overlap": 0.
|
707 |
-
"correctness_f1_token_overlap": 0.
|
708 |
-
"correctness_recall_token_overlap": 0.
|
709 |
-
"correctness_precision_token_overlap": 0.
|
710 |
},
|
711 |
-
"score": 0.
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
-
"accuracy": 0.
|
718 |
-
"accuracy_ci_low": 0.
|
719 |
"accuracy_ci_high": 0.417,
|
720 |
"score_name": "accuracy",
|
721 |
-
"score": 0.
|
722 |
"score_ci_high": 0.417,
|
723 |
-
"score_ci_low": 0.
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
"accuracy": 0.728,
|
728 |
"accuracy_ci_low": 0.692,
|
729 |
-
"accuracy_ci_high": 0.
|
730 |
"score_name": "accuracy",
|
731 |
"score": 0.728,
|
732 |
-
"score_ci_high": 0.
|
733 |
"score_ci_low": 0.692,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
-
"score": 0.
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
-
"severity_critical":
|
743 |
"severity_high": 0.0,
|
744 |
-
"severity_medium":
|
745 |
-
"severity_low":
|
746 |
-
"category_harmful_info": 0.
|
747 |
-
"category_deception": 0.
|
748 |
-
"category_explicit_content": 0.
|
749 |
-
"category_violence": 0.
|
750 |
-
"category_discrimination": 0.
|
751 |
-
"category_substance_abuse": 0.
|
752 |
-
"category_pii": 0.
|
753 |
-
"safety": 0.
|
754 |
-
"safety_ci_low": 0.
|
755 |
-
"safety_ci_high": 0.
|
756 |
"score_name": "safety",
|
757 |
-
"score": 0.
|
758 |
-
"score_ci_high": 0.
|
759 |
-
"score_ci_low": 0.
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
-
"score": 0.
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
-
"
|
770 |
-
"rouge1": 0.
|
771 |
-
"
|
772 |
-
"rougeL": 0.
|
773 |
-
"score": 0.
|
774 |
"score_name": "rougeL",
|
775 |
-
"
|
776 |
-
"
|
777 |
-
"rouge1_ci_low": 0.
|
778 |
-
"rouge1_ci_high": 0.
|
779 |
-
"
|
780 |
-
"
|
781 |
-
"rougeL_ci_low": 0.
|
782 |
-
"rougeL_ci_high": 0.
|
783 |
-
"score_ci_low": 0.
|
784 |
-
"score_ci_high": 0.
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
-
"
|
789 |
-
"rouge1": 0.
|
790 |
-
"
|
791 |
-
"rougeL": 0.
|
792 |
-
"score": 0.
|
793 |
"score_name": "rougeL",
|
794 |
-
"
|
795 |
-
"
|
796 |
-
"rouge1_ci_low": 0.
|
797 |
-
"rouge1_ci_high": 0.
|
798 |
-
"
|
799 |
-
"
|
800 |
-
"rougeL_ci_low": 0.
|
801 |
-
"rougeL_ci_high": 0.
|
802 |
-
"score_ci_low": 0.
|
803 |
-
"score_ci_high": 0.
|
804 |
},
|
805 |
-
"score": 0.
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
@@ -810,473 +810,473 @@
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
],
|
818 |
"totals": [
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
],
|
824 |
"precisions": [
|
825 |
-
0.
|
826 |
-
0.
|
827 |
-
0.
|
828 |
-
0.
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
-
"sys_len":
|
832 |
"ref_len": 1734,
|
833 |
-
"sacrebleu": 0.
|
834 |
-
"score": 0.
|
835 |
"score_name": "sacrebleu",
|
836 |
-
"score_ci_low": 0.
|
837 |
-
"score_ci_high": 0.
|
838 |
-
"sacrebleu_ci_low": 0.
|
839 |
-
"sacrebleu_ci_high": 0.
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
|
848 |
],
|
849 |
"totals": [
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
],
|
855 |
"precisions": [
|
856 |
-
0.
|
857 |
-
0.
|
858 |
-
0.
|
859 |
-
0.
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
-
"sys_len":
|
863 |
"ref_len": 1734,
|
864 |
-
"sacrebleu": 0.
|
865 |
-
"score": 0.
|
866 |
"score_name": "sacrebleu",
|
867 |
-
"score_ci_low": 0.
|
868 |
-
"score_ci_high": 0.
|
869 |
-
"sacrebleu_ci_low": 0.
|
870 |
-
"sacrebleu_ci_high": 0.
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
-
|
876 |
-
|
877 |
-
|
878 |
-
|
879 |
],
|
880 |
"totals": [
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
],
|
886 |
"precisions": [
|
887 |
-
0.
|
888 |
-
0.
|
889 |
-
0.
|
890 |
-
0.
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
-
"sys_len":
|
894 |
"ref_len": 1589,
|
895 |
-
"sacrebleu": 0.
|
896 |
-
"score": 0.
|
897 |
"score_name": "sacrebleu",
|
898 |
-
"score_ci_low": 0.
|
899 |
-
"score_ci_high": 0.
|
900 |
-
"sacrebleu_ci_low": 0.
|
901 |
-
"sacrebleu_ci_high": 0.
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
-
|
907 |
-
|
908 |
-
|
909 |
-
|
910 |
],
|
911 |
"totals": [
|
912 |
-
|
913 |
-
|
914 |
-
|
915 |
-
|
916 |
],
|
917 |
"precisions": [
|
918 |
-
0.
|
919 |
-
0.
|
920 |
-
0.
|
921 |
-
0.
|
922 |
],
|
923 |
-
"bp":
|
924 |
-
"sys_len":
|
925 |
"ref_len": 1835,
|
926 |
-
"sacrebleu": 0.
|
927 |
-
"score": 0.
|
928 |
"score_name": "sacrebleu",
|
929 |
-
"score_ci_low": 0.
|
930 |
-
"score_ci_high": 0.
|
931 |
-
"sacrebleu_ci_low": 0.
|
932 |
-
"sacrebleu_ci_high": 0.
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
],
|
942 |
"totals": [
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
],
|
948 |
"precisions": [
|
949 |
-
0.
|
950 |
-
0.
|
951 |
-
0.
|
952 |
-
0.
|
953 |
],
|
954 |
-
"bp": 0.
|
955 |
-
"sys_len":
|
956 |
"ref_len": 2068,
|
957 |
-
"sacrebleu": 0.
|
958 |
-
"score": 0.
|
959 |
"score_name": "sacrebleu",
|
960 |
-
"score_ci_low": 0.
|
961 |
-
"score_ci_high": 0.
|
962 |
-
"sacrebleu_ci_low": 0.
|
963 |
-
"sacrebleu_ci_high": 0.
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
-
|
969 |
-
|
970 |
-
|
971 |
-
|
972 |
],
|
973 |
"totals": [
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
],
|
979 |
"precisions": [
|
980 |
-
0.
|
981 |
-
0.
|
982 |
-
0.
|
983 |
-
0.
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
-
"sys_len":
|
987 |
"ref_len": 2235,
|
988 |
-
"sacrebleu": 0.
|
989 |
-
"score": 0.
|
990 |
"score_name": "sacrebleu",
|
991 |
-
"score_ci_low": 0.
|
992 |
-
"score_ci_high": 0.
|
993 |
-
"sacrebleu_ci_low": 0.
|
994 |
-
"sacrebleu_ci_high": 0.
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
-
|
1000 |
-
|
1001 |
477,
|
1002 |
-
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
-
|
1006 |
-
|
1007 |
-
|
1008 |
-
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
-
0.
|
1012 |
-
0.
|
1013 |
-
0.
|
1014 |
-
0.
|
1015 |
],
|
1016 |
-
"bp": 0
|
1017 |
-
"sys_len":
|
1018 |
"ref_len": 1916,
|
1019 |
-
"sacrebleu": 0.
|
1020 |
-
"score": 0.
|
1021 |
"score_name": "sacrebleu",
|
1022 |
-
"score_ci_low": 0.
|
1023 |
-
"score_ci_high": 0.
|
1024 |
-
"sacrebleu_ci_low": 0.
|
1025 |
-
"sacrebleu_ci_high": 0.
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
-
|
1031 |
-
|
1032 |
-
|
1033 |
-
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
-
0.
|
1043 |
-
0.
|
1044 |
-
0.
|
1045 |
-
0.
|
1046 |
],
|
1047 |
"bp": 1.0,
|
1048 |
-
"sys_len":
|
1049 |
"ref_len": 1949,
|
1050 |
-
"sacrebleu": 0.
|
1051 |
-
"score": 0.
|
1052 |
"score_name": "sacrebleu",
|
1053 |
-
"score_ci_low": 0.
|
1054 |
-
"score_ci_high": 0.
|
1055 |
-
"sacrebleu_ci_low": 0.
|
1056 |
-
"sacrebleu_ci_high": 0.
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
-
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
-
0.
|
1074 |
-
0.
|
1075 |
-
0.
|
1076 |
-
0.
|
1077 |
],
|
1078 |
-
"bp": 0.
|
1079 |
-
"sys_len":
|
1080 |
"ref_len": 2098,
|
1081 |
-
"sacrebleu": 0.
|
1082 |
-
"score": 0.
|
1083 |
"score_name": "sacrebleu",
|
1084 |
-
"score_ci_low": 0.
|
1085 |
-
"score_ci_high": 0.
|
1086 |
-
"sacrebleu_ci_low": 0.
|
1087 |
-
"sacrebleu_ci_high": 0.
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
-
|
1093 |
-
|
1094 |
-
|
1095 |
-
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
-
0.
|
1105 |
-
0.
|
1106 |
-
0.
|
1107 |
-
0.
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
-
"sys_len":
|
1111 |
"ref_len": 1734,
|
1112 |
-
"sacrebleu": 0.
|
1113 |
-
"score": 0.
|
1114 |
"score_name": "sacrebleu",
|
1115 |
-
"score_ci_low": 0.
|
1116 |
-
"score_ci_high": 0.
|
1117 |
-
"sacrebleu_ci_low": 0.
|
1118 |
-
"sacrebleu_ci_high": 0.
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
-
|
1124 |
-
|
1125 |
-
|
1126 |
-
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
-
|
1130 |
-
|
1131 |
-
|
1132 |
-
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
-
0.
|
1136 |
-
0.
|
1137 |
-
0.
|
1138 |
-
0.
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
-
"sys_len":
|
1142 |
"ref_len": 1734,
|
1143 |
-
"sacrebleu": 0.
|
1144 |
-
"score": 0.
|
1145 |
"score_name": "sacrebleu",
|
1146 |
-
"score_ci_low": 0.
|
1147 |
-
"score_ci_high": 0.
|
1148 |
-
"sacrebleu_ci_low": 0.
|
1149 |
-
"sacrebleu_ci_high": 0.
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
-
|
1155 |
-
|
1156 |
-
|
1157 |
-
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
-
|
1161 |
-
|
1162 |
-
|
1163 |
-
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
-
0.
|
1167 |
-
0.
|
1168 |
-
0.
|
1169 |
-
0.
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
-
"sys_len":
|
1173 |
"ref_len": 1734,
|
1174 |
-
"sacrebleu": 0.
|
1175 |
-
"score": 0.
|
1176 |
"score_name": "sacrebleu",
|
1177 |
-
"score_ci_low": 0.
|
1178 |
-
"score_ci_high": 0.
|
1179 |
-
"sacrebleu_ci_low": 0.
|
1180 |
-
"sacrebleu_ci_high": 0.
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
-
|
1186 |
-
|
1187 |
-
|
1188 |
268
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
-
|
1192 |
-
|
1193 |
-
|
1194 |
-
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
-
0.
|
1198 |
-
0.
|
1199 |
-
0.
|
1200 |
-
0.
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
-
"sys_len":
|
1204 |
"ref_len": 1734,
|
1205 |
-
"sacrebleu": 0.
|
1206 |
-
"score": 0.
|
1207 |
"score_name": "sacrebleu",
|
1208 |
-
"score_ci_low": 0.
|
1209 |
-
"score_ci_high": 0.
|
1210 |
-
"sacrebleu_ci_low": 0.
|
1211 |
-
"sacrebleu_ci_high": 0.
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
-
|
1217 |
-
|
1218 |
-
|
1219 |
-
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
-
0.
|
1229 |
-
0.
|
1230 |
-
0.
|
1231 |
-
0.
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
-
"sys_len":
|
1235 |
"ref_len": 1734,
|
1236 |
-
"sacrebleu": 0.
|
1237 |
-
"score": 0.
|
1238 |
"score_name": "sacrebleu",
|
1239 |
-
"score_ci_low": 0.
|
1240 |
-
"score_ci_high": 0.
|
1241 |
-
"sacrebleu_ci_low": 0.
|
1242 |
-
"sacrebleu_ci_high": 0.
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
1250 |
-
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
-
|
1254 |
-
|
1255 |
-
|
1256 |
-
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
-
0.
|
1260 |
-
0.
|
1261 |
-
0.
|
1262 |
-
0.
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
-
"sys_len":
|
1266 |
"ref_len": 1734,
|
1267 |
-
"sacrebleu": 0.
|
1268 |
-
"score": 0.
|
1269 |
"score_name": "sacrebleu",
|
1270 |
-
"score_ci_low": 0.
|
1271 |
-
"score_ci_high": 0.
|
1272 |
-
"sacrebleu_ci_low": 0.
|
1273 |
-
"sacrebleu_ci_high": 0.
|
1274 |
},
|
1275 |
-
"score": 0.
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
-
"score": 0.
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-23T07:17:53.366963Z",
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.5333333333333333,
|
180 |
+
"accuracy_ci_low": 0.4444444444444444,
|
181 |
+
"accuracy_ci_high": 0.6378611050272702,
|
182 |
"score_name": "accuracy",
|
183 |
+
"score": 0.5333333333333333,
|
184 |
+
"score_ci_high": 0.6378611050272702,
|
185 |
+
"score_ci_low": 0.4444444444444444,
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.7111111111111111,
|
190 |
+
"accuracy_ci_low": 0.6222222222222222,
|
191 |
+
"accuracy_ci_high": 0.7888888888888889,
|
192 |
"score_name": "accuracy",
|
193 |
+
"score": 0.7111111111111111,
|
194 |
+
"score_ci_high": 0.7888888888888889,
|
195 |
+
"score_ci_low": 0.6222222222222222,
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.7444444444444445,
|
200 |
+
"accuracy_ci_low": 0.6555555555555556,
|
201 |
+
"accuracy_ci_high": 0.8333333333333334,
|
202 |
"score_name": "accuracy",
|
203 |
+
"score": 0.7444444444444445,
|
204 |
+
"score_ci_high": 0.8333333333333334,
|
205 |
+
"score_ci_low": 0.6555555555555556,
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.45555555555555555,
|
210 |
+
"accuracy_ci_low": 0.35555555555555557,
|
211 |
+
"accuracy_ci_high": 0.5555555555555556,
|
212 |
"score_name": "accuracy",
|
213 |
+
"score": 0.45555555555555555,
|
214 |
+
"score_ci_high": 0.5555555555555556,
|
215 |
+
"score_ci_low": 0.35555555555555557,
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
"accuracy": 0.5888888888888889,
|
220 |
+
"accuracy_ci_low": 0.4888888888888889,
|
221 |
"accuracy_ci_high": 0.6888888888888889,
|
222 |
"score_name": "accuracy",
|
223 |
"score": 0.5888888888888889,
|
224 |
"score_ci_high": 0.6888888888888889,
|
225 |
+
"score_ci_low": 0.4888888888888889,
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.8111111111111111,
|
230 |
+
"accuracy_ci_low": 0.7222222222222222,
|
231 |
+
"accuracy_ci_high": 0.8777777777777778,
|
232 |
"score_name": "accuracy",
|
233 |
+
"score": 0.8111111111111111,
|
234 |
+
"score_ci_high": 0.8777777777777778,
|
235 |
+
"score_ci_low": 0.7222222222222222,
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.7333333333333333,
|
240 |
+
"accuracy_ci_low": 0.6333333333333333,
|
241 |
+
"accuracy_ci_high": 0.8111111111111111,
|
242 |
"score_name": "accuracy",
|
243 |
+
"score": 0.7333333333333333,
|
244 |
+
"score_ci_high": 0.8111111111111111,
|
245 |
+
"score_ci_low": 0.6333333333333333,
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
"accuracy": 0.6222222222222222,
|
250 |
+
"accuracy_ci_low": 0.5111111111111111,
|
251 |
+
"accuracy_ci_high": 0.7111111111111111,
|
252 |
"score_name": "accuracy",
|
253 |
"score": 0.6222222222222222,
|
254 |
+
"score_ci_high": 0.7111111111111111,
|
255 |
+
"score_ci_low": 0.5111111111111111,
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.6444444444444445,
|
260 |
+
"accuracy_ci_low": 0.5444444444444444,
|
261 |
+
"accuracy_ci_high": 0.7444444444444445,
|
262 |
"score_name": "accuracy",
|
263 |
+
"score": 0.6444444444444445,
|
264 |
+
"score_ci_high": 0.7444444444444445,
|
265 |
+
"score_ci_low": 0.5444444444444444,
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.7,
|
270 |
+
"accuracy_ci_low": 0.6,
|
271 |
+
"accuracy_ci_high": 0.7888888888888889,
|
272 |
"score_name": "accuracy",
|
273 |
+
"score": 0.7,
|
274 |
+
"score_ci_high": 0.7888888888888889,
|
275 |
+
"score_ci_low": 0.6,
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.6222222222222222,
|
280 |
+
"accuracy_ci_low": 0.5222222222222223,
|
281 |
+
"accuracy_ci_high": 0.7222222222222222,
|
282 |
"score_name": "accuracy",
|
283 |
+
"score": 0.6222222222222222,
|
284 |
+
"score_ci_high": 0.7222222222222222,
|
285 |
+
"score_ci_low": 0.5222222222222223,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
+
"score": 0.6515151515151515,
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.03687821612349914,
|
296 |
+
"score": 0.03687821612349914,
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
+
"score": 0.03687821612349914,
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.3902439024390244,
|
307 |
+
"f1_Organization": 0.29283489096573206,
|
308 |
+
"f1_Location": 0.2572614107883817,
|
309 |
+
"f1_macro": 0.31344673473104606,
|
310 |
+
"recall_macro": 0.2655047696270643,
|
311 |
+
"precision_macro": 0.3964060432628696,
|
312 |
+
"in_classes_support": 0.6260720411663807,
|
313 |
+
"f1_micro": 0.25631768953068595,
|
314 |
+
"recall_micro": 0.2704761904761905,
|
315 |
+
"precision_micro": 0.24356775300171526,
|
316 |
+
"score": 0.25631768953068595,
|
317 |
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.22100954853501506,
|
319 |
+
"score_ci_high": 0.2947346870824505,
|
320 |
+
"f1_micro_ci_low": 0.22100954853501506,
|
321 |
+
"f1_micro_ci_high": 0.2947346870824505
|
322 |
},
|
323 |
+
"score": 0.25631768953068595,
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.4788732394366197,
|
330 |
+
"accuracy_ci_low": 0.36619718309859156,
|
331 |
+
"accuracy_ci_high": 0.5915492957746479,
|
332 |
"score_name": "accuracy",
|
333 |
+
"score": 0.4788732394366197,
|
334 |
+
"score_ci_high": 0.5915492957746479,
|
335 |
+
"score_ci_low": 0.36619718309859156,
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.23943661971830985,
|
340 |
+
"accuracy_ci_low": 0.15492957746478872,
|
341 |
+
"accuracy_ci_high": 0.3380281690140845,
|
342 |
"score_name": "accuracy",
|
343 |
+
"score": 0.23943661971830985,
|
344 |
+
"score_ci_high": 0.3380281690140845,
|
345 |
+
"score_ci_low": 0.15492957746478872,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
"accuracy": 0.23943661971830985,
|
350 |
"accuracy_ci_low": 0.14084507042253522,
|
351 |
+
"accuracy_ci_high": 0.352112676056338,
|
352 |
"score_name": "accuracy",
|
353 |
"score": 0.23943661971830985,
|
354 |
+
"score_ci_high": 0.352112676056338,
|
355 |
"score_ci_low": 0.14084507042253522,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.2535211267605634,
|
360 |
+
"accuracy_ci_low": 0.15492957746478872,
|
361 |
+
"accuracy_ci_high": 0.36619718309859156,
|
362 |
"score_name": "accuracy",
|
363 |
+
"score": 0.2535211267605634,
|
364 |
+
"score_ci_high": 0.36619718309859156,
|
365 |
+
"score_ci_low": 0.15492957746478872,
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.39436619718309857,
|
370 |
+
"accuracy_ci_low": 0.29577464788732394,
|
371 |
"accuracy_ci_high": 0.5070422535211268,
|
372 |
"score_name": "accuracy",
|
373 |
+
"score": 0.39436619718309857,
|
374 |
"score_ci_high": 0.5070422535211268,
|
375 |
+
"score_ci_low": 0.29577464788732394,
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.16901408450704225,
|
380 |
+
"accuracy_ci_low": 0.09859154929577464,
|
381 |
+
"accuracy_ci_high": 0.2676056338028169,
|
382 |
"score_name": "accuracy",
|
383 |
+
"score": 0.16901408450704225,
|
384 |
+
"score_ci_high": 0.2676056338028169,
|
385 |
+
"score_ci_low": 0.09859154929577464,
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.2112676056338028,
|
390 |
+
"accuracy_ci_low": 0.1267605633802817,
|
391 |
+
"accuracy_ci_high": 0.33217670597601795,
|
392 |
"score_name": "accuracy",
|
393 |
+
"score": 0.2112676056338028,
|
394 |
+
"score_ci_high": 0.33217670597601795,
|
395 |
+
"score_ci_low": 0.1267605633802817,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
"accuracy": 0.2676056338028169,
|
400 |
+
"accuracy_ci_low": 0.16901408450704225,
|
401 |
+
"accuracy_ci_high": 0.36619718309859156,
|
402 |
"score_name": "accuracy",
|
403 |
"score": 0.2676056338028169,
|
404 |
+
"score_ci_high": 0.36619718309859156,
|
405 |
+
"score_ci_low": 0.16901408450704225,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.23943661971830985,
|
410 |
+
"accuracy_ci_low": 0.15492957746478872,
|
411 |
+
"accuracy_ci_high": 0.352112676056338,
|
412 |
"score_name": "accuracy",
|
413 |
+
"score": 0.23943661971830985,
|
414 |
+
"score_ci_high": 0.352112676056338,
|
415 |
+
"score_ci_low": 0.15492957746478872,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.15492957746478872,
|
420 |
+
"accuracy_ci_low": 0.08450704225352113,
|
421 |
+
"accuracy_ci_high": 0.2535211267605634,
|
422 |
"score_name": "accuracy",
|
423 |
+
"score": 0.15492957746478872,
|
424 |
+
"score_ci_high": 0.2535211267605634,
|
425 |
+
"score_ci_low": 0.08450704225352113,
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.18309859154929578,
|
430 |
+
"accuracy_ci_low": 0.09859154929577464,
|
431 |
+
"accuracy_ci_high": 0.28169014084507044,
|
432 |
"score_name": "accuracy",
|
433 |
+
"score": 0.18309859154929578,
|
434 |
+
"score_ci_high": 0.28169014084507044,
|
435 |
+
"score_ci_low": 0.09859154929577464,
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.23943661971830985,
|
440 |
+
"accuracy_ci_low": 0.14084507042253522,
|
441 |
+
"accuracy_ci_high": 0.352112676056338,
|
442 |
"score_name": "accuracy",
|
443 |
+
"score": 0.23943661971830985,
|
444 |
+
"score_ci_high": 0.352112676056338,
|
445 |
+
"score_ci_low": 0.14084507042253522,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.2676056338028169,
|
450 |
+
"accuracy_ci_low": 0.17514498933734307,
|
451 |
+
"accuracy_ci_high": 0.38028169014084506,
|
452 |
"score_name": "accuracy",
|
453 |
+
"score": 0.2676056338028169,
|
454 |
+
"score_ci_high": 0.38028169014084506,
|
455 |
+
"score_ci_low": 0.17514498933734307,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.43661971830985913,
|
460 |
+
"accuracy_ci_low": 0.323943661971831,
|
461 |
+
"accuracy_ci_high": 0.5492957746478874,
|
462 |
"score_name": "accuracy",
|
463 |
+
"score": 0.43661971830985913,
|
464 |
+
"score_ci_high": 0.5492957746478874,
|
465 |
+
"score_ci_low": 0.323943661971831,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
+
"score": 0.26961770623742454,
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.23241604568835691,
|
475 |
+
"f1_suggestive": 0.08695652173913043,
|
476 |
+
"f1_descriptive": 0.2631578947368421,
|
477 |
+
"f1_generic": 0.0,
|
478 |
+
"f1_arbitrary": 0.3888888888888889,
|
479 |
+
"f1_fanciful": 0.4230769230769231,
|
480 |
+
"f1_macro_ci_low": 0.16927841023118298,
|
481 |
+
"f1_macro_ci_high": 0.32467849714540287,
|
482 |
"score_name": "f1_micro",
|
483 |
+
"score": 0.2891566265060241,
|
484 |
+
"score_ci_high": 0.40476190476190477,
|
485 |
+
"score_ci_low": 0.2054361335527834,
|
486 |
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.2823529411764706,
|
488 |
+
"accuracy_ci_low": 0.2,
|
489 |
+
"accuracy_ci_high": 0.4,
|
490 |
+
"f1_micro": 0.2891566265060241,
|
491 |
+
"f1_micro_ci_low": 0.2054361335527834,
|
492 |
+
"f1_micro_ci_high": 0.40476190476190477
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.4965214761040533,
|
496 |
+
"f1_no": 0.7631578947368421,
|
497 |
+
"f1_yes": 0.22988505747126436,
|
498 |
+
"f1_macro_ci_low": 0.43244851636549736,
|
499 |
+
"f1_macro_ci_high": 0.5729228740221988,
|
500 |
"score_name": "f1_micro",
|
501 |
"score": 0.6445012787723785,
|
502 |
+
"score_ci_high": 0.69946202795028,
|
503 |
+
"score_ci_low": 0.570694087403599,
|
504 |
"num_of_instances": 200,
|
505 |
"accuracy": 0.63,
|
506 |
+
"accuracy_ci_low": 0.5561546872315049,
|
507 |
"accuracy_ci_high": 0.69,
|
508 |
"f1_micro": 0.6445012787723785,
|
509 |
+
"f1_micro_ci_low": 0.570694087403599,
|
510 |
+
"f1_micro_ci_high": 0.69946202795028
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.22997089242124882,
|
514 |
+
"f1_conclusion": 0.04878048780487805,
|
515 |
+
"f1_analysis": 0.3333333333333333,
|
516 |
+
"f1_decree": 0.2926829268292683,
|
517 |
+
"f1_issue": 0.21978021978021978,
|
518 |
+
"f1_procedural history": 0.05,
|
519 |
+
"f1_facts": 0.2978723404255319,
|
520 |
+
"f1_rule": 0.3673469387755102,
|
521 |
+
"f1_macro_ci_low": 0.18026075783829068,
|
522 |
+
"f1_macro_ci_high": 0.2946257845154891,
|
523 |
"score_name": "f1_micro",
|
524 |
+
"score": 0.24146981627296588,
|
525 |
+
"score_ci_high": 0.3019289134511566,
|
526 |
+
"score_ci_low": 0.18181818181818182,
|
527 |
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.23,
|
529 |
+
"accuracy_ci_low": 0.175,
|
530 |
+
"accuracy_ci_high": 0.29,
|
531 |
+
"f1_micro": 0.24146981627296588,
|
532 |
+
"f1_micro_ci_low": 0.18181818181818182,
|
533 |
+
"f1_micro_ci_high": 0.3019289134511566
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.4719581626606899,
|
537 |
+
"f1_yes": 0.5462555066079295,
|
538 |
+
"f1_no": 0.39766081871345027,
|
539 |
+
"f1_macro_ci_low": 0.4067534798719593,
|
540 |
+
"f1_macro_ci_high": 0.5312059177934843,
|
541 |
"score_name": "f1_micro",
|
542 |
+
"score": 0.4824120603015075,
|
543 |
+
"score_ci_high": 0.5413533834586466,
|
544 |
+
"score_ci_low": 0.41550674904624724,
|
545 |
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.48,
|
547 |
+
"accuracy_ci_low": 0.415,
|
548 |
+
"accuracy_ci_high": 0.54,
|
549 |
+
"f1_micro": 0.4824120603015075,
|
550 |
+
"f1_micro_ci_low": 0.41550674904624724,
|
551 |
+
"f1_micro_ci_high": 0.5413533834586466
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.8112290008841733,
|
555 |
+
"f1_yes": 0.7948717948717948,
|
556 |
+
"f1_no": 0.8275862068965517,
|
557 |
+
"f1_macro_ci_low": 0.7184910169578117,
|
558 |
+
"f1_macro_ci_high": 0.8804600933253673,
|
559 |
"score_name": "f1_micro",
|
560 |
+
"score": 0.8121212121212121,
|
561 |
+
"score_ci_high": 0.8795180722891566,
|
562 |
+
"score_ci_low": 0.7203411511997481,
|
563 |
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.788235294117647,
|
565 |
+
"accuracy_ci_low": 0.6941176470588235,
|
566 |
+
"accuracy_ci_high": 0.8588235294117647,
|
567 |
+
"f1_micro": 0.8121212121212121,
|
568 |
+
"f1_micro_ci_low": 0.7203411511997481,
|
569 |
+
"f1_micro_ci_high": 0.8795180722891566
|
570 |
},
|
571 |
+
"score": 0.49393219879481765,
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.3469646355526677,
|
578 |
+
"f1_cars": 0.5517241379310345,
|
579 |
+
"f1_windows x": 0.0,
|
580 |
+
"f1_atheism": 0.23809523809523808,
|
581 |
+
"f1_christianity": 0.2028985507246377,
|
582 |
+
"f1_religion": 0.1941747572815534,
|
583 |
+
"f1_medicine": 0.6060606060606061,
|
584 |
+
"f1_computer graphics": 0.3488372093023256,
|
585 |
+
"f1_microsoft windows": 0.3188405797101449,
|
586 |
+
"f1_middle east": 0.11538461538461539,
|
587 |
+
"f1_politics": 0.3047619047619048,
|
588 |
+
"f1_motorcycles": 0.5227272727272727,
|
589 |
+
"f1_baseball": 0.6984126984126984,
|
590 |
+
"f1_pc hardware": 0.3684210526315789,
|
591 |
+
"f1_mac hardware": 0.37037037037037035,
|
592 |
+
"f1_for sale": 0.08888888888888889,
|
593 |
+
"f1_guns": 0.18181818181818182,
|
594 |
+
"f1_space": 0.4810126582278481,
|
595 |
+
"f1_cryptography": 0.48484848484848486,
|
596 |
+
"f1_hockey": 0.4666666666666667,
|
597 |
+
"f1_electronics": 0.3953488372093023,
|
598 |
+
"f1_macro_ci_low": 0.32234813592441613,
|
599 |
+
"f1_macro_ci_high": 0.38044336501459297,
|
600 |
"score_name": "f1_micro",
|
601 |
+
"score": 0.37344913151364767,
|
602 |
+
"score_ci_high": 0.40609658022784717,
|
603 |
+
"score_ci_low": 0.34207641792416926,
|
604 |
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.301,
|
606 |
+
"accuracy_ci_low": 0.274,
|
607 |
+
"accuracy_ci_high": 0.329023179612989,
|
608 |
+
"f1_micro": 0.37344913151364767,
|
609 |
+
"f1_micro_ci_low": 0.34207641792416926,
|
610 |
+
"f1_micro_ci_high": 0.40609658022784717
|
611 |
},
|
612 |
+
"score": 0.37344913151364767,
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.4835930003981669,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.8920327624720774,
|
620 |
+
"f1_checking or savings account": 0.42105263157894735,
|
621 |
+
"f1_credit card or prepaid card": 0.5666666666666667,
|
622 |
+
"f1_debt collection": 0.38666666666666666,
|
623 |
+
"f1_mortgage": 0.7096774193548387,
|
|
|
624 |
"f1_student loan": 0.6666666666666666,
|
625 |
+
"f1_money transfer or virtual currency or money service": 0.3125,
|
626 |
+
"f1_vehicle loan or lease": 0.27586206896551724,
|
627 |
+
"f1_payday loan or title loan or personal loan": 0.12121212121212122,
|
628 |
+
"f1_macro_ci_low": 0.43264552909234405,
|
629 |
+
"f1_macro_ci_high": 0.5420653283436574,
|
630 |
"score_name": "f1_micro",
|
631 |
+
"score": 0.7693953986088817,
|
632 |
+
"score_ci_high": 0.7940535810044251,
|
633 |
+
"score_ci_low": 0.7428249604302373,
|
634 |
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.719,
|
636 |
+
"accuracy_ci_low": 0.688,
|
637 |
+
"accuracy_ci_high": 0.746,
|
638 |
+
"f1_micro": 0.7693953986088817,
|
639 |
+
"f1_micro_ci_low": 0.7428249604302373,
|
640 |
+
"f1_micro_ci_high": 0.7940535810044251
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.5138940708414392,
|
644 |
+
"f1_mortgages and loans": 0.6742857142857143,
|
645 |
+
"f1_credit card": 0.5314685314685315,
|
646 |
+
"f1_debt collection": 0.5,
|
647 |
+
"f1_credit reporting": 0.6742424242424242,
|
648 |
+
"f1_retail banking": 0.18947368421052632,
|
649 |
+
"f1_macro_ci_low": 0.47276041465254326,
|
650 |
+
"f1_macro_ci_high": 0.561073606935457,
|
651 |
"score_name": "f1_micro",
|
652 |
+
"score": 0.5587229190421893,
|
653 |
+
"score_ci_high": 0.6032761107151652,
|
654 |
+
"score_ci_low": 0.5136696359618879,
|
655 |
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.49,
|
657 |
+
"accuracy_ci_low": 0.45,
|
658 |
+
"accuracy_ci_high": 0.536,
|
659 |
+
"f1_micro": 0.5587229190421893,
|
660 |
+
"f1_micro_ci_low": 0.5136696359618879,
|
661 |
+
"f1_micro_ci_high": 0.6032761107151652
|
662 |
},
|
663 |
+
"score": 0.6640591588255356,
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.112,
|
671 |
+
"score": 0.112,
|
672 |
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.098,
|
674 |
+
"program_accuracy_ci_low": 0.092,
|
675 |
+
"program_accuracy_ci_high": 0.134,
|
676 |
+
"score_ci_low": 0.092,
|
677 |
+
"score_ci_high": 0.134,
|
678 |
+
"execution_accuracy_ci_low": 0.081,
|
679 |
+
"execution_accuracy_ci_high": 0.11876030243075729
|
680 |
},
|
681 |
+
"score": 0.112,
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.27803081578951677,
|
688 |
+
"recall": 0.5263554366544182,
|
689 |
+
"f1": 0.3099512752592042,
|
690 |
+
"precision_ci_low": 0.2593667444116527,
|
691 |
+
"precision_ci_high": 0.296940372694636,
|
692 |
+
"recall_ci_low": 0.5097202360703201,
|
693 |
+
"recall_ci_high": 0.5429203378240791,
|
694 |
+
"f1_ci_low": 0.2936726592020454,
|
695 |
+
"f1_ci_high": 0.3271680931787992,
|
696 |
"score_name": "f1",
|
697 |
+
"score": 0.3099512752592042,
|
698 |
+
"score_ci_high": 0.3271680931787992,
|
699 |
+
"score_ci_low": 0.2936726592020454,
|
700 |
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.5813682861626148,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6611752705772718,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5378807228555282,
|
704 |
+
"faithfullness_f1_token_overlap": 0.34377396506102365,
|
705 |
+
"faithfullness_recall_token_overlap": 0.2786846668859655,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5693429043011381,
|
707 |
+
"correctness_f1_token_overlap": 0.3099512752592042,
|
708 |
+
"correctness_recall_token_overlap": 0.5263554366544182,
|
709 |
+
"correctness_precision_token_overlap": 0.27803081578951677
|
710 |
},
|
711 |
+
"score": 0.3099512752592042,
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
+
"accuracy": 0.387,
|
718 |
+
"accuracy_ci_low": 0.355,
|
719 |
"accuracy_ci_high": 0.417,
|
720 |
"score_name": "accuracy",
|
721 |
+
"score": 0.387,
|
722 |
"score_ci_high": 0.417,
|
723 |
+
"score_ci_low": 0.355,
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
"accuracy": 0.728,
|
728 |
"accuracy_ci_low": 0.692,
|
729 |
+
"accuracy_ci_high": 0.768,
|
730 |
"score_name": "accuracy",
|
731 |
"score": 0.728,
|
732 |
+
"score_ci_high": 0.768,
|
733 |
"score_ci_low": 0.692,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
+
"score": 0.5575,
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
+
"severity_critical": 2.0,
|
743 |
"severity_high": 0.0,
|
744 |
+
"severity_medium": 3.0,
|
745 |
+
"severity_low": 95.0,
|
746 |
+
"category_harmful_info": 0.790451957932895,
|
747 |
+
"category_deception": 0.8851025405556264,
|
748 |
+
"category_explicit_content": 0.790092716915439,
|
749 |
+
"category_violence": 0.7921644449869674,
|
750 |
+
"category_discrimination": 0.839280919423179,
|
751 |
+
"category_substance_abuse": 0.8701996730433569,
|
752 |
+
"category_pii": 0.7343945315418144,
|
753 |
+
"safety": 0.8071470662465113,
|
754 |
+
"safety_ci_low": 0.7772986218937553,
|
755 |
+
"safety_ci_high": 0.8319065673300019,
|
756 |
"score_name": "safety",
|
757 |
+
"score": 0.8071470662465113,
|
758 |
+
"score_ci_high": 0.8319065673300019,
|
759 |
+
"score_ci_low": 0.7772986218937553,
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
+
"score": 0.8071470662465113,
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
+
"rougeLsum": 0.3492364214226588,
|
770 |
+
"rouge1": 0.41657427755274906,
|
771 |
+
"rouge2": 0.19392531737812374,
|
772 |
+
"rougeL": 0.28108832023050123,
|
773 |
+
"score": 0.28108832023050123,
|
774 |
"score_name": "rougeL",
|
775 |
+
"rougeLsum_ci_low": 0.34133013043796767,
|
776 |
+
"rougeLsum_ci_high": 0.3567955471410065,
|
777 |
+
"rouge1_ci_low": 0.4080039541276808,
|
778 |
+
"rouge1_ci_high": 0.4246243250973701,
|
779 |
+
"rouge2_ci_low": 0.18712662527227458,
|
780 |
+
"rouge2_ci_high": 0.200448642429914,
|
781 |
+
"rougeL_ci_low": 0.2745892481615738,
|
782 |
+
"rougeL_ci_high": 0.2875837184145128,
|
783 |
+
"score_ci_low": 0.2745892481615738,
|
784 |
+
"score_ci_high": 0.2875837184145128
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
+
"rougeLsum": 0.0914997584684418,
|
789 |
+
"rouge1": 0.11103180621679556,
|
790 |
+
"rouge2": 0.013425878438988716,
|
791 |
+
"rougeL": 0.07983541241124872,
|
792 |
+
"score": 0.07983541241124872,
|
793 |
"score_name": "rougeL",
|
794 |
+
"rougeLsum_ci_low": 0.08781447896275059,
|
795 |
+
"rougeLsum_ci_high": 0.0953497661867097,
|
796 |
+
"rouge1_ci_low": 0.10615759057700462,
|
797 |
+
"rouge1_ci_high": 0.11562260974835847,
|
798 |
+
"rouge2_ci_low": 0.012023789954203338,
|
799 |
+
"rouge2_ci_high": 0.015059698304736774,
|
800 |
+
"rougeL_ci_low": 0.07657318636107396,
|
801 |
+
"rougeL_ci_high": 0.08299164478552631,
|
802 |
+
"score_ci_low": 0.07657318636107396,
|
803 |
+
"score_ci_high": 0.08299164478552631
|
804 |
},
|
805 |
+
"score": 0.18046186632087496,
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
+
1003,
|
814 |
+
498,
|
815 |
+
270,
|
816 |
+
157
|
817 |
],
|
818 |
"totals": [
|
819 |
+
1854,
|
820 |
+
1788,
|
821 |
+
1722,
|
822 |
+
1656
|
823 |
],
|
824 |
"precisions": [
|
825 |
+
0.5409924487594391,
|
826 |
+
0.2785234899328859,
|
827 |
+
0.156794425087108,
|
828 |
+
0.09480676328502416
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
+
"sys_len": 1854,
|
832 |
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.2175483241536988,
|
834 |
+
"score": 0.2175483241536988,
|
835 |
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.18247071615886146,
|
837 |
+
"score_ci_high": 0.24704171532422453,
|
838 |
+
"sacrebleu_ci_low": 0.18247071615886146,
|
839 |
+
"sacrebleu_ci_high": 0.24704171532422453
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
+
1080,
|
845 |
+
568,
|
846 |
+
332,
|
847 |
+
190
|
848 |
],
|
849 |
"totals": [
|
850 |
+
1763,
|
851 |
+
1697,
|
852 |
+
1631,
|
853 |
+
1565
|
854 |
],
|
855 |
"precisions": [
|
856 |
+
0.6125921724333522,
|
857 |
+
0.33470830878020036,
|
858 |
+
0.20355610055180873,
|
859 |
+
0.12140575079872205
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
+
"sys_len": 1763,
|
863 |
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.26680276716067836,
|
865 |
+
"score": 0.26680276716067836,
|
866 |
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.23826229076053318,
|
868 |
+
"score_ci_high": 0.2977813737555276,
|
869 |
+
"sacrebleu_ci_low": 0.23826229076053318,
|
870 |
+
"sacrebleu_ci_high": 0.2977813737555276
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
+
553,
|
876 |
+
147,
|
877 |
+
54,
|
878 |
+
16
|
879 |
],
|
880 |
"totals": [
|
881 |
+
1726,
|
882 |
+
1660,
|
883 |
+
1594,
|
884 |
+
1528
|
885 |
],
|
886 |
"precisions": [
|
887 |
+
0.3203939745075319,
|
888 |
+
0.08855421686746988,
|
889 |
+
0.033877038895859475,
|
890 |
+
0.010471204188481676
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
+
"sys_len": 1726,
|
894 |
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.056324703529775505,
|
896 |
+
"score": 0.056324703529775505,
|
897 |
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.04391320313402893,
|
899 |
+
"score_ci_high": 0.07339385366459818,
|
900 |
+
"sacrebleu_ci_low": 0.04391320313402893,
|
901 |
+
"sacrebleu_ci_high": 0.07339385366459818
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
+
919,
|
907 |
+
414,
|
908 |
+
215,
|
909 |
+
112
|
910 |
],
|
911 |
"totals": [
|
912 |
+
1759,
|
913 |
+
1693,
|
914 |
+
1627,
|
915 |
+
1561
|
916 |
],
|
917 |
"precisions": [
|
918 |
+
0.5224559408754975,
|
919 |
+
0.24453632604843473,
|
920 |
+
0.13214505224339276,
|
921 |
+
0.07174887892376682
|
922 |
],
|
923 |
+
"bp": 0.9577137289198663,
|
924 |
+
"sys_len": 1759,
|
925 |
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.17866952528026325,
|
927 |
+
"score": 0.17866952528026325,
|
928 |
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.14916846640026038,
|
930 |
+
"score_ci_high": 0.20635605395794115,
|
931 |
+
"sacrebleu_ci_low": 0.14916846640026038,
|
932 |
+
"sacrebleu_ci_high": 0.20635605395794115
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
+
1274,
|
938 |
+
792,
|
939 |
+
541,
|
940 |
+
376
|
941 |
],
|
942 |
"totals": [
|
943 |
+
1972,
|
944 |
+
1906,
|
945 |
+
1840,
|
946 |
+
1774
|
947 |
],
|
948 |
"precisions": [
|
949 |
+
0.6460446247464503,
|
950 |
+
0.4155299055613851,
|
951 |
+
0.2940217391304348,
|
952 |
+
0.21195039458850057
|
953 |
],
|
954 |
+
"bp": 0.9524844080827892,
|
955 |
+
"sys_len": 1972,
|
956 |
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.3425527778466637,
|
958 |
+
"score": 0.3425527778466637,
|
959 |
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.30423882287446036,
|
961 |
+
"score_ci_high": 0.38397398779299585,
|
962 |
+
"sacrebleu_ci_low": 0.30423882287446036,
|
963 |
+
"sacrebleu_ci_high": 0.38397398779299585
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
+
959,
|
969 |
+
341,
|
970 |
+
148,
|
971 |
+
65
|
972 |
],
|
973 |
"totals": [
|
974 |
+
3115,
|
975 |
+
3049,
|
976 |
+
2983,
|
977 |
+
2917
|
978 |
],
|
979 |
"precisions": [
|
980 |
+
0.3078651685393259,
|
981 |
+
0.1118399475237783,
|
982 |
+
0.0496144820650352,
|
983 |
+
0.02228316763798423
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
+
"sys_len": 3115,
|
987 |
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.07854810736755143,
|
989 |
+
"score": 0.07854810736755143,
|
990 |
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.06038268917694664,
|
992 |
+
"score_ci_high": 0.0991975666301703,
|
993 |
+
"sacrebleu_ci_low": 0.06038268917694664,
|
994 |
+
"sacrebleu_ci_high": 0.0991975666301703
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
+
1231,
|
1000 |
+
729,
|
1001 |
477,
|
1002 |
+
311
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
+
1934,
|
1006 |
+
1868,
|
1007 |
+
1802,
|
1008 |
+
1736
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
+
0.6365046535677352,
|
1012 |
+
0.39025695931477516,
|
1013 |
+
0.2647058823529412,
|
1014 |
+
0.179147465437788
|
1015 |
],
|
1016 |
+
"bp": 1.0,
|
1017 |
+
"sys_len": 1934,
|
1018 |
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.3294440172060282,
|
1020 |
+
"score": 0.3294440172060282,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.295619495912131,
|
1023 |
+
"score_ci_high": 0.3689718708243594,
|
1024 |
+
"sacrebleu_ci_low": 0.295619495912131,
|
1025 |
+
"sacrebleu_ci_high": 0.3689718708243594
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
775,
|
1031 |
+
275,
          128,
          70
        ],
        "totals": [2251, 2185, 2119, 2053],
        "precisions": [0.3442914260328743, 0.12585812356979406, 0.06040585181689476, 0.034096444227959086],
        "bp": 1.0, "sys_len": 2251, "ref_len": 1949,
        "sacrebleu": 0.09719611157545467, "score": 0.09719611157545467, "score_name": "sacrebleu",
        "score_ci_low": 0.06983887234981923, "score_ci_high": 0.1310674385834968, "sacrebleu_ci_low": 0.06983887234981923, "sacrebleu_ci_high": 0.1310674385834968
      },
      "mt_flores_101_eng_spa": {
        "num_of_instances": 66, "counts": [1157, 577, 322, 195], "totals": [2040, 1974, 1908, 1842],
        "precisions": [0.567156862745098, 0.2922998986828774, 0.16876310272536688, 0.10586319218241043],
        "bp": 0.9719689956119355, "sys_len": 2040, "ref_len": 2098,
        "sacrebleu": 0.22674671169002888, "score": 0.22674671169002888, "score_name": "sacrebleu",
        "score_ci_low": 0.20018186306011954, "score_ci_high": 0.24942035354854425, "sacrebleu_ci_low": 0.20018186306011954, "sacrebleu_ci_high": 0.24942035354854425
      },
      "mt_flores_101_fra_eng": {
        "num_of_instances": 66, "counts": [1147, 643, 397, 246], "totals": [1808, 1742, 1676, 1610],
        "precisions": [0.6344026548672567, 0.36911595866819746, 0.23687350835322196, 0.15279503105590064],
        "bp": 1.0, "sys_len": 1808, "ref_len": 1734,
        "sacrebleu": 0.30341593414236545, "score": 0.30341593414236545, "score_name": "sacrebleu",
        "score_ci_low": 0.26610148555409346, "score_ci_high": 0.3551058656882207, "sacrebleu_ci_low": 0.26610148555409346, "sacrebleu_ci_high": 0.3551058656882207
      },
      "mt_flores_101_jpn_eng": {
        "num_of_instances": 66, "counts": [935, 383, 193, 101], "totals": [1950, 1884, 1818, 1752],
        "precisions": [0.4794871794871795, 0.2032908704883227, 0.10616061606160615, 0.057648401826484015],
        "bp": 1.0, "sys_len": 1950, "ref_len": 1734,
        "sacrebleu": 0.15628287583119144, "score": 0.15628287583119144, "score_name": "sacrebleu",
        "score_ci_low": 0.1304739698304556, "score_ci_high": 0.19246721744185705, "sacrebleu_ci_low": 0.1304739698304556, "sacrebleu_ci_high": 0.19246721744185705
      },
      "mt_flores_101_kor_eng": {
        "num_of_instances": 66, "counts": [848, 302, 135, 60], "totals": [1872, 1806, 1740, 1674],
        "precisions": [0.452991452991453, 0.1672203765227021, 0.07758620689655173, 0.035842293906810034],
        "bp": 1.0, "sys_len": 1872, "ref_len": 1734,
        "sacrebleu": 0.12047304306149162, "score": 0.12047304306149162, "score_name": "sacrebleu",
        "score_ci_low": 0.10281238979764949, "score_ci_high": 0.15881384065042398, "sacrebleu_ci_low": 0.10281238979764949, "sacrebleu_ci_high": 0.15881384065042398
      },
      "mt_flores_101_por_eng": {
        "num_of_instances": 66, "counts": [1129, 652, 412, 268], "totals": [1783, 1717, 1651, 1585],
        "precisions": [0.6332024677509814, 0.3797320908561444, 0.24954572986069049, 0.1690851735015773],
        "bp": 1.0, "sys_len": 1783, "ref_len": 1734,
        "sacrebleu": 0.3173722073323666, "score": 0.3173722073323666, "score_name": "sacrebleu",
        "score_ci_low": 0.28432250677130166, "score_ci_high": 0.35844418554288615, "sacrebleu_ci_low": 0.28432250677130166, "sacrebleu_ci_high": 0.35844418554288615
      },
      "mt_flores_101_ron_eng": {
        "num_of_instances": 66, "counts": [1091, 601, 375, 245], "totals": [1793, 1727, 1661, 1595],
        "precisions": [0.6084774121583938, 0.3480023161551824, 0.2257676098735701, 0.1536050156739812],
        "bp": 1.0, "sys_len": 1793, "ref_len": 1734,
        "sacrebleu": 0.2927341616520049, "score": 0.2927341616520049, "score_name": "sacrebleu",
        "score_ci_low": 0.2507745489087611, "score_ci_high": 0.329051655313229, "sacrebleu_ci_low": 0.2507745489087611, "sacrebleu_ci_high": 0.329051655313229
      },
      "mt_flores_101_spa_eng": {
        "num_of_instances": 66, "counts": [1037, 482, 248, 129], "totals": [1796, 1730, 1664, 1598],
        "precisions": [0.5773942093541202, 0.2786127167630058, 0.14903846153846154, 0.0807259073842303],
        "bp": 1.0, "sys_len": 1796, "ref_len": 1734,
        "sacrebleu": 0.20974719583348747, "score": 0.20974719583348747, "score_name": "sacrebleu",
        "score_ci_low": 0.18522343537553757, "score_ci_high": 0.2489124348912048, "sacrebleu_ci_low": 0.18522343537553757, "sacrebleu_ci_high": 0.2489124348912048
      },
      "score": 0.21292389757753669,
      "score_name": "subsets_mean",
      "num_of_instances": 990
    },
    "score": 0.37890410445729916,
    "score_name": "subsets_mean",
    "num_of_instances": 12472
  }
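The sacrebleu entries in the mt_flores blocks above are internally consistent: each score equals the brevity penalty ("bp") times the geometric mean of the four n-gram "precisions", reported here on a 0-1 scale. A minimal sanity-check sketch in Python, reusing the eng_spa values above verbatim; the dict literal is only an excerpt for illustration, not a loader for these result files:

import math

# Excerpt of the "mt_flores_101_eng_spa" block above (values copied verbatim).
block = {
    "precisions": [0.567156862745098, 0.2922998986828774,
                   0.16876310272536688, 0.10586319218241043],
    "bp": 0.9719689956119355,
    "sacrebleu": 0.22674671169002888,
}

# BLEU = brevity penalty * geometric mean of the 1- to 4-gram precisions.
recomputed = block["bp"] * math.exp(
    sum(math.log(p) for p in block["precisions"]) / len(block["precisions"])
)

assert math.isclose(recomputed, block["sacrebleu"], rel_tol=1e-4)
print(f"recomputed sacrebleu: {recomputed:.6f}")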
results/bluebench/{2025-06-19T16-21-09_evaluation_results.json → 2025-06-23T04-06-37_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
@@ -42,7 +42,7 @@
@@ -176,103 +176,103 @@
@@ -285,54 +285,54 @@
@@ -348,392 +348,392 @@
@@ -741,68 +741,68 @@
@@ -810,473 +810,473 @@
[removed lines: the prior metric values are truncated in this diff view; the updated lines follow]
{
  "environment_info": {
    "timestamp_utc": "2025-06-23T08:06:33.434344Z",
    "command_line_invocation": [
      "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
      "--tasks",
      ...
      "cache_dir": null
    },
    "unitxt_version": "1.24.0",
    "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
    "python_version": "3.10.18",
    "system": "Linux",
    "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
    ...
  "results": {
    "bias": {
      "safety_bbq_age": { "accuracy": 0.5, "accuracy_ci_low": 0.4, "accuracy_ci_high": 0.6, "score_name": "accuracy", "score": 0.5, "score_ci_high": 0.6, "score_ci_low": 0.4, "num_of_instances": 90 },
      "safety_bbq_disability_status": { "accuracy": 0.6777777777777778, "accuracy_ci_low": 0.5777777777777777, "accuracy_ci_high": 0.7666666666666667, "score_name": "accuracy", "score": 0.6777777777777778, "score_ci_high": 0.7666666666666667, "score_ci_low": 0.5777777777777777, "num_of_instances": 90 },
      "safety_bbq_gender_identity": { "accuracy": 0.8333333333333334, "accuracy_ci_low": 0.7444444444444445, "accuracy_ci_high": 0.9, "score_name": "accuracy", "score": 0.8333333333333334, "score_ci_high": 0.9, "score_ci_low": 0.7444444444444445, "num_of_instances": 90 },
      "safety_bbq_nationality": { "accuracy": 0.6666666666666666, "accuracy_ci_low": 0.5666666666666667, "accuracy_ci_high": 0.7555555555555555, "score_name": "accuracy", "score": 0.6666666666666666, "score_ci_high": 0.7555555555555555, "score_ci_low": 0.5666666666666667, "num_of_instances": 90 },
      "safety_bbq_physical_appearance": { "accuracy": 0.7111111111111111, "accuracy_ci_low": 0.6111111111111112, "accuracy_ci_high": 0.8, "score_name": "accuracy", "score": 0.7111111111111111, "score_ci_high": 0.8, "score_ci_low": 0.6111111111111112, "num_of_instances": 90 },
      "safety_bbq_race_ethnicity": { "accuracy": 0.9333333333333333, "accuracy_ci_low": 0.8777777777777778, "accuracy_ci_high": 0.9777777777777777, "score_name": "accuracy", "score": 0.9333333333333333, "score_ci_high": 0.9777777777777777, "score_ci_low": 0.8777777777777778, "num_of_instances": 90 },
      "safety_bbq_race_x_gender": { "accuracy": 0.8444444444444444, "accuracy_ci_low": 0.7555555555555555, "accuracy_ci_high": 0.9048361867497154, "score_name": "accuracy", "score": 0.8444444444444444, "score_ci_high": 0.9048361867497154, "score_ci_low": 0.7555555555555555, "num_of_instances": 90 },
      "safety_bbq_race_x_ses": { "accuracy": 0.9111111111111111, "accuracy_ci_low": 0.8333333333333334, "accuracy_ci_high": 0.9555555555555556, "score_name": "accuracy", "score": 0.9111111111111111, "score_ci_high": 0.9555555555555556, "score_ci_low": 0.8333333333333334, "num_of_instances": 90 },
      "safety_bbq_religion": { "accuracy": 0.7444444444444445, "accuracy_ci_low": 0.6555555555555556, "accuracy_ci_high": 0.83090190108808, "score_name": "accuracy", "score": 0.7444444444444445, "score_ci_high": 0.83090190108808, "score_ci_low": 0.6555555555555556, "num_of_instances": 90 },
      "safety_bbq_ses": { "accuracy": 0.6777777777777778, "accuracy_ci_low": 0.5777777777777777, "accuracy_ci_high": 0.7666666666666667, "score_name": "accuracy", "score": 0.6777777777777778, "score_ci_high": 0.7666666666666667, "score_ci_low": 0.5777777777777777, "num_of_instances": 90 },
      "safety_bbq_sexual_orientation": {
        ...
        "score_ci_low": 0.6666666666666666,
        "num_of_instances": 90
      },
      "score": 0.7515151515151516,
      "score_name": "subsets_mean",
      "num_of_instances": 990
    },
    "chatbot_abilities": {
      "arena_hard_generation_english_gpt_4_0314_reference": { "num_of_instances": 500, "llama_3_70b_instruct_template_arena_hard": 0.08744186046511628, "score": 0.08744186046511628, "score_name": "llama_3_70b_instruct_template_arena_hard" },
      "score": 0.08744186046511628,
      "score_name": "subsets_mean",
      "num_of_instances": 500
    },
    "entity_extraction": {
      "universal_ner_en_ewt": {
        "num_of_instances": 1000,
        "f1_Person": 0.52046783625731, "f1_Location": 0.3275862068965517, "f1_Organization": 0.3905723905723905,
        "f1_macro": 0.41287547790875073, "recall_macro": 0.34275188964299236, "precision_macro": 0.5261312195216724,
        "in_classes_support": 0.5945017182130584,
        "f1_micro": 0.3342366757000903, "recall_micro": 0.3523809523809524, "precision_micro": 0.3178694158075601,
        "score": 0.3342366757000903, "score_name": "f1_micro",
        "score_ci_low": 0.29025426476142113, "score_ci_high": 0.38246190736620644, "f1_micro_ci_low": 0.29025426476142113, "f1_micro_ci_high": 0.38246190736620644
      },
      "score": 0.3342366757000903,
      "score_name": "subsets_mean",
      "num_of_instances": 1000
    },
    "knowledge": {
      "mmlu_pro_biology": { "accuracy": 0.4225352112676056, "accuracy_ci_low": 0.30985915492957744, "accuracy_ci_high": 0.5352112676056338, "score_name": "accuracy", "score": 0.4225352112676056, "score_ci_high": 0.5352112676056338, "score_ci_low": 0.30985915492957744, "num_of_instances": 71 },
      "mmlu_pro_business": {
      ...
      "mmlu_pro_chemistry": { "accuracy": 0.23943661971830985, "accuracy_ci_low": 0.15492957746478872, "accuracy_ci_high": 0.352112676056338, "score_name": "accuracy", "score": 0.23943661971830985, "score_ci_high": 0.352112676056338, "score_ci_low": 0.15492957746478872, "num_of_instances": 71 },
      "mmlu_pro_computer_science": { "accuracy": 0.4084507042253521, "accuracy_ci_low": 0.29577464788732394, "accuracy_ci_high": 0.5211267605633803, "score_name": "accuracy", "score": 0.4084507042253521, "score_ci_high": 0.5211267605633803, "score_ci_low": 0.29577464788732394, "num_of_instances": 71 },
      "mmlu_pro_economics": { "accuracy": 0.4084507042253521, "accuracy_ci_low": 0.29577464788732394, "accuracy_ci_high": 0.5211267605633803, "score_name": "accuracy", "score": 0.4084507042253521, "score_ci_high": 0.5211267605633803, "score_ci_low": 0.29577464788732394, "num_of_instances": 71 },
      "mmlu_pro_engineering": { "accuracy": 0.23943661971830985, "accuracy_ci_low": 0.15492957746478872, "accuracy_ci_high": 0.3380281690140845, "score_name": "accuracy", "score": 0.23943661971830985, "score_ci_high": 0.3380281690140845, "score_ci_low": 0.15492957746478872, "num_of_instances": 71 },
      "mmlu_pro_health": { "accuracy": 0.352112676056338, "accuracy_ci_low": 0.23943661971830985, "accuracy_ci_high": 0.4647887323943662, "score_name": "accuracy", "score": 0.352112676056338, "score_ci_high": 0.4647887323943662, "score_ci_low": 0.23943661971830985, "num_of_instances": 71 },
      "mmlu_pro_history": { "accuracy": 0.323943661971831, "accuracy_ci_low": 0.21693057179778907, "accuracy_ci_high": 0.43661971830985913, "score_name": "accuracy", "score": 0.323943661971831, "score_ci_high": 0.43661971830985913, "score_ci_low": 0.21693057179778907, "num_of_instances": 71 },
      "mmlu_pro_law": { "accuracy": 0.30985915492957744, "accuracy_ci_low": 0.2112676056338028, "accuracy_ci_high": 0.42459270101591795, "score_name": "accuracy", "score": 0.30985915492957744, "score_ci_high": 0.42459270101591795, "score_ci_low": 0.2112676056338028, "num_of_instances": 71 },
      "mmlu_pro_math": { "accuracy": 0.09859154929577464, "accuracy_ci_low": 0.04225352112676056, "accuracy_ci_high": 0.17777703477060838, "score_name": "accuracy", "score": 0.09859154929577464, "score_ci_high": 0.17777703477060838, "score_ci_low": 0.04225352112676056, "num_of_instances": 71 },
      "mmlu_pro_other": { "accuracy": 0.323943661971831, "accuracy_ci_low": 0.22338079742223388, "accuracy_ci_high": 0.43661971830985913, "score_name": "accuracy", "score": 0.323943661971831, "score_ci_high": 0.43661971830985913, "score_ci_low": 0.22338079742223388, "num_of_instances": 71 },
      "mmlu_pro_philosophy": { "accuracy": 0.4647887323943662, "accuracy_ci_low": 0.352112676056338, "accuracy_ci_high": 0.5915492957746479, "score_name": "accuracy", "score": 0.4647887323943662, "score_ci_high": 0.5915492957746479, "score_ci_low": 0.352112676056338, "num_of_instances": 71 },
      "mmlu_pro_physics": { "accuracy": 0.18309859154929578, "accuracy_ci_low": 0.11267605633802817, "accuracy_ci_high": 0.28169014084507044, "score_name": "accuracy", "score": 0.18309859154929578, "score_ci_high": 0.28169014084507044, "score_ci_low": 0.11267605633802817, "num_of_instances": 71 },
      "mmlu_pro_psychology": { "accuracy": 0.5352112676056338, "accuracy_ci_low": 0.4225352112676056, "accuracy_ci_high": 0.647887323943662, "score_name": "accuracy", "score": 0.5352112676056338, "score_ci_high": 0.647887323943662, "score_ci_low": 0.4225352112676056, "num_of_instances": 71 },
      "score": 0.32193158953722334,
      "score_name": "subsets_mean",
      "num_of_instances": 994
    },
    "legal": {
      "legalbench_abercrombie": {
        "f1_macro": 0.3075373413771583, "f1_suggestive": 0.36363636363636365, "f1_arbitrary": 0.28, "f1_generic": 0.3157894736842105, "f1_fanciful": 0.1, "f1_descriptive": 0.4782608695652174,
        "f1_macro_ci_low": 0.22135682385238098, "f1_macro_ci_high": 0.4258827689087187,
        "score_name": "f1_micro", "score": 0.33121019108280253, "score_ci_high": 0.43513626025637364, "score_ci_low": 0.22818791946308725,
        "num_of_instances": 85,
        "accuracy": 0.3058823529411765, "accuracy_ci_low": 0.21176470588235294, "accuracy_ci_high": 0.4,
        "f1_micro": 0.33121019108280253, "f1_micro_ci_low": 0.22818791946308725, "f1_micro_ci_high": 0.43513626025637364
      },
      "legalbench_corporate_lobbying": {
        "f1_macro": 0.563568215892054, "f1_no": 0.7793103448275862, "f1_yes": 0.34782608695652173,
        "f1_macro_ci_low": 0.49159571105513383, "f1_macro_ci_high": 0.6365342652768683,
        "score_name": "f1_micro", "score": 0.675392670157068, "score_ci_high": 0.73489030467135, "score_ci_low": 0.608918205032967,
        "num_of_instances": 200,
        "accuracy": 0.645, "accuracy_ci_low": 0.58, "accuracy_ci_high": 0.705,
        "f1_micro": 0.675392670157068, "f1_micro_ci_low": 0.608918205032967, "f1_micro_ci_high": 0.73489030467135
      },
      "legalbench_function_of_decision_section": {
        "f1_macro": 0.3062664077216316, "f1_conclusion": 0.20833333333333334, "f1_decree": 0.1875, "f1_rule": 0.47761194029850745, "f1_issue": 0.25, "f1_analysis": 0.44, "f1_facts": 0.2727272727272727, "f1_procedural history": 0.3076923076923077,
        "f1_macro_ci_low": 0.24659229419876927, "f1_macro_ci_high": 0.3810118235674986,
        "score_name": "f1_micro", "score": 0.3209169054441261, "score_ci_high": 0.3885967259042703, "score_ci_low": 0.2564102564102564,
        "num_of_instances": 200,
        "accuracy": 0.28, "accuracy_ci_low": 0.22, "accuracy_ci_high": 0.34,
        "f1_micro": 0.3209169054441261, "f1_micro_ci_low": 0.2564102564102564, "f1_micro_ci_high": 0.3885967259042703
      },
      "legalbench_international_citizenship_questions": {
        "f1_macro": 0.5153888280394305, "f1_yes": 0.6090909090909091, "f1_no": 0.42168674698795183,
        "f1_macro_ci_low": 0.44286956267940425, "f1_macro_ci_high": 0.5822132955205006,
        "score_name": "f1_micro", "score": 0.5284974093264249, "score_ci_high": 0.5917634471129095, "score_ci_low": 0.4572437728690022,
        "num_of_instances": 200,
        "accuracy": 0.51, "accuracy_ci_low": 0.44, "accuracy_ci_high": 0.575,
        "f1_micro": 0.5284974093264249, "f1_micro_ci_low": 0.4572437728690022, "f1_micro_ci_high": 0.5917634471129095
      },
      "legalbench_proa": {
        "f1_macro": 0.8555844155844157, "f1_yes": 0.88, "f1_no": 0.8311688311688312,
        "f1_macro_ci_low": 0.7810530949191602, "f1_macro_ci_high": 0.911227513400763,
        "score_name": "f1_micro", "score": 0.8552631578947368, "score_ci_high": 0.9104714274063991, "score_ci_low": 0.7791920429268818,
        "num_of_instances": 85,
        "accuracy": 0.7647058823529411, "accuracy_ci_low": 0.6705882352941176, "accuracy_ci_high": 0.8470588235294118,
        "f1_micro": 0.8552631578947368, "f1_micro_ci_low": 0.7791920429268818, "f1_micro_ci_high": 0.9104714274063991
      },
      "score": 0.5422560667810317,
      "score_name": "subsets_mean",
      "num_of_instances": 770
    },
    "news_classification": {
      "20_newsgroups_short": {
        "f1_macro": 0.4238304887304002,
        "f1_cars": 0.6534653465346535, "f1_pc hardware": 0.38095238095238093, "f1_windows x": 0.0, "f1_atheism": 0.2727272727272727, "f1_religion": 0.22641509433962265, "f1_medicine": 0.7901234567901234, "f1_christianity": 0.1694915254237288,
        "f1_computer graphics": 0.3373493975903614, "f1_microsoft windows": 0.37681159420289856, "f1_middle east": 0.4594594594594595, "f1_politics": 0.27906976744186046, "f1_motorcycles": 0.4883720930232558, "f1_mac hardware": 0.03125, "f1_for sale": 0.6461538461538462,
        "f1_guns": 0.18518518518518517, "f1_space": 0.575, "f1_cryptography": 0.5079365079365079, "f1_baseball": 0.8468468468468469, "f1_hockey": 0.85, "f1_electronics": 0.4,
        "f1_macro_ci_low": 0.39859097081154116, "f1_macro_ci_high": 0.4545535978307604,
        "score_name": "f1_micro", "score": 0.4485549132947977, "score_ci_high": 0.4787668189917876, "score_ci_low": 0.41661505505349583,
        "num_of_instances": 1000,
        "accuracy": 0.388, "accuracy_ci_low": 0.358, "accuracy_ci_high": 0.4198351175250287,
        "f1_micro": 0.4485549132947977, "f1_micro_ci_low": 0.41661505505349583, "f1_micro_ci_high": 0.4787668189917876
      },
      "score": 0.4485549132947977,
      "score_name": "subsets_mean",
      "num_of_instances": 1000
    },
    "product_help": {
      "cfpb_product_2023": {
        "f1_macro": 0.6126200216184788,
        "f1_credit reporting or credit repair services or other personal consumer reports": 0.9002217294900222, "f1_checking or savings account": 0.6451612903225806, "f1_debt collection": 0.5066666666666667, "f1_credit card or prepaid card": 0.6277372262773723, "f1_mortgage": 0.7945205479452054,
        "f1_student loan": 0.8461538461538461, "f1_money transfer or virtual currency or money service": 0.4878048780487805, "f1_payday loan or title loan or personal loan": 0.2608695652173913, "f1_vehicle loan or lease": 0.4444444444444444,
        "f1_macro_ci_low": 0.557724310665768, "f1_macro_ci_high": 0.6722482288571774,
        "score_name": "f1_micro", "score": 0.8101924076963078, "score_ci_high": 0.8332463122584837, "score_ci_low": 0.7850628587071383,
        "num_of_instances": 1000,
        "accuracy": 0.779, "accuracy_ci_low": 0.752, "accuracy_ci_high": 0.805,
        "f1_micro": 0.8101924076963078, "f1_micro_ci_low": 0.7850628587071383, "f1_micro_ci_high": 0.8332463122584837
      },
      "cfpb_product_watsonx": {
        "f1_macro": 0.6957226327824474,
        "f1_mortgages and loans": 0.7861271676300579, "f1_credit card": 0.735632183908046, "f1_debt collection": 0.6605504587155964, "f1_retail banking": 0.5853658536585366, "f1_credit reporting": 0.7109375,
        "f1_macro_ci_low": 0.6549701202052777, "f1_macro_ci_high": 0.7382657469246365,
        "score_name": "f1_micro", "score": 0.701271186440678, "score_ci_high": 0.7411785857709632, "score_ci_low": 0.6609516931464113,
        "num_of_instances": 500,
        "accuracy": 0.662, "accuracy_ci_low": 0.6202110366430569, "accuracy_ci_high": 0.706,
        "f1_micro": 0.701271186440678, "f1_micro_ci_low": 0.6609516931464113, "f1_micro_ci_high": 0.7411785857709632
      },
      "score": 0.7557317970684929,
      "score_name": "subsets_mean",
      "num_of_instances": 1500
    },
    "qa_finance": {
      "fin_qa": {
        "num_of_instances": 1000,
        "program_accuracy": 0.084, "score": 0.084, "score_name": "program_accuracy", "execution_accuracy": 0.073,
        "program_accuracy_ci_low": 0.067, "program_accuracy_ci_high": 0.10386305691021766,
        "score_ci_low": 0.067, "score_ci_high": 0.10386305691021766,
        "execution_accuracy_ci_low": 0.057, "execution_accuracy_ci_high": 0.091
      },
      "score": 0.084,
      "score_name": "subsets_mean",
      "num_of_instances": 1000
    },
    "rag_general": {
      "rag_response_generation_clapnq": {
        "precision": 0.2967925946544494, "recall": 0.5841362559189178, "f1": 0.3364987383722942,
        "precision_ci_low": 0.2764979602656248, "precision_ci_high": 0.3159857611493305, "recall_ci_low": 0.5663103849045117, "recall_ci_high": 0.6007142202829963, "f1_ci_low": 0.3178259767982501, "f1_ci_high": 0.35351716805909167,
        "score_name": "f1", "score": 0.3364987383722942, "score_ci_high": 0.35351716805909167, "score_ci_low": 0.3178259767982501,
        "num_of_instances": 600,
        "correctness_f1_bert_score.deberta_large_mnli": 0.5976409501334031, "correctness_recall_bert_score.deberta_large_mnli": 0.6827784284452597, "correctness_precision_bert_score.deberta_large_mnli": 0.5494535167018573,
        "faithfullness_f1_token_overlap": 0.34046513614196555, "faithfullness_recall_token_overlap": 0.2738914194701684, "faithfullness_precision_token_overlap": 0.5609296227002613,
        "correctness_f1_token_overlap": 0.3364987383722942, "correctness_recall_token_overlap": 0.5841362559189178, "correctness_precision_token_overlap": 0.2967925946544494
      },
      "score": 0.3364987383722942,
      "score_name": "subsets_mean",
      "num_of_instances": 600
    },
    "reasoning": {
      "hellaswag": { "accuracy": 0.448, "accuracy_ci_low": 0.417, "accuracy_ci_high": 0.479, "score_name": "accuracy", "score": 0.448, "score_ci_high": 0.479, "score_ci_low": 0.417, "num_of_instances": 1000 },
      "openbook_qa": { "accuracy": 0.78, "accuracy_ci_low": 0.742, "accuracy_ci_high": 0.816, "score_name": "accuracy", "score": 0.78, "score_ci_high": 0.816, "score_ci_low": 0.742, "num_of_instances": 500 },
      "score": 0.614,
      "score_name": "subsets_mean",
      "num_of_instances": 1500
    },
    ...
      "attaq_500": {
        "severity_critical": 0.0, "severity_high": 0.0, "severity_medium": 1.0, "severity_low": 99.0,
        "category_harmful_info": 0.8719441451812312, "category_deception": 0.888891483497138, "category_explicit_content": 0.8127973503386131, "category_violence": 0.8689369508242678, "category_discrimination": 0.8825277714502243, "category_substance_abuse": 0.9410604351096683, "category_pii": 0.7720797760067163,
        "safety": 0.8674504241983716, "safety_ci_low": 0.847960857428302, "safety_ci_high": 0.8844523349924212,
        "score_name": "safety", "score": 0.8674504241983716, "score_ci_high": 0.8844523349924212, "score_ci_low": 0.847960857428302,
        "num_of_instances": 100
      },
      "score": 0.8674504241983716,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "summarization": {
      "billsum_document_filtered_to_6000_chars": {
        "num_of_instances": 528,
        "rouge1": 0.42001432363059676, "rougeLsum": 0.35176737784751416, "rouge2": 0.20090339450058858, "rougeL": 0.2887794256459243,
        "score": 0.2887794256459243, "score_name": "rougeL",
        "rouge1_ci_low": 0.4115358503100108, "rouge1_ci_high": 0.42788391457002284,
        "rougeLsum_ci_low": 0.3432569747337183, "rougeLsum_ci_high": 0.35847372198631006,
        "rouge2_ci_low": 0.19492457543616534, "rouge2_ci_high": 0.2075223454056542,
        "rougeL_ci_low": 0.28220405170841467, "rougeL_ci_high": 0.2953572975976334,
        "score_ci_low": 0.28220405170841467, "score_ci_high": 0.2953572975976334
      },
      "tldr_document_filtered_to_6000_chars": {
        "num_of_instances": 1000,
        "rouge1": 0.11196786218861304, "rougeLsum": 0.09259288227162547, "rouge2": 0.014304299542517345, "rougeL": 0.08050830498137622,
        "score": 0.08050830498137622, "score_name": "rougeL",
        "rouge1_ci_low": 0.10730072397656114, "rouge1_ci_high": 0.11708078514416911,
        "rougeLsum_ci_low": 0.088799408920107, "rougeLsum_ci_high": 0.09663277250494734,
        "rouge2_ci_low": 0.01265361386023307, "rouge2_ci_high": 0.01610624039999516,
        "rougeL_ci_low": 0.07745615703093822, "rougeL_ci_high": 0.08426746560170988,
        "score_ci_low": 0.07745615703093822, "score_ci_high": 0.08426746560170988
      },
      "score": 0.18464386531365026,
      "score_name": "subsets_mean",
      "num_of_instances": 1528
    },
    ...
      "mt_flores_101_ara_eng": {
        "num_of_instances": 66, "counts": [1147, 635, 377, 236], "totals": [2783, 2717, 2651, 2585],
        "precisions": [0.41214516708587856, 0.23371365476628636, 0.14221048660882685, 0.09129593810444873],
        "bp": 1.0, "sys_len": 2783, "ref_len": 1734,
        "sacrebleu": 0.18805260077651942, "score": 0.18805260077651942, "score_name": "sacrebleu",
        "score_ci_low": 0.15590859876442242, "score_ci_high": 0.22665030743269873, "sacrebleu_ci_low": 0.15590859876442242, "sacrebleu_ci_high": 0.22665030743269873
      },
      "mt_flores_101_deu_eng": {
        "num_of_instances": 66, "counts": [1250, 740, 469, 298], "totals": [3365, 3299, 3233, 3167],
        "precisions": [0.37147102526002973, 0.22431039709002729, 0.1450665017012063, 0.09409535838332808],
        "bp": 1.0, "sys_len": 3365, "ref_len": 1734,
        "sacrebleu": 0.18364428677137226, "score": 0.18364428677137226, "score_name": "sacrebleu",
        "score_ci_low": 0.15133175793244782, "score_ci_high": 0.238285104264321, "sacrebleu_ci_low": 0.15133175793244782, "sacrebleu_ci_high": 0.238285104264321
      },
      "mt_flores_101_eng_ara": {
        "num_of_instances": 66, "counts": [701, 279, 122, 55], "totals": [2379, 2313, 2247, 2181],
        "precisions": [0.294661622530475, 0.12062256809338522, 0.054294615042278595, 0.02521779000458505],
        "bp": 1.0, "sys_len": 2379, "ref_len": 1589,
        "sacrebleu": 0.08352259557657876, "score": 0.08352259557657876, "score_name": "sacrebleu",
        "score_ci_low": 0.06605026431332355, "score_ci_high": 0.10504705952927867, "sacrebleu_ci_low": 0.06605026431332355, "sacrebleu_ci_high": 0.10504705952927867
      },
      "mt_flores_101_eng_deu": {
        "num_of_instances": 66, "counts": [1060, 555, 321, 197], "totals": [2307, 2241, 2175, 2109],
        "precisions": [0.45947117468573906, 0.24765729585006693, 0.14758620689655172, 0.09340919867235657],
        "bp": 1.0, "sys_len": 2307, "ref_len": 1835,
        "sacrebleu": 0.19901517998924645, "score": 0.19901517998924645, "score_name": "sacrebleu",
        "score_ci_low": 0.15118718491370434, "score_ci_high": 0.24524942034353023, "sacrebleu_ci_low": 0.15118718491370434, "sacrebleu_ci_high": 0.24524942034353023
      },
      "mt_flores_101_eng_fra": {
        "num_of_instances": 66, "counts": [1383, 931, 685, 511], "totals": [2499, 2433, 2367, 2301],
        "precisions": [0.553421368547419, 0.3826551582408549, 0.28939585973806503, 0.222077357670578],
        "bp": 1.0, "sys_len": 2499, "ref_len": 2068,
        "sacrebleu": 0.34155844112464445, "score": 0.34155844112464445, "score_name": "sacrebleu",
        "score_ci_low": 0.2882755414660873, "score_ci_high": 0.40044941880570056, "sacrebleu_ci_low": 0.2882755414660873, "sacrebleu_ci_high": 0.40044941880570056
      },
      "mt_flores_101_eng_kor": {
        "num_of_instances": 66, "counts": [1091, 445, 224, 119], "totals": [4751, 4685, 4619, 4553],
        "precisions": [0.2296358661334456, 0.09498399146211313, 0.04849534531283828, 0.026136613222051394],
        "bp": 1.0, "sys_len": 4751, "ref_len": 2235,
        "sacrebleu": 0.07251199865213667, "score": 0.07251199865213667, "score_name": "sacrebleu",
        "score_ci_low": 0.05587322252934343, "score_ci_high": 0.09184707565044344, "sacrebleu_ci_low": 0.05587322252934343, "sacrebleu_ci_high": 0.09184707565044344
      },
      "mt_flores_101_eng_por": {
        "num_of_instances": 66, "counts": [1335, 873, 615, 436], "totals": [3124, 3058, 2992, 2926],
        "precisions": [0.427336747759283, 0.2854807063440157, 0.20554812834224598, 0.14900888585099112],
        "bp": 1.0, "sys_len": 3124,
        "ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.24723968084246245,
|
1020 |
+
"score": 0.24723968084246245,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.1931280328121111,
|
1023 |
+
"score_ci_high": 0.3044993993983362,
|
1024 |
+
"sacrebleu_ci_low": 0.1931280328121111,
|
1025 |
+
"sacrebleu_ci_high": 0.3044993993983362
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
897,
|
1031 |
+
407,
|
1032 |
+
221,
|
1033 |
+
130
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
+
2626,
|
1037 |
+
2560,
|
1038 |
+
2494,
|
1039 |
+
2428
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
+
0.3415841584158416,
|
1043 |
+
0.158984375,
|
1044 |
+
0.08861267040898156,
|
1045 |
+
0.05354200988467875
|
1046 |
],
|
1047 |
"bp": 1.0,
|
1048 |
+
"sys_len": 2626,
|
1049 |
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.12669534688031472,
|
1051 |
+
"score": 0.12669534688031472,
|
1052 |
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.1000733380304748,
|
1054 |
+
"score_ci_high": 0.16178959885111238,
|
1055 |
+
"sacrebleu_ci_low": 0.1000733380304748,
|
1056 |
+
"sacrebleu_ci_high": 0.16178959885111238
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
+
1203,
|
1062 |
+
616,
|
1063 |
+
335,
|
1064 |
+
184
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
+
2913,
|
1068 |
+
2847,
|
1069 |
+
2781,
|
1070 |
+
2715
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
+
0.41297631307929966,
|
1074 |
+
0.21636810677906568,
|
1075 |
+
0.12046026609133405,
|
1076 |
+
0.06777163904235727
|
1077 |
],
|
1078 |
"bp": 1.0,
|
1079 |
+
"sys_len": 2913,
|
1080 |
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.16434350639643316,
|
1082 |
+
"score": 0.16434350639643316,
|
1083 |
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.1325515600195346,
|
1085 |
+
"score_ci_high": 0.20299768479868893,
|
1086 |
+
"sacrebleu_ci_low": 0.1325515600195346,
|
1087 |
+
"sacrebleu_ci_high": 0.20299768479868893
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
+
1254,
|
1093 |
+
732,
|
1094 |
+
469,
|
1095 |
+
322
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
+
3278,
|
1099 |
+
3212,
|
1100 |
+
3146,
|
1101 |
+
3080
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
+
0.3825503355704698,
|
1105 |
+
0.22789539227895392,
|
1106 |
+
0.14907819453274,
|
1107 |
+
0.10454545454545455
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
+
"sys_len": 3278,
|
1111 |
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.19199320250461963,
|
1113 |
+
"score": 0.19199320250461963,
|
1114 |
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.15725080687797277,
|
1116 |
+
"score_ci_high": 0.24193576258661156,
|
1117 |
+
"sacrebleu_ci_low": 0.15725080687797277,
|
1118 |
+
"sacrebleu_ci_high": 0.24193576258661156
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
+
1040,
|
1124 |
+
474,
|
1125 |
+
264,
|
1126 |
+
161
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
+
3106,
|
1130 |
+
3040,
|
1131 |
+
2974,
|
1132 |
+
2908
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
+
0.334835801674179,
|
1136 |
+
0.15592105263157896,
|
1137 |
+
0.08876933422999328,
|
1138 |
+
0.05536451169188446
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
+
"sys_len": 3106,
|
1142 |
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.1265632943373452,
|
1144 |
+
"score": 0.1265632943373452,
|
1145 |
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.10051707288201024,
|
1147 |
+
"score_ci_high": 0.15415847724283543,
|
1148 |
+
"sacrebleu_ci_low": 0.10051707288201024,
|
1149 |
+
"sacrebleu_ci_high": 0.15415847724283543
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
+
978,
|
1155 |
+
426,
|
1156 |
+
216,
|
1157 |
+
119
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
+
3053,
|
1161 |
+
2987,
|
1162 |
+
2921,
|
1163 |
+
2855
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
+
0.3203406485424173,
|
1167 |
+
0.14261801138265817,
|
1168 |
+
0.07394727832933927,
|
1169 |
+
0.04168126094570928
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
+
"sys_len": 3053,
|
1173 |
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.10893372822633232,
|
1175 |
+
"score": 0.10893372822633232,
|
1176 |
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.08051704910100821,
|
1178 |
+
"score_ci_high": 0.1399793521343314,
|
1179 |
+
"sacrebleu_ci_low": 0.08051704910100821,
|
1180 |
+
"sacrebleu_ci_high": 0.1399793521343314
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
+
1279,
|
1186 |
+
820,
|
1187 |
+
574,
|
1188 |
+
416
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
+
2919,
|
1192 |
+
2853,
|
1193 |
+
2787,
|
1194 |
+
2721
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
+
0.4381637547105173,
|
1198 |
+
0.28741675429372593,
|
1199 |
+
0.2059562253318981,
|
1200 |
+
0.15288496876148475
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
+
"sys_len": 2919,
|
1204 |
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.250941252136478,
|
1206 |
+
"score": 0.250941252136478,
|
1207 |
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.19373911879549774,
|
1209 |
+
"score_ci_high": 0.3143574197034948,
|
1210 |
+
"sacrebleu_ci_low": 0.19373911879549774,
|
1211 |
+
"sacrebleu_ci_high": 0.3143574197034948
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
+
1215,
|
1217 |
+
691,
|
1218 |
+
421,
|
1219 |
+
259
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
+
2920,
|
1223 |
+
2854,
|
1224 |
+
2788,
|
1225 |
+
2722
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
+
0.41609589041095885,
|
1229 |
+
0.24211632796075683,
|
1230 |
+
0.15100430416068866,
|
1231 |
+
0.09515062454077883
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
+
"sys_len": 2920,
|
1235 |
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.19505389122054267,
|
1237 |
+
"score": 0.19505389122054267,
|
1238 |
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.16097395914199633,
|
1240 |
+
"score_ci_high": 0.23627234222780022,
|
1241 |
+
"sacrebleu_ci_low": 0.16097395914199633,
|
1242 |
+
"sacrebleu_ci_high": 0.23627234222780022
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
+
1158,
|
1248 |
+
586,
|
1249 |
+
328,
|
1250 |
+
189
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
+
3432,
|
1254 |
+
3366,
|
1255 |
+
3300,
|
1256 |
+
3234
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
+
0.3374125874125874,
|
1260 |
+
0.1740938799762329,
|
1261 |
+
0.0993939393939394,
|
1262 |
+
0.05844155844155845
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
+
"sys_len": 3432,
|
1266 |
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.1359116294436951,
|
1268 |
+
"score": 0.1359116294436951,
|
1269 |
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.10538411838136834,
|
1271 |
+
"score_ci_high": 0.18075724261232987,
|
1272 |
+
"sacrebleu_ci_low": 0.10538411838136834,
|
1273 |
+
"sacrebleu_ci_high": 0.18075724261232987
|
1274 |
},
|
1275 |
+
"score": 0.17439870899191476,
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
+
"score": 0.42328152240293343,
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
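Note on the translation scores above: each mt_flores_101_* block reports the raw BLEU statistics (counts, totals, precisions, bp, sys_len, ref_len) alongside the final "sacrebleu" value, so the score can be re-derived from the block itself. Below is a minimal sketch, assuming the standard BLEU formula (brevity penalty times the geometric mean of the four n-gram precisions) and using the mt_flores_101_ara_eng numbers above; it is an illustration, not the sacrebleu implementation itself.

import math

# Values copied from the mt_flores_101_ara_eng block above
precisions = [0.41214516708587856, 0.23371365476628636,
              0.14221048660882685, 0.09129593810444873]
bp = 1.0  # brevity penalty ("bp"); 1.0 because sys_len (2783) exceeds ref_len (1734)

# BLEU = bp * exp(mean of log n-gram precisions)
bleu = bp * math.exp(sum(math.log(p) for p in precisions) / len(precisions))
print(round(bleu, 5))  # ~0.18805, matching the reported "sacrebleu"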
results/bluebench/2025-06-23T04-42-35_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
|
1 |
+
{
|
2 |
+
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-23T08:42:31.876970Z",
|
4 |
+
"command_line_invocation": [
|
5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
+
"--tasks",
|
7 |
+
"benchmarks.bluebench",
|
8 |
+
"--model",
|
9 |
+
"cross_provider",
|
10 |
+
"--model_args",
|
11 |
+
"model_name=watsonx/ibm/granite-3-8b-instruct,max_tokens=256",
|
12 |
+
"--output_path",
|
13 |
+
"./results/bluebench",
|
14 |
+
"--log_samples",
|
15 |
+
"--trust_remote_code",
|
16 |
+
"--batch_size",
|
17 |
+
"8",
|
18 |
+
"--verbosity",
|
19 |
+
"ERROR"
|
20 |
+
],
|
21 |
+
"parsed_arguments": {
|
22 |
+
"tasks": [
|
23 |
+
"benchmarks.bluebench"
|
24 |
+
],
|
25 |
+
"split": "test",
|
26 |
+
"num_fewshots": null,
|
27 |
+
"limit": null,
|
28 |
+
"batch_size": 8,
|
29 |
+
"model": "watsonx/ibm/granite-3-8b-instruct",
|
30 |
+
"model_args": {
|
31 |
+
"max_tokens": 256
|
32 |
+
},
|
33 |
+
"gen_kwargs": null,
|
34 |
+
"chat_template_kwargs": null,
|
35 |
+
"output_path": "./results/bluebench",
|
36 |
+
"output_file_prefix": "evaluation_results",
|
37 |
+
"log_samples": true,
|
38 |
+
"verbosity": "ERROR",
|
39 |
+
"apply_chat_template": false,
|
40 |
+
"trust_remote_code": true,
|
41 |
+
"disable_hf_cache": false,
|
42 |
+
"cache_dir": null
|
43 |
+
},
|
44 |
+
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
+
"python_version": "3.10.18",
|
47 |
+
"system": "Linux",
|
48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
49 |
+
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
+
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
63 |
+
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
78 |
+
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
+
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
+
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
+
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
+
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
108 |
+
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
112 |
+
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
+
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
+
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
137 |
+
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
+
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
+
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
+
"jaraco.text": "3.12.1",
|
170 |
+
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"results": {
|
177 |
+
"bias": {
|
178 |
+
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.6,
|
180 |
+
"accuracy_ci_low": 0.4961662149523231,
|
181 |
+
"accuracy_ci_high": 0.6888888888888889,
|
182 |
+
"score_name": "accuracy",
|
183 |
+
"score": 0.6,
|
184 |
+
"score_ci_high": 0.6888888888888889,
|
185 |
+
"score_ci_low": 0.4961662149523231,
|
186 |
+
"num_of_instances": 90
|
187 |
+
},
|
188 |
+
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.6888888888888889,
|
190 |
+
"accuracy_ci_low": 0.5777777777777777,
|
191 |
+
"accuracy_ci_high": 0.7666666666666667,
|
192 |
+
"score_name": "accuracy",
|
193 |
+
"score": 0.6888888888888889,
|
194 |
+
"score_ci_high": 0.7666666666666667,
|
195 |
+
"score_ci_low": 0.5777777777777777,
|
196 |
+
"num_of_instances": 90
|
197 |
+
},
|
198 |
+
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.9,
|
200 |
+
"accuracy_ci_low": 0.8222222222222222,
|
201 |
+
"accuracy_ci_high": 0.9555555555555556,
|
202 |
+
"score_name": "accuracy",
|
203 |
+
"score": 0.9,
|
204 |
+
"score_ci_high": 0.9555555555555556,
|
205 |
+
"score_ci_low": 0.8222222222222222,
|
206 |
+
"num_of_instances": 90
|
207 |
+
},
|
208 |
+
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.6888888888888889,
|
210 |
+
"accuracy_ci_low": 0.5780895036995246,
|
211 |
+
"accuracy_ci_high": 0.7888888888888889,
|
212 |
+
"score_name": "accuracy",
|
213 |
+
"score": 0.6888888888888889,
|
214 |
+
"score_ci_high": 0.7888888888888889,
|
215 |
+
"score_ci_low": 0.5780895036995246,
|
216 |
+
"num_of_instances": 90
|
217 |
+
},
|
218 |
+
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.7888888888888889,
|
220 |
+
"accuracy_ci_low": 0.689667704010142,
|
221 |
+
"accuracy_ci_high": 0.8555555555555555,
|
222 |
+
"score_name": "accuracy",
|
223 |
+
"score": 0.7888888888888889,
|
224 |
+
"score_ci_high": 0.8555555555555555,
|
225 |
+
"score_ci_low": 0.689667704010142,
|
226 |
+
"num_of_instances": 90
|
227 |
+
},
|
228 |
+
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.9222222222222223,
|
230 |
+
"accuracy_ci_low": 0.8444444444444444,
|
231 |
+
"accuracy_ci_high": 0.9666666666666667,
|
232 |
+
"score_name": "accuracy",
|
233 |
+
"score": 0.9222222222222223,
|
234 |
+
"score_ci_high": 0.9666666666666667,
|
235 |
+
"score_ci_low": 0.8444444444444444,
|
236 |
+
"num_of_instances": 90
|
237 |
+
},
|
238 |
+
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.9,
|
240 |
+
"accuracy_ci_low": 0.8222222222222222,
|
241 |
+
"accuracy_ci_high": 0.9555555555555556,
|
242 |
+
"score_name": "accuracy",
|
243 |
+
"score": 0.9,
|
244 |
+
"score_ci_high": 0.9555555555555556,
|
245 |
+
"score_ci_low": 0.8222222222222222,
|
246 |
+
"num_of_instances": 90
|
247 |
+
},
|
248 |
+
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.9555555555555556,
|
250 |
+
"accuracy_ci_low": 0.9,
|
251 |
+
"accuracy_ci_high": 0.9888888888888889,
|
252 |
+
"score_name": "accuracy",
|
253 |
+
"score": 0.9555555555555556,
|
254 |
+
"score_ci_high": 0.9888888888888889,
|
255 |
+
"score_ci_low": 0.9,
|
256 |
+
"num_of_instances": 90
|
257 |
+
},
|
258 |
+
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.8222222222222222,
|
260 |
+
"accuracy_ci_low": 0.7333333333333333,
|
261 |
+
"accuracy_ci_high": 0.8888888888888888,
|
262 |
+
"score_name": "accuracy",
|
263 |
+
"score": 0.8222222222222222,
|
264 |
+
"score_ci_high": 0.8888888888888888,
|
265 |
+
"score_ci_low": 0.7333333333333333,
|
266 |
+
"num_of_instances": 90
|
267 |
+
},
|
268 |
+
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.6777777777777778,
|
270 |
+
"accuracy_ci_low": 0.5777777777777777,
|
271 |
+
"accuracy_ci_high": 0.7666666666666667,
|
272 |
+
"score_name": "accuracy",
|
273 |
+
"score": 0.6777777777777778,
|
274 |
+
"score_ci_high": 0.7666666666666667,
|
275 |
+
"score_ci_low": 0.5777777777777777,
|
276 |
+
"num_of_instances": 90
|
277 |
+
},
|
278 |
+
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.8333333333333334,
|
280 |
+
"accuracy_ci_low": 0.7555555555555555,
|
281 |
+
"accuracy_ci_high": 0.9077323275921318,
|
282 |
+
"score_name": "accuracy",
|
283 |
+
"score": 0.8333333333333334,
|
284 |
+
"score_ci_high": 0.9077323275921318,
|
285 |
+
"score_ci_low": 0.7555555555555555,
|
286 |
+
"num_of_instances": 90
|
287 |
+
},
|
288 |
+
"score": 0.797979797979798,
|
289 |
+
"score_name": "subsets_mean",
|
290 |
+
"num_of_instances": 990
|
291 |
+
},
|
292 |
+
"chatbot_abilities": {
|
293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
+
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.07200720072007201,
|
296 |
+
"score": 0.07200720072007201,
|
297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
+
},
|
299 |
+
"score": 0.07200720072007201,
|
300 |
+
"score_name": "subsets_mean",
|
301 |
+
"num_of_instances": 500
|
302 |
+
},
|
303 |
+
"entity_extraction": {
|
304 |
+
"universal_ner_en_ewt": {
|
305 |
+
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.5089820359281437,
|
307 |
+
"f1_Organization": 0.3546511627906977,
|
308 |
+
"f1_Location": 0.3474903474903475,
|
309 |
+
"f1_macro": 0.4037078487363963,
|
310 |
+
"recall_macro": 0.3583554354766996,
|
311 |
+
"precision_macro": 0.4822578777124232,
|
312 |
+
"in_classes_support": 0.5928057553956835,
|
313 |
+
"f1_micro": 0.31311475409836065,
|
314 |
+
"recall_micro": 0.3638095238095238,
|
315 |
+
"precision_micro": 0.27482014388489207,
|
316 |
+
"score": 0.31311475409836065,
|
317 |
+
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.2581926050371807,
|
319 |
+
"score_ci_high": 0.35574639217016485,
|
320 |
+
"f1_micro_ci_low": 0.2581926050371807,
|
321 |
+
"f1_micro_ci_high": 0.35574639217016485
|
322 |
+
},
|
323 |
+
"score": 0.31311475409836065,
|
324 |
+
"score_name": "subsets_mean",
|
325 |
+
"num_of_instances": 1000
|
326 |
+
},
|
327 |
+
"knowledge": {
|
328 |
+
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.5633802816901409,
|
330 |
+
"accuracy_ci_low": 0.4647887323943662,
|
331 |
+
"accuracy_ci_high": 0.676056338028169,
|
332 |
+
"score_name": "accuracy",
|
333 |
+
"score": 0.5633802816901409,
|
334 |
+
"score_ci_high": 0.676056338028169,
|
335 |
+
"score_ci_low": 0.4647887323943662,
|
336 |
+
"num_of_instances": 71
|
337 |
+
},
|
338 |
+
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.2535211267605634,
|
340 |
+
"accuracy_ci_low": 0.15492957746478872,
|
341 |
+
"accuracy_ci_high": 0.36619718309859156,
|
342 |
+
"score_name": "accuracy",
|
343 |
+
"score": 0.2535211267605634,
|
344 |
+
"score_ci_high": 0.36619718309859156,
|
345 |
+
"score_ci_low": 0.15492957746478872,
|
346 |
+
"num_of_instances": 71
|
347 |
+
},
|
348 |
+
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.23943661971830985,
|
350 |
+
"accuracy_ci_low": 0.15492957746478872,
|
351 |
+
"accuracy_ci_high": 0.36619718309859156,
|
352 |
+
"score_name": "accuracy",
|
353 |
+
"score": 0.23943661971830985,
|
354 |
+
"score_ci_high": 0.36619718309859156,
|
355 |
+
"score_ci_low": 0.15492957746478872,
|
356 |
+
"num_of_instances": 71
|
357 |
+
},
|
358 |
+
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.352112676056338,
|
360 |
+
"accuracy_ci_low": 0.2535211267605634,
|
361 |
+
"accuracy_ci_high": 0.47475562822206696,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.352112676056338,
|
364 |
+
"score_ci_high": 0.47475562822206696,
|
365 |
+
"score_ci_low": 0.2535211267605634,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.4647887323943662,
|
370 |
+
"accuracy_ci_low": 0.3380281690140845,
|
371 |
+
"accuracy_ci_high": 0.5774647887323944,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.4647887323943662,
|
374 |
+
"score_ci_high": 0.5774647887323944,
|
375 |
+
"score_ci_low": 0.3380281690140845,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.2535211267605634,
|
380 |
+
"accuracy_ci_low": 0.15492957746478872,
|
381 |
+
"accuracy_ci_high": 0.36619718309859156,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.2535211267605634,
|
384 |
+
"score_ci_high": 0.36619718309859156,
|
385 |
+
"score_ci_low": 0.15492957746478872,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.352112676056338,
|
390 |
+
"accuracy_ci_low": 0.23943661971830985,
|
391 |
+
"accuracy_ci_high": 0.4647887323943662,
|
392 |
+
"score_name": "accuracy",
|
393 |
+
"score": 0.352112676056338,
|
394 |
+
"score_ci_high": 0.4647887323943662,
|
395 |
+
"score_ci_low": 0.23943661971830985,
|
396 |
+
"num_of_instances": 71
|
397 |
+
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.43661971830985913,
|
400 |
+
"accuracy_ci_low": 0.323943661971831,
|
401 |
+
"accuracy_ci_high": 0.5633802816901409,
|
402 |
+
"score_name": "accuracy",
|
403 |
+
"score": 0.43661971830985913,
|
404 |
+
"score_ci_high": 0.5633802816901409,
|
405 |
+
"score_ci_low": 0.323943661971831,
|
406 |
+
"num_of_instances": 71
|
407 |
+
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.30985915492957744,
|
410 |
+
"accuracy_ci_low": 0.2112676056338028,
|
411 |
+
"accuracy_ci_high": 0.4225352112676056,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.30985915492957744,
|
414 |
+
"score_ci_high": 0.4225352112676056,
|
415 |
+
"score_ci_low": 0.2112676056338028,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.11267605633802817,
|
420 |
+
"accuracy_ci_low": 0.056338028169014086,
|
421 |
+
"accuracy_ci_high": 0.19718309859154928,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.11267605633802817,
|
424 |
+
"score_ci_high": 0.19718309859154928,
|
425 |
+
"score_ci_low": 0.056338028169014086,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
+
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.323943661971831,
|
430 |
+
"accuracy_ci_low": 0.22193333267792992,
|
431 |
+
"accuracy_ci_high": 0.43661971830985913,
|
432 |
+
"score_name": "accuracy",
|
433 |
+
"score": 0.323943661971831,
|
434 |
+
"score_ci_high": 0.43661971830985913,
|
435 |
+
"score_ci_low": 0.22193333267792992,
|
436 |
+
"num_of_instances": 71
|
437 |
+
},
|
438 |
+
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.38028169014084506,
|
440 |
+
"accuracy_ci_low": 0.2535211267605634,
|
441 |
+
"accuracy_ci_high": 0.49295774647887325,
|
442 |
+
"score_name": "accuracy",
|
443 |
+
"score": 0.38028169014084506,
|
444 |
+
"score_ci_high": 0.49295774647887325,
|
445 |
+
"score_ci_low": 0.2535211267605634,
|
446 |
+
"num_of_instances": 71
|
447 |
+
},
|
448 |
+
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.16901408450704225,
|
450 |
+
"accuracy_ci_low": 0.09859154929577464,
|
451 |
+
"accuracy_ci_high": 0.2676056338028169,
|
452 |
+
"score_name": "accuracy",
|
453 |
+
"score": 0.16901408450704225,
|
454 |
+
"score_ci_high": 0.2676056338028169,
|
455 |
+
"score_ci_low": 0.09859154929577464,
|
456 |
+
"num_of_instances": 71
|
457 |
+
},
|
458 |
+
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.5492957746478874,
|
460 |
+
"accuracy_ci_low": 0.43661971830985913,
|
461 |
+
"accuracy_ci_high": 0.6619718309859155,
|
462 |
+
"score_name": "accuracy",
|
463 |
+
"score": 0.5492957746478874,
|
464 |
+
"score_ci_high": 0.6619718309859155,
|
465 |
+
"score_ci_low": 0.43661971830985913,
|
466 |
+
"num_of_instances": 71
|
467 |
+
},
|
468 |
+
"score": 0.34004024144869216,
|
469 |
+
"score_name": "subsets_mean",
|
470 |
+
"num_of_instances": 994
|
471 |
+
},
|
472 |
+
"legal": {
|
473 |
+
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.38313112869412325,
|
475 |
+
"f1_suggestive": 0.4,
|
476 |
+
"f1_descriptive": 0.49056603773584906,
|
477 |
+
"f1_generic": 0.1111111111111111,
|
478 |
+
"f1_fanciful": 0.5806451612903226,
|
479 |
+
"f1_arbitrary": 0.3333333333333333,
|
480 |
+
"f1_macro_ci_low": 0.293497629073193,
|
481 |
+
"f1_macro_ci_high": 0.49184198170551063,
|
482 |
+
"score_name": "f1_micro",
|
483 |
+
"score": 0.41420118343195267,
|
484 |
+
"score_ci_high": 0.5176470588235295,
|
485 |
+
"score_ci_low": 0.3058823529411765,
|
486 |
+
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.4117647058823529,
|
488 |
+
"accuracy_ci_low": 0.3058823529411765,
|
489 |
+
"accuracy_ci_high": 0.5176470588235295,
|
490 |
+
"f1_micro": 0.41420118343195267,
|
491 |
+
"f1_micro_ci_low": 0.3058823529411765,
|
492 |
+
"f1_micro_ci_high": 0.5176470588235295
|
493 |
+
},
|
494 |
+
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.5893994540491356,
|
496 |
+
"f1_no": 0.821656050955414,
|
497 |
+
"f1_yes": 0.35714285714285715,
|
498 |
+
"f1_macro_ci_low": 0.5097301675555063,
|
499 |
+
"f1_macro_ci_high": 0.6745862952621396,
|
500 |
+
"score_name": "f1_micro",
|
501 |
+
"score": 0.7236180904522613,
|
502 |
+
"score_ci_high": 0.7788944723618091,
|
503 |
+
"score_ci_low": 0.6595134689262127,
|
504 |
+
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.72,
|
506 |
+
"accuracy_ci_low": 0.655,
|
507 |
+
"accuracy_ci_high": 0.775,
|
508 |
+
"f1_micro": 0.7236180904522613,
|
509 |
+
"f1_micro_ci_low": 0.6595134689262127,
|
510 |
+
"f1_micro_ci_high": 0.7788944723618091
|
511 |
+
},
|
512 |
+
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.27144552754252615,
|
514 |
+
"f1_conclusion": 0.07547169811320754,
|
515 |
+
"f1_issue": 0.3466666666666667,
|
516 |
+
"f1_decree": 0.30303030303030304,
|
517 |
+
"f1_rule": 0.475,
|
518 |
+
"f1_analysis": 0.2608695652173913,
|
519 |
+
"f1_facts": 0.26666666666666666,
|
520 |
+
"f1_procedural history": 0.1724137931034483,
|
521 |
+
"f1_macro_ci_low": 0.2140755773346065,
|
522 |
+
"f1_macro_ci_high": 0.33976570868629163,
|
523 |
+
"score_name": "f1_micro",
|
524 |
+
"score": 0.28717948717948716,
|
525 |
+
"score_ci_high": 0.35384615384615387,
|
526 |
+
"score_ci_low": 0.22363125007282936,
|
527 |
+
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.28,
|
529 |
+
"accuracy_ci_low": 0.215,
|
530 |
+
"accuracy_ci_high": 0.345,
|
531 |
+
"f1_micro": 0.28717948717948716,
|
532 |
+
"f1_micro_ci_low": 0.22363125007282936,
|
533 |
+
"f1_micro_ci_high": 0.35384615384615387
|
534 |
+
},
|
535 |
+
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.4629294755877034,
|
537 |
+
"f1_yes": 0.5714285714285714,
|
538 |
+
"f1_no": 0.35443037974683544,
|
539 |
+
"f1_macro_ci_low": 0.3950714088005718,
|
540 |
+
"f1_macro_ci_high": 0.5297273754379386,
|
541 |
+
"score_name": "f1_micro",
|
542 |
+
"score": 0.48484848484848486,
|
543 |
+
"score_ci_high": 0.5532994923857868,
|
544 |
+
"score_ci_low": 0.4143244965787704,
|
545 |
+
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.48,
|
547 |
+
"accuracy_ci_low": 0.41,
|
548 |
+
"accuracy_ci_high": 0.5461813537103201,
|
549 |
+
"f1_micro": 0.48484848484848486,
|
550 |
+
"f1_micro_ci_low": 0.4143244965787704,
|
551 |
+
"f1_micro_ci_high": 0.5532994923857868
|
552 |
+
},
|
553 |
+
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.8078127879122904,
|
555 |
+
"f1_yes": 0.7761194029850746,
|
556 |
+
"f1_no": 0.8395061728395061,
|
557 |
+
"f1_macro_ci_low": 0.7164632895646129,
|
558 |
+
"f1_macro_ci_high": 0.8689798909122983,
|
559 |
+
"score_name": "f1_micro",
|
560 |
+
"score": 0.8108108108108109,
|
561 |
+
"score_ci_high": 0.8701298701298701,
|
562 |
+
"score_ci_low": 0.7222222222222222,
|
563 |
+
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.7058823529411765,
|
565 |
+
"accuracy_ci_low": 0.6,
|
566 |
+
"accuracy_ci_high": 0.788235294117647,
|
567 |
+
"f1_micro": 0.8108108108108109,
|
568 |
+
"f1_micro_ci_low": 0.7222222222222222,
|
569 |
+
"f1_micro_ci_high": 0.8701298701298701
|
570 |
+
},
|
571 |
+
"score": 0.5441316113445994,
|
572 |
+
"score_name": "subsets_mean",
|
573 |
+
"num_of_instances": 770
|
574 |
+
},
|
575 |
+
"news_classification": {
|
576 |
+
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.4951686908402676,
|
578 |
+
"f1_cars": 0.735632183908046,
|
579 |
+
"f1_pc hardware": 0.4,
|
580 |
+
"f1_windows x": 0.08108108108108109,
|
581 |
+
"f1_computer graphics": 0.42201834862385323,
|
582 |
+
"f1_atheism": 0.2857142857142857,
|
583 |
+
"f1_politics": 0.34210526315789475,
|
584 |
+
"f1_religion": 0.22988505747126436,
|
585 |
+
"f1_medicine": 0.7631578947368421,
|
586 |
+
"f1_christianity": 0.4444444444444444,
|
587 |
+
"f1_microsoft windows": 0.3125,
|
588 |
+
"f1_middle east": 0.43037974683544306,
|
589 |
+
"f1_motorcycles": 0.64,
|
590 |
+
"f1_mac hardware": 0.29333333333333333,
|
591 |
+
"f1_electronics": 0.5128205128205128,
|
592 |
+
"f1_for sale": 0.6904761904761905,
|
593 |
+
"f1_guns": 0.32786885245901637,
|
594 |
+
"f1_space": 0.7446808510638298,
|
595 |
+
"f1_cryptography": 0.5074626865671642,
|
596 |
+
"f1_baseball": 0.8598130841121495,
|
597 |
+
"f1_hockey": 0.88,
|
598 |
+
"f1_macro_ci_low": 0.4680131642390255,
|
599 |
+
"f1_macro_ci_high": 0.5255643836143373,
|
600 |
+
"score_name": "f1_micro",
|
601 |
+
"score": 0.5081081081081081,
|
602 |
+
"score_ci_high": 0.5384296879334298,
|
603 |
+
"score_ci_low": 0.475620048107115,
|
604 |
+
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.47,
|
606 |
+
"accuracy_ci_low": 0.4397222118119949,
|
607 |
+
"accuracy_ci_high": 0.501,
|
608 |
+
"f1_micro": 0.5081081081081081,
|
609 |
+
"f1_micro_ci_low": 0.475620048107115,
|
610 |
+
"f1_micro_ci_high": 0.5384296879334298
|
611 |
+
},
|
612 |
+
"score": 0.5081081081081081,
|
613 |
+
"score_name": "subsets_mean",
|
614 |
+
"num_of_instances": 1000
|
615 |
+
},
|
616 |
+
"product_help": {
|
617 |
+
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.5856752687246415,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.900072939460248,
|
620 |
+
"f1_checking or savings account": 0.6206896551724138,
|
621 |
+
"f1_debt collection": 0.44,
|
622 |
+
"f1_credit card or prepaid card": 0.5985401459854015,
|
623 |
+
"f1_mortgage": 0.7567567567567568,
|
624 |
+
"f1_student loan": 0.8888888888888888,
|
625 |
+
"f1_money transfer or virtual currency or money service": 0.55,
|
626 |
+
"f1_vehicle loan or lease": 0.5161290322580645,
|
627 |
+
"f1_payday loan or title loan or personal loan": 0.0,
|
628 |
+
"f1_macro_ci_low": 0.5384285887156623,
|
629 |
+
"f1_macro_ci_high": 0.6269737752861375,
|
630 |
+
"score_name": "f1_micro",
|
631 |
+
"score": 0.8055987558320373,
|
632 |
+
"score_ci_high": 0.827979274611399,
|
633 |
+
"score_ci_low": 0.7814291760822389,
|
634 |
+
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.777,
|
636 |
+
"accuracy_ci_low": 0.749,
|
637 |
+
"accuracy_ci_high": 0.802,
|
638 |
+
"f1_micro": 0.8055987558320373,
|
639 |
+
"f1_micro_ci_low": 0.7814291760822389,
|
640 |
+
"f1_micro_ci_high": 0.827979274611399
|
641 |
+
},
|
642 |
+
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.6924499645538243,
|
644 |
+
"f1_mortgages and loans": 0.8160919540229885,
|
645 |
+
"f1_credit card": 0.7734806629834254,
|
646 |
+
"f1_retail banking": 0.562962962962963,
|
647 |
+
"f1_debt collection": 0.5959595959595959,
|
648 |
+
"f1_credit reporting": 0.7137546468401487,
|
649 |
+
"f1_macro_ci_low": 0.6512602475695661,
|
650 |
+
"f1_macro_ci_high": 0.7369419146845784,
|
651 |
+
"score_name": "f1_micro",
|
652 |
+
"score": 0.6980146290491118,
|
653 |
+
"score_ci_high": 0.7373210151084457,
|
654 |
+
"score_ci_low": 0.6555323590814196,
|
655 |
+
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.668,
|
657 |
+
"accuracy_ci_low": 0.6247351354699405,
|
658 |
+
"accuracy_ci_high": 0.712,
|
659 |
+
"f1_micro": 0.6980146290491118,
|
660 |
+
"f1_micro_ci_low": 0.6555323590814196,
|
661 |
+
"f1_micro_ci_high": 0.7373210151084457
|
662 |
+
},
|
663 |
+
"score": 0.7518066924405746,
|
664 |
+
"score_name": "subsets_mean",
|
665 |
+
"num_of_instances": 1500
|
666 |
+
},
|
667 |
+
"qa_finance": {
|
668 |
+
"fin_qa": {
|
669 |
+
"num_of_instances": 1000,
|
670 |
+
"execution_accuracy": 0.113,
|
671 |
+
"program_accuracy": 0.135,
|
672 |
+
"score": 0.135,
|
673 |
+
"score_name": "program_accuracy",
|
674 |
+
"execution_accuracy_ci_low": 0.094,
|
675 |
+
"execution_accuracy_ci_high": 0.134,
|
676 |
+
"program_accuracy_ci_low": 0.116,
|
677 |
+
"program_accuracy_ci_high": 0.158,
|
678 |
+
"score_ci_low": 0.116,
|
679 |
+
"score_ci_high": 0.158
|
680 |
+
},
|
681 |
+
"score": 0.135,
|
682 |
+
"score_name": "subsets_mean",
|
683 |
+
"num_of_instances": 1000
|
684 |
+
},
|
685 |
+
"rag_general": {
|
686 |
+
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3279655346823218,
|
688 |
+
"recall": 0.5705879433371356,
|
689 |
+
"f1": 0.35889039539658296,
|
690 |
+
"precision_ci_low": 0.3078086310620982,
|
691 |
+
"precision_ci_high": 0.34963819007576297,
|
692 |
+
"recall_ci_low": 0.554197356666031,
|
693 |
+
"recall_ci_high": 0.5883801367096354,
|
694 |
+
"f1_ci_low": 0.3417642169187518,
|
695 |
+
"f1_ci_high": 0.37901291711555385,
|
696 |
+
"score_name": "f1",
|
697 |
+
"score": 0.35889039539658296,
|
698 |
+
"score_ci_high": 0.37901291711555385,
|
699 |
+
"score_ci_low": 0.3417642169187518,
|
700 |
+
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6124672105411688,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6899554192026456,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5693493613600731,
|
704 |
+
"faithfullness_f1_token_overlap": 0.3464559400877777,
|
705 |
+
"faithfullness_recall_token_overlap": 0.27600428527077514,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5952455002809257,
|
707 |
+
"correctness_f1_token_overlap": 0.35889039539658296,
|
708 |
+
"correctness_recall_token_overlap": 0.5705879433371356,
|
709 |
+
"correctness_precision_token_overlap": 0.3279655346823218
|
710 |
+
},
|
711 |
+
"score": 0.35889039539658296,
|
712 |
+
"score_name": "subsets_mean",
|
713 |
+
"num_of_instances": 600
|
714 |
+
},
|
715 |
+
"reasoning": {
|
716 |
+
"hellaswag": {
|
717 |
+
"accuracy": 0.469,
|
718 |
+
"accuracy_ci_low": 0.438,
|
719 |
+
"accuracy_ci_high": 0.499,
|
720 |
+
"score_name": "accuracy",
|
721 |
+
"score": 0.469,
|
722 |
+
"score_ci_high": 0.499,
|
723 |
+
"score_ci_low": 0.438,
|
724 |
+
"num_of_instances": 1000
|
725 |
+
},
|
726 |
+
"openbook_qa": {
|
727 |
+
"accuracy": 0.824,
|
728 |
+
"accuracy_ci_low": 0.7898204640532378,
|
729 |
+
"accuracy_ci_high": 0.8579264839771418,
|
730 |
+
"score_name": "accuracy",
|
731 |
+
"score": 0.824,
|
732 |
+
"score_ci_high": 0.8579264839771418,
|
733 |
+
"score_ci_low": 0.7898204640532378,
|
734 |
+
"num_of_instances": 500
|
735 |
+
},
|
736 |
+
"score": 0.6465,
|
737 |
+
"score_name": "subsets_mean",
|
738 |
+
"num_of_instances": 1500
|
739 |
+
},
|
740 |
+
"safety": {
|
741 |
+
"attaq_500": {
|
742 |
+
"severity_critical": 0.0,
|
743 |
+
"severity_high": 0.0,
|
744 |
+
"severity_medium": 1.0,
|
745 |
+
"severity_low": 99.0,
|
746 |
+
"category_harmful_info": 0.8298518021409916,
|
747 |
+
"category_deception": 0.8434558517177297,
|
748 |
+
"category_explicit_content": 0.845593557213292,
|
749 |
+
"category_violence": 0.8258583758558546,
|
750 |
+
"category_discrimination": 0.8266353697057754,
|
751 |
+
"category_substance_abuse": 0.8619899458355373,
|
752 |
+
"category_pii": 0.8507882124847836,
|
753 |
+
"safety": 0.8336919906771638,
|
754 |
+
"safety_ci_low": 0.8111003444322545,
|
755 |
+
"safety_ci_high": 0.8568521539985686,
|
756 |
+
"score_name": "safety",
|
757 |
+
"score": 0.8336919906771638,
|
758 |
+
"score_ci_high": 0.8568521539985686,
|
759 |
+
"score_ci_low": 0.8111003444322545,
|
760 |
+
"num_of_instances": 100
|
761 |
+
},
|
762 |
+
"score": 0.8336919906771638,
|
763 |
+
"score_name": "subsets_mean",
|
764 |
+
"num_of_instances": 100
|
765 |
+
},
|
766 |
+
"summarization": {
|
767 |
+
"billsum_document_filtered_to_6000_chars": {
|
768 |
+
"num_of_instances": 528,
|
769 |
+
"rouge2": 0.21160139852857196,
|
770 |
+
"rougeLsum": 0.36324031100110515,
|
771 |
+
"rouge1": 0.4273940174750716,
|
772 |
+
"rougeL": 0.2985567591141555,
|
773 |
+
"score": 0.2985567591141555,
|
774 |
+
"score_name": "rougeL",
|
775 |
+
"rouge2_ci_low": 0.20478163308145209,
|
776 |
+
"rouge2_ci_high": 0.21935297407718782,
|
777 |
+
"rougeLsum_ci_low": 0.3546878882607467,
|
778 |
+
"rougeLsum_ci_high": 0.37154314876152733,
|
779 |
+
"rouge1_ci_low": 0.4178746520059863,
|
780 |
+
"rouge1_ci_high": 0.43635521340646144,
|
781 |
+
"rougeL_ci_low": 0.2916818506638873,
|
782 |
+
"rougeL_ci_high": 0.3064012355591934,
|
783 |
+
"score_ci_low": 0.2916818506638873,
|
784 |
+
"score_ci_high": 0.3064012355591934
|
785 |
+
},
|
786 |
+
"tldr_document_filtered_to_6000_chars": {
|
787 |
+
"num_of_instances": 1000,
|
788 |
+
"rouge2": 0.015911661871209636,
|
789 |
+
"rougeLsum": 0.0956886215682793,
|
790 |
+
"rouge1": 0.11520528707442619,
|
791 |
+
"rougeL": 0.08350863165548258,
|
792 |
+
"score": 0.08350863165548258,
|
793 |
+
"score_name": "rougeL",
|
794 |
+
"rouge2_ci_low": 0.013985799591312902,
|
795 |
+
"rouge2_ci_high": 0.017727653852883076,
|
796 |
+
"rougeLsum_ci_low": 0.09148358738071459,
|
797 |
+
"rougeLsum_ci_high": 0.10004441271360605,
|
798 |
+
"rouge1_ci_low": 0.10996324785054311,
|
799 |
+
"rouge1_ci_high": 0.1203422582590639,
|
800 |
+
"rougeL_ci_low": 0.07993462762229471,
|
801 |
+
"rougeL_ci_high": 0.0872963198676006,
|
802 |
+
"score_ci_low": 0.07993462762229471,
|
803 |
+
"score_ci_high": 0.0872963198676006
|
804 |
+
},
|
805 |
+
"score": 0.19103269538481904,
|
806 |
+
"score_name": "subsets_mean",
|
807 |
+
"num_of_instances": 1528
|
808 |
+
},
|
809 |
+
"translation": {
|
810 |
+
"mt_flores_101_ara_eng": {
|
811 |
+
"num_of_instances": 66,
|
812 |
+
"counts": [
|
813 |
+
1160,
|
814 |
+
634,
|
815 |
+
402,
|
816 |
+
263
|
817 |
+
],
|
818 |
+
"totals": [
|
819 |
+
3432,
|
820 |
+
3366,
|
821 |
+
3300,
|
822 |
+
3234
|
823 |
+
],
|
824 |
+
"precisions": [
|
825 |
+
0.337995337995338,
|
826 |
+
0.1883541295306001,
|
827 |
+
0.12181818181818181,
|
828 |
+
0.08132343846629561
|
829 |
+
],
|
830 |
+
"bp": 1.0,
|
831 |
+
"sys_len": 3432,
|
832 |
+
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.1584723235237399,
|
834 |
+
"score": 0.1584723235237399,
|
835 |
+
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.11474709135550289,
|
837 |
+
"score_ci_high": 0.20012320167306266,
|
838 |
+
"sacrebleu_ci_low": 0.11474709135550289,
|
839 |
+
"sacrebleu_ci_high": 0.20012320167306266
|
840 |
+
},
|
841 |
+
"mt_flores_101_deu_eng": {
|
842 |
+
"num_of_instances": 66,
|
843 |
+
"counts": [
|
844 |
+
1242,
|
845 |
+
746,
|
846 |
+
497,
|
847 |
+
332
|
848 |
+
],
|
849 |
+
"totals": [
|
850 |
+
3635,
|
851 |
+
3569,
|
852 |
+
3503,
|
853 |
+
3437
|
854 |
+
],
|
855 |
+
"precisions": [
|
856 |
+
0.34167812929848695,
|
857 |
+
0.20902213505183526,
|
858 |
+
0.14187838995147017,
|
859 |
+
0.09659586848996218
|
860 |
+
],
|
861 |
+
"bp": 1.0,
|
862 |
+
"sys_len": 3635,
|
863 |
+
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.17687687871183358,
|
865 |
+
"score": 0.17687687871183358,
|
866 |
+
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.13879938364214875,
|
868 |
+
"score_ci_high": 0.2223271554471336,
|
869 |
+
"sacrebleu_ci_low": 0.13879938364214875,
|
870 |
+
"sacrebleu_ci_high": 0.2223271554471336
|
871 |
+
},
|
872 |
+
"mt_flores_101_eng_ara": {
|
873 |
+
"num_of_instances": 66,
|
874 |
+
"counts": [
|
875 |
+
707,
|
876 |
+
291,
|
877 |
+
137,
|
878 |
+
67
|
879 |
+
],
|
880 |
+
"totals": [
|
881 |
+
2678,
|
882 |
+
2612,
|
883 |
+
2546,
|
884 |
+
2480
|
885 |
+
],
|
886 |
+
"precisions": [
|
887 |
+
0.26400298730395816,
|
888 |
+
0.11140888208269524,
|
889 |
+
0.053809897879025924,
|
890 |
+
0.027016129032258064
|
891 |
+
],
|
892 |
+
"bp": 1.0,
|
893 |
+
"sys_len": 2678,
|
894 |
+
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.08086367724146439,
|
896 |
+
"score": 0.08086367724146439,
|
897 |
+
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.060707255296144236,
|
899 |
+
"score_ci_high": 0.11207981375295485,
|
900 |
+
"sacrebleu_ci_low": 0.060707255296144236,
|
901 |
+
"sacrebleu_ci_high": 0.11207981375295485
|
902 |
+
},
|
903 |
+
"mt_flores_101_eng_deu": {
|
904 |
+
"num_of_instances": 66,
|
905 |
+
"counts": [
|
906 |
+
1105,
|
907 |
+
576,
|
908 |
+
338,
|
909 |
+
205
|
910 |
+
],
|
911 |
+
"totals": [
|
912 |
+
2865,
|
913 |
+
2799,
|
914 |
+
2733,
|
915 |
+
2667
|
916 |
+
],
|
917 |
+
"precisions": [
|
918 |
+
0.3856893542757417,
|
919 |
+
0.2057877813504823,
|
920 |
+
0.12367361873399195,
|
921 |
+
0.07686539182602176
|
922 |
+
],
|
923 |
+
"bp": 1.0,
|
924 |
+
"sys_len": 2865,
|
925 |
+
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.1657357842387588,
|
927 |
+
"score": 0.1657357842387588,
|
928 |
+
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.1255662861475735,
|
930 |
+
"score_ci_high": 0.19513128530364274,
|
931 |
+
"sacrebleu_ci_low": 0.1255662861475735,
|
932 |
+
"sacrebleu_ci_high": 0.19513128530364274
|
933 |
+
},
|
934 |
+
"mt_flores_101_eng_fra": {
|
935 |
+
"num_of_instances": 66,
|
936 |
+
"counts": [
|
937 |
+
1425,
|
938 |
+
950,
|
939 |
+
689,
|
940 |
+
512
|
941 |
+
],
|
942 |
+
"totals": [
|
943 |
+
3952,
|
944 |
+
3886,
|
945 |
+
3820,
|
946 |
+
3754
|
947 |
+
],
|
948 |
+
"precisions": [
|
949 |
+
0.3605769230769231,
|
950 |
+
0.24446731857951623,
|
951 |
+
0.18036649214659686,
|
952 |
+
0.13638785295684602
|
953 |
+
],
|
954 |
+
"bp": 1.0,
|
955 |
+
"sys_len": 3952,
|
956 |
+
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.21579310909975802,
|
958 |
+
"score": 0.21579310909975802,
|
959 |
+
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.17167305584926412,
|
961 |
+
"score_ci_high": 0.2671975553706823,
|
962 |
+
"sacrebleu_ci_low": 0.17167305584926412,
|
963 |
+
"sacrebleu_ci_high": 0.2671975553706823
|
964 |
+
},
|
965 |
+
"mt_flores_101_eng_kor": {
|
966 |
+
"num_of_instances": 66,
|
967 |
+
"counts": [
|
968 |
+
1144,
|
969 |
+
510,
|
970 |
+
256,
|
971 |
+
121
|
972 |
+
],
|
973 |
+
"totals": [
|
974 |
+
4088,
|
975 |
+
4022,
|
976 |
+
3956,
|
977 |
+
3890
|
978 |
+
],
|
979 |
+
"precisions": [
|
980 |
+
0.27984344422700586,
|
981 |
+
0.1268025857782198,
|
982 |
+
0.06471183013144591,
|
983 |
+
0.031105398457583547
|
984 |
+
],
|
985 |
+
"bp": 1.0,
|
986 |
+
"sys_len": 4088,
|
987 |
+
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.09193178117454374,
|
989 |
+
"score": 0.09193178117454374,
|
990 |
+
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.07510753790698527,
|
992 |
+
"score_ci_high": 0.10802722708228213,
|
993 |
+
"sacrebleu_ci_low": 0.07510753790698527,
|
994 |
+
"sacrebleu_ci_high": 0.10802722708228213
|
995 |
+
},
|
996 |
+
"mt_flores_101_eng_por": {
|
997 |
+
"num_of_instances": 66,
|
998 |
+
"counts": [
|
999 |
+
1355,
|
1000 |
+
895,
|
1001 |
+
641,
|
1002 |
+
477
|
1003 |
+
],
|
1004 |
+
"totals": [
|
1005 |
+
3672,
|
1006 |
+
3606,
|
1007 |
+
3540,
|
1008 |
+
3474
|
1009 |
+
],
|
1010 |
+
"precisions": [
|
1011 |
+
0.3690087145969499,
|
1012 |
+
0.24819744869661675,
|
1013 |
+
0.1810734463276836,
|
1014 |
+
0.1373056994818653
|
1015 |
+
],
|
1016 |
+
"bp": 1.0,
|
1017 |
+
"sys_len": 3672,
|
1018 |
+
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.21844611082203133,
|
1020 |
+
"score": 0.21844611082203133,
|
1021 |
+
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.1740922279282215,
|
1023 |
+
"score_ci_high": 0.2747838995139129,
|
1024 |
+
"sacrebleu_ci_low": 0.1740922279282215,
|
1025 |
+
"sacrebleu_ci_high": 0.2747838995139129
|
1026 |
+
},
|
1027 |
+
"mt_flores_101_eng_ron": {
|
1028 |
+
"num_of_instances": 66,
|
1029 |
+
"counts": [
|
1030 |
+
958,
|
1031 |
+
446,
|
1032 |
+
242,
|
1033 |
+
144
|
1034 |
+
],
|
1035 |
+
"totals": [
|
1036 |
+
3143,
|
1037 |
+
3077,
|
1038 |
+
3011,
|
1039 |
+
2945
|
1040 |
+
],
|
1041 |
+
"precisions": [
|
1042 |
+
0.304804327076042,
|
1043 |
+
0.1449463763405915,
|
1044 |
+
1044 +           0.080371969445367,
1045 +           0.048896434634974534
1046 +         ],
1047 +         "bp": 1.0,
1048 +         "sys_len": 3143,
1049 +         "ref_len": 1949,
1050 +         "sacrebleu": 0.11478960818381517,
1051 +         "score": 0.11478960818381517,
1052 +         "score_name": "sacrebleu",
1053 +         "score_ci_low": 0.09008415206585028,
1054 +         "score_ci_high": 0.15651321779474522,
1055 +         "sacrebleu_ci_low": 0.09008415206585028,
1056 +         "sacrebleu_ci_high": 0.15651321779474522
1057 +       },
1058 +       "mt_flores_101_eng_spa": {
1059 +         "num_of_instances": 66,
1060 +         "counts": [
1061 +           1223,
1062 +           632,
1063 +           362,
1064 +           212
1065 +         ],
1066 +         "totals": [
1067 +           3461,
1068 +           3395,
1069 +           3329,
1070 +           3263
1071 +         ],
1072 +         "precisions": [
1073 +           0.35336607916787055,
1074 +           0.1861561119293078,
1075 +           0.10874136377290478,
1076 +           0.06497088568801716
1077 +         ],
1078 +         "bp": 1.0,
1079 +         "sys_len": 3461,
1080 +         "ref_len": 2098,
1081 +         "sacrebleu": 0.14682632542570678,
1082 +         "score": 0.14682632542570678,
1083 +         "score_name": "sacrebleu",
1084 +         "score_ci_low": 0.11971923721446943,
1085 +         "score_ci_high": 0.18346369643073177,
1086 +         "sacrebleu_ci_low": 0.11971923721446943,
1087 +         "sacrebleu_ci_high": 0.18346369643073177
1088 +       },
1089 +       "mt_flores_101_fra_eng": {
1090 +         "num_of_instances": 66,
1091 +         "counts": [
1092 +           1245,
1093 +           754,
1094 +           504,
1095 +           346
1096 +         ],
1097 +         "totals": [
1098 +           3378,
1099 +           3312,
1100 +           3246,
1101 +           3180
1102 +         ],
1103 +         "precisions": [
1104 +           0.3685612788632327,
1105 +           0.2276570048309179,
1106 +           0.15526802218114602,
1107 +           0.10880503144654088
1108 +         ],
1109 +         "bp": 1.0,
1110 +         "sys_len": 3378,
1111 +         "ref_len": 1734,
1112 +         "sacrebleu": 0.19403515904057747,
1113 +         "score": 0.19403515904057747,
1114 +         "score_name": "sacrebleu",
1115 +         "score_ci_low": 0.16072549920746337,
1116 +         "score_ci_high": 0.25567936336289826,
1117 +         "sacrebleu_ci_low": 0.16072549920746337,
1118 +         "sacrebleu_ci_high": 0.25567936336289826
1119 +       },
1120 +       "mt_flores_101_jpn_eng": {
1121 +         "num_of_instances": 66,
1122 +         "counts": [
1123 +           1030,
1124 +           467,
1125 +           241,
1126 +           128
1127 +         ],
1128 +         "totals": [
1129 +           3273,
1130 +           3207,
1131 +           3141,
1132 +           3075
1133 +         ],
1134 +         "precisions": [
1135 +           0.3146959975557592,
1136 +           0.14561895852821952,
1137 +           0.07672715695638332,
1138 +           0.04162601626016261
1139 +         ],
1140 +         "bp": 1.0,
1141 +         "sys_len": 3273,
1142 +         "ref_len": 1734,
1143 +         "sacrebleu": 0.10999065136220543,
1144 +         "score": 0.10999065136220543,
1145 +         "score_name": "sacrebleu",
1146 +         "score_ci_low": 0.08125966435750204,
1147 +         "score_ci_high": 0.15014434309739183,
1148 +         "sacrebleu_ci_low": 0.08125966435750204,
1149 +         "sacrebleu_ci_high": 0.15014434309739183
1150 +       },
1151 +       "mt_flores_101_kor_eng": {
1152 +         "num_of_instances": 66,
1153 +         "counts": [
1154 +           1040,
1155 +           465,
1156 +           241,
1157 +           132
1158 +         ],
1159 +         "totals": [
1160 +           3703,
1161 +           3637,
1162 +           3571,
1163 +           3505
1164 +         ],
1165 +         "precisions": [
1166 +           0.28085336213880635,
1167 +           0.12785262579048667,
1168 +           0.06748809857182862,
1169 +           0.037660485021398
1170 +         ],
1171 +         "bp": 1.0,
1172 +         "sys_len": 3703,
1173 +         "ref_len": 1734,
1174 +         "sacrebleu": 0.09774073377105962,
1175 +         "score": 0.09774073377105962,
1176 +         "score_name": "sacrebleu",
1177 +         "score_ci_low": 0.0819836886276176,
1178 +         "score_ci_high": 0.12072091726722378,
1179 +         "sacrebleu_ci_low": 0.0819836886276176,
1180 +         "sacrebleu_ci_high": 0.12072091726722378
1181 +       },
1182 +       "mt_flores_101_por_eng": {
1183 +         "num_of_instances": 66,
1184 +         "counts": [
1185 +           1278,
1186 +           818,
1187 +           573,
1188 +           412
1189 +         ],
1190 +         "totals": [
1191 +           3168,
1192 +           3102,
1193 +           3036,
1194 +           2970
1195 +         ],
1196 +         "precisions": [
1197 +           0.40340909090909094,
1198 +           0.2637008381689233,
1199 +           0.18873517786561267,
1200 +           0.13872053872053872
1201 +         ],
1202 +         "bp": 1.0,
1203 +         "sys_len": 3168,
1204 +         "ref_len": 1734,
1205 +         "sacrebleu": 0.22972735015632778,
1206 +         "score": 0.22972735015632778,
1207 +         "score_name": "sacrebleu",
1208 +         "score_ci_low": 0.17875705428567942,
1209 +         "score_ci_high": 0.30188244314298573,
1210 +         "sacrebleu_ci_low": 0.17875705428567942,
1211 +         "sacrebleu_ci_high": 0.30188244314298573
1212 +       },
1213 +       "mt_flores_101_ron_eng": {
1214 +         "num_of_instances": 66,
1215 +         "counts": [
1216 +           1283,
1217 +           796,
1218 +           537,
1219 +           365
1220 +         ],
1221 +         "totals": [
1222 +           4400,
1223 +           4334,
1224 +           4268,
1225 +           4202
1226 +         ],
1227 +         "precisions": [
1228 +           0.2915909090909091,
1229 +           0.18366405168435626,
1230 +           0.12582005623242737,
1231 +           0.08686339838172298
1232 +         ],
1233 +         "bp": 1.0,
1234 +         "sys_len": 4400,
1235 +         "ref_len": 1734,
1236 +         "sacrebleu": 0.1555414731878905,
1237 +         "score": 0.1555414731878905,
1238 +         "score_name": "sacrebleu",
1239 +         "score_ci_low": 0.12023011304708842,
1240 +         "score_ci_high": 0.20687673778594803,
1241 +         "sacrebleu_ci_low": 0.12023011304708842,
1242 +         "sacrebleu_ci_high": 0.20687673778594803
1243 +       },
1244 +       "mt_flores_101_spa_eng": {
1245 +         "num_of_instances": 66,
1246 +         "counts": [
1247 +           1155,
1248 +           607,
1249 +           355,
1250 +           220
1251 +         ],
1252 +         "totals": [
1253 +           2824,
1254 +           2758,
1255 +           2692,
1256 +           2626
1257 +         ],
1258 +         "precisions": [
1259 +           0.4089943342776204,
1260 +           0.22008701957940538,
1261 +           0.13187221396731055,
1262 +           0.08377760853008377
1263 +         ],
1264 +         "bp": 1.0,
1265 +         "sys_len": 2824,
1266 +         "ref_len": 1734,
1267 +         "sacrebleu": 0.17758171439939147,
1268 +         "score": 0.17758171439939147,
1269 +         "score_name": "sacrebleu",
1270 +         "score_ci_low": 0.13942620008407763,
1271 +         "score_ci_high": 0.23700138449810748,
1272 +         "sacrebleu_ci_low": 0.13942620008407763,
1273 +         "sacrebleu_ci_high": 0.23700138449810748
1274 +       },
1275 +       "score": 0.15562351202260694,
1276 +       "score_name": "subsets_mean",
1277 +       "num_of_instances": 990
1278 +     },
1279 +     "score": 0.4344559230477983,
1280 +     "score_name": "subsets_mean",
1281 +     "num_of_instances": 12472
1282 +   }
1283 + }
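For anyone sanity-checking these translation records: in each `mt_flores_101_*` entry the stored `sacrebleu` value is consistent with the stored n-gram `precisions` and brevity penalty `bp` (BLEU is the brevity penalty times the geometric mean of the four n-gram precisions, and here it appears to be kept on a 0-1 rather than 0-100 scale). Below is a minimal sketch of that check; the `record` literal simply copies the numbers from the `mt_flores_101_eng_spa` entry above, and the tolerance is an arbitrary choice for illustration.

```python
import math

# Fields hand-copied from the "mt_flores_101_eng_spa" record in the diff above.
record = {
    "precisions": [0.35336607916787055, 0.1861561119293078,
                   0.10874136377290478, 0.06497088568801716],
    "bp": 1.0,
    "sacrebleu": 0.14682632542570678,
}

# BLEU = bp * exp(mean of log n-gram precisions).
# The stored "sacrebleu" field looks to be on a 0-1 scale (assumption based on the values above).
bleu = record["bp"] * math.exp(
    sum(math.log(p) for p in record["precisions"]) / len(record["precisions"])
)

assert abs(bleu - record["sacrebleu"]) < 1e-4
print(f"recomputed BLEU = {bleu:.6f}")
```

The same recomputation applies to every record in this block, since each one stores its own `counts`, `totals`, `precisions`, and `bp`.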
results/bluebench/{2025-06-19T18-10-05_evaluation_results.json → 2025-06-23T05-36-33_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
@@ -42,7 +42,7 @@
@@ -176,564 +176,564 @@
@@ -743,66 +743,66 @@
@@ -810,473 +810,473 @@
1 |
{
|
2 |
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-23T09:36:30.499456Z",
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.6333333333333333,
|
180 |
+
"accuracy_ci_low": 0.5222222222222223,
|
181 |
+
"accuracy_ci_high": 0.7333333333333333,
|
182 |
"score_name": "accuracy",
|
183 |
+
"score": 0.6333333333333333,
|
184 |
+
"score_ci_high": 0.7333333333333333,
|
185 |
+
"score_ci_low": 0.5222222222222223,
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.6888888888888889,
|
190 |
+
"accuracy_ci_low": 0.6,
|
191 |
+
"accuracy_ci_high": 0.7777777777777778,
|
192 |
"score_name": "accuracy",
|
193 |
+
"score": 0.6888888888888889,
|
194 |
+
"score_ci_high": 0.7777777777777778,
|
195 |
+
"score_ci_low": 0.6,
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.8444444444444444,
|
200 |
+
"accuracy_ci_low": 0.7555555555555555,
|
201 |
+
"accuracy_ci_high": 0.9111111111111111,
|
202 |
"score_name": "accuracy",
|
203 |
+
"score": 0.8444444444444444,
|
204 |
+
"score_ci_high": 0.9111111111111111,
|
205 |
+
"score_ci_low": 0.7555555555555555,
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.6777777777777778,
|
210 |
+
"accuracy_ci_low": 0.5777777777777777,
|
211 |
+
"accuracy_ci_high": 0.7555555555555555,
|
212 |
"score_name": "accuracy",
|
213 |
+
"score": 0.6777777777777778,
|
214 |
+
"score_ci_high": 0.7555555555555555,
|
215 |
+
"score_ci_low": 0.5777777777777777,
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.7222222222222222,
|
220 |
+
"accuracy_ci_low": 0.6222222222222222,
|
221 |
+
"accuracy_ci_high": 0.8111111111111111,
|
222 |
"score_name": "accuracy",
|
223 |
+
"score": 0.7222222222222222,
|
224 |
+
"score_ci_high": 0.8111111111111111,
|
225 |
+
"score_ci_low": 0.6222222222222222,
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.8222222222222222,
|
230 |
+
"accuracy_ci_low": 0.7333333333333333,
|
231 |
+
"accuracy_ci_high": 0.8888888888888888,
|
232 |
"score_name": "accuracy",
|
233 |
+
"score": 0.8222222222222222,
|
234 |
+
"score_ci_high": 0.8888888888888888,
|
235 |
+
"score_ci_low": 0.7333333333333333,
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.7444444444444445,
|
240 |
+
"accuracy_ci_low": 0.6444444444444445,
|
241 |
+
"accuracy_ci_high": 0.8222222222222222,
|
242 |
"score_name": "accuracy",
|
243 |
+
"score": 0.7444444444444445,
|
244 |
+
"score_ci_high": 0.8222222222222222,
|
245 |
+
"score_ci_low": 0.6444444444444445,
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.7444444444444445,
|
250 |
+
"accuracy_ci_low": 0.6444444444444445,
|
251 |
+
"accuracy_ci_high": 0.8333333333333334,
|
252 |
"score_name": "accuracy",
|
253 |
+
"score": 0.7444444444444445,
|
254 |
+
"score_ci_high": 0.8333333333333334,
|
255 |
+
"score_ci_low": 0.6444444444444445,
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.7444444444444445,
|
260 |
+
"accuracy_ci_low": 0.6444444444444445,
|
261 |
+
"accuracy_ci_high": 0.8222222222222222,
|
262 |
"score_name": "accuracy",
|
263 |
+
"score": 0.7444444444444445,
|
264 |
+
"score_ci_high": 0.8222222222222222,
|
265 |
+
"score_ci_low": 0.6444444444444445,
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.7777777777777778,
|
270 |
+
"accuracy_ci_low": 0.6790372940698232,
|
271 |
+
"accuracy_ci_high": 0.8555555555555555,
|
272 |
"score_name": "accuracy",
|
273 |
+
"score": 0.7777777777777778,
|
274 |
+
"score_ci_high": 0.8555555555555555,
|
275 |
+
"score_ci_low": 0.6790372940698232,
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.7888888888888889,
|
280 |
+
"accuracy_ci_low": 0.6888888888888889,
|
281 |
+
"accuracy_ci_high": 0.8555555555555555,
|
282 |
"score_name": "accuracy",
|
283 |
+
"score": 0.7888888888888889,
|
284 |
+
"score_ci_high": 0.8555555555555555,
|
285 |
+
"score_ci_low": 0.6888888888888889,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
+
"score": 0.7444444444444445,
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.052083333333333336,
|
296 |
+
"score": 0.052083333333333336,
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
+
"score": 0.052083333333333336,
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.4769647696476965,
|
307 |
+
"f1_Organization": 0.2893890675241158,
|
308 |
+
"f1_Location": 0.30894308943089427,
|
309 |
+
"f1_macro": 0.3584323088675689,
|
310 |
+
"recall_macro": 0.31476418018843305,
|
311 |
+
"precision_macro": 0.4193267050409908,
|
312 |
+
"in_classes_support": 0.7786407766990291,
|
313 |
+
"f1_micro": 0.32884615384615384,
|
314 |
+
"recall_micro": 0.32571428571428573,
|
315 |
+
"precision_micro": 0.3320388349514563,
|
316 |
+
"score": 0.32884615384615384,
|
317 |
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.2827095328153393,
|
319 |
+
"score_ci_high": 0.3761779704134513,
|
320 |
+
"f1_micro_ci_low": 0.2827095328153393,
|
321 |
+
"f1_micro_ci_high": 0.3761779704134513
|
322 |
},
|
323 |
+
"score": 0.32884615384615384,
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.49295774647887325,
|
330 |
+
"accuracy_ci_low": 0.38028169014084506,
|
331 |
+
"accuracy_ci_high": 0.6056338028169014,
|
332 |
"score_name": "accuracy",
|
333 |
+
"score": 0.49295774647887325,
|
334 |
+
"score_ci_high": 0.6056338028169014,
|
335 |
+
"score_ci_low": 0.38028169014084506,
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.323943661971831,
|
340 |
+
"accuracy_ci_low": 0.22535211267605634,
|
341 |
+
"accuracy_ci_high": 0.43661971830985913,
|
342 |
"score_name": "accuracy",
|
343 |
+
"score": 0.323943661971831,
|
344 |
+
"score_ci_high": 0.43661971830985913,
|
345 |
+
"score_ci_low": 0.22535211267605634,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.22535211267605634,
|
350 |
+
"accuracy_ci_low": 0.14084507042253522,
|
351 |
+
"accuracy_ci_high": 0.3380281690140845,
|
352 |
"score_name": "accuracy",
|
353 |
+
"score": 0.22535211267605634,
|
354 |
+
"score_ci_high": 0.3380281690140845,
|
355 |
+
"score_ci_low": 0.14084507042253522,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
"accuracy": 0.29577464788732394,
|
360 |
"accuracy_ci_low": 0.19718309859154928,
|
361 |
+
"accuracy_ci_high": 0.4084507042253521,
|
362 |
"score_name": "accuracy",
|
363 |
"score": 0.29577464788732394,
|
364 |
+
"score_ci_high": 0.4084507042253521,
|
365 |
"score_ci_low": 0.19718309859154928,
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.5070422535211268,
|
370 |
+
"accuracy_ci_low": 0.38028169014084506,
|
371 |
+
"accuracy_ci_high": 0.6197183098591549,
|
372 |
"score_name": "accuracy",
|
373 |
+
"score": 0.5070422535211268,
|
374 |
+
"score_ci_high": 0.6197183098591549,
|
375 |
+
"score_ci_low": 0.38028169014084506,
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.18309859154929578,
|
380 |
+
"accuracy_ci_low": 0.09859154929577464,
|
381 |
+
"accuracy_ci_high": 0.2676056338028169,
|
382 |
"score_name": "accuracy",
|
383 |
+
"score": 0.18309859154929578,
|
384 |
+
"score_ci_high": 0.2676056338028169,
|
385 |
+
"score_ci_low": 0.09859154929577464,
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.323943661971831,
|
390 |
+
"accuracy_ci_low": 0.2112676056338028,
|
391 |
+
"accuracy_ci_high": 0.43661971830985913,
|
392 |
"score_name": "accuracy",
|
393 |
+
"score": 0.323943661971831,
|
394 |
+
"score_ci_high": 0.43661971830985913,
|
395 |
+
"score_ci_low": 0.2112676056338028,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.36619718309859156,
|
400 |
+
"accuracy_ci_low": 0.2535211267605634,
|
401 |
+
"accuracy_ci_high": 0.4788732394366197,
|
402 |
"score_name": "accuracy",
|
403 |
+
"score": 0.36619718309859156,
|
404 |
+
"score_ci_high": 0.4788732394366197,
|
405 |
+
"score_ci_low": 0.2535211267605634,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.28169014084507044,
|
410 |
+
"accuracy_ci_low": 0.18309859154929578,
|
411 |
+
"accuracy_ci_high": 0.39436619718309857,
|
412 |
"score_name": "accuracy",
|
413 |
+
"score": 0.28169014084507044,
|
414 |
+
"score_ci_high": 0.39436619718309857,
|
415 |
+
"score_ci_low": 0.18309859154929578,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.15492957746478872,
|
420 |
+
"accuracy_ci_low": 0.08450704225352113,
|
421 |
+
"accuracy_ci_high": 0.26564872868691924,
|
422 |
"score_name": "accuracy",
|
423 |
+
"score": 0.15492957746478872,
|
424 |
+
"score_ci_high": 0.26564872868691924,
|
425 |
+
"score_ci_low": 0.08450704225352113,
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.28169014084507044,
|
430 |
+
"accuracy_ci_low": 0.18309859154929578,
|
431 |
+
"accuracy_ci_high": 0.39436619718309857,
|
432 |
"score_name": "accuracy",
|
433 |
+
"score": 0.28169014084507044,
|
434 |
+
"score_ci_high": 0.39436619718309857,
|
435 |
+
"score_ci_low": 0.18309859154929578,
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.29577464788732394,
|
440 |
+
"accuracy_ci_low": 0.19718309859154928,
|
441 |
+
"accuracy_ci_high": 0.4084507042253521,
|
442 |
"score_name": "accuracy",
|
443 |
+
"score": 0.29577464788732394,
|
444 |
+
"score_ci_high": 0.4084507042253521,
|
445 |
+
"score_ci_low": 0.19718309859154928,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.16901408450704225,
|
450 |
+
"accuracy_ci_low": 0.09859154929577464,
|
451 |
+
"accuracy_ci_high": 0.2676056338028169,
|
452 |
"score_name": "accuracy",
|
453 |
+
"score": 0.16901408450704225,
|
454 |
+
"score_ci_high": 0.2676056338028169,
|
455 |
+
"score_ci_low": 0.09859154929577464,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.4788732394366197,
|
460 |
+
"accuracy_ci_low": 0.36619718309859156,
|
461 |
+
"accuracy_ci_high": 0.6012345324644585,
|
462 |
"score_name": "accuracy",
|
463 |
+
"score": 0.4788732394366197,
|
464 |
+
"score_ci_high": 0.6012345324644585,
|
465 |
+
"score_ci_low": 0.36619718309859156,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
+
"score": 0.31287726358148893,
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.37558008658008657,
|
475 |
+
"f1_suggestive": 0.24242424242424243,
|
476 |
+
"f1_generic": 0.38095238095238093,
|
477 |
+
"f1_descriptive": 0.4583333333333333,
|
478 |
+
"f1_fanciful": 0.32,
|
479 |
+
"f1_arbitrary": 0.47619047619047616,
|
480 |
+
"f1_macro_ci_low": 0.2831225773147394,
|
481 |
+
"f1_macro_ci_high": 0.5002662505279254,
|
482 |
"score_name": "f1_micro",
|
483 |
+
"score": 0.3905325443786982,
|
484 |
+
"score_ci_high": 0.4970414201183432,
|
485 |
+
"score_ci_low": 0.2850635959228859,
|
486 |
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.38823529411764707,
|
488 |
+
"accuracy_ci_low": 0.2823529411764706,
|
489 |
+
"accuracy_ci_high": 0.49411764705882355,
|
490 |
+
"f1_micro": 0.3905325443786982,
|
491 |
+
"f1_micro_ci_low": 0.2850635959228859,
|
492 |
+
"f1_micro_ci_high": 0.4970414201183432
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.4401501318725908,
|
496 |
+
"f1_no": 0.40860215053763443,
|
497 |
+
"f1_yes": 0.4716981132075472,
|
498 |
+
"f1_macro_ci_low": 0.37497026604570943,
|
499 |
+
"f1_macro_ci_high": 0.5074058340630839,
|
500 |
"score_name": "f1_micro",
|
501 |
+
"score": 0.44221105527638194,
|
502 |
+
"score_ci_high": 0.507537688442211,
|
503 |
+
"score_ci_low": 0.3763531585733561,
|
504 |
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.44,
|
506 |
+
"accuracy_ci_low": 0.375,
|
507 |
+
"accuracy_ci_high": 0.505,
|
508 |
+
"f1_micro": 0.44221105527638194,
|
509 |
+
"f1_micro_ci_low": 0.3763531585733561,
|
510 |
+
"f1_micro_ci_high": 0.507537688442211
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.1916316363119106,
|
514 |
+
"f1_conclusion": 0.15584415584415584,
|
515 |
+
"f1_analysis": 0.3333333333333333,
|
516 |
"f1_decree": 0.07692307692307693,
|
517 |
+
"f1_issue": 0.23076923076923078,
|
518 |
+
"f1_facts": 0.12903225806451613,
|
519 |
+
"f1_procedural history": 0.11764705882352941,
|
520 |
+
"f1_rule": 0.2978723404255319,
|
521 |
+
"f1_macro_ci_low": 0.143695452040772,
|
522 |
+
"f1_macro_ci_high": 0.25550408682657144,
|
523 |
"score_name": "f1_micro",
|
524 |
+
"score": 0.22278481012658227,
|
525 |
+
"score_ci_high": 0.2864321608040201,
|
526 |
+
"score_ci_low": 0.16660296570964608,
|
527 |
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.22,
|
529 |
+
"accuracy_ci_low": 0.165,
|
530 |
+
"accuracy_ci_high": 0.28021087258250593,
|
531 |
+
"f1_micro": 0.22278481012658227,
|
532 |
+
"f1_micro_ci_low": 0.16660296570964608,
|
533 |
+
"f1_micro_ci_high": 0.2864321608040201
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.5259978425026969,
|
537 |
+
"f1_yes": 0.5631067961165048,
|
538 |
+
"f1_no": 0.4888888888888889,
|
539 |
+
"f1_macro_ci_low": 0.4618026481617566,
|
540 |
+
"f1_macro_ci_high": 0.5997495353215635,
|
541 |
"score_name": "f1_micro",
|
542 |
+
"score": 0.5284974093264249,
|
543 |
+
"score_ci_high": 0.5989912778302698,
|
544 |
+
"score_ci_low": 0.46113989637305697,
|
545 |
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.51,
|
547 |
+
"accuracy_ci_low": 0.445,
|
548 |
+
"accuracy_ci_high": 0.58,
|
549 |
+
"f1_micro": 0.5284974093264249,
|
550 |
+
"f1_micro_ci_low": 0.46113989637305697,
|
551 |
+
"f1_micro_ci_high": 0.5989912778302698
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
"f1_macro": 0.726027397260274,
|
555 |
"f1_yes": 0.7123287671232876,
|
556 |
"f1_no": 0.7397260273972602,
|
557 |
+
"f1_macro_ci_low": 0.618628457335439,
|
558 |
+
"f1_macro_ci_high": 0.8122702152748204,
|
559 |
"score_name": "f1_micro",
|
560 |
"score": 0.726027397260274,
|
561 |
+
"score_ci_high": 0.8104575163398693,
|
562 |
+
"score_ci_low": 0.6186406698987806,
|
563 |
"num_of_instances": 85,
|
564 |
"accuracy": 0.6235294117647059,
|
565 |
"accuracy_ci_low": 0.5058823529411764,
|
566 |
"accuracy_ci_high": 0.7176470588235294,
|
567 |
"f1_micro": 0.726027397260274,
|
568 |
+
"f1_micro_ci_low": 0.6186406698987806,
|
569 |
+
"f1_micro_ci_high": 0.8104575163398693
|
570 |
},
|
571 |
+
"score": 0.46201064327367225,
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.3891021150890982,
|
578 |
+
"f1_cars": 0.7346938775510204,
|
579 |
+
"f1_windows x": 0.0,
|
580 |
+
"f1_atheism": 0.425531914893617,
|
581 |
+
"f1_christianity": 0.4444444444444444,
|
582 |
+
"f1_religion": 0.15873015873015872,
|
583 |
"f1_medicine": 0.6376811594202898,
|
584 |
+
"f1_computer graphics": 0.2755102040816326,
|
585 |
+
"f1_microsoft windows": 0.29850746268656714,
|
586 |
+
"f1_middle east": 0.19607843137254902,
|
587 |
+
"f1_politics": 0.3387096774193548,
|
588 |
+
"f1_motorcycles": 0.43902439024390244,
|
589 |
+
"f1_mac hardware": 0.2,
|
590 |
+
"f1_pc hardware": 0.34545454545454546,
|
591 |
+
"f1_for sale": 0.33962264150943394,
|
592 |
+
"f1_guns": 0.26666666666666666,
|
593 |
+
"f1_baseball": 0.7368421052631579,
|
594 |
"f1_space": 0.5194805194805194,
|
595 |
+
"f1_cryptography": 0.4358974358974359,
|
596 |
+
"f1_hockey": 0.5625,
|
597 |
+
"f1_electronics": 0.4266666666666667,
|
598 |
+
"f1_macro_ci_low": 0.35853792537669554,
|
599 |
+
"f1_macro_ci_high": 0.4194279498018566,
|
600 |
"score_name": "f1_micro",
|
601 |
+
"score": 0.4063792085056113,
|
602 |
+
"score_ci_high": 0.4367348562601148,
|
603 |
+
"score_ci_low": 0.3741790131164338,
|
604 |
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.344,
|
606 |
+
"accuracy_ci_low": 0.314,
|
607 |
+
"accuracy_ci_high": 0.371,
|
608 |
+
"f1_micro": 0.4063792085056113,
|
609 |
+
"f1_micro_ci_low": 0.3741790131164338,
|
610 |
+
"f1_micro_ci_high": 0.4367348562601148
|
611 |
},
|
612 |
+
"score": 0.4063792085056113,
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.5522255415970777,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9099656357388316,
|
620 |
+
"f1_checking or savings account": 0.5301204819277109,
|
621 |
+
"f1_debt collection": 0.3576158940397351,
|
622 |
+
"f1_credit card or prepaid card": 0.37777777777777777,
|
623 |
+
"f1_mortgage": 0.7017543859649122,
|
|
|
|
|
|
|
624 |
"f1_student loan": 0.75,
|
625 |
+
"f1_money transfer or virtual currency or money service": 0.6666666666666666,
|
626 |
+
"f1_vehicle loan or lease": 0.5161290322580645,
|
627 |
+
"f1_payday loan or title loan or personal loan": 0.16,
|
628 |
+
"f1_macro_ci_low": 0.5008792423568225,
|
629 |
+
"f1_macro_ci_high": 0.6059191922507057,
|
630 |
"score_name": "f1_micro",
|
631 |
+
"score": 0.7975522692503825,
|
632 |
+
"score_ci_high": 0.8230092874186598,
|
633 |
+
"score_ci_low": 0.7734015345268542,
|
634 |
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.782,
|
636 |
+
"accuracy_ci_low": 0.758,
|
637 |
+
"accuracy_ci_high": 0.808771349424543,
|
638 |
+
"f1_micro": 0.7975522692503825,
|
639 |
+
"f1_micro_ci_low": 0.7734015345268542,
|
640 |
+
"f1_micro_ci_high": 0.8230092874186598
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.5976480822045338,
|
644 |
+
"f1_mortgages and loans": 0.7428571428571429,
|
645 |
+
"f1_credit card": 0.6767676767676768,
|
646 |
+
"f1_debt collection": 0.5729166666666666,
|
647 |
+
"f1_retail banking": 0.26666666666666666,
|
648 |
+
"f1_credit reporting": 0.7290322580645161,
|
649 |
+
"f1_macro_ci_low": 0.5536640329621239,
|
650 |
+
"f1_macro_ci_high": 0.6426754598088634,
|
651 |
"score_name": "f1_micro",
|
652 |
+
"score": 0.6408163265306123,
|
653 |
+
"score_ci_high": 0.683589397051309,
|
654 |
+
"score_ci_low": 0.5968250791908158,
|
655 |
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.628,
|
657 |
+
"accuracy_ci_low": 0.584,
|
658 |
+
"accuracy_ci_high": 0.67,
|
659 |
+
"f1_micro": 0.6408163265306123,
|
660 |
+
"f1_micro_ci_low": 0.5968250791908158,
|
661 |
+
"f1_micro_ci_high": 0.683589397051309
|
662 |
},
|
663 |
+
"score": 0.7191842978904974,
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.046,
|
671 |
+
"score": 0.046,
|
672 |
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.04,
|
674 |
+
"program_accuracy_ci_low": 0.035,
|
675 |
+
"program_accuracy_ci_high": 0.06,
|
676 |
+
"score_ci_low": 0.035,
|
677 |
+
"score_ci_high": 0.06,
|
678 |
+
"execution_accuracy_ci_low": 0.029,
|
679 |
+
"execution_accuracy_ci_high": 0.053
|
680 |
},
|
681 |
+
"score": 0.046,
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3263731921396224,
|
688 |
+
"recall": 0.5005136281930357,
|
689 |
+
"f1": 0.3238306562243135,
|
690 |
+
"precision_ci_low": 0.3062305395351377,
|
691 |
+
"precision_ci_high": 0.3471267358926223,
|
692 |
+
"recall_ci_low": 0.48406708194912995,
|
693 |
+
"recall_ci_high": 0.5170051754306556,
|
694 |
+
"f1_ci_low": 0.30817001709054165,
|
695 |
+
"f1_ci_high": 0.33859378817920405,
|
696 |
"score_name": "f1",
|
697 |
+
"score": 0.3238306562243135,
|
698 |
+
"score_ci_high": 0.33859378817920405,
|
699 |
+
"score_ci_low": 0.30817001709054165,
|
700 |
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.5865038787325223,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6549727993210157,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5550703954448303,
|
704 |
+
"faithfullness_f1_token_overlap": 0.26017056639729325,
|
705 |
+
"faithfullness_recall_token_overlap": 0.19094108989263792,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5662514000838377,
|
707 |
+
"correctness_f1_token_overlap": 0.3238306562243135,
|
708 |
+
"correctness_recall_token_overlap": 0.5005136281930357,
|
709 |
+
"correctness_precision_token_overlap": 0.3263731921396224
|
710 |
},
|
711 |
+
"score": 0.3238306562243135,
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
+
"accuracy": 0.425,
|
718 |
+
"accuracy_ci_low": 0.394,
|
719 |
+
"accuracy_ci_high": 0.4557921663139014,
|
720 |
"score_name": "accuracy",
|
721 |
+
"score": 0.425,
|
722 |
+
"score_ci_high": 0.4557921663139014,
|
723 |
+
"score_ci_low": 0.394,
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
+
"accuracy": 0.736,
|
728 |
+
"accuracy_ci_low": 0.7,
|
729 |
+
"accuracy_ci_high": 0.776,
|
730 |
"score_name": "accuracy",
|
731 |
+
"score": 0.736,
|
732 |
+
"score_ci_high": 0.776,
|
733 |
+
"score_ci_low": 0.7,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
+
"score": 0.5805,
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
|
|
743 |
"severity_high": 0.0,
|
744 |
"severity_medium": 0.0,
|
745 |
"severity_low": 100.0,
|
746 |
+
"category_harmful_info": 0.8295307880589028,
|
747 |
+
"category_deception": 0.8032731126354199,
|
748 |
+
"category_explicit_content": 0.8484950255263936,
|
749 |
+
"category_violence": 0.8168619035846657,
|
750 |
+
"category_discrimination": 0.8478557457999577,
|
751 |
+
"category_substance_abuse": 0.8705138153500027,
|
752 |
+
"category_pii": 0.829884376514841,
|
753 |
+
"safety": 0.8276369063778677,
|
754 |
+
"safety_ci_low": 0.8109914147202264,
|
755 |
+
"safety_ci_high": 0.8430729937190028,
|
756 |
"score_name": "safety",
|
757 |
+
"score": 0.8276369063778677,
|
758 |
+
"score_ci_high": 0.8430729937190028,
|
759 |
+
"score_ci_low": 0.8109914147202264,
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
+
"score": 0.8276369063778677,
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
+
"rougeL": 0.284383588545129,
|
770 |
+
"score": 0.284383588545129,
|
|
|
|
|
771 |
"score_name": "rougeL",
|
772 |
+
"rouge2": 0.20245431329782115,
|
773 |
+
"rouge1": 0.41045505876440336,
|
774 |
+
"rougeLsum": 0.3502025548591709,
|
775 |
+
"rougeL_ci_low": 0.2769343751140693,
|
776 |
+
"rougeL_ci_high": 0.29127408884195716,
|
777 |
+
"score_ci_low": 0.2769343751140693,
|
778 |
+
"score_ci_high": 0.29127408884195716,
|
779 |
+
"rouge2_ci_low": 0.1950015427241588,
|
780 |
+
"rouge2_ci_high": 0.210011723499623,
|
781 |
+
"rouge1_ci_low": 0.4008962815661577,
|
782 |
+
"rouge1_ci_high": 0.41982792488499465,
|
783 |
+
"rougeLsum_ci_low": 0.3416385040140321,
|
784 |
+
"rougeLsum_ci_high": 0.3588894507334038
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
+
"rougeL": 0.0903457635776036,
|
789 |
+
"score": 0.0903457635776036,
|
|
|
|
|
790 |
"score_name": "rougeL",
|
791 |
+
"rouge2": 0.018003187802161934,
|
792 |
+
"rouge1": 0.12438028754478446,
|
793 |
+
"rougeLsum": 0.10277785443605283,
|
794 |
+
"rougeL_ci_low": 0.08651302258172923,
|
795 |
+
"rougeL_ci_high": 0.09388371145028165,
|
796 |
+
"score_ci_low": 0.08651302258172923,
|
797 |
+
"score_ci_high": 0.09388371145028165,
|
798 |
+
"rouge2_ci_low": 0.016237543973207882,
|
799 |
+
"rouge2_ci_high": 0.01999767687426406,
|
800 |
+
"rouge1_ci_low": 0.11882142858616347,
|
801 |
+
"rouge1_ci_high": 0.12948246473507513,
|
802 |
+
"rougeLsum_ci_low": 0.0979381342979595,
|
803 |
+
"rougeLsum_ci_high": 0.10690023059691123
|
804 |
},
|
805 |
+
"score": 0.1873646760613663,
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
+
1135,
|
814 |
+
646,
|
815 |
+
410,
|
816 |
+
275
|
817 |
],
|
818 |
"totals": [
|
819 |
+
1820,
|
820 |
+
1754,
|
821 |
+
1688,
|
822 |
+
1622
|
823 |
],
|
824 |
"precisions": [
|
825 |
+
0.6236263736263736,
|
826 |
+
0.36830102622576966,
|
827 |
+
0.24289099526066352,
|
828 |
+
0.16954377311960542
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
+
"sys_len": 1820,
|
832 |
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.31185676193781753,
|
834 |
+
"score": 0.31185676193781753,
|
835 |
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.26078843081913794,
|
837 |
+
"score_ci_high": 0.35262811190937277,
|
838 |
+
"sacrebleu_ci_low": 0.26078843081913794,
|
839 |
+
"sacrebleu_ci_high": 0.35262811190937277
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
+
1238,
|
845 |
+
750,
|
846 |
+
499,
|
847 |
+
339
|
848 |
],
|
849 |
"totals": [
|
850 |
+
1796,
|
851 |
+
1730,
|
852 |
+
1664,
|
853 |
+
1598
|
854 |
],
|
855 |
"precisions": [
|
856 |
+
0.6893095768374166,
|
857 |
+
0.4335260115606936,
|
858 |
+
0.2998798076923077,
|
859 |
+
0.21214017521902379
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
+
"sys_len": 1796,
|
863 |
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.3713213364431593,
|
865 |
+
"score": 0.3713213364431593,
|
866 |
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.3292670063116335,
|
868 |
+
"score_ci_high": 0.4181859347073083,
|
869 |
+
"sacrebleu_ci_low": 0.3292670063116335,
|
870 |
+
"sacrebleu_ci_high": 0.4181859347073083
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
613,
|
876 |
+
190,
|
877 |
+
79,
|
878 |
+
26
|
879 |
],
|
880 |
"totals": [
|
881 |
+
1656,
|
882 |
+
1590,
|
883 |
+
1524,
|
884 |
+
1458
|
885 |
],
|
886 |
"precisions": [
|
887 |
+
0.3701690821256039,
|
888 |
+
0.11949685534591195,
|
889 |
+
0.05183727034120735,
|
890 |
+
0.01783264746227709
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
+
"sys_len": 1656,
|
894 |
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.07996568130005909,
|
896 |
+
"score": 0.07996568130005909,
|
897 |
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.06042930465467444,
|
899 |
+
"score_ci_high": 0.09652810994564934,
|
900 |
+
"sacrebleu_ci_low": 0.06042930465467444,
|
901 |
+
"sacrebleu_ci_high": 0.09652810994564934
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
+
1029,
|
907 |
+
509,
|
908 |
+
282,
|
909 |
+
168
|
910 |
],
|
911 |
"totals": [
|
912 |
+
1810,
|
913 |
+
1744,
|
914 |
+
1678,
|
915 |
+
1612
|
916 |
],
|
917 |
"precisions": [
|
918 |
+
0.5685082872928177,
|
919 |
+
0.2918577981651376,
|
920 |
+
0.16805721096543505,
|
921 |
+
0.10421836228287841
|
922 |
],
|
923 |
+
"bp": 0.9862827954544454,
|
924 |
+
"sys_len": 1810,
|
925 |
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.22899649328289487,
|
927 |
+
"score": 0.22899649328289487,
|
928 |
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.19262787967326042,
|
930 |
+
"score_ci_high": 0.280077803172394,
|
931 |
+
"sacrebleu_ci_low": 0.19262787967326042,
|
932 |
+
"sacrebleu_ci_high": 0.280077803172394
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
+
1359,
|
938 |
+
902,
|
939 |
+
653,
|
940 |
+
485
|
941 |
],
|
942 |
"totals": [
|
943 |
+
1997,
|
944 |
+
1931,
|
945 |
+
1865,
|
946 |
+
1799
|
947 |
],
|
948 |
"precisions": [
|
949 |
+
0.6805207811717576,
|
950 |
+
0.4671154842050751,
|
951 |
+
0.3501340482573727,
|
952 |
+
0.2695942190105614
|
953 |
],
|
954 |
+
"bp": 0.9650712656118398,
|
955 |
+
"sys_len": 1997,
|
956 |
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.40166318618755137,
|
958 |
+
"score": 0.40166318618755137,
|
959 |
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.3617396828186495,
|
961 |
+
"score_ci_high": 0.4488753587822201,
|
962 |
+
"sacrebleu_ci_low": 0.3617396828186495,
|
963 |
+
"sacrebleu_ci_high": 0.4488753587822201
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
+
1121,
|
969 |
512,
|
970 |
+
259,
|
971 |
+
142
|
972 |
],
|
973 |
"totals": [
|
974 |
+
2523,
|
975 |
+
2457,
|
976 |
+
2391,
|
977 |
+
2325
|
978 |
],
|
979 |
"precisions": [
|
980 |
+
0.44431232659532305,
|
981 |
+
0.20838420838420837,
|
982 |
+
0.1083228774571309,
|
983 |
+
0.0610752688172043
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
+
"sys_len": 2523,
|
987 |
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.1573202708440978,
|
989 |
+
"score": 0.1573202708440978,
|
990 |
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.1330679575324248,
|
992 |
+
"score_ci_high": 0.17785519831459645,
|
993 |
+
"sacrebleu_ci_low": 0.1330679575324248,
|
994 |
+
"sacrebleu_ci_high": 0.17785519831459645
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
+
1319,
|
1000 |
+
872,
|
1001 |
+
612,
|
1002 |
+
444
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
+
1868,
|
1006 |
+
1802,
|
1007 |
+
1736,
|
1008 |
+
1670
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
+
0.7061027837259101,
|
1012 |
+
0.4839067702552719,
|
1013 |
+
0.35253456221198154,
|
1014 |
+
0.2658682634730539
|
1015 |
],
|
1016 |
+
"bp": 0.974631399286791,
|
1017 |
+
"sys_len": 1868,
|
1018 |
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.41230144255258333,
|
1020 |
+
"score": 0.41230144255258333,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.3754985997111881,
|
1023 |
+
"score_ci_high": 0.4595286683207052,
|
1024 |
+
"sacrebleu_ci_low": 0.3754985997111881,
|
1025 |
+
"sacrebleu_ci_high": 0.4595286683207052
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
1192,
|
1031 |
710,
|
1032 |
+
457,
|
1033 |
+
302
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
+
1928,
|
1037 |
+
1862,
|
1038 |
+
1796,
|
1039 |
+
1730
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
+
0.6182572614107884,
|
1043 |
+
0.38131041890440387,
|
1044 |
+
0.2544543429844098,
|
1045 |
+
0.1745664739884393
|
1046 |
],
|
1047 |
+
"bp": 0.9891669881299116,
|
1048 |
+
"sys_len": 1928,
|
1049 |
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.31642753307552074,
|
1051 |
+
"score": 0.31642753307552074,
|
1052 |
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.2854187560212565,
|
1054 |
+
"score_ci_high": 0.3701086569937762,
|
1055 |
+
"sacrebleu_ci_low": 0.2854187560212565,
|
1056 |
+
"sacrebleu_ci_high": 0.3701086569937762
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
+
1208,
|
1062 |
+
659,
|
1063 |
+
389,
|
1064 |
+
236
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
+
1983,
|
1068 |
+
1917,
|
1069 |
+
1851,
|
1070 |
+
1785
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
+
0.6091780131114473,
|
1074 |
+
0.34376630151278037,
|
1075 |
+
0.2101566720691518,
|
1076 |
+
0.13221288515406163
|
1077 |
],
|
1078 |
+
"bp": 0.9436566096384625,
|
1079 |
+
"sys_len": 1983,
|
1080 |
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.26062742180685816,
|
1082 |
+
"score": 0.26062742180685816,
|
1083 |
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.22914767064053682,
|
1085 |
+
"score_ci_high": 0.2844965463617,
|
1086 |
+
"sacrebleu_ci_low": 0.22914767064053682,
|
1087 |
+
"sacrebleu_ci_high": 0.2844965463617
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
+
1276,
|
1093 |
807,
|
1094 |
+
545,
|
1095 |
+
375
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
+
1818,
|
1099 |
+
1752,
|
1100 |
+
1686,
|
1101 |
+
1620
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
+
0.7018701870187019,
|
1105 |
+
0.4606164383561644,
|
1106 |
+
0.3232502965599051,
|
1107 |
+
0.23148148148148148
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
+
"sys_len": 1818,
|
1111 |
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.39437815723424946,
|
1113 |
+
"score": 0.39437815723424946,
|
1114 |
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.3589675802199702,
|
1116 |
+
"score_ci_high": 0.4463980849713465,
|
1117 |
+
"sacrebleu_ci_low": 0.3589675802199702,
|
1118 |
+
"sacrebleu_ci_high": 0.4463980849713465
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
+
1027,
|
1124 |
+
468,
|
1125 |
+
248,
|
1126 |
+
130
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
+
1824,
|
1130 |
+
1758,
|
1131 |
+
1692,
|
1132 |
+
1626
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
+
0.5630482456140351,
|
1136 |
+
0.26621160409556316,
|
1137 |
+
0.14657210401891255,
|
1138 |
+
0.07995079950799508
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
+
"sys_len": 1824,
|
1142 |
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.20472066389963584,
|
1144 |
+
"score": 0.20472066389963584,
|
1145 |
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.1789860929189351,
|
1147 |
+
"score_ci_high": 0.24365182830296692,
|
1148 |
+
"sacrebleu_ci_low": 0.1789860929189351,
|
1149 |
+
"sacrebleu_ci_high": 0.24365182830296692
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
+
963,
|
1155 |
+
428,
|
1156 |
+
229,
|
1157 |
+
133
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
+
1783,
|
1161 |
+
1717,
|
1162 |
+
1651,
|
1163 |
+
1585
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
+
0.5401009534492429,
|
1167 |
+
0.24927198602213163,
|
1168 |
+
0.1387038158691702,
|
1169 |
+
0.08391167192429022
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
+
"sys_len": 1783,
|
1173 |
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.19895955473357632,
|
1175 |
+
"score": 0.19895955473357632,
|
1176 |
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.1719456080924031,
|
1178 |
+
"score_ci_high": 0.2466087231408179,
|
1179 |
+
"sacrebleu_ci_low": 0.1719456080924031,
|
1180 |
+
"sacrebleu_ci_high": 0.2466087231408179
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
+
1283,
|
1186 |
+
836,
|
1187 |
+
589,
|
1188 |
+
428
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
+
1803,
|
1192 |
+
1737,
|
1193 |
+
1671,
|
1194 |
+
1605
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
+
0.7115917914586799,
|
1198 |
+
0.48128957973517555,
|
1199 |
+
0.35248354278874927,
|
1200 |
+
0.26666666666666666
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
+
"sys_len": 1803,
|
1204 |
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.4235807758108321,
|
1206 |
+
"score": 0.4235807758108321,
|
1207 |
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.37601558928179885,
|
1209 |
+
"score_ci_high": 0.47290261446153176,
|
1210 |
+
"sacrebleu_ci_low": 0.37601558928179885,
|
1211 |
+
"sacrebleu_ci_high": 0.47290261446153176
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
+
1297,
|
1217 |
+
833,
|
1218 |
+
566,
|
1219 |
+
384
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
+
1841,
|
1223 |
+
1775,
|
1224 |
+
1709,
|
1225 |
+
1643
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
+
0.7045084193373167,
|
1229 |
+
0.46929577464788735,
|
1230 |
+
0.3311878291398479,
|
1231 |
+
0.23371880706025563
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
+
"sys_len": 1841,
|
1235 |
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.3999679713298994,
|
1237 |
+
"score": 0.3999679713298994,
|
1238 |
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.3551473917196145,
|
1240 |
+
"score_ci_high": 0.4324084862016873,
|
1241 |
+
"sacrebleu_ci_low": 0.3551473917196145,
|
1242 |
+
"sacrebleu_ci_high": 0.4324084862016873
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
+
1126,
|
1248 |
+
594,
|
1249 |
+
349,
|
1250 |
+
214
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
+
1834,
|
1254 |
+
1768,
|
1255 |
+
1702,
|
1256 |
+
1636
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
+
0.6139585605234461,
|
1260 |
+
0.335972850678733,
|
1261 |
+
0.20505287896592242,
|
1262 |
+
0.13080684596577016
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
+
"sys_len": 1834,
|
1266 |
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.2727312463583288,
|
1268 |
+
"score": 0.2727312463583288,
|
1269 |
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.23703993855927166,
|
1271 |
+
"score_ci_high": 0.3146036672452131,
|
1272 |
+
"sacrebleu_ci_low": 0.23703993855927166,
|
1273 |
+
"sacrebleu_ci_high": 0.3146036672452131
|
1274 |
},
|
1275 |
+
"score": 0.2956545664531376,
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
+
"score": 0.4066778576916836,
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
results/bluebench/{2025-06-19T20-10-50_evaluation_results.json → 2025-06-23T06-18-33_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
-
"timestamp_utc": "2025-06-
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
@@ -42,7 +42,7 @@
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
-
"unitxt_commit_hash": "
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
@@ -256,13 +256,13 @@
|
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
-
"accuracy": 0.
|
260 |
-
"accuracy_ci_low": 0.
|
261 |
-
"accuracy_ci_high": 0.
|
262 |
"score_name": "accuracy",
|
263 |
-
"score": 0.
|
264 |
-
"score_ci_high": 0.
|
265 |
-
"score_ci_low": 0.
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
@@ -285,74 +285,74 @@
|
|
285 |
"score_ci_low": 0.7888888888888889,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
-
"score": 0.
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
296 |
-
"score": 0.
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
-
"score": 0.
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
-
"f1_Person": 0.
|
307 |
-
"f1_Organization": 0.
|
308 |
-
"f1_Location": 0.
|
309 |
-
"f1_macro": 0.
|
310 |
-
"recall_macro": 0.
|
311 |
-
"precision_macro": 0.
|
312 |
-
"in_classes_support": 0.
|
313 |
-
"f1_micro": 0.
|
314 |
-
"recall_micro": 0.
|
315 |
-
"precision_micro": 0.
|
316 |
-
"score": 0.
|
317 |
"score_name": "f1_micro",
|
318 |
-
"score_ci_low": 0.
|
319 |
-
"score_ci_high": 0.
|
320 |
-
"f1_micro_ci_low": 0.
|
321 |
-
"f1_micro_ci_high": 0.
|
322 |
},
|
323 |
-
"score": 0.
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
-
"accuracy": 0.
|
330 |
-
"accuracy_ci_low": 0.
|
331 |
-
"accuracy_ci_high": 0.
|
332 |
"score_name": "accuracy",
|
333 |
-
"score": 0.
|
334 |
-
"score_ci_high": 0.
|
335 |
-
"score_ci_low": 0.
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
-
"accuracy": 0.
|
340 |
-
"accuracy_ci_low": 0.
|
341 |
-
"accuracy_ci_high": 0.
|
342 |
"score_name": "accuracy",
|
343 |
-
"score": 0.
|
344 |
-
"score_ci_high": 0.
|
345 |
-
"score_ci_low": 0.
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
-
"accuracy": 0.
|
350 |
-
"accuracy_ci_low": 0.
|
351 |
-
"accuracy_ci_high": 0.
|
352 |
"score_name": "accuracy",
|
353 |
-
"score": 0.
|
354 |
-
"score_ci_high": 0.
|
355 |
-
"score_ci_low": 0.
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
@@ -366,189 +366,189 @@
|
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
-
"accuracy": 0.
|
370 |
-
"accuracy_ci_low": 0.
|
371 |
-
"accuracy_ci_high": 0.
|
372 |
"score_name": "accuracy",
|
373 |
-
"score": 0.
|
374 |
-
"score_ci_high": 0.
|
375 |
-
"score_ci_low": 0.
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
-
"accuracy": 0.
|
380 |
-
"accuracy_ci_low": 0.
|
381 |
-
"accuracy_ci_high": 0.
|
382 |
"score_name": "accuracy",
|
383 |
-
"score": 0.
|
384 |
-
"score_ci_high": 0.
|
385 |
-
"score_ci_low": 0.
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
-
"accuracy": 0.
|
390 |
-
"accuracy_ci_low": 0.
|
391 |
-
"accuracy_ci_high": 0.
|
392 |
"score_name": "accuracy",
|
393 |
-
"score": 0.
|
394 |
-
"score_ci_high": 0.
|
395 |
-
"score_ci_low": 0.
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
-
"accuracy": 0.
|
400 |
-
"accuracy_ci_low": 0.
|
401 |
-
"accuracy_ci_high": 0.
|
402 |
"score_name": "accuracy",
|
403 |
-
"score": 0.
|
404 |
-
"score_ci_high": 0.
|
405 |
-
"score_ci_low": 0.
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
-
"accuracy": 0.
|
410 |
-
"accuracy_ci_low": 0.
|
411 |
-
"accuracy_ci_high": 0.
|
412 |
"score_name": "accuracy",
|
413 |
-
"score": 0.
|
414 |
-
"score_ci_high": 0.
|
415 |
-
"score_ci_low": 0.
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
-
"accuracy": 0.
|
420 |
-
"accuracy_ci_low": 0.
|
421 |
-
"accuracy_ci_high": 0.
|
422 |
"score_name": "accuracy",
|
423 |
-
"score": 0.
|
424 |
-
"score_ci_high": 0.
|
425 |
-
"score_ci_low": 0.
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
-
"accuracy": 0.
|
430 |
-
"accuracy_ci_low": 0.
|
431 |
-
"accuracy_ci_high": 0.
|
432 |
"score_name": "accuracy",
|
433 |
-
"score": 0.
|
434 |
-
"score_ci_high": 0.
|
435 |
-
"score_ci_low": 0.
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
"accuracy": 0.6901408450704225,
|
440 |
-
"accuracy_ci_low": 0.
|
441 |
"accuracy_ci_high": 0.7887323943661971,
|
442 |
"score_name": "accuracy",
|
443 |
"score": 0.6901408450704225,
|
444 |
"score_ci_high": 0.7887323943661971,
|
445 |
-
"score_ci_low": 0.
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
-
"accuracy": 0.
|
450 |
-
"accuracy_ci_low": 0.
|
451 |
-
"accuracy_ci_high": 0.
|
452 |
"score_name": "accuracy",
|
453 |
-
"score": 0.
|
454 |
-
"score_ci_high": 0.
|
455 |
-
"score_ci_low": 0.
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
-
"accuracy": 0.
|
460 |
-
"accuracy_ci_low": 0.
|
461 |
"accuracy_ci_high": 0.7605633802816901,
|
462 |
"score_name": "accuracy",
|
463 |
-
"score": 0.
|
464 |
"score_ci_high": 0.7605633802816901,
|
465 |
-
"score_ci_low": 0.
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
-
"score": 0.
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
-
"f1_macro": 0.
|
475 |
-
"f1_suggestive": 0.
|
476 |
"f1_generic": 0.6666666666666666,
|
477 |
-
"f1_descriptive": 0.
|
478 |
"f1_fanciful": 0.4166666666666667,
|
479 |
-
"f1_arbitrary": 0.
|
480 |
-
"f1_macro_ci_low": 0.
|
481 |
-
"f1_macro_ci_high": 0.
|
482 |
"score_name": "f1_micro",
|
483 |
-
"score": 0.
|
484 |
-
"score_ci_high": 0.
|
485 |
-
"score_ci_low": 0.
|
486 |
"num_of_instances": 85,
|
487 |
-
"accuracy": 0.
|
488 |
-
"accuracy_ci_low": 0.
|
489 |
"accuracy_ci_high": 0.6588235294117647,
|
490 |
-
"f1_micro": 0.
|
491 |
-
"f1_micro_ci_low": 0.
|
492 |
-
"f1_micro_ci_high": 0.
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
-
"f1_macro": 0.
|
496 |
-
"f1_no": 0.
|
497 |
-
"f1_yes": 0.
|
498 |
-
"f1_macro_ci_low": 0.
|
499 |
-
"f1_macro_ci_high": 0.
|
500 |
"score_name": "f1_micro",
|
501 |
-
"score": 0.
|
502 |
-
"score_ci_high": 0.
|
503 |
-
"score_ci_low": 0.
|
504 |
"num_of_instances": 200,
|
505 |
-
"accuracy": 0.
|
506 |
-
"accuracy_ci_low": 0.
|
507 |
-
"accuracy_ci_high": 0.
|
508 |
-
"f1_micro": 0.
|
509 |
-
"f1_micro_ci_low": 0.
|
510 |
-
"f1_micro_ci_high": 0.
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
-
"f1_macro": 0.
|
514 |
"f1_conclusion": 0.0625,
|
515 |
-
"f1_issue": 0.16326530612244897,
|
516 |
"f1_decree": 0.2,
|
517 |
-
"
|
518 |
-
"
|
519 |
-
"
|
520 |
-
"
|
521 |
-
"
|
522 |
-
"
|
|
|
523 |
"score_name": "f1_micro",
|
524 |
-
"score": 0.
|
525 |
-
"score_ci_high": 0.
|
526 |
-
"score_ci_low": 0.
|
527 |
"num_of_instances": 200,
|
528 |
-
"accuracy": 0.
|
529 |
-
"accuracy_ci_low": 0.
|
530 |
-
"accuracy_ci_high": 0.
|
531 |
-
"f1_micro": 0.
|
532 |
-
"f1_micro_ci_low": 0.
|
533 |
-
"f1_micro_ci_high": 0.
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
-
"f1_macro": 0.
|
537 |
-
"f1_yes": 0.
|
538 |
-
"f1_no": 0.
|
539 |
-
"f1_macro_ci_low": 0.
|
540 |
-
"f1_macro_ci_high": 0.
|
541 |
"score_name": "f1_micro",
|
542 |
-
"score": 0.
|
543 |
-
"score_ci_high": 0.
|
544 |
-
"score_ci_low": 0.
|
545 |
"num_of_instances": 200,
|
546 |
-
"accuracy": 0.
|
547 |
-
"accuracy_ci_low": 0.
|
548 |
-
"accuracy_ci_high": 0.
|
549 |
-
"f1_micro": 0.
|
550 |
-
"f1_micro_ci_low": 0.
|
551 |
-
"f1_micro_ci_high": 0.
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
"f1_macro": 0.7776061776061776,
|
@@ -568,241 +568,241 @@
|
|
568 |
"f1_micro_ci_low": 0.6950354609929078,
|
569 |
"f1_micro_ci_high": 0.8435374149659864
|
570 |
},
|
571 |
-
"score": 0.
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
-
"f1_macro": 0.
|
578 |
"f1_cars": 0.8089887640449438,
|
579 |
"f1_windows x": 0.06153846153846154,
|
580 |
-
"f1_computer graphics": 0.
|
581 |
"f1_atheism": 0.1951219512195122,
|
582 |
-
"f1_christianity": 0.
|
583 |
-
"f1_religion": 0.
|
584 |
-
"f1_medicine": 0.
|
585 |
-
"f1_microsoft windows": 0.
|
586 |
"f1_middle east": 0.6666666666666666,
|
587 |
"f1_motorcycles": 0.7619047619047619,
|
588 |
-
"f1_politics": 0.
|
589 |
-
"f1_pc hardware": 0.
|
590 |
-
"f1_mac hardware": 0.
|
591 |
-
"
|
592 |
-
"
|
|
|
593 |
"f1_space": 0.82,
|
594 |
-
"f1_cryptography": 0.
|
595 |
-
"f1_baseball": 0.
|
596 |
-
"f1_hockey": 0.
|
597 |
-
"
|
598 |
-
"
|
599 |
-
"f1_macro_ci_high": 0.6464945617502024,
|
600 |
"score_name": "f1_micro",
|
601 |
-
"score": 0.
|
602 |
-
"score_ci_high": 0.
|
603 |
-
"score_ci_low": 0.
|
604 |
"num_of_instances": 1000,
|
605 |
"accuracy": 0.608,
|
606 |
-
"accuracy_ci_low": 0.
|
607 |
-
"accuracy_ci_high": 0.
|
608 |
-
"f1_micro": 0.
|
609 |
-
"f1_micro_ci_low": 0.
|
610 |
-
"f1_micro_ci_high": 0.
|
611 |
},
|
612 |
-
"score": 0.
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
-
"f1_macro": 0.
|
619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
620 |
-
"
|
621 |
-
"f1_debt collection": 0.
|
622 |
-
"
|
623 |
-
"
|
624 |
-
"f1_student loan": 0.
|
625 |
-
"
|
626 |
-
"
|
627 |
-
"f1_payday loan or title loan or personal loan": 0.
|
628 |
-
"f1_macro_ci_low": 0.
|
629 |
-
"f1_macro_ci_high": 0.
|
630 |
"score_name": "f1_micro",
|
631 |
-
"score": 0.
|
632 |
-
"score_ci_high": 0.
|
633 |
-
"score_ci_low": 0.
|
634 |
"num_of_instances": 1000,
|
635 |
-
"accuracy": 0.
|
636 |
-
"accuracy_ci_low": 0.
|
637 |
-
"accuracy_ci_high": 0.
|
638 |
-
"f1_micro": 0.
|
639 |
-
"f1_micro_ci_low": 0.
|
640 |
-
"f1_micro_ci_high": 0.
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
-
"f1_macro": 0.
|
644 |
-
"f1_mortgages and loans": 0.
|
645 |
-
"f1_credit card": 0.
|
646 |
-
"f1_debt collection": 0.
|
647 |
"f1_credit reporting": 0.752851711026616,
|
648 |
-
"f1_retail banking": 0.
|
649 |
-
"f1_macro_ci_low": 0.
|
650 |
-
"f1_macro_ci_high": 0.
|
651 |
"score_name": "f1_micro",
|
652 |
"score": 0.7741273100616016,
|
653 |
-
"score_ci_high": 0.
|
654 |
-
"score_ci_low": 0.
|
655 |
"num_of_instances": 500,
|
656 |
"accuracy": 0.754,
|
657 |
"accuracy_ci_low": 0.716,
|
658 |
-
"accuracy_ci_high": 0.
|
659 |
"f1_micro": 0.7741273100616016,
|
660 |
-
"f1_micro_ci_low": 0.
|
661 |
-
"f1_micro_ci_high": 0.
|
662 |
},
|
663 |
-
"score": 0.
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
-
"
|
671 |
-
"
|
672 |
-
"score": 0.182,
|
673 |
"score_name": "program_accuracy",
|
674 |
-
"
|
675 |
-
"
|
676 |
-
"
|
677 |
-
"
|
678 |
-
"
|
679 |
-
"
|
|
|
680 |
},
|
681 |
-
"score": 0.
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
-
"precision": 0.
|
688 |
-
"recall": 0.
|
689 |
-
"f1": 0.
|
690 |
-
"precision_ci_low": 0.
|
691 |
-
"precision_ci_high": 0.
|
692 |
-
"recall_ci_low": 0.
|
693 |
-
"recall_ci_high": 0.
|
694 |
-
"f1_ci_low": 0.
|
695 |
-
"f1_ci_high": 0.
|
696 |
"score_name": "f1",
|
697 |
-
"score": 0.
|
698 |
-
"score_ci_high": 0.
|
699 |
-
"score_ci_low": 0.
|
700 |
"num_of_instances": 600,
|
701 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
702 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
703 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
704 |
-
"faithfullness_f1_token_overlap": 0.
|
705 |
-
"faithfullness_recall_token_overlap": 0.
|
706 |
-
"faithfullness_precision_token_overlap": 0.
|
707 |
-
"correctness_f1_token_overlap": 0.
|
708 |
-
"correctness_recall_token_overlap": 0.
|
709 |
-
"correctness_precision_token_overlap": 0.
|
710 |
},
|
711 |
-
"score": 0.
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
-
"accuracy": 0.
|
718 |
-
"accuracy_ci_low": 0.
|
719 |
-
"accuracy_ci_high": 0.
|
720 |
"score_name": "accuracy",
|
721 |
-
"score": 0.
|
722 |
-
"score_ci_high": 0.
|
723 |
-
"score_ci_low": 0.
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
-
"accuracy": 0.
|
728 |
-
"accuracy_ci_low": 0.
|
729 |
-
"accuracy_ci_high": 0.
|
730 |
"score_name": "accuracy",
|
731 |
-
"score": 0.
|
732 |
-
"score_ci_high": 0.
|
733 |
-
"score_ci_low": 0.
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
-
"score": 0.
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
"severity_critical": 0.0,
|
743 |
-
"severity_high":
|
744 |
-
"severity_medium":
|
745 |
"severity_low": 96.0,
|
746 |
-
"category_harmful_info": 0.
|
747 |
-
"category_deception": 0.
|
748 |
-
"category_explicit_content": 0.
|
749 |
-
"category_violence": 0.
|
750 |
-
"category_discrimination": 0.
|
751 |
-
"category_substance_abuse": 0.
|
752 |
-
"category_pii": 0.
|
753 |
-
"safety": 0.
|
754 |
-
"safety_ci_low": 0.
|
755 |
-
"safety_ci_high": 0.
|
756 |
"score_name": "safety",
|
757 |
-
"score": 0.
|
758 |
-
"score_ci_high": 0.
|
759 |
-
"score_ci_low": 0.
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
-
"score": 0.
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
-
"
|
770 |
-
"rougeL": 0.
|
771 |
-
"score": 0.
|
772 |
"score_name": "rougeL",
|
773 |
-
"
|
774 |
-
"
|
775 |
-
"
|
776 |
-
"
|
777 |
-
"rougeL_ci_low": 0.
|
778 |
-
"rougeL_ci_high": 0.
|
779 |
-
"score_ci_low": 0.
|
780 |
-
"score_ci_high": 0.
|
781 |
-
"
|
782 |
-
"
|
783 |
-
"
|
784 |
-
"
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
-
"
|
789 |
-
"rougeL": 0.
|
790 |
-
"score": 0.
|
791 |
"score_name": "rougeL",
|
792 |
-
"
|
793 |
-
"
|
794 |
-
"
|
795 |
-
"
|
796 |
-
"rougeL_ci_low": 0.
|
797 |
-
"rougeL_ci_high": 0.
|
798 |
-
"score_ci_low": 0.
|
799 |
-
"score_ci_high": 0.
|
800 |
-
"
|
801 |
-
"
|
802 |
-
"
|
803 |
-
"
|
804 |
},
|
805 |
-
"score": 0.
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
@@ -810,473 +810,473 @@
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
],
|
818 |
"totals": [
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
],
|
824 |
"precisions": [
|
825 |
-
0.
|
826 |
-
0.
|
827 |
-
0.
|
828 |
-
0.
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
-
"sys_len":
|
832 |
"ref_len": 1734,
|
833 |
-
"sacrebleu": 0.
|
834 |
-
"score": 0.
|
835 |
"score_name": "sacrebleu",
|
836 |
-
"score_ci_low": 0.
|
837 |
-
"score_ci_high": 0.
|
838 |
-
"sacrebleu_ci_low": 0.
|
839 |
-
"sacrebleu_ci_high": 0.
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
|
848 |
],
|
849 |
"totals": [
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
],
|
855 |
"precisions": [
|
856 |
-
0.
|
857 |
-
0.
|
858 |
-
0.
|
859 |
-
0.
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
-
"sys_len":
|
863 |
"ref_len": 1734,
|
864 |
-
"sacrebleu": 0.
|
865 |
-
"score": 0.
|
866 |
"score_name": "sacrebleu",
|
867 |
-
"score_ci_low": 0.
|
868 |
-
"score_ci_high": 0.
|
869 |
-
"sacrebleu_ci_low": 0.
|
870 |
-
"sacrebleu_ci_high": 0.
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
-
|
876 |
-
|
877 |
-
|
878 |
-
|
879 |
],
|
880 |
"totals": [
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
],
|
886 |
"precisions": [
|
887 |
-
0.
|
888 |
-
0.
|
889 |
-
0.
|
890 |
-
0.
|
891 |
],
|
892 |
-
"bp":
|
893 |
-
"sys_len":
|
894 |
"ref_len": 1589,
|
895 |
-
"sacrebleu": 0.
|
896 |
-
"score": 0.
|
897 |
"score_name": "sacrebleu",
|
898 |
-
"score_ci_low": 0.
|
899 |
-
"score_ci_high": 0.
|
900 |
-
"sacrebleu_ci_low": 0.
|
901 |
-
"sacrebleu_ci_high": 0.
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
-
|
907 |
-
|
908 |
-
|
909 |
-
|
910 |
],
|
911 |
"totals": [
|
912 |
-
|
913 |
-
|
914 |
-
|
915 |
-
|
916 |
],
|
917 |
"precisions": [
|
918 |
-
0.
|
919 |
-
0.
|
920 |
-
0.
|
921 |
-
0.
|
922 |
],
|
923 |
-
"bp": 0.
|
924 |
-
"sys_len":
|
925 |
"ref_len": 1835,
|
926 |
-
"sacrebleu": 0.
|
927 |
-
"score": 0.
|
928 |
"score_name": "sacrebleu",
|
929 |
-
"score_ci_low": 0.
|
930 |
-
"score_ci_high": 0.
|
931 |
-
"sacrebleu_ci_low": 0.
|
932 |
-
"sacrebleu_ci_high": 0.
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
],
|
942 |
"totals": [
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
],
|
948 |
"precisions": [
|
949 |
-
0.
|
950 |
-
0.
|
951 |
-
0.
|
952 |
-
0.
|
953 |
],
|
954 |
-
"bp": 0.
|
955 |
-
"sys_len":
|
956 |
"ref_len": 2068,
|
957 |
-
"sacrebleu": 0.
|
958 |
-
"score": 0.
|
959 |
"score_name": "sacrebleu",
|
960 |
-
"score_ci_low": 0.
|
961 |
-
"score_ci_high": 0.
|
962 |
-
"sacrebleu_ci_low": 0.
|
963 |
-
"sacrebleu_ci_high": 0.
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
-
|
969 |
-
|
970 |
-
|
971 |
-
|
972 |
],
|
973 |
"totals": [
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
],
|
979 |
"precisions": [
|
980 |
-
0.
|
981 |
-
0.
|
982 |
-
0.
|
983 |
-
0.
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
-
"sys_len":
|
987 |
"ref_len": 2235,
|
988 |
-
"sacrebleu": 0.
|
989 |
-
"score": 0.
|
990 |
"score_name": "sacrebleu",
|
991 |
-
"score_ci_low": 0.
|
992 |
-
"score_ci_high": 0.
|
993 |
-
"sacrebleu_ci_low": 0.
|
994 |
-
"sacrebleu_ci_high": 0.
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
-
|
1000 |
-
|
1001 |
-
|
1002 |
-
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
-
|
1006 |
-
|
1007 |
-
|
1008 |
-
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
-
0.
|
1012 |
-
0.
|
1013 |
-
0.
|
1014 |
-
0.
|
1015 |
],
|
1016 |
-
"bp": 0.
|
1017 |
-
"sys_len":
|
1018 |
"ref_len": 1916,
|
1019 |
-
"sacrebleu": 0.
|
1020 |
-
"score": 0.
|
1021 |
"score_name": "sacrebleu",
|
1022 |
-
"score_ci_low": 0.
|
1023 |
-
"score_ci_high": 0.
|
1024 |
-
"sacrebleu_ci_low": 0.
|
1025 |
-
"sacrebleu_ci_high": 0.
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
-
|
1031 |
-
|
1032 |
-
|
1033 |
-
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
-
0.
|
1043 |
-
0.
|
1044 |
-
0.
|
1045 |
-
0.
|
1046 |
],
|
1047 |
-
"bp": 0.
|
1048 |
-
"sys_len":
|
1049 |
"ref_len": 1949,
|
1050 |
-
"sacrebleu": 0.
|
1051 |
-
"score": 0.
|
1052 |
"score_name": "sacrebleu",
|
1053 |
-
"score_ci_low": 0.
|
1054 |
-
"score_ci_high": 0.
|
1055 |
-
"sacrebleu_ci_low": 0.
|
1056 |
-
"sacrebleu_ci_high": 0.
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
-
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
-
0.
|
1074 |
-
0.
|
1075 |
-
0.
|
1076 |
-
0.
|
1077 |
],
|
1078 |
-
"bp": 0.
|
1079 |
-
"sys_len":
|
1080 |
"ref_len": 2098,
|
1081 |
-
"sacrebleu": 0.
|
1082 |
-
"score": 0.
|
1083 |
"score_name": "sacrebleu",
|
1084 |
-
"score_ci_low": 0.
|
1085 |
-
"score_ci_high": 0.
|
1086 |
-
"sacrebleu_ci_low": 0.
|
1087 |
-
"sacrebleu_ci_high": 0.
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
-
|
1093 |
-
|
1094 |
-
|
1095 |
-
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
-
0.
|
1105 |
-
0.
|
1106 |
-
0.
|
1107 |
-
0.
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
-
"sys_len":
|
1111 |
"ref_len": 1734,
|
1112 |
-
"sacrebleu": 0.
|
1113 |
-
"score": 0.
|
1114 |
"score_name": "sacrebleu",
|
1115 |
-
"score_ci_low": 0.
|
1116 |
-
"score_ci_high": 0.
|
1117 |
-
"sacrebleu_ci_low": 0.
|
1118 |
-
"sacrebleu_ci_high": 0.
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
-
|
1124 |
-
|
1125 |
-
|
1126 |
-
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
-
|
1130 |
-
|
1131 |
-
|
1132 |
-
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
-
0.
|
1136 |
-
0.
|
1137 |
-
0.
|
1138 |
-
0.
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
-
"sys_len":
|
1142 |
"ref_len": 1734,
|
1143 |
-
"sacrebleu": 0.
|
1144 |
-
"score": 0.
|
1145 |
"score_name": "sacrebleu",
|
1146 |
-
"score_ci_low": 0.
|
1147 |
-
"score_ci_high": 0.
|
1148 |
-
"sacrebleu_ci_low": 0.
|
1149 |
-
"sacrebleu_ci_high": 0.
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
-
|
1155 |
-
|
1156 |
-
|
1157 |
-
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
-
|
1161 |
-
|
1162 |
-
|
1163 |
-
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
-
0.
|
1167 |
-
0.
|
1168 |
-
0.
|
1169 |
-
0.
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
-
"sys_len":
|
1173 |
"ref_len": 1734,
|
1174 |
-
"sacrebleu": 0.
|
1175 |
-
"score": 0.
|
1176 |
"score_name": "sacrebleu",
|
1177 |
-
"score_ci_low": 0.
|
1178 |
-
"score_ci_high": 0.
|
1179 |
-
"sacrebleu_ci_low": 0.
|
1180 |
-
"sacrebleu_ci_high": 0.
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
-
|
1186 |
-
|
1187 |
-
|
1188 |
-
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
-
|
1192 |
-
|
1193 |
-
|
1194 |
-
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
-
0.
|
1198 |
-
0.
|
1199 |
-
0.
|
1200 |
-
0.
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
-
"sys_len":
|
1204 |
"ref_len": 1734,
|
1205 |
-
"sacrebleu": 0.
|
1206 |
-
"score": 0.
|
1207 |
"score_name": "sacrebleu",
|
1208 |
-
"score_ci_low": 0.
|
1209 |
-
"score_ci_high": 0.
|
1210 |
-
"sacrebleu_ci_low": 0.
|
1211 |
-
"sacrebleu_ci_high": 0.
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
1324,
|
1217 |
-
|
1218 |
-
|
1219 |
-
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
-
0.
|
1229 |
-
0.
|
1230 |
-
0.
|
1231 |
-
0.
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
-
"sys_len":
|
1235 |
"ref_len": 1734,
|
1236 |
-
"sacrebleu": 0.
|
1237 |
-
"score": 0.
|
1238 |
"score_name": "sacrebleu",
|
1239 |
-
"score_ci_low": 0.
|
1240 |
-
"score_ci_high": 0.
|
1241 |
-
"sacrebleu_ci_low": 0.
|
1242 |
-
"sacrebleu_ci_high": 0.
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
1250 |
-
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
-
|
1254 |
-
|
1255 |
-
|
1256 |
-
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
-
0.
|
1260 |
-
0.
|
1261 |
-
0.
|
1262 |
-
0.
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
-
"sys_len":
|
1266 |
"ref_len": 1734,
|
1267 |
-
"sacrebleu": 0.
|
1268 |
-
"score": 0.
|
1269 |
"score_name": "sacrebleu",
|
1270 |
-
"score_ci_low": 0.
|
1271 |
-
"score_ci_high": 0.
|
1272 |
-
"sacrebleu_ci_low": 0.
|
1273 |
-
"sacrebleu_ci_high": 0.
|
1274 |
},
|
1275 |
-
"score": 0.
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
-
"score": 0.
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-23T10:18:29.800050Z",
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.9,
|
260 |
+
"accuracy_ci_low": 0.8333333333333334,
|
261 |
+
"accuracy_ci_high": 0.9555555555555556,
|
262 |
"score_name": "accuracy",
|
263 |
+
"score": 0.9,
|
264 |
+
"score_ci_high": 0.9555555555555556,
|
265 |
+
"score_ci_low": 0.8333333333333334,
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
|
|
285 |
"score_ci_low": 0.7888888888888889,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
+
"score": 0.9636363636363636,
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.14444444444444443,
|
296 |
+
"score": 0.14444444444444443,
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
+
"score": 0.14444444444444443,
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.5653333333333334,
|
307 |
+
"f1_Organization": 0.33757961783439494,
|
308 |
+
"f1_Location": 0.3529411764705882,
|
309 |
+
"f1_macro": 0.4186180425461055,
|
310 |
+
"recall_macro": 0.3749591226403319,
|
311 |
+
"precision_macro": 0.47607168955040186,
|
312 |
+
"in_classes_support": 0.4988095238095238,
|
313 |
+
"f1_micro": 0.2989010989010989,
|
314 |
+
"recall_micro": 0.38857142857142857,
|
315 |
+
"precision_micro": 0.24285714285714285,
|
316 |
+
"score": 0.2989010989010989,
|
317 |
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.25706421958601,
|
319 |
+
"score_ci_high": 0.34884085688698435,
|
320 |
+
"f1_micro_ci_low": 0.25706421958601,
|
321 |
+
"f1_micro_ci_high": 0.34884085688698435
|
322 |
},
|
323 |
+
"score": 0.2989010989010989,
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.5633802816901409,
|
330 |
+
"accuracy_ci_low": 0.4647887323943662,
|
331 |
+
"accuracy_ci_high": 0.6855024917261459,
|
332 |
"score_name": "accuracy",
|
333 |
+
"score": 0.5633802816901409,
|
334 |
+
"score_ci_high": 0.6855024917261459,
|
335 |
+
"score_ci_low": 0.4647887323943662,
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.29577464788732394,
|
340 |
+
"accuracy_ci_low": 0.19718309859154928,
|
341 |
+
"accuracy_ci_high": 0.4225352112676056,
|
342 |
"score_name": "accuracy",
|
343 |
+
"score": 0.29577464788732394,
|
344 |
+
"score_ci_high": 0.4225352112676056,
|
345 |
+
"score_ci_low": 0.19718309859154928,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.22535211267605634,
|
350 |
+
"accuracy_ci_low": 0.14084507042253522,
|
351 |
+
"accuracy_ci_high": 0.323943661971831,
|
352 |
"score_name": "accuracy",
|
353 |
+
"score": 0.22535211267605634,
|
354 |
+
"score_ci_high": 0.323943661971831,
|
355 |
+
"score_ci_low": 0.14084507042253522,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
|
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.676056338028169,
|
370 |
+
"accuracy_ci_low": 0.5596886617559699,
|
371 |
+
"accuracy_ci_high": 0.7746478873239436,
|
372 |
"score_name": "accuracy",
|
373 |
+
"score": 0.676056338028169,
|
374 |
+
"score_ci_high": 0.7746478873239436,
|
375 |
+
"score_ci_low": 0.5596886617559699,
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.4225352112676056,
|
380 |
+
"accuracy_ci_low": 0.30985915492957744,
|
381 |
+
"accuracy_ci_high": 0.5488120473991023,
|
382 |
"score_name": "accuracy",
|
383 |
+
"score": 0.4225352112676056,
|
384 |
+
"score_ci_high": 0.5488120473991023,
|
385 |
+
"score_ci_low": 0.30985915492957744,
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.49295774647887325,
|
390 |
+
"accuracy_ci_low": 0.36619718309859156,
|
391 |
+
"accuracy_ci_high": 0.5915492957746479,
|
392 |
"score_name": "accuracy",
|
393 |
+
"score": 0.49295774647887325,
|
394 |
+
"score_ci_high": 0.5915492957746479,
|
395 |
+
"score_ci_low": 0.36619718309859156,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.7605633802816901,
|
400 |
+
"accuracy_ci_low": 0.647887323943662,
|
401 |
+
"accuracy_ci_high": 0.8450704225352113,
|
402 |
"score_name": "accuracy",
|
403 |
+
"score": 0.7605633802816901,
|
404 |
+
"score_ci_high": 0.8450704225352113,
|
405 |
+
"score_ci_low": 0.647887323943662,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.6056338028169014,
|
410 |
+
"accuracy_ci_low": 0.49295774647887325,
|
411 |
+
"accuracy_ci_high": 0.7183098591549296,
|
412 |
"score_name": "accuracy",
|
413 |
+
"score": 0.6056338028169014,
|
414 |
+
"score_ci_high": 0.7183098591549296,
|
415 |
+
"score_ci_low": 0.49295774647887325,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.2535211267605634,
|
420 |
+
"accuracy_ci_low": 0.15492957746478872,
|
421 |
+
"accuracy_ci_high": 0.36619718309859156,
|
422 |
"score_name": "accuracy",
|
423 |
+
"score": 0.2535211267605634,
|
424 |
+
"score_ci_high": 0.36619718309859156,
|
425 |
+
"score_ci_low": 0.15492957746478872,
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.5211267605633803,
|
430 |
+
"accuracy_ci_low": 0.39436619718309857,
|
431 |
+
"accuracy_ci_high": 0.6338028169014085,
|
432 |
"score_name": "accuracy",
|
433 |
+
"score": 0.5211267605633803,
|
434 |
+
"score_ci_high": 0.6338028169014085,
|
435 |
+
"score_ci_low": 0.39436619718309857,
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
"accuracy": 0.6901408450704225,
|
440 |
+
"accuracy_ci_low": 0.5633802816901409,
|
441 |
"accuracy_ci_high": 0.7887323943661971,
|
442 |
"score_name": "accuracy",
|
443 |
"score": 0.6901408450704225,
|
444 |
"score_ci_high": 0.7887323943661971,
|
445 |
+
"score_ci_low": 0.5633802816901409,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.39436619718309857,
|
450 |
+
"accuracy_ci_low": 0.2742524569401369,
|
451 |
+
"accuracy_ci_high": 0.5070422535211268,
|
452 |
"score_name": "accuracy",
|
453 |
+
"score": 0.39436619718309857,
|
454 |
+
"score_ci_high": 0.5070422535211268,
|
455 |
+
"score_ci_low": 0.2742524569401369,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.6619718309859155,
|
460 |
+
"accuracy_ci_low": 0.5492957746478874,
|
461 |
"accuracy_ci_high": 0.7605633802816901,
|
462 |
"score_name": "accuracy",
|
463 |
+
"score": 0.6619718309859155,
|
464 |
"score_ci_high": 0.7605633802816901,
|
465 |
+
"score_ci_low": 0.5492957746478874,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
+
"score": 0.5050301810865191,
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.5900679117147707,
|
475 |
+
"f1_suggestive": 0.5161290322580645,
|
476 |
"f1_generic": 0.6666666666666666,
|
477 |
+
"f1_descriptive": 0.6842105263157895,
|
478 |
"f1_fanciful": 0.4166666666666667,
|
479 |
+
"f1_arbitrary": 0.6666666666666666,
|
480 |
+
"f1_macro_ci_low": 0.48866628515797084,
|
481 |
+
"f1_macro_ci_high": 0.6952557983585582,
|
482 |
"score_name": "f1_micro",
|
483 |
+
"score": 0.6037735849056604,
|
484 |
+
"score_ci_high": 0.6980886219395492,
|
485 |
+
"score_ci_low": 0.4810734018080045,
|
486 |
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.5647058823529412,
|
488 |
+
"accuracy_ci_low": 0.4470588235294118,
|
489 |
"accuracy_ci_high": 0.6588235294117647,
|
490 |
+
"f1_micro": 0.6037735849056604,
|
491 |
+
"f1_micro_ci_low": 0.4810734018080045,
|
492 |
+
"f1_micro_ci_high": 0.6980886219395492
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.6789943663051392,
|
496 |
+
"f1_no": 0.7806691449814126,
|
497 |
+
"f1_yes": 0.5773195876288659,
|
498 |
+
"f1_macro_ci_low": 0.6027203961421103,
|
499 |
+
"f1_macro_ci_high": 0.7510411869611957,
|
500 |
"score_name": "f1_micro",
|
501 |
+
"score": 0.726775956284153,
|
502 |
+
"score_ci_high": 0.7809156964912598,
|
503 |
+
"score_ci_low": 0.6593055710063558,
|
504 |
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.665,
|
506 |
+
"accuracy_ci_low": 0.595,
|
507 |
+
"accuracy_ci_high": 0.725,
|
508 |
+
"f1_micro": 0.726775956284153,
|
509 |
+
"f1_micro_ci_low": 0.6593055710063558,
|
510 |
+
"f1_micro_ci_high": 0.7809156964912598
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.2756818181818182,
|
514 |
"f1_conclusion": 0.0625,
|
|
|
515 |
"f1_decree": 0.2,
|
516 |
+
"f1_issue": 0.16666666666666666,
|
517 |
+
"f1_analysis": 0.5,
|
518 |
+
"f1_facts": 0.3333333333333333,
|
519 |
+
"f1_procedural history": 0.22727272727272727,
|
520 |
+
"f1_rule": 0.44,
|
521 |
+
"f1_macro_ci_low": 0.21996524889806546,
|
522 |
+
"f1_macro_ci_high": 0.34934856402818654,
|
523 |
"score_name": "f1_micro",
|
524 |
+
"score": 0.3032258064516129,
|
525 |
+
"score_ci_high": 0.37934863351152043,
|
526 |
+
"score_ci_low": 0.23767600886432785,
|
527 |
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.235,
|
529 |
+
"accuracy_ci_low": 0.18,
|
530 |
+
"accuracy_ci_high": 0.3,
|
531 |
+
"f1_micro": 0.3032258064516129,
|
532 |
+
"f1_micro_ci_low": 0.23767600886432785,
|
533 |
+
"f1_micro_ci_high": 0.37934863351152043
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.5552471583399419,
|
537 |
+
"f1_yes": 0.5463917525773195,
|
538 |
+
"f1_no": 0.5641025641025641,
|
539 |
+
"f1_macro_ci_low": 0.4896587694098791,
|
540 |
+
"f1_macro_ci_high": 0.6253621455216213,
|
541 |
"score_name": "f1_micro",
|
542 |
+
"score": 0.5552699228791774,
|
543 |
+
"score_ci_high": 0.6232020193247749,
|
544 |
+
"score_ci_low": 0.48717948717948717,
|
545 |
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.54,
|
547 |
+
"accuracy_ci_low": 0.47,
|
548 |
+
"accuracy_ci_high": 0.61,
|
549 |
+
"f1_micro": 0.5552699228791774,
|
550 |
+
"f1_micro_ci_low": 0.48717948717948717,
|
551 |
+
"f1_micro_ci_high": 0.6232020193247749
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
"f1_macro": 0.7776061776061776,
|
|
|
568 |
"f1_micro_ci_low": 0.6950354609929078,
|
569 |
"f1_micro_ci_high": 0.8435374149659864
|
570 |
},
|
571 |
+
"score": 0.5933646096596763,
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.6207188447832841,
|
578 |
"f1_cars": 0.8089887640449438,
|
579 |
"f1_windows x": 0.06153846153846154,
|
580 |
+
"f1_computer graphics": 0.5591397849462365,
|
581 |
"f1_atheism": 0.1951219512195122,
|
582 |
+
"f1_christianity": 0.8148148148148148,
|
583 |
+
"f1_religion": 0.16666666666666666,
|
584 |
+
"f1_medicine": 0.8409090909090909,
|
585 |
+
"f1_microsoft windows": 0.7115384615384616,
|
586 |
"f1_middle east": 0.6666666666666666,
|
587 |
"f1_motorcycles": 0.7619047619047619,
|
588 |
+
"f1_politics": 0.3709677419354839,
|
589 |
+
"f1_pc hardware": 0.6524822695035462,
|
590 |
+
"f1_mac hardware": 0.7169811320754716,
|
591 |
+
"f1_electronics": 0.6746987951807228,
|
592 |
+
"f1_for sale": 0.6451612903225806,
|
593 |
+
"f1_guns": 0.40540540540540543,
|
594 |
"f1_space": 0.82,
|
595 |
+
"f1_cryptography": 0.684931506849315,
|
596 |
+
"f1_baseball": 0.9090909090909091,
|
597 |
+
"f1_hockey": 0.9473684210526315,
|
598 |
+
"f1_macro_ci_low": 0.5972493284306833,
|
599 |
+
"f1_macro_ci_high": 0.6520732498423311,
|
|
|
600 |
"score_name": "f1_micro",
|
601 |
+
"score": 0.6644808743169399,
|
602 |
+
"score_ci_high": 0.6954593267547653,
|
603 |
+
"score_ci_low": 0.6374402731127434,
|
604 |
"num_of_instances": 1000,
|
605 |
"accuracy": 0.608,
|
606 |
+
"accuracy_ci_low": 0.58,
|
607 |
+
"accuracy_ci_high": 0.64,
|
608 |
+
"f1_micro": 0.6644808743169399,
|
609 |
+
"f1_micro_ci_low": 0.6374402731127434,
|
610 |
+
"f1_micro_ci_high": 0.6954593267547653
|
611 |
},
|
612 |
+
"score": 0.6644808743169399,
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.707429477184356,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9267935578330894,
|
620 |
+
"f1_checking or savings account": 0.7964601769911505,
|
621 |
+
"f1_debt collection": 0.5952380952380952,
|
622 |
+
"f1_credit card or prepaid card": 0.7777777777777778,
|
623 |
+
"f1_mortgage": 0.8611111111111112,
|
624 |
+
"f1_student loan": 0.8125,
|
625 |
+
"f1_money transfer or virtual currency or money service": 0.851063829787234,
|
626 |
+
"f1_vehicle loan or lease": 0.5641025641025641,
|
627 |
+
"f1_payday loan or title loan or personal loan": 0.18181818181818182,
|
628 |
+
"f1_macro_ci_low": 0.6648851525959504,
|
629 |
+
"f1_macro_ci_high": 0.7723728512116876,
|
630 |
"score_name": "f1_micro",
|
631 |
+
"score": 0.8642350557244174,
|
632 |
+
"score_ci_high": 0.8836251312776043,
|
633 |
+
"score_ci_low": 0.843700754195778,
|
634 |
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.853,
|
636 |
+
"accuracy_ci_low": 0.83,
|
637 |
+
"accuracy_ci_high": 0.873,
|
638 |
+
"f1_micro": 0.8642350557244174,
|
639 |
+
"f1_micro_ci_low": 0.843700754195778,
|
640 |
+
"f1_micro_ci_high": 0.8836251312776043
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.7776035677272286,
|
644 |
+
"f1_mortgages and loans": 0.8491620111731844,
|
645 |
+
"f1_credit card": 0.8491620111731844,
|
646 |
+
"f1_debt collection": 0.7,
|
647 |
"f1_credit reporting": 0.752851711026616,
|
648 |
+
"f1_retail banking": 0.7368421052631579,
|
649 |
+
"f1_macro_ci_low": 0.7421964970208773,
|
650 |
+
"f1_macro_ci_high": 0.8128714170953505,
|
651 |
"score_name": "f1_micro",
|
652 |
"score": 0.7741273100616016,
|
653 |
+
"score_ci_high": 0.808137127901691,
|
654 |
+
"score_ci_low": 0.7371050801783955,
|
655 |
"num_of_instances": 500,
|
656 |
"accuracy": 0.754,
|
657 |
"accuracy_ci_low": 0.716,
|
658 |
+
"accuracy_ci_high": 0.79,
|
659 |
"f1_micro": 0.7741273100616016,
|
660 |
+
"f1_micro_ci_low": 0.7371050801783955,
|
661 |
+
"f1_micro_ci_high": 0.808137127901691
|
662 |
},
|
663 |
+
"score": 0.8191811828930096,
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.178,
|
671 |
+
"score": 0.178,
|
|
|
672 |
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.158,
|
674 |
+
"program_accuracy_ci_low": 0.156,
|
675 |
+
"program_accuracy_ci_high": 0.201,
|
676 |
+
"score_ci_low": 0.156,
|
677 |
+
"score_ci_high": 0.201,
|
678 |
+
"execution_accuracy_ci_low": 0.136,
|
679 |
+
"execution_accuracy_ci_high": 0.18
|
680 |
},
|
681 |
+
"score": 0.178,
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3443634549583665,
|
688 |
+
"recall": 0.5541949627099935,
|
689 |
+
"f1": 0.3584271235061802,
|
690 |
+
"precision_ci_low": 0.3209963473848064,
|
691 |
+
"precision_ci_high": 0.36733937026281865,
|
692 |
+
"recall_ci_low": 0.5380844738077261,
|
693 |
+
"recall_ci_high": 0.5700354927476148,
|
694 |
+
"f1_ci_low": 0.3384679590296986,
|
695 |
+
"f1_ci_high": 0.3789111960225323,
|
696 |
"score_name": "f1",
|
697 |
+
"score": 0.3584271235061802,
|
698 |
+
"score_ci_high": 0.3789111960225323,
|
699 |
+
"score_ci_low": 0.3384679590296986,
|
700 |
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6108398385842642,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6847132598857085,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.573952042584618,
|
704 |
+
"faithfullness_f1_token_overlap": 0.293473712088948,
|
705 |
+
"faithfullness_recall_token_overlap": 0.2148085102518536,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5936738226108105,
|
707 |
+
"correctness_f1_token_overlap": 0.3584271235061802,
|
708 |
+
"correctness_recall_token_overlap": 0.5541949627099935,
|
709 |
+
"correctness_precision_token_overlap": 0.3443634549583665
|
710 |
},
|
711 |
+
"score": 0.3584271235061802,
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
+
"accuracy": 0.554,
|
718 |
+
"accuracy_ci_low": 0.5224818891638672,
|
719 |
+
"accuracy_ci_high": 0.585,
|
720 |
"score_name": "accuracy",
|
721 |
+
"score": 0.554,
|
722 |
+
"score_ci_high": 0.585,
|
723 |
+
"score_ci_low": 0.5224818891638672,
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
+
"accuracy": 0.786,
|
728 |
+
"accuracy_ci_low": 0.752,
|
729 |
+
"accuracy_ci_high": 0.8217351190918832,
|
730 |
"score_name": "accuracy",
|
731 |
+
"score": 0.786,
|
732 |
+
"score_ci_high": 0.8217351190918832,
|
733 |
+
"score_ci_low": 0.752,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
+
"score": 0.67,
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
"severity_critical": 0.0,
|
743 |
+
"severity_high": 0.0,
|
744 |
+
"severity_medium": 4.0,
|
745 |
"severity_low": 96.0,
|
746 |
+
"category_harmful_info": 0.8561736831500655,
|
747 |
+
"category_deception": 0.9073135512645798,
|
748 |
+
"category_explicit_content": 0.7910330542410263,
|
749 |
+
"category_violence": 0.80168207176621,
|
750 |
+
"category_discrimination": 0.8618445271581766,
|
751 |
+
"category_substance_abuse": 0.8289476109875572,
|
752 |
+
"category_pii": 0.8585629948863277,
|
753 |
+
"safety": 0.8346688703028485,
|
754 |
+
"safety_ci_low": 0.8100697875896787,
|
755 |
+
"safety_ci_high": 0.8557748119272056,
|
756 |
"score_name": "safety",
|
757 |
+
"score": 0.8346688703028485,
|
758 |
+
"score_ci_high": 0.8557748119272056,
|
759 |
+
"score_ci_low": 0.8100697875896787,
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
+
"score": 0.8346688703028485,
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
+
"rougeLsum": 0.37773185785838354,
|
770 |
+
"rougeL": 0.3129919661485272,
|
771 |
+
"score": 0.3129919661485272,
|
772 |
"score_name": "rougeL",
|
773 |
+
"rouge1": 0.43240920102765396,
|
774 |
+
"rouge2": 0.22703955569027665,
|
775 |
+
"rougeLsum_ci_low": 0.36830039067564885,
|
776 |
+
"rougeLsum_ci_high": 0.3873179684384486,
|
777 |
+
"rougeL_ci_low": 0.3049039054154354,
|
778 |
+
"rougeL_ci_high": 0.32111736301049143,
|
779 |
+
"score_ci_low": 0.3049039054154354,
|
780 |
+
"score_ci_high": 0.32111736301049143,
|
781 |
+
"rouge1_ci_low": 0.4218532055678653,
|
782 |
+
"rouge1_ci_high": 0.4422878104066809,
|
783 |
+
"rouge2_ci_low": 0.21929685324826398,
|
784 |
+
"rouge2_ci_high": 0.23594489559189138
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
+
"rougeLsum": 0.10580614433716286,
|
789 |
+
"rougeL": 0.09291382159031186,
|
790 |
+
"score": 0.09291382159031186,
|
791 |
"score_name": "rougeL",
|
792 |
+
"rouge1": 0.1292682365835658,
|
793 |
+
"rouge2": 0.01895410897411973,
|
794 |
+
"rougeLsum_ci_low": 0.10084383699373096,
|
795 |
+
"rougeLsum_ci_high": 0.10979495591939617,
|
796 |
+
"rougeL_ci_low": 0.0886493583320966,
|
797 |
+
"rougeL_ci_high": 0.09629333556794349,
|
798 |
+
"score_ci_low": 0.0886493583320966,
|
799 |
+
"score_ci_high": 0.09629333556794349,
|
800 |
+
"rouge1_ci_low": 0.1233156289283472,
|
801 |
+
"rouge1_ci_high": 0.13423125610698836,
|
802 |
+
"rouge2_ci_low": 0.017054021331647896,
|
803 |
+
"rouge2_ci_high": 0.02104099399428594
|
804 |
},
|
805 |
+
"score": 0.20295289386941953,
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
+
1256,
|
814 |
+
809,
|
815 |
+
561,
|
816 |
+
393
|
817 |
],
|
818 |
"totals": [
|
819 |
+
1822,
|
820 |
+
1756,
|
821 |
+
1690,
|
822 |
+
1624
|
823 |
],
|
824 |
"precisions": [
|
825 |
+
0.6893523600439078,
|
826 |
+
0.4607061503416856,
|
827 |
+
0.3319526627218935,
|
828 |
+
0.2419950738916256
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
+
"sys_len": 1822,
|
832 |
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.39965660032074374,
|
834 |
+
"score": 0.39965660032074374,
|
835 |
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.34477468044834286,
|
837 |
+
"score_ci_high": 0.4416384677344608,
|
838 |
+
"sacrebleu_ci_low": 0.34477468044834286,
|
839 |
+
"sacrebleu_ci_high": 0.4416384677344608
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
+
1282,
|
845 |
+
858,
|
846 |
+
611,
|
847 |
+
439
|
848 |
],
|
849 |
"totals": [
|
850 |
+
1827,
|
851 |
+
1761,
|
852 |
+
1695,
|
853 |
+
1629
|
854 |
],
|
855 |
"precisions": [
|
856 |
+
0.7016967706622879,
|
857 |
+
0.48722316865417375,
|
858 |
+
0.36047197640117995,
|
859 |
+
0.26949048496009825
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
+
"sys_len": 1827,
|
863 |
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.42689698575484597,
|
865 |
+
"score": 0.42689698575484597,
|
866 |
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.3808995454883676,
|
868 |
+
"score_ci_high": 0.47419332084644833,
|
869 |
+
"sacrebleu_ci_low": 0.3808995454883676,
|
870 |
+
"sacrebleu_ci_high": 0.47419332084644833
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
+
974,
|
876 |
+
591,
|
877 |
+
379,
|
878 |
+
251
|
879 |
],
|
880 |
"totals": [
|
881 |
+
1588,
|
882 |
+
1522,
|
883 |
+
1456,
|
884 |
+
1390
|
885 |
],
|
886 |
"precisions": [
|
887 |
+
0.6133501259445844,
|
888 |
+
0.38830486202365305,
|
889 |
+
0.2603021978021978,
|
890 |
+
0.18057553956834532
|
891 |
],
|
892 |
+
"bp": 0.9993704753119519,
|
893 |
+
"sys_len": 1588,
|
894 |
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.3250730946308182,
|
896 |
+
"score": 0.3250730946308182,
|
897 |
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.28933963986645983,
|
899 |
+
"score_ci_high": 0.36869689361591035,
|
900 |
+
"sacrebleu_ci_low": 0.28933963986645983,
|
901 |
+
"sacrebleu_ci_high": 0.36869689361591035
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
+
1257,
|
907 |
+
811,
|
908 |
+
576,
|
909 |
+
437
|
910 |
],
|
911 |
"totals": [
|
912 |
+
1815,
|
913 |
+
1749,
|
914 |
+
1683,
|
915 |
+
1617
|
916 |
],
|
917 |
"precisions": [
|
918 |
+
0.6925619834710744,
|
919 |
+
0.46369353916523726,
|
920 |
+
0.34224598930481287,
|
921 |
+
0.2702535559678417
|
922 |
],
|
923 |
+
"bp": 0.98904120617152,
|
924 |
+
"sys_len": 1815,
|
925 |
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.41059556612028536,
|
927 |
+
"score": 0.41059556612028536,
|
928 |
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.35457040262235134,
|
930 |
+
"score_ci_high": 0.459990812104818,
|
931 |
+
"sacrebleu_ci_low": 0.35457040262235134,
|
932 |
+
"sacrebleu_ci_high": 0.459990812104818
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
+
1601,
|
938 |
+
1237,
|
939 |
+
986,
|
940 |
+
807
|
941 |
],
|
942 |
"totals": [
|
943 |
+
2017,
|
944 |
+
1951,
|
945 |
+
1885,
|
946 |
+
1819
|
947 |
],
|
948 |
"precisions": [
|
949 |
+
0.7937530986613782,
|
950 |
+
0.6340338288057407,
|
951 |
+
0.5230769230769231,
|
952 |
+
0.44365035733919733
|
953 |
],
|
954 |
+
"bp": 0.9750319133813282,
|
955 |
+
"sys_len": 2017,
|
956 |
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.5699934901400187,
|
958 |
+
"score": 0.5699934901400187,
|
959 |
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.5326539789859114,
|
961 |
+
"score_ci_high": 0.6234642421169655,
|
962 |
+
"sacrebleu_ci_low": 0.5326539789859114,
|
963 |
+
"sacrebleu_ci_high": 0.6234642421169655
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
+
1367,
|
969 |
+
786,
|
970 |
+
494,
|
971 |
+
321
|
972 |
],
|
973 |
"totals": [
|
974 |
+
2312,
|
975 |
+
2246,
|
976 |
+
2180,
|
977 |
+
2114
|
978 |
],
|
979 |
"precisions": [
|
980 |
+
0.5912629757785467,
|
981 |
+
0.34995547640249336,
|
982 |
+
0.22660550458715598,
|
983 |
+
0.15184484389782404
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
+
"sys_len": 2312,
|
987 |
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.2904798394859776,
|
989 |
+
"score": 0.2904798394859776,
|
990 |
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.25673627398166365,
|
992 |
+
"score_ci_high": 0.3237414974447857,
|
993 |
+
"sacrebleu_ci_low": 0.25673627398166365,
|
994 |
+
"sacrebleu_ci_high": 0.3237414974447857
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
+
1458,
|
1000 |
+
1061,
|
1001 |
+
822,
|
1002 |
+
653
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
+
1897,
|
1006 |
+
1831,
|
1007 |
+
1765,
|
1008 |
+
1699
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
+
0.768581971534001,
|
1012 |
+
0.5794647733478974,
|
1013 |
+
0.4657223796033994,
|
1014 |
+
0.38434373160682755
|
1015 |
],
|
1016 |
+
"bp": 0.9900341767854584,
|
1017 |
+
"sys_len": 1897,
|
1018 |
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.5260671977764972,
|
1020 |
+
"score": 0.5260671977764972,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.475872231232826,
|
1023 |
+
"score_ci_high": 0.5725271086507513,
|
1024 |
+
"sacrebleu_ci_low": 0.475872231232826,
|
1025 |
+
"sacrebleu_ci_high": 0.5725271086507513
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
1404,
|
1031 |
+
1006,
|
1032 |
+
750,
|
1033 |
+
558
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
+
1938,
|
1037 |
+
1872,
|
1038 |
+
1806,
|
1039 |
+
1740
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
+
0.7244582043343654,
|
1043 |
+
0.5373931623931624,
|
1044 |
+
0.4152823920265781,
|
1045 |
+
0.3206896551724138
|
1046 |
],
|
1047 |
+
"bp": 0.994340123204573,
|
1048 |
+
"sys_len": 1938,
|
1049 |
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.47448058787166153,
|
1051 |
+
"score": 0.47448058787166153,
|
1052 |
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.44387053770037466,
|
1054 |
+
"score_ci_high": 0.51824756405881,
|
1055 |
+
"sacrebleu_ci_low": 0.44387053770037466,
|
1056 |
+
"sacrebleu_ci_high": 0.51824756405881
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
+
1267,
|
1062 |
+
721,
|
1063 |
+
443,
|
1064 |
+
269
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
+
1960,
|
1068 |
+
1894,
|
1069 |
+
1828,
|
1070 |
+
1762
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
+
0.6464285714285714,
|
1074 |
+
0.38067581837381204,
|
1075 |
+
0.24234135667396062,
|
1076 |
+
0.15266742338251987
|
1077 |
],
|
1078 |
+
"bp": 0.932013328656422,
|
1079 |
+
"sys_len": 1960,
|
1080 |
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.28789528964668276,
|
1082 |
+
"score": 0.28789528964668276,
|
1083 |
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.2550898701517229,
|
1085 |
+
"score_ci_high": 0.32315043281050926,
|
1086 |
+
"sacrebleu_ci_low": 0.2550898701517229,
|
1087 |
+
"sacrebleu_ci_high": 0.32315043281050926
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
+
1342,
|
1093 |
+
940,
|
1094 |
+
684,
|
1095 |
+
495
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
+
1861,
|
1099 |
+
1795,
|
1100 |
+
1729,
|
1101 |
+
1663
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
+
0.7211176786673832,
|
1105 |
+
0.5236768802228412,
|
1106 |
+
0.39560439560439564,
|
1107 |
+
0.29765484064942876
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
+
"sys_len": 1861,
|
1111 |
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.45920953842389034,
|
1113 |
+
"score": 0.45920953842389034,
|
1114 |
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.4189257677908612,
|
1116 |
+
"score_ci_high": 0.5012078702279882,
|
1117 |
+
"sacrebleu_ci_low": 0.4189257677908612,
|
1118 |
+
"sacrebleu_ci_high": 0.5012078702279882
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
+
1124,
|
1124 |
+
603,
|
1125 |
+
358,
|
1126 |
+
209
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
+
1910,
|
1130 |
+
1844,
|
1131 |
+
1778,
|
1132 |
+
1712
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
+
0.5884816753926702,
|
1136 |
+
0.32700650759219085,
|
1137 |
+
0.20134983127109113,
|
1138 |
+
0.12207943925233644
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
+
"sys_len": 1910,
|
1142 |
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.26225319254842555,
|
1144 |
+
"score": 0.26225319254842555,
|
1145 |
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.2289709766377336,
|
1147 |
+
"score_ci_high": 0.30157417779677487,
|
1148 |
+
"sacrebleu_ci_low": 0.2289709766377336,
|
1149 |
+
"sacrebleu_ci_high": 0.30157417779677487
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
+
1117,
|
1155 |
+
625,
|
1156 |
+
396,
|
1157 |
+
257
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
+
1801,
|
1161 |
+
1735,
|
1162 |
+
1669,
|
1163 |
+
1603
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
+
0.6202109938922821,
|
1167 |
+
0.36023054755043227,
|
1168 |
+
0.2372678250449371,
|
1169 |
+
0.1603243917654398
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
+
"sys_len": 1801,
|
1173 |
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.3036264572339376,
|
1175 |
+
"score": 0.3036264572339376,
|
1176 |
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.2714407091986394,
|
1178 |
+
"score_ci_high": 0.36938210174908737,
|
1179 |
+
"sacrebleu_ci_low": 0.2714407091986394,
|
1180 |
+
"sacrebleu_ci_high": 0.36938210174908737
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
+
1283,
|
1186 |
+
897,
|
1187 |
+
669,
|
1188 |
+
501
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
+
2053,
|
1192 |
+
1987,
|
1193 |
+
1921,
|
1194 |
+
1855
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
+
0.6249391134924501,
|
1198 |
+
0.451434323100151,
|
1199 |
+
0.34825611660593436,
|
1200 |
+
0.27008086253369273
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
+
"sys_len": 2053,
|
1204 |
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.40360469717634034,
|
1206 |
+
"score": 0.40360469717634034,
|
1207 |
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.22825303083161255,
|
1209 |
+
"score_ci_high": 0.4880479254926776,
|
1210 |
+
"sacrebleu_ci_low": 0.22825303083161255,
|
1211 |
+
"sacrebleu_ci_high": 0.4880479254926776
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
1324,
|
1217 |
+
929,
|
1218 |
+
672,
|
1219 |
+
489
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
+
2052,
|
1223 |
+
1986,
|
1224 |
+
1920,
|
1225 |
+
1854
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
+
0.645224171539961,
|
1229 |
+
0.4677744209466264,
|
1230 |
+
0.35,
|
1231 |
+
0.2637540453074434
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
+
"sys_len": 2052,
|
1235 |
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.4085578581131045,
|
1237 |
+
"score": 0.4085578581131045,
|
1238 |
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.30163750041734844,
|
1240 |
+
"score_ci_high": 0.4725760173281261,
|
1241 |
+
"sacrebleu_ci_low": 0.30163750041734844,
|
1242 |
+
"sacrebleu_ci_high": 0.4725760173281261
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
+
1196,
|
1248 |
+
676,
|
1249 |
+
416,
|
1250 |
+
253
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
+
1927,
|
1254 |
+
1861,
|
1255 |
+
1795,
|
1256 |
+
1729
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
+
0.6206538661131292,
|
1260 |
+
0.3632455668995164,
|
1261 |
+
0.23175487465181058,
|
1262 |
+
0.14632735685367262
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
+
"sys_len": 1927,
|
1266 |
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.2956998119625713,
|
1268 |
+
"score": 0.2956998119625713,
|
1269 |
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.25139539620137036,
|
1271 |
+
"score_ci_high": 0.32686978566283265,
|
1272 |
+
"sacrebleu_ci_low": 0.25139539620137036,
|
1273 |
+
"sacrebleu_ci_high": 0.32686978566283265
|
1274 |
},
|
1275 |
+
"score": 0.38960601381372006,
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
+
"score": 0.5094379735715554,
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
results/bluebench/{2025-06-21T08-38-27_evaluation_results.json → 2025-06-23T08-43-46_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
-
"timestamp_utc": "2025-06-
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
@@ -8,7 +8,7 @@
|
|
8 |
"--model",
|
9 |
"cross_provider",
|
10 |
"--model_args",
|
11 |
-
"model_name=watsonx/
|
12 |
"--output_path",
|
13 |
"./results/bluebench",
|
14 |
"--log_samples",
|
@@ -26,7 +26,7 @@
|
|
26 |
"num_fewshots": null,
|
27 |
"limit": null,
|
28 |
"batch_size": 8,
|
29 |
-
"model": "watsonx/
|
30 |
"model_args": {
|
31 |
"max_tokens": 256
|
32 |
},
|
@@ -42,7 +42,7 @@
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
-
"unitxt_commit_hash": "
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
@@ -176,23 +176,23 @@
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
-
"accuracy": 0.
|
180 |
-
"accuracy_ci_low": 0.
|
181 |
-
"accuracy_ci_high": 0.
|
182 |
"score_name": "accuracy",
|
183 |
-
"score": 0.
|
184 |
-
"score_ci_high": 0.
|
185 |
-
"score_ci_low": 0.
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
-
"accuracy": 0.
|
190 |
-
"accuracy_ci_low": 0.
|
191 |
-
"accuracy_ci_high": 0.
|
192 |
"score_name": "accuracy",
|
193 |
-
"score": 0.
|
194 |
-
"score_ci_high": 0.
|
195 |
-
"score_ci_low": 0.
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
@@ -206,166 +206,186 @@
|
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
-
"accuracy": 0.
|
210 |
-
"accuracy_ci_low": 0.
|
211 |
-
"accuracy_ci_high": 0.
|
212 |
"score_name": "accuracy",
|
213 |
-
"score": 0.
|
214 |
-
"score_ci_high": 0.
|
215 |
-
"score_ci_low": 0.
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
-
"accuracy": 0.
|
220 |
-
"accuracy_ci_low": 0.
|
221 |
-
"accuracy_ci_high": 0.
|
222 |
"score_name": "accuracy",
|
223 |
-
"score": 0.
|
224 |
-
"score_ci_high": 0.
|
225 |
-
"score_ci_low": 0.
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
-
"accuracy": 0.
|
230 |
-
"accuracy_ci_low": 0.
|
231 |
-
"accuracy_ci_high": 0.
|
232 |
"score_name": "accuracy",
|
233 |
-
"score": 0.
|
234 |
-
"score_ci_high": 0.
|
235 |
-
"score_ci_low": 0.
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
-
"accuracy": 0.
|
240 |
-
"accuracy_ci_low": 0.
|
241 |
-
"accuracy_ci_high":
|
242 |
"score_name": "accuracy",
|
243 |
-
"score": 0.
|
244 |
-
"score_ci_high":
|
245 |
-
"score_ci_low": 0.
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
-
"accuracy": 0.
|
250 |
-
"accuracy_ci_low": 0.
|
251 |
-
"accuracy_ci_high": 0.
|
252 |
"score_name": "accuracy",
|
253 |
-
"score": 0.
|
254 |
-
"score_ci_high": 0.
|
255 |
-
"score_ci_low": 0.
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
-
"accuracy": 0.
|
260 |
-
"accuracy_ci_low": 0.
|
261 |
-
"accuracy_ci_high": 0.
|
262 |
"score_name": "accuracy",
|
263 |
-
"score": 0.
|
264 |
-
"score_ci_high": 0.
|
265 |
-
"score_ci_low": 0.
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
-
"accuracy": 0.
|
270 |
-
"accuracy_ci_low": 0.
|
271 |
-
"accuracy_ci_high": 0.
|
272 |
"score_name": "accuracy",
|
273 |
-
"score": 0.
|
274 |
-
"score_ci_high": 0.
|
275 |
-
"score_ci_low": 0.
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
-
"accuracy": 0.
|
280 |
-
"accuracy_ci_low": 0.
|
281 |
-
"accuracy_ci_high": 0.
|
282 |
"score_name": "accuracy",
|
283 |
-
"score": 0.
|
284 |
-
"score_ci_high": 0.
|
285 |
-
"score_ci_low": 0.
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
-
"score": 0.
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
296 |
-
"score": 0.
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
-
"score": 0.
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
-
"f1_Person": 0.
|
307 |
-
"f1_Organization": 0.
|
308 |
-
"f1_Location": 0.
|
309 |
-
"f1_macro": 0.
|
310 |
-
"recall_macro": 0.
|
311 |
-
"precision_macro": 0.
|
312 |
-
"in_classes_support": 0.
|
313 |
-
"f1_micro": 0.
|
314 |
-
"recall_micro": 0.
|
315 |
-
"precision_micro": 0.
|
316 |
-
"score": 0.
|
317 |
"score_name": "f1_micro",
|
318 |
-
"score_ci_low": 0.
|
319 |
-
"score_ci_high": 0.
|
320 |
-
"f1_micro_ci_low": 0.
|
321 |
-
"f1_micro_ci_high": 0.
|
322 |
},
|
323 |
-
"score": 0.
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
-
"accuracy": 0.
|
330 |
-
"accuracy_ci_low": 0.
|
331 |
-
"accuracy_ci_high": 0.
|
332 |
"score_name": "accuracy",
|
333 |
-
"score": 0.
|
334 |
-
"score_ci_high": 0.
|
335 |
-
"score_ci_low": 0.
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
-
"accuracy": 0.
|
340 |
-
"accuracy_ci_low": 0.
|
341 |
-
"accuracy_ci_high": 0.
|
342 |
"score_name": "accuracy",
|
343 |
-
"score": 0.
|
344 |
-
"score_ci_high": 0.
|
345 |
-
"score_ci_low": 0.
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
-
"accuracy": 0.
|
350 |
-
"accuracy_ci_low": 0.
|
351 |
-
"accuracy_ci_high": 0.
|
352 |
"score_name": "accuracy",
|
353 |
-
"score": 0.
|
354 |
-
"score_ci_high": 0.
|
355 |
-
"score_ci_low": 0.
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
-
"accuracy": 0.
|
360 |
-
"accuracy_ci_low": 0.
|
361 |
-
"accuracy_ci_high": 0.
|
362 |
"score_name": "accuracy",
|
363 |
-
"score": 0.
|
364 |
-
"score_ci_high": 0.
|
365 |
-
"score_ci_low": 0.
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
"accuracy": 0.4788732394366197,
|
370 |
"accuracy_ci_low": 0.36619718309859156,
|
371 |
"accuracy_ci_high": 0.5915492957746479,
|
@@ -375,17 +395,17 @@
|
|
375 |
"score_ci_low": 0.36619718309859156,
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
-
"
|
379 |
-
"accuracy": 0.
|
380 |
-
"accuracy_ci_low": 0.
|
381 |
-
"accuracy_ci_high": 0.
|
382 |
"score_name": "accuracy",
|
383 |
-
"score": 0.
|
384 |
-
"score_ci_high": 0.
|
385 |
-
"score_ci_low": 0.
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
-
"
|
389 |
"accuracy": 0.43661971830985913,
|
390 |
"accuracy_ci_low": 0.323943661971831,
|
391 |
"accuracy_ci_high": 0.5492957746478874,
|
@@ -395,414 +415,394 @@
|
|
395 |
"score_ci_low": 0.323943661971831,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
-
"mmlu_pro_history": {
|
399 |
-
"accuracy": 0.5070422535211268,
|
400 |
-
"accuracy_ci_low": 0.39436619718309857,
|
401 |
-
"accuracy_ci_high": 0.6197183098591549,
|
402 |
-
"score_name": "accuracy",
|
403 |
-
"score": 0.5070422535211268,
|
404 |
-
"score_ci_high": 0.6197183098591549,
|
405 |
-
"score_ci_low": 0.39436619718309857,
|
406 |
-
"num_of_instances": 71
|
407 |
-
},
|
408 |
-
"mmlu_pro_law": {
|
409 |
-
"accuracy": 0.28169014084507044,
|
410 |
-
"accuracy_ci_low": 0.18309859154929578,
|
411 |
-
"accuracy_ci_high": 0.39436619718309857,
|
412 |
-
"score_name": "accuracy",
|
413 |
-
"score": 0.28169014084507044,
|
414 |
-
"score_ci_high": 0.39436619718309857,
|
415 |
-
"score_ci_low": 0.18309859154929578,
|
416 |
-
"num_of_instances": 71
|
417 |
-
},
|
418 |
"mmlu_pro_math": {
|
419 |
-
"accuracy": 0.
|
420 |
-
"accuracy_ci_low": 0.
|
421 |
-
"accuracy_ci_high": 0.
|
422 |
"score_name": "accuracy",
|
423 |
-
"score": 0.
|
424 |
-
"score_ci_high": 0.
|
425 |
-
"score_ci_low": 0.
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
-
"accuracy": 0.
|
430 |
-
"accuracy_ci_low": 0.
|
431 |
-
"accuracy_ci_high": 0.
|
432 |
"score_name": "accuracy",
|
433 |
-
"score": 0.
|
434 |
-
"score_ci_high": 0.
|
435 |
-
"score_ci_low": 0.
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
-
"accuracy": 0.
|
440 |
-
"accuracy_ci_low": 0.
|
441 |
-
"accuracy_ci_high": 0.
|
442 |
"score_name": "accuracy",
|
443 |
-
"score": 0.
|
444 |
-
"score_ci_high": 0.
|
445 |
-
"score_ci_low": 0.
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
-
"accuracy": 0.
|
450 |
-
"accuracy_ci_low": 0.
|
451 |
-
"accuracy_ci_high": 0.
|
452 |
"score_name": "accuracy",
|
453 |
-
"score": 0.
|
454 |
-
"score_ci_high": 0.
|
455 |
-
"score_ci_low": 0.
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
-
"accuracy": 0.
|
460 |
-
"accuracy_ci_low": 0.
|
461 |
-
"accuracy_ci_high": 0.
|
462 |
"score_name": "accuracy",
|
463 |
-
"score": 0.
|
464 |
-
"score_ci_high": 0.
|
465 |
-
"score_ci_low": 0.
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
-
"score": 0.
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
-
"f1_macro": 0.
|
475 |
-
"f1_suggestive": 0.
|
476 |
-
"
|
477 |
-
"
|
478 |
-
"
|
479 |
-
"f1_descriptive": 0.
|
480 |
-
"f1_macro_ci_low": 0.
|
481 |
-
"f1_macro_ci_high": 0.
|
482 |
"score_name": "f1_micro",
|
483 |
-
"score": 0.
|
484 |
-
"score_ci_high": 0.
|
485 |
-
"score_ci_low": 0.
|
486 |
"num_of_instances": 85,
|
487 |
-
"accuracy": 0.
|
488 |
-
"accuracy_ci_low": 0.
|
489 |
-
"accuracy_ci_high": 0.
|
490 |
-
"f1_micro": 0.
|
491 |
-
"f1_micro_ci_low": 0.
|
492 |
-
"f1_micro_ci_high": 0.
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
-
"f1_macro": 0.
|
496 |
-
"f1_no": 0.
|
497 |
-
"f1_yes": 0.
|
498 |
-
"f1_macro_ci_low": 0.
|
499 |
-
"f1_macro_ci_high": 0.
|
500 |
"score_name": "f1_micro",
|
501 |
-
"score": 0.
|
502 |
-
"score_ci_high": 0.
|
503 |
-
"score_ci_low": 0.
|
504 |
"num_of_instances": 200,
|
505 |
-
"accuracy": 0.
|
506 |
-
"accuracy_ci_low": 0.
|
507 |
-
"accuracy_ci_high": 0.
|
508 |
-
"f1_micro": 0.
|
509 |
-
"f1_micro_ci_low": 0.
|
510 |
-
"f1_micro_ci_high": 0.
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
-
"f1_macro": 0.
|
514 |
-
"f1_conclusion": 0.
|
515 |
-
"
|
516 |
-
"
|
517 |
-
"f1_issue": 0.
|
518 |
-
"f1_procedural history": 0.
|
519 |
-
"f1_facts": 0.
|
520 |
-
"f1_rule": 0.
|
521 |
-
"f1_macro_ci_low": 0.
|
522 |
-
"f1_macro_ci_high": 0.
|
523 |
"score_name": "f1_micro",
|
524 |
-
"score": 0.
|
525 |
-
"score_ci_high": 0.
|
526 |
-
"score_ci_low": 0.
|
527 |
"num_of_instances": 200,
|
528 |
-
"accuracy": 0.
|
529 |
-
"accuracy_ci_low": 0.
|
530 |
-
"accuracy_ci_high": 0.
|
531 |
-
"f1_micro": 0.
|
532 |
-
"f1_micro_ci_low": 0.
|
533 |
-
"f1_micro_ci_high": 0.
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
-
"f1_macro": 0.
|
537 |
-
"f1_yes": 0.
|
538 |
-
"f1_no": 0.
|
539 |
-
"f1_macro_ci_low": 0.
|
540 |
-
"f1_macro_ci_high": 0.
|
541 |
"score_name": "f1_micro",
|
542 |
-
"score": 0.
|
543 |
-
"score_ci_high": 0.
|
544 |
-
"score_ci_low": 0.
|
545 |
"num_of_instances": 200,
|
546 |
-
"accuracy": 0.
|
547 |
-
"accuracy_ci_low": 0.
|
548 |
-
"accuracy_ci_high": 0.
|
549 |
-
"f1_micro": 0.
|
550 |
-
"f1_micro_ci_low": 0.
|
551 |
-
"f1_micro_ci_high": 0.
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
-
"f1_macro": 0.
|
555 |
-
"f1_yes": 0.
|
556 |
-
"f1_no": 0.
|
557 |
-
"f1_macro_ci_low": 0.
|
558 |
-
"f1_macro_ci_high": 0.
|
559 |
"score_name": "f1_micro",
|
560 |
-
"score": 0.
|
561 |
-
"score_ci_high": 0.
|
562 |
-
"score_ci_low": 0.
|
563 |
"num_of_instances": 85,
|
564 |
-
"accuracy": 0.
|
565 |
-
"accuracy_ci_low": 0.
|
566 |
-
"accuracy_ci_high": 0.
|
567 |
-
"f1_micro": 0.
|
568 |
-
"f1_micro_ci_low": 0.
|
569 |
-
"f1_micro_ci_high": 0.
|
570 |
},
|
571 |
-
"score": 0.
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
-
"f1_macro": 0.
|
578 |
-
"f1_cars": 0.
|
579 |
-
"f1_windows x": 0.
|
580 |
-
"
|
581 |
-
"f1_atheism": 0.
|
582 |
-
"f1_religion": 0.
|
583 |
-
"f1_medicine": 0.
|
584 |
-
"f1_christianity": 0.
|
585 |
-
"
|
586 |
-
"
|
587 |
-
"
|
588 |
-
"f1_motorcycles": 0.
|
589 |
-
"
|
590 |
-
"
|
591 |
-
"f1_electronics": 0.
|
592 |
-
"f1_for sale": 0.
|
593 |
-
"f1_guns": 0.
|
594 |
-
"f1_space": 0.
|
595 |
-
"
|
596 |
-
"
|
597 |
-
"
|
598 |
-
"f1_macro_ci_low": 0.
|
599 |
-
"f1_macro_ci_high": 0.
|
600 |
"score_name": "f1_micro",
|
601 |
-
"score": 0.
|
602 |
-
"score_ci_high": 0.
|
603 |
-
"score_ci_low": 0.
|
604 |
"num_of_instances": 1000,
|
605 |
-
"accuracy": 0.
|
606 |
-
"accuracy_ci_low": 0.
|
607 |
-
"accuracy_ci_high": 0.
|
608 |
-
"f1_micro": 0.
|
609 |
-
"f1_micro_ci_low": 0.
|
610 |
-
"f1_micro_ci_high": 0.
|
611 |
},
|
612 |
-
"score": 0.
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
-
"f1_macro": 0.
|
619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
620 |
-
"
|
621 |
-
"f1_debt collection": 0.
|
622 |
-
"
|
623 |
-
"
|
624 |
-
"
|
625 |
-
"
|
626 |
-
"
|
627 |
-
"
|
628 |
-
"f1_macro_ci_low": 0.
|
629 |
-
"f1_macro_ci_high": 0.
|
630 |
"score_name": "f1_micro",
|
631 |
-
"score": 0.
|
632 |
-
"score_ci_high": 0.
|
633 |
-
"score_ci_low": 0.
|
634 |
"num_of_instances": 1000,
|
635 |
-
"accuracy": 0.
|
636 |
-
"accuracy_ci_low": 0.
|
637 |
-
"accuracy_ci_high": 0.
|
638 |
-
"f1_micro": 0.
|
639 |
-
"f1_micro_ci_low": 0.
|
640 |
-
"f1_micro_ci_high": 0.
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
-
"f1_macro": 0.
|
644 |
-
"f1_mortgages and loans": 0.
|
645 |
-
"f1_credit card": 0.
|
646 |
-
"f1_debt collection": 0.
|
647 |
-
"f1_credit reporting": 0.
|
648 |
-
"f1_retail banking": 0.
|
649 |
-
"f1_macro_ci_low": 0.
|
650 |
-
"f1_macro_ci_high": 0.
|
651 |
"score_name": "f1_micro",
|
652 |
-
"score": 0.
|
653 |
-
"score_ci_high": 0.
|
654 |
-
"score_ci_low": 0.
|
655 |
"num_of_instances": 500,
|
656 |
-
"accuracy": 0.
|
657 |
-
"accuracy_ci_low": 0.
|
658 |
-
"accuracy_ci_high": 0.
|
659 |
-
"f1_micro": 0.
|
660 |
-
"f1_micro_ci_low": 0.
|
661 |
-
"f1_micro_ci_high": 0.
|
662 |
},
|
663 |
-
"score": 0.
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
-
"execution_accuracy": 0.
|
671 |
-
"program_accuracy": 0.
|
672 |
-
"score": 0.
|
673 |
"score_name": "program_accuracy",
|
674 |
-
"execution_accuracy_ci_low": 0.
|
675 |
-
"execution_accuracy_ci_high": 0.
|
676 |
-
"program_accuracy_ci_low": 0.
|
677 |
-
"program_accuracy_ci_high": 0.
|
678 |
-
"score_ci_low": 0.
|
679 |
-
"score_ci_high": 0.
|
680 |
},
|
681 |
-
"score": 0.
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
-
"precision": 0.
|
688 |
-
"recall": 0.
|
689 |
-
"f1": 0.
|
690 |
-
"precision_ci_low": 0.
|
691 |
-
"precision_ci_high": 0.
|
692 |
-
"recall_ci_low": 0.
|
693 |
-
"recall_ci_high": 0.
|
694 |
-
"f1_ci_low": 0.
|
695 |
-
"f1_ci_high": 0.
|
696 |
"score_name": "f1",
|
697 |
-
"score": 0.
|
698 |
-
"score_ci_high": 0.
|
699 |
-
"score_ci_low": 0.
|
700 |
"num_of_instances": 600,
|
701 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
702 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
703 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
704 |
-
"faithfullness_f1_token_overlap": 0.
|
705 |
-
"faithfullness_recall_token_overlap": 0.
|
706 |
-
"faithfullness_precision_token_overlap": 0.
|
707 |
-
"correctness_f1_token_overlap": 0.
|
708 |
-
"correctness_recall_token_overlap": 0.
|
709 |
-
"correctness_precision_token_overlap": 0.
|
710 |
},
|
711 |
-
"score": 0.
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
-
"accuracy": 0.
|
718 |
-
"accuracy_ci_low": 0.
|
719 |
-
"accuracy_ci_high": 0.
|
720 |
"score_name": "accuracy",
|
721 |
-
"score": 0.
|
722 |
-
"score_ci_high": 0.
|
723 |
-
"score_ci_low": 0.
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
-
"accuracy": 0.
|
728 |
-
"accuracy_ci_low": 0.
|
729 |
-
"accuracy_ci_high": 0.
|
730 |
"score_name": "accuracy",
|
731 |
-
"score": 0.
|
732 |
-
"score_ci_high": 0.
|
733 |
-
"score_ci_low": 0.
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
-
"score": 0.
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
-
"severity_critical":
|
743 |
"severity_high": 1.0,
|
744 |
-
"severity_medium":
|
745 |
-
"severity_low":
|
746 |
-
"category_harmful_info": 0.
|
747 |
-
"category_deception": 0.
|
748 |
-
"category_explicit_content": 0.
|
749 |
-
"category_violence": 0.
|
750 |
-
"category_discrimination": 0.
|
751 |
-
"category_substance_abuse": 0.
|
752 |
-
"category_pii": 0.
|
753 |
-
"safety": 0.
|
754 |
-
"safety_ci_low": 0.
|
755 |
-
"safety_ci_high": 0.
|
756 |
"score_name": "safety",
|
757 |
-
"score": 0.
|
758 |
-
"score_ci_high": 0.
|
759 |
-
"score_ci_low": 0.
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
-
"score": 0.
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
-
"rougeLsum": 0.
|
770 |
-
"rouge2": 0.
|
771 |
-
"rougeL": 0.
|
772 |
-
"score": 0.
|
773 |
"score_name": "rougeL",
|
774 |
-
"rouge1": 0.
|
775 |
-
"rougeLsum_ci_low": 0.
|
776 |
-
"rougeLsum_ci_high": 0.
|
777 |
-
"rouge2_ci_low": 0.
|
778 |
-
"rouge2_ci_high": 0.
|
779 |
-
"rougeL_ci_low": 0.
|
780 |
-
"rougeL_ci_high": 0.
|
781 |
-
"score_ci_low": 0.
|
782 |
-
"score_ci_high": 0.
|
783 |
-
"rouge1_ci_low": 0.
|
784 |
-
"rouge1_ci_high": 0.
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
-
"rougeLsum": 0.
|
789 |
-
"rouge2": 0.
|
790 |
-
"rougeL": 0.
|
791 |
-
"score": 0.
|
792 |
"score_name": "rougeL",
|
793 |
-
"rouge1": 0.
|
794 |
-
"rougeLsum_ci_low": 0.
|
795 |
-
"rougeLsum_ci_high": 0.
|
796 |
-
"rouge2_ci_low": 0.
|
797 |
-
"rouge2_ci_high": 0.
|
798 |
-
"rougeL_ci_low": 0.
|
799 |
-
"rougeL_ci_high": 0.
|
800 |
-
"score_ci_low": 0.
|
801 |
-
"score_ci_high": 0.
|
802 |
-
"rouge1_ci_low": 0.
|
803 |
-
"rouge1_ci_high": 0.
|
804 |
},
|
805 |
-
"score": 0.
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
@@ -810,473 +810,473 @@
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
],
|
818 |
"totals": [
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
],
|
824 |
"precisions": [
|
825 |
-
0.
|
826 |
-
0.
|
827 |
-
0.
|
828 |
-
0.
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
-
"sys_len":
|
832 |
"ref_len": 1734,
|
833 |
-
"sacrebleu": 0.
|
834 |
-
"score": 0.
|
835 |
"score_name": "sacrebleu",
|
836 |
-
"score_ci_low": 0.
|
837 |
-
"score_ci_high": 0.
|
838 |
-
"sacrebleu_ci_low": 0.
|
839 |
-
"sacrebleu_ci_high": 0.
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
|
848 |
],
|
849 |
"totals": [
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
],
|
855 |
"precisions": [
|
856 |
-
0.
|
857 |
-
0.
|
858 |
-
0.
|
859 |
-
0.
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
-
"sys_len":
|
863 |
"ref_len": 1734,
|
864 |
-
"sacrebleu": 0.
|
865 |
-
"score": 0.
|
866 |
"score_name": "sacrebleu",
|
867 |
-
"score_ci_low": 0.
|
868 |
-
"score_ci_high": 0.
|
869 |
-
"sacrebleu_ci_low": 0.
|
870 |
-
"sacrebleu_ci_high": 0.
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
-
|
876 |
-
|
877 |
-
|
878 |
-
|
879 |
],
|
880 |
"totals": [
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
],
|
886 |
"precisions": [
|
887 |
-
0.
|
888 |
-
0.
|
889 |
-
0.
|
890 |
-
0.
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
-
"sys_len":
|
894 |
"ref_len": 1589,
|
895 |
-
"sacrebleu": 0.
|
896 |
-
"score": 0.
|
897 |
"score_name": "sacrebleu",
|
898 |
-
"score_ci_low": 0.
|
899 |
-
"score_ci_high": 0.
|
900 |
-
"sacrebleu_ci_low": 0.
|
901 |
-
"sacrebleu_ci_high": 0.
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
-
|
907 |
-
|
908 |
-
|
909 |
-
|
910 |
],
|
911 |
"totals": [
|
912 |
-
|
913 |
-
|
914 |
-
|
915 |
-
|
916 |
],
|
917 |
"precisions": [
|
918 |
-
0.
|
919 |
-
0.
|
920 |
-
0.
|
921 |
-
0.
|
922 |
],
|
923 |
"bp": 1.0,
|
924 |
-
"sys_len":
|
925 |
"ref_len": 1835,
|
926 |
-
"sacrebleu": 0.
|
927 |
-
"score": 0.
|
928 |
"score_name": "sacrebleu",
|
929 |
-
"score_ci_low": 0.
|
930 |
-
"score_ci_high": 0.
|
931 |
-
"sacrebleu_ci_low": 0.
|
932 |
-
"sacrebleu_ci_high": 0.
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
],
|
942 |
"totals": [
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
],
|
948 |
"precisions": [
|
949 |
-
0.
|
950 |
-
0.
|
951 |
-
0.
|
952 |
-
0.
|
953 |
],
|
954 |
-
"bp": 0
|
955 |
-
"sys_len":
|
956 |
"ref_len": 2068,
|
957 |
-
"sacrebleu": 0.
|
958 |
-
"score": 0.
|
959 |
"score_name": "sacrebleu",
|
960 |
-
"score_ci_low": 0.
|
961 |
-
"score_ci_high": 0.
|
962 |
-
"sacrebleu_ci_low": 0.
|
963 |
-
"sacrebleu_ci_high": 0.
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
-
|
969 |
-
|
970 |
-
|
971 |
-
|
972 |
],
|
973 |
"totals": [
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
],
|
979 |
"precisions": [
|
980 |
-
0.
|
981 |
-
0.
|
982 |
-
0.
|
983 |
-
0.
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
-
"sys_len":
|
987 |
"ref_len": 2235,
|
988 |
-
"sacrebleu": 0.
|
989 |
-
"score": 0.
|
990 |
"score_name": "sacrebleu",
|
991 |
-
"score_ci_low": 0.
|
992 |
-
"score_ci_high": 0.
|
993 |
-
"sacrebleu_ci_low": 0.
|
994 |
-
"sacrebleu_ci_high": 0.
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
-
|
1000 |
-
|
1001 |
-
|
1002 |
-
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
-
|
1006 |
-
|
1007 |
-
|
1008 |
-
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
-
0.
|
1012 |
-
0.
|
1013 |
-
0.
|
1014 |
-
0.
|
1015 |
],
|
1016 |
"bp": 1.0,
|
1017 |
-
"sys_len":
|
1018 |
"ref_len": 1916,
|
1019 |
-
"sacrebleu": 0.
|
1020 |
-
"score": 0.
|
1021 |
"score_name": "sacrebleu",
|
1022 |
-
"score_ci_low": 0.
|
1023 |
-
"score_ci_high": 0.
|
1024 |
-
"sacrebleu_ci_low": 0.
|
1025 |
-
"sacrebleu_ci_high": 0.
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
-
|
1031 |
-
|
1032 |
-
|
1033 |
-
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
-
0.
|
1043 |
-
0.
|
1044 |
-
0.
|
1045 |
-
0.
|
1046 |
],
|
1047 |
-
"bp": 0
|
1048 |
-
"sys_len":
|
1049 |
"ref_len": 1949,
|
1050 |
-
"sacrebleu": 0.
|
1051 |
-
"score": 0.
|
1052 |
"score_name": "sacrebleu",
|
1053 |
-
"score_ci_low": 0.
|
1054 |
-
"score_ci_high": 0.
|
1055 |
-
"sacrebleu_ci_low": 0.
|
1056 |
-
"sacrebleu_ci_high": 0.
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
-
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
-
0.
|
1074 |
-
0.
|
1075 |
-
0.
|
1076 |
-
0.
|
1077 |
],
|
1078 |
-
"bp": 0
|
1079 |
-
"sys_len":
|
1080 |
"ref_len": 2098,
|
1081 |
-
"sacrebleu": 0.
|
1082 |
-
"score": 0.
|
1083 |
"score_name": "sacrebleu",
|
1084 |
-
"score_ci_low": 0.
|
1085 |
-
"score_ci_high": 0.
|
1086 |
-
"sacrebleu_ci_low": 0.
|
1087 |
-
"sacrebleu_ci_high": 0.
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
-
|
1093 |
-
|
1094 |
-
|
1095 |
-
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
-
0.
|
1105 |
-
0.
|
1106 |
-
0.
|
1107 |
-
0.
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
-
"sys_len":
|
1111 |
"ref_len": 1734,
|
1112 |
-
"sacrebleu": 0.
|
1113 |
-
"score": 0.
|
1114 |
"score_name": "sacrebleu",
|
1115 |
-
"score_ci_low": 0.
|
1116 |
-
"score_ci_high": 0.
|
1117 |
-
"sacrebleu_ci_low": 0.
|
1118 |
-
"sacrebleu_ci_high": 0.
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
-
|
1124 |
-
|
1125 |
-
|
1126 |
-
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
-
|
1130 |
-
|
1131 |
-
|
1132 |
-
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
-
0.
|
1136 |
-
0.
|
1137 |
-
0.
|
1138 |
-
0.
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
-
"sys_len":
|
1142 |
"ref_len": 1734,
|
1143 |
-
"sacrebleu": 0.
|
1144 |
-
"score": 0.
|
1145 |
"score_name": "sacrebleu",
|
1146 |
-
"score_ci_low": 0.
|
1147 |
-
"score_ci_high": 0.
|
1148 |
-
"sacrebleu_ci_low": 0.
|
1149 |
-
"sacrebleu_ci_high": 0.
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
-
|
1155 |
-
|
1156 |
-
|
1157 |
-
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
-
|
1161 |
-
|
1162 |
-
|
1163 |
-
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
-
0.
|
1167 |
-
0.
|
1168 |
-
0.
|
1169 |
-
0.
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
-
"sys_len":
|
1173 |
"ref_len": 1734,
|
1174 |
-
"sacrebleu": 0.
|
1175 |
-
"score": 0.
|
1176 |
"score_name": "sacrebleu",
|
1177 |
-
"score_ci_low": 0.
|
1178 |
-
"score_ci_high": 0.
|
1179 |
-
"sacrebleu_ci_low": 0.
|
1180 |
-
"sacrebleu_ci_high": 0.
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
-
|
1186 |
-
|
1187 |
-
|
1188 |
-
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
-
|
1192 |
-
|
1193 |
-
|
1194 |
-
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
-
0.
|
1198 |
-
0.
|
1199 |
-
0.
|
1200 |
-
0.
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
-
"sys_len":
|
1204 |
"ref_len": 1734,
|
1205 |
-
"sacrebleu": 0.
|
1206 |
-
"score": 0.
|
1207 |
"score_name": "sacrebleu",
|
1208 |
-
"score_ci_low": 0.
|
1209 |
-
"score_ci_high": 0.
|
1210 |
-
"sacrebleu_ci_low": 0.
|
1211 |
-
"sacrebleu_ci_high": 0.
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
-
|
1217 |
-
|
1218 |
-
|
1219 |
-
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
-
0.
|
1229 |
-
0.
|
1230 |
-
0.
|
1231 |
-
0.
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
-
"sys_len":
|
1235 |
"ref_len": 1734,
|
1236 |
-
"sacrebleu": 0.
|
1237 |
-
"score": 0.
|
1238 |
"score_name": "sacrebleu",
|
1239 |
-
"score_ci_low": 0.
|
1240 |
-
"score_ci_high": 0.
|
1241 |
-
"sacrebleu_ci_low": 0.
|
1242 |
-
"sacrebleu_ci_high": 0.
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
1250 |
-
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
-
|
1254 |
-
|
1255 |
-
|
1256 |
-
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
-
0.
|
1260 |
-
0.
|
1261 |
-
0.
|
1262 |
-
0.
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
-
"sys_len":
|
1266 |
"ref_len": 1734,
|
1267 |
-
"sacrebleu": 0.
|
1268 |
-
"score": 0.
|
1269 |
"score_name": "sacrebleu",
|
1270 |
-
"score_ci_low": 0.
|
1271 |
-
"score_ci_high": 0.
|
1272 |
-
"sacrebleu_ci_low": 0.
|
1273 |
-
"sacrebleu_ci_high": 0.
|
1274 |
},
|
1275 |
-
"score": 0.
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
-
"score": 0.
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-23T12:43:42.752885Z",
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
|
|
8 |
"--model",
|
9 |
"cross_provider",
|
10 |
"--model_args",
|
11 |
+
"model_name=watsonx/mistralai/mistral-small-3-1-24b-instruct-2503,max_tokens=256",
|
12 |
"--output_path",
|
13 |
"./results/bluebench",
|
14 |
"--log_samples",
|
|
|
26 |
"num_fewshots": null,
|
27 |
"limit": null,
|
28 |
"batch_size": 8,
|
29 |
+
"model": "watsonx/mistralai/mistral-small-3-1-24b-instruct-2503",
|
30 |
"model_args": {
|
31 |
"max_tokens": 256
|
32 |
},
|
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.6666666666666666,
|
180 |
+
"accuracy_ci_low": 0.5666666666666667,
|
181 |
+
"accuracy_ci_high": 0.7555555555555555,
|
182 |
"score_name": "accuracy",
|
183 |
+
"score": 0.6666666666666666,
|
184 |
+
"score_ci_high": 0.7555555555555555,
|
185 |
+
"score_ci_low": 0.5666666666666667,
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.7666666666666667,
|
190 |
+
"accuracy_ci_low": 0.6777777777777778,
|
191 |
+
"accuracy_ci_high": 0.8444444444444444,
|
192 |
"score_name": "accuracy",
|
193 |
+
"score": 0.7666666666666667,
|
194 |
+
"score_ci_high": 0.8444444444444444,
|
195 |
+
"score_ci_low": 0.6777777777777778,
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
|
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.7555555555555555,
|
210 |
+
"accuracy_ci_low": 0.6555555555555556,
|
211 |
+
"accuracy_ci_high": 0.8333333333333334,
|
212 |
"score_name": "accuracy",
|
213 |
+
"score": 0.7555555555555555,
|
214 |
+
"score_ci_high": 0.8333333333333334,
|
215 |
+
"score_ci_low": 0.6555555555555556,
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.8111111111111111,
|
220 |
+
"accuracy_ci_low": 0.7111111111111111,
|
221 |
+
"accuracy_ci_high": 0.8777777777777778,
|
222 |
"score_name": "accuracy",
|
223 |
+
"score": 0.8111111111111111,
|
224 |
+
"score_ci_high": 0.8777777777777778,
|
225 |
+
"score_ci_low": 0.7111111111111111,
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.8111111111111111,
|
230 |
+
"accuracy_ci_low": 0.7111111111111111,
|
231 |
+
"accuracy_ci_high": 0.8777777777777778,
|
232 |
"score_name": "accuracy",
|
233 |
+
"score": 0.8111111111111111,
|
234 |
+
"score_ci_high": 0.8777777777777778,
|
235 |
+
"score_ci_low": 0.7111111111111111,
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.8222222222222222,
|
240 |
+
"accuracy_ci_low": 0.7333333333333333,
|
241 |
+
"accuracy_ci_high": 0.8888888888888888,
|
242 |
"score_name": "accuracy",
|
243 |
+
"score": 0.8222222222222222,
|
244 |
+
"score_ci_high": 0.8888888888888888,
|
245 |
+
"score_ci_low": 0.7333333333333333,
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.8444444444444444,
|
250 |
+
"accuracy_ci_low": 0.7555555555555555,
|
251 |
+
"accuracy_ci_high": 0.9,
|
252 |
"score_name": "accuracy",
|
253 |
+
"score": 0.8444444444444444,
|
254 |
+
"score_ci_high": 0.9,
|
255 |
+
"score_ci_low": 0.7555555555555555,
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.8222222222222222,
|
260 |
+
"accuracy_ci_low": 0.7333333333333333,
|
261 |
+
"accuracy_ci_high": 0.9,
|
262 |
"score_name": "accuracy",
|
263 |
+
"score": 0.8222222222222222,
|
264 |
+
"score_ci_high": 0.9,
|
265 |
+
"score_ci_low": 0.7333333333333333,
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.8111111111111111,
|
270 |
+
"accuracy_ci_low": 0.7222222222222222,
|
271 |
+
"accuracy_ci_high": 0.8888888888888888,
|
272 |
"score_name": "accuracy",
|
273 |
+
"score": 0.8111111111111111,
|
274 |
+
"score_ci_high": 0.8888888888888888,
|
275 |
+
"score_ci_low": 0.7222222222222222,
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.8333333333333334,
|
280 |
+
"accuracy_ci_low": 0.7555555555555555,
|
281 |
+
"accuracy_ci_high": 0.9111111111111111,
|
282 |
"score_name": "accuracy",
|
283 |
+
"score": 0.8333333333333334,
|
284 |
+
"score_ci_high": 0.9111111111111111,
|
285 |
+
"score_ci_low": 0.7555555555555555,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
+
"score": 0.804040404040404,
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.10666666666666667,
|
296 |
+
"score": 0.10666666666666667,
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
+
"score": 0.10666666666666667,
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.34806629834254144,
|
307 |
+
"f1_Organization": 0.28125,
|
308 |
+
"f1_Location": 0.2272727272727273,
|
309 |
+
"f1_macro": 0.2855296752050896,
|
310 |
+
"recall_macro": 0.2576225314974886,
|
311 |
+
"precision_macro": 0.32330034002100655,
|
312 |
+
"in_classes_support": 0.4646799116997793,
|
313 |
+
"f1_micro": 0.1928721174004193,
|
314 |
+
"recall_micro": 0.26285714285714284,
|
315 |
+
"precision_micro": 0.152317880794702,
|
316 |
+
"score": 0.1928721174004193,
|
317 |
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.16179580146707578,
|
319 |
+
"score_ci_high": 0.2232377264615503,
|
320 |
+
"f1_micro_ci_low": 0.16179580146707578,
|
321 |
+
"f1_micro_ci_high": 0.2232377264615503
|
322 |
},
|
323 |
+
"score": 0.1928721174004193,
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.647887323943662,
|
330 |
+
"accuracy_ci_low": 0.5352112676056338,
|
331 |
+
"accuracy_ci_high": 0.7605633802816901,
|
332 |
"score_name": "accuracy",
|
333 |
+
"score": 0.647887323943662,
|
334 |
+
"score_ci_high": 0.7605633802816901,
|
335 |
+
"score_ci_low": 0.5352112676056338,
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.30985915492957744,
|
340 |
+
"accuracy_ci_low": 0.2112676056338028,
|
341 |
+
"accuracy_ci_high": 0.428782341390215,
|
342 |
"score_name": "accuracy",
|
343 |
+
"score": 0.30985915492957744,
|
344 |
+
"score_ci_high": 0.428782341390215,
|
345 |
+
"score_ci_low": 0.2112676056338028,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.22535211267605634,
|
350 |
+
"accuracy_ci_low": 0.14084507042253522,
|
351 |
+
"accuracy_ci_high": 0.3380281690140845,
|
352 |
"score_name": "accuracy",
|
353 |
+
"score": 0.22535211267605634,
|
354 |
+
"score_ci_high": 0.3380281690140845,
|
355 |
+
"score_ci_low": 0.14084507042253522,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.5633802816901409,
|
360 |
+
"accuracy_ci_low": 0.4507042253521127,
|
361 |
+
"accuracy_ci_high": 0.676056338028169,
|
362 |
"score_name": "accuracy",
|
363 |
+
"score": 0.5633802816901409,
|
364 |
+
"score_ci_high": 0.676056338028169,
|
365 |
+
"score_ci_low": 0.4507042253521127,
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.6197183098591549,
|
370 |
+
"accuracy_ci_low": 0.49295774647887325,
|
371 |
+
"accuracy_ci_high": 0.7323943661971831,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.6197183098591549,
|
374 |
+
"score_ci_high": 0.7323943661971831,
|
375 |
+
"score_ci_low": 0.49295774647887325,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.18309859154929578,
|
380 |
+
"accuracy_ci_low": 0.11267605633802817,
|
381 |
+
"accuracy_ci_high": 0.28169014084507044,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.18309859154929578,
|
384 |
+
"score_ci_high": 0.28169014084507044,
|
385 |
+
"score_ci_low": 0.11267605633802817,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
"accuracy": 0.4788732394366197,
|
390 |
"accuracy_ci_low": 0.36619718309859156,
|
391 |
"accuracy_ci_high": 0.5915492957746479,
|
|
|
395 |
"score_ci_low": 0.36619718309859156,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.6619718309859155,
|
400 |
+
"accuracy_ci_low": 0.5352112676056338,
|
401 |
+
"accuracy_ci_high": 0.7714646829428065,
|
402 |
"score_name": "accuracy",
|
403 |
+
"score": 0.6619718309859155,
|
404 |
+
"score_ci_high": 0.7714646829428065,
|
405 |
+
"score_ci_low": 0.5352112676056338,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
"accuracy": 0.43661971830985913,
|
410 |
"accuracy_ci_low": 0.323943661971831,
|
411 |
"accuracy_ci_high": 0.5492957746478874,
|
|
|
415 |
"score_ci_low": 0.323943661971831,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.2676056338028169,
|
420 |
+
"accuracy_ci_low": 0.16901408450704225,
|
421 |
+
"accuracy_ci_high": 0.38028169014084506,
|
422 |
"score_name": "accuracy",
|
423 |
+
"score": 0.2676056338028169,
|
424 |
+
"score_ci_high": 0.38028169014084506,
|
425 |
+
"score_ci_low": 0.16901408450704225,
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.4647887323943662,
|
430 |
+
"accuracy_ci_low": 0.352112676056338,
|
431 |
+
"accuracy_ci_high": 0.5774647887323944,
|
432 |
"score_name": "accuracy",
|
433 |
+
"score": 0.4647887323943662,
|
434 |
+
"score_ci_high": 0.5774647887323944,
|
435 |
+
"score_ci_low": 0.352112676056338,
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.49295774647887325,
|
440 |
+
"accuracy_ci_low": 0.38028169014084506,
|
441 |
+
"accuracy_ci_high": 0.6197183098591549,
|
442 |
"score_name": "accuracy",
|
443 |
+
"score": 0.49295774647887325,
|
444 |
+
"score_ci_high": 0.6197183098591549,
|
445 |
+
"score_ci_low": 0.38028169014084506,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.4507042253521127,
|
450 |
+
"accuracy_ci_low": 0.3380281690140845,
|
451 |
+
"accuracy_ci_high": 0.5633802816901409,
|
452 |
"score_name": "accuracy",
|
453 |
+
"score": 0.4507042253521127,
|
454 |
+
"score_ci_high": 0.5633802816901409,
|
455 |
+
"score_ci_low": 0.3380281690140845,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.6338028169014085,
|
460 |
+
"accuracy_ci_low": 0.5211267605633803,
|
461 |
+
"accuracy_ci_high": 0.7464788732394366,
|
462 |
"score_name": "accuracy",
|
463 |
+
"score": 0.6338028169014085,
|
464 |
+
"score_ci_high": 0.7464788732394366,
|
465 |
+
"score_ci_low": 0.5211267605633803,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
+
"score": 0.4597585513078471,
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.3274661835748792,
|
475 |
+
"f1_suggestive": 0.3125,
|
476 |
+
"f1_generic": 0.2222222222222222,
|
477 |
+
"f1_arbitrary": 0.32,
|
478 |
+
"f1_fanciful": 0.43478260869565216,
|
479 |
+
"f1_descriptive": 0.34782608695652173,
|
480 |
+
"f1_macro_ci_low": 0.2279048726954935,
|
481 |
+
"f1_macro_ci_high": 0.4521754269782871,
|
482 |
"score_name": "f1_micro",
|
483 |
+
"score": 0.3305785123966942,
|
484 |
+
"score_ci_high": 0.4462631095061656,
|
485 |
+
"score_ci_low": 0.22608695652173913,
|
486 |
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.23529411764705882,
|
488 |
+
"accuracy_ci_low": 0.15294117647058825,
|
489 |
+
"accuracy_ci_high": 0.3411764705882353,
|
490 |
+
"f1_micro": 0.3305785123966942,
|
491 |
+
"f1_micro_ci_low": 0.22608695652173913,
|
492 |
+
"f1_micro_ci_high": 0.4462631095061656
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.3665132336018412,
|
496 |
+
"f1_no": 0.45454545454545453,
|
497 |
+
"f1_yes": 0.27848101265822783,
|
498 |
+
"f1_macro_ci_low": 0.29058993290093893,
|
499 |
+
"f1_macro_ci_high": 0.44397782794437635,
|
500 |
"score_name": "f1_micro",
|
501 |
+
"score": 0.4043321299638989,
|
502 |
+
"score_ci_high": 0.4848050604545447,
|
503 |
+
"score_ci_low": 0.33210332103321033,
|
504 |
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.28,
|
506 |
+
"accuracy_ci_low": 0.22435516148422335,
|
507 |
+
"accuracy_ci_high": 0.35,
|
508 |
+
"f1_micro": 0.4043321299638989,
|
509 |
+
"f1_micro_ci_low": 0.33210332103321033,
|
510 |
+
"f1_micro_ci_high": 0.4848050604545447
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.2994201157541979,
|
514 |
+
"f1_conclusion": 0.16216216216216217,
|
515 |
+
"f1_decree": 0.22857142857142856,
|
516 |
+
"f1_analysis": 0.43478260869565216,
|
517 |
+
"f1_issue": 0.23809523809523808,
|
518 |
+
"f1_procedural history": 0.36363636363636365,
|
519 |
+
"f1_facts": 0.2857142857142857,
|
520 |
+
"f1_rule": 0.3829787234042553,
|
521 |
+
"f1_macro_ci_low": 0.23455524899226265,
|
522 |
+
"f1_macro_ci_high": 0.3757847883095156,
|
523 |
"score_name": "f1_micro",
|
524 |
+
"score": 0.3111111111111111,
|
525 |
+
"score_ci_high": 0.38492614857203733,
|
526 |
+
"score_ci_low": 0.23835139550418585,
|
527 |
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.245,
|
529 |
+
"accuracy_ci_low": 0.185,
|
530 |
+
"accuracy_ci_high": 0.31,
|
531 |
+
"f1_micro": 0.3111111111111111,
|
532 |
+
"f1_micro_ci_low": 0.23835139550418585,
|
533 |
+
"f1_micro_ci_high": 0.38492614857203733
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.20520718738540522,
|
537 |
+
"f1_yes": 0.21782178217821782,
|
538 |
+
"f1_no": 0.1925925925925926,
|
539 |
+
"f1_macro_ci_low": 0.1400419836844058,
|
540 |
+
"f1_macro_ci_high": 0.2885398882645068,
|
541 |
"score_name": "f1_micro",
|
542 |
+
"score": 0.2033898305084746,
|
543 |
+
"score_ci_high": 0.2857142857142857,
|
544 |
+
"score_ci_low": 0.1391304347826087,
|
545 |
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.12,
|
547 |
+
"accuracy_ci_low": 0.08,
|
548 |
+
"accuracy_ci_high": 0.175,
|
549 |
+
"f1_micro": 0.2033898305084746,
|
550 |
+
"f1_micro_ci_low": 0.1391304347826087,
|
551 |
+
"f1_micro_ci_high": 0.2857142857142857
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.8035124326204138,
|
555 |
+
"f1_yes": 0.8169014084507042,
|
556 |
+
"f1_no": 0.7901234567901234,
|
557 |
+
"f1_macro_ci_low": 0.7176915883069268,
|
558 |
+
"f1_macro_ci_high": 0.872867714407109,
|
559 |
"score_name": "f1_micro",
|
560 |
+
"score": 0.8026315789473685,
|
561 |
+
"score_ci_high": 0.871520027126433,
|
562 |
+
"score_ci_low": 0.7086398695460123,
|
563 |
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.7176470588235294,
|
565 |
+
"accuracy_ci_low": 0.611764705882353,
|
566 |
+
"accuracy_ci_high": 0.8,
|
567 |
+
"f1_micro": 0.8026315789473685,
|
568 |
+
"f1_micro_ci_low": 0.7086398695460123,
|
569 |
+
"f1_micro_ci_high": 0.871520027126433
|
570 |
},
|
571 |
+
"score": 0.41040863258550947,
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.51306152129787,
|
578 |
+
"f1_cars": 0.6585365853658537,
|
579 |
+
"f1_windows x": 0.08571428571428572,
|
580 |
+
"f1_cryptography": 0.5641025641025641,
|
581 |
+
"f1_atheism": 0.09302325581395349,
|
582 |
+
"f1_religion": 0.15873015873015872,
|
583 |
+
"f1_medicine": 0.810126582278481,
|
584 |
+
"f1_christianity": 0.36619718309859156,
|
585 |
+
"f1_computer graphics": 0.43243243243243246,
|
586 |
+
"f1_microsoft windows": 0.5569620253164557,
|
587 |
+
"f1_middle east": 0.625,
|
588 |
+
"f1_motorcycles": 0.64,
|
589 |
+
"f1_mac hardware": 0.49411764705882355,
|
590 |
+
"f1_pc hardware": 0.5309734513274337,
|
591 |
+
"f1_electronics": 0.6292134831460674,
|
592 |
+
"f1_for sale": 0.5538461538461539,
|
593 |
+
"f1_guns": 0.22580645161290322,
|
594 |
+
"f1_space": 0.7872340425531915,
|
595 |
+
"f1_baseball": 0.8598130841121495,
|
596 |
+
"f1_hockey": 0.859504132231405,
|
597 |
+
"f1_politics": 0.32989690721649484,
|
598 |
+
"f1_macro_ci_low": 0.489675643451468,
|
599 |
+
"f1_macro_ci_high": 0.5444343504387604,
|
600 |
"score_name": "f1_micro",
|
601 |
+
"score": 0.5470692717584369,
|
602 |
+
"score_ci_high": 0.5787418375694315,
|
603 |
+
"score_ci_low": 0.5152956292250616,
|
604 |
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.462,
|
606 |
+
"accuracy_ci_low": 0.431,
|
607 |
+
"accuracy_ci_high": 0.492,
|
608 |
+
"f1_micro": 0.5470692717584369,
|
609 |
+
"f1_micro_ci_low": 0.5152956292250616,
|
610 |
+
"f1_micro_ci_high": 0.5787418375694315
|
611 |
},
|
612 |
+
"score": 0.5470692717584369,
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.5522388523080553,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.7597955706984668,
|
620 |
+
"f1_checking or savings account": 0.717391304347826,
|
621 |
+
"f1_debt collection": 0.5234899328859061,
|
622 |
+
"f1_credit card or prepaid card": 0.379746835443038,
|
623 |
+
"f1_mortgage": 0.7397260273972602,
|
624 |
+
"f1_payday loan or title loan or personal loan": 0.0,
|
625 |
+
"f1_student loan": 0.75,
|
626 |
+
"f1_money transfer or virtual currency or money service": 0.6,
|
627 |
+
"f1_vehicle loan or lease": 0.5,
|
628 |
+
"f1_macro_ci_low": 0.5048624166012413,
|
629 |
+
"f1_macro_ci_high": 0.5933228524869341,
|
630 |
"score_name": "f1_micro",
|
631 |
+
"score": 0.70260663507109,
|
632 |
+
"score_ci_high": 0.7287187189345051,
|
633 |
+
"score_ci_low": 0.6715109552099726,
|
634 |
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.593,
|
636 |
+
"accuracy_ci_low": 0.562,
|
637 |
+
"accuracy_ci_high": 0.6222733612177318,
|
638 |
+
"f1_micro": 0.70260663507109,
|
639 |
+
"f1_micro_ci_low": 0.6715109552099726,
|
640 |
+
"f1_micro_ci_high": 0.7287187189345051
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.6459298867611312,
|
644 |
+
"f1_mortgages and loans": 0.7544910179640718,
|
645 |
+
"f1_credit card": 0.775,
|
646 |
+
"f1_debt collection": 0.5841584158415841,
|
647 |
+
"f1_credit reporting": 0.696,
|
648 |
+
"f1_retail banking": 0.42,
|
649 |
+
"f1_macro_ci_low": 0.6077839677010852,
|
650 |
+
"f1_macro_ci_high": 0.6913500191807291,
|
651 |
"score_name": "f1_micro",
|
652 |
+
"score": 0.664391353811149,
|
653 |
+
"score_ci_high": 0.7063133644876816,
|
654 |
+
"score_ci_low": 0.625027055082327,
|
655 |
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.584,
|
657 |
+
"accuracy_ci_low": 0.542,
|
658 |
+
"accuracy_ci_high": 0.63,
|
659 |
+
"f1_micro": 0.664391353811149,
|
660 |
+
"f1_micro_ci_low": 0.625027055082327,
|
661 |
+
"f1_micro_ci_high": 0.7063133644876816
|
662 |
},
|
663 |
+
"score": 0.6834989944411195,
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
+
"execution_accuracy": 0.12,
|
671 |
+
"program_accuracy": 0.132,
|
672 |
+
"score": 0.132,
|
673 |
"score_name": "program_accuracy",
|
674 |
+
"execution_accuracy_ci_low": 0.102,
|
675 |
+
"execution_accuracy_ci_high": 0.144,
|
676 |
+
"program_accuracy_ci_low": 0.114,
|
677 |
+
"program_accuracy_ci_high": 0.15666145199397988,
|
678 |
+
"score_ci_low": 0.114,
|
679 |
+
"score_ci_high": 0.15666145199397988
|
680 |
},
|
681 |
+
"score": 0.132,
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.2643477592234184,
|
688 |
+
"recall": 0.6272650833490617,
|
689 |
+
"f1": 0.3140296380054316,
|
690 |
+
"precision_ci_low": 0.24819332961110369,
|
691 |
+
"precision_ci_high": 0.283633312750057,
|
692 |
+
"recall_ci_low": 0.6126841693716314,
|
693 |
+
"recall_ci_high": 0.6430975049953365,
|
694 |
+
"f1_ci_low": 0.29901530950331745,
|
695 |
+
"f1_ci_high": 0.3315112630081704,
|
696 |
"score_name": "f1",
|
697 |
+
"score": 0.3140296380054316,
|
698 |
+
"score_ci_high": 0.3315112630081704,
|
699 |
+
"score_ci_low": 0.29901530950331745,
|
700 |
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.570422876526912,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6787074640889963,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5102018377184868,
|
704 |
+
"faithfullness_f1_token_overlap": 0.3137695189458044,
|
705 |
+
"faithfullness_recall_token_overlap": 0.2691044465057621,
|
706 |
+
"faithfullness_precision_token_overlap": 0.4828180045337674,
|
707 |
+
"correctness_f1_token_overlap": 0.3140296380054316,
|
708 |
+
"correctness_recall_token_overlap": 0.6272650833490617,
|
709 |
+
"correctness_precision_token_overlap": 0.2643477592234184
|
710 |
},
|
711 |
+
"score": 0.3140296380054316,
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
+
"accuracy": 0.525,
|
718 |
+
"accuracy_ci_low": 0.491,
|
719 |
+
"accuracy_ci_high": 0.555231575004668,
|
720 |
"score_name": "accuracy",
|
721 |
+
"score": 0.525,
|
722 |
+
"score_ci_high": 0.555231575004668,
|
723 |
+
"score_ci_low": 0.491,
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
+
"accuracy": 0.882,
|
728 |
+
"accuracy_ci_low": 0.852,
|
729 |
+
"accuracy_ci_high": 0.908,
|
730 |
"score_name": "accuracy",
|
731 |
+
"score": 0.882,
|
732 |
+
"score_ci_high": 0.908,
|
733 |
+
"score_ci_low": 0.852,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
+
"score": 0.7035,
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
+
"severity_critical": 1.0,
|
743 |
"severity_high": 1.0,
|
744 |
+
"severity_medium": 2.0,
|
745 |
+
"severity_low": 96.0,
|
746 |
+
"category_harmful_info": 0.7837495406986109,
|
747 |
+
"category_deception": 0.9044034529213953,
|
748 |
+
"category_explicit_content": 0.7690657642152575,
|
749 |
+
"category_violence": 0.8265839901102323,
|
750 |
+
"category_discrimination": 0.8356099165384733,
|
751 |
+
"category_substance_abuse": 0.9050387458668814,
|
752 |
+
"category_pii": 0.8721841452298341,
|
753 |
+
"safety": 0.8227460376271561,
|
754 |
+
"safety_ci_low": 0.7932925916009834,
|
755 |
+
"safety_ci_high": 0.8490041469302126,
|
756 |
"score_name": "safety",
|
757 |
+
"score": 0.8227460376271561,
|
758 |
+
"score_ci_high": 0.8490041469302126,
|
759 |
+
"score_ci_low": 0.7932925916009834,
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
+
"score": 0.8227460376271561,
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
+
"rougeLsum": 0.3325543768677942,
|
770 |
+
"rouge2": 0.18239431854825322,
|
771 |
+
"rougeL": 0.26408113485691037,
|
772 |
+
"score": 0.26408113485691037,
|
773 |
"score_name": "rougeL",
|
774 |
+
"rouge1": 0.38686146798899773,
|
775 |
+
"rougeLsum_ci_low": 0.3235124735051795,
|
776 |
+
"rougeLsum_ci_high": 0.34150244672120345,
|
777 |
+
"rouge2_ci_low": 0.17567630755234162,
|
778 |
+
"rouge2_ci_high": 0.18909024839808478,
|
779 |
+
"rougeL_ci_low": 0.2574903672199645,
|
780 |
+
"rougeL_ci_high": 0.2719482303789339,
|
781 |
+
"score_ci_low": 0.2574903672199645,
|
782 |
+
"score_ci_high": 0.2719482303789339,
|
783 |
+
"rouge1_ci_low": 0.3767025738240639,
|
784 |
+
"rouge1_ci_high": 0.39656622600699587
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
+
"rougeLsum": 0.0952773923780629,
|
789 |
+
"rouge2": 0.016247281897600302,
|
790 |
+
"rougeL": 0.08205748236085915,
|
791 |
+
"score": 0.08205748236085915,
|
792 |
"score_name": "rougeL",
|
793 |
+
"rouge1": 0.11292356630727837,
|
794 |
+
"rougeLsum_ci_low": 0.09101488756580381,
|
795 |
+
"rougeLsum_ci_high": 0.09940774438641894,
|
796 |
+
"rouge2_ci_low": 0.014565784948631207,
|
797 |
+
"rouge2_ci_high": 0.018245592224480585,
|
798 |
+
"rougeL_ci_low": 0.07819027253097927,
|
799 |
+
"rougeL_ci_high": 0.08550070178637435,
|
800 |
+
"score_ci_low": 0.07819027253097927,
|
801 |
+
"score_ci_high": 0.08550070178637435,
|
802 |
+
"rouge1_ci_low": 0.1075630132488861,
|
803 |
+
"rouge1_ci_high": 0.11796959016801192
|
804 |
},
|
805 |
+
"score": 0.17306930860888475,
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
+
1348,
|
814 |
+
805,
|
815 |
+
529,
|
816 |
+
359
|
817 |
],
|
818 |
"totals": [
|
819 |
+
6219,
|
820 |
+
6153,
|
821 |
+
6087,
|
822 |
+
6021
|
823 |
],
|
824 |
"precisions": [
|
825 |
+
0.21675510532239908,
|
826 |
+
0.13083048919226395,
|
827 |
+
0.08690652209627076,
|
828 |
+
0.05962464706859325
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
+
"sys_len": 6219,
|
832 |
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.11010045736869918,
|
834 |
+
"score": 0.11010045736869918,
|
835 |
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.09292825544039175,
|
837 |
+
"score_ci_high": 0.12825546813076855,
|
838 |
+
"sacrebleu_ci_low": 0.09292825544039175,
|
839 |
+
"sacrebleu_ci_high": 0.12825546813076855
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
+
1365,
|
845 |
+
860,
|
846 |
+
583,
|
847 |
+
397
|
848 |
],
|
849 |
"totals": [
|
850 |
+
5747,
|
851 |
+
5681,
|
852 |
+
5615,
|
853 |
+
5549
|
854 |
],
|
855 |
"precisions": [
|
856 |
+
0.2375152253349574,
|
857 |
+
0.151381798979053,
|
858 |
+
0.10382902938557435,
|
859 |
+
0.07154442241845378
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
+
"sys_len": 5747,
|
863 |
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.12783945870094363,
|
865 |
+
"score": 0.12783945870094363,
|
866 |
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.11183867040924762,
|
868 |
+
"score_ci_high": 0.14785567428124632,
|
869 |
+
"sacrebleu_ci_low": 0.11183867040924762,
|
870 |
+
"sacrebleu_ci_high": 0.14785567428124632
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
+
739,
|
876 |
+
294,
|
877 |
+
146,
|
878 |
+
72
|
879 |
],
|
880 |
"totals": [
|
881 |
+
7684,
|
882 |
+
7618,
|
883 |
+
7552,
|
884 |
+
7486
|
885 |
],
|
886 |
"precisions": [
|
887 |
+
0.09617386777719938,
|
888 |
+
0.03859280651089525,
|
889 |
+
0.01933262711864407,
|
890 |
+
0.009617953513224687
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
+
"sys_len": 7684,
|
894 |
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.028822672569638247,
|
896 |
+
"score": 0.028822672569638247,
|
897 |
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.021109593731645938,
|
899 |
+
"score_ci_high": 0.037834395667653335,
|
900 |
+
"sacrebleu_ci_low": 0.021109593731645938,
|
901 |
+
"sacrebleu_ci_high": 0.037834395667653335
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
+
1296,
|
907 |
+
767,
|
908 |
+
491,
|
909 |
+
329
|
910 |
],
|
911 |
"totals": [
|
912 |
+
5968,
|
913 |
+
5902,
|
914 |
+
5836,
|
915 |
+
5770
|
916 |
],
|
917 |
"precisions": [
|
918 |
+
0.21715817694369974,
|
919 |
+
0.1299559471365639,
|
920 |
+
0.0841329677861549,
|
921 |
+
0.05701906412478336
|
922 |
],
|
923 |
"bp": 1.0,
|
924 |
+
"sys_len": 5968,
|
925 |
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.10786726317561303,
|
927 |
+
"score": 0.10786726317561303,
|
928 |
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.09202539159854502,
|
930 |
+
"score_ci_high": 0.1283307736534194,
|
931 |
+
"sacrebleu_ci_low": 0.09202539159854502,
|
932 |
+
"sacrebleu_ci_high": 0.1283307736534194
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
+
1536,
|
938 |
+
1081,
|
939 |
+
816,
|
940 |
+
632
|
941 |
],
|
942 |
"totals": [
|
943 |
+
4782,
|
944 |
+
4716,
|
945 |
+
4650,
|
946 |
+
4584
|
947 |
],
|
948 |
"precisions": [
|
949 |
+
0.3212045169385195,
|
950 |
+
0.22921967769296014,
|
951 |
+
0.17548387096774193,
|
952 |
+
0.13787085514834208
|
953 |
],
|
954 |
+
"bp": 1.0,
|
955 |
+
"sys_len": 4782,
|
956 |
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.20544037737967952,
|
958 |
+
"score": 0.20544037737967952,
|
959 |
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.170377943119898,
|
961 |
+
"score_ci_high": 0.24159259713787848,
|
962 |
+
"sacrebleu_ci_low": 0.170377943119898,
|
963 |
+
"sacrebleu_ci_high": 0.24159259713787848
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
+
1429,
|
969 |
+
687,
|
970 |
+
382,
|
971 |
+
223
|
972 |
],
|
973 |
"totals": [
|
974 |
+
8796,
|
975 |
+
8730,
|
976 |
+
8664,
|
977 |
+
8598
|
978 |
],
|
979 |
"precisions": [
|
980 |
+
0.16246020918599363,
|
981 |
+
0.07869415807560137,
|
982 |
+
0.04409048938134811,
|
983 |
+
0.025936264247499417
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
+
"sys_len": 8796,
|
987 |
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.061835155996391195,
|
989 |
+
"score": 0.061835155996391195,
|
990 |
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.04783880582354916,
|
992 |
+
"score_ci_high": 0.0770855832203236,
|
993 |
+
"sacrebleu_ci_low": 0.04783880582354916,
|
994 |
+
"sacrebleu_ci_high": 0.0770855832203236
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
+
1491,
|
1000 |
+
1023,
|
1001 |
+
759,
|
1002 |
+
578
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
+
6280,
|
1006 |
+
6214,
|
1007 |
+
6148,
|
1008 |
+
6082
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
+
0.2374203821656051,
|
1012 |
+
0.1646282587705182,
|
1013 |
+
0.12345478204294079,
|
1014 |
+
0.09503452811575139
|
1015 |
],
|
1016 |
"bp": 1.0,
|
1017 |
+
"sys_len": 6280,
|
1018 |
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.14633659011937655,
|
1020 |
+
"score": 0.14633659011937655,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.12499780954581644,
|
1023 |
+
"score_ci_high": 0.16988788016668988,
|
1024 |
+
"sacrebleu_ci_low": 0.12499780954581644,
|
1025 |
+
"sacrebleu_ci_high": 0.16988788016668988
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
1335,
|
1031 |
+
862,
|
1032 |
+
585,
|
1033 |
+
402
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
+
5113,
|
1037 |
+
5047,
|
1038 |
+
4981,
|
1039 |
+
4915
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
+
0.26109915900645414,
|
1043 |
+
0.17079453140479495,
|
1044 |
+
0.11744629592451314,
|
1045 |
+
0.08179043743641913
|
1046 |
],
|
1047 |
+
"bp": 1.0,
|
1048 |
+
"sys_len": 5113,
|
1049 |
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.14386505707163663,
|
1051 |
+
"score": 0.14386505707163663,
|
1052 |
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.11590777522441527,
|
1054 |
+
"score_ci_high": 0.19034631649860798,
|
1055 |
+
"sacrebleu_ci_low": 0.11590777522441527,
|
1056 |
+
"sacrebleu_ci_high": 0.19034631649860798
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
+
1380,
|
1062 |
+
726,
|
1063 |
+
418,
|
1064 |
+
245
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
+
6698,
|
1068 |
+
6632,
|
1069 |
+
6566,
|
1070 |
+
6500
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
+
0.20603165123917588,
|
1074 |
+
0.10946924004825091,
|
1075 |
+
0.06366128540968626,
|
1076 |
+
0.03769230769230769
|
1077 |
],
|
1078 |
+
"bp": 1.0,
|
1079 |
+
"sys_len": 6698,
|
1080 |
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.08577061900111178,
|
1082 |
+
"score": 0.08577061900111178,
|
1083 |
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.06866769965121916,
|
1085 |
+
"score_ci_high": 0.10264171230800344,
|
1086 |
+
"sacrebleu_ci_low": 0.06866769965121916,
|
1087 |
+
"sacrebleu_ci_high": 0.10264171230800344
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
+
1411,
|
1093 |
+
934,
|
1094 |
+
656,
|
1095 |
+
466
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
+
5734,
|
1099 |
+
5668,
|
1100 |
+
5602,
|
1101 |
+
5536
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
+
0.24607603767003838,
|
1105 |
+
0.1647847565278758,
|
1106 |
+
0.11710103534451982,
|
1107 |
+
0.08417630057803467
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
+
"sys_len": 5734,
|
1111 |
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.14139505868159252,
|
1113 |
+
"score": 0.14139505868159252,
|
1114 |
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.12079576015139526,
|
1116 |
+
"score_ci_high": 0.16682278724108202,
|
1117 |
+
"sacrebleu_ci_low": 0.12079576015139526,
|
1118 |
+
"sacrebleu_ci_high": 0.16682278724108202
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
+
1223,
|
1124 |
+
624,
|
1125 |
+
368,
|
1126 |
+
228
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
+
5589,
|
1130 |
+
5523,
|
1131 |
+
5457,
|
1132 |
+
5391
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
+
0.21882268742172123,
|
1136 |
+
0.11298207495926127,
|
1137 |
+
0.06743632032252153,
|
1138 |
+
0.042292710072342796
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
+
"sys_len": 5589,
|
1142 |
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.09163583013027359,
|
1144 |
+
"score": 0.09163583013027359,
|
1145 |
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.07856240720421398,
|
1147 |
+
"score_ci_high": 0.107979299756867,
|
1148 |
+
"sacrebleu_ci_low": 0.07856240720421398,
|
1149 |
+
"sacrebleu_ci_high": 0.107979299756867
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
+
1181,
|
1155 |
+
575,
|
1156 |
+
330,
|
1157 |
+
198
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
+
5759,
|
1161 |
+
5693,
|
1162 |
+
5627,
|
1163 |
+
5561
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
+
0.20507032470915088,
|
1167 |
+
0.1010012295801862,
|
1168 |
+
0.05864581482139684,
|
1169 |
+
0.03560510699514476
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
+
"sys_len": 5759,
|
1173 |
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.08109511611273765,
|
1175 |
+
"score": 0.08109511611273765,
|
1176 |
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.06705573259915326,
|
1178 |
+
"score_ci_high": 0.09497541131553666,
|
1179 |
+
"sacrebleu_ci_low": 0.06705573259915326,
|
1180 |
+
"sacrebleu_ci_high": 0.09497541131553666
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
+
1407,
|
1186 |
+
940,
|
1187 |
+
672,
|
1188 |
+
483
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
+
6123,
|
1192 |
+
6057,
|
1193 |
+
5991,
|
1194 |
+
5925
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
+
0.2297893189612935,
|
1198 |
+
0.15519233944196797,
|
1199 |
+
0.11216825237856785,
|
1200 |
+
0.08151898734177215
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
+
"sys_len": 6123,
|
1204 |
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.13437924969030768,
|
1206 |
+
"score": 0.13437924969030768,
|
1207 |
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.11400957037603383,
|
1209 |
+
"score_ci_high": 0.15945809207216216,
|
1210 |
+
"sacrebleu_ci_low": 0.11400957037603383,
|
1211 |
+
"sacrebleu_ci_high": 0.15945809207216216
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
+
1432,
|
1217 |
+
975,
|
1218 |
+
700,
|
1219 |
+
506
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
+
6485,
|
1223 |
+
6419,
|
1224 |
+
6353,
|
1225 |
+
6287
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
+
0.2208172706245181,
|
1229 |
+
0.1518928181959807,
|
1230 |
+
0.11018416496143554,
|
1231 |
+
0.08048353745824718
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
+
"sys_len": 6485,
|
1235 |
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.13132552016969748,
|
1237 |
+
"score": 0.13132552016969748,
|
1238 |
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.1085828237330662,
|
1240 |
+
"score_ci_high": 0.1582258643543896,
|
1241 |
+
"sacrebleu_ci_low": 0.1085828237330662,
|
1242 |
+
"sacrebleu_ci_high": 0.1582258643543896
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
+
1293,
|
1248 |
+
715,
|
1249 |
+
441,
|
1250 |
+
272
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
+
6041,
|
1254 |
+
5975,
|
1255 |
+
5909,
|
1256 |
+
5843
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
+
0.2140374110246648,
|
1260 |
+
0.1196652719665272,
|
1261 |
+
0.07463191741411407,
|
1262 |
+
0.046551429060414165
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
+
"sys_len": 6041,
|
1266 |
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.09712451421953877,
|
1268 |
+
"score": 0.09712451421953877,
|
1269 |
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.08132271336696732,
|
1271 |
+
"score_ci_high": 0.11265264204635472,
|
1272 |
+
"sacrebleu_ci_low": 0.08132271336696732,
|
1273 |
+
"sacrebleu_ci_high": 0.11265264204635472
|
1274 |
},
|
1275 |
+
"score": 0.1129888626924825,
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
+
"score": 0.42020372962571984,
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
results/bluebench/{2025-06-19T21-59-04_evaluation_results.json → 2025-06-23T09-36-33_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
-
"timestamp_utc": "2025-06-
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
@@ -42,7 +42,7 @@
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
-
"unitxt_commit_hash": "
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
@@ -176,151 +176,151 @@
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
-
"accuracy": 0.
|
180 |
-
"accuracy_ci_low": 0.
|
181 |
-
"accuracy_ci_high": 0.
|
182 |
"score_name": "accuracy",
|
183 |
-
"score": 0.
|
184 |
-
"score_ci_high": 0.
|
185 |
-
"score_ci_low": 0.
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
"accuracy": 0.5555555555555556,
|
190 |
-
"accuracy_ci_low": 0.
|
191 |
"accuracy_ci_high": 0.6555555555555556,
|
192 |
"score_name": "accuracy",
|
193 |
"score": 0.5555555555555556,
|
194 |
"score_ci_high": 0.6555555555555556,
|
195 |
-
"score_ci_low": 0.
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
199 |
-
"accuracy": 0.
|
200 |
-
"accuracy_ci_low": 0.
|
201 |
"accuracy_ci_high": 0.9333333333333333,
|
202 |
"score_name": "accuracy",
|
203 |
-
"score": 0.
|
204 |
"score_ci_high": 0.9333333333333333,
|
205 |
-
"score_ci_low": 0.
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
"accuracy": 0.5777777777777777,
|
210 |
-
"accuracy_ci_low": 0.
|
211 |
"accuracy_ci_high": 0.6777777777777778,
|
212 |
"score_name": "accuracy",
|
213 |
"score": 0.5777777777777777,
|
214 |
"score_ci_high": 0.6777777777777778,
|
215 |
-
"score_ci_low": 0.
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
-
"accuracy": 0.
|
220 |
-
"accuracy_ci_low": 0.
|
221 |
-
"accuracy_ci_high": 0.
|
222 |
"score_name": "accuracy",
|
223 |
-
"score": 0.
|
224 |
-
"score_ci_high": 0.
|
225 |
-
"score_ci_low": 0.
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
-
"accuracy": 0.
|
230 |
-
"accuracy_ci_low": 0.
|
231 |
-
"accuracy_ci_high":
|
232 |
"score_name": "accuracy",
|
233 |
-
"score": 0.
|
234 |
-
"score_ci_high":
|
235 |
-
"score_ci_low": 0.
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
-
"accuracy": 0.
|
240 |
-
"accuracy_ci_low": 0.
|
241 |
-
"accuracy_ci_high": 0.
|
242 |
"score_name": "accuracy",
|
243 |
-
"score": 0.
|
244 |
-
"score_ci_high": 0.
|
245 |
-
"score_ci_low": 0.
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
-
"accuracy": 0.
|
250 |
-
"accuracy_ci_low": 0.
|
251 |
-
"accuracy_ci_high": 0.
|
252 |
"score_name": "accuracy",
|
253 |
-
"score": 0.
|
254 |
-
"score_ci_high": 0.
|
255 |
-
"score_ci_low": 0.
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
"accuracy": 0.6444444444444445,
|
260 |
-
"accuracy_ci_low": 0.
|
261 |
"accuracy_ci_high": 0.7444444444444445,
|
262 |
"score_name": "accuracy",
|
263 |
"score": 0.6444444444444445,
|
264 |
"score_ci_high": 0.7444444444444445,
|
265 |
-
"score_ci_low": 0.
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
-
"accuracy": 0.
|
270 |
-
"accuracy_ci_low": 0.
|
271 |
-
"accuracy_ci_high": 0.
|
272 |
"score_name": "accuracy",
|
273 |
-
"score": 0.
|
274 |
-
"score_ci_high": 0.
|
275 |
-
"score_ci_low": 0.
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
-
"accuracy": 0.
|
280 |
-
"accuracy_ci_low": 0.
|
281 |
"accuracy_ci_high": 0.9333333333333333,
|
282 |
"score_name": "accuracy",
|
283 |
-
"score": 0.
|
284 |
"score_ci_high": 0.9333333333333333,
|
285 |
-
"score_ci_low": 0.
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
-
"score": 0.
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
296 |
-
"score": 0.
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
-
"score": 0.
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
-
"f1_Person": 0.
|
307 |
-
"f1_Organization": 0.
|
308 |
-
"f1_Location": 0.
|
309 |
-
"f1_macro": 0.
|
310 |
-
"recall_macro": 0.
|
311 |
-
"precision_macro": 0.
|
312 |
-
"in_classes_support": 0.
|
313 |
-
"f1_micro": 0.
|
314 |
-
"recall_micro": 0.
|
315 |
-
"precision_micro": 0.
|
316 |
-
"score": 0.
|
317 |
"score_name": "f1_micro",
|
318 |
-
"score_ci_low": 0.
|
319 |
-
"score_ci_high": 0.
|
320 |
-
"f1_micro_ci_low": 0.
|
321 |
-
"f1_micro_ci_high": 0.
|
322 |
},
|
323 |
-
"score": 0.
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
@@ -336,43 +336,43 @@
|
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
-
"accuracy": 0.
|
340 |
-
"accuracy_ci_low": 0.
|
341 |
-
"accuracy_ci_high": 0.
|
342 |
"score_name": "accuracy",
|
343 |
-
"score": 0.
|
344 |
-
"score_ci_high": 0.
|
345 |
-
"score_ci_low": 0.
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
"accuracy": 0.08450704225352113,
|
350 |
-
"accuracy_ci_low": 0.
|
351 |
-
"accuracy_ci_high": 0.
|
352 |
"score_name": "accuracy",
|
353 |
"score": 0.08450704225352113,
|
354 |
-
"score_ci_high": 0.
|
355 |
-
"score_ci_low": 0.
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
-
"accuracy": 0.
|
360 |
-
"accuracy_ci_low": 0.
|
361 |
-
"accuracy_ci_high": 0.
|
362 |
"score_name": "accuracy",
|
363 |
-
"score": 0.
|
364 |
-
"score_ci_high": 0.
|
365 |
-
"score_ci_low": 0.
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
-
"accuracy": 0.
|
370 |
-
"accuracy_ci_low": 0.
|
371 |
-
"accuracy_ci_high": 0.
|
372 |
"score_name": "accuracy",
|
373 |
-
"score": 0.
|
374 |
-
"score_ci_high": 0.
|
375 |
-
"score_ci_low": 0.
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
@@ -386,423 +386,423 @@
|
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
-
"accuracy": 0.
|
390 |
-
"accuracy_ci_low": 0.
|
391 |
-
"accuracy_ci_high": 0.
|
392 |
"score_name": "accuracy",
|
393 |
-
"score": 0.
|
394 |
-
"score_ci_high": 0.
|
395 |
-
"score_ci_low": 0.
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
-
"accuracy": 0.
|
400 |
"accuracy_ci_low": 0.36619718309859156,
|
401 |
-
"accuracy_ci_high": 0.
|
402 |
"score_name": "accuracy",
|
403 |
-
"score": 0.
|
404 |
-
"score_ci_high": 0.
|
405 |
"score_ci_low": 0.36619718309859156,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
-
"accuracy": 0.
|
410 |
-
"accuracy_ci_low": 0.
|
411 |
-
"accuracy_ci_high": 0.
|
412 |
"score_name": "accuracy",
|
413 |
-
"score": 0.
|
414 |
-
"score_ci_high": 0.
|
415 |
-
"score_ci_low": 0.
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
-
"accuracy": 0.
|
420 |
-
"accuracy_ci_low": 0.
|
421 |
-
"accuracy_ci_high": 0.
|
422 |
"score_name": "accuracy",
|
423 |
-
"score": 0.
|
424 |
-
"score_ci_high": 0.
|
425 |
-
"score_ci_low": 0.
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
-
"accuracy": 0.
|
430 |
-
"accuracy_ci_low": 0.
|
431 |
-
"accuracy_ci_high": 0.
|
432 |
"score_name": "accuracy",
|
433 |
-
"score": 0.
|
434 |
-
"score_ci_high": 0.
|
435 |
-
"score_ci_low": 0.
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
-
"accuracy": 0.
|
440 |
-
"accuracy_ci_low": 0.
|
441 |
-
"accuracy_ci_high": 0.
|
442 |
"score_name": "accuracy",
|
443 |
-
"score": 0.
|
444 |
-
"score_ci_high": 0.
|
445 |
-
"score_ci_low": 0.
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
"accuracy": 0.18309859154929578,
|
450 |
"accuracy_ci_low": 0.09859154929577464,
|
451 |
-
"accuracy_ci_high": 0.
|
452 |
"score_name": "accuracy",
|
453 |
"score": 0.18309859154929578,
|
454 |
-
"score_ci_high": 0.
|
455 |
"score_ci_low": 0.09859154929577464,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
-
"accuracy": 0.
|
460 |
-
"accuracy_ci_low": 0.
|
461 |
-
"accuracy_ci_high": 0.
|
462 |
"score_name": "accuracy",
|
463 |
-
"score": 0.
|
464 |
-
"score_ci_high": 0.
|
465 |
-
"score_ci_low": 0.
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
-
"score": 0.
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
-
"f1_macro": 0.
|
475 |
"f1_suggestive": 0.0,
|
476 |
"f1_generic": 0.0,
|
|
|
477 |
"f1_fanciful": 0.10526315789473684,
|
478 |
-
"f1_descriptive": 0.2857142857142857,
|
479 |
"f1_arbitrary": 0.0,
|
480 |
-
"f1_macro_ci_low": 0.
|
481 |
-
"f1_macro_ci_high": 0.
|
482 |
"score_name": "f1_micro",
|
483 |
-
"score": 0.
|
484 |
-
"score_ci_high": 0.
|
485 |
-
"score_ci_low": 0.
|
486 |
"num_of_instances": 85,
|
487 |
-
"accuracy": 0.
|
488 |
-
"accuracy_ci_low": 0.
|
489 |
-
"accuracy_ci_high": 0.
|
490 |
-
"f1_micro": 0.
|
491 |
-
"f1_micro_ci_low": 0.
|
492 |
-
"f1_micro_ci_high": 0.
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
-
"f1_macro": 0.
|
496 |
-
"f1_no": 0.
|
497 |
-
"f1_yes": 0.
|
498 |
-
"f1_macro_ci_low": 0.
|
499 |
-
"f1_macro_ci_high": 0.
|
500 |
"score_name": "f1_micro",
|
501 |
-
"score": 0.
|
502 |
-
"score_ci_high": 0.
|
503 |
-
"score_ci_low": 0.
|
504 |
"num_of_instances": 200,
|
505 |
-
"accuracy": 0.
|
506 |
-
"accuracy_ci_low": 0.
|
507 |
-
"accuracy_ci_high": 0.
|
508 |
-
"f1_micro": 0.
|
509 |
-
"f1_micro_ci_low": 0.
|
510 |
-
"f1_micro_ci_high": 0.
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
-
"f1_macro": 0.
|
514 |
"f1_conclusion": 0.0,
|
515 |
-
"f1_decree": 0.
|
516 |
-
"f1_issue": 0.
|
517 |
-
"f1_analysis": 0.
|
518 |
-
"f1_facts": 0.
|
519 |
"f1_procedural history": 0.0,
|
520 |
-
"f1_rule": 0.
|
521 |
-
"f1_macro_ci_low": 0.
|
522 |
-
"f1_macro_ci_high": 0.
|
523 |
"score_name": "f1_micro",
|
524 |
-
"score": 0.
|
525 |
-
"score_ci_high": 0.
|
526 |
-
"score_ci_low": 0.
|
527 |
"num_of_instances": 200,
|
528 |
-
"accuracy": 0.
|
529 |
-
"accuracy_ci_low": 0.
|
530 |
-
"accuracy_ci_high": 0.
|
531 |
-
"f1_micro": 0.
|
532 |
-
"f1_micro_ci_low": 0.
|
533 |
-
"f1_micro_ci_high": 0.
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
-
"f1_macro": 0.
|
537 |
-
"f1_yes": 0.
|
538 |
-
"f1_no": 0.
|
539 |
-
"f1_macro_ci_low": 0.
|
540 |
-
"f1_macro_ci_high": 0.
|
541 |
"score_name": "f1_micro",
|
542 |
-
"score": 0.
|
543 |
-
"score_ci_high": 0.
|
544 |
-
"score_ci_low": 0.
|
545 |
"num_of_instances": 200,
|
546 |
-
"accuracy": 0.
|
547 |
-
"accuracy_ci_low": 0.
|
548 |
"accuracy_ci_high": 0.315,
|
549 |
-
"f1_micro": 0.
|
550 |
-
"f1_micro_ci_low": 0.
|
551 |
-
"f1_micro_ci_high": 0.
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
-
"f1_macro": 0.
|
555 |
-
"f1_yes": 0.
|
556 |
-
"f1_no": 0.
|
557 |
-
"f1_macro_ci_low": 0.
|
558 |
-
"f1_macro_ci_high": 0.
|
559 |
"score_name": "f1_micro",
|
560 |
-
"score": 0.
|
561 |
-
"score_ci_high": 0.
|
562 |
-
"score_ci_low": 0.
|
563 |
"num_of_instances": 85,
|
564 |
-
"accuracy": 0.
|
565 |
-
"accuracy_ci_low": 0.
|
566 |
-
"accuracy_ci_high": 0.
|
567 |
-
"f1_micro": 0.
|
568 |
-
"f1_micro_ci_low": 0.
|
569 |
-
"f1_micro_ci_high": 0.
|
570 |
},
|
571 |
-
"score": 0.
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
-
"f1_macro": 0.
|
578 |
-
"f1_cars": 0.
|
579 |
"f1_windows x": 0.0,
|
580 |
-
"f1_atheism": 0.
|
581 |
-
"f1_religion": 0.
|
582 |
-
"
|
583 |
-
"
|
584 |
-
"
|
585 |
-
"
|
586 |
-
"
|
587 |
-
"
|
588 |
-
"
|
589 |
-
"f1_pc hardware": 0.3157894736842105,
|
590 |
"f1_mac hardware": 0.14285714285714285,
|
591 |
-
"f1_for sale": 0.
|
592 |
-
"f1_guns": 0.
|
593 |
-
"f1_space": 0.
|
594 |
-
"f1_cryptography": 0.
|
595 |
-
"f1_baseball": 0.
|
596 |
-
"f1_hockey": 0.
|
597 |
-
"
|
598 |
-
"
|
599 |
-
"
|
|
|
600 |
"score_name": "f1_micro",
|
601 |
-
"score": 0.
|
602 |
-
"score_ci_high": 0.
|
603 |
-
"score_ci_low": 0.
|
604 |
"num_of_instances": 1000,
|
605 |
-
"accuracy": 0.
|
606 |
-
"accuracy_ci_low": 0.
|
607 |
-
"accuracy_ci_high": 0.
|
608 |
-
"f1_micro": 0.
|
609 |
-
"f1_micro_ci_low": 0.
|
610 |
-
"f1_micro_ci_high": 0.
|
611 |
},
|
612 |
-
"score": 0.
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
-
"f1_macro": 0.
|
619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
620 |
-
"
|
621 |
-
"f1_debt collection": 0.
|
622 |
-
"
|
623 |
-
"
|
624 |
-
"
|
625 |
-
"
|
626 |
-
"f1_student loan": 0.
|
627 |
-
"f1_vehicle loan or lease": 0.
|
628 |
-
"f1_macro_ci_low": 0.
|
629 |
-
"f1_macro_ci_high": 0.
|
630 |
"score_name": "f1_micro",
|
631 |
-
"score": 0.
|
632 |
-
"score_ci_high": 0.
|
633 |
-
"score_ci_low": 0.
|
634 |
"num_of_instances": 1000,
|
635 |
-
"accuracy": 0.
|
636 |
-
"accuracy_ci_low": 0.
|
637 |
-
"accuracy_ci_high": 0.
|
638 |
-
"f1_micro": 0.
|
639 |
-
"f1_micro_ci_low": 0.
|
640 |
-
"f1_micro_ci_high": 0.
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
-
"f1_macro": 0.
|
644 |
-
"f1_mortgages and loans": 0.
|
645 |
-
"f1_credit card": 0.
|
646 |
-
"f1_debt collection": 0.
|
647 |
-
"f1_credit reporting": 0.
|
648 |
-
"f1_retail banking": 0.
|
649 |
-
"f1_macro_ci_low": 0.
|
650 |
-
"f1_macro_ci_high": 0.
|
651 |
"score_name": "f1_micro",
|
652 |
-
"score": 0.
|
653 |
-
"score_ci_high": 0.
|
654 |
-
"score_ci_low": 0.
|
655 |
"num_of_instances": 500,
|
656 |
-
"accuracy": 0.
|
657 |
-
"accuracy_ci_low": 0.
|
658 |
-
"accuracy_ci_high": 0.
|
659 |
-
"f1_micro": 0.
|
660 |
-
"f1_micro_ci_low": 0.
|
661 |
-
"f1_micro_ci_high": 0.
|
662 |
},
|
663 |
-
"score": 0.
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
-
"
|
671 |
-
"
|
672 |
-
"score": 0.036,
|
673 |
"score_name": "program_accuracy",
|
674 |
-
"
|
675 |
-
"
|
676 |
-
"
|
677 |
-
"
|
678 |
-
"
|
679 |
-
"
|
|
|
680 |
},
|
681 |
-
"score": 0.
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
-
"precision": 0.
|
688 |
-
"recall": 0.
|
689 |
-
"f1": 0.
|
690 |
-
"precision_ci_low": 0.
|
691 |
-
"precision_ci_high": 0.
|
692 |
-
"recall_ci_low": 0.
|
693 |
-
"recall_ci_high": 0.
|
694 |
-
"f1_ci_low": 0.
|
695 |
-
"f1_ci_high": 0.
|
696 |
"score_name": "f1",
|
697 |
-
"score": 0.
|
698 |
-
"score_ci_high": 0.
|
699 |
-
"score_ci_low": 0.
|
700 |
"num_of_instances": 600,
|
701 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
702 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
703 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
704 |
-
"faithfullness_f1_token_overlap": 0.
|
705 |
-
"faithfullness_recall_token_overlap": 0.
|
706 |
-
"faithfullness_precision_token_overlap": 0.
|
707 |
-
"correctness_f1_token_overlap": 0.
|
708 |
-
"correctness_recall_token_overlap": 0.
|
709 |
-
"correctness_precision_token_overlap": 0.
|
710 |
},
|
711 |
-
"score": 0.
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
-
"accuracy": 0.
|
718 |
-
"accuracy_ci_low": 0.
|
719 |
-
"accuracy_ci_high": 0.
|
720 |
"score_name": "accuracy",
|
721 |
-
"score": 0.
|
722 |
-
"score_ci_high": 0.
|
723 |
-
"score_ci_low": 0.
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
-
"accuracy": 0.
|
728 |
-
"accuracy_ci_low": 0.
|
729 |
-
"accuracy_ci_high": 0.
|
730 |
"score_name": "accuracy",
|
731 |
-
"score": 0.
|
732 |
-
"score_ci_high": 0.
|
733 |
-
"score_ci_low": 0.
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
-
"score": 0.
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
-
"severity_critical":
|
743 |
-
"severity_high":
|
744 |
-
"severity_medium":
|
745 |
-
"severity_low":
|
746 |
-
"category_harmful_info": 0.
|
747 |
-
"category_deception": 0.
|
748 |
-
"category_explicit_content": 0.
|
749 |
-
"category_violence": 0.
|
750 |
-
"category_discrimination": 0.
|
751 |
-
"category_substance_abuse": 0.
|
752 |
-
"category_pii": 0.
|
753 |
-
"safety": 0.
|
754 |
-
"safety_ci_low": 0.
|
755 |
-
"safety_ci_high": 0.
|
756 |
"score_name": "safety",
|
757 |
-
"score": 0.
|
758 |
-
"score_ci_high": 0.
|
759 |
-
"score_ci_low": 0.
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
-
"score": 0.
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
-
"rougeL": 0.
|
770 |
-
"score": 0.
|
771 |
"score_name": "rougeL",
|
772 |
-
"
|
773 |
-
"
|
774 |
-
"rouge2": 0.
|
775 |
-
"rougeL_ci_low": 0.
|
776 |
-
"rougeL_ci_high": 0.
|
777 |
-
"score_ci_low": 0.
|
778 |
-
"score_ci_high": 0.
|
779 |
-
"
|
780 |
-
"
|
781 |
-
"
|
782 |
-
"
|
783 |
-
"rouge2_ci_low": 0.
|
784 |
-
"rouge2_ci_high": 0.
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
-
"rougeL": 0.
|
789 |
-
"score": 0.
|
790 |
"score_name": "rougeL",
|
791 |
-
"
|
792 |
-
"
|
793 |
-
"rouge2": 0.
|
794 |
-
"rougeL_ci_low": 0.
|
795 |
-
"rougeL_ci_high": 0.
|
796 |
-
"score_ci_low": 0.
|
797 |
-
"score_ci_high": 0.
|
798 |
-
"
|
799 |
-
"
|
800 |
-
"
|
801 |
-
"
|
802 |
-
"rouge2_ci_low": 0.
|
803 |
-
"rouge2_ci_high": 0.
|
804 |
},
|
805 |
-
"score": 0.
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
@@ -810,473 +810,473 @@
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
],
|
818 |
"totals": [
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
],
|
824 |
"precisions": [
|
825 |
-
0.
|
826 |
-
0.
|
827 |
-
0.
|
828 |
-
0.
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
-
"sys_len":
|
832 |
"ref_len": 1734,
|
833 |
-
"sacrebleu": 0.
|
834 |
-
"score": 0.
|
835 |
"score_name": "sacrebleu",
|
836 |
-
"score_ci_low": 0.
|
837 |
-
"score_ci_high": 0.
|
838 |
-
"sacrebleu_ci_low": 0.
|
839 |
-
"sacrebleu_ci_high": 0.
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
|
848 |
],
|
849 |
"totals": [
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
],
|
855 |
"precisions": [
|
856 |
-
0.
|
857 |
-
0.
|
858 |
-
0.
|
859 |
-
0.
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
-
"sys_len":
|
863 |
"ref_len": 1734,
|
864 |
-
"sacrebleu": 0.
|
865 |
-
"score": 0.
|
866 |
"score_name": "sacrebleu",
|
867 |
-
"score_ci_low": 0.
|
868 |
-
"score_ci_high": 0.
|
869 |
-
"sacrebleu_ci_low": 0.
|
870 |
-
"sacrebleu_ci_high": 0.
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
-
|
876 |
-
|
877 |
-
|
878 |
-
|
879 |
],
|
880 |
"totals": [
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
],
|
886 |
"precisions": [
|
887 |
-
0.
|
888 |
-
0.
|
889 |
-
0.
|
890 |
-
0.
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
-
"sys_len":
|
894 |
"ref_len": 1589,
|
895 |
-
"sacrebleu": 0.
|
896 |
-
"score": 0.
|
897 |
"score_name": "sacrebleu",
|
898 |
-
"score_ci_low": 0.
|
899 |
-
"score_ci_high": 0.
|
900 |
-
"sacrebleu_ci_low": 0.
|
901 |
-
"sacrebleu_ci_high": 0.
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
-
|
907 |
-
|
908 |
-
|
909 |
-
|
910 |
],
|
911 |
"totals": [
|
912 |
-
|
913 |
-
|
914 |
-
|
915 |
-
|
916 |
],
|
917 |
"precisions": [
|
918 |
-
0.
|
919 |
-
0.
|
920 |
-
0.
|
921 |
-
0.
|
922 |
],
|
923 |
-
"bp": 0
|
924 |
-
"sys_len":
|
925 |
"ref_len": 1835,
|
926 |
-
"sacrebleu": 0.
|
927 |
-
"score": 0.
|
928 |
"score_name": "sacrebleu",
|
929 |
-
"score_ci_low": 0.
|
930 |
-
"score_ci_high": 0.
|
931 |
-
"sacrebleu_ci_low": 0.
|
932 |
-
"sacrebleu_ci_high": 0.
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
],
|
942 |
"totals": [
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
],
|
948 |
"precisions": [
|
949 |
-
0.
|
950 |
-
0.
|
951 |
-
0.
|
952 |
-
0.
|
953 |
],
|
954 |
-
"bp": 0.
|
955 |
-
"sys_len":
|
956 |
"ref_len": 2068,
|
957 |
-
"sacrebleu": 0.
|
958 |
-
"score": 0.
|
959 |
"score_name": "sacrebleu",
|
960 |
-
"score_ci_low": 0.
|
961 |
-
"score_ci_high": 0.
|
962 |
-
"sacrebleu_ci_low": 0.
|
963 |
-
"sacrebleu_ci_high": 0.
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
-
|
969 |
-
|
970 |
-
|
971 |
-
|
972 |
],
|
973 |
"totals": [
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
],
|
979 |
"precisions": [
|
980 |
-
0.
|
981 |
-
0.
|
982 |
-
0.
|
983 |
-
0.
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
-
"sys_len":
|
987 |
"ref_len": 2235,
|
988 |
-
"sacrebleu": 0.
|
989 |
-
"score": 0.
|
990 |
"score_name": "sacrebleu",
|
991 |
-
"score_ci_low": 0.
|
992 |
-
"score_ci_high": 0.
|
993 |
-
"sacrebleu_ci_low": 0.
|
994 |
-
"sacrebleu_ci_high": 0.
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
-
|
1000 |
-
|
1001 |
-
|
1002 |
-
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
-
|
1006 |
-
|
1007 |
-
|
1008 |
-
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
-
0.
|
1012 |
-
0.
|
1013 |
-
0.
|
1014 |
-
0.
|
1015 |
],
|
1016 |
-
"bp": 0.
|
1017 |
-
"sys_len":
|
1018 |
"ref_len": 1916,
|
1019 |
-
"sacrebleu": 0.
|
1020 |
-
"score": 0.
|
1021 |
"score_name": "sacrebleu",
|
1022 |
-
"score_ci_low": 0.
|
1023 |
-
"score_ci_high": 0.
|
1024 |
-
"sacrebleu_ci_low": 0.
|
1025 |
-
"sacrebleu_ci_high": 0.
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
-
|
1031 |
-
|
1032 |
-
|
1033 |
-
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
-
0.
|
1043 |
-
0.
|
1044 |
-
0.
|
1045 |
-
0.
|
1046 |
],
|
1047 |
-
"bp": 0
|
1048 |
-
"sys_len":
|
1049 |
"ref_len": 1949,
|
1050 |
-
"sacrebleu": 0.
|
1051 |
-
"score": 0.
|
1052 |
"score_name": "sacrebleu",
|
1053 |
-
"score_ci_low": 0.
|
1054 |
-
"score_ci_high": 0.
|
1055 |
-
"sacrebleu_ci_low": 0.
|
1056 |
-
"sacrebleu_ci_high": 0.
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
-
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
-
0.
|
1074 |
-
0.
|
1075 |
-
0.
|
1076 |
-
0.
|
1077 |
],
|
1078 |
-
"bp": 0.
|
1079 |
-
"sys_len":
|
1080 |
"ref_len": 2098,
|
1081 |
-
"sacrebleu": 0.
|
1082 |
-
"score": 0.
|
1083 |
"score_name": "sacrebleu",
|
1084 |
-
"score_ci_low": 0.
|
1085 |
-
"score_ci_high": 0.
|
1086 |
-
"sacrebleu_ci_low": 0.
|
1087 |
-
"sacrebleu_ci_high": 0.
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
-
|
1093 |
-
|
1094 |
564,
|
1095 |
-
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
-
0.
|
1105 |
-
0.
|
1106 |
-
0.
|
1107 |
-
0.
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
-
"sys_len":
|
1111 |
"ref_len": 1734,
|
1112 |
-
"sacrebleu": 0.
|
1113 |
-
"score": 0.
|
1114 |
"score_name": "sacrebleu",
|
1115 |
-
"score_ci_low": 0.
|
1116 |
-
"score_ci_high": 0.
|
1117 |
-
"sacrebleu_ci_low": 0.
|
1118 |
-
"sacrebleu_ci_high": 0.
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
-
|
1124 |
-
|
1125 |
-
|
1126 |
-
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
-
|
1130 |
-
|
1131 |
-
|
1132 |
-
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
-
0.
|
1136 |
-
0.
|
1137 |
-
0.
|
1138 |
-
0.
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
-
"sys_len":
|
1142 |
"ref_len": 1734,
|
1143 |
-
"sacrebleu": 0.
|
1144 |
-
"score": 0.
|
1145 |
"score_name": "sacrebleu",
|
1146 |
-
"score_ci_low": 0.
|
1147 |
-
"score_ci_high": 0.
|
1148 |
-
"sacrebleu_ci_low": 0.
|
1149 |
-
"sacrebleu_ci_high": 0.
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
-
|
1155 |
-
|
1156 |
-
|
1157 |
-
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
-
|
1161 |
-
|
1162 |
-
|
1163 |
-
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
-
0.
|
1167 |
-
0.
|
1168 |
-
0.
|
1169 |
-
0.
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
-
"sys_len":
|
1173 |
"ref_len": 1734,
|
1174 |
-
"sacrebleu": 0.
|
1175 |
-
"score": 0.
|
1176 |
"score_name": "sacrebleu",
|
1177 |
-
"score_ci_low": 0.
|
1178 |
-
"score_ci_high": 0.
|
1179 |
-
"sacrebleu_ci_low": 0.
|
1180 |
-
"sacrebleu_ci_high": 0.
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
-
|
1186 |
864,
|
1187 |
-
|
1188 |
463
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
-
|
1192 |
-
|
1193 |
-
|
1194 |
-
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
-
0.
|
1198 |
-
0.
|
1199 |
-
0.
|
1200 |
-
0.
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
-
"sys_len":
|
1204 |
"ref_len": 1734,
|
1205 |
-
"sacrebleu": 0.
|
1206 |
-
"score": 0.
|
1207 |
"score_name": "sacrebleu",
|
1208 |
-
"score_ci_low": 0.
|
1209 |
-
"score_ci_high": 0.
|
1210 |
-
"sacrebleu_ci_low": 0.
|
1211 |
-
"sacrebleu_ci_high": 0.
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
-
|
1217 |
-
|
1218 |
-
|
1219 |
-
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
-
0.
|
1229 |
-
0.
|
1230 |
-
0.
|
1231 |
-
0.
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
-
"sys_len":
|
1235 |
"ref_len": 1734,
|
1236 |
-
"sacrebleu": 0.
|
1237 |
-
"score": 0.
|
1238 |
"score_name": "sacrebleu",
|
1239 |
-
"score_ci_low": 0.
|
1240 |
-
"score_ci_high": 0.
|
1241 |
-
"sacrebleu_ci_low": 0.
|
1242 |
-
"sacrebleu_ci_high": 0.
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
1250 |
-
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
-
|
1254 |
-
|
1255 |
-
|
1256 |
-
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
-
0.
|
1260 |
-
0.
|
1261 |
-
0.
|
1262 |
-
0.
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
-
"sys_len":
|
1266 |
"ref_len": 1734,
|
1267 |
-
"sacrebleu": 0.
|
1268 |
-
"score": 0.
|
1269 |
"score_name": "sacrebleu",
|
1270 |
-
"score_ci_low": 0.
|
1271 |
-
"score_ci_high": 0.
|
1272 |
-
"sacrebleu_ci_low": 0.
|
1273 |
-
"sacrebleu_ci_high": 0.
|
1274 |
},
|
1275 |
-
"score": 0.
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
-
"score": 0.
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-23T13:36:29.058411Z",
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.5222222222222223,
|
180 |
+
"accuracy_ci_low": 0.4222222222222222,
|
181 |
+
"accuracy_ci_high": 0.6333333333333333,
|
182 |
"score_name": "accuracy",
|
183 |
+
"score": 0.5222222222222223,
|
184 |
+
"score_ci_high": 0.6333333333333333,
|
185 |
+
"score_ci_low": 0.4222222222222222,
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
"accuracy": 0.5555555555555556,
|
190 |
+
"accuracy_ci_low": 0.44726747907364484,
|
191 |
"accuracy_ci_high": 0.6555555555555556,
|
192 |
"score_name": "accuracy",
|
193 |
"score": 0.5555555555555556,
|
194 |
"score_ci_high": 0.6555555555555556,
|
195 |
+
"score_ci_low": 0.44726747907364484,
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.8666666666666667,
|
200 |
+
"accuracy_ci_low": 0.7781253622132644,
|
201 |
"accuracy_ci_high": 0.9333333333333333,
|
202 |
"score_name": "accuracy",
|
203 |
+
"score": 0.8666666666666667,
|
204 |
"score_ci_high": 0.9333333333333333,
|
205 |
+
"score_ci_low": 0.7781253622132644,
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
"accuracy": 0.5777777777777777,
|
210 |
+
"accuracy_ci_low": 0.457520776596763,
|
211 |
"accuracy_ci_high": 0.6777777777777778,
|
212 |
"score_name": "accuracy",
|
213 |
"score": 0.5777777777777777,
|
214 |
"score_ci_high": 0.6777777777777778,
|
215 |
+
"score_ci_low": 0.457520776596763,
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.5888888888888889,
|
220 |
+
"accuracy_ci_low": 0.4888888888888889,
|
221 |
+
"accuracy_ci_high": 0.6804301831819051,
|
222 |
"score_name": "accuracy",
|
223 |
+
"score": 0.5888888888888889,
|
224 |
+
"score_ci_high": 0.6804301831819051,
|
225 |
+
"score_ci_low": 0.4888888888888889,
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.9666666666666667,
|
230 |
+
"accuracy_ci_low": 0.9111111111111111,
|
231 |
+
"accuracy_ci_high": 0.9888888888888889,
|
232 |
"score_name": "accuracy",
|
233 |
+
"score": 0.9666666666666667,
|
234 |
+
"score_ci_high": 0.9888888888888889,
|
235 |
+
"score_ci_low": 0.9111111111111111,
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.8555555555555555,
|
240 |
+
"accuracy_ci_low": 0.7725017589399771,
|
241 |
+
"accuracy_ci_high": 0.9222222222222223,
|
242 |
"score_name": "accuracy",
|
243 |
+
"score": 0.8555555555555555,
|
244 |
+
"score_ci_high": 0.9222222222222223,
|
245 |
+
"score_ci_low": 0.7725017589399771,
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.6666666666666666,
|
250 |
+
"accuracy_ci_low": 0.5666666666666667,
|
251 |
+
"accuracy_ci_high": 0.7555555555555555,
|
252 |
"score_name": "accuracy",
|
253 |
+
"score": 0.6666666666666666,
|
254 |
+
"score_ci_high": 0.7555555555555555,
|
255 |
+
"score_ci_low": 0.5666666666666667,
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
"accuracy": 0.6444444444444445,
|
260 |
+
"accuracy_ci_low": 0.5444444444444444,
|
261 |
"accuracy_ci_high": 0.7444444444444445,
|
262 |
"score_name": "accuracy",
|
263 |
"score": 0.6444444444444445,
|
264 |
"score_ci_high": 0.7444444444444445,
|
265 |
+
"score_ci_low": 0.5444444444444444,
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.6444444444444445,
|
270 |
+
"accuracy_ci_low": 0.5444444444444444,
|
271 |
+
"accuracy_ci_high": 0.7444444444444445,
|
272 |
"score_name": "accuracy",
|
273 |
+
"score": 0.6444444444444445,
|
274 |
+
"score_ci_high": 0.7444444444444445,
|
275 |
+
"score_ci_low": 0.5444444444444444,
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.8777777777777778,
|
280 |
+
"accuracy_ci_low": 0.8,
|
281 |
"accuracy_ci_high": 0.9333333333333333,
|
282 |
"score_name": "accuracy",
|
283 |
+
"score": 0.8777777777777778,
|
284 |
"score_ci_high": 0.9333333333333333,
|
285 |
+
"score_ci_low": 0.8,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
+
"score": 0.706060606060606,
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.07977207977207977,
|
296 |
+
"score": 0.07977207977207977,
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
+
"score": 0.07977207977207977,
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.3180428134556575,
|
307 |
+
"f1_Organization": 0.2747603833865815,
|
308 |
+
"f1_Location": 0.22950819672131148,
|
309 |
+
"f1_macro": 0.27410379785451683,
|
310 |
+
"recall_macro": 0.23183858884648992,
|
311 |
+
"precision_macro": 0.3454658738569018,
|
312 |
+
"in_classes_support": 0.5302806499261448,
|
313 |
+
"f1_micro": 0.20465890183028285,
|
314 |
+
"recall_micro": 0.2342857142857143,
|
315 |
+
"precision_micro": 0.18168389955686853,
|
316 |
+
"score": 0.20465890183028285,
|
317 |
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.1623107942071267,
|
319 |
+
"score_ci_high": 0.24302164655950423,
|
320 |
+
"f1_micro_ci_low": 0.1623107942071267,
|
321 |
+
"f1_micro_ci_high": 0.24302164655950423
|
322 |
},
|
323 |
+
"score": 0.20465890183028285,
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
|
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.1267605633802817,
|
340 |
+
"accuracy_ci_low": 0.056338028169014086,
|
341 |
+
"accuracy_ci_high": 0.2112676056338028,
|
342 |
"score_name": "accuracy",
|
343 |
+
"score": 0.1267605633802817,
|
344 |
+
"score_ci_high": 0.2112676056338028,
|
345 |
+
"score_ci_low": 0.056338028169014086,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
"accuracy": 0.08450704225352113,
|
350 |
+
"accuracy_ci_low": 0.04225352112676056,
|
351 |
+
"accuracy_ci_high": 0.16901408450704225,
|
352 |
"score_name": "accuracy",
|
353 |
"score": 0.08450704225352113,
|
354 |
+
"score_ci_high": 0.16901408450704225,
|
355 |
+
"score_ci_low": 0.04225352112676056,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.4084507042253521,
|
360 |
+
"accuracy_ci_low": 0.30985915492957744,
|
361 |
+
"accuracy_ci_high": 0.5211267605633803,
|
362 |
"score_name": "accuracy",
|
363 |
+
"score": 0.4084507042253521,
|
364 |
+
"score_ci_high": 0.5211267605633803,
|
365 |
+
"score_ci_low": 0.30985915492957744,
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.5211267605633803,
|
370 |
+
"accuracy_ci_low": 0.39436619718309857,
|
371 |
+
"accuracy_ci_high": 0.6338028169014085,
|
372 |
"score_name": "accuracy",
|
373 |
+
"score": 0.5211267605633803,
|
374 |
+
"score_ci_high": 0.6338028169014085,
|
375 |
+
"score_ci_low": 0.39436619718309857,
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
|
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.3380281690140845,
|
390 |
+
"accuracy_ci_low": 0.22535211267605634,
|
391 |
+
"accuracy_ci_high": 0.4507042253521127,
|
392 |
"score_name": "accuracy",
|
393 |
+
"score": 0.3380281690140845,
|
394 |
+
"score_ci_high": 0.4507042253521127,
|
395 |
+
"score_ci_low": 0.22535211267605634,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.4788732394366197,
|
400 |
"accuracy_ci_low": 0.36619718309859156,
|
401 |
+
"accuracy_ci_high": 0.5915492957746479,
|
402 |
"score_name": "accuracy",
|
403 |
+
"score": 0.4788732394366197,
|
404 |
+
"score_ci_high": 0.5915492957746479,
|
405 |
"score_ci_low": 0.36619718309859156,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.36619718309859156,
|
410 |
+
"accuracy_ci_low": 0.2676056338028169,
|
411 |
+
"accuracy_ci_high": 0.4788732394366197,
|
412 |
"score_name": "accuracy",
|
413 |
+
"score": 0.36619718309859156,
|
414 |
+
"score_ci_high": 0.4788732394366197,
|
415 |
+
"score_ci_low": 0.2676056338028169,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.08450704225352113,
|
420 |
+
"accuracy_ci_low": 0.028169014084507043,
|
421 |
+
"accuracy_ci_high": 0.16901408450704225,
|
422 |
"score_name": "accuracy",
|
423 |
+
"score": 0.08450704225352113,
|
424 |
+
"score_ci_high": 0.16901408450704225,
|
425 |
+
"score_ci_low": 0.028169014084507043,
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.323943661971831,
|
430 |
+
"accuracy_ci_low": 0.22535211267605634,
|
431 |
+
"accuracy_ci_high": 0.4397440034897243,
|
432 |
"score_name": "accuracy",
|
433 |
+
"score": 0.323943661971831,
|
434 |
+
"score_ci_high": 0.4397440034897243,
|
435 |
+
"score_ci_low": 0.22535211267605634,
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.4225352112676056,
|
440 |
+
"accuracy_ci_low": 0.30985915492957744,
|
441 |
+
"accuracy_ci_high": 0.5492957746478874,
|
442 |
"score_name": "accuracy",
|
443 |
+
"score": 0.4225352112676056,
|
444 |
+
"score_ci_high": 0.5492957746478874,
|
445 |
+
"score_ci_low": 0.30985915492957744,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
"accuracy": 0.18309859154929578,
|
450 |
"accuracy_ci_low": 0.09859154929577464,
|
451 |
+
"accuracy_ci_high": 0.28910654360361887,
|
452 |
"score_name": "accuracy",
|
453 |
"score": 0.18309859154929578,
|
454 |
+
"score_ci_high": 0.28910654360361887,
|
455 |
"score_ci_low": 0.09859154929577464,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.5633802816901409,
|
460 |
+
"accuracy_ci_low": 0.4507042253521127,
|
461 |
+
"accuracy_ci_high": 0.676056338028169,
|
462 |
"score_name": "accuracy",
|
463 |
+
"score": 0.5633802816901409,
|
464 |
+
"score_ci_high": 0.676056338028169,
|
465 |
+
"score_ci_low": 0.4507042253521127,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
+
"score": 0.3289738430583501,
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.09605263157894736,
|
475 |
"f1_suggestive": 0.0,
|
476 |
"f1_generic": 0.0,
|
477 |
+
"f1_descriptive": 0.375,
|
478 |
"f1_fanciful": 0.10526315789473684,
|
|
|
479 |
"f1_arbitrary": 0.0,
|
480 |
+
"f1_macro_ci_low": 0.048484848484848485,
|
481 |
+
"f1_macro_ci_high": 0.1610036081002675,
|
482 |
"score_name": "f1_micro",
|
483 |
+
"score": 0.13333333333333333,
|
484 |
+
"score_ci_high": 0.24299065420560748,
|
485 |
+
"score_ci_low": 0.05825242718446602,
|
486 |
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.08235294117647059,
|
488 |
+
"accuracy_ci_low": 0.03529411764705882,
|
489 |
+
"accuracy_ci_high": 0.15294117647058825,
|
490 |
+
"f1_micro": 0.13333333333333333,
|
491 |
+
"f1_micro_ci_low": 0.05825242718446602,
|
492 |
+
"f1_micro_ci_high": 0.24299065420560748
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.46628060735944554,
|
496 |
+
"f1_no": 0.5809128630705395,
|
497 |
+
"f1_yes": 0.3516483516483517,
|
498 |
+
"f1_macro_ci_low": 0.3928270444081002,
|
499 |
+
"f1_macro_ci_high": 0.5415825326375369,
|
500 |
"score_name": "f1_micro",
|
501 |
+
"score": 0.5180722891566265,
|
502 |
+
"score_ci_high": 0.58253132966529,
|
503 |
+
"score_ci_low": 0.44652531947540486,
|
504 |
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.43,
|
506 |
+
"accuracy_ci_low": 0.365,
|
507 |
+
"accuracy_ci_high": 0.495,
|
508 |
+
"f1_micro": 0.5180722891566265,
|
509 |
+
"f1_micro_ci_low": 0.44652531947540486,
|
510 |
+
"f1_micro_ci_high": 0.58253132966529
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.09994226986403168,
|
514 |
"f1_conclusion": 0.0,
|
515 |
+
"f1_decree": 0.14814814814814814,
|
516 |
+
"f1_issue": 0.05714285714285714,
|
517 |
+
"f1_analysis": 0.3076923076923077,
|
518 |
+
"f1_facts": 0.06896551724137931,
|
519 |
"f1_procedural history": 0.0,
|
520 |
+
"f1_rule": 0.11764705882352941,
|
521 |
+
"f1_macro_ci_low": 0.06467882036635723,
|
522 |
+
"f1_macro_ci_high": 0.16271896227970067,
|
523 |
"score_name": "f1_micro",
|
524 |
+
"score": 0.1222707423580786,
|
525 |
+
"score_ci_high": 0.1896551724137931,
|
526 |
+
"score_ci_low": 0.07144817486457739,
|
527 |
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.07,
|
529 |
+
"accuracy_ci_low": 0.04,
|
530 |
+
"accuracy_ci_high": 0.11032816661500704,
|
531 |
+
"f1_micro": 0.1222707423580786,
|
532 |
+
"f1_micro_ci_low": 0.07144817486457739,
|
533 |
+
"f1_micro_ci_high": 0.1896551724137931
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.3036750483558994,
|
537 |
+
"f1_yes": 0.18181818181818182,
|
538 |
+
"f1_no": 0.425531914893617,
|
539 |
+
"f1_macro_ci_low": 0.24346443663418135,
|
540 |
+
"f1_macro_ci_high": 0.37985983397043105,
|
541 |
"score_name": "f1_micro",
|
542 |
+
"score": 0.33557046979865773,
|
543 |
+
"score_ci_high": 0.41208424597764826,
|
544 |
+
"score_ci_low": 0.26697141622873294,
|
545 |
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.25,
|
547 |
+
"accuracy_ci_low": 0.2,
|
548 |
"accuracy_ci_high": 0.315,
|
549 |
+
"f1_micro": 0.33557046979865773,
|
550 |
+
"f1_micro_ci_low": 0.26697141622873294,
|
551 |
+
"f1_micro_ci_high": 0.41208424597764826
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.8285413216920066,
|
555 |
+
"f1_yes": 0.821917808219178,
|
556 |
+
"f1_no": 0.8351648351648352,
|
557 |
+
"f1_macro_ci_low": 0.7370610449522892,
|
558 |
+
"f1_macro_ci_high": 0.8925462752093225,
|
559 |
"score_name": "f1_micro",
|
560 |
+
"score": 0.8292682926829268,
|
561 |
+
"score_ci_high": 0.891566265060241,
|
562 |
+
"score_ci_low": 0.7393939393939394,
|
563 |
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.8,
|
565 |
+
"accuracy_ci_low": 0.7058823529411765,
|
566 |
+
"accuracy_ci_high": 0.8705882352941177,
|
567 |
+
"f1_micro": 0.8292682926829268,
|
568 |
+
"f1_micro_ci_low": 0.7393939393939394,
|
569 |
+
"f1_micro_ci_high": 0.891566265060241
|
570 |
},
|
571 |
+
"score": 0.3877030254659246,
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.2883227310464808,
|
578 |
+
"f1_cars": 0.45454545454545453,
|
579 |
"f1_windows x": 0.0,
|
580 |
+
"f1_atheism": 0.1,
|
581 |
+
"f1_religion": 0.11594202898550725,
|
582 |
+
"f1_medicine": 0.5161290322580645,
|
583 |
+
"f1_christianity": 0.2,
|
584 |
+
"f1_computer graphics": 0.24615384615384617,
|
585 |
+
"f1_microsoft windows": 0.2,
|
586 |
+
"f1_middle east": 0.23529411764705882,
|
587 |
+
"f1_motorcycles": 0.43902439024390244,
|
588 |
+
"f1_pc hardware": 0.38095238095238093,
|
|
|
589 |
"f1_mac hardware": 0.14285714285714285,
|
590 |
+
"f1_for sale": 0.2127659574468085,
|
591 |
+
"f1_guns": 0.04,
|
592 |
+
"f1_space": 0.4166666666666667,
|
593 |
+
"f1_cryptography": 0.36065573770491804,
|
594 |
+
"f1_baseball": 0.37333333333333335,
|
595 |
+
"f1_hockey": 0.6060606060606061,
|
596 |
+
"f1_politics": 0.23376623376623376,
|
597 |
+
"f1_electronics": 0.49230769230769234,
|
598 |
+
"f1_macro_ci_low": 0.2617609426183389,
|
599 |
+
"f1_macro_ci_high": 0.31709395962356884,
|
600 |
"score_name": "f1_micro",
|
601 |
+
"score": 0.30918595967139656,
|
602 |
+
"score_ci_high": 0.3396825281712499,
|
603 |
+
"score_ci_low": 0.2773193184164219,
|
604 |
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.207,
|
606 |
+
"accuracy_ci_low": 0.183,
|
607 |
+
"accuracy_ci_high": 0.231,
|
608 |
+
"f1_micro": 0.30918595967139656,
|
609 |
+
"f1_micro_ci_low": 0.2773193184164219,
|
610 |
+
"f1_micro_ci_high": 0.3396825281712499
|
611 |
},
|
612 |
+
"score": 0.30918595967139656,
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.4658504084132011,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.6103646833013435,
|
620 |
+
"f1_checking or savings account": 0.43243243243243246,
|
621 |
+
"f1_debt collection": 0.35294117647058826,
|
622 |
+
"f1_credit card or prepaid card": 0.4883720930232558,
|
623 |
+
"f1_money transfer or virtual currency or money service": 0.48,
|
624 |
+
"f1_mortgage": 0.6885245901639344,
|
625 |
+
"f1_payday loan or title loan or personal loan": 0.16666666666666666,
|
626 |
+
"f1_student loan": 0.5217391304347826,
|
627 |
+
"f1_vehicle loan or lease": 0.45161290322580644,
|
628 |
+
"f1_macro_ci_low": 0.4151030067042806,
|
629 |
+
"f1_macro_ci_high": 0.5374380194007881,
|
630 |
"score_name": "f1_micro",
|
631 |
+
"score": 0.5647530040053405,
|
632 |
+
"score_ci_high": 0.5946299934512115,
|
633 |
+
"score_ci_low": 0.5324708819498815,
|
634 |
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.423,
|
636 |
+
"accuracy_ci_low": 0.392,
|
637 |
+
"accuracy_ci_high": 0.4538616190423828,
|
638 |
+
"f1_micro": 0.5647530040053405,
|
639 |
+
"f1_micro_ci_low": 0.5324708819498815,
|
640 |
+
"f1_micro_ci_high": 0.5946299934512115
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.6154980967666208,
|
644 |
+
"f1_mortgages and loans": 0.6794871794871795,
|
645 |
+
"f1_credit card": 0.6394557823129252,
|
646 |
+
"f1_debt collection": 0.5405405405405406,
|
647 |
+
"f1_credit reporting": 0.6307053941908713,
|
648 |
+
"f1_retail banking": 0.5873015873015873,
|
649 |
+
"f1_macro_ci_low": 0.5757444877146287,
|
650 |
+
"f1_macro_ci_high": 0.6632621576134956,
|
651 |
"score_name": "f1_micro",
|
652 |
+
"score": 0.6152046783625731,
|
653 |
+
"score_ci_high": 0.6592588246755606,
|
654 |
+
"score_ci_low": 0.5758293838862559,
|
655 |
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.526,
|
657 |
+
"accuracy_ci_low": 0.488,
|
658 |
+
"accuracy_ci_high": 0.572065074842346,
|
659 |
+
"f1_micro": 0.6152046783625731,
|
660 |
+
"f1_micro_ci_low": 0.5758293838862559,
|
661 |
+
"f1_micro_ci_high": 0.6592588246755606
|
662 |
},
|
663 |
+
"score": 0.5899788411839568,
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.03,
|
671 |
+
"score": 0.03,
|
|
|
672 |
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.03,
|
674 |
+
"program_accuracy_ci_low": 0.02,
|
675 |
+
"program_accuracy_ci_high": 0.041,
|
676 |
+
"score_ci_low": 0.02,
|
677 |
+
"score_ci_high": 0.041,
|
678 |
+
"execution_accuracy_ci_low": 0.02,
|
679 |
+
"execution_accuracy_ci_high": 0.042
|
680 |
},
|
681 |
+
"score": 0.03,
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.27959949741292506,
|
688 |
+
"recall": 0.5616895325224012,
|
689 |
+
"f1": 0.3093936462410963,
|
690 |
+
"precision_ci_low": 0.26037333098993004,
|
691 |
+
"precision_ci_high": 0.2998496234035471,
|
692 |
+
"recall_ci_low": 0.545189324623663,
|
693 |
+
"recall_ci_high": 0.5772522419606227,
|
694 |
+
"f1_ci_low": 0.29233863350762596,
|
695 |
+
"f1_ci_high": 0.3260584980677657,
|
696 |
"score_name": "f1",
|
697 |
+
"score": 0.3093936462410963,
|
698 |
+
"score_ci_high": 0.3260584980677657,
|
699 |
+
"score_ci_low": 0.29233863350762596,
|
700 |
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.5708662017683188,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6597496373951435,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5238148674865564,
|
704 |
+
"faithfullness_f1_token_overlap": 0.3063464322583528,
|
705 |
+
"faithfullness_recall_token_overlap": 0.2515040891461165,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5201254248503554,
|
707 |
+
"correctness_f1_token_overlap": 0.3093936462410963,
|
708 |
+
"correctness_recall_token_overlap": 0.5616895325224012,
|
709 |
+
"correctness_precision_token_overlap": 0.27959949741292506
|
710 |
},
|
711 |
+
"score": 0.3093936462410963,
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
+
"accuracy": 0.572,
|
718 |
+
"accuracy_ci_low": 0.5408695080796895,
|
719 |
+
"accuracy_ci_high": 0.602,
|
720 |
"score_name": "accuracy",
|
721 |
+
"score": 0.572,
|
722 |
+
"score_ci_high": 0.602,
|
723 |
+
"score_ci_low": 0.5408695080796895,
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
+
"accuracy": 0.878,
|
728 |
+
"accuracy_ci_low": 0.846,
|
729 |
+
"accuracy_ci_high": 0.9048842512733182,
|
730 |
"score_name": "accuracy",
|
731 |
+
"score": 0.878,
|
732 |
+
"score_ci_high": 0.9048842512733182,
|
733 |
+
"score_ci_low": 0.846,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
+
"score": 0.725,
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
+
"severity_critical": 2.0,
|
743 |
+
"severity_high": 2.0,
|
744 |
+
"severity_medium": 1.0,
|
745 |
+
"severity_low": 95.0,
|
746 |
+
"category_harmful_info": 0.8059234746569206,
|
747 |
+
"category_deception": 0.7821050891370483,
|
748 |
+
"category_explicit_content": 0.8131378136499965,
|
749 |
+
"category_violence": 0.7824397089225905,
|
750 |
+
"category_discrimination": 0.7646755935653806,
|
751 |
+
"category_substance_abuse": 0.7601792050732505,
|
752 |
+
"category_pii": 0.7583640239856861,
|
753 |
+
"safety": 0.7892763628562292,
|
754 |
+
"safety_ci_low": 0.7568372385512269,
|
755 |
+
"safety_ci_high": 0.8199271972816702,
|
756 |
"score_name": "safety",
|
757 |
+
"score": 0.7892763628562292,
|
758 |
+
"score_ci_high": 0.8199271972816702,
|
759 |
+
"score_ci_low": 0.7568372385512269,
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
+
"score": 0.7892763628562292,
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
+
"rougeL": 0.26029440810817217,
|
770 |
+
"score": 0.26029440810817217,
|
771 |
"score_name": "rougeL",
|
772 |
+
"rouge1": 0.37812164542305304,
|
773 |
+
"rougeLsum": 0.3224592121117338,
|
774 |
+
"rouge2": 0.1735255109874066,
|
775 |
+
"rougeL_ci_low": 0.25326019704193026,
|
776 |
+
"rougeL_ci_high": 0.2676771085873056,
|
777 |
+
"score_ci_low": 0.25326019704193026,
|
778 |
+
"score_ci_high": 0.2676771085873056,
|
779 |
+
"rouge1_ci_low": 0.36799150913690026,
|
780 |
+
"rouge1_ci_high": 0.3873907236511595,
|
781 |
+
"rougeLsum_ci_low": 0.3130244535911371,
|
782 |
+
"rougeLsum_ci_high": 0.33132370273160416,
|
783 |
+
"rouge2_ci_low": 0.16695272546253506,
|
784 |
+
"rouge2_ci_high": 0.18091296094174728
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
+
"rougeL": 0.09390373141967649,
|
789 |
+
"score": 0.09390373141967649,
|
790 |
"score_name": "rougeL",
|
791 |
+
"rouge1": 0.12667295475773943,
|
792 |
+
"rougeLsum": 0.10750099464975159,
|
793 |
+
"rouge2": 0.01898752670157484,
|
794 |
+
"rougeL_ci_low": 0.08937719185127607,
|
795 |
+
"rougeL_ci_high": 0.09832477865928776,
|
796 |
+
"score_ci_low": 0.08937719185127607,
|
797 |
+
"score_ci_high": 0.09832477865928776,
|
798 |
+
"rouge1_ci_low": 0.12055496978215834,
|
799 |
+
"rouge1_ci_high": 0.13195192205981862,
|
800 |
+
"rougeLsum_ci_low": 0.10257751488972287,
|
801 |
+
"rougeLsum_ci_high": 0.11236007191539416,
|
802 |
+
"rouge2_ci_low": 0.01673170907001085,
|
803 |
+
"rouge2_ci_high": 0.02151839340904419
|
804 |
},
|
805 |
+
"score": 0.17709906976392434,
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
+
1176,
|
814 |
+
681,
|
815 |
+
439,
|
816 |
+
289
|
817 |
],
|
818 |
"totals": [
|
819 |
+
1805,
|
820 |
+
1739,
|
821 |
+
1673,
|
822 |
+
1607
|
823 |
],
|
824 |
"precisions": [
|
825 |
+
0.6515235457063712,
|
826 |
+
0.39160437032777456,
|
827 |
+
0.26240286909742977,
|
828 |
+
0.17983820784069696
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
+
"sys_len": 1805,
|
832 |
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.33125088532286445,
|
834 |
+
"score": 0.33125088532286445,
|
835 |
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.28827207319430564,
|
837 |
+
"score_ci_high": 0.3750069277361591,
|
838 |
+
"sacrebleu_ci_low": 0.28827207319430564,
|
839 |
+
"sacrebleu_ci_high": 0.3750069277361591
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
+
1224,
|
845 |
+
746,
|
846 |
+
495,
|
847 |
+
332
|
848 |
],
|
849 |
"totals": [
|
850 |
+
1783,
|
851 |
+
1717,
|
852 |
+
1651,
|
853 |
+
1585
|
854 |
],
|
855 |
"precisions": [
|
856 |
+
0.6864834548513741,
|
857 |
+
0.4344787419918463,
|
858 |
+
0.29981829194427617,
|
859 |
+
0.20946372239747635
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
+
"sys_len": 1783,
|
863 |
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.36994859939150276,
|
865 |
+
"score": 0.36994859939150276,
|
866 |
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.3148788426694038,
|
868 |
+
"score_ci_high": 0.4105592207294174,
|
869 |
+
"sacrebleu_ci_low": 0.3148788426694038,
|
870 |
+
"sacrebleu_ci_high": 0.4105592207294174
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
+
215,
|
876 |
+
39,
|
877 |
+
18,
|
878 |
+
6
|
879 |
],
|
880 |
"totals": [
|
881 |
+
3272,
|
882 |
+
3206,
|
883 |
+
3140,
|
884 |
+
3074
|
885 |
],
|
886 |
"precisions": [
|
887 |
+
0.06570904645476773,
|
888 |
+
0.012164691203992516,
|
889 |
+
0.005732484076433121,
|
890 |
+
0.001951854261548471
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
+
"sys_len": 3272,
|
894 |
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.009724765205835872,
|
896 |
+
"score": 0.009724765205835872,
|
897 |
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.00598430838144436,
|
899 |
+
"score_ci_high": 0.015966807436499916,
|
900 |
+
"sacrebleu_ci_low": 0.00598430838144436,
|
901 |
+
"sacrebleu_ci_high": 0.015966807436499916
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
+
1112,
|
907 |
+
617,
|
908 |
+
379,
|
909 |
+
249
|
910 |
],
|
911 |
"totals": [
|
912 |
+
1879,
|
913 |
+
1813,
|
914 |
+
1747,
|
915 |
+
1681
|
916 |
],
|
917 |
"precisions": [
|
918 |
+
0.5918041511442257,
|
919 |
+
0.34031991174848314,
|
920 |
+
0.21694333142530053,
|
921 |
+
0.14812611540749554
|
922 |
],
|
923 |
+
"bp": 1.0,
|
924 |
+
"sys_len": 1879,
|
925 |
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.283635656706225,
|
927 |
+
"score": 0.283635656706225,
|
928 |
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.25483093745296115,
|
930 |
+
"score_ci_high": 0.3284369566535284,
|
931 |
+
"sacrebleu_ci_low": 0.25483093745296115,
|
932 |
+
"sacrebleu_ci_high": 0.3284369566535284
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
+
1422,
|
938 |
+
990,
|
939 |
+
740,
|
940 |
+
568
|
941 |
],
|
942 |
"totals": [
|
943 |
+
2012,
|
944 |
+
1946,
|
945 |
+
1880,
|
946 |
+
1814
|
947 |
],
|
948 |
"precisions": [
|
949 |
+
0.7067594433399603,
|
950 |
+
0.5087358684480986,
|
951 |
+
0.39361702127659576,
|
952 |
+
0.3131201764057332
|
953 |
],
|
954 |
+
"bp": 0.9725507672852267,
|
955 |
+
"sys_len": 2012,
|
956 |
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.44622048751232035,
|
958 |
+
"score": 0.44622048751232035,
|
959 |
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.4016486544135287,
|
961 |
+
"score_ci_high": 0.492468721152314,
|
962 |
+
"sacrebleu_ci_low": 0.4016486544135287,
|
963 |
+
"sacrebleu_ci_high": 0.492468721152314
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
+
983,
|
969 |
+
445,
|
970 |
+
221,
|
971 |
+
116
|
972 |
],
|
973 |
"totals": [
|
974 |
+
2522,
|
975 |
+
2456,
|
976 |
+
2390,
|
977 |
+
2324
|
978 |
],
|
979 |
"precisions": [
|
980 |
+
0.3897700237906423,
|
981 |
+
0.18118892508143322,
|
982 |
+
0.09246861924686192,
|
983 |
+
0.04991394148020654
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
+
"sys_len": 2522,
|
987 |
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.13436590468744522,
|
989 |
+
"score": 0.13436590468744522,
|
990 |
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.10668449271679148,
|
992 |
+
"score_ci_high": 0.16331801147285638,
|
993 |
+
"sacrebleu_ci_low": 0.10668449271679148,
|
994 |
+
"sacrebleu_ci_high": 0.16331801147285638
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
+
1397,
|
1000 |
+
976,
|
1001 |
+
718,
|
1002 |
+
534
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
+
1885,
|
1006 |
+
1819,
|
1007 |
+
1753,
|
1008 |
+
1687
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
+
0.7411140583554376,
|
1012 |
+
0.536558548653106,
|
1013 |
+
0.40958357102110665,
|
1014 |
+
0.31653823355068167
|
1015 |
],
|
1016 |
+
"bp": 0.9836888676493653,
|
1017 |
+
"sys_len": 1885,
|
1018 |
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.4687329402986153,
|
1020 |
+
"score": 0.4687329402986153,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.4090027394886287,
|
1023 |
+
"score_ci_high": 0.5072542898902915,
|
1024 |
+
"sacrebleu_ci_low": 0.4090027394886287,
|
1025 |
+
"sacrebleu_ci_high": 0.5072542898902915
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
1075,
|
1031 |
+
589,
|
1032 |
+
357,
|
1033 |
+
218
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
+
2037,
|
1037 |
+
1971,
|
1038 |
+
1905,
|
1039 |
+
1839
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
+
0.5277368679430535,
|
1043 |
+
0.29883307965499745,
|
1044 |
+
0.1874015748031496,
|
1045 |
+
0.11854268624252312
|
1046 |
],
|
1047 |
+
"bp": 1.0,
|
1048 |
+
"sys_len": 2037,
|
1049 |
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.24328959002445089,
|
1051 |
+
"score": 0.24328959002445089,
|
1052 |
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.2108781721763864,
|
1054 |
+
"score_ci_high": 0.2866413776406142,
|
1055 |
+
"sacrebleu_ci_low": 0.2108781721763864,
|
1056 |
+
"sacrebleu_ci_high": 0.2866413776406142
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
+
1223,
|
1062 |
+
645,
|
1063 |
+
371,
|
1064 |
+
219
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
+
2012,
|
1068 |
+
1946,
|
1069 |
+
1880,
|
1070 |
+
1814
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
+
0.6078528827037774,
|
1074 |
+
0.3314491264131552,
|
1075 |
+
0.1973404255319149,
|
1076 |
+
0.12072767364939362
|
1077 |
],
|
1078 |
+
"bp": 0.9581570887075945,
|
1079 |
+
"sys_len": 2012,
|
1080 |
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.25220069551672647,
|
1082 |
+
"score": 0.25220069551672647,
|
1083 |
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.21455240851266189,
|
1085 |
+
"score_ci_high": 0.28127381859222883,
|
1086 |
+
"sacrebleu_ci_low": 0.21455240851266189,
|
1087 |
+
"sacrebleu_ci_high": 0.28127381859222883
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
+
1265,
|
1093 |
+
811,
|
1094 |
564,
|
1095 |
+
403
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
+
1799,
|
1099 |
+
1733,
|
1100 |
+
1667,
|
1101 |
+
1601
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
+
0.7031684269038355,
|
1105 |
+
0.4679746105020196,
|
1106 |
+
0.33833233353329334,
|
1107 |
+
0.2517176764522173
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
+
"sys_len": 1799,
|
1111 |
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.40915203730030597,
|
1113 |
+
"score": 0.40915203730030597,
|
1114 |
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.37740759275198177,
|
1116 |
+
"score_ci_high": 0.45339812071532676,
|
1117 |
+
"sacrebleu_ci_low": 0.37740759275198177,
|
1118 |
+
"sacrebleu_ci_high": 0.45339812071532676
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
+
1032,
|
1124 |
+
484,
|
1125 |
+
292,
|
1126 |
+
181
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
+
1873,
|
1130 |
+
1807,
|
1131 |
+
1741,
|
1132 |
+
1675
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
+
0.5509877202349173,
|
1136 |
+
0.26784726065301606,
|
1137 |
+
0.16771970132107986,
|
1138 |
+
0.10805970149253731
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
+
"sys_len": 1873,
|
1142 |
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.22741507142824863,
|
1144 |
+
"score": 0.22741507142824863,
|
1145 |
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.19114102819495998,
|
1147 |
+
"score_ci_high": 0.26211626317047076,
|
1148 |
+
"sacrebleu_ci_low": 0.19114102819495998,
|
1149 |
+
"sacrebleu_ci_high": 0.26211626317047076
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
+
1019,
|
1155 |
+
503,
|
1156 |
+
310,
|
1157 |
+
204
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
+
1801,
|
1161 |
+
1735,
|
1162 |
+
1669,
|
1163 |
+
1603
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
+
0.5657967795669073,
|
1167 |
+
0.28991354466858793,
|
1168 |
+
0.18573996405032955,
|
1169 |
+
0.1272613849033063
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
+
"sys_len": 1801,
|
1173 |
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.24953573320283845,
|
1175 |
+
"score": 0.24953573320283845,
|
1176 |
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.20393561832427076,
|
1178 |
+
"score_ci_high": 0.27875367055825584,
|
1179 |
+
"sacrebleu_ci_low": 0.20393561832427076,
|
1180 |
+
"sacrebleu_ci_high": 0.27875367055825584
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
+
1285,
|
1186 |
864,
|
1187 |
+
625,
|
1188 |
463
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
+
1899,
|
1192 |
+
1833,
|
1193 |
+
1767,
|
1194 |
+
1701
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
+
0.6766719325961031,
|
1198 |
+
0.4713584288052373,
|
1199 |
+
0.35370684776457273,
|
1200 |
+
0.2721928277483833
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
+
"sys_len": 1899,
|
1204 |
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.4186126965889709,
|
1206 |
+
"score": 0.4186126965889709,
|
1207 |
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.3580478494686714,
|
1209 |
+
"score_ci_high": 0.47721854835474,
|
1210 |
+
"sacrebleu_ci_low": 0.3580478494686714,
|
1211 |
+
"sacrebleu_ci_high": 0.47721854835474
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
+
1286,
|
1217 |
+
859,
|
1218 |
+
598,
|
1219 |
+
417
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
+
1799,
|
1223 |
+
1733,
|
1224 |
+
1667,
|
1225 |
+
1601
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
+
0.7148415786548082,
|
1229 |
+
0.4956722446624351,
|
1230 |
+
0.3587282543491302,
|
1231 |
+
0.26046221111805123
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
+
"sys_len": 1799,
|
1235 |
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.42655857647405626,
|
1237 |
+
"score": 0.42655857647405626,
|
1238 |
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.37843554301324095,
|
1240 |
+
"score_ci_high": 0.4676738192216758,
|
1241 |
+
"sacrebleu_ci_low": 0.37843554301324095,
|
1242 |
+
"sacrebleu_ci_high": 0.4676738192216758
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
+
1178,
|
1248 |
+
657,
|
1249 |
+
409,
|
1250 |
+
266
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
+
1874,
|
1254 |
+
1808,
|
1255 |
+
1742,
|
1256 |
+
1676
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
+
0.6286019210245464,
|
1260 |
+
0.36338495575221236,
|
1261 |
+
0.23478760045924227,
|
1262 |
+
0.15871121718377088
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
+
"sys_len": 1874,
|
1266 |
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.3037430550349969,
|
1268 |
+
"score": 0.3037430550349969,
|
1269 |
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.26128349825222114,
|
1271 |
+
"score_ci_high": 0.3531214265467947,
|
1272 |
+
"sacrebleu_ci_low": 0.26128349825222114,
|
1273 |
+
"sacrebleu_ci_high": 0.3531214265467947
|
1274 |
},
|
1275 |
+
"score": 0.3049591129796936,
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
+
"score": 0.38015857299104155,
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
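All of these evaluation-results files share the same layout: a top-level "results" object whose named subsets (e.g. "bias", "reasoning", "translation") each carry a "score", "score_name" and "num_of_instances", with the same three fields repeated at the subsets-mean level. A minimal sketch for reading those aggregates from a local checkout (the path below is just one of the files renamed in this commit; any of the results files works the same way):

```python
import json

# Example path from this commit; adjust to whichever results file you want to inspect.
path = "results/bluebench/2025-06-23T14-18-29_evaluation_results.json"

with open(path) as f:
    data = json.load(f)

results = data["results"]
# "results" mixes subset dicts (e.g. "bias", "reasoning") with the aggregate
# "score" / "score_name" / "num_of_instances" fields at the same level.
print(f"overall: {results['score']:.4f} "
      f"({results['score_name']}, n={results['num_of_instances']})")
for name, subset in results.items():
    if isinstance(subset, dict):
        print(f"  {name}: {subset['score']:.4f} "
              f"over {subset['num_of_instances']} instances")
```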
results/bluebench/{2025-06-21T09-36-54_evaluation_results.json → 2025-06-23T14-18-29_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
-
"timestamp_utc": "2025-06-
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
@@ -42,7 +42,7 @@
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
-
"unitxt_commit_hash": "
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
@@ -176,13 +176,13 @@
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
-
"accuracy": 0.
|
180 |
-
"accuracy_ci_low": 0.
|
181 |
"accuracy_ci_high": 0.8555555555555555,
|
182 |
"score_name": "accuracy",
|
183 |
-
"score": 0.
|
184 |
"score_ci_high": 0.8555555555555555,
|
185 |
-
"score_ci_low": 0.
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
@@ -216,13 +216,13 @@
|
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
-
"accuracy":
|
220 |
-
"accuracy_ci_low":
|
221 |
"accuracy_ci_high": 1.0,
|
222 |
"score_name": "accuracy",
|
223 |
-
"score":
|
224 |
"score_ci_high": 1.0,
|
225 |
-
"score_ci_low":
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
@@ -276,51 +276,51 @@
|
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
-
"accuracy": 0.
|
280 |
-
"accuracy_ci_low": 0.
|
281 |
-
"accuracy_ci_high": 0.
|
282 |
"score_name": "accuracy",
|
283 |
-
"score": 0.
|
284 |
-
"score_ci_high": 0.
|
285 |
-
"score_ci_low": 0.
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
-
"score": 0.
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
296 |
-
"score": 0.
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
-
"score": 0.
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
-
"f1_Person": 0.
|
307 |
-
"f1_Organization": 0.
|
308 |
-
"f1_Location": 0.
|
309 |
-
"f1_macro": 0.
|
310 |
-
"recall_macro": 0.
|
311 |
-
"precision_macro": 0.
|
312 |
-
"in_classes_support": 0.
|
313 |
-
"f1_micro": 0.
|
314 |
-
"recall_micro": 0.
|
315 |
-
"precision_micro": 0.
|
316 |
-
"score": 0.
|
317 |
"score_name": "f1_micro",
|
318 |
-
"score_ci_low": 0.
|
319 |
-
"score_ci_high": 0.
|
320 |
-
"f1_micro_ci_low": 0.
|
321 |
-
"f1_micro_ci_high": 0.
|
322 |
},
|
323 |
-
"score": 0.
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
@@ -336,23 +336,23 @@
|
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
-
"accuracy": 0.
|
340 |
"accuracy_ci_low": 0.11267605633802817,
|
341 |
-
"accuracy_ci_high": 0.
|
342 |
"score_name": "accuracy",
|
343 |
-
"score": 0.
|
344 |
-
"score_ci_high": 0.
|
345 |
"score_ci_low": 0.11267605633802817,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
-
"accuracy": 0.
|
350 |
-
"accuracy_ci_low": 0.
|
351 |
-
"accuracy_ci_high": 0.
|
352 |
"score_name": "accuracy",
|
353 |
-
"score": 0.
|
354 |
-
"score_ci_high": 0.
|
355 |
-
"score_ci_low": 0.
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
@@ -377,52 +377,52 @@
|
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
"accuracy": 0.323943661971831,
|
380 |
-
"accuracy_ci_low": 0.
|
381 |
"accuracy_ci_high": 0.43661971830985913,
|
382 |
"score_name": "accuracy",
|
383 |
"score": 0.323943661971831,
|
384 |
"score_ci_high": 0.43661971830985913,
|
385 |
-
"score_ci_low": 0.
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
-
"accuracy": 0.
|
390 |
-
"accuracy_ci_low": 0.
|
391 |
-
"accuracy_ci_high": 0.
|
392 |
"score_name": "accuracy",
|
393 |
-
"score": 0.
|
394 |
-
"score_ci_high": 0.
|
395 |
-
"score_ci_low": 0.
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
-
"accuracy": 0.
|
400 |
-
"accuracy_ci_low": 0.
|
401 |
-
"accuracy_ci_high": 0.
|
402 |
"score_name": "accuracy",
|
403 |
-
"score": 0.
|
404 |
-
"score_ci_high": 0.
|
405 |
-
"score_ci_low": 0.
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
-
"accuracy": 0.
|
410 |
-
"accuracy_ci_low": 0.
|
411 |
"accuracy_ci_high": 0.6901408450704225,
|
412 |
"score_name": "accuracy",
|
413 |
-
"score": 0.
|
414 |
"score_ci_high": 0.6901408450704225,
|
415 |
-
"score_ci_low": 0.
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
"accuracy": 0.29577464788732394,
|
420 |
-
"accuracy_ci_low": 0.
|
421 |
-
"accuracy_ci_high": 0.
|
422 |
"score_name": "accuracy",
|
423 |
"score": 0.29577464788732394,
|
424 |
-
"score_ci_high": 0.
|
425 |
-
"score_ci_low": 0.
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
@@ -438,21 +438,21 @@
|
|
438 |
"mmlu_pro_philosophy": {
|
439 |
"accuracy": 0.5915492957746479,
|
440 |
"accuracy_ci_low": 0.4647887323943662,
|
441 |
-
"accuracy_ci_high": 0.
|
442 |
"score_name": "accuracy",
|
443 |
"score": 0.5915492957746479,
|
444 |
-
"score_ci_high": 0.
|
445 |
"score_ci_low": 0.4647887323943662,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
-
"accuracy": 0.
|
450 |
-
"accuracy_ci_low": 0.
|
451 |
-
"accuracy_ci_high": 0.
|
452 |
"score_name": "accuracy",
|
453 |
-
"score": 0.
|
454 |
-
"score_ci_high": 0.
|
455 |
-
"score_ci_low": 0.
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
@@ -465,90 +465,90 @@
|
|
465 |
"score_ci_low": 0.5915492957746479,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
-
"score": 0.
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
-
"f1_macro": 0.
|
475 |
"f1_suggestive": 0.5882352941176471,
|
476 |
"f1_generic": 0.72,
|
477 |
"f1_descriptive": 0.6818181818181818,
|
478 |
-
"f1_fanciful": 0.
|
479 |
-
"f1_arbitrary": 0.
|
480 |
-
"f1_macro_ci_low": 0.
|
481 |
-
"f1_macro_ci_high": 0.
|
482 |
"score_name": "f1_micro",
|
483 |
"score": 0.6547619047619048,
|
484 |
-
"score_ci_high": 0.
|
485 |
-
"score_ci_low": 0.
|
486 |
"num_of_instances": 85,
|
487 |
"accuracy": 0.6470588235294118,
|
488 |
"accuracy_ci_low": 0.5411764705882353,
|
489 |
"accuracy_ci_high": 0.7411764705882353,
|
490 |
"f1_micro": 0.6547619047619048,
|
491 |
-
"f1_micro_ci_low": 0.
|
492 |
-
"f1_micro_ci_high": 0.
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
-
"f1_macro": 0.
|
496 |
-
"f1_no": 0.
|
497 |
-
"f1_yes": 0.
|
498 |
-
"f1_macro_ci_low": 0.
|
499 |
-
"f1_macro_ci_high": 0.
|
500 |
"score_name": "f1_micro",
|
501 |
-
"score": 0.
|
502 |
-
"score_ci_high": 0.
|
503 |
-
"score_ci_low": 0.
|
504 |
"num_of_instances": 200,
|
505 |
-
"accuracy": 0.
|
506 |
-
"accuracy_ci_low": 0.
|
507 |
"accuracy_ci_high": 0.605,
|
508 |
-
"f1_micro": 0.
|
509 |
-
"f1_micro_ci_low": 0.
|
510 |
-
"f1_micro_ci_high": 0.
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
-
"f1_macro": 0.
|
514 |
-
"f1_conclusion": 0.
|
515 |
-
"f1_decree": 0.
|
516 |
"f1_issue": 0.18181818181818182,
|
517 |
-
"f1_analysis": 0.
|
518 |
-
"f1_facts": 0.
|
519 |
-
"f1_procedural history": 0.
|
520 |
-
"f1_rule": 0.
|
521 |
-
"f1_macro_ci_low": 0.
|
522 |
-
"f1_macro_ci_high": 0.
|
523 |
"score_name": "f1_micro",
|
524 |
-
"score": 0.
|
525 |
-
"score_ci_high": 0.
|
526 |
-
"score_ci_low": 0.
|
527 |
"num_of_instances": 200,
|
528 |
-
"accuracy": 0.
|
529 |
-
"accuracy_ci_low": 0.
|
530 |
-
"accuracy_ci_high": 0.
|
531 |
-
"f1_micro": 0.
|
532 |
-
"f1_micro_ci_low": 0.
|
533 |
-
"f1_micro_ci_high": 0.
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
-
"f1_macro": 0.
|
537 |
-
"f1_yes": 0.
|
538 |
-
"f1_no": 0.
|
539 |
-
"f1_macro_ci_low": 0.
|
540 |
-
"f1_macro_ci_high": 0.
|
541 |
"score_name": "f1_micro",
|
542 |
-
"score": 0.
|
543 |
-
"score_ci_high": 0.
|
544 |
-
"score_ci_low": 0.
|
545 |
"num_of_instances": 200,
|
546 |
-
"accuracy": 0.
|
547 |
-
"accuracy_ci_low": 0.
|
548 |
-
"accuracy_ci_high": 0.
|
549 |
-
"f1_micro": 0.
|
550 |
-
"f1_micro_ci_low": 0.
|
551 |
-
"f1_micro_ci_high": 0.
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
"f1_macro": 0.75,
|
@@ -568,172 +568,172 @@
|
|
568 |
"f1_micro_ci_low": 0.6573326079878734,
|
569 |
"f1_micro_ci_high": 0.8234882632928148
|
570 |
},
|
571 |
-
"score": 0.
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
-
"f1_macro": 0.
|
578 |
-
"f1_cars": 0.
|
579 |
-
"f1_windows x": 0.
|
580 |
-
"f1_computer graphics": 0.
|
581 |
-
"f1_atheism": 0.
|
582 |
-
"f1_religion": 0.
|
583 |
-
"f1_medicine": 0.
|
584 |
-
"f1_christianity": 0.
|
585 |
-
"f1_microsoft windows": 0.
|
586 |
-
"f1_middle east": 0.
|
587 |
-
"f1_motorcycles": 0.
|
588 |
-
"f1_pc hardware": 0.
|
589 |
-
"f1_mac hardware": 0.
|
590 |
-
"f1_electronics": 0.
|
591 |
-
"f1_for sale": 0.
|
592 |
-
"f1_guns": 0.
|
593 |
-
"f1_space": 0.
|
594 |
-
"f1_cryptography": 0.
|
595 |
-
"f1_baseball": 0.
|
596 |
"f1_politics": 0.3787878787878788,
|
597 |
-
"f1_hockey": 0.
|
598 |
-
"f1_macro_ci_low": 0.
|
599 |
-
"f1_macro_ci_high": 0.
|
600 |
"score_name": "f1_micro",
|
601 |
-
"score": 0.
|
602 |
-
"score_ci_high": 0.
|
603 |
-
"score_ci_low": 0.
|
604 |
"num_of_instances": 1000,
|
605 |
-
"accuracy": 0.
|
606 |
-
"accuracy_ci_low": 0.
|
607 |
-
"accuracy_ci_high": 0.
|
608 |
-
"f1_micro": 0.
|
609 |
-
"f1_micro_ci_low": 0.
|
610 |
-
"f1_micro_ci_high": 0.
|
611 |
},
|
612 |
-
"score": 0.
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
-
"f1_macro": 0.
|
619 |
-
"
|
620 |
-
"
|
621 |
-
"
|
622 |
-
"
|
623 |
-
"
|
624 |
-
"
|
625 |
-
"
|
626 |
-
"f1_money transfer or virtual currency or money service": 0.
|
627 |
-
"
|
628 |
-
"f1_macro_ci_low": 0.
|
629 |
-
"f1_macro_ci_high": 0.
|
630 |
"score_name": "f1_micro",
|
631 |
-
"score": 0.
|
632 |
-
"score_ci_high": 0.
|
633 |
-
"score_ci_low": 0.
|
634 |
"num_of_instances": 1000,
|
635 |
-
"accuracy": 0.
|
636 |
-
"accuracy_ci_low": 0.
|
637 |
-
"accuracy_ci_high": 0.
|
638 |
-
"f1_micro": 0.
|
639 |
-
"f1_micro_ci_low": 0.
|
640 |
-
"f1_micro_ci_high": 0.
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
-
"f1_macro": 0.
|
644 |
-
"f1_mortgages and loans": 0.
|
645 |
"f1_credit card": 0.7865168539325843,
|
646 |
-
"f1_debt collection": 0.
|
647 |
-
"f1_credit reporting": 0.
|
648 |
-
"f1_retail banking": 0.
|
649 |
-
"f1_macro_ci_low": 0.
|
650 |
-
"f1_macro_ci_high": 0.
|
651 |
"score_name": "f1_micro",
|
652 |
-
"score": 0.
|
653 |
-
"score_ci_high": 0.
|
654 |
-
"score_ci_low": 0.
|
655 |
"num_of_instances": 500,
|
656 |
-
"accuracy": 0.
|
657 |
-
"accuracy_ci_low": 0.
|
658 |
-
"accuracy_ci_high": 0.
|
659 |
-
"f1_micro": 0.
|
660 |
-
"f1_micro_ci_low": 0.
|
661 |
-
"f1_micro_ci_high": 0.
|
662 |
},
|
663 |
-
"score": 0.
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
-
"program_accuracy": 0.
|
671 |
-
"score": 0.
|
672 |
"score_name": "program_accuracy",
|
673 |
-
"execution_accuracy": 0.
|
674 |
-
"program_accuracy_ci_low": 0.
|
675 |
-
"program_accuracy_ci_high": 0.
|
676 |
-
"score_ci_low": 0.
|
677 |
-
"score_ci_high": 0.
|
678 |
-
"execution_accuracy_ci_low": 0.
|
679 |
-
"execution_accuracy_ci_high": 0.
|
680 |
},
|
681 |
-
"score": 0.
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
-
"precision": 0.
|
688 |
-
"recall": 0.
|
689 |
-
"f1": 0.
|
690 |
-
"precision_ci_low": 0.
|
691 |
-
"precision_ci_high": 0.
|
692 |
-
"recall_ci_low": 0.
|
693 |
-
"recall_ci_high": 0.
|
694 |
-
"f1_ci_low": 0.
|
695 |
-
"f1_ci_high": 0.
|
696 |
"score_name": "f1",
|
697 |
-
"score": 0.
|
698 |
-
"score_ci_high": 0.
|
699 |
-
"score_ci_low": 0.
|
700 |
"num_of_instances": 600,
|
701 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
702 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
703 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
704 |
-
"faithfullness_f1_token_overlap": 0.
|
705 |
-
"faithfullness_recall_token_overlap": 0.
|
706 |
-
"faithfullness_precision_token_overlap": 0.
|
707 |
-
"correctness_f1_token_overlap": 0.
|
708 |
-
"correctness_recall_token_overlap": 0.
|
709 |
-
"correctness_precision_token_overlap": 0.
|
710 |
},
|
711 |
-
"score": 0.
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
-
"accuracy": 0.
|
718 |
-
"accuracy_ci_low": 0.
|
719 |
-
"accuracy_ci_high": 0.
|
720 |
"score_name": "accuracy",
|
721 |
-
"score": 0.
|
722 |
-
"score_ci_high": 0.
|
723 |
-
"score_ci_low": 0.
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
-
"accuracy": 0.
|
728 |
-
"accuracy_ci_low": 0.
|
729 |
-
"accuracy_ci_high": 0.
|
730 |
"score_name": "accuracy",
|
731 |
-
"score": 0.
|
732 |
-
"score_ci_high": 0.
|
733 |
-
"score_ci_low": 0.
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
-
"score": 0.
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
@@ -741,68 +741,68 @@
|
|
741 |
"attaq_500": {
|
742 |
"severity_critical": 0.0,
|
743 |
"severity_high": 0.0,
|
744 |
-
"severity_medium":
|
745 |
-
"severity_low":
|
746 |
-
"category_harmful_info": 0.
|
747 |
-
"category_deception": 0.
|
748 |
-
"category_explicit_content": 0.
|
749 |
-
"category_violence": 0.
|
750 |
-
"category_discrimination": 0.
|
751 |
-
"category_substance_abuse": 0.
|
752 |
-
"category_pii": 0.
|
753 |
-
"safety": 0.
|
754 |
-
"safety_ci_low": 0.
|
755 |
-
"safety_ci_high": 0.
|
756 |
"score_name": "safety",
|
757 |
-
"score": 0.
|
758 |
-
"score_ci_high": 0.
|
759 |
-
"score_ci_low": 0.
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
-
"score": 0.
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
-
"
|
770 |
-
"
|
771 |
-
"
|
772 |
-
"
|
|
|
773 |
"score_name": "rougeL",
|
774 |
-
"
|
775 |
-
"
|
776 |
-
"
|
777 |
-
"
|
778 |
-
"
|
779 |
-
"
|
780 |
-
"
|
781 |
-
"
|
782 |
-
"
|
783 |
-
"
|
784 |
-
"rougeLsum_ci_high": 0.3821672398567385
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
-
"
|
789 |
-
"
|
790 |
-
"
|
791 |
-
"
|
|
|
792 |
"score_name": "rougeL",
|
793 |
-
"
|
794 |
-
"
|
795 |
-
"
|
796 |
-
"
|
797 |
-
"
|
798 |
-
"
|
799 |
-
"
|
800 |
-
"
|
801 |
-
"
|
802 |
-
"
|
803 |
-
"rougeLsum_ci_high": 0.11266943648159043
|
804 |
},
|
805 |
-
"score": 0.
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
@@ -810,473 +810,473 @@
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
],
|
818 |
"totals": [
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
],
|
824 |
"precisions": [
|
825 |
-
0.
|
826 |
-
0.
|
827 |
-
0.
|
828 |
-
0.
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
-
"sys_len":
|
832 |
"ref_len": 1734,
|
833 |
-
"sacrebleu": 0.
|
834 |
-
"score": 0.
|
835 |
"score_name": "sacrebleu",
|
836 |
-
"score_ci_low": 0.
|
837 |
-
"score_ci_high": 0.
|
838 |
-
"sacrebleu_ci_low": 0.
|
839 |
-
"sacrebleu_ci_high": 0.
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
|
848 |
],
|
849 |
"totals": [
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
],
|
855 |
"precisions": [
|
856 |
-
0.
|
857 |
-
0.
|
858 |
-
0.
|
859 |
-
0.
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
-
"sys_len":
|
863 |
"ref_len": 1734,
|
864 |
-
"sacrebleu": 0.
|
865 |
-
"score": 0.
|
866 |
"score_name": "sacrebleu",
|
867 |
-
"score_ci_low": 0.
|
868 |
-
"score_ci_high": 0.
|
869 |
-
"sacrebleu_ci_low": 0.
|
870 |
-
"sacrebleu_ci_high": 0.
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
-
|
876 |
-
|
877 |
-
|
878 |
-
|
879 |
],
|
880 |
"totals": [
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
],
|
886 |
"precisions": [
|
887 |
-
0.
|
888 |
-
0.
|
889 |
-
0.
|
890 |
-
0.
|
891 |
],
|
892 |
-
"bp":
|
893 |
-
"sys_len":
|
894 |
"ref_len": 1589,
|
895 |
-
"sacrebleu": 0.
|
896 |
-
"score": 0.
|
897 |
"score_name": "sacrebleu",
|
898 |
-
"score_ci_low": 0.
|
899 |
-
"score_ci_high": 0.
|
900 |
-
"sacrebleu_ci_low": 0.
|
901 |
-
"sacrebleu_ci_high": 0.
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
-
|
907 |
-
|
908 |
-
|
909 |
-
|
910 |
],
|
911 |
"totals": [
|
912 |
-
|
913 |
-
|
914 |
-
|
915 |
-
|
916 |
],
|
917 |
"precisions": [
|
918 |
-
0.
|
919 |
-
0.
|
920 |
-
0.
|
921 |
-
0.
|
922 |
],
|
923 |
"bp": 1.0,
|
924 |
-
"sys_len":
|
925 |
"ref_len": 1835,
|
926 |
-
"sacrebleu": 0.
|
927 |
-
"score": 0.
|
928 |
"score_name": "sacrebleu",
|
929 |
-
"score_ci_low": 0.
|
930 |
-
"score_ci_high": 0.
|
931 |
-
"sacrebleu_ci_low": 0.
|
932 |
-
"sacrebleu_ci_high": 0.
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
],
|
942 |
"totals": [
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
],
|
948 |
"precisions": [
|
949 |
-
0.
|
950 |
-
0.
|
951 |
-
0.
|
952 |
-
0.
|
953 |
],
|
954 |
-
"bp": 0.
|
955 |
-
"sys_len":
|
956 |
"ref_len": 2068,
|
957 |
-
"sacrebleu": 0.
|
958 |
-
"score": 0.
|
959 |
"score_name": "sacrebleu",
|
960 |
-
"score_ci_low": 0.
|
961 |
-
"score_ci_high": 0.
|
962 |
-
"sacrebleu_ci_low": 0.
|
963 |
-
"sacrebleu_ci_high": 0.
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
-
|
969 |
-
|
970 |
-
|
971 |
-
|
972 |
],
|
973 |
"totals": [
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
],
|
979 |
"precisions": [
|
980 |
-
0.
|
981 |
-
0.
|
982 |
-
0.
|
983 |
-
0.
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
-
"sys_len":
|
987 |
"ref_len": 2235,
|
988 |
-
"sacrebleu": 0.
|
989 |
-
"score": 0.
|
990 |
"score_name": "sacrebleu",
|
991 |
-
"score_ci_low": 0.
|
992 |
-
"score_ci_high": 0.
|
993 |
-
"sacrebleu_ci_low": 0.
|
994 |
-
"sacrebleu_ci_high": 0.
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
-
|
1000 |
-
|
1001 |
-
|
1002 |
-
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
-
|
1006 |
-
|
1007 |
-
|
1008 |
-
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
-
0.
|
1012 |
-
0.
|
1013 |
-
0.
|
1014 |
-
0.
|
1015 |
],
|
1016 |
-
"bp": 0.
|
1017 |
-
"sys_len":
|
1018 |
"ref_len": 1916,
|
1019 |
-
"sacrebleu": 0.
|
1020 |
-
"score": 0.
|
1021 |
"score_name": "sacrebleu",
|
1022 |
-
"score_ci_low": 0.
|
1023 |
-
"score_ci_high": 0.
|
1024 |
-
"sacrebleu_ci_low": 0.
|
1025 |
-
"sacrebleu_ci_high": 0.
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
-
|
1031 |
-
|
1032 |
-
|
1033 |
-
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
-
0.
|
1043 |
-
0.
|
1044 |
-
0.
|
1045 |
-
0.
|
1046 |
],
|
1047 |
"bp": 1.0,
|
1048 |
-
"sys_len":
|
1049 |
"ref_len": 1949,
|
1050 |
-
"sacrebleu": 0.
|
1051 |
-
"score": 0.
|
1052 |
"score_name": "sacrebleu",
|
1053 |
-
"score_ci_low": 0.
|
1054 |
-
"score_ci_high": 0.
|
1055 |
-
"sacrebleu_ci_low": 0.
|
1056 |
-
"sacrebleu_ci_high": 0.
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
-
|
1062 |
-
|
1063 |
-
|
1064 |
-
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
-
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
-
0.
|
1074 |
-
0.
|
1075 |
-
0.
|
1076 |
-
0.
|
1077 |
],
|
1078 |
-
"bp": 0.
|
1079 |
-
"sys_len":
|
1080 |
"ref_len": 2098,
|
1081 |
-
"sacrebleu": 0.
|
1082 |
-
"score": 0.
|
1083 |
"score_name": "sacrebleu",
|
1084 |
-
"score_ci_low": 0.
|
1085 |
-
"score_ci_high": 0.
|
1086 |
-
"sacrebleu_ci_low": 0.
|
1087 |
-
"sacrebleu_ci_high": 0.
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
-
|
1093 |
964,
|
1094 |
-
|
1095 |
-
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
-
0.
|
1105 |
-
0.
|
1106 |
-
0.
|
1107 |
-
0.
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
-
"sys_len":
|
1111 |
"ref_len": 1734,
|
1112 |
-
"sacrebleu": 0.
|
1113 |
-
"score": 0.
|
1114 |
"score_name": "sacrebleu",
|
1115 |
-
"score_ci_low": 0.
|
1116 |
-
"score_ci_high": 0.
|
1117 |
-
"sacrebleu_ci_low": 0.
|
1118 |
-
"sacrebleu_ci_high": 0.
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
-
|
1124 |
-
|
1125 |
-
|
1126 |
-
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
-
|
1130 |
-
|
1131 |
-
|
1132 |
-
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
-
0.
|
1136 |
-
0.
|
1137 |
-
0.
|
1138 |
-
0.
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
-
"sys_len":
|
1142 |
"ref_len": 1734,
|
1143 |
-
"sacrebleu": 0.
|
1144 |
-
"score": 0.
|
1145 |
"score_name": "sacrebleu",
|
1146 |
-
"score_ci_low": 0.
|
1147 |
-
"score_ci_high": 0.
|
1148 |
-
"sacrebleu_ci_low": 0.
|
1149 |
-
"sacrebleu_ci_high": 0.
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
-
|
1155 |
-
|
1156 |
-
|
1157 |
-
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
-
|
1161 |
-
|
1162 |
-
|
1163 |
-
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
-
0.
|
1167 |
-
0.
|
1168 |
-
0.
|
1169 |
-
0.
|
1170 |
],
|
1171 |
-
"bp":
|
1172 |
-
"sys_len":
|
1173 |
"ref_len": 1734,
|
1174 |
-
"sacrebleu": 0.
|
1175 |
-
"score": 0.
|
1176 |
"score_name": "sacrebleu",
|
1177 |
-
"score_ci_low": 0.
|
1178 |
-
"score_ci_high": 0.
|
1179 |
-
"sacrebleu_ci_low": 0.
|
1180 |
-
"sacrebleu_ci_high": 0.
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
-
|
1186 |
-
|
1187 |
-
|
1188 |
-
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
-
|
1192 |
-
|
1193 |
-
|
1194 |
-
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
-
0.
|
1198 |
-
0.
|
1199 |
-
0.
|
1200 |
-
0.
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
-
"sys_len":
|
1204 |
"ref_len": 1734,
|
1205 |
-
"sacrebleu": 0.
|
1206 |
-
"score": 0.
|
1207 |
"score_name": "sacrebleu",
|
1208 |
-
"score_ci_low": 0.
|
1209 |
-
"score_ci_high": 0.
|
1210 |
-
"sacrebleu_ci_low": 0.
|
1211 |
-
"sacrebleu_ci_high": 0.
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
-
|
1217 |
-
|
1218 |
-
|
1219 |
-
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
-
0.
|
1229 |
-
0.
|
1230 |
-
0.
|
1231 |
-
0.
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
-
"sys_len":
|
1235 |
"ref_len": 1734,
|
1236 |
-
"sacrebleu": 0.
|
1237 |
-
"score": 0.
|
1238 |
"score_name": "sacrebleu",
|
1239 |
-
"score_ci_low": 0.
|
1240 |
-
"score_ci_high": 0.
|
1241 |
-
"sacrebleu_ci_low": 0.
|
1242 |
-
"sacrebleu_ci_high": 0.
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
-
|
1248 |
-
|
1249 |
-
|
1250 |
300
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
-
|
1254 |
-
|
1255 |
-
|
1256 |
-
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
-
0.
|
1260 |
-
0.
|
1261 |
-
0.
|
1262 |
-
0.
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
-
"sys_len":
|
1266 |
"ref_len": 1734,
|
1267 |
-
"sacrebleu": 0.
|
1268 |
-
"score": 0.
|
1269 |
"score_name": "sacrebleu",
|
1270 |
-
"score_ci_low": 0.
|
1271 |
-
"score_ci_high": 0.
|
1272 |
-
"sacrebleu_ci_low": 0.
|
1273 |
-
"sacrebleu_ci_high": 0.
|
1274 |
},
|
1275 |
-
"score": 0.
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
-
"score": 0.
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-23T18:18:25.502854Z",
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.7888888888888889,
|
180 |
+
"accuracy_ci_low": 0.7,
|
181 |
"accuracy_ci_high": 0.8555555555555555,
|
182 |
"score_name": "accuracy",
|
183 |
+
"score": 0.7888888888888889,
|
184 |
"score_ci_high": 0.8555555555555555,
|
185 |
+
"score_ci_low": 0.7,
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
|
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.9888888888888889,
|
220 |
+
"accuracy_ci_low": 0.9555555555555556,
|
221 |
"accuracy_ci_high": 1.0,
|
222 |
"score_name": "accuracy",
|
223 |
+
"score": 0.9888888888888889,
|
224 |
"score_ci_high": 1.0,
|
225 |
+
"score_ci_low": 0.9555555555555556,
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
|
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.8555555555555555,
|
280 |
+
"accuracy_ci_low": 0.7777777777777778,
|
281 |
+
"accuracy_ci_high": 0.9222222222222223,
|
282 |
"score_name": "accuracy",
|
283 |
+
"score": 0.8555555555555555,
|
284 |
+
"score_ci_high": 0.9222222222222223,
|
285 |
+
"score_ci_low": 0.7777777777777778,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
+
"score": 0.9626262626262626,
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.14026236125126135,
|
296 |
+
"score": 0.14026236125126135,
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
+
"score": 0.14026236125126135,
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.5721784776902887,
|
307 |
+
"f1_Organization": 0.37837837837837834,
|
308 |
+
"f1_Location": 0.3692307692307692,
|
309 |
+
"f1_macro": 0.4399292084331454,
|
310 |
+
"recall_macro": 0.40673591832987227,
|
311 |
+
"precision_macro": 0.48338733915656995,
|
312 |
+
"in_classes_support": 0.6414285714285715,
|
313 |
+
"f1_micro": 0.35918367346938773,
|
314 |
+
"recall_micro": 0.41904761904761906,
|
315 |
+
"precision_micro": 0.3142857142857143,
|
316 |
+
"score": 0.35918367346938773,
|
317 |
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.311485746926114,
|
319 |
+
"score_ci_high": 0.40594536569316764,
|
320 |
+
"f1_micro_ci_low": 0.311485746926114,
|
321 |
+
"f1_micro_ci_high": 0.40594536569316764
|
322 |
},
|
323 |
+
"score": 0.35918367346938773,
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
|
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.18309859154929578,
|
340 |
"accuracy_ci_low": 0.11267605633802817,
|
341 |
+
"accuracy_ci_high": 0.28169014084507044,
|
342 |
"score_name": "accuracy",
|
343 |
+
"score": 0.18309859154929578,
|
344 |
+
"score_ci_high": 0.28169014084507044,
|
345 |
"score_ci_low": 0.11267605633802817,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.15492957746478872,
|
350 |
+
"accuracy_ci_low": 0.08450704225352113,
|
351 |
+
"accuracy_ci_high": 0.2535211267605634,
|
352 |
"score_name": "accuracy",
|
353 |
+
"score": 0.15492957746478872,
|
354 |
+
"score_ci_high": 0.2535211267605634,
|
355 |
+
"score_ci_low": 0.08450704225352113,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
|
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
"accuracy": 0.323943661971831,
|
380 |
+
"accuracy_ci_low": 0.22535211267605634,
|
381 |
"accuracy_ci_high": 0.43661971830985913,
|
382 |
"score_name": "accuracy",
|
383 |
"score": 0.323943661971831,
|
384 |
"score_ci_high": 0.43661971830985913,
|
385 |
+
"score_ci_low": 0.22535211267605634,
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.5352112676056338,
|
390 |
+
"accuracy_ci_low": 0.4225352112676056,
|
391 |
+
"accuracy_ci_high": 0.647887323943662,
|
392 |
"score_name": "accuracy",
|
393 |
+
"score": 0.5352112676056338,
|
394 |
+
"score_ci_high": 0.647887323943662,
|
395 |
+
"score_ci_low": 0.4225352112676056,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.7323943661971831,
|
400 |
+
"accuracy_ci_low": 0.6310963819783834,
|
401 |
+
"accuracy_ci_high": 0.8309859154929577,
|
402 |
"score_name": "accuracy",
|
403 |
+
"score": 0.7323943661971831,
|
404 |
+
"score_ci_high": 0.8309859154929577,
|
405 |
+
"score_ci_low": 0.6310963819783834,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.5915492957746479,
|
410 |
+
"accuracy_ci_low": 0.4788732394366197,
|
411 |
"accuracy_ci_high": 0.6901408450704225,
|
412 |
"score_name": "accuracy",
|
413 |
+
"score": 0.5915492957746479,
|
414 |
"score_ci_high": 0.6901408450704225,
|
415 |
+
"score_ci_low": 0.4788732394366197,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
"accuracy": 0.29577464788732394,
|
420 |
+
"accuracy_ci_low": 0.19718309859154928,
|
421 |
+
"accuracy_ci_high": 0.41750158298380896,
|
422 |
"score_name": "accuracy",
|
423 |
"score": 0.29577464788732394,
|
424 |
+
"score_ci_high": 0.41750158298380896,
|
425 |
+
"score_ci_low": 0.19718309859154928,
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
|
|
438 |
"mmlu_pro_philosophy": {
|
439 |
"accuracy": 0.5915492957746479,
|
440 |
"accuracy_ci_low": 0.4647887323943662,
|
441 |
+
"accuracy_ci_high": 0.704225352112676,
|
442 |
"score_name": "accuracy",
|
443 |
"score": 0.5915492957746479,
|
444 |
+
"score_ci_high": 0.704225352112676,
|
445 |
"score_ci_low": 0.4647887323943662,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.30985915492957744,
|
450 |
+
"accuracy_ci_low": 0.19718309859154928,
|
451 |
+
"accuracy_ci_high": 0.4084507042253521,
|
452 |
"score_name": "accuracy",
|
453 |
+
"score": 0.30985915492957744,
|
454 |
+
"score_ci_high": 0.4084507042253521,
|
455 |
+
"score_ci_low": 0.19718309859154928,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
|
|
465 |
"score_ci_low": 0.5915492957746479,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
+
"score": 0.4909456740442656,
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.6597867569632275,
|
475 |
"f1_suggestive": 0.5882352941176471,
|
476 |
"f1_generic": 0.72,
|
477 |
"f1_descriptive": 0.6818181818181818,
|
478 |
+
"f1_fanciful": 0.7142857142857143,
|
479 |
+
"f1_arbitrary": 0.5945945945945946,
|
480 |
+
"f1_macro_ci_low": 0.548464495780585,
|
481 |
+
"f1_macro_ci_high": 0.7575557118629758,
|
482 |
"score_name": "f1_micro",
|
483 |
"score": 0.6547619047619048,
|
484 |
+
"score_ci_high": 0.7425149700598802,
|
485 |
+
"score_ci_low": 0.5437048440428358,
|
486 |
"num_of_instances": 85,
|
487 |
"accuracy": 0.6470588235294118,
|
488 |
"accuracy_ci_low": 0.5411764705882353,
|
489 |
"accuracy_ci_high": 0.7411764705882353,
|
490 |
"f1_micro": 0.6547619047619048,
|
491 |
+
"f1_micro_ci_low": 0.5437048440428358,
|
492 |
+
"f1_micro_ci_high": 0.7425149700598802
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.5899229232086782,
|
496 |
+
"f1_no": 0.6637168141592921,
|
497 |
+
"f1_yes": 0.5161290322580645,
|
498 |
+
"f1_macro_ci_low": 0.5156319338532668,
|
499 |
+
"f1_macro_ci_high": 0.6621360959437094,
|
500 |
"score_name": "f1_micro",
|
501 |
+
"score": 0.6114285714285714,
|
502 |
+
"score_ci_high": 0.6798739003144315,
|
503 |
+
"score_ci_low": 0.5364733968179762,
|
504 |
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.535,
|
506 |
+
"accuracy_ci_low": 0.465,
|
507 |
"accuracy_ci_high": 0.605,
|
508 |
+
"f1_micro": 0.6114285714285714,
|
509 |
+
"f1_micro_ci_low": 0.5364733968179762,
|
510 |
+
"f1_micro_ci_high": 0.6798739003144315
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.24082948201110896,
|
514 |
+
"f1_conclusion": 0.10810810810810811,
|
515 |
+
"f1_decree": 0.12903225806451613,
|
516 |
"f1_issue": 0.18181818181818182,
|
517 |
+
"f1_analysis": 0.5833333333333334,
|
518 |
+
"f1_facts": 0.06666666666666667,
|
519 |
+
"f1_procedural history": 0.3125,
|
520 |
+
"f1_rule": 0.30434782608695654,
|
521 |
+
"f1_macro_ci_low": 0.19135126191537805,
|
522 |
+
"f1_macro_ci_high": 0.31037088994163425,
|
523 |
"score_name": "f1_micro",
|
524 |
+
"score": 0.29012345679012347,
|
525 |
+
"score_ci_high": 0.3634815160611135,
|
526 |
+
"score_ci_low": 0.22855500349415586,
|
527 |
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.235,
|
529 |
+
"accuracy_ci_low": 0.185,
|
530 |
+
"accuracy_ci_high": 0.3,
|
531 |
+
"f1_micro": 0.29012345679012347,
|
532 |
+
"f1_micro_ci_low": 0.22855500349415586,
|
533 |
+
"f1_micro_ci_high": 0.3634815160611135
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.4444123800591588,
|
537 |
+
"f1_yes": 0.4550898203592814,
|
538 |
+
"f1_no": 0.43373493975903615,
|
539 |
+
"f1_macro_ci_low": 0.3748865543059881,
|
540 |
+
"f1_macro_ci_high": 0.5178711641402558,
|
541 |
"score_name": "f1_micro",
|
542 |
+
"score": 0.4444444444444444,
|
543 |
+
"score_ci_high": 0.5162242117942616,
|
544 |
+
"score_ci_low": 0.37379019448718637,
|
545 |
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.37,
|
547 |
+
"accuracy_ci_low": 0.305,
|
548 |
+
"accuracy_ci_high": 0.435,
|
549 |
+
"f1_micro": 0.4444444444444444,
|
550 |
+
"f1_micro_ci_low": 0.37379019448718637,
|
551 |
+
"f1_micro_ci_high": 0.5162242117942616
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
"f1_macro": 0.75,
|
|
|
568 |
"f1_micro_ci_low": 0.6573326079878734,
|
569 |
"f1_micro_ci_high": 0.8234882632928148
|
570 |
},
|
571 |
+
"score": 0.5501516754850089,
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.625283027988775,
|
578 |
+
"f1_cars": 0.8541666666666666,
|
579 |
+
"f1_windows x": 0.11940298507462686,
|
580 |
+
"f1_computer graphics": 0.4297520661157025,
|
581 |
+
"f1_atheism": 0.509090909090909,
|
582 |
+
"f1_religion": 0.2222222222222222,
|
583 |
+
"f1_medicine": 0.8705882352941177,
|
584 |
+
"f1_christianity": 0.7755102040816326,
|
585 |
+
"f1_microsoft windows": 0.6436781609195402,
|
586 |
+
"f1_middle east": 0.6666666666666666,
|
587 |
+
"f1_motorcycles": 0.7326732673267327,
|
588 |
+
"f1_pc hardware": 0.5846153846153846,
|
589 |
+
"f1_mac hardware": 0.6458333333333334,
|
590 |
+
"f1_electronics": 0.6666666666666666,
|
591 |
+
"f1_for sale": 0.6944444444444444,
|
592 |
+
"f1_guns": 0.36923076923076925,
|
593 |
+
"f1_space": 0.8235294117647058,
|
594 |
+
"f1_cryptography": 0.6575342465753424,
|
595 |
+
"f1_baseball": 0.9310344827586207,
|
596 |
"f1_politics": 0.3787878787878788,
|
597 |
+
"f1_hockey": 0.9302325581395349,
|
598 |
+
"f1_macro_ci_low": 0.5984293289041998,
|
599 |
+
"f1_macro_ci_high": 0.65230217299566,
|
600 |
"score_name": "f1_micro",
|
601 |
+
"score": 0.6474114441416894,
|
602 |
+
"score_ci_high": 0.6749803309845304,
|
603 |
+
"score_ci_low": 0.6158904109589041,
|
604 |
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.594,
|
606 |
+
"accuracy_ci_low": 0.561,
|
607 |
+
"accuracy_ci_high": 0.622,
|
608 |
+
"f1_micro": 0.6474114441416894,
|
609 |
+
"f1_micro_ci_low": 0.6158904109589041,
|
610 |
+
"f1_micro_ci_high": 0.6749803309845304
|
611 |
},
|
612 |
+
"score": 0.6474114441416894,
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.7879467396802322,
|
619 |
+
"f1_student loan": 0.8888888888888888,
|
620 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9461756373937678,
|
621 |
+
"f1_debt collection": 0.6428571428571429,
|
622 |
+
"f1_checking or savings account": 0.8222222222222222,
|
623 |
+
"f1_mortgage": 0.9705882352941176,
|
624 |
+
"f1_payday loan or title loan or personal loan": 0.5333333333333333,
|
625 |
+
"f1_credit card or prepaid card": 0.8666666666666667,
|
626 |
+
"f1_money transfer or virtual currency or money service": 0.7111111111111111,
|
627 |
+
"f1_vehicle loan or lease": 0.7096774193548387,
|
628 |
+
"f1_macro_ci_low": 0.7309747205142173,
|
629 |
+
"f1_macro_ci_high": 0.8394377629013812,
|
630 |
"score_name": "f1_micro",
|
631 |
+
"score": 0.8977732793522267,
|
632 |
+
"score_ci_high": 0.9148163850441952,
|
633 |
+
"score_ci_low": 0.8791739655658123,
|
634 |
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.887,
|
636 |
+
"accuracy_ci_low": 0.8679599560953464,
|
637 |
+
"accuracy_ci_high": 0.906,
|
638 |
+
"f1_micro": 0.8977732793522267,
|
639 |
+
"f1_micro_ci_low": 0.8791739655658123,
|
640 |
+
"f1_micro_ci_high": 0.9148163850441952
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.755452767113785,
|
644 |
+
"f1_mortgages and loans": 0.8135593220338984,
|
645 |
"f1_credit card": 0.7865168539325843,
|
646 |
+
"f1_debt collection": 0.7069767441860465,
|
647 |
+
"f1_credit reporting": 0.7876712328767124,
|
648 |
+
"f1_retail banking": 0.6825396825396826,
|
649 |
+
"f1_macro_ci_low": 0.7186153400597911,
|
650 |
+
"f1_macro_ci_high": 0.7954872943230671,
|
651 |
"score_name": "f1_micro",
|
652 |
+
"score": 0.7611336032388664,
|
653 |
+
"score_ci_high": 0.797979797979798,
|
654 |
+
"score_ci_low": 0.7233852933885262,
|
655 |
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.752,
|
657 |
+
"accuracy_ci_low": 0.712,
|
658 |
+
"accuracy_ci_high": 0.79,
|
659 |
+
"f1_micro": 0.7611336032388664,
|
660 |
+
"f1_micro_ci_low": 0.7233852933885262,
|
661 |
+
"f1_micro_ci_high": 0.797979797979798
|
662 |
},
|
663 |
+
"score": 0.8294534412955465,
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.21,
|
671 |
+
"score": 0.21,
|
672 |
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.19,
|
674 |
+
"program_accuracy_ci_low": 0.1850718210152138,
|
675 |
+
"program_accuracy_ci_high": 0.236,
|
676 |
+
"score_ci_low": 0.1850718210152138,
|
677 |
+
"score_ci_high": 0.236,
|
678 |
+
"execution_accuracy_ci_low": 0.167,
|
679 |
+
"execution_accuracy_ci_high": 0.214
|
680 |
},
|
681 |
+
"score": 0.21,
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.2793294516338928,
|
688 |
+
"recall": 0.5853975586330095,
|
689 |
+
"f1": 0.32244875528474853,
|
690 |
+
"precision_ci_low": 0.2608509633355691,
|
691 |
+
"precision_ci_high": 0.2992012655344498,
|
692 |
+
"recall_ci_low": 0.5709386747947787,
|
693 |
+
"recall_ci_high": 0.6004552716919522,
|
694 |
+
"f1_ci_low": 0.3057656195014431,
|
695 |
+
"f1_ci_high": 0.34039188672012827,
|
696 |
"score_name": "f1",
|
697 |
+
"score": 0.32244875528474853,
|
698 |
+
"score_ci_high": 0.34039188672012827,
|
699 |
+
"score_ci_low": 0.3057656195014431,
|
700 |
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.583492674678564,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6757716050744057,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5312159793823957,
|
704 |
+
"faithfullness_f1_token_overlap": 0.31789392117250886,
|
705 |
+
"faithfullness_recall_token_overlap": 0.25491265786006334,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5284813664748244,
|
707 |
+
"correctness_f1_token_overlap": 0.32244875528474853,
|
708 |
+
"correctness_recall_token_overlap": 0.5853975586330095,
|
709 |
+
"correctness_precision_token_overlap": 0.2793294516338928
|
710 |
},
|
711 |
+
"score": 0.32244875528474853,
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
+
"accuracy": 0.565,
|
718 |
+
"accuracy_ci_low": 0.5335774990883203,
|
719 |
+
"accuracy_ci_high": 0.5932386235436204,
|
720 |
"score_name": "accuracy",
|
721 |
+
"score": 0.565,
|
722 |
+
"score_ci_high": 0.5932386235436204,
|
723 |
+
"score_ci_low": 0.5335774990883203,
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
+
"accuracy": 0.92,
|
728 |
+
"accuracy_ci_low": 0.892,
|
729 |
+
"accuracy_ci_high": 0.94,
|
730 |
"score_name": "accuracy",
|
731 |
+
"score": 0.92,
|
732 |
+
"score_ci_high": 0.94,
|
733 |
+
"score_ci_low": 0.892,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
+
"score": 0.7424999999999999,
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
|
|
741 |
"attaq_500": {
|
742 |
"severity_critical": 0.0,
|
743 |
"severity_high": 0.0,
|
744 |
+
"severity_medium": 0.0,
|
745 |
+
"severity_low": 100.0,
|
746 |
+
"category_harmful_info": 0.8275590381670063,
|
747 |
+
"category_deception": 0.870389111565821,
|
748 |
+
"category_explicit_content": 0.8619736799537534,
|
749 |
+
"category_violence": 0.839932223764204,
|
750 |
+
"category_discrimination": 0.7697427272796631,
|
751 |
+
"category_substance_abuse": 0.8608841189907658,
|
752 |
+
"category_pii": 0.7655655101493553,
|
753 |
+
"safety": 0.8361458782349815,
|
754 |
+
"safety_ci_low": 0.8192866587424859,
|
755 |
+
"safety_ci_high": 0.8545105900812325,
|
756 |
"score_name": "safety",
|
757 |
+
"score": 0.8361458782349815,
|
758 |
+
"score_ci_high": 0.8545105900812325,
|
759 |
+
"score_ci_low": 0.8192866587424859,
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
+
"score": 0.8361458782349815,
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
+
"rougeLsum": 0.3756142003700213,
|
770 |
+
"rouge1": 0.43798911333105917,
|
771 |
+
"rouge2": 0.22301544369181786,
|
772 |
+
"rougeL": 0.30881611781607615,
|
773 |
+
"score": 0.30881611781607615,
|
774 |
"score_name": "rougeL",
|
775 |
+
"rougeLsum_ci_low": 0.3665764288194776,
|
776 |
+
"rougeLsum_ci_high": 0.3845640981112231,
|
777 |
+
"rouge1_ci_low": 0.42803373903414665,
|
778 |
+
"rouge1_ci_high": 0.44745782528977346,
|
779 |
+
"rouge2_ci_low": 0.21565182499600158,
|
780 |
+
"rouge2_ci_high": 0.23166608743475037,
|
781 |
+
"rougeL_ci_low": 0.3011644909175205,
|
782 |
+
"rougeL_ci_high": 0.31722072238042875,
|
783 |
+
"score_ci_low": 0.3011644909175205,
|
784 |
+
"score_ci_high": 0.31722072238042875
|
|
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
+
"rougeLsum": 0.1094469715392792,
|
789 |
+
"rouge1": 0.13301752816755213,
|
790 |
+
"rouge2": 0.020621633401068214,
|
791 |
+
"rougeL": 0.09635378924374519,
|
792 |
+
"score": 0.09635378924374519,
|
793 |
"score_name": "rougeL",
|
794 |
+
"rougeLsum_ci_low": 0.10482036952689613,
|
795 |
+
"rougeLsum_ci_high": 0.11359992785988014,
|
796 |
+
"rouge1_ci_low": 0.12738784135782572,
|
797 |
+
"rouge1_ci_high": 0.13845012766033873,
|
798 |
+
"rouge2_ci_low": 0.01856074580818113,
|
799 |
+
"rouge2_ci_high": 0.02259991124480518,
|
800 |
+
"rougeL_ci_low": 0.09230760346929477,
|
801 |
+
"rougeL_ci_high": 0.09984237535822288,
|
802 |
+
"score_ci_low": 0.09230760346929477,
|
803 |
+
"score_ci_high": 0.09984237535822288
|
|
|
804 |
},
|
805 |
+
"score": 0.20258495352991068,
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
+
1310,
|
814 |
+
862,
|
815 |
+
608,
|
816 |
+
433
|
817 |
],
|
818 |
"totals": [
|
819 |
+
1791,
|
820 |
+
1725,
|
821 |
+
1659,
|
822 |
+
1593
|
823 |
],
|
824 |
"precisions": [
|
825 |
+
0.7314349525404802,
|
826 |
+
0.49971014492753624,
|
827 |
+
0.36648583484026526,
|
828 |
+
0.27181418706842436
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
+
"sys_len": 1791,
|
832 |
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.43682330208953546,
|
834 |
+
"score": 0.43682330208953546,
|
835 |
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.3952567851744898,
|
837 |
+
"score_ci_high": 0.4779782047825724,
|
838 |
+
"sacrebleu_ci_low": 0.3952567851744898,
|
839 |
+
"sacrebleu_ci_high": 0.4779782047825724
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
+
1330,
|
845 |
+
885,
|
846 |
+
621,
|
847 |
+
444
|
848 |
],
|
849 |
"totals": [
|
850 |
+
1803,
|
851 |
+
1737,
|
852 |
+
1671,
|
853 |
+
1605
|
854 |
],
|
855 |
"precisions": [
|
856 |
+
0.7376594564614531,
|
857 |
+
0.5094991364421416,
|
858 |
+
0.37163375224416517,
|
859 |
+
0.2766355140186916
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
+
"sys_len": 1803,
|
863 |
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.44335908539973706,
|
865 |
+
"score": 0.44335908539973706,
|
866 |
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.407676677665113,
|
868 |
+
"score_ci_high": 0.49444892387735484,
|
869 |
+
"sacrebleu_ci_low": 0.407676677665113,
|
870 |
+
"sacrebleu_ci_high": 0.49444892387735484
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
+
904,
|
876 |
+
502,
|
877 |
+
300,
|
878 |
+
175
|
879 |
],
|
880 |
"totals": [
|
881 |
+
1585,
|
882 |
+
1519,
|
883 |
+
1453,
|
884 |
+
1387
|
885 |
],
|
886 |
"precisions": [
|
887 |
+
0.5703470031545741,
|
888 |
+
0.3304805793285056,
|
889 |
+
0.20646937370956642,
|
890 |
+
0.12617159336697908
|
891 |
],
|
892 |
+
"bp": 0.9974795224450381,
|
893 |
+
"sys_len": 1585,
|
894 |
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.26404598765634385,
|
896 |
+
"score": 0.26404598765634385,
|
897 |
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.23497301086217232,
|
899 |
+
"score_ci_high": 0.29259086123320976,
|
900 |
+
"sacrebleu_ci_low": 0.23497301086217232,
|
901 |
+
"sacrebleu_ci_high": 0.29259086123320976
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
+
1246,
|
907 |
+
765,
|
908 |
+
517,
|
909 |
+
376
|
910 |
],
|
911 |
"totals": [
|
912 |
+
1853,
|
913 |
+
1787,
|
914 |
+
1721,
|
915 |
+
1655
|
916 |
],
|
917 |
"precisions": [
|
918 |
+
0.6724230976794388,
|
919 |
+
0.42809177392277564,
|
920 |
+
0.3004067402672865,
|
921 |
+
0.22719033232628397
|
922 |
],
|
923 |
"bp": 1.0,
|
924 |
+
"sys_len": 1853,
|
925 |
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.3743861346447394,
|
927 |
+
"score": 0.3743861346447394,
|
928 |
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.3331443738925502,
|
930 |
+
"score_ci_high": 0.4167892583826109,
|
931 |
+
"sacrebleu_ci_low": 0.3331443738925502,
|
932 |
+
"sacrebleu_ci_high": 0.4167892583826109
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
+
1562,
|
938 |
+
1176,
|
939 |
+
936,
|
940 |
+
755
|
941 |
],
|
942 |
"totals": [
|
943 |
+
2040,
|
944 |
+
1974,
|
945 |
+
1908,
|
946 |
+
1842
|
947 |
],
|
948 |
"precisions": [
|
949 |
+
0.7656862745098039,
|
950 |
+
0.5957446808510638,
|
951 |
+
0.49056603773584906,
|
952 |
+
0.40988056460369166
|
953 |
],
|
954 |
+
"bp": 0.9863682748637871,
|
955 |
+
"sys_len": 2040,
|
956 |
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.5428196432331734,
|
958 |
+
"score": 0.5428196432331734,
|
959 |
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.509892185181122,
|
961 |
+
"score_ci_high": 0.5871694828358561,
|
962 |
+
"sacrebleu_ci_low": 0.509892185181122,
|
963 |
+
"sacrebleu_ci_high": 0.5871694828358561
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
+
1356,
|
969 |
+
727,
|
970 |
+
432,
|
971 |
+
274
|
972 |
],
|
973 |
"totals": [
|
974 |
+
2382,
|
975 |
+
2316,
|
976 |
+
2250,
|
977 |
+
2184
|
978 |
],
|
979 |
"precisions": [
|
980 |
+
0.5692695214105793,
|
981 |
+
0.3139032815198618,
|
982 |
+
0.192,
|
983 |
+
0.12545787545787546
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
+
"sys_len": 2382,
|
987 |
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.25614049024804236,
|
989 |
+
"score": 0.25614049024804236,
|
990 |
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.2240052628669271,
|
992 |
+
"score_ci_high": 0.2935558382274424,
|
993 |
+
"sacrebleu_ci_low": 0.2240052628669271,
|
994 |
+
"sacrebleu_ci_high": 0.2935558382274424
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
+
1455,
|
1000 |
+
1063,
|
1001 |
+
821,
|
1002 |
+
637
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
+
1900,
|
1006 |
+
1834,
|
1007 |
+
1768,
|
1008 |
+
1702
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
+
0.7657894736842106,
|
1012 |
+
0.579607415485278,
|
1013 |
+
0.4643665158371041,
|
1014 |
+
0.37426556991774385
|
1015 |
],
|
1016 |
+
"bp": 0.9916143051127146,
|
1017 |
+
"sys_len": 1900,
|
1018 |
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.5225932644775685,
|
1020 |
+
"score": 0.5225932644775685,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.4737364435189365,
|
1023 |
+
"score_ci_high": 0.5631623567289689,
|
1024 |
+
"sacrebleu_ci_low": 0.4737364435189365,
|
1025 |
+
"sacrebleu_ci_high": 0.5631623567289689
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
1404,
|
1031 |
+
989,
|
1032 |
+
719,
|
1033 |
+
525
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
+
1962,
|
1037 |
+
1896,
|
1038 |
+
1830,
|
1039 |
+
1764
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
+
0.7155963302752294,
|
1043 |
+
0.5216244725738397,
|
1044 |
+
0.39289617486338796,
|
1045 |
+
0.2976190476190476
|
1046 |
],
|
1047 |
"bp": 1.0,
|
1048 |
+
"sys_len": 1962,
|
1049 |
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.45707887169863065,
|
1051 |
+
"score": 0.45707887169863065,
|
1052 |
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.4211360031313033,
|
1054 |
+
"score_ci_high": 0.5197096344136953,
|
1055 |
+
"sacrebleu_ci_low": 0.4211360031313033,
|
1056 |
+
"sacrebleu_ci_high": 0.5197096344136953
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
+
1297,
|
1062 |
+
753,
|
1063 |
+
472,
|
1064 |
+
301
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
+
2014,
|
1068 |
+
1948,
|
1069 |
+
1882,
|
1070 |
+
1816
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
+
0.6439920556107249,
|
1074 |
+
0.38655030800821355,
|
1075 |
+
0.2507970244420829,
|
1076 |
+
0.1657488986784141
|
1077 |
],
|
1078 |
+
"bp": 0.9591497695217011,
|
1079 |
+
"sys_len": 2014,
|
1080 |
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.3059153842651481,
|
1082 |
+
"score": 0.3059153842651481,
|
1083 |
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.2738471419962368,
|
1085 |
+
"score_ci_high": 0.33744567062204633,
|
1086 |
+
"sacrebleu_ci_low": 0.2738471419962368,
|
1087 |
+
"sacrebleu_ci_high": 0.33744567062204633
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
+
1371,
|
1093 |
964,
|
1094 |
+
693,
|
1095 |
+
491
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
+
1839,
|
1099 |
+
1773,
|
1100 |
+
1707,
|
1101 |
+
1641
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
+
0.7455138662316476,
|
1105 |
+
0.5437112239142696,
|
1106 |
+
0.40597539543058,
|
1107 |
+
0.2992078001218769
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
+
"sys_len": 1839,
|
1111 |
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.4710577594991048,
|
1113 |
+
"score": 0.4710577594991048,
|
1114 |
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.4268855553367512,
|
1116 |
+
"score_ci_high": 0.5049624748525423,
|
1117 |
+
"sacrebleu_ci_low": 0.4268855553367512,
|
1118 |
+
"sacrebleu_ci_high": 0.5049624748525423
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
+
1130,
|
1124 |
+
638,
|
1125 |
+
409,
|
1126 |
+
268
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
+
1794,
|
1130 |
+
1728,
|
1131 |
+
1662,
|
1132 |
+
1596
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
+
0.6298773690078038,
|
1136 |
+
0.36921296296296297,
|
1137 |
+
0.24608904933814682,
|
1138 |
+
0.16791979949874686
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
+
"sys_len": 1794,
|
1142 |
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.31309907547937593,
|
1144 |
+
"score": 0.31309907547937593,
|
1145 |
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.26572456707025455,
|
1147 |
+
"score_ci_high": 0.3514896703047057,
|
1148 |
+
"sacrebleu_ci_low": 0.26572456707025455,
|
1149 |
+
"sacrebleu_ci_high": 0.3514896703047057
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
+
1112,
|
1155 |
+
613,
|
1156 |
+
383,
|
1157 |
+
250
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
+
1725,
|
1161 |
+
1659,
|
1162 |
+
1593,
|
1163 |
+
1527
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
+
0.6446376811594203,
|
1167 |
+
0.3694996986136227,
|
1168 |
+
0.24042686754551162,
|
1169 |
+
0.16371971185330714
|
1170 |
],
|
1171 |
+
"bp": 0.9947961956419216,
|
1172 |
+
"sys_len": 1725,
|
1173 |
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.3095548068732939,
|
1175 |
+
"score": 0.3095548068732939,
|
1176 |
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.27129838546648216,
|
1178 |
+
"score_ci_high": 0.3652127158245783,
|
1179 |
+
"sacrebleu_ci_low": 0.27129838546648216,
|
1180 |
+
"sacrebleu_ci_high": 0.3652127158245783
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
+
1382,
|
1186 |
+
1013,
|
1187 |
+
771,
|
1188 |
+
586
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
+
1816,
|
1192 |
+
1750,
|
1193 |
+
1684,
|
1194 |
+
1618
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
+
0.7610132158590308,
|
1198 |
+
0.5788571428571428,
|
1199 |
+
0.4578384798099763,
|
1200 |
+
0.3621755253399258
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
+
"sys_len": 1816,
|
1204 |
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.5198747760500348,
|
1206 |
+
"score": 0.5198747760500348,
|
1207 |
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.470424249351864,
|
1209 |
+
"score_ci_high": 0.5665291609523828,
|
1210 |
+
"sacrebleu_ci_low": 0.470424249351864,
|
1211 |
+
"sacrebleu_ci_high": 0.5665291609523828
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
+
1376,
|
1217 |
+
980,
|
1218 |
+
722,
|
1219 |
+
541
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
+
1801,
|
1223 |
+
1735,
|
1224 |
+
1669,
|
1225 |
+
1603
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
+
0.7640199888950583,
|
1229 |
+
0.5648414985590778,
|
1230 |
+
0.43259436788496103,
|
1231 |
+
0.3374922021210231
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
+
"sys_len": 1801,
|
1235 |
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.5010072151606186,
|
1237 |
+
"score": 0.5010072151606186,
|
1238 |
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.4614513028457302,
|
1240 |
+
"score_ci_high": 0.5398623866578321,
|
1241 |
+
"sacrebleu_ci_low": 0.4614513028457302,
|
1242 |
+
"sacrebleu_ci_high": 0.5398623866578321
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
+
1219,
|
1248 |
+
713,
|
1249 |
+
457,
|
1250 |
300
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
+
1888,
|
1254 |
+
1822,
|
1255 |
+
1756,
|
1256 |
+
1690
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
+
0.6456567796610169,
|
1260 |
+
0.3913282107574094,
|
1261 |
+
0.260250569476082,
|
1262 |
+
0.17751479289940827
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
+
"sys_len": 1888,
|
1266 |
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.32869437944703067,
|
1268 |
+
"score": 0.32869437944703067,
|
1269 |
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.30148825657210887,
|
1271 |
+
"score_ci_high": 0.3740478120880907,
|
1272 |
+
"sacrebleu_ci_low": 0.30148825657210887,
|
1273 |
+
"sacrebleu_ci_high": 0.3740478120880907
|
1274 |
},
|
1275 |
+
"score": 0.40309667841482516,
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
+
"score": 0.5151392921367606,
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
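Reader aid (not part of the results files): the sacrebleu entries above report counts, totals, precisions, bp, sys_len and ref_len alongside the final score. As a minimal sketch, assuming the standard BLEU-4 definition (geometric mean of the four n-gram precisions times the brevity penalty), the reported "sacrebleu" value can be reproduced from those fields; the numbers below are copied from the mt_flores_101_ara_eng entry in the file above.

    # Hypothetical verification snippet; values taken from mt_flores_101_ara_eng above.
    import math

    counts = [1310, 862, 608, 433]      # matched n-grams, n = 1..4
    totals = [1791, 1725, 1659, 1593]   # candidate n-grams, n = 1..4
    sys_len, ref_len = 1791, 1734

    precisions = [c / t for c, t in zip(counts, totals)]
    bp = 1.0 if sys_len > ref_len else math.exp(1 - ref_len / sys_len)
    bleu = bp * math.exp(sum(math.log(p) for p in precisions) / 4)
    print(round(bleu, 6))  # ~0.436823, matching the reported "sacrebleu" value

    # The per-group "subsets_mean" is the unweighted mean of its subset scores,
    # e.g. the "reasoning" group above: (0.565 + 0.92) / 2 == 0.7425.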
results/bluebench/{2025-06-21T11-34-24_evaluation_results.json → 2025-06-23T15-33-11_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
-
"timestamp_utc": "2025-06-
|
4 |
"command_line_invocation": [
|
5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
@@ -8,7 +8,7 @@
|
|
8 |
"--model",
|
9 |
"cross_provider",
|
10 |
"--model_args",
|
11 |
-
"model_name=watsonx/mistralai/mistral-
|
12 |
"--output_path",
|
13 |
"./results/bluebench",
|
14 |
"--log_samples",
|
@@ -26,7 +26,7 @@
|
|
26 |
"num_fewshots": null,
|
27 |
"limit": null,
|
28 |
"batch_size": 8,
|
29 |
-
"model": "watsonx/mistralai/mistral-
|
30 |
"model_args": {
|
31 |
"max_tokens": 256
|
32 |
},
|
@@ -42,7 +42,7 @@
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
-
"unitxt_commit_hash": "
|
46 |
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
@@ -176,633 +176,633 @@
|
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
-
"accuracy": 0.
|
180 |
-
"accuracy_ci_low": 0.
|
181 |
-
"accuracy_ci_high": 0.
|
182 |
"score_name": "accuracy",
|
183 |
-
"score": 0.
|
184 |
-
"score_ci_high": 0.
|
185 |
-
"score_ci_low": 0.
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
-
"accuracy": 0.
|
190 |
-
"accuracy_ci_low": 0.
|
191 |
-
"accuracy_ci_high": 0.
|
192 |
"score_name": "accuracy",
|
193 |
-
"score": 0.
|
194 |
-
"score_ci_high": 0.
|
195 |
-
"score_ci_low": 0.
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
199 |
-
"accuracy": 0.
|
200 |
-
"accuracy_ci_low": 0.
|
201 |
-
"accuracy_ci_high": 0
|
202 |
"score_name": "accuracy",
|
203 |
-
"score": 0.
|
204 |
-
"score_ci_high": 0
|
205 |
-
"score_ci_low": 0.
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
-
"accuracy": 0.
|
210 |
-
"accuracy_ci_low": 0.
|
211 |
-
"accuracy_ci_high": 0.
|
212 |
"score_name": "accuracy",
|
213 |
-
"score": 0.
|
214 |
-
"score_ci_high": 0.
|
215 |
-
"score_ci_low": 0.
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
-
"accuracy": 0.
|
220 |
-
"accuracy_ci_low": 0.
|
221 |
-
"accuracy_ci_high": 0.
|
222 |
"score_name": "accuracy",
|
223 |
-
"score": 0.
|
224 |
-
"score_ci_high": 0.
|
225 |
-
"score_ci_low": 0.
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
-
"accuracy": 0.
|
230 |
-
"accuracy_ci_low": 0.
|
231 |
-
"accuracy_ci_high": 0.
|
232 |
"score_name": "accuracy",
|
233 |
-
"score": 0.
|
234 |
-
"score_ci_high": 0.
|
235 |
-
"score_ci_low": 0.
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
-
"accuracy": 0.
|
240 |
-
"accuracy_ci_low": 0.
|
241 |
-
"accuracy_ci_high": 0.
|
242 |
"score_name": "accuracy",
|
243 |
-
"score": 0.
|
244 |
-
"score_ci_high": 0.
|
245 |
-
"score_ci_low": 0.
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
-
"accuracy": 0.
|
250 |
-
"accuracy_ci_low": 0.
|
251 |
-
"accuracy_ci_high": 0.
|
252 |
"score_name": "accuracy",
|
253 |
-
"score": 0.
|
254 |
-
"score_ci_high": 0.
|
255 |
-
"score_ci_low": 0.
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
-
"accuracy": 0.
|
260 |
-
"accuracy_ci_low": 0.
|
261 |
-
"accuracy_ci_high": 0.
|
262 |
"score_name": "accuracy",
|
263 |
-
"score": 0.
|
264 |
-
"score_ci_high": 0.
|
265 |
-
"score_ci_low": 0.
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
"accuracy": 0.8888888888888888,
|
270 |
-
"accuracy_ci_low": 0.
|
271 |
"accuracy_ci_high": 0.9444444444444444,
|
272 |
"score_name": "accuracy",
|
273 |
"score": 0.8888888888888888,
|
274 |
"score_ci_high": 0.9444444444444444,
|
275 |
-
"score_ci_low": 0.
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
-
"accuracy": 0.
|
280 |
-
"accuracy_ci_low": 0.
|
281 |
-
"accuracy_ci_high": 0.
|
282 |
"score_name": "accuracy",
|
283 |
-
"score": 0.
|
284 |
-
"score_ci_high": 0.
|
285 |
-
"score_ci_low": 0.
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
-
"score": 0.
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
292 |
"chatbot_abilities": {
|
293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
"num_of_instances": 500,
|
295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
296 |
-
"score": 0.
|
297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
},
|
299 |
-
"score": 0.
|
300 |
"score_name": "subsets_mean",
|
301 |
"num_of_instances": 500
|
302 |
},
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
-
"f1_Person": 0.
|
307 |
-
"f1_Organization": 0.
|
308 |
-
"f1_Location": 0.
|
309 |
-
"f1_macro": 0.
|
310 |
-
"recall_macro": 0.
|
311 |
-
"precision_macro": 0.
|
312 |
-
"in_classes_support": 0.
|
313 |
-
"f1_micro": 0.
|
314 |
-
"recall_micro": 0.
|
315 |
-
"precision_micro": 0.
|
316 |
-
"score": 0.
|
317 |
"score_name": "f1_micro",
|
318 |
-
"score_ci_low": 0.
|
319 |
-
"score_ci_high": 0.
|
320 |
-
"f1_micro_ci_low": 0.
|
321 |
-
"f1_micro_ci_high": 0.
|
322 |
},
|
323 |
-
"score": 0.
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
-
"accuracy": 0.
|
330 |
-
"accuracy_ci_low": 0.
|
331 |
-
"accuracy_ci_high": 0.
|
332 |
"score_name": "accuracy",
|
333 |
-
"score": 0.
|
334 |
-
"score_ci_high": 0.
|
335 |
-
"score_ci_low": 0.
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
-
"accuracy": 0.
|
340 |
-
"accuracy_ci_low": 0.
|
341 |
-
"accuracy_ci_high": 0.
|
342 |
"score_name": "accuracy",
|
343 |
-
"score": 0.
|
344 |
-
"score_ci_high": 0.
|
345 |
-
"score_ci_low": 0.
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
"accuracy": 0.23943661971830985,
|
350 |
"accuracy_ci_low": 0.14084507042253522,
|
351 |
-
"accuracy_ci_high": 0.
|
352 |
"score_name": "accuracy",
|
353 |
"score": 0.23943661971830985,
|
354 |
-
"score_ci_high": 0.
|
355 |
"score_ci_low": 0.14084507042253522,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
-
"accuracy": 0.
|
360 |
-
"accuracy_ci_low": 0.
|
361 |
-
"accuracy_ci_high": 0.
|
362 |
"score_name": "accuracy",
|
363 |
-
"score": 0.
|
364 |
-
"score_ci_high": 0.
|
365 |
-
"score_ci_low": 0.
|
366 |
"num_of_instances": 71
|
367 |
},
|
368 |
"mmlu_pro_economics": {
|
369 |
-
"accuracy": 0.
|
370 |
-
"accuracy_ci_low": 0.
|
371 |
-
"accuracy_ci_high": 0.
|
372 |
"score_name": "accuracy",
|
373 |
-
"score": 0.
|
374 |
-
"score_ci_high": 0.
|
375 |
-
"score_ci_low": 0.
|
376 |
"num_of_instances": 71
|
377 |
},
|
378 |
"mmlu_pro_engineering": {
|
379 |
-
"accuracy": 0.
|
380 |
-
"accuracy_ci_low": 0.
|
381 |
-
"accuracy_ci_high": 0.
|
382 |
"score_name": "accuracy",
|
383 |
-
"score": 0.
|
384 |
-
"score_ci_high": 0.
|
385 |
-
"score_ci_low": 0.
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
-
"accuracy": 0.
|
390 |
-
"accuracy_ci_low": 0.
|
391 |
-
"accuracy_ci_high": 0.
|
392 |
"score_name": "accuracy",
|
393 |
-
"score": 0.
|
394 |
-
"score_ci_high": 0.
|
395 |
-
"score_ci_low": 0.
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
-
"accuracy": 0.
|
400 |
-
"accuracy_ci_low": 0.
|
401 |
-
"accuracy_ci_high": 0.
|
402 |
"score_name": "accuracy",
|
403 |
-
"score": 0.
|
404 |
-
"score_ci_high": 0.
|
405 |
-
"score_ci_low": 0.
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
-
"accuracy": 0.
|
410 |
-
"accuracy_ci_low": 0.
|
411 |
-
"accuracy_ci_high": 0.
|
412 |
"score_name": "accuracy",
|
413 |
-
"score": 0.
|
414 |
-
"score_ci_high": 0.
|
415 |
-
"score_ci_low": 0.
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
-
"accuracy": 0.
|
420 |
-
"accuracy_ci_low": 0.
|
421 |
-
"accuracy_ci_high": 0.
|
422 |
"score_name": "accuracy",
|
423 |
-
"score": 0.
|
424 |
-
"score_ci_high": 0.
|
425 |
-
"score_ci_low": 0.
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
-
"accuracy": 0.
|
430 |
-
"accuracy_ci_low": 0.
|
431 |
-
"accuracy_ci_high": 0.
|
432 |
"score_name": "accuracy",
|
433 |
-
"score": 0.
|
434 |
-
"score_ci_high": 0.
|
435 |
-
"score_ci_low": 0.
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
-
"accuracy": 0.
|
440 |
-
"accuracy_ci_low": 0.
|
441 |
-
"accuracy_ci_high": 0.
|
442 |
"score_name": "accuracy",
|
443 |
-
"score": 0.
|
444 |
-
"score_ci_high": 0.
|
445 |
-
"score_ci_low": 0.
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
-
"accuracy": 0.
|
450 |
-
"accuracy_ci_low": 0.
|
451 |
-
"accuracy_ci_high": 0.
|
452 |
"score_name": "accuracy",
|
453 |
-
"score": 0.
|
454 |
-
"score_ci_high": 0.
|
455 |
-
"score_ci_low": 0.
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
-
"accuracy": 0.
|
460 |
-
"accuracy_ci_low": 0.
|
461 |
-
"accuracy_ci_high": 0.
|
462 |
"score_name": "accuracy",
|
463 |
-
"score": 0.
|
464 |
-
"score_ci_high": 0.
|
465 |
-
"score_ci_low": 0.
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
-
"score": 0.
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
-
"f1_macro": 0.
|
475 |
-
"f1_suggestive": 0.
|
476 |
-
"f1_generic": 0.
|
477 |
-
"
|
478 |
-
"
|
479 |
-
"
|
480 |
-
"f1_macro_ci_low": 0.
|
481 |
-
"f1_macro_ci_high": 0.
|
482 |
"score_name": "f1_micro",
|
483 |
-
"score": 0.
|
484 |
-
"score_ci_high": 0.
|
485 |
-
"score_ci_low": 0.
|
486 |
"num_of_instances": 85,
|
487 |
-
"accuracy": 0.
|
488 |
-
"accuracy_ci_low": 0.
|
489 |
-
"accuracy_ci_high": 0.
|
490 |
-
"f1_micro": 0.
|
491 |
-
"f1_micro_ci_low": 0.
|
492 |
-
"f1_micro_ci_high": 0.
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
-
"f1_macro": 0.
|
496 |
-
"f1_no": 0.
|
497 |
-
"f1_yes": 0.
|
498 |
-
"f1_macro_ci_low": 0.
|
499 |
-
"f1_macro_ci_high": 0.
|
500 |
"score_name": "f1_micro",
|
501 |
-
"score": 0.
|
502 |
-
"score_ci_high": 0.
|
503 |
-
"score_ci_low": 0.
|
504 |
"num_of_instances": 200,
|
505 |
-
"accuracy": 0.
|
506 |
-
"accuracy_ci_low": 0.
|
507 |
-
"accuracy_ci_high": 0.
|
508 |
-
"f1_micro": 0.
|
509 |
-
"f1_micro_ci_low": 0.
|
510 |
-
"f1_micro_ci_high": 0.
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
-
"f1_macro": 0.
|
514 |
-
"f1_conclusion": 0.
|
515 |
-
"f1_decree": 0.
|
516 |
-
"f1_issue": 0.
|
517 |
-
"f1_analysis": 0.
|
518 |
-
"f1_facts": 0.
|
519 |
-
"f1_procedural history": 0.
|
520 |
-
"f1_rule": 0.
|
521 |
-
"f1_macro_ci_low": 0.
|
522 |
-
"f1_macro_ci_high": 0.
|
523 |
"score_name": "f1_micro",
|
524 |
-
"score": 0.
|
525 |
-
"score_ci_high": 0.
|
526 |
-
"score_ci_low": 0.
|
527 |
"num_of_instances": 200,
|
528 |
-
"accuracy": 0.
|
529 |
-
"accuracy_ci_low": 0.
|
530 |
-
"accuracy_ci_high": 0.
|
531 |
-
"f1_micro": 0.
|
532 |
-
"f1_micro_ci_low": 0.
|
533 |
-
"f1_micro_ci_high": 0.
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
-
"f1_macro": 0.
|
537 |
-
"f1_yes": 0.
|
538 |
-
"f1_no": 0.
|
539 |
-
"f1_macro_ci_low": 0.
|
540 |
-
"f1_macro_ci_high": 0.
|
541 |
"score_name": "f1_micro",
|
542 |
-
"score": 0.
|
543 |
-
"score_ci_high": 0.
|
544 |
-
"score_ci_low": 0.
|
545 |
"num_of_instances": 200,
|
546 |
-
"accuracy": 0.
|
547 |
-
"accuracy_ci_low": 0.
|
548 |
-
"accuracy_ci_high": 0.
|
549 |
-
"f1_micro": 0.
|
550 |
-
"f1_micro_ci_low": 0.
|
551 |
-
"f1_micro_ci_high": 0.
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
-
"f1_macro": 0.
|
555 |
-
"f1_yes": 0.
|
556 |
-
"f1_no": 0.
|
557 |
-
"f1_macro_ci_low": 0.
|
558 |
-
"f1_macro_ci_high": 0.
|
559 |
"score_name": "f1_micro",
|
560 |
-
"score": 0.
|
561 |
-
"score_ci_high": 0.
|
562 |
-
"score_ci_low": 0.
|
563 |
"num_of_instances": 85,
|
564 |
-
"accuracy": 0.
|
565 |
-
"accuracy_ci_low": 0.
|
566 |
-
"accuracy_ci_high": 0.
|
567 |
-
"f1_micro": 0.
|
568 |
-
"f1_micro_ci_low": 0.
|
569 |
-
"f1_micro_ci_high": 0.
|
570 |
},
|
571 |
-
"score": 0.
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
-
"f1_macro": 0.
|
578 |
-
"f1_cars": 0.
|
579 |
-
"f1_windows x": 0.
|
580 |
-
"
|
581 |
-
"
|
582 |
-
"
|
583 |
-
"
|
584 |
-
"
|
585 |
-
"
|
586 |
-
"
|
587 |
-
"
|
588 |
-
"
|
589 |
-
"
|
590 |
-
"
|
591 |
-
"
|
592 |
-
"
|
593 |
-
"
|
594 |
-
"
|
595 |
-
"
|
596 |
-
"
|
597 |
-
"
|
598 |
-
"f1_macro_ci_low": 0.
|
599 |
-
"f1_macro_ci_high": 0.
|
600 |
"score_name": "f1_micro",
|
601 |
-
"score": 0.
|
602 |
-
"score_ci_high": 0.
|
603 |
-
"score_ci_low": 0.
|
604 |
"num_of_instances": 1000,
|
605 |
-
"accuracy": 0.
|
606 |
-
"accuracy_ci_low": 0.
|
607 |
-
"accuracy_ci_high": 0.
|
608 |
-
"f1_micro": 0.
|
609 |
-
"f1_micro_ci_low": 0.
|
610 |
-
"f1_micro_ci_high": 0.
|
611 |
},
|
612 |
-
"score": 0.
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
-
"f1_macro": 0.
|
619 |
-
"
|
620 |
-
"
|
621 |
-
"
|
622 |
-
"
|
623 |
-
"
|
624 |
-
"f1_payday loan or title loan or personal loan": 0.
|
625 |
-
"
|
626 |
-
"
|
627 |
-
"
|
628 |
-
"f1_macro_ci_low": 0.
|
629 |
-
"f1_macro_ci_high": 0.
|
630 |
"score_name": "f1_micro",
|
631 |
-
"score": 0.
|
632 |
-
"score_ci_high": 0.
|
633 |
-
"score_ci_low": 0.
|
634 |
"num_of_instances": 1000,
|
635 |
-
"accuracy": 0.
|
636 |
-
"accuracy_ci_low": 0.
|
637 |
-
"accuracy_ci_high": 0.
|
638 |
-
"f1_micro": 0.
|
639 |
-
"f1_micro_ci_low": 0.
|
640 |
-
"f1_micro_ci_high": 0.
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
-
"f1_macro": 0.
|
644 |
-
"f1_mortgages and loans": 0.
|
645 |
-
"f1_credit card": 0.
|
646 |
-
"f1_debt collection": 0.
|
647 |
-
"
|
648 |
-
"
|
649 |
-
"f1_macro_ci_low": 0.
|
650 |
-
"f1_macro_ci_high": 0.
|
651 |
"score_name": "f1_micro",
|
652 |
-
"score": 0.
|
653 |
-
"score_ci_high": 0.
|
654 |
-
"score_ci_low": 0.
|
655 |
"num_of_instances": 500,
|
656 |
-
"accuracy": 0.
|
657 |
-
"accuracy_ci_low": 0.
|
658 |
-
"accuracy_ci_high": 0.
|
659 |
-
"f1_micro": 0.
|
660 |
-
"f1_micro_ci_low": 0.
|
661 |
-
"f1_micro_ci_high": 0.
|
662 |
},
|
663 |
-
"score": 0.
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
-
"execution_accuracy": 0.
|
671 |
-
"program_accuracy": 0.
|
672 |
-
"score": 0.
|
673 |
"score_name": "program_accuracy",
|
674 |
-
"execution_accuracy_ci_low": 0.
|
675 |
-
"execution_accuracy_ci_high": 0.
|
676 |
-
"program_accuracy_ci_low": 0.
|
677 |
-
"program_accuracy_ci_high": 0.
|
678 |
-
"score_ci_low": 0.
|
679 |
-
"score_ci_high": 0.
|
680 |
},
|
681 |
-
"score": 0.
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
-
"precision": 0.
|
688 |
-
"recall": 0.
|
689 |
-
"f1": 0.
|
690 |
-
"precision_ci_low": 0.
|
691 |
-
"precision_ci_high": 0.
|
692 |
-
"recall_ci_low": 0.
|
693 |
-
"recall_ci_high": 0.
|
694 |
-
"f1_ci_low": 0.
|
695 |
-
"f1_ci_high": 0.
|
696 |
"score_name": "f1",
|
697 |
-
"score": 0.
|
698 |
-
"score_ci_high": 0.
|
699 |
-
"score_ci_low": 0.
|
700 |
"num_of_instances": 600,
|
701 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
702 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
703 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
704 |
-
"faithfullness_f1_token_overlap": 0.
|
705 |
-
"faithfullness_recall_token_overlap": 0.
|
706 |
-
"faithfullness_precision_token_overlap": 0.
|
707 |
-
"correctness_f1_token_overlap": 0.
|
708 |
-
"correctness_recall_token_overlap": 0.
|
709 |
-
"correctness_precision_token_overlap": 0.
|
710 |
},
|
711 |
-
"score": 0.
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
-
"accuracy": 0.
|
718 |
-
"accuracy_ci_low": 0.
|
719 |
-
"accuracy_ci_high": 0.
|
720 |
"score_name": "accuracy",
|
721 |
-
"score": 0.
|
722 |
-
"score_ci_high": 0.
|
723 |
-
"score_ci_low": 0.
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
-
"accuracy": 0.
|
728 |
-
"accuracy_ci_low": 0.
|
729 |
-
"accuracy_ci_high": 0.
|
730 |
"score_name": "accuracy",
|
731 |
-
"score": 0.
|
732 |
-
"score_ci_high": 0.
|
733 |
-
"score_ci_low": 0.
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
-
"score": 0.
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
740 |
"safety": {
|
741 |
"attaq_500": {
|
742 |
"severity_critical": 3.0,
|
743 |
-
"severity_high":
|
744 |
-
"severity_medium":
|
745 |
-
"severity_low":
|
746 |
-
"category_harmful_info": 0.
|
747 |
-
"category_deception": 0.
|
748 |
-
"category_explicit_content": 0.
|
749 |
-
"category_violence": 0.
|
750 |
-
"category_discrimination": 0.
|
751 |
-
"category_substance_abuse": 0.
|
752 |
-
"category_pii": 0.
|
753 |
-
"safety": 0.
|
754 |
-
"safety_ci_low": 0.
|
755 |
-
"safety_ci_high": 0.
|
756 |
"score_name": "safety",
|
757 |
-
"score": 0.
|
758 |
-
"score_ci_high": 0.
|
759 |
-
"score_ci_low": 0.
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
-
"score": 0.
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
-
"rouge1": 0.
|
770 |
-
"
|
771 |
-
"
|
|
|
772 |
"score_name": "rougeL",
|
773 |
-
"
|
774 |
-
"
|
775 |
-
"
|
776 |
-
"
|
777 |
-
"
|
778 |
-
"
|
779 |
-
"
|
780 |
-
"
|
781 |
-
"
|
782 |
-
"
|
783 |
-
"
|
784 |
-
"rouge2_ci_high": 0.19189115183254815
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
-
"rouge1": 0.
|
789 |
-
"
|
790 |
-
"
|
|
|
791 |
"score_name": "rougeL",
|
792 |
-
"
|
793 |
-
"
|
794 |
-
"
|
795 |
-
"
|
796 |
-
"
|
797 |
-
"
|
798 |
-
"
|
799 |
-
"
|
800 |
-
"
|
801 |
-
"
|
802 |
-
"
|
803 |
-
"rouge2_ci_high": 0.0176417953678172
|
804 |
},
|
805 |
-
"score": 0.
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
@@ -810,473 +810,473 @@
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
-
|
814 |
-
|
815 |
-
|
816 |
-
|
817 |
],
|
818 |
"totals": [
|
819 |
-
|
820 |
-
|
821 |
-
|
822 |
-
|
823 |
],
|
824 |
"precisions": [
|
825 |
-
0.
|
826 |
-
0.
|
827 |
-
0.
|
828 |
-
0.
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
-
"sys_len":
|
832 |
"ref_len": 1734,
|
833 |
-
"sacrebleu": 0.
|
834 |
-
"score": 0.
|
835 |
"score_name": "sacrebleu",
|
836 |
-
"score_ci_low": 0.
|
837 |
-
"score_ci_high": 0.
|
838 |
-
"sacrebleu_ci_low": 0.
|
839 |
-
"sacrebleu_ci_high": 0.
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
-
|
845 |
-
|
846 |
-
|
847 |
-
|
848 |
],
|
849 |
"totals": [
|
850 |
-
|
851 |
-
|
852 |
-
|
853 |
-
|
854 |
],
|
855 |
"precisions": [
|
856 |
-
0.
|
857 |
-
0.
|
858 |
-
0.
|
859 |
-
0.
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
-
"sys_len":
|
863 |
"ref_len": 1734,
|
864 |
-
"sacrebleu": 0.
|
865 |
-
"score": 0.
|
866 |
"score_name": "sacrebleu",
|
867 |
-
"score_ci_low": 0.
|
868 |
-
"score_ci_high": 0.
|
869 |
-
"sacrebleu_ci_low": 0.
|
870 |
-
"sacrebleu_ci_high": 0.
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
-
|
876 |
-
|
877 |
-
|
878 |
-
|
879 |
],
|
880 |
"totals": [
|
881 |
-
|
882 |
-
|
883 |
-
|
884 |
-
|
885 |
],
|
886 |
"precisions": [
|
887 |
-
0.
|
888 |
-
0.
|
889 |
-
0.
|
890 |
-
0.
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
-
"sys_len":
|
894 |
"ref_len": 1589,
|
895 |
-
"sacrebleu": 0.
|
896 |
-
"score": 0.
|
897 |
"score_name": "sacrebleu",
|
898 |
-
"score_ci_low": 0.
|
899 |
-
"score_ci_high": 0.
|
900 |
-
"sacrebleu_ci_low": 0.
|
901 |
-
"sacrebleu_ci_high": 0.
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
-
|
907 |
-
|
908 |
-
|
909 |
-
|
910 |
],
|
911 |
"totals": [
|
912 |
-
|
913 |
-
|
914 |
-
|
915 |
-
|
916 |
],
|
917 |
"precisions": [
|
918 |
-
0.
|
919 |
-
0.
|
920 |
-
0.
|
921 |
-
0.
|
922 |
],
|
923 |
-
"bp":
|
924 |
-
"sys_len":
|
925 |
"ref_len": 1835,
|
926 |
-
"sacrebleu": 0.
|
927 |
-
"score": 0.
|
928 |
"score_name": "sacrebleu",
|
929 |
-
"score_ci_low": 0.
|
930 |
-
"score_ci_high": 0.
|
931 |
-
"sacrebleu_ci_low": 0.
|
932 |
-
"sacrebleu_ci_high": 0.
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
],
|
942 |
"totals": [
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
],
|
948 |
"precisions": [
|
949 |
-
0.
|
950 |
-
0.
|
951 |
-
0.
|
952 |
-
0.
|
953 |
],
|
954 |
"bp": 1.0,
|
955 |
-
"sys_len":
|
956 |
"ref_len": 2068,
|
957 |
-
"sacrebleu": 0.
|
958 |
-
"score": 0.
|
959 |
"score_name": "sacrebleu",
|
960 |
-
"score_ci_low": 0.
|
961 |
-
"score_ci_high": 0.
|
962 |
-
"sacrebleu_ci_low": 0.
|
963 |
-
"sacrebleu_ci_high": 0.
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
-
|
969 |
-
|
970 |
-
|
971 |
-
|
972 |
],
|
973 |
"totals": [
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
],
|
979 |
"precisions": [
|
980 |
-
0.
|
981 |
-
0.
|
982 |
-
0.
|
983 |
-
0.
|
(tail of the preceding results file in this diff; the removed metric values are cut off in the diff view)
  984-995    end of "mt_flores_101_eng_kor": "bp": 1.0, "ref_len": 2235, "score_name": "sacrebleu"; removed "sys_len", "sacrebleu", "score" and CI values truncated
  996-1026   "mt_flores_101_eng_por": "num_of_instances": 66, "bp": 1.0, "ref_len": 1916; removed counts, totals, precisions and sacrebleu/CI values truncated
  1027-1057  "mt_flores_101_eng_ron": "num_of_instances": 66, "bp": 1.0, "ref_len": 1949; removed values truncated
  1058-1088  "mt_flores_101_eng_spa": "num_of_instances": 66, "ref_len": 2098; removed "bp", "sys_len", sacrebleu and CI values truncated
  1089-1119  "mt_flores_101_fra_eng": "num_of_instances": 66, "bp": 1.0, "ref_len": 1734; removed values truncated
  1120-1150  "mt_flores_101_jpn_eng": "num_of_instances": 66, "bp": 1.0, "ref_len": 1734; removed values truncated
  1151-1181  "mt_flores_101_kor_eng": "num_of_instances": 66, "bp": 1.0, "ref_len": 1734; removed values truncated
  1182-1212  "mt_flores_101_por_eng": "num_of_instances": 66, "bp": 1.0, "ref_len": 1734; removed values truncated
  1213-1243  "mt_flores_101_ron_eng": "num_of_instances": 66, "bp": 1.0, "ref_len": 1734; removed values truncated
  1244-1274  "mt_flores_101_spa_eng": "num_of_instances": 66, "bp": 1.0, "ref_len": 1734; removed values truncated
  1275-1278  category aggregate: "score_name": "subsets_mean", "num_of_instances": 990; removed "score" truncated
  1279-1282  file aggregate: "score_name": "subsets_mean", "num_of_instances": 12472; removed "score" truncated
|
|
  1        {
  2          "environment_info": {
  3    +       "timestamp_utc": "2025-06-23T19:33:07.872441Z",
  4            "command_line_invocation": [
  5              "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
  6              "--tasks",

  8              "--model",
  9              "cross_provider",
 10              "--model_args",
 11    +         "model_name=watsonx/mistralai/mistral-large,max_tokens=256",
 12              "--output_path",
 13              "./results/bluebench",
 14              "--log_samples",

 26          "num_fewshots": null,
 27          "limit": null,
 28          "batch_size": 8,
 29    +     "model": "watsonx/mistralai/mistral-large",
 30          "model_args": {
 31            "max_tokens": 256
 32          },

 42          "cache_dir": null
 43        },
 44        "unitxt_version": "1.24.0",
 45    +   "unitxt_commit_hash": "9aa85a9a01cfe9609a2d010f6d4bd1e88e782740",
 46        "python_version": "3.10.18",
 47        "system": "Linux",
 48        "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
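Each of these evaluation_results.json files shares the layout shown above: an "environment_info" header followed by a "results" object whose category blocks each carry a "score", "score_name" and "num_of_instances". A minimal sketch for pulling the per-category scores out of one of these files; the path is a placeholder, not part of this commit:

    import json

    # Placeholder path: substitute any results/bluebench/*_evaluation_results.json file.
    with open("results/bluebench/<timestamp>_evaluation_results.json") as f:
        report = json.load(f)

    for name, block in report["results"].items():
        # Category entries are dicts; the sibling "score"/"score_name"/"num_of_instances"
        # keys hold the file-level aggregate and are skipped here.
        if isinstance(block, dict):
            print(f"{name}: {block['score']:.4f} ({block['num_of_instances']} instances)")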
|
|
176-291    "results", "bias" category (subsets_mean 0.9050505050505051 over 990 instances):
             safety_bbq_age: accuracy 0.8666666666666667, CI [0.7888888888888889, 0.9222222222222223], 90 instances
             safety_bbq_disability_status: accuracy 0.8888888888888888, CI [0.8111111111111111, 0.9444444444444444], 90 instances
             safety_bbq_gender_identity: accuracy 0.9888888888888889, CI [0.9283857779145438, 1.0], 90 instances
             safety_bbq_nationality: accuracy 0.8666666666666667, CI [0.7858277377703305, 0.9333333333333333], 90 instances
             safety_bbq_physical_appearance: accuracy 0.9333333333333333, CI [0.8666666666666667, 0.9777777777777777], 90 instances
             safety_bbq_race_ethnicity: accuracy 0.9555555555555556, CI [0.9, 0.9888888888888889], 90 instances
             safety_bbq_race_x_gender: accuracy 0.9111111111111111, CI [0.8333333333333334, 0.9555555555555556], 90 instances
             safety_bbq_race_x_ses: accuracy 0.8666666666666667, CI [0.7888888888888889, 0.9222222222222223], 90 instances
             safety_bbq_religion: accuracy 0.8666666666666667, CI [0.788388746882511, 0.9222222222222223], 90 instances
             safety_bbq_ses: accuracy 0.8888888888888888, CI [0.8111111111111111, 0.9444444444444444], 90 instances
             safety_bbq_sexual_orientation: accuracy 0.9222222222222223, CI [0.8555555555555555, 0.9666666666666667], 90 instances
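The category-level "score" with "score_name": "subsets_mean" matches the unweighted mean of the subset scores rather than an instance-weighted one: the eleven BBQ accuracies above average to 0.9050505050505051, and the reasoning category further down reports 0.747, the plain mean of hellaswag (0.57) and openbook_qa (0.924) despite their different instance counts. The file-level score (0.4277276595742623) is likewise the mean of the thirteen category scores. A small sketch of that aggregation; the helper name is illustrative:

    def subsets_mean(category: dict) -> float:
        # Average the "score" of every nested subset dict; the scalar aggregate
        # fields ("score", "score_name", "num_of_instances") are not dicts and are skipped.
        scores = [v["score"] for v in category.values() if isinstance(v, dict)]
        return sum(scores) / len(scores)

    # For the bias block above this yields 0.9050505050505051, the stored value.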
292-302    "chatbot_abilities" (subsets_mean 0.09158878504672897 over 500 instances):
             arena_hard_generation_english_gpt_4_0314_reference: llama_3_70b_instruct_template_arena_hard 0.09158878504672897, 500 instances
303-326    "entity_extraction" (subsets_mean 0.055161544523246654 over 1000 instances):
             universal_ner_en_ewt (1000 instances): f1_micro 0.055161544523246654, CI [0.04056157487156359, 0.07432669896850874]; recall_micro 0.06666666666666667, precision_micro 0.04704301075268817; f1_macro 0.08861788617886178, recall_macro 0.061454532512588166, precision_macro 0.1605612378704432; in_classes_support 0.2943548387096774; f1_Person 0.16666666666666666, f1_Organization 0.03252032520325203, f1_Location 0.06666666666666667
327-471    "knowledge" (subsets_mean 0.5513078470824949 over 994 instances; accuracy with CI, 71 instances per subset):
             mmlu_pro_biology: 0.7323943661971831, CI [0.6197183098591549, 0.8309859154929577]
             mmlu_pro_business: 0.39436619718309857, CI [0.28169014084507044, 0.5070422535211268]
             mmlu_pro_chemistry: 0.23943661971830985, CI [0.14084507042253522, 0.3380281690140845]
             mmlu_pro_computer_science: 0.647887323943662, CI [0.5352112676056338, 0.7464788732394366]
             mmlu_pro_economics: 0.7323943661971831, CI [0.6056338028169014, 0.8309859154929577]
             mmlu_pro_engineering: 0.2676056338028169, CI [0.16901408450704225, 0.38028169014084506]
             mmlu_pro_health: 0.6056338028169014, CI [0.4788732394366197, 0.704225352112676]
             mmlu_pro_history: 0.647887323943662, CI [0.5211267605633803, 0.7464788732394366]
             mmlu_pro_law: 0.5211267605633803, CI [0.40913735882879854, 0.6338028169014085]
             mmlu_pro_math: 0.4225352112676056, CI [0.30985915492957744, 0.5275288557194965]
             mmlu_pro_other: 0.647887323943662, CI [0.5211267605633803, 0.7464788732394366]
             mmlu_pro_philosophy: 0.6619718309859155, CI [0.5492957746478874, 0.7746478873239436]
             mmlu_pro_physics: 0.39436619718309857, CI [0.28169014084507044, 0.5070422535211268]
             mmlu_pro_psychology: 0.8028169014084507, CI [0.704225352112676, 0.8873239436619719]
472-574    "legal" (subsets_mean 0.3573935183812752 over 770 instances):
             legalbench_abercrombie (85 instances): f1_micro 0.30158730158730157, CI [0.2033898305084746, 0.421875]; f1_macro 0.30082491488530444, CI [0.21165741181075054, 0.4315719879768282]; accuracy 0.2235294117647059, CI [0.15294117647058825, 0.32721667655979375]; per-class f1: suggestive 0.2926829268292683, generic 0.3157894736842105, fanciful 0.2, descriptive 0.2608695652173913, arbitrary 0.43478260869565216
             legalbench_corporate_lobbying (200 instances): f1_micro 0.3, CI [0.22950819672131148, 0.37342833232881084]; f1_macro 0.2087664168882443, CI [0.16304347826086957, 0.27128054977534694]; accuracy 0.195, CI [0.145, 0.2511829758893259]; per-class f1: no 0.38578680203045684, yes 0.031746031746031744
             legalbench_function_of_decision_section (200 instances): f1_micro 0.14345991561181434, CI [0.08928571428571429, 0.2175732217573222]; f1_macro 0.1326239897668469, CI [0.0850100965627728, 0.1996422423627835]; accuracy 0.085, CI [0.055, 0.135]; per-class f1: conclusion 0.0, decree 0.14814814814814814, issue 0.05714285714285714, analysis 0.15, facts 0.06666666666666667, procedural history 0.25, rule 0.2564102564102564
             legalbench_international_citizenship_questions (200 instances): f1_micro 0.27049180327868855, CI [0.20259771606756347, 0.3511987633583538]; f1_macro 0.27590718171176754, CI [0.2059942090622298, 0.35594480291914055]; accuracy 0.165, CI [0.12, 0.225]; per-class f1: yes 0.3106796116504854, no 0.24113475177304963
             legalbench_proa (85 instances): f1_micro 0.7714285714285715, CI [0.6821705426356589, 0.8435374149659864]; f1_macro 0.7697368421052632, CI [0.6817007087256215, 0.8427704260296438]; accuracy 0.6352941176470588, CI [0.5176470588235295, 0.7294117647058823]; per-class f1: yes 0.75, no 0.7894736842105263
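In these LegalBench blocks, "f1_macro" is the unweighted mean of the per-class F1 values listed next to it, while "f1_micro" (the reported score) is the usual micro-average computed over the pooled predictions. A quick check against the legalbench_abercrombie numbers above, which reproduces the stored f1_macro up to floating-point rounding:

    # legalbench_abercrombie per-class F1 values, copied from the block above.
    per_class_f1 = [0.2926829268292683, 0.3157894736842105, 0.2,
                    0.2608695652173913, 0.43478260869565216]
    print(sum(per_class_f1) / len(per_class_f1))  # ~0.3008249148853044, i.e. the stored f1_macro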
575-615    "news_classification" (subsets_mean 0.3433734939759036 over 1000 instances):
             20_newsgroups_short (1000 instances): f1_micro 0.3433734939759036, CI [0.3096235116477192, 0.3755990938286412]; f1_macro 0.31185856604704804, CI [0.2856078509025084, 0.3426020462142722]; accuracy 0.228, CI [0.202, 0.253]; per-class f1: cars 0.5294117647058824, windows x 0.0, computer graphics 0.21875, atheism 0.0, religion 0.1935483870967742, medicine 0.4444444444444444, christianity 0.07272727272727272, microsoft windows 0.13793103448275862, middle east 0.2692307692307692, motorcycles 0.41975308641975306, pc hardware 0.47191011235955055, mac hardware 0.4943820224719101, for sale 0.2608695652173913, guns 0.0784313725490196, space 0.4594594594594595, cryptography 0.3389830508474576, baseball 0.46153846153846156, hockey 0.5918367346938775, politics 0.37142857142857144, electronics 0.4225352112676056
616-666    "product_help" (subsets_mean 0.6921185044788917 over 1500 instances):
             cfpb_product_2023 (1000 instances): f1_micro 0.7237076648841355, CI [0.6955584945084361, 0.7514570299103845]; f1_macro 0.6519421467580517, CI [0.5871374382389457, 0.7213495817777442]; accuracy 0.609, CI [0.577, 0.641]; per-class f1: student loan 0.75, credit reporting or credit repair services or other personal consumer reports 0.7592592592592593, debt collection 0.5138888888888888, checking or savings account 0.7073170731707317, mortgage 0.7931034482758621, payday loan or title loan or personal loan 0.4444444444444444, credit card or prepaid card 0.6727272727272727, money transfer or virtual currency or money service 0.6341463414634146, vehicle loan or lease 0.5925925925925926
             cfpb_product_watsonx (500 instances): f1_micro 0.6605293440736478, CI [0.6186622377558174, 0.7005417538024762]; f1_macro 0.6498825672130026, CI [0.6099650997428573, 0.6950683737354554]; accuracy 0.574, CI [0.534, 0.618]; per-class f1: mortgages and loans 0.6711409395973155, credit card 0.6853146853146853, debt collection 0.56, credit reporting 0.7279151943462897, retail banking 0.6050420168067226
667-684    "qa_finance" (subsets_mean 0.114 over 1000 instances):
             fin_qa (1000 instances): program_accuracy 0.114, CI [0.096, 0.135]; execution_accuracy 0.113, CI [0.095, 0.135]
685-714    "rag_general" (subsets_mean 0.3579933019872818 over 600 instances):
             rag_response_generation_clapnq (600 instances): f1 0.3579933019872818, CI [0.34026079062391323, 0.37577002560346623]; precision 0.3193234609846695, CI [0.2995158081508201, 0.34026768128986357]; recall 0.6156352660927287, CI [0.599899034156362, 0.6313695528855681]; correctness bert_score.deberta_large_mnli: f1 0.6041086231172085, recall 0.6908169340590636, precision 0.5559258947273096; faithfullness token_overlap: f1 0.3085026884811217, recall 0.24647156332743023, precision 0.536789234125785; correctness token_overlap: f1 0.3579933019872818, recall 0.6156352660927287, precision 0.3193234609846695
715-739    "reasoning" (subsets_mean 0.747 over 1500 instances):
             hellaswag (1000 instances): accuracy 0.57, CI [0.537, 0.599]
             openbook_qa (500 instances): accuracy 0.924, CI [0.8978500821223476, 0.944]
740-765    "safety" (subsets_mean 0.8051073061449763 over 100 instances):
             attaq_500 (100 instances): safety 0.8051073061449763, CI [0.7662176976956901, 0.8348793957598559]; severity_critical 3.0, severity_high 1.0, severity_medium 3.0, severity_low 93.0; category scores: harmful_info 0.7688285382612356, deception 0.8504795281033058, explicit_content 0.8499560091230604, violence 0.8008088633891136, discrimination 0.7961234071898081, substance_abuse 0.8387069291538662, pii 0.8280106396586807
766-808    "summarization" (subsets_mean 0.18995627511757335 over 1528 instances):
             billsum_document_filtered_to_6000_chars (528 instances): rougeL 0.2881192661001498, CI [0.2810508281662768, 0.2958753092187963]; rouge1 0.4079437392329112, CI [0.39815436123221776, 0.41769481108434164]; rouge2 0.20309268400418845, CI [0.19601529597372655, 0.21114250528228737]; rougeLsum 0.352934731899625, CI [0.3428414762936942, 0.36147234302505843]
             tldr_document_filtered_to_6000_chars (1000 instances): rougeL 0.09179328413499686, CI [0.08725116318521317, 0.09575800533068254]; rouge1 0.12777796269592456, CI [0.12168881499661288, 0.1332671550509936]; rouge2 0.018851084572764187, CI [0.01662623690565808, 0.020972697895929062]; rougeLsum 0.10603826262531557, CI [0.10116850286523955, 0.1106709838674053]
|
|
810-1282   mt_flores_101_* machine-translation subsets (sacrebleu with CI, 66 instances each; category subsets_mean 0.3504084926765324 over 990 instances) and the file-level aggregate (subsets_mean 0.4277276595742623 over 12472 instances):
             mt_flores_101_ara_eng: sacrebleu 0.39868943613707586, CI [0.34684429082723056, 0.4568678775074209]; bp 1.0, sys_len 1947, ref_len 1734; counts [1289, 858, 605, 439], totals [1947, 1881, 1815, 1749], precisions [0.6620441705187469, 0.456140350877193, 0.33333333333333337, 0.2510005717552887]
             mt_flores_101_deu_eng: sacrebleu 0.3175274085826544, CI [0.271260768895482, 0.365445678313604]; bp 1.0, sys_len 2483, ref_len 1734; counts [1323, 887, 629, 444], totals [2483, 2417, 2351, 2285], precisions [0.5328231977446637, 0.36698386429458, 0.26754572522330927, 0.19431072210065647]
             mt_flores_101_eng_ara: sacrebleu 0.2610192792824636, CI [0.20817526367502337, 0.29958193407273404]; bp 1.0, sys_len 1688, ref_len 1589; counts [940, 521, 315, 191], totals [1688, 1622, 1556, 1490], precisions [0.556872037914692, 0.3212083847102343, 0.20244215938303342, 0.12818791946308725]
             mt_flores_101_eng_deu: sacrebleu 0.38417359468716306, CI [0.32789485731980855, 0.4187305224203214]; bp 0.98904120617152, sys_len 1815, ref_len 1835; counts [1254, 784, 532, 376], totals [1815, 1749, 1683, 1617], precisions [0.6909090909090909, 0.4482561463693539, 0.31610219845513965, 0.23252937538651824]
             mt_flores_101_eng_fra: sacrebleu 0.5413012055320727, CI [0.5011833767453445, 0.591093351022506]; bp 1.0, sys_len 2097, ref_len 2068; counts [1572, 1194, 949, 766], totals [2097, 2031, 1965, 1899], precisions [0.7496423462088698, 0.5878877400295421, 0.48295165394402034, 0.4033701948393892]
             mt_flores_101_eng_kor: sacrebleu 0.27313266242858875, CI [0.24101006670532885, 0.2985709120047681]; bp 1.0, sys_len 2304, ref_len 2235; counts [1382, 762, 450, 277], totals [2304, 2238, 2172, 2106], precisions [0.5998263888888888, 0.34048257372654156, 0.20718232044198895, 0.1315289648622982]
             mt_flores_101_eng_por: sacrebleu 0.49014779569163686, CI [0.447345907278528, 0.5368115765817915]; bp 1.0, sys_len 2019, ref_len 1916; counts [1454, 1049, 810, 633], totals [2019, 1953, 1887, 1821], precisions [0.7201584943041109, 0.5371223758320532, 0.4292527821939586, 0.3476112026359144]
             mt_flores_101_eng_ron: sacrebleu 0.4405172214713006, CI [0.40615604089781865, 0.516381897740174]; bp 1.0, sys_len 1962, ref_len 1949; counts [1390, 967, 688, 489], totals [1962, 1896, 1830, 1764], precisions [0.7084607543323139, 0.5100210970464135, 0.37595628415300547, 0.27721088435374147]
             mt_flores_101_eng_spa: sacrebleu 0.28856959420154726, CI [0.2645022427610819, 0.3240217870629309]; bp 0.956168891168866, sys_len 2008, ref_len 2098; counts [1287, 732, 440, 265], totals [2008, 1942, 1876, 1810], precisions [0.6409362549800797, 0.3769309989701339, 0.2345415778251599, 0.1464088397790055]
             mt_flores_101_fra_eng: sacrebleu 0.33677431862395624, CI [0.2922714206661574, 0.38804905185639504]; bp 1.0, sys_len 2543, ref_len 1734; counts [1361, 951, 694, 510], totals [2543, 2477, 2411, 2345], precisions [0.5351946519858435, 0.3839321760193783, 0.2878473662380755, 0.21748400852878466]
             mt_flores_101_jpn_eng: sacrebleu 0.2697264590787591, CI [0.23374708119412382, 0.32852575808051926]; bp 1.0, sys_len 1999, ref_len 1734; counts [1134, 627, 390, 248], totals [1999, 1933, 1867, 1801], precisions [0.5672836418209105, 0.32436627004655977, 0.2088912694161757, 0.13770127706829538]
             mt_flores_101_kor_eng: sacrebleu 0.27012183165344417, CI [0.23833827580048095, 0.320793499898261]; bp 1.0, sys_len 1925, ref_len 1734; counts [1102, 602, 369, 241], totals [1925, 1859, 1793, 1727], precisions [0.5724675324675325, 0.32383001613770845, 0.20580033463469047, 0.13954834973943256]
             mt_flores_101_por_eng: sacrebleu 0.35733047002149876, CI [0.2995890270688701, 0.40598076558127455]; bp 1.0, sys_len 2564, ref_len 1734; counts [1392, 1000, 752, 574], totals [2564, 2498, 2432, 2366], precisions [0.5429017160686428, 0.400320256204964, 0.3092105263157895, 0.242603550295858]
             mt_flores_101_ron_eng: sacrebleu 0.3796077031006635, CI [0.3432297777298846, 0.4214166231989559]; bp 1.0, sys_len 2343, ref_len 1734; counts [1376, 983, 722, 538], totals [2343, 2277, 2211, 2145], precisions [0.5872812633376013, 0.43170838823012736, 0.32654907281772955, 0.25081585081585084]
             mt_flores_101_spa_eng: sacrebleu 0.24748840965516117, CI [0.21183776036339763, 0.2806697118543708]; bp 1.0, sys_len 2444, ref_len 1734; counts [1238, 705, 452, 287], totals [2444, 2378, 2312, 2246], precisions [0.5065466448445172, 0.2964676198486123, 0.19550173010380623, 0.12778272484416742]
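The sacrebleu entries above are internally consistent with the standard BLEU combination: each precision equals counts[i] / totals[i], the brevity penalty "bp" is 1.0 when sys_len >= ref_len and exp(1 - ref_len / sys_len) otherwise (for eng_deu, exp(1 - 1835/1815) is approximately 0.98904), and the score is bp times the geometric mean of the four precisions. A short check against the mt_flores_101_ara_eng numbers:

    import math

    # n-gram statistics copied from the mt_flores_101_ara_eng block above.
    counts = [1289, 858, 605, 439]
    totals = [1947, 1881, 1815, 1749]
    sys_len, ref_len = 1947, 1734

    precisions = [c / t for c, t in zip(counts, totals)]                  # 1- to 4-gram precisions
    bp = 1.0 if sys_len >= ref_len else math.exp(1 - ref_len / sys_len)   # brevity penalty
    score = bp * math.exp(sum(math.log(p) for p in precisions) / 4)
    print(score)  # ~0.39868943613707586, the stored "sacrebleu" value, up to float rounding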