cccjc commited on
Commit
f9f9466
·
1 Parent(s): e8aee10

add phi-4-multimodal single-image results

Browse files
static/eval_results/SI/Phi-4-multimodal/summary_and_keyword_stats.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_summary": {
3
+ "core": {
4
+ "num_eval_tasks": 273,
5
+ "num_eval_samples": 4108,
6
+ "macro_mean_score": 0.2823941192020807
7
+ },
8
+ "open": {
9
+ "num_eval_tasks": 42,
10
+ "num_eval_samples": 808,
11
+ "macro_mean_score": 0.3823445906353607
12
+ },
13
+ "overall_score": 0.295720848726518
14
+ },
15
+ "keyword_stats": {
16
+ "skills": {
17
+ "Object Recognition and Classification": {
18
+ "count": 172,
19
+ "num_samples": 2704,
20
+ "tasks": [],
21
+ "average_score": 0.30268568055335215
22
+ },
23
+ "Language Understanding and Generation": {
24
+ "count": 102,
25
+ "num_samples": 1707,
26
+ "tasks": [],
27
+ "average_score": 0.3237790975377257
28
+ },
29
+ "Commonsense and Social Reasoning": {
30
+ "count": 38,
31
+ "num_samples": 652,
32
+ "tasks": [],
33
+ "average_score": 0.3983244100964485
34
+ },
35
+ "Scene and Event Understanding": {
36
+ "count": 60,
37
+ "num_samples": 1004,
38
+ "tasks": [],
39
+ "average_score": 0.3909562927861254
40
+ },
41
+ "Domain-Specific Knowledge and Skills": {
42
+ "count": 46,
43
+ "num_samples": 896,
44
+ "tasks": [],
45
+ "average_score": 0.2523943859302362
46
+ },
47
+ "Ethical and Safety Reasoning": {
48
+ "count": 10,
49
+ "num_samples": 170,
50
+ "tasks": [],
51
+ "average_score": 0.6309661654135339
52
+ },
53
+ "Text Recognition (OCR)": {
54
+ "count": 101,
55
+ "num_samples": 1680,
56
+ "tasks": [],
57
+ "average_score": 0.29162219012223434
58
+ },
59
+ "Spatial and Temporal Reasoning": {
60
+ "count": 78,
61
+ "num_samples": 1270,
62
+ "tasks": [],
63
+ "average_score": 0.26560617638750833
64
+ },
65
+ "Mathematical and Logical Reasoning": {
66
+ "count": 91,
67
+ "num_samples": 1628,
68
+ "tasks": [],
69
+ "average_score": 0.27699332062545984
70
+ },
71
+ "Planning and Decision Making": {
72
+ "count": 23,
73
+ "num_samples": 355,
74
+ "tasks": [],
75
+ "average_score": 0.07355920237987477
76
+ }
77
+ },
78
+ "input_format": {
79
+ "Photographs": {
80
+ "count": 83,
81
+ "num_samples": 1310,
82
+ "tasks": [],
83
+ "average_score": 0.39006013553617747
84
+ },
85
+ "Artistic and Creative Content": {
86
+ "count": 22,
87
+ "num_samples": 388,
88
+ "tasks": [],
89
+ "average_score": 0.3617172121007013
90
+ },
91
+ "Diagrams and Data Visualizations": {
92
+ "count": 88,
93
+ "num_samples": 1523,
94
+ "tasks": [],
95
+ "average_score": 0.303987814969384
96
+ },
97
+ "Text-Based Images and Documents": {
98
+ "count": 53,
99
+ "num_samples": 847,
100
+ "tasks": [],
101
+ "average_score": 0.2115999424551206
102
+ },
103
+ "User Interface Screenshots": {
104
+ "count": 67,
105
+ "num_samples": 1117,
106
+ "tasks": [],
107
+ "average_score": 0.21992427425071098
108
+ },
109
+ "3D Models and Aerial Imagery": {
110
+ "count": 2,
111
+ "num_samples": 30,
112
+ "tasks": [],
113
+ "average_score": 0.059323195455102756
114
+ }
115
+ },
116
+ "output_format": {
117
+ "contextual_formatted_text": {
118
+ "count": 63,
119
+ "num_samples": 972,
120
+ "tasks": [],
121
+ "average_score": 0.2728924252231668
122
+ },
123
+ "open_ended_output": {
124
+ "count": 51,
125
+ "num_samples": 986,
126
+ "tasks": [],
127
+ "average_score": 0.3440970391600263
128
+ },
129
+ "structured_output": {
130
+ "count": 72,
131
+ "num_samples": 1120,
132
+ "tasks": [],
133
+ "average_score": 0.24303664282966675
134
+ },
135
+ "numerical_data": {
136
+ "count": 39,
137
+ "num_samples": 694,
138
+ "tasks": [],
139
+ "average_score": 0.3165017992390139
140
+ },
141
+ "multiple_choice": {
142
+ "count": 33,
143
+ "num_samples": 567,
144
+ "tasks": [],
145
+ "average_score": 0.46490882854519217
146
+ },
147
+ "exact_text": {
148
+ "count": 57,
149
+ "num_samples": 876,
150
+ "tasks": [],
151
+ "average_score": 0.2320472941505867
152
+ }
153
+ },
154
+ "input_num": {
155
+ "1-image": {
156
+ "count": 315,
157
+ "num_samples": 5215,
158
+ "tasks": [],
159
+ "average_score": 0.29572084872651805
160
+ }
161
+ },
162
+ "app": {
163
+ "Knowledge": {
164
+ "count": 77,
165
+ "num_samples": 1291,
166
+ "tasks": [],
167
+ "average_score": 0.3013586962585727
168
+ },
169
+ "Perception": {
170
+ "count": 82,
171
+ "num_samples": 1318,
172
+ "tasks": [],
173
+ "average_score": 0.44497392384996665
174
+ },
175
+ "Coding": {
176
+ "count": 16,
177
+ "num_samples": 244,
178
+ "tasks": [],
179
+ "average_score": 0.25506535947712417
180
+ },
181
+ "Science": {
182
+ "count": 22,
183
+ "num_samples": 469,
184
+ "tasks": [],
185
+ "average_score": 0.30892938761701577
186
+ },
187
+ "Information_Extraction": {
188
+ "count": 41,
189
+ "num_samples": 639,
190
+ "tasks": [],
191
+ "average_score": 0.23344165315836407
192
+ },
193
+ "Planning": {
194
+ "count": 44,
195
+ "num_samples": 712,
196
+ "tasks": [],
197
+ "average_score": 0.10875081865164185
198
+ },
199
+ "Mathematics": {
200
+ "count": 30,
201
+ "num_samples": 497,
202
+ "tasks": [],
203
+ "average_score": 0.2414999173639976
204
+ },
205
+ "Metrics": {
206
+ "count": 3,
207
+ "num_samples": 45,
208
+ "tasks": [],
209
+ "average_score": 0.326984126984127
210
+ }
211
+ }
212
+ }
213
+ }
static/eval_results/SI/Phi-4-multimodal/task_results.json ADDED
@@ -0,0 +1,2207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "name": "ascii_art_30",
4
+ "score": 0.07142857142857142,
5
+ "eval_type": "llm",
6
+ "num_demo": 1,
7
+ "num_query": 14
8
+ },
9
+ {
10
+ "name": "humor_explanation",
11
+ "score": 0.3066666666666667,
12
+ "eval_type": "llm",
13
+ "num_demo": 1,
14
+ "num_query": 15
15
+ },
16
+ {
17
+ "name": "science_figure_explanation",
18
+ "score": 0.5000000000000001,
19
+ "eval_type": "llm",
20
+ "num_demo": 1,
21
+ "num_query": 29
22
+ },
23
+ {
24
+ "name": "vibe_eval_phrase",
25
+ "score": 0.43571428571428567,
26
+ "eval_type": "llm",
27
+ "num_demo": 1,
28
+ "num_query": 14
29
+ },
30
+ {
31
+ "name": "traffic_accident_analysis",
32
+ "score": 0.042857142857142864,
33
+ "eval_type": "llm",
34
+ "num_demo": 1,
35
+ "num_query": 14
36
+ },
37
+ {
38
+ "name": "figurative_speech_explanation",
39
+ "score": 0.6137931034482759,
40
+ "eval_type": "llm",
41
+ "num_demo": 1,
42
+ "num_query": 29
43
+ },
44
+ {
45
+ "name": "table2latex_complex",
46
+ "score": 0.43333333333333335,
47
+ "eval_type": "llm",
48
+ "num_demo": 1,
49
+ "num_query": 9
50
+ },
51
+ {
52
+ "name": "unusual_images",
53
+ "score": 0.5689655172413793,
54
+ "eval_type": "llm",
55
+ "num_demo": 1,
56
+ "num_query": 29
57
+ },
58
+ {
59
+ "name": "art_explanation",
60
+ "score": 0.3068965517241379,
61
+ "eval_type": "llm",
62
+ "num_demo": 1,
63
+ "num_query": 29
64
+ },
65
+ {
66
+ "name": "ocr_open_ended_qa",
67
+ "score": 0.6896551724137931,
68
+ "eval_type": "llm",
69
+ "num_demo": 1,
70
+ "num_query": 29
71
+ },
72
+ {
73
+ "name": "bar_chart_interpretation",
74
+ "score": 0.39655172413793105,
75
+ "eval_type": "llm",
76
+ "num_demo": 1,
77
+ "num_query": 29
78
+ },
79
+ {
80
+ "name": "scibench_w_solution_open_ended",
81
+ "score": 0.17,
82
+ "eval_type": "llm",
83
+ "num_demo": 1,
84
+ "num_query": 25
85
+ },
86
+ {
87
+ "name": "GUI_Chat_Hard",
88
+ "score": 0.6153846153846154,
89
+ "eval_type": "llm",
90
+ "num_demo": 1,
91
+ "num_query": 26
92
+ },
93
+ {
94
+ "name": "image_humor_understanding",
95
+ "score": 0.5793103448275864,
96
+ "eval_type": "llm",
97
+ "num_demo": 1,
98
+ "num_query": 29
99
+ },
100
+ {
101
+ "name": "defeasible_reasoning",
102
+ "score": 0.5827586206896552,
103
+ "eval_type": "llm",
104
+ "num_demo": 1,
105
+ "num_query": 29
106
+ },
107
+ {
108
+ "name": "funny_image_title",
109
+ "score": 0.6071428571428571,
110
+ "eval_type": "llm",
111
+ "num_demo": 1,
112
+ "num_query": 14
113
+ },
114
+ {
115
+ "name": "tweets_captioning",
116
+ "score": 0.2928571428571428,
117
+ "eval_type": "llm",
118
+ "num_demo": 1,
119
+ "num_query": 14
120
+ },
121
+ {
122
+ "name": "graph_interpretation",
123
+ "score": 0.503448275862069,
124
+ "eval_type": "llm",
125
+ "num_demo": 1,
126
+ "num_query": 29
127
+ },
128
+ {
129
+ "name": "meme_explain",
130
+ "score": 0.07142857142857142,
131
+ "eval_type": "llm",
132
+ "num_demo": 1,
133
+ "num_query": 14
134
+ },
135
+ {
136
+ "name": "guess_image_generation_prompt",
137
+ "score": 0.6736842105263158,
138
+ "eval_type": "llm",
139
+ "num_demo": 1,
140
+ "num_query": 19
141
+ },
142
+ {
143
+ "name": "visualization_with_code",
144
+ "score": 0.27142857142857146,
145
+ "eval_type": "llm",
146
+ "num_demo": 1,
147
+ "num_query": 14
148
+ },
149
+ {
150
+ "name": "iq_test_open_ended",
151
+ "score": 0.2827586206896551,
152
+ "eval_type": "llm",
153
+ "num_demo": 1,
154
+ "num_query": 29
155
+ },
156
+ {
157
+ "name": "electrocardiogram",
158
+ "score": 0.1642857142857143,
159
+ "eval_type": "llm",
160
+ "num_demo": 1,
161
+ "num_query": 14
162
+ },
163
+ {
164
+ "name": "image_captioning_with_additional_requirements",
165
+ "score": 0.742857142857143,
166
+ "eval_type": "llm",
167
+ "num_demo": 1,
168
+ "num_query": 14
169
+ },
170
+ {
171
+ "name": "docci_image_description_long",
172
+ "score": 0.4642857142857143,
173
+ "eval_type": "llm",
174
+ "num_demo": 1,
175
+ "num_query": 14
176
+ },
177
+ {
178
+ "name": "GUI_Chat_Easy",
179
+ "score": 0.6884615384615385,
180
+ "eval_type": "llm",
181
+ "num_demo": 1,
182
+ "num_query": 26
183
+ },
184
+ {
185
+ "name": "bridge_strategies_advanced",
186
+ "score": 0.10714285714285714,
187
+ "eval_type": "llm",
188
+ "num_demo": 1,
189
+ "num_query": 14
190
+ },
191
+ {
192
+ "name": "bridge_strategies_worldclass",
193
+ "score": 0.014285714285714287,
194
+ "eval_type": "llm",
195
+ "num_demo": 1,
196
+ "num_query": 14
197
+ },
198
+ {
199
+ "name": "bridge_strategies_expert",
200
+ "score": 0.2928571428571428,
201
+ "eval_type": "llm",
202
+ "num_demo": 1,
203
+ "num_query": 14
204
+ },
205
+ {
206
+ "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
207
+ "score": 0.07857142857142858,
208
+ "eval_type": "llm",
209
+ "num_demo": 1,
210
+ "num_query": 14
211
+ },
212
+ {
213
+ "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
214
+ "score": 0.09285714285714285,
215
+ "eval_type": "llm",
216
+ "num_demo": 1,
217
+ "num_query": 14
218
+ },
219
+ {
220
+ "name": "multi_lingual_Ruozhiba_expalnation_French",
221
+ "score": 0.11428571428571428,
222
+ "eval_type": "llm",
223
+ "num_demo": 1,
224
+ "num_query": 14
225
+ },
226
+ {
227
+ "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
228
+ "score": 0.05714285714285715,
229
+ "eval_type": "llm",
230
+ "num_demo": 1,
231
+ "num_query": 14
232
+ },
233
+ {
234
+ "name": "multi_lingual_Ruozhiba_expalnation_Russian",
235
+ "score": 0.03571428571428571,
236
+ "eval_type": "llm",
237
+ "num_demo": 1,
238
+ "num_query": 14
239
+ },
240
+ {
241
+ "name": "multi_lingual_Ruozhiba_expalnation_English",
242
+ "score": 0.03571428571428571,
243
+ "eval_type": "llm",
244
+ "num_demo": 1,
245
+ "num_query": 14
246
+ },
247
+ {
248
+ "name": "table_understanding_fetaqa",
249
+ "score": 0.35000000000000003,
250
+ "eval_type": "llm",
251
+ "num_demo": 1,
252
+ "num_query": 14
253
+ },
254
+ {
255
+ "name": "red_teaming_celebrity",
256
+ "score": 0.8300000000000001,
257
+ "eval_type": "llm",
258
+ "num_demo": 0,
259
+ "num_query": 20
260
+ },
261
+ {
262
+ "name": "red_teaming_captcha",
263
+ "score": 0.10000000000000003,
264
+ "eval_type": "llm",
265
+ "num_demo": 1,
266
+ "num_query": 19
267
+ },
268
+ {
269
+ "name": "red_teaming_jailbreak",
270
+ "score": 0.6700000000000002,
271
+ "eval_type": "llm",
272
+ "num_demo": 0,
273
+ "num_query": 20
274
+ },
275
+ {
276
+ "name": "red_teaming_visualmisleading",
277
+ "score": 0.8789473684210528,
278
+ "eval_type": "llm",
279
+ "num_demo": 1,
280
+ "num_query": 19
281
+ },
282
+ {
283
+ "name": "red_teaming_racial",
284
+ "score": 0.675,
285
+ "eval_type": "llm",
286
+ "num_demo": 0,
287
+ "num_query": 20
288
+ },
289
+ {
290
+ "name": "red_teaming_politics",
291
+ "score": 0.6499999999999999,
292
+ "eval_type": "llm",
293
+ "num_demo": 0,
294
+ "num_query": 20
295
+ },
296
+ {
297
+ "name": "brand_logo_recognition_and_elaboration",
298
+ "score": 0.34,
299
+ "eval_type": "rule",
300
+ "num_demo": 1,
301
+ "num_query": 25
302
+ },
303
+ {
304
+ "name": "exchange_rate_estimate_plot",
305
+ "score": 0.922935714285714,
306
+ "eval_type": "rule",
307
+ "num_demo": 1,
308
+ "num_query": 14
309
+ },
310
+ {
311
+ "name": "math_parity",
312
+ "score": 0.4666666666666667,
313
+ "eval_type": "rule",
314
+ "num_demo": 1,
315
+ "num_query": 15
316
+ },
317
+ {
318
+ "name": "traffic_future_prediction_from_line_plot",
319
+ "score": 0.5039473684210526,
320
+ "eval_type": "rule",
321
+ "num_demo": 1,
322
+ "num_query": 19
323
+ },
324
+ {
325
+ "name": "graph_chordless_cycle",
326
+ "score": 0.35714285714285715,
327
+ "eval_type": "rule",
328
+ "num_demo": 1,
329
+ "num_query": 14
330
+ },
331
+ {
332
+ "name": "youtube_video_info_parsing",
333
+ "score": 0.3809523809523809,
334
+ "eval_type": "rule",
335
+ "num_demo": 1,
336
+ "num_query": 14
337
+ },
338
+ {
339
+ "name": "super_clevr_scene_understanding",
340
+ "score": 0.6428571428571429,
341
+ "eval_type": "rule",
342
+ "num_demo": 1,
343
+ "num_query": 14
344
+ },
345
+ {
346
+ "name": "figureqa",
347
+ "score": 0.5,
348
+ "eval_type": "rule",
349
+ "num_demo": 1,
350
+ "num_query": 14
351
+ },
352
+ {
353
+ "name": "face_keypoint_detection",
354
+ "score": 0.848854419078294,
355
+ "eval_type": "rule",
356
+ "num_demo": 1,
357
+ "num_query": 14
358
+ },
359
+ {
360
+ "name": "widerface_face_count_and_event_classification",
361
+ "score": 0.42857142857142855,
362
+ "eval_type": "rule",
363
+ "num_demo": 1,
364
+ "num_query": 14
365
+ },
366
+ {
367
+ "name": "average_humidity_estimate_plot",
368
+ "score": 0.5680000000000001,
369
+ "eval_type": "rule",
370
+ "num_demo": 1,
371
+ "num_query": 15
372
+ },
373
+ {
374
+ "name": "weather_info_parsing",
375
+ "score": 0.20634920634920634,
376
+ "eval_type": "rule",
377
+ "num_demo": 1,
378
+ "num_query": 14
379
+ },
380
+ {
381
+ "name": "egocentric_analysis_single_image",
382
+ "score": 0.3333333333333333,
383
+ "eval_type": "rule",
384
+ "num_demo": 1,
385
+ "num_query": 9
386
+ },
387
+ {
388
+ "name": "waybill_number_sequence_extraction",
389
+ "score": 0.35714285714285715,
390
+ "eval_type": "rule",
391
+ "num_demo": 1,
392
+ "num_query": 14
393
+ },
394
+ {
395
+ "name": "graph_maxflow",
396
+ "score": 0.0,
397
+ "eval_type": "rule",
398
+ "num_demo": 1,
399
+ "num_query": 15
400
+ },
401
+ {
402
+ "name": "TV_show_info_parsing",
403
+ "score": 0.5634920634920635,
404
+ "eval_type": "rule",
405
+ "num_demo": 1,
406
+ "num_query": 14
407
+ },
408
+ {
409
+ "name": "insect_order_classification",
410
+ "score": 0.13333333333333333,
411
+ "eval_type": "rule",
412
+ "num_demo": 1,
413
+ "num_query": 15
414
+ },
415
+ {
416
+ "name": "electricity_plot_future_prediction",
417
+ "score": 0.4795157894736843,
418
+ "eval_type": "rule",
419
+ "num_demo": 1,
420
+ "num_query": 19
421
+ },
422
+ {
423
+ "name": "chemistry_exams_v",
424
+ "score": 0.07142857142857142,
425
+ "eval_type": "rule",
426
+ "num_demo": 1,
427
+ "num_query": 14
428
+ },
429
+ {
430
+ "name": "finance_table_understanding",
431
+ "score": 0.07142857142857142,
432
+ "eval_type": "rule",
433
+ "num_demo": 1,
434
+ "num_query": 14
435
+ },
436
+ {
437
+ "name": "funsd_document_qa",
438
+ "score": 0.7142857142857143,
439
+ "eval_type": "rule",
440
+ "num_demo": 1,
441
+ "num_query": 14
442
+ },
443
+ {
444
+ "name": "vibe_eval_open",
445
+ "score": 0.07142857142857142,
446
+ "eval_type": "rule",
447
+ "num_demo": 1,
448
+ "num_query": 14
449
+ },
450
+ {
451
+ "name": "question_solution_solving",
452
+ "score": 0.0,
453
+ "eval_type": "rule",
454
+ "num_demo": 1,
455
+ "num_query": 14
456
+ },
457
+ {
458
+ "name": "graph_theory",
459
+ "score": 0.21428571428571427,
460
+ "eval_type": "rule",
461
+ "num_demo": 1,
462
+ "num_query": 14
463
+ },
464
+ {
465
+ "name": "geometry_analytic",
466
+ "score": 0.14285714285714285,
467
+ "eval_type": "rule",
468
+ "num_demo": 1,
469
+ "num_query": 14
470
+ },
471
+ {
472
+ "name": "geometry_length",
473
+ "score": 0.14285714285714285,
474
+ "eval_type": "rule",
475
+ "num_demo": 1,
476
+ "num_query": 14
477
+ },
478
+ {
479
+ "name": "algebra",
480
+ "score": 0.14285714285714285,
481
+ "eval_type": "rule",
482
+ "num_demo": 1,
483
+ "num_query": 14
484
+ },
485
+ {
486
+ "name": "chess_puzzle_single_step",
487
+ "score": 0.0,
488
+ "eval_type": "rule",
489
+ "num_demo": 1,
490
+ "num_query": 15
491
+ },
492
+ {
493
+ "name": "chess_winner_identification",
494
+ "score": 0.3333333333333333,
495
+ "eval_type": "rule",
496
+ "num_demo": 1,
497
+ "num_query": 15
498
+ },
499
+ {
500
+ "name": "physical_property_reasoning",
501
+ "score": 0.7857142857142857,
502
+ "eval_type": "rule",
503
+ "num_demo": 1,
504
+ "num_query": 14
505
+ },
506
+ {
507
+ "name": "humor_understand_caption_match",
508
+ "score": 0.13333333333333333,
509
+ "eval_type": "rule",
510
+ "num_demo": 1,
511
+ "num_query": 15
512
+ },
513
+ {
514
+ "name": "coco_object_detection_by_query_property",
515
+ "score": 0.45080300212210156,
516
+ "eval_type": "rule",
517
+ "num_demo": 1,
518
+ "num_query": 14
519
+ },
520
+ {
521
+ "name": "multilingual_game_info_parsing",
522
+ "score": 0.375,
523
+ "eval_type": "rule",
524
+ "num_demo": 1,
525
+ "num_query": 14
526
+ },
527
+ {
528
+ "name": "mnist_pattern",
529
+ "score": 0.0,
530
+ "eval_type": "rule",
531
+ "num_demo": 1,
532
+ "num_query": 14
533
+ },
534
+ {
535
+ "name": "dvqa",
536
+ "score": 0.9473684210526315,
537
+ "eval_type": "rule",
538
+ "num_demo": 1,
539
+ "num_query": 19
540
+ },
541
+ {
542
+ "name": "physics_exams_v",
543
+ "score": 0.21428571428571427,
544
+ "eval_type": "rule",
545
+ "num_demo": 1,
546
+ "num_query": 14
547
+ },
548
+ {
549
+ "name": "snli_ve_visual_entailment",
550
+ "score": 0.7333333333333333,
551
+ "eval_type": "rule",
552
+ "num_demo": 1,
553
+ "num_query": 15
554
+ },
555
+ {
556
+ "name": "3d_indoor_scene_text_bbox_selection",
557
+ "score": 0.07142857142857142,
558
+ "eval_type": "rule",
559
+ "num_demo": 1,
560
+ "num_query": 14
561
+ },
562
+ {
563
+ "name": "geometry_descriptive",
564
+ "score": 0.14285714285714285,
565
+ "eval_type": "rule",
566
+ "num_demo": 1,
567
+ "num_query": 14
568
+ },
569
+ {
570
+ "name": "top_rated_hotel_identification",
571
+ "score": 0.42857142857142855,
572
+ "eval_type": "rule",
573
+ "num_demo": 1,
574
+ "num_query": 14
575
+ },
576
+ {
577
+ "name": "science_molecule_chemistry",
578
+ "score": 0.5333333333333333,
579
+ "eval_type": "rule",
580
+ "num_demo": 1,
581
+ "num_query": 15
582
+ },
583
+ {
584
+ "name": "game_info_parsing",
585
+ "score": 0.7857142857142856,
586
+ "eval_type": "rule",
587
+ "num_demo": 1,
588
+ "num_query": 14
589
+ },
590
+ {
591
+ "name": "deciphering_oracle_bone",
592
+ "score": 0.0,
593
+ "eval_type": "rule",
594
+ "num_demo": 1,
595
+ "num_query": 14
596
+ },
597
+ {
598
+ "name": "signboard_identification",
599
+ "score": 0.42857142857142855,
600
+ "eval_type": "rule",
601
+ "num_demo": 1,
602
+ "num_query": 14
603
+ },
604
+ {
605
+ "name": "image_style_recognition",
606
+ "score": 1.0,
607
+ "eval_type": "rule",
608
+ "num_demo": 1,
609
+ "num_query": 14
610
+ },
611
+ {
612
+ "name": "math_convexity_value_estimation",
613
+ "score": 0.3504934034834792,
614
+ "eval_type": "rule",
615
+ "num_demo": 1,
616
+ "num_query": 15
617
+ },
618
+ {
619
+ "name": "3d_indoor_scene_text_bbox_prediction",
620
+ "score": 0.047217819481634095,
621
+ "eval_type": "rule",
622
+ "num_demo": 1,
623
+ "num_query": 14
624
+ },
625
+ {
626
+ "name": "movie_info_parsing",
627
+ "score": 0.36607142857142855,
628
+ "eval_type": "rule",
629
+ "num_demo": 1,
630
+ "num_query": 14
631
+ },
632
+ {
633
+ "name": "human_relationship_reasoning",
634
+ "score": 0.7857142857142857,
635
+ "eval_type": "rule",
636
+ "num_demo": 1,
637
+ "num_query": 14
638
+ },
639
+ {
640
+ "name": "graph_shortest_path_kamada_kawai",
641
+ "score": 0.2857142857142857,
642
+ "eval_type": "rule",
643
+ "num_demo": 1,
644
+ "num_query": 14
645
+ },
646
+ {
647
+ "name": "coco_person_detection",
648
+ "score": 0.5004101034934384,
649
+ "eval_type": "rule",
650
+ "num_demo": 1,
651
+ "num_query": 14
652
+ },
653
+ {
654
+ "name": "chart_vqa",
655
+ "score": 0.21428571428571427,
656
+ "eval_type": "rule",
657
+ "num_demo": 1,
658
+ "num_query": 14
659
+ },
660
+ {
661
+ "name": "nlvr2_two_image_compare_qa",
662
+ "score": 0.42857142857142855,
663
+ "eval_type": "rule",
664
+ "num_demo": 1,
665
+ "num_query": 14
666
+ },
667
+ {
668
+ "name": "math_exams_v",
669
+ "score": 0.2857142857142857,
670
+ "eval_type": "rule",
671
+ "num_demo": 1,
672
+ "num_query": 14
673
+ },
674
+ {
675
+ "name": "newspaper_ocr_in_query_box",
676
+ "score": 0.26666666666666666,
677
+ "eval_type": "rule",
678
+ "num_demo": 1,
679
+ "num_query": 15
680
+ },
681
+ {
682
+ "name": "mvsa_sentiment_classification",
683
+ "score": 0.7857142857142857,
684
+ "eval_type": "rule",
685
+ "num_demo": 1,
686
+ "num_query": 14
687
+ },
688
+ {
689
+ "name": "egocentric_spatial_reasoning",
690
+ "score": 0.5555555555555556,
691
+ "eval_type": "rule",
692
+ "num_demo": 1,
693
+ "num_query": 9
694
+ },
695
+ {
696
+ "name": "graph_isomorphism",
697
+ "score": 0.5333333333333333,
698
+ "eval_type": "rule",
699
+ "num_demo": 1,
700
+ "num_query": 15
701
+ },
702
+ {
703
+ "name": "code_programming_test_easy",
704
+ "score": 0.0,
705
+ "eval_type": "rule",
706
+ "num_demo": 1,
707
+ "num_query": 24
708
+ },
709
+ {
710
+ "name": "biology_exams_v",
711
+ "score": 0.35714285714285715,
712
+ "eval_type": "rule",
713
+ "num_demo": 1,
714
+ "num_query": 14
715
+ },
716
+ {
717
+ "name": "long_string_number_recognition",
718
+ "score": 0.21428571428571427,
719
+ "eval_type": "rule",
720
+ "num_demo": 1,
721
+ "num_query": 14
722
+ },
723
+ {
724
+ "name": "kvqa_knowledge_aware_qa",
725
+ "score": 0.5263157894736842,
726
+ "eval_type": "rule",
727
+ "num_demo": 1,
728
+ "num_query": 19
729
+ },
730
+ {
731
+ "name": "math_breakpoint",
732
+ "score": 0.7333333333333333,
733
+ "eval_type": "rule",
734
+ "num_demo": 1,
735
+ "num_query": 15
736
+ },
737
+ {
738
+ "name": "landmark_recognition_and_qa",
739
+ "score": 0.24444444444444446,
740
+ "eval_type": "rule",
741
+ "num_demo": 1,
742
+ "num_query": 15
743
+ },
744
+ {
745
+ "name": "map_diagram_qa",
746
+ "score": 0.5,
747
+ "eval_type": "rule",
748
+ "num_demo": 1,
749
+ "num_query": 14
750
+ },
751
+ {
752
+ "name": "pmc_vqa_medical_image_qa",
753
+ "score": 0.47368421052631576,
754
+ "eval_type": "rule",
755
+ "num_demo": 1,
756
+ "num_query": 19
757
+ },
758
+ {
759
+ "name": "newspaper_page_parse_and_count",
760
+ "score": 0.28888888888888886,
761
+ "eval_type": "rule",
762
+ "num_demo": 1,
763
+ "num_query": 15
764
+ },
765
+ {
766
+ "name": "science_basic_physics",
767
+ "score": 0.7333333333333333,
768
+ "eval_type": "rule",
769
+ "num_demo": 1,
770
+ "num_query": 15
771
+ },
772
+ {
773
+ "name": "electricity_future_prediction_from_table",
774
+ "score": 0.7136842105263157,
775
+ "eval_type": "rule",
776
+ "num_demo": 1,
777
+ "num_query": 19
778
+ },
779
+ {
780
+ "name": "license_plate_recognition",
781
+ "score": 0.42857142857142855,
782
+ "eval_type": "rule",
783
+ "num_demo": 1,
784
+ "num_query": 14
785
+ },
786
+ {
787
+ "name": "places365_scene_type_classification",
788
+ "score": 0.5714285714285714,
789
+ "eval_type": "rule",
790
+ "num_demo": 1,
791
+ "num_query": 14
792
+ },
793
+ {
794
+ "name": "music_info_parsing",
795
+ "score": 0.2857142857142857,
796
+ "eval_type": "rule",
797
+ "num_demo": 1,
798
+ "num_query": 14
799
+ },
800
+ {
801
+ "name": "multilingual_movie_info_parsing",
802
+ "score": 0.26530612244897955,
803
+ "eval_type": "rule",
804
+ "num_demo": 1,
805
+ "num_query": 14
806
+ },
807
+ {
808
+ "name": "iconqa_count_and_reasoning",
809
+ "score": 0.631578947368421,
810
+ "eval_type": "rule",
811
+ "num_demo": 1,
812
+ "num_query": 19
813
+ },
814
+ {
815
+ "name": "graph_connectivity",
816
+ "score": 0.16666666666666666,
817
+ "eval_type": "rule",
818
+ "num_demo": 1,
819
+ "num_query": 15
820
+ },
821
+ {
822
+ "name": "graph_shortest_path_planar",
823
+ "score": 0.2857142857142857,
824
+ "eval_type": "rule",
825
+ "num_demo": 1,
826
+ "num_query": 14
827
+ },
828
+ {
829
+ "name": "famous_building_recognition",
830
+ "score": 0.0,
831
+ "eval_type": "rule",
832
+ "num_demo": 1,
833
+ "num_query": 16
834
+ },
835
+ {
836
+ "name": "geometry_transformation",
837
+ "score": 0.14285714285714285,
838
+ "eval_type": "rule",
839
+ "num_demo": 1,
840
+ "num_query": 14
841
+ },
842
+ {
843
+ "name": "long_string_letter_recognition",
844
+ "score": 0.0,
845
+ "eval_type": "rule",
846
+ "num_demo": 1,
847
+ "num_query": 14
848
+ },
849
+ {
850
+ "name": "handwritten_math_expression_extraction",
851
+ "score": 0.7857142857142857,
852
+ "eval_type": "rule",
853
+ "num_demo": 1,
854
+ "num_query": 14
855
+ },
856
+ {
857
+ "name": "geometry_solid",
858
+ "score": 0.07142857142857142,
859
+ "eval_type": "rule",
860
+ "num_demo": 1,
861
+ "num_query": 14
862
+ },
863
+ {
864
+ "name": "animal_pose_estimation",
865
+ "score": 0.20255737352831932,
866
+ "eval_type": "rule",
867
+ "num_demo": 1,
868
+ "num_query": 14
869
+ },
870
+ {
871
+ "name": "single_person_pose_estimation",
872
+ "score": 0.11257002137534133,
873
+ "eval_type": "rule",
874
+ "num_demo": 1,
875
+ "num_query": 14
876
+ },
877
+ {
878
+ "name": "geometry_area",
879
+ "score": 0.2857142857142857,
880
+ "eval_type": "rule",
881
+ "num_demo": 1,
882
+ "num_query": 14
883
+ },
884
+ {
885
+ "name": "hotel_booking_confirmation_parsing",
886
+ "score": 0.1928571428571429,
887
+ "eval_type": "rule",
888
+ "num_demo": 1,
889
+ "num_query": 14
890
+ },
891
+ {
892
+ "name": "ili_ratio_future_prediction",
893
+ "score": 0.08607142857142856,
894
+ "eval_type": "rule",
895
+ "num_demo": 1,
896
+ "num_query": 14
897
+ },
898
+ {
899
+ "name": "electricity_load_estimate_plot",
900
+ "score": 0.5109285714285715,
901
+ "eval_type": "rule",
902
+ "num_demo": 1,
903
+ "num_query": 14
904
+ },
905
+ {
906
+ "name": "tqa_textbook_qa",
907
+ "score": 0.7857142857142857,
908
+ "eval_type": "rule",
909
+ "num_demo": 1,
910
+ "num_query": 14
911
+ },
912
+ {
913
+ "name": "stock_info_parsing",
914
+ "score": 0.6890756302521008,
915
+ "eval_type": "rule",
916
+ "num_demo": 1,
917
+ "num_query": 14
918
+ },
919
+ {
920
+ "name": "quizlet_question_solving",
921
+ "score": 0.21428571428571427,
922
+ "eval_type": "rule",
923
+ "num_demo": 1,
924
+ "num_query": 14
925
+ },
926
+ {
927
+ "name": "stock_price_future_prediction",
928
+ "score": 0.4992142857142858,
929
+ "eval_type": "rule",
930
+ "num_demo": 1,
931
+ "num_query": 14
932
+ },
933
+ {
934
+ "name": "Ad_count_detection",
935
+ "score": 0.2857142857142857,
936
+ "eval_type": "rule",
937
+ "num_demo": 1,
938
+ "num_query": 14
939
+ },
940
+ {
941
+ "name": "recover_masked_word_in_figure",
942
+ "score": 0.0,
943
+ "eval_type": "rule",
944
+ "num_demo": 1,
945
+ "num_query": 14
946
+ },
947
+ {
948
+ "name": "polygon_interior_angles",
949
+ "score": 0.0,
950
+ "eval_type": "rule",
951
+ "num_demo": 1,
952
+ "num_query": 14
953
+ },
954
+ {
955
+ "name": "web_action_grounding",
956
+ "score": 0.7142857142857143,
957
+ "eval_type": "rule",
958
+ "num_demo": 1,
959
+ "num_query": 14
960
+ },
961
+ {
962
+ "name": "latex_complex_formula_convertion",
963
+ "score": 0.058823529411764705,
964
+ "eval_type": "rule",
965
+ "num_demo": 1,
966
+ "num_query": 17
967
+ },
968
+ {
969
+ "name": "transit_map_intersection_points",
970
+ "score": 0.11607142857142858,
971
+ "eval_type": "rule",
972
+ "num_demo": 1,
973
+ "num_query": 14
974
+ },
975
+ {
976
+ "name": "arxiv_vqa",
977
+ "score": 0.6428571428571429,
978
+ "eval_type": "rule",
979
+ "num_demo": 1,
980
+ "num_query": 14
981
+ },
982
+ {
983
+ "name": "medical_image_artifacts_indentification",
984
+ "score": 0.21428571428571427,
985
+ "eval_type": "rule",
986
+ "num_demo": 1,
987
+ "num_query": 14
988
+ },
989
+ {
990
+ "name": "song_title_identification_from_lyrics",
991
+ "score": 0.10714285714285714,
992
+ "eval_type": "rule",
993
+ "num_demo": 1,
994
+ "num_query": 14
995
+ },
996
+ {
997
+ "name": "actor_recognition_in_Movie",
998
+ "score": 0.21428571428571427,
999
+ "eval_type": "rule",
1000
+ "num_demo": 1,
1001
+ "num_query": 14
1002
+ },
1003
+ {
1004
+ "name": "bongard_problem",
1005
+ "score": 0.15789473684210525,
1006
+ "eval_type": "rule",
1007
+ "num_demo": 1,
1008
+ "num_query": 19
1009
+ },
1010
+ {
1011
+ "name": "ascii_art_understanding",
1012
+ "score": 0.5,
1013
+ "eval_type": "rule",
1014
+ "num_demo": 1,
1015
+ "num_query": 14
1016
+ },
1017
+ {
1018
+ "name": "calendar_schedule_suggestion",
1019
+ "score": 0.07142857142857142,
1020
+ "eval_type": "rule",
1021
+ "num_demo": 1,
1022
+ "num_query": 14
1023
+ },
1024
+ {
1025
+ "name": "geometry_reasoning_overlapped_circle",
1026
+ "score": 0.07142857142857142,
1027
+ "eval_type": "rule",
1028
+ "num_demo": 1,
1029
+ "num_query": 14
1030
+ },
1031
+ {
1032
+ "name": "planning_screenshot_barman",
1033
+ "score": 0.0,
1034
+ "eval_type": "rule",
1035
+ "num_demo": 1,
1036
+ "num_query": 15
1037
+ },
1038
+ {
1039
+ "name": "planning_screenshot_floortile",
1040
+ "score": 0.0,
1041
+ "eval_type": "rule",
1042
+ "num_demo": 1,
1043
+ "num_query": 15
1044
+ },
1045
+ {
1046
+ "name": "medical_blood_vessels_recognition",
1047
+ "score": 0.35714285714285715,
1048
+ "eval_type": "rule",
1049
+ "num_demo": 1,
1050
+ "num_query": 14
1051
+ },
1052
+ {
1053
+ "name": "location_vqa",
1054
+ "score": 0.35714285714285715,
1055
+ "eval_type": "rule",
1056
+ "num_demo": 1,
1057
+ "num_query": 14
1058
+ },
1059
+ {
1060
+ "name": "mindmap_elements_parsing",
1061
+ "score": 0.2857142857142857,
1062
+ "eval_type": "rule",
1063
+ "num_demo": 1,
1064
+ "num_query": 14
1065
+ },
1066
+ {
1067
+ "name": "mensa_iq_test",
1068
+ "score": 0.25490196078431376,
1069
+ "eval_type": "rule",
1070
+ "num_demo": 1,
1071
+ "num_query": 17
1072
+ },
1073
+ {
1074
+ "name": "flowchart_code_generation",
1075
+ "score": 0.4444444444444444,
1076
+ "eval_type": "rule",
1077
+ "num_demo": 1,
1078
+ "num_query": 9
1079
+ },
1080
+ {
1081
+ "name": "stackoverflow_debug_QA",
1082
+ "score": 0.4523809523809524,
1083
+ "eval_type": "rule",
1084
+ "num_demo": 1,
1085
+ "num_query": 14
1086
+ },
1087
+ {
1088
+ "name": "logical_reasoning_find_odd_one_out",
1089
+ "score": 0.0,
1090
+ "eval_type": "rule",
1091
+ "num_demo": 1,
1092
+ "num_query": 14
1093
+ },
1094
+ {
1095
+ "name": "web_action_prediction",
1096
+ "score": 0.5714285714285714,
1097
+ "eval_type": "rule",
1098
+ "num_demo": 1,
1099
+ "num_query": 14
1100
+ },
1101
+ {
1102
+ "name": "code_execution",
1103
+ "score": 0.0,
1104
+ "eval_type": "rule",
1105
+ "num_demo": 1,
1106
+ "num_query": 16
1107
+ },
1108
+ {
1109
+ "name": "music_sheet_format_QA",
1110
+ "score": 0.2857142857142857,
1111
+ "eval_type": "rule",
1112
+ "num_demo": 1,
1113
+ "num_query": 14
1114
+ },
1115
+ {
1116
+ "name": "annoying_word_search",
1117
+ "score": 0.0,
1118
+ "eval_type": "rule",
1119
+ "num_demo": 1,
1120
+ "num_query": 14
1121
+ },
1122
+ {
1123
+ "name": "interpret_force_perspective_illusion",
1124
+ "score": 0.6,
1125
+ "eval_type": "rule",
1126
+ "num_demo": 1,
1127
+ "num_query": 15
1128
+ },
1129
+ {
1130
+ "name": "healthcare_info_judgement",
1131
+ "score": 0.5714285714285714,
1132
+ "eval_type": "rule",
1133
+ "num_demo": 1,
1134
+ "num_query": 14
1135
+ },
1136
+ {
1137
+ "name": "geometry_plot_position_relationship",
1138
+ "score": 0.6428571428571429,
1139
+ "eval_type": "rule",
1140
+ "num_demo": 1,
1141
+ "num_query": 14
1142
+ },
1143
+ {
1144
+ "name": "relative_depth_of_different_points",
1145
+ "score": 0.5,
1146
+ "eval_type": "rule",
1147
+ "num_demo": 1,
1148
+ "num_query": 14
1149
+ },
1150
+ {
1151
+ "name": "topological_sort",
1152
+ "score": 0.0,
1153
+ "eval_type": "rule",
1154
+ "num_demo": 1,
1155
+ "num_query": 14
1156
+ },
1157
+ {
1158
+ "name": "scibench_fundamental_wo_solution",
1159
+ "score": 0.10204081632653061,
1160
+ "eval_type": "rule",
1161
+ "num_demo": 1,
1162
+ "num_query": 49
1163
+ },
1164
+ {
1165
+ "name": "geometry_reasoning_nested_squares",
1166
+ "score": 0.14285714285714285,
1167
+ "eval_type": "rule",
1168
+ "num_demo": 1,
1169
+ "num_query": 14
1170
+ },
1171
+ {
1172
+ "name": "font_recognition",
1173
+ "score": 0.0,
1174
+ "eval_type": "rule",
1175
+ "num_demo": 1,
1176
+ "num_query": 14
1177
+ },
1178
+ {
1179
+ "name": "geometry_reasoning_count_line_intersections",
1180
+ "score": 0.4642857142857143,
1181
+ "eval_type": "rule",
1182
+ "num_demo": 1,
1183
+ "num_query": 14
1184
+ },
1185
+ {
1186
+ "name": "circuit_diagram_understanding",
1187
+ "score": 0.0,
1188
+ "eval_type": "rule",
1189
+ "num_demo": 1,
1190
+ "num_query": 15
1191
+ },
1192
+ {
1193
+ "name": "go_capture_stone",
1194
+ "score": 0.06666666666666667,
1195
+ "eval_type": "rule",
1196
+ "num_demo": 1,
1197
+ "num_query": 15
1198
+ },
1199
+ {
1200
+ "name": "monthly_weather_days_count",
1201
+ "score": 0.16666666666666666,
1202
+ "eval_type": "rule",
1203
+ "num_demo": 1,
1204
+ "num_query": 14
1205
+ },
1206
+ {
1207
+ "name": "weather_map_climate_type_temperature_parsing",
1208
+ "score": 0.35714285714285715,
1209
+ "eval_type": "rule",
1210
+ "num_demo": 1,
1211
+ "num_query": 14
1212
+ },
1213
+ {
1214
+ "name": "top_video_creator_identification",
1215
+ "score": 0.21428571428571427,
1216
+ "eval_type": "rule",
1217
+ "num_demo": 1,
1218
+ "num_query": 14
1219
+ },
1220
+ {
1221
+ "name": "rebus",
1222
+ "score": 0.0,
1223
+ "eval_type": "rule",
1224
+ "num_demo": 1,
1225
+ "num_query": 23
1226
+ },
1227
+ {
1228
+ "name": "ishihara_test",
1229
+ "score": 0.21428571428571433,
1230
+ "eval_type": "rule",
1231
+ "num_demo": 1,
1232
+ "num_query": 14
1233
+ },
1234
+ {
1235
+ "name": "paper_vqa",
1236
+ "score": 0.2857142857142857,
1237
+ "eval_type": "rule",
1238
+ "num_demo": 1,
1239
+ "num_query": 14
1240
+ },
1241
+ {
1242
+ "name": "signage_navigation",
1243
+ "score": 0.5714285714285714,
1244
+ "eval_type": "rule",
1245
+ "num_demo": 1,
1246
+ "num_query": 14
1247
+ },
1248
+ {
1249
+ "name": "webpage_code_understanding",
1250
+ "score": 0.5555555555555556,
1251
+ "eval_type": "rule",
1252
+ "num_demo": 1,
1253
+ "num_query": 9
1254
+ },
1255
+ {
1256
+ "name": "medical_counting_lymphocytes",
1257
+ "score": 0.0,
1258
+ "eval_type": "rule",
1259
+ "num_demo": 1,
1260
+ "num_query": 14
1261
+ },
1262
+ {
1263
+ "name": "game_platform_support_identification",
1264
+ "score": 0.2857142857142857,
1265
+ "eval_type": "rule",
1266
+ "num_demo": 1,
1267
+ "num_query": 14
1268
+ },
1269
+ {
1270
+ "name": "GUI_Act_Mobile_swipe",
1271
+ "score": 0.5724262941510554,
1272
+ "eval_type": "rule",
1273
+ "num_demo": 1,
1274
+ "num_query": 13
1275
+ },
1276
+ {
1277
+ "name": "mahjong",
1278
+ "score": 0.0,
1279
+ "eval_type": "rule",
1280
+ "num_demo": 1,
1281
+ "num_query": 14
1282
+ },
1283
+ {
1284
+ "name": "scibench_calculus_wo_solution",
1285
+ "score": 0.12244897959183673,
1286
+ "eval_type": "rule",
1287
+ "num_demo": 1,
1288
+ "num_query": 49
1289
+ },
1290
+ {
1291
+ "name": "knowledge_graph_understanding",
1292
+ "score": 0.2,
1293
+ "eval_type": "rule",
1294
+ "num_demo": 1,
1295
+ "num_query": 15
1296
+ },
1297
+ {
1298
+ "name": "image_translation_en2cn",
1299
+ "score": 0.12162170301105525,
1300
+ "eval_type": "rule",
1301
+ "num_demo": 1,
1302
+ "num_query": 9
1303
+ },
1304
+ {
1305
+ "name": "realworld_qa_en2cn",
1306
+ "score": 0.14285714285714285,
1307
+ "eval_type": "rule",
1308
+ "num_demo": 1,
1309
+ "num_query": 14
1310
+ },
1311
+ {
1312
+ "name": "planning_visual_storage",
1313
+ "score": 0.06666666666666667,
1314
+ "eval_type": "rule",
1315
+ "num_demo": 1,
1316
+ "num_query": 15
1317
+ },
1318
+ {
1319
+ "name": "GUI_Act_Web_Multi",
1320
+ "score": 0.3579909577059959,
1321
+ "eval_type": "rule",
1322
+ "num_demo": 1,
1323
+ "num_query": 14
1324
+ },
1325
+ {
1326
+ "name": "chinese_idiom_recognition",
1327
+ "score": 0.0,
1328
+ "eval_type": "rule",
1329
+ "num_demo": 1,
1330
+ "num_query": 14
1331
+ },
1332
+ {
1333
+ "name": "number_comparison",
1334
+ "score": 0.7142857142857143,
1335
+ "eval_type": "rule",
1336
+ "num_demo": 1,
1337
+ "num_query": 14
1338
+ },
1339
+ {
1340
+ "name": "planning_screenshot_blocksworld",
1341
+ "score": 0.0,
1342
+ "eval_type": "rule",
1343
+ "num_demo": 1,
1344
+ "num_query": 15
1345
+ },
1346
+ {
1347
+ "name": "product_ocr_qa",
1348
+ "score": 0.2857142857142857,
1349
+ "eval_type": "rule",
1350
+ "num_demo": 1,
1351
+ "num_query": 14
1352
+ },
1353
+ {
1354
+ "name": "geometry_reasoning_circled_letter",
1355
+ "score": 0.35714285714285715,
1356
+ "eval_type": "rule",
1357
+ "num_demo": 1,
1358
+ "num_query": 14
1359
+ },
1360
+ {
1361
+ "name": "GUI_Act_Web_Single",
1362
+ "score": 0.0,
1363
+ "eval_type": "rule",
1364
+ "num_demo": 1,
1365
+ "num_query": 14
1366
+ },
1367
+ {
1368
+ "name": "extract_webpage_headline",
1369
+ "score": 0.5,
1370
+ "eval_type": "rule",
1371
+ "num_demo": 1,
1372
+ "num_query": 14
1373
+ },
1374
+ {
1375
+ "name": "planning_screenshot_storage",
1376
+ "score": 0.0,
1377
+ "eval_type": "rule",
1378
+ "num_demo": 1,
1379
+ "num_query": 15
1380
+ },
1381
+ {
1382
+ "name": "soccer_offside",
1383
+ "score": 0.2222222222222222,
1384
+ "eval_type": "rule",
1385
+ "num_demo": 1,
1386
+ "num_query": 9
1387
+ },
1388
+ {
1389
+ "name": "geometry_reasoning_grid",
1390
+ "score": 0.14285714285714285,
1391
+ "eval_type": "rule",
1392
+ "num_demo": 1,
1393
+ "num_query": 14
1394
+ },
1395
+ {
1396
+ "name": "relative_reflectance_of_different_regions",
1397
+ "score": 0.5,
1398
+ "eval_type": "rule",
1399
+ "num_demo": 1,
1400
+ "num_query": 14
1401
+ },
1402
+ {
1403
+ "name": "entertainment_web_game_style",
1404
+ "score": 0.0,
1405
+ "eval_type": "rule",
1406
+ "num_demo": 1,
1407
+ "num_query": 14
1408
+ },
1409
+ {
1410
+ "name": "orchestra_score_recognition",
1411
+ "score": 0.0,
1412
+ "eval_type": "rule",
1413
+ "num_demo": 1,
1414
+ "num_query": 14
1415
+ },
1416
+ {
1417
+ "name": "icon_arithmetic_puzzle",
1418
+ "score": 0.0,
1419
+ "eval_type": "rule",
1420
+ "num_demo": 1,
1421
+ "num_query": 14
1422
+ },
1423
+ {
1424
+ "name": "planning_screenshot_grippers",
1425
+ "score": 0.0,
1426
+ "eval_type": "rule",
1427
+ "num_demo": 1,
1428
+ "num_query": 15
1429
+ },
1430
+ {
1431
+ "name": "MMMU_pro_exam_screenshot",
1432
+ "score": 0.26262626262626265,
1433
+ "eval_type": "rule",
1434
+ "num_demo": 1,
1435
+ "num_query": 99
1436
+ },
1437
+ {
1438
+ "name": "clevrer_physics",
1439
+ "score": 0.2,
1440
+ "eval_type": "rule",
1441
+ "num_demo": 1,
1442
+ "num_query": 20
1443
+ },
1444
+ {
1445
+ "name": "MMMU_physics_chemistry_selected",
1446
+ "score": 0.35714285714285715,
1447
+ "eval_type": "rule",
1448
+ "num_demo": 1,
1449
+ "num_query": 14
1450
+ },
1451
+ {
1452
+ "name": "planning_screenshot_tyreworld",
1453
+ "score": 0.0,
1454
+ "eval_type": "rule",
1455
+ "num_demo": 1,
1456
+ "num_query": 15
1457
+ },
1458
+ {
1459
+ "name": "music_sheet_note_count",
1460
+ "score": 0.058823529411764705,
1461
+ "eval_type": "rule",
1462
+ "num_demo": 1,
1463
+ "num_query": 17
1464
+ },
1465
+ {
1466
+ "name": "hashtag_recommendation",
1467
+ "score": 0.7035714285714285,
1468
+ "eval_type": "rule",
1469
+ "num_demo": 1,
1470
+ "num_query": 14
1471
+ },
1472
+ {
1473
+ "name": "llavaguard",
1474
+ "score": 0.42857142857142855,
1475
+ "eval_type": "rule",
1476
+ "num_demo": 1,
1477
+ "num_query": 14
1478
+ },
1479
+ {
1480
+ "name": "medical_multi_organ_segmentation_rater",
1481
+ "score": 0.42857142857142855,
1482
+ "eval_type": "rule",
1483
+ "num_demo": 1,
1484
+ "num_query": 14
1485
+ },
1486
+ {
1487
+ "name": "cultural_vqa",
1488
+ "score": 0.2,
1489
+ "eval_type": "rule",
1490
+ "num_demo": 1,
1491
+ "num_query": 15
1492
+ },
1493
+ {
1494
+ "name": "logical_reasoning_fit_pattern",
1495
+ "score": 0.2857142857142857,
1496
+ "eval_type": "rule",
1497
+ "num_demo": 1,
1498
+ "num_query": 14
1499
+ },
1500
+ {
1501
+ "name": "character_recognition_in_TV_shows",
1502
+ "score": 0.2857142857142857,
1503
+ "eval_type": "rule",
1504
+ "num_demo": 1,
1505
+ "num_query": 14
1506
+ },
1507
+ {
1508
+ "name": "highest_discount_game_price_identification",
1509
+ "score": 0.35714285714285715,
1510
+ "eval_type": "rule",
1511
+ "num_demo": 1,
1512
+ "num_query": 14
1513
+ },
1514
+ {
1515
+ "name": "remaining_playback_time_calculation",
1516
+ "score": 0.0,
1517
+ "eval_type": "rule",
1518
+ "num_demo": 1,
1519
+ "num_query": 14
1520
+ },
1521
+ {
1522
+ "name": "medical_cell_recognition",
1523
+ "score": 0.14285714285714285,
1524
+ "eval_type": "rule",
1525
+ "num_demo": 1,
1526
+ "num_query": 14
1527
+ },
1528
+ {
1529
+ "name": "chess_find_legal_moves",
1530
+ "score": 0.03355324641748354,
1531
+ "eval_type": "rule",
1532
+ "num_demo": 1,
1533
+ "num_query": 14
1534
+ },
1535
+ {
1536
+ "name": "distinguish_ai_generated_image",
1537
+ "score": 0.5263157894736842,
1538
+ "eval_type": "rule",
1539
+ "num_demo": 1,
1540
+ "num_query": 19
1541
+ },
1542
+ {
1543
+ "name": "autonomous_driving_scene_analysis",
1544
+ "score": 0.8571428571428571,
1545
+ "eval_type": "rule",
1546
+ "num_demo": 1,
1547
+ "num_query": 14
1548
+ },
1549
+ {
1550
+ "name": "counting_single_image",
1551
+ "score": 0.6428571428571429,
1552
+ "eval_type": "rule",
1553
+ "num_demo": 1,
1554
+ "num_query": 14
1555
+ },
1556
+ {
1557
+ "name": "GUI_Act_Mobile_tap",
1558
+ "score": 0.2857142857142857,
1559
+ "eval_type": "rule",
1560
+ "num_demo": 1,
1561
+ "num_query": 14
1562
+ },
1563
+ {
1564
+ "name": "road_map_find_highway_between_two_place",
1565
+ "score": 0.4117647058823529,
1566
+ "eval_type": "rule",
1567
+ "num_demo": 1,
1568
+ "num_query": 17
1569
+ },
1570
+ {
1571
+ "name": "chess_sygyzy_endgames",
1572
+ "score": 0.10884353741496598,
1573
+ "eval_type": "rule",
1574
+ "num_demo": 1,
1575
+ "num_query": 14
1576
+ },
1577
+ {
1578
+ "name": "planning_screenshot_termes",
1579
+ "score": 0.0,
1580
+ "eval_type": "rule",
1581
+ "num_demo": 1,
1582
+ "num_query": 15
1583
+ },
1584
+ {
1585
+ "name": "multiple_states_identify_asia",
1586
+ "score": 0.028571428571428574,
1587
+ "eval_type": "rule",
1588
+ "num_demo": 1,
1589
+ "num_query": 14
1590
+ },
1591
+ {
1592
+ "name": "multiple_states_identify_africa",
1593
+ "score": 0.0,
1594
+ "eval_type": "rule",
1595
+ "num_demo": 1,
1596
+ "num_query": 14
1597
+ },
1598
+ {
1599
+ "name": "multiple_states_identify_europe",
1600
+ "score": 0.05714285714285715,
1601
+ "eval_type": "rule",
1602
+ "num_demo": 1,
1603
+ "num_query": 14
1604
+ },
1605
+ {
1606
+ "name": "multiple_states_identify_americas",
1607
+ "score": 0.11428571428571428,
1608
+ "eval_type": "rule",
1609
+ "num_demo": 1,
1610
+ "num_query": 14
1611
+ },
1612
+ {
1613
+ "name": "adapted_cvbench_distance",
1614
+ "score": 0.42857142857142855,
1615
+ "eval_type": "rule",
1616
+ "num_demo": 1,
1617
+ "num_query": 14
1618
+ },
1619
+ {
1620
+ "name": "adapted_cvbench_count",
1621
+ "score": 0.35714285714285715,
1622
+ "eval_type": "rule",
1623
+ "num_demo": 1,
1624
+ "num_query": 14
1625
+ },
1626
+ {
1627
+ "name": "adapted_cvbench_depth",
1628
+ "score": 0.5714285714285714,
1629
+ "eval_type": "rule",
1630
+ "num_demo": 1,
1631
+ "num_query": 14
1632
+ },
1633
+ {
1634
+ "name": "adapted_cvbench_relation",
1635
+ "score": 0.35714285714285715,
1636
+ "eval_type": "rule",
1637
+ "num_demo": 1,
1638
+ "num_query": 14
1639
+ },
1640
+ {
1641
+ "name": "symbolic_graphics_programs_computer_aided_design",
1642
+ "score": 0.2857142857142857,
1643
+ "eval_type": "rule",
1644
+ "num_demo": 1,
1645
+ "num_query": 14
1646
+ },
1647
+ {
1648
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
1649
+ "score": 0.2222222222222222,
1650
+ "eval_type": "rule",
1651
+ "num_demo": 1,
1652
+ "num_query": 18
1653
+ },
1654
+ {
1655
+ "name": "table_understanding_complex_question_answering",
1656
+ "score": 0.35714285714285715,
1657
+ "eval_type": "rule",
1658
+ "num_demo": 1,
1659
+ "num_query": 14
1660
+ },
1661
+ {
1662
+ "name": "table_understanding_fact_verification",
1663
+ "score": 0.7261904761904762,
1664
+ "eval_type": "rule",
1665
+ "num_demo": 1,
1666
+ "num_query": 14
1667
+ },
1668
+ {
1669
+ "name": "panel_images_multi_question",
1670
+ "score": 0.4047619047619047,
1671
+ "eval_type": "rule",
1672
+ "num_demo": 1,
1673
+ "num_query": 14
1674
+ },
1675
+ {
1676
+ "name": "panel_images_single_question",
1677
+ "score": 0.6428571428571429,
1678
+ "eval_type": "rule",
1679
+ "num_demo": 1,
1680
+ "num_query": 14
1681
+ },
1682
+ {
1683
+ "name": "MMSoc_Misinformation_GossipCop",
1684
+ "score": 0.5714285714285714,
1685
+ "eval_type": "rule",
1686
+ "num_demo": 1,
1687
+ "num_query": 14
1688
+ },
1689
+ {
1690
+ "name": "MMSoc_HatefulMemes",
1691
+ "score": 0.7857142857142857,
1692
+ "eval_type": "rule",
1693
+ "num_demo": 1,
1694
+ "num_query": 14
1695
+ },
1696
+ {
1697
+ "name": "MMSoc_Memotion",
1698
+ "score": 0.6705882352941177,
1699
+ "eval_type": "rule",
1700
+ "num_demo": 1,
1701
+ "num_query": 17
1702
+ },
1703
+ {
1704
+ "name": "MMSoc_Misinformation_PolitiFact",
1705
+ "score": 0.6428571428571429,
1706
+ "eval_type": "rule",
1707
+ "num_demo": 1,
1708
+ "num_query": 14
1709
+ },
1710
+ {
1711
+ "name": "poetry_acrostic_alliteration",
1712
+ "score": 0.0,
1713
+ "eval_type": "rule",
1714
+ "num_demo": 0,
1715
+ "num_query": 15
1716
+ },
1717
+ {
1718
+ "name": "poetry_acrostic",
1719
+ "score": 0.06666666666666667,
1720
+ "eval_type": "rule",
1721
+ "num_demo": 0,
1722
+ "num_query": 15
1723
+ },
1724
+ {
1725
+ "name": "poetry_limerick",
1726
+ "score": 0.06666666666666667,
1727
+ "eval_type": "rule",
1728
+ "num_demo": 0,
1729
+ "num_query": 15
1730
+ },
1731
+ {
1732
+ "name": "poetry_custom_rhyming_scheme",
1733
+ "score": 0.0,
1734
+ "eval_type": "rule",
1735
+ "num_demo": 0,
1736
+ "num_query": 15
1737
+ },
1738
+ {
1739
+ "name": "poetry_petrarchian_sonnet_optional_meter",
1740
+ "score": 0.0,
1741
+ "eval_type": "rule",
1742
+ "num_demo": 0,
1743
+ "num_query": 15
1744
+ },
1745
+ {
1746
+ "name": "poetry_haiku",
1747
+ "score": 0.0,
1748
+ "eval_type": "rule",
1749
+ "num_demo": 0,
1750
+ "num_query": 15
1751
+ },
1752
+ {
1753
+ "name": "poetry_shakespearean_sonnet",
1754
+ "score": 0.0,
1755
+ "eval_type": "rule",
1756
+ "num_demo": 0,
1757
+ "num_query": 15
1758
+ },
1759
+ {
1760
+ "name": "screenshot_lighteval_math",
1761
+ "score": 0.13333333333333333,
1762
+ "eval_type": "rule",
1763
+ "num_demo": 1,
1764
+ "num_query": 15
1765
+ },
1766
+ {
1767
+ "name": "screenshot_theoremqa",
1768
+ "score": 0.07142857142857142,
1769
+ "eval_type": "rule",
1770
+ "num_demo": 1,
1771
+ "num_query": 14
1772
+ },
1773
+ {
1774
+ "name": "number_puzzle_sudoku",
1775
+ "score": 0.0,
1776
+ "eval_type": "rule",
1777
+ "num_demo": 1,
1778
+ "num_query": 15
1779
+ },
1780
+ {
1781
+ "name": "number_puzzle_kakuro_5x5",
1782
+ "score": 0.0,
1783
+ "eval_type": "rule",
1784
+ "num_demo": 1,
1785
+ "num_query": 15
1786
+ },
1787
+ {
1788
+ "name": "text_entity_replace",
1789
+ "score": 0.42857142857142855,
1790
+ "eval_type": "rule",
1791
+ "num_demo": 1,
1792
+ "num_query": 14
1793
+ },
1794
+ {
1795
+ "name": "background_change",
1796
+ "score": 0.5,
1797
+ "eval_type": "rule",
1798
+ "num_demo": 1,
1799
+ "num_query": 14
1800
+ },
1801
+ {
1802
+ "name": "face_attribute_edit",
1803
+ "score": 0.5,
1804
+ "eval_type": "rule",
1805
+ "num_demo": 1,
1806
+ "num_query": 14
1807
+ },
1808
+ {
1809
+ "name": "face_swap",
1810
+ "score": 0.5,
1811
+ "eval_type": "rule",
1812
+ "num_demo": 1,
1813
+ "num_query": 14
1814
+ },
1815
+ {
1816
+ "name": "text_style",
1817
+ "score": 0.5,
1818
+ "eval_type": "rule",
1819
+ "num_demo": 1,
1820
+ "num_query": 14
1821
+ },
1822
+ {
1823
+ "name": "out_of_context",
1824
+ "score": 0.5,
1825
+ "eval_type": "rule",
1826
+ "num_demo": 1,
1827
+ "num_query": 14
1828
+ },
1829
+ {
1830
+ "name": "clip_stable_diffusion_generate",
1831
+ "score": 0.5,
1832
+ "eval_type": "rule",
1833
+ "num_demo": 1,
1834
+ "num_query": 14
1835
+ },
1836
+ {
1837
+ "name": "veracity",
1838
+ "score": 0.35714285714285715,
1839
+ "eval_type": "rule",
1840
+ "num_demo": 1,
1841
+ "num_query": 14
1842
+ },
1843
+ {
1844
+ "name": "counterfactual_arithmetic",
1845
+ "score": 0.0,
1846
+ "eval_type": "rule",
1847
+ "num_demo": 1,
1848
+ "num_query": 14
1849
+ },
1850
+ {
1851
+ "name": "maze_2d_8x8",
1852
+ "score": 0.0,
1853
+ "eval_type": "rule",
1854
+ "num_demo": 1,
1855
+ "num_query": 14
1856
+ },
1857
+ {
1858
+ "name": "shape_composition_shapes",
1859
+ "score": 0.2738095238095238,
1860
+ "eval_type": "rule",
1861
+ "num_demo": 1,
1862
+ "num_query": 14
1863
+ },
1864
+ {
1865
+ "name": "shape_composition_colours",
1866
+ "score": 0.2970521541950113,
1867
+ "eval_type": "rule",
1868
+ "num_demo": 1,
1869
+ "num_query": 14
1870
+ },
1871
+ {
1872
+ "name": "autorater_artifact",
1873
+ "score": 0.07142857142857142,
1874
+ "eval_type": "rule",
1875
+ "num_demo": 1,
1876
+ "num_query": 14
1877
+ },
1878
+ {
1879
+ "name": "autorater_artifact_reason",
1880
+ "score": 0.26666666666666666,
1881
+ "eval_type": "rule",
1882
+ "num_demo": 0,
1883
+ "num_query": 15
1884
+ },
1885
+ {
1886
+ "name": "chess_puzzles_crushing",
1887
+ "score": 0.0,
1888
+ "eval_type": "rule",
1889
+ "num_demo": 1,
1890
+ "num_query": 14
1891
+ },
1892
+ {
1893
+ "name": "chess_puzzles_checkmate",
1894
+ "score": 0.0,
1895
+ "eval_type": "rule",
1896
+ "num_demo": 1,
1897
+ "num_query": 14
1898
+ },
1899
+ {
1900
+ "name": "chess_puzzles_equality",
1901
+ "score": 0.0,
1902
+ "eval_type": "rule",
1903
+ "num_demo": 1,
1904
+ "num_query": 15
1905
+ },
1906
+ {
1907
+ "name": "app_layout_understanding_notes",
1908
+ "score": 0.0,
1909
+ "eval_type": "rule",
1910
+ "num_demo": 1,
1911
+ "num_query": 14
1912
+ },
1913
+ {
1914
+ "name": "app_layout_understanding_twitter",
1915
+ "score": 0.14285714285714285,
1916
+ "eval_type": "rule",
1917
+ "num_demo": 1,
1918
+ "num_query": 14
1919
+ },
1920
+ {
1921
+ "name": "app_layout_understanding_youtube",
1922
+ "score": 0.07142857142857142,
1923
+ "eval_type": "rule",
1924
+ "num_demo": 1,
1925
+ "num_query": 14
1926
+ },
1927
+ {
1928
+ "name": "app_layout_understanding_tiktok",
1929
+ "score": 0.07142857142857142,
1930
+ "eval_type": "rule",
1931
+ "num_demo": 1,
1932
+ "num_query": 14
1933
+ },
1934
+ {
1935
+ "name": "app_layout_understanding_excel",
1936
+ "score": 0.07142857142857142,
1937
+ "eval_type": "rule",
1938
+ "num_demo": 1,
1939
+ "num_query": 14
1940
+ },
1941
+ {
1942
+ "name": "app_layout_understanding_amazon",
1943
+ "score": 0.0,
1944
+ "eval_type": "rule",
1945
+ "num_demo": 1,
1946
+ "num_query": 14
1947
+ },
1948
+ {
1949
+ "name": "app_layout_understanding_instagram",
1950
+ "score": 0.07142857142857142,
1951
+ "eval_type": "rule",
1952
+ "num_demo": 1,
1953
+ "num_query": 14
1954
+ },
1955
+ {
1956
+ "name": "app_layout_understanding_zoom",
1957
+ "score": 0.2,
1958
+ "eval_type": "rule",
1959
+ "num_demo": 1,
1960
+ "num_query": 15
1961
+ },
1962
+ {
1963
+ "name": "app_layout_understanding_word",
1964
+ "score": 0.14285714285714285,
1965
+ "eval_type": "rule",
1966
+ "num_demo": 1,
1967
+ "num_query": 14
1968
+ },
1969
+ {
1970
+ "name": "app_layout_understanding_iphone_settings",
1971
+ "score": 0.0,
1972
+ "eval_type": "rule",
1973
+ "num_demo": 1,
1974
+ "num_query": 14
1975
+ },
1976
+ {
1977
+ "name": "app_layout_understanding_leetcode",
1978
+ "score": 0.0,
1979
+ "eval_type": "rule",
1980
+ "num_demo": 1,
1981
+ "num_query": 14
1982
+ },
1983
+ {
1984
+ "name": "app_layout_understanding_ppt",
1985
+ "score": 0.0,
1986
+ "eval_type": "rule",
1987
+ "num_demo": 1,
1988
+ "num_query": 14
1989
+ },
1990
+ {
1991
+ "name": "app_layout_understanding_alipay",
1992
+ "score": 0.11764705882352941,
1993
+ "eval_type": "rule",
1994
+ "num_demo": 1,
1995
+ "num_query": 17
1996
+ },
1997
+ {
1998
+ "name": "ocr_table_to_markdown",
1999
+ "score": 0.6428571428571429,
2000
+ "eval_type": "rule",
2001
+ "num_demo": 1,
2002
+ "num_query": 14
2003
+ },
2004
+ {
2005
+ "name": "ocr_table_to_latex",
2006
+ "score": 0.0,
2007
+ "eval_type": "rule",
2008
+ "num_demo": 1,
2009
+ "num_query": 14
2010
+ },
2011
+ {
2012
+ "name": "ocr_resume_employer_plain",
2013
+ "score": 0.42857142857142855,
2014
+ "eval_type": "rule",
2015
+ "num_demo": 1,
2016
+ "num_query": 14
2017
+ },
2018
+ {
2019
+ "name": "ocr_article_journal",
2020
+ "score": 0.07142857142857142,
2021
+ "eval_type": "rule",
2022
+ "num_demo": 1,
2023
+ "num_query": 14
2024
+ },
2025
+ {
2026
+ "name": "ocr_resume_experience_plain",
2027
+ "score": 0.21428571428571427,
2028
+ "eval_type": "rule",
2029
+ "num_demo": 1,
2030
+ "num_query": 14
2031
+ },
2032
+ {
2033
+ "name": "ocr_math_text_latex",
2034
+ "score": 0.07142857142857142,
2035
+ "eval_type": "rule",
2036
+ "num_demo": 1,
2037
+ "num_query": 14
2038
+ },
2039
+ {
2040
+ "name": "ocr_article_authors",
2041
+ "score": 0.42857142857142855,
2042
+ "eval_type": "rule",
2043
+ "num_demo": 1,
2044
+ "num_query": 14
2045
+ },
2046
+ {
2047
+ "name": "ocr_table_to_csv",
2048
+ "score": 0.42857142857142855,
2049
+ "eval_type": "rule",
2050
+ "num_demo": 1,
2051
+ "num_query": 14
2052
+ },
2053
+ {
2054
+ "name": "ocr_math_equation",
2055
+ "score": 0.14285714285714285,
2056
+ "eval_type": "rule",
2057
+ "num_demo": 1,
2058
+ "num_query": 14
2059
+ },
2060
+ {
2061
+ "name": "ocr_resume_school_plain",
2062
+ "score": 0.07142857142857142,
2063
+ "eval_type": "rule",
2064
+ "num_demo": 1,
2065
+ "num_query": 14
2066
+ },
2067
+ {
2068
+ "name": "ocr_table_to_html",
2069
+ "score": 0.07142857142857142,
2070
+ "eval_type": "rule",
2071
+ "num_demo": 1,
2072
+ "num_query": 14
2073
+ },
2074
+ {
2075
+ "name": "ocr_resume_skill_plain",
2076
+ "score": 0.42857142857142855,
2077
+ "eval_type": "rule",
2078
+ "num_demo": 1,
2079
+ "num_query": 14
2080
+ },
2081
+ {
2082
+ "name": "crossword_mini_5x5",
2083
+ "score": 0.0071428571428571435,
2084
+ "eval_type": "rule",
2085
+ "num_demo": 1,
2086
+ "num_query": 14
2087
+ },
2088
+ {
2089
+ "name": "contain_position_length",
2090
+ "score": 0.26666666666666666,
2091
+ "eval_type": "rule",
2092
+ "num_demo": 0,
2093
+ "num_query": 15
2094
+ },
2095
+ {
2096
+ "name": "contain_repeat_length",
2097
+ "score": 0.2,
2098
+ "eval_type": "rule",
2099
+ "num_demo": 0,
2100
+ "num_query": 15
2101
+ },
2102
+ {
2103
+ "name": "contain_length",
2104
+ "score": 0.6,
2105
+ "eval_type": "rule",
2106
+ "num_demo": 0,
2107
+ "num_query": 15
2108
+ },
2109
+ {
2110
+ "name": "contain_contain_length",
2111
+ "score": 0.8,
2112
+ "eval_type": "rule",
2113
+ "num_demo": 0,
2114
+ "num_query": 15
2115
+ },
2116
+ {
2117
+ "name": "pictionary_skribbl_io",
2118
+ "score": 0.0,
2119
+ "eval_type": "rule",
2120
+ "num_demo": 1,
2121
+ "num_query": 20
2122
+ },
2123
+ {
2124
+ "name": "pictionary_doodle_guess",
2125
+ "score": 0.2,
2126
+ "eval_type": "rule",
2127
+ "num_demo": 1,
2128
+ "num_query": 15
2129
+ },
2130
+ {
2131
+ "name": "pictionary_genai_output_chinese",
2132
+ "score": 0.0,
2133
+ "eval_type": "rule",
2134
+ "num_demo": 1,
2135
+ "num_query": 14
2136
+ },
2137
+ {
2138
+ "name": "pictionary_cartoon_drawing_guess",
2139
+ "score": 0.5,
2140
+ "eval_type": "rule",
2141
+ "num_demo": 1,
2142
+ "num_query": 14
2143
+ },
2144
+ {
2145
+ "name": "pictionary_chinese_food_img2en",
2146
+ "score": 0.07142857142857142,
2147
+ "eval_type": "rule",
2148
+ "num_demo": 1,
2149
+ "num_query": 14
2150
+ },
2151
+ {
2152
+ "name": "reward_models_i2t_reward",
2153
+ "score": 0.6428571428571429,
2154
+ "eval_type": "rule",
2155
+ "num_demo": 1,
2156
+ "num_query": 14
2157
+ },
2158
+ {
2159
+ "name": "memorization_chinese_celebrity",
2160
+ "score": 0.0,
2161
+ "eval_type": "rule",
2162
+ "num_demo": 1,
2163
+ "num_query": 14
2164
+ },
2165
+ {
2166
+ "name": "memorization_papers",
2167
+ "score": 0.0,
2168
+ "eval_type": "rule",
2169
+ "num_demo": 1,
2170
+ "num_query": 15
2171
+ },
2172
+ {
2173
+ "name": "memorization_famous_treaty",
2174
+ "score": 0.07142857142857142,
2175
+ "eval_type": "rule",
2176
+ "num_demo": 1,
2177
+ "num_query": 14
2178
+ },
2179
+ {
2180
+ "name": "memorization_indian_celebrity",
2181
+ "score": 0.14285714285714285,
2182
+ "eval_type": "rule",
2183
+ "num_demo": 1,
2184
+ "num_query": 14
2185
+ },
2186
+ {
2187
+ "name": "research_website_parsing_blogpost",
2188
+ "score": 0.07142857142857142,
2189
+ "eval_type": "rule",
2190
+ "num_demo": 1,
2191
+ "num_query": 14
2192
+ },
2193
+ {
2194
+ "name": "research_website_parsing_publication",
2195
+ "score": 0.07142857142857142,
2196
+ "eval_type": "rule",
2197
+ "num_demo": 1,
2198
+ "num_query": 14
2199
+ },
2200
+ {
2201
+ "name": "research_website_parsing_homepage",
2202
+ "score": 0.14285714285714285,
2203
+ "eval_type": "rule",
2204
+ "num_demo": 1,
2205
+ "num_query": 14
2206
+ }
2207
+ ]