tttoaster commited on
Commit
43a97f8
·
1 Parent(s): 5a5d6b1

Upload 14 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ file/SEED-Bench-2.json filter=lfs diff=lfs merge=lfs -text
__pycache__/constants.cpython-38.pyc CHANGED
Binary files a/__pycache__/constants.cpython-38.pyc and b/__pycache__/constants.cpython-38.pyc differ
 
app.py CHANGED
@@ -22,7 +22,7 @@ def prediction_analyse(prediction_content):
22
  predictions = prediction_content.split("\n")
23
 
24
  # 读取 ground_truth JSON 文件
25
- with open("./file/SEED-Bench.json", "r") as file:
26
  ground_truth_data = json.load(file)["questions"]
27
 
28
  # 将 ground_truth 数据转换为以 question_id 为键的字典
@@ -53,97 +53,228 @@ def prediction_analyse(prediction_content):
53
 
54
  return results
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def add_new_eval(
57
  input_file,
58
  model_name_textbox: str,
59
  revision_name_textbox: str,
60
  model_type: str,
61
  model_link: str,
 
 
62
  LLM_type: str,
63
  LLM_name_textbox: str,
64
  Evaluation_dimension: str,
 
 
 
65
  ):
66
  if input_file is None:
67
  return "Error! Empty file!"
68
  else:
69
- content = input_file.decode("utf-8")
70
- prediction = prediction_analyse(content)
71
- csv_data = pd.read_csv(CSV_DIR)
72
-
73
- Start_dimension, End_dimension = 1, 13
74
- if Evaluation_dimension == 'Image':
75
- End_dimension = 10
76
- elif Evaluation_dimension == 'Video':
77
- Start_dimension = 10
78
- each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) if i >= Start_dimension and i < End_dimension else 0 for i in range(1, 13)}
79
-
80
- # count for average image\video\all
81
- total_correct_image = sum(prediction[i]["correct"] for i in range(1, 10))
82
- total_correct_video = sum(prediction[i]["correct"] for i in range(10, 13))
83
-
84
- total_image = sum(prediction[i]["total"] for i in range(1, 10))
85
- total_video = sum(prediction[i]["total"] for i in range(10, 13))
86
-
87
- if Evaluation_dimension != 'Video':
88
- average_accuracy_image = round(total_correct_image / total_image * 100, 1)
89
- else:
90
- average_accuracy_image = 0
91
-
92
- if Evaluation_dimension != 'Image':
93
- average_accuracy_video = round(total_correct_video / total_video * 100, 1)
94
- else:
95
- average_accuracy_video = 0
96
-
97
- if Evaluation_dimension == 'All':
98
- overall_accuracy = round((total_correct_image + total_correct_video) / (total_image + total_video) * 100, 1)
99
- else:
100
- overall_accuracy = 0
 
 
101
 
102
- if LLM_type == 'Other':
103
- LLM_name = LLM_name_textbox
104
- else:
105
- LLM_name = LLM_type
106
-
107
- if revision_name_textbox == '':
108
- col = csv_data.shape[0]
109
- model_name = model_name_textbox
110
- else:
111
- model_name = revision_name_textbox
112
- model_name_list = csv_data['Model']
113
- name_list = [name.split(']')[0][1:] for name in model_name_list]
114
- if revision_name_textbox not in name_list:
115
  col = csv_data.shape[0]
 
116
  else:
117
- col = name_list.index(revision_name_textbox)
118
-
119
- if model_link == '':
120
- model_name = model_name # no url
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  else:
122
- model_name = '[' + model_name + '](' + model_link + ')'
123
-
124
- # add new data
125
- new_data = [
126
- model_type,
127
- model_name,
128
- LLM_name,
129
- overall_accuracy,
130
- average_accuracy_image,
131
- average_accuracy_video,
132
- each_task_accuracy[1],
133
- each_task_accuracy[2],
134
- each_task_accuracy[3],
135
- each_task_accuracy[4],
136
- each_task_accuracy[5],
137
- each_task_accuracy[6],
138
- each_task_accuracy[7],
139
- each_task_accuracy[8],
140
- each_task_accuracy[9],
141
- each_task_accuracy[10],
142
- each_task_accuracy[11],
143
- each_task_accuracy[12],
144
- ]
145
- csv_data.loc[col] = new_data
146
- csv_data = csv_data.to_csv(CSV_DIR, index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  return 0
148
 
149
  def get_baseline_df():
@@ -202,23 +333,21 @@ with block:
202
  interactive=True,
203
  )
204
 
205
- '''
206
  # selection for model size part:
207
- filter_model_size = gr.CheckboxGroup(
208
  choices=MODEL_SIZE,
209
  value=MODEL_SIZE,
210
  label="Model Size",
211
  interactive=True,
212
  )
213
 
214
- filter_dimension_level = gr.CheckboxGroup(
215
- choices=DIMENSION_LEVEL,
216
- label="Model level",
217
- multiselect=False,
218
- value=DIMENSION_LEVEL[1],
219
- interactive=True,
220
- )
221
- '''
222
 
223
  # 创建数据帧组件
224
  data_component_v2 = gr.components.Dataframe(
@@ -229,15 +358,38 @@ with block:
229
  interactive=False,
230
  visible=True,
231
  )
232
-
233
- def on_checkbox_group_v2_change(selected_columns):
234
- # pdb.set_trace()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  selected_columns = [item for item in TASK_V2_INFO if item in selected_columns]
236
  present_columns = MODEL_INFO_V2 + selected_columns
237
- updated_data = get_all_v2_df()[present_columns]
238
- updated_data = updated_data.sort_values(by=present_columns[2], ascending=False)
239
  updated_headers = present_columns
240
- # pdb.set_trace()
241
  update_datatype = [DATA_TITILE_V2_TYPE[COLUMN_V2_NAMES.index(x)] for x in updated_headers]
242
 
243
  filter_component = gr.components.Dataframe(
@@ -252,8 +404,9 @@ with block:
252
 
253
  return filter_component.value
254
 
255
- # 将复选框组关联到处理函数
256
- checkbox_group_v2.change(fn=on_checkbox_group_v2_change, inputs=checkbox_group_v2, outputs=data_component_v2)
 
257
 
258
  # table seed-bench-v1
259
  with gr.TabItem("🏅 SEED Benchmark v1", elem_id="seed-benchmark-tab-table", id=1):
@@ -277,15 +430,21 @@ with block:
277
  interactive=True,
278
  )
279
 
280
- '''
281
  # selection for model size part:
282
- filter_model_size = gr.CheckboxGroup(
283
  choices=MODEL_SIZE,
284
  value=MODEL_SIZE,
285
  label="Model Size",
286
  interactive=True,
287
  )
288
- '''
 
 
 
 
 
 
 
289
 
290
  # 创建数据帧组件
291
  data_component = gr.components.Dataframe(
@@ -297,12 +456,36 @@ with block:
297
  visible=True,
298
  )
299
 
300
- def on_checkbox_group_change(selected_columns):
301
- # pdb.set_trace()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  selected_columns = [item for item in TASK_INFO if item in selected_columns]
303
  present_columns = MODEL_INFO + selected_columns
304
- updated_data = get_all_df()[present_columns]
305
- updated_data = updated_data.sort_values(by=present_columns[3], ascending=False)
306
  updated_headers = present_columns
307
  update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
308
 
@@ -318,8 +501,9 @@ with block:
318
 
319
  return filter_component.value
320
 
321
- # 将复选框组关联到处理函数
322
- checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
 
323
 
324
  # table 2
325
  with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
@@ -358,9 +542,18 @@ with block:
358
  model_link = gr.Textbox(
359
  label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
360
  )
 
 
 
 
 
 
 
 
 
 
361
 
362
  with gr.Column():
363
-
364
  LLM_type = gr.Dropdown(
365
  choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
366
  label="LLM type",
@@ -374,11 +567,25 @@ with block:
374
  )
375
  Evaluation_dimension = gr.Dropdown(
376
  choices=["All", "Image", "Video"],
377
- label="Evaluation dimension",
378
  multiselect=False,
379
  value="All",
380
  interactive=True,
381
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
  with gr.Column():
384
 
@@ -394,18 +601,27 @@ with block:
394
  revision_name_textbox,
395
  model_type,
396
  model_link,
 
 
397
  LLM_type,
398
  LLM_name_textbox,
399
  Evaluation_dimension,
 
 
400
  ],
401
- # outputs = submission_result,
402
  )
403
 
404
 
 
 
 
 
 
 
405
  with gr.Row():
406
  data_run = gr.Button("Refresh")
407
  data_run.click(
408
- get_baseline_df, outputs=data_component
409
  )
410
 
411
  # block.load(get_baseline_df, outputs=data_title)
 
22
  predictions = prediction_content.split("\n")
23
 
24
  # 读取 ground_truth JSON 文件
25
+ with open("./file/SEED-Bench-1.json", "r") as file:
26
  ground_truth_data = json.load(file)["questions"]
27
 
28
  # 将 ground_truth 数据转换为以 question_id 为键的字典
 
53
 
54
  return results
55
 
56
def prediction_analyse_v2(prediction_content, ground_truth_path="./file/SEED-Bench-2.json"):
    """Count correct and total predictions per SEED-Bench-2 question type.

    Args:
        prediction_content: Text holding one JSON object per line. Each object
            is expected to carry a "question_id" and a "prediction" key.
        ground_truth_path: Path of the ground-truth JSON file. Its "questions"
            list holds items with "question_id", "question_type_id" and
            "answer" keys. Defaults to the bundled SEED-Bench-2 file.

    Returns:
        A dict mapping every question type id from 1 to 27 to
        ``{"correct": int, "total": int}``.

    Lines that are blank, are not valid JSON, are not JSON objects, lack the
    required keys, or refer to a question/question type that is not known are
    skipped with a printed warning instead of aborting the whole analysis.
    """
    # Load the ground-truth JSON file.
    with open(ground_truth_path, "r", encoding="utf-8") as file:
        ground_truth_data = json.load(file)["questions"]

    # Index the ground truth by question_id for direct lookup.
    ground_truth = {item["question_id"]: item for item in ground_truth_data}

    # One counter pair per question type id (1..27).
    results = {i: {"correct": 0, "total": 0} for i in range(1, 28)}

    for line in prediction_content.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            prediction = json.loads(line)
        except json.JSONDecodeError:
            print(f"Warning: Skipping invalid JSON data in line: {line}")
            continue

        # A valid JSON line may still be a list/number or miss the needed keys.
        if (
            not isinstance(prediction, dict)
            or "question_id" not in prediction
            or "prediction" not in prediction
        ):
            print(f"Warning: Skipping line without question_id/prediction: {line}")
            continue

        try:
            gt_item = ground_truth[prediction["question_id"]]
            counters = results[gt_item["question_type_id"]]
            expected_answer = gt_item["answer"]
        except (KeyError, TypeError):
            # Unknown (or unhashable) question id, or a question type outside 1..27.
            print(f"Warning: Skipping prediction for unknown question: {line}")
            continue

        if prediction["prediction"] == expected_answer:
            counters["correct"] += 1
        counters["total"] += 1

    return results
91
+
92
+
93
def _percentage(correct, total):
    """Return ``correct / total`` as a percentage rounded to one decimal.

    A zero ``total`` yields 0 instead of raising ZeroDivisionError, so an
    upload that contains no answers for a task is reported as 0 for it.
    """
    if not total:
        return 0
    return round(correct / total * 100, 1)


def _listed_model_name(cell):
    """Extract the bare model name from a leaderboard 'Model' cell.

    A cell is either a plain name or a markdown link ``[name](url)``; only the
    link form carries brackets that have to be stripped.
    """
    text = str(cell)
    if text.startswith('['):
        return text.split(']')[0][1:]
    return text


def add_new_eval(
    input_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_type: str,
    model_link: str,
    model_size: str,
    benchmark_version: str,
    LLM_type: str,
    LLM_name_textbox: str,
    Evaluation_dimension: str,
    Evaluation_dimension_2: str,
    Evaluation_method: str

):
    """Score an uploaded prediction file and write the row to the leaderboard CSV.

    Args:
        input_file: Uploaded prediction file as bytes (decoded as UTF-8), or None.
        model_name_textbox: Model name used when no revision name is given.
        revision_name_textbox: Name of an existing row to overwrite; when empty
            or not found among the 'Model' column, a new row is appended.
        model_type: Stored as-is in the v1 row (not part of the v2 row).
        model_link: If non-empty, the model name is stored as a markdown link.
        model_size: Stored as-is in the row.
        benchmark_version: 'v1' selects the v1 scoring and CSV_DIR; any other
            value selects the v2 scoring and CSV_V2_DIR.
        LLM_type: Language model name, or 'Other' to use LLM_name_textbox.
        LLM_name_textbox: Language model name used when LLM_type is 'Other'.
        Evaluation_dimension: v1 only. 'Image' scores tasks 1-9, 'Video' scores
            tasks 10-12, anything else scores all 12; unscored values are 0.
        Evaluation_dimension_2: v2 only. 'L1' scores tasks 1-22, 'L2' tasks
            1-24, anything else all 27; unscored values are 0.
        Evaluation_method: Stored as-is in the row.

    Returns:
        The string "Error! Empty file!" when input_file is None, otherwise 0.
    """
    if input_file is None:
        return "Error! Empty file!"

    content = input_file.decode("utf-8")
    is_v1 = benchmark_version == 'v1'

    if is_v1:
        prediction = prediction_analyse(content)
        csv_path = CSV_DIR
    else:
        prediction = prediction_analyse_v2(content)
        csv_path = CSV_V2_DIR
    csv_data = pd.read_csv(csv_path)

    if is_v1:
        # v1 evaluation: tasks 1-9 are image tasks, tasks 10-12 are video tasks.
        start_dimension, end_dimension = 1, 13
        if Evaluation_dimension == 'Image':
            end_dimension = 10
        elif Evaluation_dimension == 'Video':
            start_dimension = 10
        each_task_accuracy = {
            i: _percentage(prediction[i]["correct"], prediction[i]["total"])
            if start_dimension <= i < end_dimension
            else 0
            for i in range(1, 13)
        }

        # Totals for the image / video / overall averages.
        total_correct_image = sum(prediction[i]["correct"] for i in range(1, 10))
        total_correct_video = sum(prediction[i]["correct"] for i in range(10, 13))
        total_image = sum(prediction[i]["total"] for i in range(1, 10))
        total_video = sum(prediction[i]["total"] for i in range(10, 13))

        if Evaluation_dimension != 'Video':
            average_accuracy_image = _percentage(total_correct_image, total_image)
        else:
            average_accuracy_image = 0

        if Evaluation_dimension != 'Image':
            average_accuracy_video = _percentage(total_correct_video, total_video)
        else:
            average_accuracy_video = 0

        # The overall average is only reported when both parts were evaluated.
        if Evaluation_dimension == 'All':
            overall_accuracy = _percentage(
                total_correct_image + total_correct_video,
                total_image + total_video,
            )
        else:
            overall_accuracy = 0

        averages = [overall_accuracy, average_accuracy_image, average_accuracy_video]
        task_count = 12
    else:
        # v2 evaluation: tasks 1-22 form part 1, 23-24 part 2, 25-27 part 3.
        start_dimension, end_dimension = 1, 28
        if Evaluation_dimension_2 == 'L1':
            end_dimension = 23
        elif Evaluation_dimension_2 == 'L2':
            end_dimension = 25
        elif Evaluation_dimension_2 == 'L3':
            end_dimension = 28
        each_task_accuracy = {
            i: _percentage(prediction[i]["correct"], prediction[i]["total"])
            if start_dimension <= i < end_dimension
            else 0
            for i in range(1, 28)
        }

        # Part averages are unweighted means of the per-task accuracies.
        average_p1 = round(sum(each_task_accuracy[key] for key in range(1, 23)) / 22, 1)
        average_p2 = round(sum(each_task_accuracy[key] for key in range(23, 25)) / 2, 1)
        if Evaluation_dimension_2 == 'L2':
            average_p3 = 0
        else:
            average_p3 = round(sum(each_task_accuracy[key] for key in range(25, 28)) / 3, 1)

        averages = [average_p1, average_p2, average_p3]
        task_count = 27

    if LLM_type == 'Other':
        LLM_name = LLM_name_textbox
    else:
        LLM_name = LLM_type

    # Pick the row to write: overwrite a matching revision, otherwise append.
    col = csv_data.shape[0]
    if revision_name_textbox == '':
        model_name = model_name_textbox
    else:
        model_name = revision_name_textbox
        name_list = [_listed_model_name(name) for name in csv_data['Model']]
        if revision_name_textbox in name_list:
            col = name_list.index(revision_name_textbox)

    if model_link != '':
        model_name = '[' + model_name + '](' + model_link + ')'

    # Assemble the row; only the v1 table has a leading "Model Type" column.
    new_data = [model_type] if is_v1 else []
    new_data += [model_name, LLM_name, model_size, Evaluation_method]
    new_data += averages
    new_data += [each_task_accuracy[i] for i in range(1, task_count + 1)]

    csv_data.loc[col] = new_data
    csv_data.to_csv(csv_path, index=False)
    return 0
279
 
280
  def get_baseline_df():
 
333
  interactive=True,
334
  )
335
 
 
336
  # selection for model size part:
337
+ model_size_v2 = gr.CheckboxGroup(
338
  choices=MODEL_SIZE,
339
  value=MODEL_SIZE,
340
  label="Model Size",
341
  interactive=True,
342
  )
343
 
344
+ # selection for evaluation method part:
345
+ evaluation_method_v2 = gr.CheckboxGroup(
346
+ choices=EVALUATION_METHOD,
347
+ value=EVALUATION_METHOD,
348
+ label="Evaluation Method",
349
+ interactive=True,
350
+ )
 
351
 
352
  # 创建数据帧组件
353
  data_component_v2 = gr.components.Dataframe(
 
358
  interactive=False,
359
  visible=True,
360
  )
361
+
362
+ def on_filter_model_size_method_v2_change(selected_model_size, selected_evaluation_method, selected_columns):
363
+
364
+ updated_data = get_all_v2_df()
365
+ # model_size & evaluation_method:
366
+ # 自定义过滤函数
367
def custom_filter(row, model_size_filters, evaluation_method_filters):
    """Decide whether one leaderboard row passes the size and method filters.

    Args:
        row: Mapping-like row providing 'Model Size' and 'Evaluation Method'.
        model_size_filters: Selected size buckets, any of '<10B', '>=10B', '-'.
        evaluation_method_filters: Selected evaluation methods.

    Returns:
        True when the row's size falls in a selected bucket and its evaluation
        method is selected, otherwise False.
    """
    model_size = row['Model Size']
    evaluation_method = row['Evaluation Method']

    if not isinstance(model_size, str):
        # e.g. an empty CSV cell read as NaN: the size cannot be classified
        size_filter = False
    elif model_size == '-':
        size_filter = '-' in model_size_filters
    elif 'B' in model_size:
        try:
            size = float(model_size.replace('B', ''))
        except ValueError:
            # contains 'B' but is not of the 'number+B' form
            size_filter = False
        else:
            size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
    else:
        size_filter = False

    method_filter = evaluation_method in evaluation_method_filters

    return size_filter and method_filter
382
+
383
+ # 使用自定义过滤函数过滤数据
384
+ mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, evaluation_method_filters=selected_evaluation_method)
385
+ updated_data = updated_data[mask]
386
+
387
+ # columns:
388
  selected_columns = [item for item in TASK_V2_INFO if item in selected_columns]
389
  present_columns = MODEL_INFO_V2 + selected_columns
390
+ updated_data = updated_data[present_columns]
391
+ updated_data = updated_data.sort_values(by="Avg. P1", ascending=False)
392
  updated_headers = present_columns
 
393
  update_datatype = [DATA_TITILE_V2_TYPE[COLUMN_V2_NAMES.index(x)] for x in updated_headers]
394
 
395
  filter_component = gr.components.Dataframe(
 
404
 
405
  return filter_component.value
406
 
407
+ model_size_v2.change(fn=on_filter_model_size_method_v2_change, inputs=[model_size_v2, evaluation_method_v2, checkbox_group_v2], outputs=data_component_v2)
408
+ evaluation_method_v2.change(fn=on_filter_model_size_method_v2_change, inputs=[model_size_v2, evaluation_method_v2, checkbox_group_v2], outputs=data_component_v2)
409
+ checkbox_group_v2.change(fn=on_filter_model_size_method_v2_change, inputs=[model_size_v2, evaluation_method_v2, checkbox_group_v2], outputs=data_component_v2)
410
 
411
  # table seed-bench-v1
412
  with gr.TabItem("🏅 SEED Benchmark v1", elem_id="seed-benchmark-tab-table", id=1):
 
430
  interactive=True,
431
  )
432
 
 
433
  # selection for model size part:
434
+ model_size = gr.CheckboxGroup(
435
  choices=MODEL_SIZE,
436
  value=MODEL_SIZE,
437
  label="Model Size",
438
  interactive=True,
439
  )
440
+
441
+ # selection for evaluation method part:
442
+ evaluation_method = gr.CheckboxGroup(
443
+ choices=EVALUATION_METHOD,
444
+ value=EVALUATION_METHOD,
445
+ label="Evaluation Method",
446
+ interactive=True,
447
+ )
448
 
449
  # 创建数据帧组件
450
  data_component = gr.components.Dataframe(
 
456
  visible=True,
457
  )
458
 
459
+ def on_filter_model_size_method_change(selected_model_size, selected_evaluation_method, selected_columns):
460
+
461
+ updated_data = get_all_df()
462
+ # model_size & evaluation_method:
463
+ # 自定义过滤函数
464
def custom_filter(row, model_size_filters, evaluation_method_filters):
    """Decide whether one leaderboard row passes the size and method filters.

    Args:
        row: Mapping-like row providing 'Model Size' and 'Evaluation Method'.
        model_size_filters: Selected size buckets, any of '<10B', '>=10B', '-'.
        evaluation_method_filters: Selected evaluation methods.

    Returns:
        True when the row's size falls in a selected bucket and its evaluation
        method is selected, otherwise False.
    """
    model_size = row['Model Size']
    evaluation_method = row['Evaluation Method']

    if not isinstance(model_size, str):
        # e.g. an empty CSV cell read as NaN: the size cannot be classified
        size_filter = False
    elif model_size == '-':
        size_filter = '-' in model_size_filters
    elif 'B' in model_size:
        try:
            size = float(model_size.replace('B', ''))
        except ValueError:
            # contains 'B' but is not of the 'number+B' form
            size_filter = False
        else:
            size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
    else:
        size_filter = False

    method_filter = evaluation_method in evaluation_method_filters

    return size_filter and method_filter
479
+
480
+ # 使用自定义过滤函数过滤数据
481
+ mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, evaluation_method_filters=selected_evaluation_method)
482
+ updated_data = updated_data[mask]
483
+
484
+ # columns:
485
  selected_columns = [item for item in TASK_INFO if item in selected_columns]
486
  present_columns = MODEL_INFO + selected_columns
487
+ updated_data = updated_data[present_columns]
488
+ updated_data = updated_data.sort_values(by="Avg. All", ascending=False)
489
  updated_headers = present_columns
490
  update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
491
 
 
501
 
502
  return filter_component.value
503
 
504
+ model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, evaluation_method, checkbox_group], outputs=data_component)
505
+ evaluation_method.change(fn=on_filter_model_size_method_change, inputs=[model_size, evaluation_method, checkbox_group], outputs=data_component)
506
+ checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, evaluation_method, checkbox_group], outputs=data_component)
507
 
508
  # table 2
509
  with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
 
542
  model_link = gr.Textbox(
543
  label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
544
  )
545
+ model_size = gr.Textbox(
546
+ label="Model size", placeholder="7B(Input content format must be 'number+B' or '-')"
547
+ )
548
+ benchmark_version= gr.Dropdown(
549
+ choices=["v1", "v2"],
550
+ label="Benchmark version",
551
+ multiselect=False,
552
+ value="v1",
553
+ interactive=True,
554
+ )
555
 
556
  with gr.Column():
 
557
  LLM_type = gr.Dropdown(
558
  choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
559
  label="LLM type",
 
567
  )
568
  Evaluation_dimension = gr.Dropdown(
569
  choices=["All", "Image", "Video"],
570
+ label="Evaluation dimension for SEED-Bench 1(for evaluate SEED-Bench 1)",
571
  multiselect=False,
572
  value="All",
573
  interactive=True,
574
  )
575
+ Evaluation_dimension_2 = gr.Dropdown(
576
+ choices=["L1", "L2", "L3"],
577
+ label="Evaluation dimension for SEED-Bench 2(for evaluate SEED-Bench 2)",
578
+ multiselect=False,
579
+ value="L2",
580
+ interactive=True,
581
+ )
582
+ Evaluation_method = gr.Dropdown(
583
+ choices=EVALUATION_METHOD,
584
+ label="Evaluation method",
585
+ multiselect=False,
586
+ value=EVALUATION_METHOD[0],
587
+ interactive=True,
588
+ )
589
 
590
  with gr.Column():
591
 
 
601
  revision_name_textbox,
602
  model_type,
603
  model_link,
604
+ model_size,
605
+ benchmark_version,
606
  LLM_type,
607
  LLM_name_textbox,
608
  Evaluation_dimension,
609
+ Evaluation_dimension_2,
610
+ Evaluation_method
611
  ],
 
612
  )
613
 
614
 
615
def refresh_data():
    """Return the results of get_baseline_df() and get_baseline_v2_df() as a pair.

    The v1 result comes first and the v2 result second.
    """
    # get_baseline_df is called before get_baseline_v2_df, as in the tuple order.
    return get_baseline_df(), get_baseline_v2_df()
620
+
621
  with gr.Row():
622
  data_run = gr.Button("Refresh")
623
  data_run.click(
624
+ refresh_data, outputs=[data_component, data_component_v2]
625
  )
626
 
627
  # block.load(get_baseline_df, outputs=data_title)
constants.py CHANGED
@@ -1,7 +1,8 @@
1
  # this is .py for store constants
2
- MODEL_INFO = ["Model Type", "Model", "Language Model"]
3
- MODEL_INFO_V2 = ["Model", "Language Model"]
4
- MODEL_SIZE = ["<10B", ">=10B"]
 
5
  DIMENSION_LEVEL = ["L1", "L2", "L3"]
6
  LEADERBOARD_VERSION = ["Version1", "Version2"]
7
  TASK_INFO = ["Avg. All", "Avg. Img", "Avg. Video", "Scene Understanding", "Instance Identity", "Instance Attribute", "Instance Location", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Action Recognition", "Action Prediction", "Procedure Understanding"]
@@ -10,8 +11,8 @@ TASK_V2_INFO = ["Avg. P1", "Avg. P2", "Avg. P3", "Scene Understanding", "Instanc
10
  AVG_INFO = ["Avg. All", "Avg. Img", "Avg. Video"]
11
  AVG_V2_INFO = ["Avg. P1", "Avg. P2", "Avg. P3"]
12
 
13
- DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
14
- DATA_TITILE_V2_TYPE = ["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
15
  CSV_DIR = "./file/result.csv"
16
  CSV_V2_DIR = "./file/result_v2.csv"
17
 
@@ -24,49 +25,83 @@ DATA_NUM_V2 = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 435, 330, 500, 501, 19
24
  LEADERBORAD_INTRODUCTION = """# SEED-Bench Leaderboard
25
 
26
  Welcome to the leaderboard of the SEED-Bench! 🏆
27
- SEED-Bench consists of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both the spatial and temporal understanding.
28
- Please refer to [our paper](https://arxiv.org/abs/2307.16125) for more details.
 
 
 
 
29
  """
30
 
31
- SUBMIT_INTRODUCTION = """# Submit on SEED Benchmark v1 Introduction
32
- 1. Obtain JSON file from our [github repository](https://github.com/AILab-CVC/SEED-Bench#leaderboard-submit) after evaluation. For example, you can obtain InstructBLIP's JSON file as results/results.json after running
33
  ```shell
34
  python eval.py --model instruct_blip --anno_path SEED-Bench.json --output-dir results
35
  ```
 
 
 
 
36
  2. If you want to update model performance by uploading new results, please ensure 'Model Name Revision' is the same as what's shown in the leaderboard. For example, if you want to modify InstructBLIP's performance, you need to fill in 'InstructBLIP' in 'Revision Model Name'.
37
  3. Please provide the correct link of your model's repository for each submission.
38
- 4. For the evaluation dimension, you can choose "All/Image/Video", and the results of dimensions that are not evaluated will be set to zero.
39
  5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
40
 
41
  ## Submit Example
42
- For example, if you want to upload InstructBLIP's result in the leaderboard, you need to:
43
  1. Fill in 'InstructBLIP' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
44
  2. Fill in 'InstructBLIP' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
45
- 2. Select 'ImageLLM' in 'Model Type'.
46
- 3. Fill in 'https://github.com/salesforce/LAVIS' in 'Model Link'.
47
- 4. Select 'Flan-T5-XL' in 'LLM Type'.
48
- 5. Select 'All' in 'Evaluation Dimension'.
49
- 6. Upload results.json.
50
- 7. Click the 'Submit Eval' button.
51
- 8. Click 'Refresh' to obtain the uploaded leaderboard.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  """
53
 
54
  TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
55
  We use accuracy (%) as the primary evaluation metric for each task.
 
 
56
  """
57
 
58
  LEADERBORAD_INFO = """
59
  Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
60
- In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
61
- SEED-Bench consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality.
62
  We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
63
  Multiple-choice questions with groundtruth options derived from human annotation enables an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
64
- We further evaluate the performance of 18 models across all 12 dimensions, covering both the spatial and temporal understanding.
65
  By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
66
  """
67
 
68
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
69
- CITATION_BUTTON_TEXT = r"""@article{li2023seed,
 
 
 
 
 
 
 
70
  title={SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension},
71
  author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
72
  journal={arXiv preprint arXiv:2307.16125},
 
1
  # this is .py for store constants
2
+ MODEL_INFO = ["Model Type", "Model", "Language Model", "Evaluation Method"]
3
+ MODEL_INFO_V2 = ["Model", "Language Model", "Evaluation Method"]
4
+ MODEL_SIZE = ["<10B", ">=10B", "-"]
5
+ EVALUATION_METHOD = ["PPL", "PPL for A/B/C/D", "Generate", "NG"]
6
  DIMENSION_LEVEL = ["L1", "L2", "L3"]
7
  LEADERBOARD_VERSION = ["Version1", "Version2"]
8
  TASK_INFO = ["Avg. All", "Avg. Img", "Avg. Video", "Scene Understanding", "Instance Identity", "Instance Attribute", "Instance Location", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Action Recognition", "Action Prediction", "Procedure Understanding"]
 
11
  AVG_INFO = ["Avg. All", "Avg. Img", "Avg. Video"]
12
  AVG_V2_INFO = ["Avg. P1", "Avg. P2", "Avg. P3"]
13
 
14
+ DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
15
+ DATA_TITILE_V2_TYPE = ["markdown", "markdown","markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
16
  CSV_DIR = "./file/result.csv"
17
  CSV_V2_DIR = "./file/result_v2.csv"
18
 
 
25
  LEADERBORAD_INTRODUCTION = """# SEED-Bench Leaderboard
26
 
27
  Welcome to the leaderboard of the SEED-Bench! 🏆
28
+
29
+ SEED-Bench-1 consists of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both the spatial and temporal understanding.
30
+ Please refer to [SEED-Bench-1 paper](https://arxiv.org/abs/2307.16125) for more details.
31
+
32
+ SEED-Bench-2 comprises 24K multiple-choice questions with accurate human annotations, which spans 27 dimensions, including the evaluation of both text and image generation.
33
+ Please refer to [SEED-Bench-2 paper](https://arxiv.org/abs/2311.17092) for more details.
34
  """
35
 
36
+ SUBMIT_INTRODUCTION = """# Submit on SEED Benchmark Introduction
37
+ 1. Obtain JSON file from our [github repository](https://github.com/AILab-CVC/SEED-Bench#leaderboard-submit) after evaluation. For example on SEED-Bench-1, you can obtain InstructBLIP's JSON file as results/results.json after running
38
  ```shell
39
  python eval.py --model instruct_blip --anno_path SEED-Bench.json --output-dir results
40
  ```
41
+ And for example on SEED-Bench-2, you can obtain InternLM_Xcomposer_VL's JSON file as results/results.json after running
42
+ ```shell
43
+ python eval.py --model InternLM_Xcomposer_VL --anno_path SEED-Bench_v2_level1_2_3.json --output-dir results --evaluate_level L2 --evaluate_part all --evaluate_version v2
44
+ ```
45
  2. If you want to update model performance by uploading new results, please ensure 'Revision Model Name' is the same as what's shown in the leaderboard. For example, if you want to modify InstructBLIP's performance, you need to fill in 'InstructBLIP' in 'Revision Model Name'.
46
  3. Please provide the correct link of your model's repository for each submission.
47
+ 4. For the evaluation dimension, you can choose "All/Image/Video" for SEED-Bench-1 and "L1/L2/L3" for SEED-Bench-2, and the results of dimensions that are not evaluated will be set to zero.
48
  5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
49
 
50
  ## Submit Example
51
+ For example on SEED-Bench-1, if you want to upload InstructBLIP's result in the leaderboard, you need to:
52
  1. Fill in 'InstructBLIP' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
53
  2. Fill in 'InstructBLIP' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
54
+ 3. Select 'ImageLLM' in 'Model Type'.
55
+ 4. Fill in 'https://github.com/salesforce/LAVIS' in 'Model Link'.
56
+ 5. Fill in '7B' in 'Model size'.
57
+ 6. Select 'v1' in 'Benchmark version'.
58
+ 7. Select 'Flan-T5-XL' in 'LLM Type'.
59
+ 8. Select 'All' in 'Evaluation Dimension for SEED-Bench 1'.
60
+ 9. Select 'PPL' in 'Evaluate Method'.
61
+ 10. Upload results.json.
62
+ 11. Click the 'Submit Eval' button.
63
+ 12. Click 'Refresh' to obtain the uploaded leaderboard.
64
+
65
+ For example on SEED-Bench-2, if you want to upload InternLM_Xcomposer_VL's result in the leaderboard, you need to:
66
+ 1. Fill in 'InternLM_Xcomposer_VL' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
67
+ 2. Fill in 'InternLM_Xcomposer_VL' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
68
+ 3. Select 'ImageLLM' in 'Model Type'.
69
+ 4. Fill in 'https://github.com/InternLM/InternLM-XComposer' in 'Model Link'.
70
+ 5. Fill in '7B' in 'Model size'.
71
+ 6. Select 'v2' in 'Benchmark version'.
72
+ 7. Select 'Other' in 'LLM Type'.
73
+ 8. Fill in 'InternLM-7B' in 'LLM model(for Other)'.
74
+ 9. Select 'L2' in 'Evaluation Dimension for SEED-Bench 2'.
75
+ 10. Select 'PPL' in 'Evaluate Method'.
76
+ 11. Upload results.json.
77
+ 12. Click the 'Submit Eval' button.
78
+ 13. Click 'Refresh' to obtain the uploaded leaderboard.
79
  """
80
 
81
  TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
82
  We use accuracy (%) as the primary evaluation metric for each task.
83
+ SEED-Bench-1 calculates the overall accuracy by dividing the total number of correct QA answers by the total number of QA questions.
84
+ SEED-Bench-2 represents the overall accuracy using the average accuracy of each dimension.
85
  """
86
 
87
  LEADERBORAD_INFO = """
88
  Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
89
+ [SEED-Bench-1](https://arxiv.org/abs/2307.16125) consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality.
90
+ [SEED-Bench-2](https://arxiv.org/abs/2311.17092) comprises 24K multiple-choice questions with accurate human annotations, spanning 27 dimensions, including the evaluation of both text and image generation.
91
  We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
92
  Multiple-choice questions with groundtruth options derived from human annotation enable an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
 
93
  By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
94
  """
95
 
96
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
97
+ CITATION_BUTTON_TEXT = r"""@article{li2023seed2,
98
+ title={SEED-Bench-2: Benchmarking Multimodal Large Language Models},
99
+ author={Li, Bohao and Ge, Yuying and Ge, Yixiao and Wang, Guangzhi and Wang, Rui and Zhang, Ruimao and Shan, Ying},
100
+ journal={arXiv preprint arXiv:2311.17092},
101
+ year={2023}
102
+ }
103
+
104
+ @article{li2023seed,
105
  title={SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension},
106
  author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
107
  journal={arXiv preprint arXiv:2307.16125},
file/SEED-Bench-1.json ADDED
The diff for this file is too large to render. See raw diff
 
file/SEED-Bench-2.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95c29b47709e43246be32cf4343611766e7737ad7062096e197932eff7c8543d
3
+ size 18076409
file/result.csv CHANGED
@@ -1,38 +1,39 @@
1
- Model Type,Model,Language Model,Avg. All,Avg. Img,Avg. Video,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Action Recognition,Action Prediction,Procedure Understanding
2
- LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,27.7,27.3,28.6,23,29,32.8,31.8,20.5,31.8,33,18.2,19.4,23.2,34.9,25.4
3
- LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,28.5,28.2,29.5,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,27.3,34.5,23.8
4
- LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,26.8,26.6,27.3,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37,9,33,23.1,26.2
5
- ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,46.4,49.7,36.7,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,32.6,47.5,24
6
- ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,52.7,57.8,38.3,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,33.1,49.1,27.1
7
- ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,53.4,58.8,38.1,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,34.5,49.6,23.1
8
- ImageLLM,[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),Vicuna-13B,61.6,68.2,42.7,74.9,71.3,68.9,63.5,61.3,51.4,73.2,77,60.5,48.9,41.1,36.6
9
- ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,42.8,47.4,29.9,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,38.2,24.5,27.1
10
- ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,39.1,41.8,31.4,51.9,44.1,39.9,36.1,33.7,36.4,32,53.2,30.6,39.5,24.3,31.9
11
- ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,33.2,34.5,29.2,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,36.9,25.8,24
12
- ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,33.9,35.2,30.4,44.9,38.6,32.2,30.9,26.3,31.8,32,51.4,31.8,37.9,27.2,24.8
13
- ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,39.7,42.9,30.6,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,36.8,29.2,23.8
14
- ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,33.1,34.5,29.3,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20,37.2,25.4,24.2
15
- ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,40.9,42.7,35.7,53.2,45.3,40,31.2,39.3,32.6,36.1,51.4,25.9,42.9,34.7,26.9
16
- ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,32.7,35.2,25.8,45.2,38.5,29.3,33,29.7,35.5,39.2,52,24.7,38.6,18.5,19.6
17
- ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,33.5,35.5,27.8,41.7,35.5,31.8,29.5,36.2,32,32,51.1,27.1,33.9,25.4,23
18
- ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,34,37.9,23,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,26.7,17.9,26.5
19
- ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,50,54.4,37.5,63.4,57.1,58.5,44,41.4,37.9,55.7,60.7,25.9,41.3,40.4,27
20
- ImageLLM,[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,58.2,65.4,37.8,73.3,67.3,69.6,57.7,52.9,48.2,59.8,74.6,53.5,43.9,39.2,26.7
21
- ImageLLM,[Qwen-VL](https://huggingface.co/Qwen/Qwen-VL),Qwen-7B,56.3,62.3,39.1,71.2,66.4,67.7,53.5,44.8,43.8,62.9,74.9,51.2,44.7,38.5,32
22
- ImageLLM,[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,0,44.5,0,55.8,45.3,42.3,40.2,36.8,34.9,37.1,55.9,38.8,0,0,0
23
- ImageLLM,[IDEFICS-80b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-65B,0,53.2,0,64,52.6,50.8,48.3,46.1,45.5,62.9,68,51.8,0,0,0
24
- ImageLLM,[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,0,66.9,0,75,71.7,67.6,60.8,56.2,55.3,74.4,77,48.5,0,0,0
25
- ImageLLM,[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,48.9,53.7,35.4,64.1,54.2,54.1,46.5,45.3,38.2,51.6,60.7,44.7,37.8,45.3,20.0
26
- ImageLLM,[mPLUG-Owl2](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,57.8,64.1,39.8,72.7,67.6,63.6,53.6,58.5,50.8,70.1,76.4,30.2,46.0,38.7,32.9
27
- ImageLLM,[LLaMA-VID-7B](https://github.com/dvlab-research/LLaMA-VID),LLaMA-7B,59.9,67.6,37.9,75.4,71.2,68.9,62.9,58.4,50.7,70.1,76.1,54.7,42.8,35.2,35.6
28
- ImageLLM,[Pink-LLaMA2](https://github.com/SY-Xuan/Pink/stargazers),LLaMA2-7B,0,67.0,0,75.2,70.1,70.1,63.3,53.8,50.2,69.1,74.3,50.0,0,0,0
29
- ImageLLM,[InfMLLM-13B](https://github.com/mightyzau/InfMLLM),Vicuna-13B,62.3,69.6,41.5,75.5,73,70.4,66.2,63.3,54.2,72.2,77.9,37.2,49.5,39,33.9
30
- ImageLLM,[ShareGPT4V-7B](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V),Vicuna-7B,0,69.7,0,75.3,71.4,72.3,63.1,62,53.9,70.1,79.8,54.7,0,0,0
31
- ImageLLM,[ShareGPT4V-13B](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V),Vicuna-13B,0,70.8,0,75.9,74.1,73.5,66.8,62.4,54.8,75.3,77.3,46.5,0,0,0
32
- VideoLLM,[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,37.6,39,33.7,47.1,43.8,34.9,40,32.8,34.6,42.3,50.5,17.7,34.9,36.4,27.3
33
- VideoLLM,[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,31.2,33.9,23.5,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,27.6,21.3,21.1
34
- VideoLLM,[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,30.3,32,25.4,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,31.3,23.2,20.7
35
- Other,[Unified-IO-2 7B (2.5M)](),from scratch,60.5,65.6,46,70.7,69,67.4,55.4,62.6,45.5,60.8,67.1,58.1,57.5,43.2,34
36
- Other,[Unified-IO-2 7B](),from scratch,60.4,65.5,46,71.3,68.8,67.5,55.5,61.2,45.4,62.9,66.5,59.3,58,42.7,34
37
- Other,[Unified-IO-2 3B](),from scratch,58.7,63.8,44.2,68.8,65.8,67.2,52.9,60.4,43.1,55.7,64,41.9,57.5,36,39
38
- Other,[Unified-IO-2 1B](),from scratch,49.6,55.1,34,63.8,57.7,54.6,41.9,53.7,33.3,51.5,58.3,47.7,39.8,34.5,24.6
 
 
1
+ Model Type,Model,Language Model,Model Size,Evaluation Method,Avg. All,Avg. Img,Avg. Video,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Action Recognition,Action Prediction,Procedure Understanding
2
+ LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,3B,PPL,27.7,27.3,28.6,23.0,29.0,32.8,31.8,20.5,31.8,33.0,18.2,19.4,23.2,34.9,25.4
3
+ LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,7B,PPL,28.5,28.2,29.5,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,27.3,34.5,23.8
4
+ LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,7B,PPL,26.8,26.6,27.3,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37.0,9.0,33.0,23.1,26.2
5
+ ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,46.4,49.7,36.7,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,32.6,47.5,24.0
6
+ ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,52.7,57.8,38.3,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,33.1,49.1,27.1
7
+ ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,7B,PPL,53.4,58.8,38.1,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,34.5,49.6,23.1
8
+ ImageLLM,[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),Vicuna-13B,13B,Generate,61.6,68.2,42.7,74.9,71.3,68.9,63.5,61.3,51.4,73.2,77.0,60.5,48.9,41.1,36.6
9
+ ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,7B,PPL,42.8,47.4,29.9,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,38.2,24.5,27.1
10
+ ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,7B,PPL,39.1,41.8,31.4,51.9,44.1,39.9,36.1,33.7,36.4,32.0,53.2,30.6,39.5,24.3,31.9
11
+ ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,7B,PPL,33.2,34.5,29.2,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,36.9,25.8,24.0
12
+ ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,7B,PPL,33.9,35.2,30.4,44.9,38.6,32.2,30.9,26.3,31.8,32.0,51.4,31.8,37.9,27.2,24.8
13
+ ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,7B,PPL,39.7,42.9,30.6,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,36.8,29.2,23.8
14
+ ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,7B,PPL,33.1,34.5,29.3,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20.0,37.2,25.4,24.2
15
+ ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,7B,PPL,40.9,42.7,35.7,53.2,45.3,40.0,31.2,39.3,32.6,36.1,51.4,25.9,42.9,34.7,26.9
16
+ ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,7B,PPL,32.7,35.2,25.8,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,24.7,38.6,18.5,19.6
17
+ ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,7B,PPL,33.5,35.5,27.8,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,27.1,33.9,25.4,23.0
18
+ ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,PPL,34.0,37.9,23.0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,26.7,17.9,26.5
19
+ ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,1.3B,PPL,50.0,54.4,37.5,63.4,57.1,58.5,44.0,41.4,37.9,55.7,60.7,25.9,41.3,40.4,27.0
20
+ ImageLLM,[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,7B,PPL for A/B/C/D,58.2,65.4,37.8,73.3,67.3,69.6,57.7,52.9,48.2,59.8,74.6,53.5,43.9,39.2,26.7
21
+ ImageLLM,[Qwen-VL](https://huggingface.co/Qwen/Qwen-VL),Qwen-7B,7B,PPL for A/B/C/D,56.3,62.3,39.1,71.2,66.4,67.7,53.5,44.8,43.8,62.9,74.9,51.2,44.7,38.5,32.0
22
+ ImageLLM,[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,7B,NG,0.0,44.5,0.0,55.8,45.3,42.3,40.2,36.8,34.9,37.1,55.9,38.8,0.0,0.0,0.0
23
+ ImageLLM,[IDEFICS-80b-instruct](https://huggingface.co/HuggingFaceM4/idefics-80b-instruct),LLaMA-65B,65B,NG,0.0,53.2,0.0,64.0,52.6,50.8,48.3,46.1,45.5,62.9,68.0,51.8,0.0,0.0,0.0
24
+ ImageLLM,[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL,0.0,66.9,0.0,75.0,71.7,67.6,60.8,56.2,55.3,74.4,77.0,48.5,0.0,0.0,0.0
25
+ ImageLLM,[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,13B,PPL,48.9,53.7,35.4,64.1,54.2,54.1,46.5,45.3,38.2,51.6,60.7,44.7,37.8,45.3,20.0
26
+ ImageLLM,[mPLUG-Owl2](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,NG,57.8,64.1,39.8,72.7,67.6,63.6,53.6,58.5,50.8,70.1,76.4,30.2,46.0,38.7,32.9
27
+ ImageLLM,[LLaMA-VID-7B](https://github.com/dvlab-research/LLaMA-VID),LLaMA-7B,7B,Generate,59.9,67.6,37.9,75.4,71.2,68.9,62.9,58.4,50.7,70.1,76.1,54.7,42.8,35.2,35.6
28
+ ImageLLM,[Pink-LLaMA2](https://github.com/SY-Xuan/Pink),LLaMA2-7B,7B,NG,0.0,67.0,0.0,75.2,70.1,70.1,63.3,53.8,50.2,69.1,74.3,50.0,0.0,0.0,0.0
29
+ ImageLLM,[InfMLLM-13B](https://github.com/mightyzau/InfMLLM),Vicuna-13B,13B,NG,62.3,69.6,41.5,75.5,73.0,70.4,66.2,63.3,54.2,72.2,77.9,37.2,49.5,39.0,33.9
30
+ ImageLLM,[ShareGPT4V-7B](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V),Vicuna-7B,7B,Generate,0.0,69.7,0.0,75.3,71.4,72.3,63.1,62.0,53.9,70.1,79.8,54.7,0.0,0.0,0.0
31
+ ImageLLM,[ShareGPT4V-13B](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V),Vicuna-13B,13B,Generate,0.0,70.8,0.0,75.9,74.1,73.5,66.8,62.4,54.8,75.3,77.3,46.5,0.0,0.0,0.0
32
+ ImageLLM,[GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,67.3,69.1,60.5,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,57.6,65.7,51.7,63.4
33
+ VideoLLM,[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,37.6,39.0,33.7,47.1,43.8,34.9,40.0,32.8,34.6,42.3,50.5,17.7,34.9,36.4,27.3
34
+ VideoLLM,[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,31.2,33.9,23.5,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,27.6,21.3,21.1
35
+ VideoLLM,[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,30.3,32.0,25.4,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,31.3,23.2,20.7
36
+ Other,[Unified-IO-2 7B (2.5M)](),from scratch,7B,NG,60.5,65.6,46.0,70.7,69.0,67.4,55.4,62.6,45.5,60.8,67.1,58.1,57.5,43.2,34.0
37
+ Other,[Unified-IO-2 7B](),from scratch,7B,NG,60.4,65.5,46.0,71.3,68.8,67.5,55.5,61.2,45.4,62.9,66.5,59.3,58.0,42.7,34.0
38
+ Other,[Unified-IO-2 3B](),from scratch,3B,NG,58.7,63.8,44.2,68.8,65.8,67.2,52.9,60.4,43.1,55.7,64.0,41.9,57.5,36.0,39.0
39
+ Other,[Unified-IO-2 1B](),from scratch,1B,NG,49.6,55.1,34.0,63.8,57.7,54.6,41.9,53.7,33.3,51.5,58.3,47.7,39.8,34.5,24.6
file/result_v2.csv CHANGED
@@ -1,24 +1,25 @@
1
- Model,Language Model,Avg. P1,Avg. P2,Avg. P3,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Celebrity Recognition,Landmark Recognition,Chart Understanding,Visual Referring Expression,Science Knowledge,Emotion Recognition,Visual Mathematics,Difference Spotting,Meme Comprehension,Global Video Understanding,Action Recognition,Action Predicion,Procedure Understanding,In-Context Captioning,Interleaved Image-Text Analysis,Text-to-Image Generation,Next Image Prediction,Text-Image Creation
2
- [BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,41,35.3,0,58.5,48.6,49,39.1,43.4,36.2,48.5,52.9,60.7,51.8,51.4,19.2,43.2,52.4,29.3,22,17.8,38.6,42.5,37.7,36.2,22.9,40,30.6,0,0,0
3
- [InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,42.2,35.7,0,58.9,49.7,61.7,35.1,58.1,34.9,47.4,55.9,61.4,48.5,45.4,26.4,41.7,47.7,34.5,21.2,22.8,35.2,41.5,36.1,40.5,24.5,36.7,34.7,0,0,0
4
- [InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,41.4,29.7,0,53.6,43.9,49,37.8,56.5,35.8,43.3,56.2,57.2,60.3,44.4,27.9,39.2,39.4,23,26.5,36.5,55.4,40.4,38.6,31.2,15.6,26.7,32.7,0,0,0
5
- [LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,38.7,30.2,0,53.8,47.5,38.3,34.2,42,34.7,40.2,52.9,46.4,51.8,45.6,30.3,40.2,37.6,34.3,20.5,27,50,44.1,36.2,25.1,18.6,40,20.4,0,0,0
6
- [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,39.4,34.1,0,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,41.8,55.2,45.2,20.2,41.2,43.3,24.2,25,19,46.7,39,38.7,27.4,28.6,45.8,22.5,0,0,0
7
- [VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,36.2,23.9,0,46.9,38.6,33.6,35.6,27.5,34.4,33,50.8,47.6,52.4,38.2,30.1,34.7,36.1,31.5,27.3,24.6,44,37.8,38.2,20.9,33.5,19.2,28.6,0,0,0
8
- [MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,37.4,34.9,0,46.9,42.5,32,32.3,27.7,29.7,29.9,48.3,35.2,60.9,50.4,24.2,42.2,37.6,32.1,27.3,40.1,56.5,37.6,38.7,25.3,24.4,39.2,30.6,0,0,0
9
- [Otter](https://github.com/Luodian/Otter),LLaMA-7B,36.4,36.6,0,45.9,39.7,31.9,31.6,26.4,32,33,49.2,39.3,59.7,53,23.6,41.2,36.1,37.3,22,27.4,46.7,36.6,37.9,26,24.8,42.5,30.6,0,0,0
10
- [OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,37.3,35.5,0,46.7,42.3,31.7,33.4,27.4,29.8,29.9,47.7,35.6,60.3,49.8,24.2,42.2,39,32.1,27.3,39.9,54.9,37.6,38.4,25.2,24.1,38.3,32.7,0,0,0
11
- [LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,37.5,0,0,45.2,38.5,29.3,33,29.7,35.5,39.2,52,48.7,58.5,46.4,24.2,41.2,40.1,39.7,23.5,29.1,52.2,41.9,38.2,18.8,20.3,0,0,0,0,0
12
- [GVT](https://github.com/TencentARC/GVT),Vicuna-7B,34.4,38.6,0,41.7,35.5,31.8,29.5,36.2,32,32,51.1,35.2,39.4,36.4,25,36.2,31.1,20.6,22.7,41.5,59.2,40.4,29.7,26.3,24.1,42.5,34.7,0,0,0
13
- [mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,39.4,28.9,0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,49.2,70.9,49.6,23.2,44.2,44,32.5,23.5,33.5,54.9,42,37.8,18.3,19.3,29.2,28.6,0,0,0
14
- [Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder only 1.3B,46.3,23.3,0,63.4,57.1,58.5,44,41.4,37.9,55.7,60.7,68.1,82.1,51.4,21.2,48.2,43.7,30.7,28,25.2,42.8,48.5,40.8,39.5,30,24.2,22.5,0,0,0
15
- [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,43.1,35.5,0,56.5,47.6,54.8,46.9,54.2,40.3,55.7,55,47.4,62.4,55.6,25.2,43.7,41.2,20.6,28.8,34.3,47.2,39.7,42.8,29.6,19.1,42.5,28.6,0,0,0
16
- [LLaVA-1.5](https://github.com/haotian-liu/LLaVA),vicuna-7B,47.3,30.8,0,63.7,62.4,66.7,51.3,60.2,38.5,47.4,59.8,69,60.6,49.8,25,45.7,56.7,31.1,24.2,35.7,50.3,46.1,39.4,29.4,28.1,39.2,22.5,0,0,0
17
- [IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,38,40.3,0,48.2,38.2,37.8,32.9,29,32.4,37.1,54.1,45.5,52.4,52.8,22.6,42.7,33.2,26.6,21.2,56.5,48.4,42.7,38.6,23.6,20.5,45.8,34.7,0,0,0
18
- [InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,59.2,32.1,0,74.8,70.5,67.6,60.5,55.3,53.4,76.3,76.1,61.4,86.1,78,27.2,60.3,84.8,68.9,25.8,47.7,56.6,58.6,49.9,37.6,24.9,27.5,36.7,0,0,0
19
- [Emu](https://github.com/baaivision/Emu),LLaMA-13B,42.5,41.1,41.4,59,50,43.7,37.1,44.3,33.6,49.5,58.3,61.4,68.8,61.6,19,45.7,41.5,24.2,26.4,29.3,37.1,41.9,42.7,37.9,21.8,51.7,30.6,46.8,43.2,34.2
20
- [Next-GPT](https://github.com/NExT-GPT/NExT-GPT),vicuna-7B,30.7,35.6,33.9,36.4,35.1,25.6,29.9,36.1,30.9,39.2,41.7,31,30.9,27.4,21.2,34.2,31.8,24.4,17.4,24.2,39,35.5,33.8,25.6,24.5,46.7,24.5,45.1,19.8,36.7
21
- [seed-llama](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,43.9,43.4,52.3,64,55,51.3,45.4,43.3,37.9,56.7,59.2,57,55.5,52.8,18.8,49.3,44.8,28.8,24.4,29.5,41.5,46.7,39.4,43.9,20.3,54.2,32.7,50.2,40.7,65.8
22
- [VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,37,35.3,0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34,30.6,27.4,40,30.6,0,0,0
23
- [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,36.4,31,0,44.1,37,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22,33.2,37.2,22.4,25,46.1,61.4,42.6,32.2,27,19,37.5,24.5,0,0,0
24
- [Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,34.5,32.2,0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52,35.2,44.9,43.4,23.8,33.2,37.2,26,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0,0,0
 
 
1
+ Model,Language Model,Model Size,Evaluation Method,Avg. P1,Avg. P2,Avg. P3,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Celebrity Recognition,Landmark Recognition,Chart Understanding,Visual Referring Expression,Science Knowledge,Emotion Recognition,Visual Mathematics,Difference Spotting,Meme Comprehension,Global Video Understanding,Action Recognition,Action Predicion,Procedure Understanding,In-Context Captioning,Interleaved Image-Text Analysis,Text-to-Image Generation,Next Image Prediction,Text-Image Creation
2
+ [BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,41.0,35.3,0.0,58.5,48.6,49.0,39.1,43.4,36.2,48.5,52.9,60.7,51.8,51.4,19.2,43.2,52.4,29.3,22.0,17.8,38.6,42.5,37.7,36.2,22.9,40.0,30.6,0.0,0.0,0.0
3
+ [InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,42.2,35.7,0.0,58.9,49.7,61.7,35.1,58.1,34.9,47.4,55.9,61.4,48.5,45.4,26.4,41.7,47.7,34.5,21.2,22.8,35.2,41.5,36.1,40.5,24.5,36.7,34.7,0.0,0.0,0.0
4
+ [InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,7B,PPL,41.4,29.7,0.0,53.6,43.9,49.0,37.8,56.5,35.8,43.3,56.2,57.2,60.3,44.4,27.9,39.2,39.4,23.0,26.5,36.5,55.4,40.4,38.6,31.2,15.6,26.7,32.7,0.0,0.0,0.0
5
+ [LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,7B,PPL,38.7,30.2,0.0,53.8,47.5,38.3,34.2,42.0,34.7,40.2,52.9,46.4,51.8,45.6,30.3,40.2,37.6,34.3,20.5,27.0,50.0,44.1,36.2,25.1,18.6,40.0,20.4,0.0,0.0,0.0
6
+ [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,7B,PPL,39.4,34.1,0.0,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,41.8,55.2,45.2,20.2,41.2,43.3,24.2,25.0,19.0,46.7,39.0,38.7,27.4,28.6,45.8,22.5,0.0,0.0,0.0
7
+ [VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,7B,PPL,36.2,23.9,0.0,46.9,38.6,33.6,35.6,27.5,34.4,33.0,50.8,47.6,52.4,38.2,30.1,34.7,36.1,31.5,27.3,24.6,44.0,37.8,38.2,20.9,33.5,19.2,28.6,0.0,0.0,0.0
8
+ [MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,7B,PPL,37.4,34.9,0.0,46.9,42.5,32.0,32.3,27.7,29.7,29.9,48.3,35.2,60.9,50.4,24.2,42.2,37.6,32.1,27.3,40.1,56.5,37.6,38.7,25.3,24.4,39.2,30.6,0.0,0.0,0.0
9
+ [Otter](https://github.com/Luodian/Otter),LLaMA-7B,7B,PPL,36.4,36.6,0.0,45.9,39.7,31.9,31.6,26.4,32.0,33.0,49.2,39.3,59.7,53.0,23.6,41.2,36.1,37.3,22.0,27.4,46.7,36.6,37.9,26.0,24.8,42.5,30.6,0.0,0.0,0.0
10
+ [OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,7B,PPL,37.3,35.5,0.0,46.7,42.3,31.7,33.4,27.4,29.8,29.9,47.7,35.6,60.3,49.8,24.2,42.2,39.0,32.1,27.3,39.9,54.9,37.6,38.4,25.2,24.1,38.3,32.7,0.0,0.0,0.0
11
+ [LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,7B,PPL,37.5,0.0,0.0,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,48.7,58.5,46.4,24.2,41.2,40.1,39.7,23.5,29.1,52.2,41.9,38.2,18.8,20.3,0.0,0.0,0.0,0.0,0.0
12
+ [GVT](https://github.com/TencentARC/GVT),Vicuna-7B,7B,PPL,34.4,38.6,0.0,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,35.2,39.4,36.4,25.0,36.2,31.1,20.6,22.7,41.5,59.2,40.4,29.7,26.3,24.1,42.5,34.7,0.0,0.0,0.0
13
+ [mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,PPL,39.4,28.9,0.0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,49.2,70.9,49.6,23.2,44.2,44.0,32.5,23.5,33.5,54.9,42.0,37.8,18.3,19.3,29.2,28.6,0.0,0.0,0.0
14
+ [Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder only 1.3B,1.3B,PPL,46.3,23.3,0.0,63.4,57.1,58.5,44.0,41.4,37.9,55.7,60.7,68.1,82.1,51.4,21.2,48.2,43.7,30.7,28.0,25.2,42.8,48.5,40.8,39.5,30.0,24.2,22.5,0.0,0.0,0.0
15
+ [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,7B,PPL,43.1,35.5,0.0,56.5,47.6,54.8,46.9,54.2,40.3,55.7,55.0,47.4,62.4,55.6,25.2,43.7,41.2,20.6,28.8,34.3,47.2,39.7,42.8,29.6,19.1,42.5,28.6,0.0,0.0,0.0
16
+ [LLaVA-1.5](https://github.com/haotian-liu/LLaVA),vicuna-7B,7B,PPL,47.3,30.8,0.0,63.7,62.4,66.7,51.3,60.2,38.5,47.4,59.8,69.0,60.6,49.8,25.0,45.7,56.7,31.1,24.2,35.7,50.3,46.1,39.4,29.4,28.1,39.2,22.5,0.0,0.0,0.0
17
+ [IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,7B,PPL,38.0,40.3,0.0,48.2,38.2,37.8,32.9,29.0,32.4,37.1,54.1,45.5,52.4,52.8,22.6,42.7,33.2,26.6,21.2,56.5,48.4,42.7,38.6,23.6,20.5,45.8,34.7,0.0,0.0,0.0
18
+ [InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL,59.2,32.1,0.0,74.8,70.5,67.6,60.5,55.3,53.4,76.3,76.1,61.4,86.1,78.0,27.2,60.3,84.8,68.9,25.8,47.7,56.6,58.6,49.9,37.6,24.9,27.5,36.7,0.0,0.0,0.0
19
+ [Emu](https://github.com/baaivision/Emu),LLaMA-13B,13B,PPL,42.5,41.1,41.4,59.0,50.0,43.7,37.1,44.3,33.6,49.5,58.3,61.4,68.8,61.6,19.0,45.7,41.5,24.2,26.4,29.3,37.1,41.9,42.7,37.9,21.8,51.7,30.6,46.8,43.2,34.2
20
+ [Next-GPT](https://github.com/NExT-GPT/NExT-GPT),vicuna-7B,7B,PPL,30.7,35.6,33.9,36.4,35.1,25.6,29.9,36.1,30.9,39.2,41.7,31.0,30.9,27.4,21.2,34.2,31.8,24.4,17.4,24.2,39.0,35.5,33.8,25.6,24.5,46.7,24.5,45.1,19.8,36.7
21
+ [SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,13B,PPL,43.9,43.4,52.3,64.0,55.0,51.3,45.4,43.3,37.9,56.7,59.2,57.0,55.5,52.8,18.8,49.3,44.8,28.8,24.4,29.5,41.5,46.7,39.4,43.9,20.3,54.2,32.7,50.2,40.7,65.8
22
+ [GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,68.1,44.2,0.0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,57.6,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0.0,0.0,0.0
23
+ [VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,37.0,35.3,0.0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34.0,30.6,27.4,40.0,30.6,0.0,0.0,0.0
24
+ [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,36.4,31.0,0.0,44.1,37.0,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22.0,33.2,37.2,22.4,25.0,46.1,61.4,42.6,32.2,27.0,19.0,37.5,24.5,0.0,0.0,0.0
25
+ [Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.5,32.2,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0