add visdoc sub task scores

#52
by MINGYISU - opened
Files changed (3)
  1. app.py +18 -7
  2. utils.py +5 -22
  3. utils_v2.py +16 -11
app.py CHANGED

@@ -11,12 +11,9 @@ def update_table(query, min_size, max_size, selected_tasks=None):
         filtered_df = filtered_df[selected_columns]
     return filtered_df
 
-def update_table_v2(query, min_size, max_size, selected_tasks=None):
+def update_table_v2(query, min_size, max_size):
     df = v2.get_df()
     filtered_df = v2.search_and_filter_models(df, query, min_size, max_size)
-    if selected_tasks and len(selected_tasks) > 0:
-        selected_columns = v2.BASE_COLS + selected_tasks
-        filtered_df = filtered_df[selected_columns]
     return filtered_df
 
 with gr.Blocks() as block:
@@ -42,6 +39,7 @@ with gr.Blocks() as block:
                 elem_id="search-bar"
             )
 
+    df = get_df()
     df2 = v2.get_df()
     min_size2, max_size2 = get_size_range(df2)
 
@@ -92,11 +90,25 @@ with gr.Blocks() as block:
                 )
             refresh_button2.click(fn=v2.refresh_data, outputs=data_component2)
 
+
+        def get_special_processed_df2():
+            """Temporary special processing to merge v1 scores with v2 image scores.
+            Will be removed later after v2 is fully adopted."""
+            df2_i = df2[v2.COLUMN_NAMES_I]
+            df1 = df.rename(columns={'V1-Overall': 'Image-Overall'})
+            df1 = df1[v2.BASE_COLS + v2.SUB_TASKS_I + ['Image-Overall']]
+            combined_df = pd.concat([df1, df2_i], ignore_index=True)
+            for task in v2.TASKS_I:
+                combined_df[task] = combined_df[task].apply(lambda score: '-' if pd.isna(score) else score)
+            combined_df = v2.rank_models(combined_df, 'Image-Overall')
+            return combined_df[v2.COLUMN_NAMES_I]
+
         # table 2, image scores only
         with gr.TabItem("🖼️ Image", elem_id="qa-tab-table1", id=2):
             gr.Markdown(v2.TABLE_INTRODUCTION_I)
+            df2_i = get_special_processed_df2()
             data_component3 = gr.components.Dataframe(
-                value=v2.rank_models(df2[v2.COLUMN_NAMES_I], 'Image-Overall'),
+                value=df2_i,
                 headers=v2.COLUMN_NAMES_I,
                 type="pandas",
                 datatype=v2.DATA_TITLE_TYPE_I,
@@ -122,7 +134,7 @@ with gr.Blocks() as block:
         with gr.TabItem("📑 Visual Doc", elem_id="qa-tab-table1", id=4):
             gr.Markdown(v2.TABLE_INTRODUCTION_D)
             data_component5 = gr.components.Dataframe(
-                value=v2.rank_models(df2[v2.COLUMN_NAMES_D], 'VisDoc'),
+                value=v2.rank_models(df2[v2.COLUMN_NAMES_D], 'Visdoc-Overall'),
                 headers=v2.COLUMN_NAMES_D,
                 type="pandas",
                 datatype=v2.DATA_TITLE_TYPE_D,
@@ -160,7 +172,6 @@ with gr.Blocks() as block:
                 elem_id="search-bar"
            )
 
-    df = get_df()
     min_size, max_size = get_size_range(df)
 
     with gr.Row():
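
To see what the temporary merge in `get_special_processed_df2` does, here is a self-contained sketch on toy frames (the column names and scores below are made up; only the concat / NaN-to-`'-'` / rank pattern is taken from the diff):

```python
import pandas as pd

# Toy stand-in for the v1 table after renaming 'V1-Overall' to 'Image-Overall':
# v1 rows carry only the coarse sub-task scores.
df1 = pd.DataFrame({
    "Models": ["model-a"],
    "I-CLS": [55.0],
    "Image-Overall": [52.0],
})
# Toy stand-in for the v2 image table: v2 rows also carry per-dataset scores.
df2_i = pd.DataFrame({
    "Models": ["model-b"],
    "I-CLS": [60.0],
    "ImageNet-1K": [71.0],
    "Image-Overall": [58.0],
})

# Same pattern as the PR: concatenate, then show '-' wherever a model has no score.
combined = pd.concat([df1, df2_i], ignore_index=True)
score_cols = [c for c in combined.columns if c != "Models"]
for col in score_cols:
    combined[col] = combined[col].apply(lambda s: "-" if pd.isna(s) else s)

# Rank by the shared 'Image-Overall' column, as rank_models() does.
combined = combined.sort_values(by="Image-Overall", ascending=False).reset_index(drop=True)
combined.insert(0, "Rank", range(1, len(combined) + 1))
print(combined)
```

The `'-'` placeholders appear for v1 models on the per-dataset columns they never reported, while the shared `Image-Overall` column still drives the ranking.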
utils.py CHANGED

@@ -38,8 +38,8 @@ This comprehensive suite enables robust evaluation of multimodal embedding model
 | [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
 """
 
-TABLE_INTRODUCTION = """***Important Notes:*** \n
-**We will be depreciating the MMEB-V1 leaderboard soon, and we will be releasing MMEB-V2 with more detailed scores and automatic evaluation.** \n"""
+TABLE_INTRODUCTION = """***Important Notes: ***
+This is the MMEB-V1 leaderboard, which is now deprecated. MMEB-V1 is now the Image section of MMEB-V2, and the results on this leaderboard have been integrated into MMEB-V2 Image tab. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. Thank you for your collaborations and understanding! \n"""
 
 LEADERBOARD_INFO = """
 ## Dataset Summary
@@ -57,9 +57,9 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
 
 ## ⚠ Please note that you need to submit the JSON file with the following format:
 
-### ***Important Notes: We have released MMEB-V2 and will deprecate MMEB-V1 soon. All further submissions should be made using the V2 format (see following).***
+### ***Important Notes: We have released MMEB-V2 and will deprecate MMEB-V1 soon.*** \n
+### ***All further submissions should be made using the V2 format (see following).*** \n
 ### ***In V2, the detailed scores of each dataset will be included, and our code will automatically generate the results and calculate the overall scores. See the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for more information.***
-### **A V2 Submission would look like this:**
 ```json
 {
   "metadata": {
@@ -103,23 +103,6 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
   }
 }
 ```
-
-### **TO SUBMIT V1 ONLY (Depreciated, but we still accept this format until 2025-06-30)**
-```json
-[
-    {
-        "Model": "<Model Name>",
-        "URL": "<Model URL>" or null,
-        "Model Size(B)": 1000 or null,
-        "Data Source": "Self-Reported",
-        "V1-Overall": 50.0,
-        "I-CLS": 50.0,
-        "I-QA": 50.0,
-        "I-RET": 50.0,
-        "I-VG": 50.0
-    },
-]
-```
 Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions about evaluating your model. \n
 To submit, create a pull request and upload the generated JSON file to the ***scores*** folder, then send us an email at [email protected], including your model's information. \n We will review your submission and update the leaderboard accordingly. \n
 Please also share any feedback or suggestions you have for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
@@ -195,7 +178,7 @@ def process_model_size(size):
         return 'unknown'
     try:
         val = float(size)
-        return val
+        return round(val, 3)
     except (ValueError, TypeError):
         return 'unknown'
 
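
The `process_model_size` tweak only changes how reported sizes are displayed. A quick check of the new rounding; note the guard clause above the `try` is sketched from the surrounding context, not copied from the file:

```python
def process_model_size(size):
    """Sketch of the patched helper: non-numeric input falls back to 'unknown',
    numeric sizes are now rounded to three decimals instead of returned as-is."""
    if size is None:                  # assumed guard; the hunk only shows its `return 'unknown'`
        return 'unknown'
    try:
        val = float(size)
        return round(val, 3)          # previously: return val
    except (ValueError, TypeError):
        return 'unknown'

print(process_model_size("8.2916"))   # 8.292 (was 8.2916 before this change)
print(process_model_size("N/A"))      # 'unknown'
```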
 
utils_v2.py CHANGED

@@ -20,7 +20,10 @@ DATASETS = {
         "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W']
     },
     "visdoc": {
-        "VisDoc": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry', 'VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA', 'ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc', "ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2", "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2", "ViDoRe_esg_reports_v2_multilingual"]
+        "ViDoRe-V1": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry'],
+        "ViDoRe-V2": ["ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2", "ViDoRe_economics_reports_v2", "ViDoRe_esg_reports_v2"],  # Following abandoned: "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2_multilingual"
+        "VisRAG": ['VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA'],
+        "VisDoc-OOD": ['ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
     },
     "video": {
         "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
@@ -37,27 +40,29 @@ SPECIAL_METRICS = {
 }
 
 BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
-TASKS = ["Overall", "I-CLS", "I-QA", "I-RET", "I-VG", "VisDoc", "V-CLS", "V-QA", "V-RET", "V-MRET"]
 BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']
 
-COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'VisDoc']
+COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'Visdoc-Overall']
 DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + \
     ['number'] * 3
 
-TASKS_I = ['Image-Overall'] + TASKS[1:5] + ALL_DATASETS_SPLITS['image']
+SUB_TASKS_I = ["I-CLS", "I-QA", "I-RET", "I-VG"]
+TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
 COLUMN_NAMES_I = BASE_COLS + TASKS_I
 DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + \
-    ['number'] * (len(TASKS_I) + 4)
+    ['number'] * len(TASKS_I + SUB_TASKS_I)
 
-TASKS_V = ['Video-Overall'] + TASKS[6:10] + ALL_DATASETS_SPLITS['video']
+SUB_TASKS_V = ["V-CLS", "V-QA", "V-RET", "V-MRET"]
+TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
 COLUMN_NAMES_V = BASE_COLS + TASKS_V
 DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + \
-    ['number'] * (len(TASKS_V) + 4)
+    ['number'] * len(TASKS_V + SUB_TASKS_V)
 
-TASKS_D = ['VisDoc'] + ALL_DATASETS_SPLITS['visdoc']
+SUB_TASKS_D = ['ViDoRe-V1', 'ViDoRe-V2', 'VisRAG', 'VisDoc-OOD']
+TASKS_D = ['Visdoc-Overall'] + SUB_TASKS_D + ALL_DATASETS_SPLITS['visdoc']
 COLUMN_NAMES_D = BASE_COLS + TASKS_D
 DATA_TITLE_TYPE_D = BASE_DATA_TITLE_TYPE + \
-    ['number'] * len(TASKS_D)
+    ['number'] * len(TASKS_D + SUB_TASKS_D)
 
 TABLE_INTRODUCTION = """**MMEB**: Massive MultiModal Embedding Benchmark \n
 Models are ranked based on **Overall**"""
@@ -147,10 +152,10 @@ def generate_model_row(data):
     row.update(scores)
     return row
 
-def rank_models(df, column='Overall'):
+def rank_models(df, column='Overall', rank_name='Rank'):
     """Ranks the models based on the specific score."""
     df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
-    df['Rank'] = range(1, len(df) + 1)
+    df[rank_name] = range(1, len(df) + 1)
     return df
 
 def get_df():
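
This diff introduces the four visdoc sub-task columns and a generic `rank_name` argument on `rank_models`, but the aggregation that fills those columns lives in the score-generation code, which is not shown here. The sketch below is only one plausible way the sub-task and overall scores could be derived from per-dataset scores, using plain means over the new `DATASETS['visdoc']` groups; the helper name, the averaging rule, and the toy numbers are assumptions, not the repository's code:

```python
import pandas as pd

# Grouping mirrors the updated DATASETS["visdoc"], truncated here for brevity.
VISDOC_GROUPS = {
    "ViDoRe-V1": ["ViDoRe_arxivqa", "ViDoRe_docvqa"],
    "ViDoRe-V2": ["ViDoRe_esg_reports_v2"],
    "VisRAG": ["VisRAG_ArxivQA", "VisRAG_ChartQA"],
    "VisDoc-OOD": ["ViDoSeek-page", "MMLongBench-doc"],
}

def add_visdoc_subtask_scores(df):
    """Hypothetical aggregation: average each group's datasets into one sub-task
    column, then average the four sub-tasks into 'Visdoc-Overall'."""
    for group, datasets in VISDOC_GROUPS.items():
        df[group] = df[datasets].mean(axis=1).round(2)
    df["Visdoc-Overall"] = df[list(VISDOC_GROUPS)].mean(axis=1).round(2)
    return df

def rank_models(df, column="Overall", rank_name="Rank"):
    """Same signature as the patched utils_v2.rank_models: sort by `column`,
    then write 1..n into the column named `rank_name`."""
    df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
    df[rank_name] = range(1, len(df) + 1)
    return df

scores = pd.DataFrame({
    "Models": ["model-a", "model-b"],
    "ViDoRe_arxivqa": [80.0, 70.0], "ViDoRe_docvqa": [60.0, 75.0],
    "ViDoRe_esg_reports_v2": [55.0, 65.0],
    "VisRAG_ArxivQA": [82.0, 77.0], "VisRAG_ChartQA": [68.0, 72.0],
    "ViDoSeek-page": [50.0, 58.0], "MMLongBench-doc": [45.0, 52.0],
})
ranked = rank_models(add_visdoc_subtask_scores(scores), column="Visdoc-Overall")
print(ranked[["Rank", "Models", "Visdoc-Overall"] + list(VISDOC_GROUPS)])
```

The extra `rank_name` parameter presumably lets the same helper write a rank column under a different name without clobbering the main `Rank` used on the overview tab; the default keeps the old behavior.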