add visdoc sub task scores
#52 · opened by MINGYISU
- app.py +18 -7
- utils.py +5 -22
- utils_v2.py +16 -11
app.py
CHANGED
@@ -11,12 +11,9 @@ def update_table(query, min_size, max_size, selected_tasks=None):
         filtered_df = filtered_df[selected_columns]
     return filtered_df
 
-def update_table_v2(query, min_size, max_size, selected_tasks=None):
+def update_table_v2(query, min_size, max_size):
     df = v2.get_df()
     filtered_df = v2.search_and_filter_models(df, query, min_size, max_size)
-    if selected_tasks and len(selected_tasks) > 0:
-        selected_columns = v2.BASE_COLS + selected_tasks
-        filtered_df = filtered_df[selected_columns]
     return filtered_df
 
 with gr.Blocks() as block:
@@ -42,6 +39,7 @@ with gr.Blocks() as block:
            elem_id="search-bar"
        )
 
+    df = get_df()
     df2 = v2.get_df()
     min_size2, max_size2 = get_size_range(df2)
 
@@ -92,11 +90,25 @@ with gr.Blocks() as block:
            )
            refresh_button2.click(fn=v2.refresh_data, outputs=data_component2)
 
+
+    def get_special_processed_df2():
+        """Temporary special processing to merge v1 scores with v2 image scores.
+        Will be removed later after v2 is fully adopted."""
+        df2_i = df2[v2.COLUMN_NAMES_I]
+        df1 = df.rename(columns={'V1-Overall': 'Image-Overall'})
+        df1 = df1[v2.BASE_COLS + v2.SUB_TASKS_I + ['Image-Overall']]
+        combined_df = pd.concat([df1, df2_i], ignore_index=True)
+        for task in v2.TASKS_I:
+            combined_df[task] = combined_df[task].apply(lambda score: '-' if pd.isna(score) else score)
+        combined_df = v2.rank_models(combined_df, 'Image-Overall')
+        return combined_df[v2.COLUMN_NAMES_I]
+
        # table 2, image scores only
        with gr.TabItem("🖼️ Image", elem_id="qa-tab-table1", id=2):
            gr.Markdown(v2.TABLE_INTRODUCTION_I)
+           df2_i = get_special_processed_df2()
            data_component3 = gr.components.Dataframe(
-               value=
+               value=df2_i,
                headers=v2.COLUMN_NAMES_I,
                type="pandas",
                datatype=v2.DATA_TITLE_TYPE_I,
@@ -122,7 +134,7 @@ with gr.Blocks() as block:
        with gr.TabItem("📑 Visual Doc", elem_id="qa-tab-table1", id=4):
            gr.Markdown(v2.TABLE_INTRODUCTION_D)
            data_component5 = gr.components.Dataframe(
-               value=v2.rank_models(df2[v2.COLUMN_NAMES_D], '
+               value=v2.rank_models(df2[v2.COLUMN_NAMES_D], 'Visdoc-Overall'),
                headers=v2.COLUMN_NAMES_D,
                type="pandas",
                datatype=v2.DATA_TITLE_TYPE_D,
@@ -160,7 +172,6 @@ with gr.Blocks() as block:
            elem_id="search-bar"
        )
 
-    df = get_df()
     min_size, max_size = get_size_range(df)
 
     with gr.Row():
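For reviewers unfamiliar with the merge step, here is a minimal, self-contained sketch of what the new `get_special_processed_df2` helper does. The column subset, dataset column (`ImageNet-1K`), and model names/scores are made up for illustration and are not the real leaderboard data:

```python
# Toy illustration of the v1 -> v2 image-score merge performed by
# get_special_processed_df2 (reduced column set, made-up scores).
import pandas as pd

BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
SUB_TASKS_I = ['I-CLS', 'I-QA']                   # stands in for the full sub-task list
DATASET_COLS_I = ['ImageNet-1K']                  # stands in for the per-dataset columns
TASKS_I = ['Image-Overall'] + SUB_TASKS_I + DATASET_COLS_I
COLUMN_NAMES_I = BASE_COLS + TASKS_I

# A v2 row already carries per-dataset scores; a v1 row only has the old overall and sub-task scores.
df2_i = pd.DataFrame([{'Rank': 0, 'Models': 'model-b', 'Model Size(B)': 2.0,
                       'Image-Overall': 58.7, 'I-CLS': 61.0, 'I-QA': 56.4, 'ImageNet-1K': 70.2}])
df1 = pd.DataFrame([{'Rank': 0, 'Models': 'model-a', 'Model Size(B)': 7.0,
                     'V1-Overall': 55.2, 'I-CLS': 60.1, 'I-QA': 50.3}])

# Rename the v1 overall column, keep only the shared columns, and stack the two frames.
df1 = df1.rename(columns={'V1-Overall': 'Image-Overall'})
df1 = df1[BASE_COLS + SUB_TASKS_I + ['Image-Overall']]
combined = pd.concat([df1, df2_i], ignore_index=True)

# v1 rows have no per-dataset scores, so those cells come back as NaN; display them as '-'.
for task in TASKS_I:
    combined[task] = combined[task].apply(lambda s: '-' if pd.isna(s) else s)

# Equivalent of v2.rank_models(combined, 'Image-Overall'): sort descending and renumber.
combined = combined.sort_values(by='Image-Overall', ascending=False).reset_index(drop=True)
combined['Rank'] = range(1, len(combined) + 1)
print(combined[COLUMN_NAMES_I])
```

The net effect is that legacy MMEB-V1 entries appear in the V2 Image tab with their per-dataset cells rendered as `-` until proper V2 scores are submitted.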
utils.py
CHANGED
@@ -38,8 +38,8 @@ This comprehensive suite enables robust evaluation of multimodal embedding model
 | [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
 """
 
-TABLE_INTRODUCTION = """***Important Notes
-
+TABLE_INTRODUCTION = """***Important Notes: ***
+This is the MMEB-V1 leaderboard, which is now deprecated. MMEB-V1 is now the Image section of MMEB-V2, and the results on this leaderboard have been integrated into MMEB-V2 Image tab. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. Thank you for your collaborations and understanding! \n"""
 
 LEADERBOARD_INFO = """
 ## Dataset Summary
@@ -57,9 +57,9 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
 
 ## ⚠ Please note that you need to submit the JSON file with the following format:
 
-### ***Important Notes: We have released MMEB-V2 and will deprecate MMEB-V1 soon
+### ***Important Notes: We have released MMEB-V2 and will deprecate MMEB-V1 soon.*** \n
+### ***All further submissions should be made using the V2 format (see following).*** \n
 ### ***In V2, the detailed scores of each dataset will be included, and our code will automatically generate the results and calculate the overall scores. See the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for more information.***
-### **A V2 Submission would look like this:**
 ```json
 {
     "metadata": {
@@ -103,23 +103,6 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
     }
 }
 ```
-
-### **TO SUBMIT V1 ONLY (Depreciated, but we still accept this format until 2025-06-30)**
-```json
-[
-    {
-        "Model": "<Model Name>",
-        "URL": "<Model URL>" or null,
-        "Model Size(B)": 1000 or null,
-        "Data Source": "Self-Reported",
-        "V1-Overall": 50.0,
-        "I-CLS": 50.0,
-        "I-QA": 50.0,
-        "I-RET": 50.0,
-        "I-VG": 50.0
-    },
-]
-```
 Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions about evaluating your model. \n
 To submit, create a pull request and upload the generated JSON file to the ***scores*** folder, then send us an email at [email protected], including your model's information. \n We will review your submission and update the leaderboard accordingly. \n
 Please also share any feedback or suggestions you have for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
@@ -195,7 +178,7 @@ def process_model_size(size):
         return 'unknown'
     try:
         val = float(size)
-        return val
+        return round(val, 3)
     except (ValueError, TypeError):
         return 'unknown'
 
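As a quick sanity check of the `process_model_size` change, the stripped-down stand-in below (hypothetical name `_to_size`; the guard clause that precedes the `try` block in utils.py is omitted here) shows the intended behavior:

```python
# Sizes that parse as floats are now rounded to three decimals;
# unparsable values still fall back to 'unknown'.
def _to_size(size):
    try:
        return round(float(size), 3)
    except (ValueError, TypeError):
        return 'unknown'

print(_to_size('7'))        # 7.0
print(_to_size(8.02549))    # 8.025
print(_to_size('n/a'))      # unknown
print(_to_size(None))       # unknown
```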
utils_v2.py
CHANGED
@@ -20,7 +20,10 @@ DATASETS = {
         "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W']
     },
     "visdoc": {
-        "
+        "ViDoRe-V1": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry'],
+        "ViDoRe-V2": ["ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2", "ViDoRe_economics_reports_v2", "ViDoRe_esg_reports_v2"],  # Following Abandoned: "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2_multilingual"
+        "VisRAG": ['VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA'],
+        "VisDoc-OOD": ['ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
     },
     "video": {
         "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
@@ -37,27 +40,29 @@ SPECIAL_METRICS = {
 }
 
 BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
-TASKS = ["Overall", "I-CLS", "I-QA", "I-RET", "I-VG", "VisDoc", "V-CLS", "V-QA", "V-RET", "V-MRET"]
 BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']
 
-COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', '
+COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'Visdoc-Overall']
 DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + \
     ['number'] * 3
 
-
+SUB_TASKS_I = ["I-CLS", "I-QA", "I-RET", "I-VG"]
+TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
 COLUMN_NAMES_I = BASE_COLS + TASKS_I
 DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + \
-    ['number'] *
+    ['number'] * len(TASKS_I + SUB_TASKS_I)
 
-
+SUB_TASKS_V = ["V-CLS", "V-QA", "V-RET", "V-MRET"]
+TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
 COLUMN_NAMES_V = BASE_COLS + TASKS_V
 DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + \
-    ['number'] *
+    ['number'] * len(TASKS_V + SUB_TASKS_V)
 
-
+SUB_TASKS_D = ['ViDoRe-V1', 'ViDoRe-V2', 'VisRAG', 'VisDoc-OOD']
+TASKS_D = ['Visdoc-Overall'] + SUB_TASKS_D + ALL_DATASETS_SPLITS['visdoc']
 COLUMN_NAMES_D = BASE_COLS + TASKS_D
 DATA_TITLE_TYPE_D = BASE_DATA_TITLE_TYPE + \
-    ['number'] * len(TASKS_D)
+    ['number'] * len(TASKS_D + SUB_TASKS_D)
 
 TABLE_INTRODUCTION = """**MMEB**: Massive MultiModal Embedding Benchmark \n
 Models are ranked based on **Overall**"""
@@ -147,10 +152,10 @@ def generate_model_row(data):
     row.update(scores)
     return row
 
-def rank_models(df, column='Overall'):
+def rank_models(df, column='Overall', rank_name='Rank'):
     """Ranks the models based on the specific score."""
     df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
-    df['Rank'] = range(1, len(df) + 1)
+    df[rank_name] = range(1, len(df) + 1)
     return df
 
 def get_df():
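The updated `rank_models` is copied below and exercised on a toy two-row frame to show the call pattern the new Visual Doc tab relies on; the model names and scores are illustrative only:

```python
# rank_models as defined in this PR's utils_v2.py, applied to made-up data.
import pandas as pd

def rank_models(df, column='Overall', rank_name='Rank'):
    """Ranks the models based on the specific score."""
    df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
    df[rank_name] = range(1, len(df) + 1)
    return df

toy = pd.DataFrame({'Rank': [0, 0],
                    'Models': ['model-a', 'model-b'],
                    'Visdoc-Overall': [41.3, 47.9]})
print(rank_models(toy, 'Visdoc-Overall'))   # model-b gets Rank 1, model-a gets Rank 2
```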
|