update paper info (#61)
updated info (a79c335634714f44385563180292e85e41370c17)
- datasets.py +35 -0
- utils.py +8 -3
- utils_v2.py +10 -37
datasets.py
ADDED
@@ -0,0 +1,35 @@
+def sum_lol(lol):
+    assert isinstance(lol, list) and all(isinstance(i, list) for i in lol), f"Input should be a list of lists, got {type(lol)}"
+    total = []
+    for sublist in lol:
+        total.extend(sublist)
+    return total
+
+SCORE_BASE_DIR = "scores"
+META_DATA = ["model_name", "model_size", "url"]
+DATASETS = {
+    "image": {
+        "I-CLS": ['VOC2007', 'N24News', 'SUN397', 'ObjectNet', 'Country211', 'Place365', 'ImageNet-1K', 'HatefulMemes', 'ImageNet-A', 'ImageNet-R'],
+        "I-QA": ['OK-VQA', 'A-OKVQA', 'DocVQA', 'InfographicsVQA', 'ChartQA', 'Visual7W', 'ScienceQA', 'GQA', 'TextVQA', 'VizWiz'],
+        "I-RET": ['VisDial', 'CIRR', 'VisualNews_t2i', 'VisualNews_i2t', 'MSCOCO_t2i', 'MSCOCO_i2t', 'NIGHTS', 'WebQA', 'FashionIQ', 'Wiki-SS-NQ', 'OVEN', 'EDIS'],
+        "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W-Pointing']
+    },
+    "visdoc": {
+        "ViDoRe-V1": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry'],
+        "ViDoRe-V2": ["ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2_multilingual"],  # "ViDoRe_biomedical_lectures_v2", "ViDoRe_economics_reports_v2", "ViDoRe_esg_reports_v2"
+        "VisRAG": ['VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA'],
+        "VisDoc-OOD": ['ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
+    },
+    "video": {
+        "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
+        "V-QA": ['Video-MME', 'MVBench', 'NExTQA', 'EgoSchema', 'ActivityNetQA'],
+        "V-RET": ['MSR-VTT', 'MSVD', 'DiDeMo', 'VATEX', 'YouCook2'],
+        "V-MRET": ['QVHighlight', 'Charades-STA', 'MomentSeeker']
+    }
+}
+ALL_DATASETS_SPLITS = {k: sum_lol(list(v.values())) for k, v in DATASETS.items()}
+ALL_DATASETS = sum_lol(list(ALL_DATASETS_SPLITS.values()))
+MODALITIES = list(DATASETS.keys())
+SPECIAL_METRICS = {
+    '__default__': 'hit@1',
+}
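For context, a minimal usage sketch (not part of this commit) of what the new module exposes once datasets.py is importable; the dataset counts in the comments are tallied from the lists above and stated as an assumption rather than verified output:

from datasets import DATASETS, ALL_DATASETS_SPLITS, ALL_DATASETS, MODALITIES, sum_lol

# sum_lol flattens a list of lists into a single list.
assert sum_lol([[1, 2], [3]]) == [1, 2, 3]

print(MODALITIES)  # ['image', 'visdoc', 'video']
print({k: len(v) for k, v in ALL_DATASETS_SPLITS.items()})
# From the lists above this should be {'image': 36, 'visdoc': 24, 'video': 18}
print(len(ALL_DATASETS))  # 78 datasets across all modalities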
utils.py
CHANGED
@@ -6,8 +6,11 @@ import os
 import requests
 import io
 import shutil
+import pprint as pp
 from huggingface_hub import Repository
 
+from datasets import DATASETS
+
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
 BASE_COLS = ["Rank", "Models", "Model Size(B)", "Data Source"]
@@ -33,7 +36,7 @@ Building upon on **MMEB-V1**, **MMEB-V2** expands the evaluation scope to includ
 This comprehensive suite enables robust evaluation of multimodal embedding models across static, temporal, and structured visual data settings.
 
 | [**📈Overview**](https://tiger-ai-lab.github.io/VLM2Vec/) | [**Github**](https://github.com/TIGER-AI-Lab/VLM2Vec)
-| [**📖MMEB-V2/VLM2Vec-V2 Paper
+| [**📖MMEB-V2/VLM2Vec-V2 Paper**](https://arxiv.org/abs/2507.04590)
 | [**📖MMEB-V1/VLM2Vec-V1 Paper**](https://arxiv.org/abs/2410.05160)
 | [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2)
 | [**Discord**](https://discord.gg/njyKubdtry) |
@@ -42,8 +45,10 @@ This comprehensive suite enables robust evaluation of multimodal embedding model
 TABLE_INTRODUCTION = """***Important Notes: ***
 This is the MMEB-V1 leaderboard, which is now deprecated. MMEB-V1 is now the Image section of MMEB-V2, and the results on this leaderboard have been integrated into MMEB-V2 Image tab. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. Thank you for your collaborations and understanding! \n"""
 
-LEADERBOARD_INFO = """
-## Dataset
+LEADERBOARD_INFO = f"""
+## Dataset Overview
+This is the dictionary of all datasets used in our code. Please make sure all datasets' scores are included in your submission. \n
+{pp.pformat(DATASETS)}
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
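The rewritten LEADERBOARD_INFO relies on pprint.pformat to embed the nested DATASETS dict into the f-string. A small illustration of that rendering, using a one-key sample dict rather than the full table:

import pprint as pp

sample = {"video": {"V-MRET": ["QVHighlight", "Charades-STA", "MomentSeeker"]}}

# pformat returns an indented, reproducible string form of the dict,
# which is what gets interpolated into LEADERBOARD_INFO above.
print(pp.pformat(sample))
# {'video': {'V-MRET': ['QVHighlight', 'Charades-STA', 'MomentSeeker']}}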
utils_v2.py
CHANGED
@@ -2,42 +2,7 @@ import json
 import os
 import pandas as pd
 from utils import create_hyperlinked_names, process_model_size
-
-def sum_lol(lol):
-    assert isinstance(lol, list) and all(isinstance(i, list) for i in lol), f"Input should be a list of lists, got {type(lol)}"
-    total = []
-    for sublist in lol:
-        total.extend(sublist)
-    return total
-
-SCORE_BASE_DIR = "scores"
-META_DATA = ["model_name", "model_size", "url"]
-DATASETS = {
-    "image": {
-        "I-CLS": ['VOC2007', 'N24News', 'SUN397', 'ObjectNet', 'Country211', 'Place365', 'ImageNet-1K', 'HatefulMemes', 'ImageNet-A', 'ImageNet-R'],
-        "I-QA": ['OK-VQA', 'A-OKVQA', 'DocVQA', 'InfographicsVQA', 'ChartQA', 'Visual7W', 'ScienceQA', 'GQA', 'TextVQA', 'VizWiz'],
-        "I-RET": ['VisDial', 'CIRR', 'VisualNews_t2i', 'VisualNews_i2t', 'MSCOCO_t2i', 'MSCOCO_i2t', 'NIGHTS', 'WebQA', 'FashionIQ', 'Wiki-SS-NQ', 'OVEN', 'EDIS'],
-        "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W-Pointing']
-    },
-    "visdoc": {
-        "ViDoRe-V1": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry'],
-        "ViDoRe-V2": ["ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2_multilingual"],  # "ViDoRe_biomedical_lectures_v2", "ViDoRe_economics_reports_v2", "ViDoRe_esg_reports_v2"
-        "VisRAG": ['VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA'],
-        "VisDoc-OOD": ['ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
-    },
-    "video": {
-        "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
-        "V-QA": ['Video-MME', 'MVBench', 'NExTQA', 'EgoSchema', 'ActivityNetQA'],
-        "V-RET": ['MSR-VTT', 'MSVD', 'DiDeMo', 'VATEX', 'YouCook2'],
-        "V-MRET": ['QVHighlight', 'Charades-STA', 'MomentSeeker']
-    }
-}
-ALL_DATASETS_SPLITS = {k: sum_lol(list(v.values())) for k, v in DATASETS.items()}
-ALL_DATASETS = sum_lol(list(ALL_DATASETS_SPLITS.values()))
-MODALITIES = list(DATASETS.keys())
-SPECIAL_METRICS = {
-    '__default__': 'hit@1',
-}
+from datasets import *
 
 BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
 BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']
@@ -77,7 +42,15 @@ LEADERBOARD_INFO = """
 ## Dataset Summary
 """
 
-CITATION_BUTTON_TEXT = r"""
+CITATION_BUTTON_TEXT = r"""@misc{meng2025vlm2vecv2advancingmultimodalembedding,
+      title={VLM2Vec-V2: Advancing Multimodal Embedding for Videos, Images, and Visual Documents},
+      author={Rui Meng and Ziyan Jiang and Ye Liu and Mingyi Su and Xinyi Yang and Yuepeng Fu and Can Qin and Zeyuan Chen and Ran Xu and Caiming Xiong and Yingbo Zhou and Wenhu Chen and Semih Yavuz},
+      year={2025},
+      eprint={2507.04590},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2507.04590},
+}"""
 
 def load_single_json(file_path):
     with open(file_path, 'r') as file:
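With the shared constants now coming from datasets.py via the star import, a submission check can be written against ALL_DATASETS_SPLITS, in line with the "make sure all datasets' scores are included" note in LEADERBOARD_INFO. The helper below is a hypothetical sketch, not code from this Space, and the {modality: {dataset_name: score}} layout is an assumption about the submitted score files:

from datasets import ALL_DATASETS_SPLITS

def find_missing_scores(scores: dict) -> dict:
    """Return, per modality, the datasets that have no reported score."""
    missing = {}
    for modality, dataset_names in ALL_DATASETS_SPLITS.items():
        reported = scores.get(modality, {})
        absent = [name for name in dataset_names if name not in reported]
        if absent:
            missing[modality] = absent
    return missing

# Example: a submission covering only two image datasets would be flagged as
# incomplete for every other dataset and modality.
partial = {"image": {"VOC2007": 0.92, "SUN397": 0.81}}
print({k: len(v) for k, v in find_missing_scores(partial).items()})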