MINGYISU committed
Commit a79c335 · 1 Parent(s): dc50cf5

updated info

Files changed (3)
  1. datasets.py +35 -0
  2. utils.py +8 -3
  3. utils_v2.py +10 -37
datasets.py ADDED
@@ -0,0 +1,35 @@
+ def sum_lol(lol):
+     assert isinstance(lol, list) and all(isinstance(i, list) for i in lol), f"Input should be a list of lists, got {type(lol)}"
+     total = []
+     for sublist in lol:
+         total.extend(sublist)
+     return total
+
+ SCORE_BASE_DIR = "scores"
+ META_DATA = ["model_name", "model_size", "url"]
+ DATASETS = {
+     "image": {
+         "I-CLS": ['VOC2007', 'N24News', 'SUN397', 'ObjectNet', 'Country211', 'Place365', 'ImageNet-1K', 'HatefulMemes', 'ImageNet-A', 'ImageNet-R'],
+         "I-QA": ['OK-VQA', 'A-OKVQA', 'DocVQA', 'InfographicsVQA', 'ChartQA', 'Visual7W', 'ScienceQA', 'GQA', 'TextVQA', 'VizWiz'],
+         "I-RET": ['VisDial', 'CIRR', 'VisualNews_t2i', 'VisualNews_i2t', 'MSCOCO_t2i', 'MSCOCO_i2t', 'NIGHTS', 'WebQA', 'FashionIQ', 'Wiki-SS-NQ', 'OVEN', 'EDIS'],
+         "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W-Pointing']
+     },
+     "visdoc": {
+         "ViDoRe-V1": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry'],
+         "ViDoRe-V2": ["ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2_multilingual"],  # "ViDoRe_biomedical_lectures_v2", "ViDoRe_economics_reports_v2", "ViDoRe_esg_reports_v2"
+         "VisRAG": ['VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA'],
+         "VisDoc-OOD": ['ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
+     },
+     "video": {
+         "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
+         "V-QA": ['Video-MME', 'MVBench', 'NExTQA', 'EgoSchema', 'ActivityNetQA'],
+         "V-RET": ['MSR-VTT', 'MSVD', 'DiDeMo', 'VATEX', 'YouCook2'],
+         "V-MRET": ['QVHighlight', 'Charades-STA', 'MomentSeeker']
+     }
+ }
+ ALL_DATASETS_SPLITS = {k: sum_lol(list(v.values())) for k, v in DATASETS.items()}
+ ALL_DATASETS = sum_lol(list(ALL_DATASETS_SPLITS.values()))
+ MODALITIES = list(DATASETS.keys())
+ SPECIAL_METRICS = {
+     '__default__': 'hit@1',
+ }
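As a quick orientation for the new module, here is a minimal sketch of how its exports compose. The counts below are derived from the `DATASETS` dict in the diff, and the sketch assumes it is run from the repo root, where this local `datasets.py` shadows the PyPI `datasets` package on `sys.path`:

```python
from datasets import DATASETS, ALL_DATASETS_SPLITS, ALL_DATASETS, MODALITIES, sum_lol

# sum_lol flattens exactly one nesting level: a list of lists -> one list.
assert sum_lol([[1, 2], [3]]) == [1, 2, 3]

# ALL_DATASETS_SPLITS collapses each modality's task splits into one flat list.
print(MODALITIES)                          # ['image', 'visdoc', 'video']
print(len(ALL_DATASETS_SPLITS["image"]))   # 36 (10 I-CLS + 10 I-QA + 12 I-RET + 4 I-VG)
print(len(ALL_DATASETS_SPLITS["visdoc"]))  # 24
print(len(ALL_DATASETS_SPLITS["video"]))   # 19
print(len(ALL_DATASETS))                   # 79 dataset names in total
```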
utils.py CHANGED
@@ -6,8 +6,11 @@ import os
  import requests
  import io
  import shutil
+ import pprint as pp
  from huggingface_hub import Repository

+ from datasets import DATASETS
+
  HF_TOKEN = os.environ.get("HF_TOKEN")

  BASE_COLS = ["Rank", "Models", "Model Size(B)", "Data Source"]
@@ -33,7 +36,7 @@ Building upon **MMEB-V1**, **MMEB-V2** expands the evaluation scope to include
  This comprehensive suite enables robust evaluation of multimodal embedding models across static, temporal, and structured visual data settings.

  | [**📈Overview**](https://tiger-ai-lab.github.io/VLM2Vec/) | [**Github**](https://github.com/TIGER-AI-Lab/VLM2Vec)
- | [**📖MMEB-V2/VLM2Vec-V2 Paper (TBA)**](https://arxiv.org/abs/2410.05160)
+ | [**📖MMEB-V2/VLM2Vec-V2 Paper**](https://arxiv.org/abs/2507.04590)
  | [**📖MMEB-V1/VLM2Vec-V1 Paper**](https://arxiv.org/abs/2410.05160)
  | [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2)
  | [**Discord**](https://discord.gg/njyKubdtry) |
@@ -42,8 +45,10 @@ This comprehensive suite enables robust evaluation of multimodal embedding model
  TABLE_INTRODUCTION = """***Important Notes:***
  This is the MMEB-V1 leaderboard, which is now deprecated. MMEB-V1 is now the Image section of MMEB-V2, and the results on this leaderboard have been integrated into the MMEB-V2 Image tab. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. Thank you for your collaboration and understanding! \n"""

- LEADERBOARD_INFO = """
- ## Dataset Summary
+ LEADERBOARD_INFO = f"""
+ ## Dataset Overview
+ This is the dictionary of all datasets used in our code. Please make sure all datasets' scores are included in your submission. \n
+ {pp.pformat(DATASETS)}
  """

  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
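Since the new `LEADERBOARD_INFO` is an f-string, `pp.pformat(DATASETS)` is evaluated once when utils.py is imported, baking the pretty-printed dict into the leaderboard text. A toy sketch of the pattern (the two-entry dict here is a stand-in, not the real `DATASETS`):

```python
import pprint as pp

# Stand-in for the real DATASETS dict that utils.py imports from datasets.py.
DATASETS = {"image": {"I-CLS": ["VOC2007", "N24News"]}}

# Same pattern as the diff: the f-string is formatted at import time,
# so the rendered leaderboard text is fixed once per process.
LEADERBOARD_INFO = f"""
## Dataset Overview
This is the dictionary of all datasets used in our code. Please make sure all datasets' scores are included in your submission.

{pp.pformat(DATASETS)}
"""

print(LEADERBOARD_INFO)
```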
utils_v2.py CHANGED
@@ -2,42 +2,7 @@ import json
  import os
  import pandas as pd
  from utils import create_hyperlinked_names, process_model_size
-
- def sum_lol(lol):
-     assert isinstance(lol, list) and all(isinstance(i, list) for i in lol), f"Input should be a list of lists, got {type(lol)}"
-     total = []
-     for sublist in lol:
-         total.extend(sublist)
-     return total
-
- SCORE_BASE_DIR = "scores"
- META_DATA = ["model_name", "model_size", "url"]
- DATASETS = {
-     "image": {
-         "I-CLS": ['VOC2007', 'N24News', 'SUN397', 'ObjectNet', 'Country211', 'Place365', 'ImageNet-1K', 'HatefulMemes', 'ImageNet-A', 'ImageNet-R'],
-         "I-QA": ['OK-VQA', 'A-OKVQA', 'DocVQA', 'InfographicsVQA', 'ChartQA', 'Visual7W', 'ScienceQA', 'GQA', 'TextVQA', 'VizWiz'],
-         "I-RET": ['VisDial', 'CIRR', 'VisualNews_t2i', 'VisualNews_i2t', 'MSCOCO_t2i', 'MSCOCO_i2t', 'NIGHTS', 'WebQA', 'FashionIQ', 'Wiki-SS-NQ', 'OVEN', 'EDIS'],
-         "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W-Pointing']
-     },
-     "visdoc": {
-         "ViDoRe-V1": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry'],
-         "ViDoRe-V2": ["ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2_multilingual"],  # "ViDoRe_biomedical_lectures_v2", "ViDoRe_economics_reports_v2", "ViDoRe_esg_reports_v2"
-         "VisRAG": ['VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA'],
-         "VisDoc-OOD": ['ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
-     },
-     "video": {
-         "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
-         "V-QA": ['Video-MME', 'MVBench', 'NExTQA', 'EgoSchema', 'ActivityNetQA'],
-         "V-RET": ['MSR-VTT', 'MSVD', 'DiDeMo', 'VATEX', 'YouCook2'],
-         "V-MRET": ['QVHighlight', 'Charades-STA', 'MomentSeeker']
-     }
- }
- ALL_DATASETS_SPLITS = {k: sum_lol(list(v.values())) for k, v in DATASETS.items()}
- ALL_DATASETS = sum_lol(list(ALL_DATASETS_SPLITS.values()))
- MODALITIES = list(DATASETS.keys())
- SPECIAL_METRICS = {
-     '__default__': 'hit@1',
- }
+ from datasets import *

  BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
  BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']
@@ -77,7 +42,15 @@ LEADERBOARD_INFO = """
  ## Dataset Summary
  """

- CITATION_BUTTON_TEXT = r"""TBA"""
+ CITATION_BUTTON_TEXT = r"""@misc{meng2025vlm2vecv2advancingmultimodalembedding,
+       title={VLM2Vec-V2: Advancing Multimodal Embedding for Videos, Images, and Visual Documents},
+       author={Rui Meng and Ziyan Jiang and Ye Liu and Mingyi Su and Xinyi Yang and Yuepeng Fu and Can Qin and Zeyuan Chen and Ran Xu and Caiming Xiong and Yingbo Zhou and Wenhu Chen and Semih Yavuz},
+       year={2025},
+       eprint={2507.04590},
+       archivePrefix={arXiv},
+       primaryClass={cs.CV},
+       url={https://arxiv.org/abs/2507.04590},
+ }"""

  def load_single_json(file_path):
      with open(file_path, 'r') as file:
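Because datasets.py defines no `__all__`, the `from datasets import *` in this diff binds every top-level name that does not start with an underscore. A sketch of the explicit equivalent, plus the metric-lookup pattern that `SPECIAL_METRICS` supports (the `"VOC2007"` lookup is illustrative, not from the commit):

```python
# Explicit equivalent of `from datasets import *` as used in utils_v2.py.
from datasets import (
    sum_lol, SCORE_BASE_DIR, META_DATA, DATASETS,
    ALL_DATASETS_SPLITS, ALL_DATASETS, MODALITIES, SPECIAL_METRICS,
)

# Per-dataset metric lookup falls back to the default entry.
metric = SPECIAL_METRICS.get("VOC2007", SPECIAL_METRICS["__default__"])
print(metric)  # 'hit@1'
```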