import json
import os

import pandas as pd

from utils import create_hyperlinked_names, process_model_size
# Explicit imports from the local datasets module (replaces a wildcard import,
# so this file's dependencies on that module are visible).
from datasets import (
    ALL_DATASETS,
    ALL_DATASETS_SPLITS,
    DATASETS,
    MODALITIES,
    SCORE_BASE_DIR,
    SPECIAL_METRICS,
)
BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']

COLUMN_NAMES = BASE_COLS + ['Overall', 'Image-Overall', 'Video-Overall', 'Visdoc-Overall']
DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + ['number'] * 3

SUB_TASKS_I = ['I-CLS', 'I-QA', 'I-RET', 'I-VG']
TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
COLUMN_NAMES_I = BASE_COLS + TASKS_I
# TASKS_I already contains the sub-task columns, so size the datatype list on
# TASKS_I alone; len(TASKS_I + SUB_TASKS_I) would double-count the sub-tasks.
DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + ['number'] * len(TASKS_I)

SUB_TASKS_V = ['V-CLS', 'V-QA', 'V-RET', 'V-MRET']
TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
COLUMN_NAMES_V = BASE_COLS + TASKS_V
DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + ['number'] * len(TASKS_V)

SUB_TASKS_D = ['ViDoRe-V1', 'ViDoRe-V2', 'VisRAG', 'VisDoc-OOD']
TASKS_D = ['Visdoc-Overall'] + SUB_TASKS_D + ALL_DATASETS_SPLITS['visdoc']
COLUMN_NAMES_D = BASE_COLS + TASKS_D
DATA_TITLE_TYPE_D = BASE_DATA_TITLE_TYPE + ['number'] * len(TASKS_D)
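# The header/datatype pairs above are presumably consumed by gradio Dataframe
# components elsewhere in the Space (gradio is not imported here); a minimal
# sketch of the assumed usage:
#
#   import gradio as gr
#   gr.Dataframe(headers=COLUMN_NAMES, datatype=DATA_TITLE_TYPE,
#                value=refresh_data())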
TABLE_INTRODUCTION = """**MMEB**: Massive MultiModal Embedding Benchmark \n | |
Models are ranked based on **Overall**""" | |
TABLE_INTRODUCTION_I = """**I-CLS**: Image Classification, **I-QA**: (Image) Visual Question Answering, **I-RET**: Image Retrieval, **I-VG**: (Image) Visual Grounding \n | |
Models are ranked based on **Image-Overall**""" | |
TABLE_INTRODUCTION_V = """**V-CLS**: Video Classification, **V-QA**: (Video) Visual Question Answering, **V-RET**: Video Retrieval, **V-MRET**: Video Moment Retrieval \n | |
Models are ranked based on **Video-Overall**""" | |
TABLE_INTRODUCTION_D = """**VisDoc**: Visual Document Understanding \n | |
Models are ranked based on **VisDoc**""" | |
LEADERBOARD_INFO = """ | |
## Dataset Summary | |
""" | |
CITATION_BUTTON_TEXT = r"""@misc{meng2025vlm2vecv2advancingmultimodalembedding, | |
title={VLM2Vec-V2: Advancing Multimodal Embedding for Videos, Images, and Visual Documents}, | |
author={Rui Meng and Ziyan Jiang and Ye Liu and Mingyi Su and Xinyi Yang and Yuepeng Fu and Can Qin and Zeyuan Chen and Ran Xu and Caiming Xiong and Yingbo Zhou and Wenhu Chen and Semih Yavuz}, | |
year={2025}, | |
eprint={2507.04590}, | |
archivePrefix={arXiv}, | |
primaryClass={cs.CV}, | |
url={https://arxiv.org/abs/2507.04590}, | |
}""" | |
def load_single_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data


def load_data(base_dir=SCORE_BASE_DIR):
    all_data = []
    for file_name in os.listdir(base_dir):
        if file_name.endswith('.json'):
            file_path = os.path.join(base_dir, file_name)
            data = load_single_json(file_path)
            all_data.append(data)
    return all_data
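# Expected shape of each score JSON, inferred from generate_model_row() and
# load_scores() below (fields other than those read there are not guaranteed):
#
#   {
#     "metadata": {"model_name": ..., "model_size": ..., "url": ...,
#                  "data_source": ...},
#     "metrics": {"image": {"VOC2007": 0.8532, ...},
#                 "video": {...}, "visdoc": {...}}
#   }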
def load_scores(raw_scores=None):
    """Loads the raw scores from the user-provided scores summary and flattens
    them into a single {dataset: score} dictionary (scores scaled to percentages)."""
    raw_scores = raw_scores or {}  # tolerate a missing summary
    all_scores = {}
    for modality, datasets_list in DATASETS.items():  # e.g. ('image', {'I-CLS': [...], 'I-QA': [...]})
        for sub_task, datasets in datasets_list.items():  # e.g. ('I-CLS', ['VOC2007', 'N24News', ...])
            for dataset in datasets:  # e.g. 'VOC2007'
                score = raw_scores.get(modality, {}).get(dataset, 0.0)
                score = 0.0 if score == "FILE_N/A" else score
                metric = SPECIAL_METRICS.get(dataset, 'hit@1')
                if isinstance(score, dict):
                    if modality == 'visdoc':
                        metric = "ndcg_linear@5" if "ndcg_linear@5" in score else "ndcg@5"
                    score = score.get(metric, 0.0)
                all_scores[dataset] = round(score * 100.0, 2)
    return all_scores
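# A minimal example of the flattening, with hypothetical scores (and assuming
# 'hit@1' is the metric recorded for 'OK-VQA'):
#
#   load_scores({'image': {'VOC2007': 0.8532,
#                          'OK-VQA': {'hit@1': 0.6001}}})
#   # -> {'VOC2007': 85.32, 'OK-VQA': 60.01, <all other datasets>: 0.0}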
def calculate_score(raw_scores=None):
    """Calculates the overall average score across all datasets, plus average
    scores for each modality and each sub-task, from the raw scores."""
    def get_avg(sum_score, leng):
        avg = sum_score / leng if leng > 0 else 0.0
        return round(avg, 2)  # round to 2 decimal places

    all_scores = load_scores(raw_scores)
    avg_scores = {}
    # Overall average across all datasets
    avg_scores['Overall'] = get_avg(sum(all_scores.values()), len(ALL_DATASETS))
    # Per-modality averages
    for modality in MODALITIES:
        datasets_for_each_modality = ALL_DATASETS_SPLITS[modality]
        avg_scores[f"{modality.capitalize()}-Overall"] = get_avg(
            sum(all_scores.get(dataset, 0.0) for dataset in datasets_for_each_modality),
            len(datasets_for_each_modality)
        )
    # Per-sub-task averages
    for modality, datasets_list in DATASETS.items():
        for sub_task, datasets in datasets_list.items():
            sub_task_score = sum(all_scores.get(dataset, 0.0) for dataset in datasets)
            avg_scores[sub_task] = get_avg(sub_task_score, len(datasets))
    all_scores.update(avg_scores)
    return all_scores
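# After calculate_score(), the returned dict holds one key per dataset plus
# the derived keys used as leaderboard columns, e.g.:
#
#   {'VOC2007': 85.32, ..., 'I-CLS': ..., 'V-RET': ...,
#    'Image-Overall': ..., 'Video-Overall': ..., 'Visdoc-Overall': ...,
#    'Overall': ...}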
def generate_model_row(data):
    metadata = data['metadata']
    row = {
        'Models': metadata.get('model_name', None),
        'Model Size(B)': metadata.get('model_size', None),
        'URL': metadata.get('url', None),
        'Data Source': metadata.get('data_source', 'Self-Reported'),
    }
    scores = calculate_score(data['metrics'])
    row.update(scores)
    return row
def rank_models(df, column='Overall', rank_name='Rank'):
    """Sorts models by the given score column (descending) and assigns 1-based ranks."""
    df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
    df[rank_name] = range(1, len(df) + 1)
    return df
def get_df():
    """Generates a DataFrame from the loaded data."""
    all_data = load_data()
    rows = [generate_model_row(data) for data in all_data]
    df = pd.DataFrame(rows)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    df = create_hyperlinked_names(df)
    df = rank_models(df, column='Overall')
    return df


def refresh_data():
    df = get_df()
    return df[COLUMN_NAMES]
def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()
    if query:
        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]
    # Rows with an 'unknown' size are kept only when the slider range covers
    # the 1000B sentinel, i.e. when the size filter is effectively disabled.
    size_mask = filtered_df['Model Size(B)'].apply(
        lambda x: (min_size <= 1000.0 <= max_size) if x == 'unknown'
        else (min_size <= x <= max_size))
    filtered_df = filtered_df[size_mask]
    return filtered_df[COLUMN_NAMES]
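
if __name__ == "__main__":
    # Minimal local smoke test (an assumption, not part of the Space's UI):
    # requires SCORE_BASE_DIR to point at a directory of score JSON files in
    # the shape documented above load_scores().
    leaderboard = refresh_data()
    print(leaderboard.head())
    print(search_and_filter_models(get_df(), query="VLM2Vec",
                                   min_size=0.0, max_size=1000.0).head())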