Spaces:
Running
Running
import json | |
import os | |
import pandas as pd | |
from utils import create_hyperlinked_names, process_model_size | |
def sum_lol(lol):
    """Flatten a list of lists by one level.

    Args:
        lol: A list whose elements are all lists.

    Returns:
        A new list holding the items of every sub-list, in their original
        order.

    Raises:
        AssertionError: If ``lol`` is not a list, or if any of its elements
            is not a list.
    """
    is_list_of_lists = isinstance(lol, list) and all(isinstance(i, list) for i in lol)
    assert is_list_of_lists, f"Input should be a list of lists, got {type(lol)}"
    return [item for sublist in lol for item in sublist]
# Default directory scanned for per-model score files.
SCORE_BASE_DIR = "scores"

# Metadata field names associated with a submission.
META_DATA = ["model_name", "model_size", "url"]

# Benchmark layout: modality -> sub-task -> dataset names.
DATASETS = {
    "image": {
        "I-CLS": [
            "VOC2007", "N24News", "SUN397", "ObjectNet", "Country211",
            "Place365", "ImageNet-1K", "HatefulMemes", "ImageNet-A", "ImageNet-R",
        ],
        "I-QA": [
            "OK-VQA", "A-OKVQA", "DocVQA", "InfographicsVQA", "ChartQA",
            "Visual7W", "ScienceQA", "GQA", "TextVQA", "VizWiz",
        ],
        "I-RET": [
            "VisDial", "CIRR", "VisualNews_t2i", "VisualNews_i2t",
            "MSCOCO_t2i", "MSCOCO_i2t", "NIGHTS", "WebQA", "FashionIQ",
            "Wiki-SS-NQ", "OVEN", "EDIS",
        ],
        "I-VG": ["MSCOCO", "RefCOCO", "RefCOCO-Matching", "Visual7W-Pointing"],
    },
    "visdoc": {
        "ViDoRe-V1": [
            "ViDoRe_arxivqa",
            "ViDoRe_docvqa",
            "ViDoRe_infovqa",
            "ViDoRe_tabfquad",
            "ViDoRe_tatdqa",
            "ViDoRe_shiftproject",
            "ViDoRe_syntheticDocQA_artificial_intelligence",
            "ViDoRe_syntheticDocQA_energy",
            "ViDoRe_syntheticDocQA_government_reports",
            "ViDoRe_syntheticDocQA_healthcare_industry",
        ],
        # The following splits were abandoned and are intentionally not listed:
        # "ViDoRe_biomedical_lectures_v2_multilingual",
        # "ViDoRe_economics_reports_v2_multilingual",
        # "ViDoRe_esg_reports_v2_multilingual"
        "ViDoRe-V2": [
            "ViDoRe_esg_reports_human_labeled_v2",
            "ViDoRe_biomedical_lectures_v2",
            "ViDoRe_economics_reports_v2",
            "ViDoRe_esg_reports_v2",
        ],
        "VisRAG": [
            "VisRAG_ArxivQA", "VisRAG_ChartQA", "VisRAG_MP-DocVQA",
            "VisRAG_SlideVQA", "VisRAG_InfoVQA", "VisRAG_PlotQA",
        ],
        "VisDoc-OOD": [
            "ViDoSeek-page", "ViDoSeek-doc", "MMLongBench-page", "MMLongBench-doc",
        ],
    },
    "video": {
        "V-CLS": ["K700", "UCF101", "HMDB51", "SmthSmthV2", "Breakfast"],
        "V-QA": ["Video-MME", "MVBench", "NExTQA", "EgoSchema"],
        "V-RET": ["MSR-VTT", "MSVD", "DiDeMo", "VATEX", "YouCook2"],
        "V-MRET": ["QVHighlight", "Charades-STA", "MomentSeeker", "ActivityNetQA"],
    },
}

# modality -> flat list of every dataset belonging to that modality.
ALL_DATASETS_SPLITS = {
    modality: sum_lol(list(sub_tasks.values()))
    for modality, sub_tasks in DATASETS.items()
}

# Flat list of every dataset across all modalities.
ALL_DATASETS = sum_lol(list(ALL_DATASETS_SPLITS.values()))

# Modality names, in the order they appear in DATASETS.
MODALITIES = list(DATASETS)

# Per-dataset metric overrides, looked up by dataset name.
SPECIAL_METRICS = {
    "__default__": "hit@1",
}
# Columns that lead every leaderboard table.
BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
# Display types for the leading columns (presumably consumed by the UI table
# component - confirm against the caller). This list carries one more entry
# than BASE_COLS; only the first len(BASE_COLS) entries are paired with a
# column when the per-table type lists are built below.
BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']


def _column_types(columns):
    """Return exactly one display type per entry of ``columns``.

    ``columns`` is expected to start with ``BASE_COLS``; those take their
    types from ``BASE_DATA_TITLE_TYPE`` and every remaining (score) column
    is typed as ``'number'``.
    """
    n_base = len(BASE_COLS)
    return BASE_DATA_TITLE_TYPE[:n_base] + ['number'] * (len(columns) - n_base)


# --- Overall table -----------------------------------------------------------
COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'Visdoc-Overall']
DATA_TITLE_TYPE = _column_types(COLUMN_NAMES)

# --- Image table -------------------------------------------------------------
SUB_TASKS_I = ["I-CLS", "I-QA", "I-RET", "I-VG"]
TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
COLUMN_NAMES_I = BASE_COLS + TASKS_I
DATA_TITLE_TYPE_I = _column_types(COLUMN_NAMES_I)

# --- Video table -------------------------------------------------------------
SUB_TASKS_V = ["V-CLS", "V-QA", "V-RET", "V-MRET"]
TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
COLUMN_NAMES_V = BASE_COLS + TASKS_V
DATA_TITLE_TYPE_V = _column_types(COLUMN_NAMES_V)

# --- Visual-document table ---------------------------------------------------
SUB_TASKS_D = ['ViDoRe-V1', 'ViDoRe-V2', 'VisRAG', 'VisDoc-OOD']
TASKS_D = ['Visdoc-Overall'] + SUB_TASKS_D + ALL_DATASETS_SPLITS['visdoc']
COLUMN_NAMES_D = BASE_COLS + TASKS_D
DATA_TITLE_TYPE_D = _column_types(COLUMN_NAMES_D)
# Markdown blurb for the overall table.
TABLE_INTRODUCTION = """**MMEB**: Massive MultiModal Embedding Benchmark \n
Models are ranked based on **Overall**"""

# Markdown blurb for the image table; spells out the I-* sub-task abbreviations.
TABLE_INTRODUCTION_I = """**I-CLS**: Image Classification, **I-QA**: (Image) Visual Question Answering, **I-RET**: Image Retrieval, **I-VG**: (Image) Visual Grounding \n
Models are ranked based on **Image-Overall**"""

# Markdown blurb for the video table; spells out the V-* sub-task abbreviations.
TABLE_INTRODUCTION_V = """**V-CLS**: Video Classification, **V-QA**: (Video) Visual Question Answering, **V-RET**: Video Retrieval, **V-MRET**: Video Moment Retrieval \n
Models are ranked based on **Video-Overall**"""

# Markdown blurb for the visual-document table.
# NOTE(review): the other blurbs name the ranking column ("Image-Overall",
# "Video-Overall"); this one says "VisDoc" while the column defined in this
# file is "Visdoc-Overall" - confirm the intended wording.
TABLE_INTRODUCTION_D = """**VisDoc**: Visual Document Understanding \n
Models are ranked based on **VisDoc**"""

# Markdown body for the leaderboard information section (heading only so far).
LEADERBOARD_INFO = """
## Dataset Summary
"""

# Citation text; currently a "TBA" placeholder.
CITATION_BUTTON_TEXT = r"""TBA"""
def load_single_json(file_path):
    """Read one JSON file and return the decoded object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so that non-ASCII content (e.g. model names)
    # is read the same way regardless of the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
def load_data(base_dir=SCORE_BASE_DIR):
    """Load every ``*.json`` file found directly inside ``base_dir``.

    Args:
        base_dir: Directory to scan (not recursive). Defaults to
            ``SCORE_BASE_DIR``.

    Returns:
        A list with the decoded content of each JSON file, ordered by file
        name.

    Raises:
        OSError: If ``base_dir`` cannot be listed or a file cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    # os.listdir returns names in arbitrary order; sorting makes the result
    # (and therefore the order of tied models downstream) reproducible.
    return [
        load_single_json(os.path.join(base_dir, file_name))
        for file_name in sorted(os.listdir(base_dir))
        if file_name.endswith('.json')
    ]
def load_scores(raw_scores=None):
    """Flatten a per-modality score summary into a ``{dataset: score}`` dict.

    Every dataset listed in ``DATASETS`` gets an entry. A dataset that is
    missing from ``raw_scores``, or whose value is the string ``"FILE_N/A"``,
    is scored as 0.

    Args:
        raw_scores: Mapping of modality name -> mapping of dataset name ->
            score. A score is either a number or a dict of metric name ->
            number. ``None`` (the default) is treated as an empty summary.

    Returns:
        dict mapping each dataset name to its score multiplied by 100 and
        rounded to two decimals.
    """
    # The default argument is None; treat it (and a null modality entry) as
    # "no scores" instead of failing on ``None.get``.
    raw_scores = raw_scores or {}

    all_scores = {}
    for modality, sub_tasks in DATASETS.items():  # Ex.: ('image', {'I-CLS': [...], 'I-QA': [...]})
        modality_scores = raw_scores.get(modality) or {}
        for datasets in sub_tasks.values():  # Ex.: ['VOC2007', 'N24News', ...]
            for dataset in datasets:  # Ex.: 'VOC2007'
                score = modality_scores.get(dataset, 0.0)
                if score == "FILE_N/A":
                    score = 0.0
                if isinstance(score, dict):
                    # A dict holds several metrics; pick the one to report.
                    if modality == 'visdoc':
                        metric = "ndcg_linear@5" if "ndcg_linear@5" in score else "ndcg@5"
                    else:
                        metric = SPECIAL_METRICS.get(dataset, 'hit@1')
                    score = score.get(metric, 0.0)
                all_scores[dataset] = round(score * 100.0, 2)
    return all_scores
def calculate_score(raw_scores=None):
    """Compute per-dataset scores plus their averages.

    The result contains every per-dataset score returned by ``load_scores``
    and, in addition, an ``'Overall'`` average, one ``'<Modality>-Overall'``
    average per modality and one average per sub-task. All averages are
    rounded to two decimals.

    Args:
        raw_scores: Score summary forwarded unchanged to ``load_scores``.

    Returns:
        dict mapping dataset names and average names to scores.
    """
    def _mean(total, count):
        # An empty group averages to 0.0 rather than dividing by zero.
        if count > 0:
            return round(total / count, 2)
        return 0.0

    scores = load_scores(raw_scores)

    def _mean_over(names):
        return _mean(sum(scores.get(name, 0.0) for name in names), len(names))

    # Average over every dataset.
    averages = {'Overall': _mean(sum(scores.values()), len(ALL_DATASETS))}

    # Average per modality.
    for modality in MODALITIES:
        averages[f"{modality.capitalize()}-Overall"] = _mean_over(ALL_DATASETS_SPLITS[modality])

    # Average per sub-task.
    for sub_tasks in DATASETS.values():
        for sub_task, datasets in sub_tasks.items():
            averages[sub_task] = _mean_over(datasets)

    scores.update(averages)
    return scores
def generate_model_row(data):
    """Build one leaderboard row from a loaded score file.

    Args:
        data: Mapping with a ``'metadata'`` entry (model name, size, URL and
            optional data source) and a ``'metrics'`` entry holding the raw
            scores.

    Returns:
        dict with the metadata columns followed by everything
        ``calculate_score`` returns for ``data['metrics']``.

    Raises:
        KeyError: If ``'metadata'`` or ``'metrics'`` is missing.
    """
    metadata = data['metadata']

    row = {}
    row['Models'] = metadata.get('model_name')
    row['Model Size(B)'] = metadata.get('model_size')
    row['URL'] = metadata.get('url')
    # Entries that do not state their source are labelled as self-reported.
    row['Data Source'] = metadata.get('data_source', 'Self-Reported')

    row.update(calculate_score(data['metrics']))
    return row
def rank_models(df, column='Overall', rank_name='Rank'):
    """Sort models by a score column and number them from 1.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to sort by, highest value first.
        rank_name: Name of the column that receives the 1-based position.

    Returns:
        A new DataFrame, sorted in descending order of ``column``, with a
        fresh 0-based index and the rank column set. ``df`` is not modified.
    """
    ranked = df.sort_values(by=column, ascending=False)
    ranked = ranked.reset_index(drop=True)
    n_models = len(ranked)
    ranked[rank_name] = range(1, n_models + 1)
    return ranked
def get_df():
    """Build the full leaderboard DataFrame from the score files.

    Loads every score file via ``load_data``, turns each into a row with
    ``generate_model_row``, passes the model-size column through
    ``process_model_size`` and the frame through ``create_hyperlinked_names``
    (both imported from ``utils``), then ranks by the ``'Overall'`` column.

    Returns:
        The ranked DataFrame with all columns.
    """
    rows = list(map(generate_model_row, load_data()))
    df = pd.DataFrame(rows)

    size_col = 'Model Size(B)'
    df[size_col] = df[size_col].apply(process_model_size)

    linked = create_hyperlinked_names(df)
    return rank_models(linked, column='Overall')
def refresh_data():
    """Rebuild the leaderboard and return only the overall-table columns.

    Returns:
        The DataFrame from ``get_df`` restricted to ``COLUMN_NAMES``.
    """
    return get_df()[COLUMN_NAMES]
def search_and_filter_models(df, query, min_size, max_size):
    """Filter the leaderboard by model name and model size.

    Args:
        df: Leaderboard DataFrame with at least the ``'Models'`` and
            ``'Model Size(B)'`` columns plus every column in
            ``COLUMN_NAMES``.
        query: Case-insensitive search text matched against ``'Models'``.
            A valid regular expression is used as such; text that is not a
            valid pattern (e.g. ``"c++"``) is matched literally. A falsy
            value disables the name filter.
        min_size: Lower bound (inclusive) for the model size.
        max_size: Upper bound (inclusive) for the model size.

    Returns:
        A new DataFrame with the matching rows, restricted to
        ``COLUMN_NAMES``. ``df`` is not modified.
    """
    import re  # local import: only needed to validate the search pattern

    filtered_df = df.copy()

    if query:
        # User input is not guaranteed to be a valid regex; fall back to a
        # plain substring match instead of letting re.error propagate.
        try:
            re.compile(query)
            use_regex = True
        except re.error:
            use_regex = False
        name_mask = filtered_df['Models'].str.contains(
            query, case=False, na=False, regex=use_regex
        )
        filtered_df = filtered_df[name_mask]

    def _size_in_range(size):
        # Models of unknown size are treated as if their size were 1000.0.
        effective = 1000.0 if size == 'unknown' else size
        return min_size <= effective <= max_size

    # On an empty Series ``apply`` keeps the original (non-boolean) dtype,
    # which would make the indexing below select columns instead of rows.
    # Forcing bool keeps it a row mask even when nothing matched the query.
    size_mask = filtered_df['Model Size(B)'].apply(_size_in_range).astype(bool)
    filtered_df = filtered_df[size_mask]

    return filtered_df[COLUMN_NAMES]