import pandas as pd
import gradio as gr
import csv
import json
import os
import requests
import io
import shutil
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HF_TOKEN")

BASE_COLS = ["Rank", "Models", "Model Size(B)", "Data Source"]
TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
COLUMN_NAMES = BASE_COLS + TASKS_V1

DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown'] + \
                    ['number'] * len(TASKS_V1)

LEADERBOARD_INTRODUCTION = """
# 📊 **MMEB LEADERBOARD (VLM2Vec)**

## Introduction
We introduce a novel benchmark, **MMEB-V1 (Massive Multimodal Embedding Benchmark)**, 
which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training
and evaluating embedding models across various combinations of text and image modalities. 
All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text,
or a combination of both. MMEB-V1 is divided into 20 in-distribution datasets, which can be used for
training, and 16 out-of-distribution datasets, reserved for evaluation.

Building on **MMEB-V1**, **MMEB-V2** expands the evaluation scope to include five new tasks: four video-based tasks 
— Video Retrieval, Moment Retrieval, Video Classification, and Video Question Answering — and one task focused on visual documents, Visual Document Retrieval. 
This comprehensive suite enables robust evaluation of multimodal embedding models across static, temporal, and structured visual data settings.

| [**📈Overview**](https://tiger-ai-lab.github.io/VLM2Vec/) | [**Github**](https://github.com/TIGER-AI-Lab/VLM2Vec) 
| [**📖MMEB-V2/VLM2Vec-V2 Paper (TBA)**](https://arxiv.org/abs/2410.05160) 
| [**📖MMEB-V1/VLM2Vec-V1 Paper**](https://arxiv.org/abs/2410.05160) 
| [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
"""

TABLE_INTRODUCTION = """***Important Notes:*** \n
**We will be depreciating the MMEB-V1 leaderboard soon, and we will be releasing MMEB-V2 with more detailed scores and automatic evaluation.** \n"""

LEADERBOARD_INFO = """
## Dataset Summary
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{jiang2024vlm2vec,
  title={VLM2Vec: Training Vision-Language Models for Massive Multimodal Embedding Tasks},
  author={Jiang, Ziyan and Meng, Rui and Yang, Xinyi and Yavuz, Semih and Zhou, Yingbo and Chen, Wenhu},
  journal={arXiv preprint arXiv:2410.05160},
  year={2024}
}"""

SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction

## ⚠ Please note that you need to submit the JSON file with the following format:

### ***Important Notes: We have released MMEB-V2 and will deprecate MMEB-V1 soon. All further submissions should be made using the V2 format (see following).***
### ***In V2, the detailed scores of each dataset will be included, and our code will automatically generate the results and calculate the overall scores. See the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for more information.***
### **A V2 Submission would look like this:**
```json
{
    "metadata": {
        "Model": "<Model Name>",
        "URL": "<Model URL>" or null,
        "Model Size(B)": 1000 or null,
        "Data Source": "Self-Reported",
    },
    "metrics": {
        "image": {
            "ImageNet-1K": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ... ...
            }, 
            "N24News": {
                ... ...
            }, 
            ... ...
        }, 
        "visdoc": {
            "ViDoRe": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ... ...
            }, 
            ... ...
        },
        "video": {
            "DiDeMo": {
                "hit@1": 0.5,
                "ndcg@1": 0.5,
                ... ...
            }, 
            "MSR-VTT": {
                ... ...
            }, 
            ... ...
        }
    }
}
```
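
For illustration, a minimal Python sketch that assembles and writes a V2-format submission file (the model name, URL, and all scores below are placeholders):
```python
import json

submission = {
    "metadata": {
        "Model": "my-model",           # placeholder name
        "URL": "https://example.com",  # or None if the model is not public
        "Model Size(B)": 7.5,          # or None if unknown
        "Data Source": "Self-Reported",
    },
    "metrics": {
        "image": {"ImageNet-1K": {"hit@1": 0.5, "ndcg@1": 0.5}},
        "visdoc": {"ViDoRe": {"hit@1": 0.5, "ndcg@1": 0.5}},
        "video": {"DiDeMo": {"hit@1": 0.5, "ndcg@1": 0.5}},
    },
}

# json.dump writes Python None as JSON null, matching the format above.
with open("submission.json", "w") as f:
    json.dump(submission, f, indent=4)
```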

### **TO SUBMIT V1 ONLY (Deprecated; we will still accept this format until 2025-06-30)**
```json
[
    {
        "Model": "<Model Name>",
        "URL": "<Model URL>" or null,
        "Model Size(B)": 1000 or null,
        "Data Source": "Self-Reported",
        "V1-Overall": 50.0,
        "I-CLS": 50.0,
        "I-QA": 50.0,
        "I-RET": 50.0,
        "I-VG": 50.0
    }, 
]
```
You may refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions about evaluating your model. \n
Please send us an email at [email protected], attaching the JSON file. We will review your submission and update the leaderboard accordingly. \n
Please also share any feedback or suggestions you have for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
"""

def create_hyperlinked_names(df):
    def convert_url(url, model_name):
        # Missing URLs are parsed by pandas as NaN (not None), so check with pd.notna.
        return f'<a href="{url}">{model_name}</a>' if pd.notna(url) else model_name

    def add_link_to_model_name(row):
        row['Models'] = convert_url(row['URL'], row['Models'])
        return row
    
    df = df.copy()
    df = df.apply(add_link_to_model_name, axis=1)
    return df
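
# Illustrative example (placeholder values): a row with
# URL='https://example.com' and Models='my-model' renders as
# '<a href="https://example.com">my-model</a>' in the leaderboard table.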

# def fetch_data(file: str) -> pd.DataFrame:
#     # fetch the leaderboard data from remote
#     if file is None:
#         raise ValueError("URL Not Provided")
#     url = f"https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/{file}"
#     print(f"Fetching data from {url}")
#     response = requests.get(url)
#     if response.status_code != 200:
#         raise requests.HTTPError(f"Failed to fetch data: HTTP status code {response.status_code}")
#     return pd.read_json(io.StringIO(response.text), orient='records', lines=True)

def get_df(file="results.jsonl"):
    df = pd.read_json(file, orient='records', lines=True)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    for task in TASKS_V1:
        if df[task].isnull().any():
            df[task] = df[task].apply(lambda score: '-' if pd.isna(score) else score)
    df = df.sort_values(by=['V1-Overall'], ascending=False)
    df = create_hyperlinked_names(df)
    df['Rank'] = range(1, len(df) + 1)
    return df
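
# Usage sketch (assumes a results.jsonl next to this file, in the V1 format
# described in SUBMIT_INTRODUCTION):
#   df = get_df()
#   leaderboard = df[COLUMN_NAMES]  # columns rendered in the table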

def refresh_data():
    df = get_df()
    return df[COLUMN_NAMES]

def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()
    
    if query:
        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]

    size_mask = filtered_df['Model Size(B)'].apply(lambda x: 
        (min_size <= 1000.0 <= max_size) if x == 'unknown' 
        else (min_size <= x <= max_size))
    
    filtered_df = filtered_df[size_mask]
    
    return filtered_df[COLUMN_NAMES]
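
# Example (illustrative values):
#   search_and_filter_models(get_df(), "vlm2vec", 0.0, 20.0)
# keeps models whose linked name contains "vlm2vec" and whose size falls in
# [0, 20] B; rows with an 'unknown' size pass only when the range also covers
# the 1000.0 sentinel used above.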


def search_models(df, query):
    if query:
        return df[df['Models'].str.contains(query, case=False, na=False)]
    return df

def get_size_range(df):
    sizes = df['Model Size(B)'].apply(lambda x: 0.0 if x == 'unknown' else x)
    if (sizes == 0.0).all():
        return 0.0, 1000.0
    return float(sizes.min()), float(sizes.max())


def process_model_size(size):
    if pd.isna(size) or size == 'unk':
        return 'unknown'
    try:
        val = float(size)
        return val
    except (ValueError, TypeError):
        return 'unknown'
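
# Expected behaviour (illustrative):
#   process_model_size(7.5)   -> 7.5
#   process_model_size('unk') -> 'unknown'
#   process_model_size(None)  -> 'unknown'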

def filter_columns_by_tasks(df, selected_tasks=None):
    if selected_tasks is None or len(selected_tasks) == 0:
        return df[COLUMN_NAMES]
    
    # Always keep the identifying columns plus the overall score
    # (the column is named 'V1-Overall', not 'Overall').
    base_columns = BASE_COLS + ['V1-Overall']
    selected_columns = base_columns + selected_tasks
    
    available_columns = [col for col in selected_columns if col in df.columns]
    return df[available_columns]
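
# Example (hypothetical selection): show only the classification and retrieval
# scores alongside the base columns and the overall score:
#   filter_columns_by_tasks(get_df(), ["I-CLS", "I-RET"])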