add visdoc sub task scores (#52)
- integrate v1 scores into v2 (a84b2867d9a4a0e33270363b0714046fd615e200)
- fixed issue (05eb353153751b5e47a54ab85a60184faf506c9a)
- fixed (8cbf5e6fbd748f32853ce8bf32a6ce431b94d3c6)
- updated visdoc sub tasks (8e6921ff462e6933b5cd0407b677881e761ff0d9)
- add vd sub task scores (037b103fe2c30de21eefd960e38901f1bc47d588)
- fix issues (a2d53876e17e381396fe0a5303e84841027ab539)
- app.py +1 -1
- utils.py +0 -1
- utils_v2.py +12 -9
app.py
CHANGED
@@ -134,7 +134,7 @@ with gr.Blocks() as block:
     with gr.TabItem("π Visual Doc", elem_id="qa-tab-table1", id=4):
         gr.Markdown(v2.TABLE_INTRODUCTION_D)
         data_component5 = gr.components.Dataframe(
-            value=v2.rank_models(df2[v2.COLUMN_NAMES_D], '
+            value=v2.rank_models(df2[v2.COLUMN_NAMES_D], 'Visdoc-Overall'),
             headers=v2.COLUMN_NAMES_D,
             type="pandas",
             datatype=v2.DATA_TITLE_TYPE_D,
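For context on this one-line change: 'Visdoc-Overall' is the new aggregate column defined in utils_v2.py below, so the Visual Doc tab now sorts by the visdoc overall score. A minimal sketch of what a rank_models-style helper typically does, assuming it sorts descending on the given column and fills a 1-based Rank column; the actual implementation lives in utils_v2.py and may differ:

```python
import pandas as pd

def rank_models(df: pd.DataFrame, rank_by: str) -> pd.DataFrame:
    # Sort descending on the chosen score column, then renumber Rank 1..N.
    # Hypothetical sketch, not the repo's exact implementation.
    ranked = df.sort_values(by=rank_by, ascending=False).reset_index(drop=True)
    ranked['Rank'] = range(1, len(ranked) + 1)
    return ranked
```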
utils.py
CHANGED
@@ -103,7 +103,6 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
     }
 }
 ```
-Note: We still accept the old format until 2025-06-30.
 Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions about evaluating your model. \n
 To submit, create a pull request and upload the generated JSON file to the ***scores*** folder, then send us an email at [email protected], including your model's information. \n We will review your submission and update the leaderboard accordingly. \n
 Please also share any feedback or suggestions you have for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
utils_v2.py
CHANGED
@@ -20,7 +20,10 @@ DATASETS = {
         "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W']
     },
     "visdoc": {
-        "
+        "ViDoRe-V1": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry'],
+        "ViDoRe-V2": ["ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2", "ViDoRe_economics_reports_v2", "ViDoRe_esg_reports_v2"],  # Following Abandoned: "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2_multilingual"
+        "VisRAG": ['VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA'],
+        "VisDoc-OOD": ['ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
     },
     "video": {
         "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
@@ -37,29 +40,29 @@ SPECIAL_METRICS = {
 }
 
 BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
-TASKS = ["Overall", "I-CLS", "I-QA", "I-RET", "I-VG", "VisDoc", "V-CLS", "V-QA", "V-RET", "V-MRET"]
 BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']
 
-COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', '
+COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'Visdoc-Overall']
 DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + \
     ['number'] * 3
 
-SUB_TASKS_I =
+SUB_TASKS_I = ["I-CLS", "I-QA", "I-RET", "I-VG"]
 TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
 COLUMN_NAMES_I = BASE_COLS + TASKS_I
 DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + \
-    ['number'] *
+    ['number'] * len(TASKS_I + SUB_TASKS_I)
 
-SUB_TASKS_V =
+SUB_TASKS_V = ["V-CLS", "V-QA", "V-RET", "V-MRET"]
 TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
 COLUMN_NAMES_V = BASE_COLS + TASKS_V
 DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + \
-    ['number'] *
+    ['number'] * len(TASKS_V + SUB_TASKS_V)
 
-
+SUB_TASKS_D = ['ViDoRe-V1', 'ViDoRe-V2', 'VisRAG', 'VisDoc-OOD']
+TASKS_D = ['Visdoc-Overall'] + SUB_TASKS_D + ALL_DATASETS_SPLITS['visdoc']
 COLUMN_NAMES_D = BASE_COLS + TASKS_D
 DATA_TITLE_TYPE_D = BASE_DATA_TITLE_TYPE + \
-    ['number'] * len(TASKS_D)
+    ['number'] * len(TASKS_D + SUB_TASKS_D)
 
 TABLE_INTRODUCTION = """**MMEB**: Massive MultiModal Embedding Benchmark \n
 Models are ranked based on **Overall**"""
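To make the new sub-task columns concrete: each key under DATASETS['visdoc'] names one sub-task (ViDoRe-V1, ViDoRe-V2, VisRAG, VisDoc-OOD) and lists its member datasets, and TASKS_D prepends the 'Visdoc-Overall' aggregate. A hedged sketch of how per-sub-task scores could be derived from per-dataset columns; mean-over-datasets and mean-over-sub-tasks are assumptions here, not necessarily this repo's exact aggregation:

```python
import pandas as pd

# Abbreviated copy of DATASETS['visdoc']; the full lists live in utils_v2.py.
VISDOC_SUBTASKS = {
    'ViDoRe-V1': ['ViDoRe_arxivqa', 'ViDoRe_docvqa'],
    'VisRAG': ['VisRAG_ArxivQA', 'VisRAG_ChartQA'],
}

def add_visdoc_scores(df: pd.DataFrame) -> pd.DataFrame:
    # One column per sub-task: mean over its member-dataset columns (assumed).
    out = df.copy()
    for subtask, datasets in VISDOC_SUBTASKS.items():
        out[subtask] = out[datasets].mean(axis=1)
    # Aggregate used for ranking the Visual Doc tab: mean over sub-tasks (assumed).
    out['Visdoc-Overall'] = out[list(VISDOC_SUBTASKS)].mean(axis=1)
    return out
```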