MINGYISU committed
Commit 15e5e27 · verified · 1 Parent(s): 66cf416
Files changed (3)
  1. app.py +2 -3
  2. results.jsonl +30 -30
  3. utils.py +46 -61
app.py CHANGED
@@ -52,11 +52,10 @@ with gr.Blocks() as block:
             label="Maximum number of parameters (B)",
         )
 
-    task_choices = [col for col in COLUMN_NAMES if col not in BASE_COLS]
     with gr.Row():
         tasks_select = gr.CheckboxGroup(
-            choices=task_choices,
-            value=task_choices,
+            choices=TASKS_V1 + TASKS_V2,
+            value=TASKS_V1,
             label="Select tasks to Display",
             elem_id="tasks-select"
         )
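The net effect in app.py: the checkbox choices are no longer computed from COLUMN_NAMES at runtime but come straight from the static TASKS_V1/TASKS_V2 lists this commit adds to utils.py, with only the V1 tasks checked on load. A minimal runnable sketch of the new selector, with the constants mirrored inline; the sliders and event wiring around it are omitted here:

```python
import gradio as gr

# Constants mirrored from utils.py as introduced in this commit.
TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
TASKS_V2 = ["V2-Overall", "V-CLS", "V-QA", "V-RET", "V-MRET", "VisDoc"]

with gr.Blocks() as block:
    with gr.Row():
        tasks_select = gr.CheckboxGroup(
            choices=TASKS_V1 + TASKS_V2,  # every V1 and V2 task column is selectable
            value=TASKS_V1,               # only the V1 view is checked by default
            label="Select tasks to Display",
            elem_id="tasks-select",
        )

if __name__ == "__main__":
    block.launch()
```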
results.jsonl CHANGED
@@ -1,30 +1,30 @@
-{"Models":"B3","Model Size(B)":8.29,"Data Source":"Self-Reported","Overall":72.0,"Classification":70.0,"VQA":66.5,"Retrieval":74.1,"Grounding":84.6,"URL":"https:\/\/huggingface.co\/raghavlite\/B3_Qwen2_7B"}
-{"Models":"CLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","Overall":45.4,"Classification":55.2,"VQA":19.7,"Retrieval":53.2,"Grounding":62.2,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2103.00020"}
-{"Models":"LLaVE-0.5B","Model Size(B)":0.894,"Data Source":"Self-Reported","Overall":59.1,"Classification":57.4,"VQA":50.3,"Retrieval":59.8,"Grounding":82.9,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-0.5B"}
-{"Models":"LLaVE-2B","Model Size(B)":1.95,"Data Source":"Self-Reported","Overall":65.2,"Classification":62.1,"VQA":60.2,"Retrieval":65.2,"Grounding":84.9,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-2B"}
-{"Models":"LLaVE-7B","Model Size(B)":8.03,"Data Source":"Self-Reported","Overall":70.3,"Classification":65.7,"VQA":65.4,"Retrieval":70.9,"Grounding":91.9,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-7B"}
-{"Models":"MM-Embed","Model Size(B)":8.18,"Data Source":"Self-Reported","Overall":50.0,"Classification":48.1,"VQA":32.3,"Retrieval":63.8,"Grounding":57.8,"URL":"https:\/\/huggingface.co\/nvidia\/MM-Embed"}
-{"Models":"MMRet-MLLM (FT)","Model Size(B)":7.57,"Data Source":"Self-Reported","Overall":64.1,"Classification":56.0,"VQA":57.4,"Retrieval":69.9,"Grounding":83.6,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
-{"Models":"MMRet-MLLM (LLaVA-1.6)","Model Size(B)":7.57,"Data Source":"Self-Reported","Overall":44.0,"Classification":47.2,"VQA":18.4,"Retrieval":56.5,"Grounding":62.2,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
-{"Models":"Magiclens","Model Size(B)":0.428,"Data Source":"TIGER-Lab","Overall":27.8,"Classification":38.8,"VQA":8.3,"Retrieval":35.4,"Grounding":26.0,"URL":"https:\/\/github.com\/google-deepmind\/magiclens"}
-{"Models":"OpenCLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","Overall":47.2,"Classification":56.0,"VQA":21.9,"Retrieval":55.4,"Grounding":64.1,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2212.07143"}
-{"Models":"QQMM-embed","Model Size(B)":8.297,"Data Source":"Self-Reported","Overall":72.175,"Classification":70.07,"VQA":69.52,"Retrieval":71.175,"Grounding":87.075,"URL":"https:\/\/github.com\/QQ-MM\/QQMM-embed"}
-{"Models":"UniIR (BLIP_FF)","Model Size(B)":0.247,"Data Source":"TIGER-Lab","Overall":42.8,"Classification":42.1,"VQA":15.0,"Retrieval":60.1,"Grounding":62.2,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
-{"Models":"UniIR (CLIP_SF)","Model Size(B)":0.428,"Data Source":"TIGER-Lab","Overall":44.7,"Classification":44.3,"VQA":16.2,"Retrieval":61.8,"Grounding":65.3,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
-{"Models":"UniME(LLaVA-1.6-7B-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"Self-Reported","Overall":66.6,"Classification":60.6,"VQA":52.9,"Retrieval":67.9,"Grounding":85.1,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-1.6-7B"}
-{"Models":"UniME(LLaVA-OneVision-7B-LoRA-Res336)","Model Size(B)":8.03,"Data Source":"Self-Reported","Overall":70.7,"Classification":66.8,"VQA":66.6,"Retrieval":70.5,"Grounding":90.9,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-OneVision-7B"}
-{"Models":"UniME(Phi-3.5-V-LoRA)","Model Size(B)":4.2,"Data Source":"Self-Reported","Overall":64.2,"Classification":54.8,"VQA":55.9,"Retrieval":64.5,"Grounding":81.8,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-Phi3.5-V-4.2B"}
-{"Models":"VLM2Vec (LLaVA-1.6-LoRA-HighRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","Overall":62.9,"Classification":61.2,"VQA":49.9,"Retrieval":67.4,"Grounding":86.1,"URL":"https://huggingface.co/TIGER-Lab/VLM2Vec-LLaVa-Next"}
-{"Models":"VLM2Vec (LLaVA-1.6-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","Overall":55.0,"Classification":54.7,"VQA":50.3,"Retrieval":56.2,"Grounding":64.0,"URL":"https://huggingface.co/TIGER-Lab/VLM2Vec-LLaVa-Next"}
-{"Models":"VLM2Vec (Phi-3.5-V-FT)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","Overall":55.9,"Classification":52.8,"VQA":50.3,"Retrieval":57.8,"Grounding":72.3,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
-{"Models":"VLM2Vec (Phi-3.5-V-LoRA)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","Overall":60.1,"Classification":54.8,"VQA":54.9,"Retrieval":62.3,"Grounding":79.5,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
-{"Models":"VLM2Vec (Qwen2-VL-2B-LoRA-HighRes)","Model Size(B)":2.21,"Data Source":"TIGER-Lab","Overall":59.3,"Classification":59.0,"VQA":49.4,"Retrieval":65.4,"Grounding":73.4,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-2B"}
-{"Models":"VLM2Vec (Qwen2-VL-7B-LoRA-HighRes)","Model Size(B)":8.29,"Data Source":"TIGER-Lab","Overall":65.8,"Classification":62.6,"VQA":57.8,"Retrieval":69.9,"Grounding":81.7,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-7B"}
-{"Models":"blip2-opt-2.7b","Model Size(B)":3.74,"Data Source":"TIGER-Lab","Overall":25.2,"Classification":27.0,"VQA":4.2,"Retrieval":33.9,"Grounding":47.0,"URL":"https:\/\/huggingface.co\/Salesforce\/blip2-opt-2.7b"}
-{"Models":"clip-vit-large-patch14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","Overall":37.8,"Classification":42.8,"VQA":9.1,"Retrieval":53.0,"Grounding":51.8,"URL":"https:\/\/huggingface.co\/openai\/clip-vit-large-patch14"}
-{"Models":"e5-v","Model Size(B)":8.36,"Data Source":"TIGER-Lab","Overall":13.3,"Classification":21.8,"VQA":4.9,"Retrieval":11.5,"Grounding":19.0,"URL":"https:\/\/huggingface.co\/royokong\/e5-v"}
-{"Models":"gme-Qwen2-VL-2B-Instruct","Model Size(B)":2.21,"Data Source":"Self-Reported","Overall":55.8,"Classification":56.9,"VQA":41.2,"Retrieval":67.8,"Grounding":53.4,"URL":"https:\/\/huggingface.co\/Alibaba-NLP\/gme-Qwen2-VL-2B-Instruct"}
-{"Models":"mmE5 (w\/ 560K synthetic data)","Model Size(B)":10.6,"Data Source":"Self-Reported","Overall":58.6,"Classification":60.6,"VQA":55.7,"Retrieval":54.7,"Grounding":72.4,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
-{"Models":"mmE5-mllama-11b-instruct","Model Size(B)":10.6,"Data Source":"Self-Reported","Overall":69.8,"Classification":67.6,"VQA":62.6,"Retrieval":71.0,"Grounding":89.6,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
-{"Models":"open_clip-ViT-L\/14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","Overall":39.7,"Classification":47.8,"VQA":10.9,"Retrieval":52.3,"Grounding":53.3,"URL":"https:\/\/github.com\/mlfoundations\/open_clip"}
-{"Models":"siglip-base-patch16-224","Model Size(B)":0.203,"Data Source":"TIGER-Lab","Overall":34.8,"Classification":40.3,"VQA":8.4,"Retrieval":31.6,"Grounding":59.5,"URL":"https:\/\/huggingface.co\/google\/siglip-base-patch16-224"}
+{"Models":"B3","Model Size(B)":8.29,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":72.0,"I-CLS":70.0,"I-QA":66.5,"I-RET":74.1,"I-VG":84.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/raghavlite\/B3_Qwen2_7B"}
+{"Models":"CLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":45.4,"I-CLS":55.2,"I-QA":19.7,"I-RET":53.2,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2103.00020"}
+{"Models":"LLaVE-0.5B","Model Size(B)":0.894,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":59.1,"I-CLS":57.4,"I-QA":50.3,"I-RET":59.8,"I-VG":82.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-0.5B"}
+{"Models":"LLaVE-2B","Model Size(B)":1.95,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":65.2,"I-CLS":62.1,"I-QA":60.2,"I-RET":65.2,"I-VG":84.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-2B"}
+{"Models":"LLaVE-7B","Model Size(B)":8.03,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":70.3,"I-CLS":65.7,"I-QA":65.4,"I-RET":70.9,"I-VG":91.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-7B"}
+{"Models":"MM-Embed","Model Size(B)":8.18,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":50.0,"I-CLS":48.1,"I-QA":32.3,"I-RET":63.8,"I-VG":57.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/nvidia\/MM-Embed"}
+{"Models":"MMRet-MLLM (FT)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":64.1,"I-CLS":56.0,"I-QA":57.4,"I-RET":69.9,"I-VG":83.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
+{"Models":"MMRet-MLLM (LLaVA-1.6)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":44.0,"I-CLS":47.2,"I-QA":18.4,"I-RET":56.5,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
+{"Models":"Magiclens","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":27.8,"I-CLS":38.8,"I-QA":8.3,"I-RET":35.4,"I-VG":26.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/google-deepmind\/magiclens"}
+{"Models":"OpenCLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":47.2,"I-CLS":56.0,"I-QA":21.9,"I-RET":55.4,"I-VG":64.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2212.07143"}
+{"Models":"QQMM-embed","Model Size(B)":8.297,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":72.175,"I-CLS":70.07,"I-QA":69.52,"I-RET":71.175,"I-VG":87.075,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/QQ-MM\/QQMM-embed"}
+{"Models":"UniIR (BLIP_FF)","Model Size(B)":0.247,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":42.8,"I-CLS":42.1,"I-QA":15.0,"I-RET":60.1,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
+{"Models":"UniIR (CLIP_SF)","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":44.7,"I-CLS":44.3,"I-QA":16.2,"I-RET":61.8,"I-VG":65.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
+{"Models":"UniME(LLaVA-1.6-7B-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":66.6,"I-CLS":60.6,"I-QA":52.9,"I-RET":67.9,"I-VG":85.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-1.6-7B"}
+{"Models":"UniME(LLaVA-OneVision-7B-LoRA-Res336)","Model Size(B)":8.03,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":70.7,"I-CLS":66.8,"I-QA":66.6,"I-RET":70.5,"I-VG":90.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-OneVision-7B"}
+{"Models":"UniME(Phi-3.5-V-LoRA)","Model Size(B)":4.2,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":64.2,"I-CLS":54.8,"I-QA":55.9,"I-RET":64.5,"I-VG":81.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-Phi3.5-V-4.2B"}
+{"Models":"VLM2Vec (LLaVA-1.6-LoRA-HighRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":62.9,"I-CLS":61.2,"I-QA":49.9,"I-RET":67.4,"I-VG":86.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-LLaVa-Next"}
+{"Models":"VLM2Vec (LLaVA-1.6-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":55.0,"I-CLS":54.7,"I-QA":50.3,"I-RET":56.2,"I-VG":64.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-LLaVa-Next"}
+{"Models":"VLM2Vec (Phi-3.5-V-FT)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":55.9,"I-CLS":52.8,"I-QA":50.3,"I-RET":57.8,"I-VG":72.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
+{"Models":"VLM2Vec (Phi-3.5-V-LoRA)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":60.1,"I-CLS":54.8,"I-QA":54.9,"I-RET":62.3,"I-VG":79.5,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
+{"Models":"VLM2Vec (Qwen2-VL-2B-LoRA-HighRes)","Model Size(B)":2.21,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":59.3,"I-CLS":59.0,"I-QA":49.4,"I-RET":65.4,"I-VG":73.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-2B"}
+{"Models":"VLM2Vec (Qwen2-VL-7B-LoRA-HighRes)","Model Size(B)":8.29,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":65.8,"I-CLS":62.6,"I-QA":57.8,"I-RET":69.9,"I-VG":81.7,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-7B"}
+{"Models":"blip2-opt-2.7b","Model Size(B)":3.74,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":25.2,"I-CLS":27.0,"I-QA":4.2,"I-RET":33.9,"I-VG":47.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/Salesforce\/blip2-opt-2.7b"}
+{"Models":"clip-vit-large-patch14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":37.8,"I-CLS":42.8,"I-QA":9.1,"I-RET":53.0,"I-VG":51.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/openai\/clip-vit-large-patch14"}
+{"Models":"e5-v","Model Size(B)":8.36,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":13.3,"I-CLS":21.8,"I-QA":4.9,"I-RET":11.5,"I-VG":19.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/royokong\/e5-v"}
+{"Models":"gme-Qwen2-VL-2B-Instruct","Model Size(B)":2.21,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":55.8,"I-CLS":56.9,"I-QA":41.2,"I-RET":67.8,"I-VG":53.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/Alibaba-NLP\/gme-Qwen2-VL-2B-Instruct"}
+{"Models":"mmE5 (w\/ 560K synthetic data)","Model Size(B)":10.6,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":58.6,"I-CLS":60.6,"I-QA":55.7,"I-RET":54.7,"I-VG":72.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
+{"Models":"mmE5-mllama-11b-instruct","Model Size(B)":10.6,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":69.8,"I-CLS":67.6,"I-QA":62.6,"I-RET":71.0,"I-VG":89.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
+{"Models":"open_clip-ViT-L\/14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":39.7,"I-CLS":47.8,"I-QA":10.9,"I-RET":52.3,"I-VG":53.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/mlfoundations\/open_clip"}
+{"Models":"siglip-base-patch16-224","Model Size(B)":0.203,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":34.8,"I-CLS":40.3,"I-QA":8.4,"I-RET":31.6,"I-VG":59.5,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/google\/siglip-base-patch16-224"}
utils.py CHANGED
@@ -10,24 +10,13 @@ from huggingface_hub import Repository
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]
-
-MODEL_INFO = [
-    "Rank", "Models", "Model Size(B)", "Data Source",
-    "Overall",
-    "Classification", "VQA", "Retrieval", "Grounding"
-]
-
-BASE_COLS = [col for col in MODEL_INFO if col not in TASKS]
-
-DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
-
-SUBMISSION_NAME = "MMEB"
-SUBMISSION_URL = os.path.join("https://huggingface.co/spaces/TIGER-Lab/", SUBMISSION_NAME)
-FILE_NAME = "results.csv"
-CSV_DIR = "results.csv"
-
-COLUMN_NAMES = MODEL_INFO
+BASE_COLS = ["Rank", "Models", "Model Size(B)", "Data Source"]
+TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
+TASKS_V2 = ["V2-Overall", "V-CLS", "V-QA", "V-RET", "V-MRET", "VisDoc"]
+COLUMN_NAMES = BASE_COLS + TASKS_V1 + TASKS_V2
+
+DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown'] + \
+    ['number'] * (len(TASKS_V1) + len(TASKS_V2))
 
 LEADERBOARD_INTRODUCTION = """
 # MMEB Leaderboard
@@ -49,25 +38,7 @@ TABLE_INTRODUCTION = """"""
 
 LEADERBOARD_INFO = """
 ## Dataset Summary
-MMEB is organized into four primary meta-task categories:
-- **Classification**: This category comprises 5 in-distribution and 5 out-of-distribution datasets. Queries
-consist of instructions and images, optionally accompanied by related text. Targets are class labels,
-and the number of class labels corresponds to the number of classes in the dataset. \n
-  - IND: ImageNet-1k, N24News, HatefulMemes, VOC2007, SUN397 \n
-  - OOD: Place365, ImageNet-A, ImageNet-R, ObjectNet, Country-211 \n
-- **Visual Question Answering**: This category includes 6 in-distribution and 4 out-of-distribution
-datasets. The query consists of an instruction, an image, and a piece of text as the question, while
-the target is the answer. Each query has 1,000 target candidates: 1 ground truth and 999 distractors. \n
-  - IND: OK-VQA, A-OKVQA, DocVQA, InfographicVQA, ChartQA, Visual7W \n
-  - OOD: ScienceQA, VizWiz, GQA, TextVQA \n
-- **Information Retrieval**: This category contains 8 in-distribution and 4 out-of-distribution datasets.
-Both the query and target sides can involve a combination of text, images, and instructions. Similar
-to the VQA task, each query has 1,000 candidates, with 1 ground truth and 999 distractors. \n
-  - IND: VisDial, CIRR, VisualNews_t2i, VisualNews_i2t, MSCOCO_t2i, MSCOCO_i2t, NIGHTS, WebQA \n
-  - OOD: OVEN, FashionIQ, EDIS, Wiki-SS-NQ \n
-- **Visual Grounding**: This category includes 1 in-distribution and 3 out-of-distribution datasets, which are adapted from object detection tasks. Queries consist of an instruction, an image, and text referring to a specific region or object within the image. The target may include a cropped image of the object or text describing the same region. Each query includes 1,000 candidates: 1 ground truth and 999 distractors. These distractors may include hard negatives from the same object class, other objects in the image, or random objects from different images. \n
-  - IND: MSCOCO \n
-  - OOD: Visual7W-Pointing, RefCOCO, RefCOCO-Matching \n
+<img width="900" alt="abs" src="overview.png">
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -81,6 +52,8 @@ CITATION_BUTTON_TEXT = r"""@article{jiang2024vlm2vec,
 SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
 
 ## ⚠ Please note that you need to submit the JSON file with the following format:
+
+### **TO SUBMIT V1 ONLY**
 ```json
 [
     {
@@ -88,15 +61,34 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
         <Optional>"URL": "<Model URL>",
         "Model Size(B)": 1000,
         "Data Source": Self-Reported,
-        "Overall": 50.0,
-        "Classification": 50.0,
-        "VQA": 50.0,
-        "Retrieval": 50.0,
-        "Grounding": 50.0
+        "V1-Overall": 50.0,
+        "I-CLS": 50.0,
+        "I-QA": 50.0,
+        "I-RET": 50.0,
+        "I-VG": 50.0
     },
 ]
 ```
-You may refer to the Github page for instructions about evaluating your model.
+
+### **TO SUBMIT V2 ONLY**
+```json
+[
+    {
+        "Model": "<Model Name>",
+        <Optional>"URL": "<Model URL>",
+        "Model Size(B)": 1000,
+        "Data Source": Self-Reported,
+        "V2-Overall": 50.0,
+        "V-CLS": 50.0,
+        "V-QA": 50.0,
+        "V-RET": 50.0,
+        "V-MRET": 50.0,
+        "VisDoc": 50.0
+    },
+]
+```
+You are also welcome to submit both versions by including all the fields above! :) \n
+You may refer to the Github page for instructions about evaluating your model. \n
 Github link: https://github.com/TIGER-AI-Lab/VLM2Vec. \n
 Please send us an email at [email protected], attaching the JSON file. We will review your submission and update the leaderboard accordingly.
 """
@@ -113,24 +105,21 @@ def create_hyperlinked_names(df):
     df = df.apply(add_link_to_model_name, axis=1)
     return df
 
-def fetch_data(file: str) -> pd.DataFrame:
-    # fetch the leaderboard data from remote
-    if file is None:
-        raise ValueError("URL Not Provided")
-    url = f"https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/{file}"
-    print(f"Fetching data from {url}")
-    response = requests.get(url)
-    if response.status_code != 200:
-        raise requests.HTTPError(f"Failed to fetch data: HTTP status code {response.status_code}")
-    return pd.read_json(io.StringIO(response.text), orient='records', lines=True)
+# def fetch_data(file: str) -> pd.DataFrame:
+#     # fetch the leaderboard data from remote
+#     if file is None:
+#         raise ValueError("URL Not Provided")
+#     url = f"https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/{file}"
+#     print(f"Fetching data from {url}")
+#     response = requests.get(url)
+#     if response.status_code != 200:
+#         raise requests.HTTPError(f"Failed to fetch data: HTTP status code {response.status_code}")
+#     return pd.read_json(io.StringIO(response.text), orient='records', lines=True)
 
 def get_df(file="results.jsonl"):
-    df = fetch_data(file)
-    print(df.columns)
-    print('URL' in df.columns)
-    print(df)
+    df = pd.read_json(file, orient='records', lines=True)
     df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
-    df = df.sort_values(by=['Overall'], ascending=False)
+    df = df.sort_values(by=['V1-Overall'], ascending=False)
     df = create_hyperlinked_names(df)
     df['Rank'] = range(1, len(df) + 1)
     return df
@@ -185,7 +174,3 @@ def filter_columns_by_tasks(df, selected_tasks=None):
 
     available_columns = [col for col in selected_columns if col in df.columns]
     return df[available_columns]
-
-def get_task_choices():
-    return TASKS
-
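With COLUMN_NAMES now assembled from BASE_COLS, TASKS_V1, and TASKS_V2, the dtype list stays in lockstep with the column list by construction, which the old hand-written DATA_TITLE_TYPE did not (it had eleven entries for MODEL_INFO's nine columns). A quick consistency check over the constants introduced above; the assert is illustrative, not part of the commit:

```python
# Constants as defined in this commit's utils.py.
BASE_COLS = ["Rank", "Models", "Model Size(B)", "Data Source"]
TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
TASKS_V2 = ["V2-Overall", "V-CLS", "V-QA", "V-RET", "V-MRET", "VisDoc"]
COLUMN_NAMES = BASE_COLS + TASKS_V1 + TASKS_V2

DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown'] + \
    ['number'] * (len(TASKS_V1) + len(TASKS_V2))

# One dtype per column: 4 base + 5 V1 + 6 V2 = 15 entries on each side.
assert len(DATA_TITLE_TYPE) == len(COLUMN_NAMES) == 15
```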