import pandas as pd
import gradio as gr


data = {
    "Method": [
        "AIN-7B", "GPT-4o", "GPT-4o-mini", "Qwen2-VL-7B", "Gemini-1.5-Pro", "Gemini-1.5-Flash", 
        "LLaVa-OneVision-7B", "Pangea-7B-Instruct", "Qwen2-VL-2B", "InternVL2-8B", "LLaVa-NeXt-7B", "Maya-8B"
    ],
    "MM Understanding & Reasoning": [
        56.78, 55.15, 48.83, 48.76, 46.67, 45.58, 42.90, 40.09, 40.59, 30.41, 26.33, 39.07
    ],
    "OCR & Document Understanding": [
        72.35, 54.98, 39.38, 42.73, 36.59, 33.59, 31.35, 17.75, 25.68, 15.91, 19.12, 26.70
    ],
    "Video Understanding": [
        64.09, 69.65, 66.28, 61.97, 42.94, 53.31, 29.41, 49.01, 38.90, 51.42, 44.90, 47.23
    ],
    "Remote Sensing Understanding": [
        45.92, 27.36, 16.93, 21.30, 17.07, 14.95, 10.72, 6.67, 12.56, 5.36, 8.33, 27.53
    ],
    "Charts & Diagram Understanding": [
        64.10, 62.35, 56.37, 54.67, 47.06, 48.25, 40.86, 38.75, 27.83, 30.27, 27.56, 34.25
    ],
    "Agro Specific": [
        85.05, 80.75, 78.80, 79.32, 72.12, 76.06, 75.03, 74.51, 52.02, 44.47, 42.00, 70.61
    ],
    "Cultural Specific Understanding": [
        78.09, 80.86, 65.92, 75.96, 56.24, 46.54, 66.02, 20.34, 34.27, 20.88, 28.30, 57.42
    ],
    "Medical Imaging": [
        43.77, 49.91, 47.37, 35.81, 33.77, 42.86, 27.29, 31.99, 29.12, 29.48, 22.54, 31.57
    ],
}
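
# Lightweight sanity check (an addition, not part of the original app): each
# metric list must hold exactly one score per method, otherwise the DataFrame
# constructor below raises a length-mismatch ValueError.
assert all(len(scores) == len(data["Method"]) for scores in data.values()), \
    "each metric column needs one score per method"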


# Earlier snapshot of the leaderboard data (without AIN-7B), kept here
# commented out for reference:
# data = {
#     "Method": [
#         "GPT-4o", "GPT-4o-mini", "Qwen2-VL-7B", "Gemini-1.5-Pro", "Gemini-1.5-Flash", 
#         "LLaVa-OneVision-7B", "Pangea-7B-Instruct", "Qwen2-VL-2B", "InternVL2-8B", "LLaVa-NeXt-7B", "Maya-8B"
#     ],
#     "MM Understanding & Reasoning": [
#         57.90, 48.82, 51.35, 46.67, 45.58, 42.90, 40.09, 40.59, 30.41, 26.33, 39.07
#     ],
#     "OCR & Document Understanding": [
#         59.11, 42.89, 49.06, 36.59, 33.59, 31.35, 17.75, 25.68, 15.91, 19.12, 26.70
#     ],
#     "Charts & Diagram Understanding": [
#         73.57, 64.98, 55.39, 47.06, 48.25, 40.86, 38.75, 27.83, 30.27, 27.56, 34.25
#     ],
#     "Video Understanding": [
#         74.27, 68.11, 62.64, 42.94, 53.31, 29.41, 49.01, 38.90, 51.42, 44.90, 47.23
#     ],
#     "Cultural Specific Understanding": [
#         80.86, 65.92, 75.64, 56.24, 46.54, 66.02, 20.34, 34.27, 20.88, 28.30, 57.42
#     ],
#     "Medical Imaging": [
#         49.90, 47.37, 39.42, 33.77, 42.86, 27.29, 31.99, 29.12, 29.48, 22.54, 31.57
#     ],
#     "Agro Specific": [
#         80.75, 79.58, 79.84, 72.12, 76.06, 75.03, 74.51, 52.02, 44.47, 42.00, 70.61
#     ],
#     "Remote Sensing Understanding": [
#         22.85, 16.93, 22.28, 17.07, 14.95, 10.72, 6.67, 12.56, 5.36, 8.33, 27.53
#     ]
# }

df = pd.DataFrame(data)
# Average across all eight task columns, shown as the headline score.
df['Average Score'] = df.iloc[:, 1:].mean(axis=1).round(2)
# Put Method and Average Score first, followed by the per-task columns.
df = df[['Method', 'Average Score'] + [col for col in df.columns if col not in ['Method', 'Average Score']]]
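
# Optional (an addition, not in the original app): list the strongest models
# first. This assumes a higher Average Score is better, which holds for all
# of the accuracy-style metrics above.
df = df.sort_values('Average Score', ascending=False).reset_index(drop=True)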


with gr.Blocks() as demo:
    gr.Markdown("![camel icon](https://cdn-uploads.huggingface.co/production/uploads/656864e12d73834278a8dea7/n-XfVKd1xVywH_vgPyJyQ.png)", elem_id="camel-icon")  # Replace with actual camel icon URL
    gr.Markdown("# **CAMEL-Bench: Model Performance Across Vision Understanding Tasks**")
    gr.Markdown("""
    This table shows the performance of different models across various tasks including OCR, chart understanding, video, medical imaging, and more. 
    """)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            gr.Dataframe(value=df, label="CAMEL-Bench Model Performance", interactive=False)

        with gr.TabItem("📤 How to Submit", elem_id="submission-tab", id=1):
            gr.Markdown("""
            ## Submission Instructions

            To contribute your model's results to the CAMEL-Bench leaderboard:

            - **Via GitHub Pull Request**: 
              - Use [this evaluation script](https://github.com/mbzuai-oryx/Camel-Bench/blob/main/scripts/eval_qwen.py) to test your model and generate results (a sample invocation is sketched below).
              - Create a pull request in the CAMEL-Bench GitHub repository with your results.

            - **Via Email**:
              - Send your results to **[email protected]**, and we’ll add them to the leaderboard for you.

            **We look forward to seeing your contributions!**
            """)

demo.launch()