import pandas as pd
import gradio as gr

data = {
    "Method": [
        "AIN-7B", "GPT-4o", "GPT-4o-mini", "Qwen2-VL-7B", "Gemini-1.5-Pro", "Gemini-1.5-Flash",
        "LLaVa-OneVision-7B", "Pangea-7B-Instruct", "Qwen2-VL-2B", "InternVL2-8B", "LLaVa-NeXt-7B", "Maya-8B"
    ],
    "MM Understanding & Reasoning": [
        56.78, 55.15, 48.83, 48.76, 46.67, 45.58, 42.90, 40.09, 40.59, 30.41, 26.33, 39.07
    ],
    "OCR & Document Understanding": [
        72.35, 54.98, 39.38, 42.73, 36.59, 33.59, 31.35, 17.75, 25.68, 15.91, 19.12, 26.70
    ],
    "Video Understanding": [
        64.09, 69.65, 66.28, 61.97, 42.94, 53.31, 29.41, 49.01, 38.90, 51.42, 44.90, 47.23
    ],
    "Remote Sensing Understanding": [
        45.92, 27.36, 16.93, 21.30, 17.07, 14.95, 10.72, 6.67, 12.56, 5.36, 8.33, 27.53
    ],
    "Charts & Diagram Understanding": [
        64.10, 62.35, 56.37, 54.67, 47.06, 48.25, 40.86, 38.75, 27.83, 30.27, 27.56, 34.25
    ],
    "Agro Specific": [
        85.05, 80.75, 78.80, 79.32, 72.12, 76.06, 75.03, 74.51, 52.02, 44.47, 42.00, 70.61
    ],
    "Cultural Specific Understanding": [
        78.09, 80.86, 65.92, 75.96, 56.24, 46.54, 66.02, 20.34, 34.27, 20.88, 28.30, 57.42
    ],
    "Medical Imaging": [
        43.77, 49.91, 47.37, 35.81, 33.77, 42.86, 27.29, 31.99, 29.12, 29.48, 22.54, 31.57
    ],
}

df = pd.DataFrame(data)

# Average across all per-task score columns, rounded to two decimals.
df['Average Score'] = df.iloc[:, 1:].mean(axis=1).round(2)

# Show 'Method' and 'Average Score' first, followed by the individual task columns.
df = df[['Method', 'Average Score'] + [col for col in df.columns if col not in ['Method', 'Average Score']]]
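
# Optional (not in the original script): rank models by Average Score so the leaderboard
# reads best-to-worst. A minimal sketch, assuming a descending ranking is desired:
#
#     df = df.sort_values('Average Score', ascending=False).reset_index(drop=True)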


def display_data():
    # Currently unused; returns the full leaderboard DataFrame.
    return df

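
# Note: display_data() is not wired into the UI below. A minimal sketch of how it could
# back a refresh button inside the gr.Blocks() context (gr.Button and Button.click are
# standard Gradio APIs; the component names here are illustrative):
#
#     table = gr.Dataframe(value=df, label="CAMEL-Bench Model Performance", interactive=False)
#     refresh = gr.Button("Refresh")
#     refresh.click(fn=display_data, outputs=table)
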
with gr.Blocks() as demo:
    # Empty Markdown slot addressable via elem_id (e.g. for a CAMEL-Bench icon styled via CSS).
    gr.Markdown("", elem_id="camel-icon")
    gr.Markdown("# **CAMEL-Bench: Model Performance Across Vision Understanding Tasks**")
    gr.Markdown("""
This leaderboard shows model performance across tasks including OCR and document understanding, charts and diagrams, video, remote sensing, medical imaging, agricultural and cultural understanding, and multimodal reasoning.
""")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LMM Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            gr.Dataframe(value=df, label="CAMEL-Bench Model Performance", interactive=False)

with gr.TabItem("📤 How to Submit", elem_id="submission-tab", id=1): |
|
gr.Markdown(""" |
|
## Submission Instructions |
|
|
|
To contribute your model's results to the CAMEL-Bench leaderboard: |
|
|
|
- **Via GitHub Pull Request**: |
|
- Use [this evaluation script](https://github.com/mbzuai-oryx/Camel-Bench/blob/main/scripts/eval_qwen.py) to test your model and generate results. |
|
- Create a pull request in the CAMEL-Bench GitHub repository with your results. |
|
|
|
- **Via Email**: |
|
- Send your results to **[email protected]**, and we’ll add them to the leaderboard for you. |
|
|
|
**We look forward to seeing your contributions!** |
|
""") |

demo.launch()