"""Gradio app for the CAMEL-Bench leaderboard: model performance across
vision understanding tasks (OCR, charts, video, medical imaging, and more)."""
import gradio as gr
import pandas as pd
# Benchmark scores per task category for each evaluated model.
data = {
    "Method": [
        "AIN-7B", "GPT-4o", "GPT-4o-mini", "Qwen2-VL-7B", "Gemini-1.5-Pro", "Gemini-1.5-Flash",
        "LLaVa-OneVision-7B", "Pangea-7B-Instruct", "Qwen2-VL-2B", "InternVL2-8B", "LLaVa-NeXt-7B", "Maya-8B"
    ],
    "MM Understanding & Reasoning": [
        56.78, 55.15, 48.83, 48.76, 46.67, 45.58, 42.90, 40.09, 40.59, 30.41, 26.33, 39.07
    ],
    "OCR & Document Understanding": [
        72.35, 54.98, 39.38, 42.73, 36.59, 33.59, 31.35, 17.75, 25.68, 15.91, 19.12, 26.70
    ],
    "Video Understanding": [
        64.09, 69.65, 66.28, 61.97, 42.94, 53.31, 29.41, 49.01, 38.90, 51.42, 44.90, 47.23
    ],
    "Remote Sensing Understanding": [
        45.92, 27.36, 16.93, 21.30, 17.07, 14.95, 10.72, 6.67, 12.56, 5.36, 8.33, 27.53
    ],
    "Charts & Diagram Understanding": [
        64.10, 62.35, 56.37, 54.67, 47.06, 48.25, 40.86, 38.75, 27.83, 30.27, 27.56, 34.25
    ],
    "Agro Specific": [
        85.05, 80.75, 78.80, 79.32, 72.12, 76.06, 75.03, 74.51, 52.02, 44.47, 42.00, 70.61
    ],
    "Cultural Specific Understanding": [
        78.09, 80.86, 65.92, 75.96, 56.24, 46.54, 66.02, 20.34, 34.27, 20.88, 28.30, 57.42
    ],
    "Medical Imaging": [
        43.77, 49.91, 47.37, 35.81, 33.77, 42.86, 27.29, 31.99, 29.12, 29.48, 22.54, 31.57
    ],
}
df = pd.DataFrame(data)

# Average across all task columns, then place the result right after the model name.
df["Average Score"] = df.iloc[:, 1:].mean(axis=1).round(2)
df = df[["Method", "Average Score"] + [col for col in df.columns if col not in ("Method", "Average Score")]]
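# Optional ranking step (an assumption, not in the original script): leaderboards
# typically list models best-first, so sort rows by the computed average score.
df = df.sort_values("Average Score", ascending=False).reset_index(drop=True)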
with gr.Blocks() as demo:
    gr.Markdown("", elem_id="camel-icon")  # Placeholder: replace with the actual camel icon URL
    gr.Markdown("# **CAMEL-Bench: Model Performance Across Vision Understanding Tasks**")
    gr.Markdown("""
    This table shows the performance of different models across tasks including OCR,
    chart understanding, video, medical imaging, and more.
    """)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            gr.Dataframe(value=df, label="CAMEL-Bench Model Performance", interactive=False)
        with gr.TabItem("📤 How to Submit", elem_id="submission-tab", id=1):
            gr.Markdown("""
            ## Submission Instructions

            To contribute your model's results to the CAMEL-Bench leaderboard:

            - **Via GitHub pull request**:
              - Use [this evaluation script](https://github.com/mbzuai-oryx/Camel-Bench/blob/main/scripts/eval_qwen.py) to evaluate your model and generate results.
              - Open a pull request in the CAMEL-Bench GitHub repository with your results.
            - **Via email**:
              - Send your results to **[email protected]**, and we'll add them to the leaderboard for you.

            **We look forward to seeing your contributions!**
            """)

demo.launch()