import gradio as gr
import gradio.themes.base
from utils import *
from data_utils import *
from datasets import load_dataset

ds = load_dataset("visionLMsftw/vibe-testing-samples", split="train")
evaluation_data = get_evaluation_data(ds)

ds_results = load_dataset("visionLMsftw/vibe-testing-results", split="train")
models = get_model_names(ds_results)
responses = get_responses(ds_results)
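
# Parameter counts (in billions) for the models surfaced in the app.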
model_params = {
    "Qwen/Qwen2.5-VL-32B-Instruct": 32,
    "google/gemma-3-27b-it": 27,
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct": 17,
    "Qwen/Qwen2.5-VL-7B-Instruct": 7,
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct": 2.2,
}
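
# Keep only models with at least `min_params` billion parameters and refresh the dropdown choices.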
def filter_models_by_param(min_params):
    filtered_models = [m for m, p in model_params.items() if p >= min_params]
    selected = filtered_models[0] if filtered_models else None
    return gr.update(choices=filtered_models, value=selected)
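
# Build a small HTML card with the provider, size, and Hub link of the selected model.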
def display_model_details(model_name):
    if model_name not in model_params:
        return "No info available."
    size = model_params[model_name]
    provider = model_name.split("/")[0] if "/" in model_name else "Unknown"
    link = f"https://huggingface.co/{model_name}"
    return f"""
    <div style="margin-top: 10px; font-size: 14px; display: flex; gap: 12px; align-items: center; flex-wrap: wrap;">
        <span><strong>Provider:</strong> {provider}</span>
        <span style="color: #999;">|</span>
        <span><strong>Size:</strong> {size}B</span>
        <span style="color: #999;">|</span>
        <span><strong>Link:</strong> <a href="{link}" target="_blank">{model_name}</a></span>
    </div>
    """
models = list(model_params.keys())
default_category = evaluation_data[0]["category"]
default_example_id = evaluation_data[0]["id"]
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# VLMVibeEval")
    gr.Markdown(
        """
A lightweight leaderboard for evaluating Vision Language Models (VLMs), based on vibes.

Traditional benchmarks often don't give a concrete signal for your use case, and models tend to saturate them. Instead, we let you **vibe test** models across curated, in-the-wild examples:

1. Browse predefined categories with images and prompts.
2. Check any model on these examples.
3. Explore the generations and judge for yourself, since different models have different styles and strengths.

This is not about scores; it's about *how it feels*. You can submit new models in the Community tab and we'll update the app shortly!
        """
    )

    mode = gr.Radio(
        ["View model-wise responses", "Compare model responses on a specific example"],
        label="Mode",
        value="View model-wise responses",
    )
    with gr.Column(visible=True) as model_mode:
        param_slider = gr.Slider(minimum=2, maximum=32, step=1, label="Minimum model parameters (B)")
        selected_model = gr.Dropdown(models, label="Choose model")
        model_info_box = gr.HTML()

        param_slider.change(filter_models_by_param, inputs=param_slider, outputs=selected_model)

        model_category = gr.Dropdown(
            choices=list(set(ex["category"] for ex in evaluation_data)),
            label="Category",
            value=default_category,
        )

        model_output = gr.HTML()
        current_index = gr.State(value=0)
        current_html = gr.State(value="")
        def load_initial(model, category):
            filtered_data = [ex for ex in evaluation_data if ex["category"] == category]
            html = display_model_responses_html(evaluation_data, responses, model, start_index=0, batch_size=5, category=category)
            has_more = 5 < len(filtered_data)
            model_info_html = display_model_details(model)
            return html, 5, html, gr.update(visible=has_more), model_info_html
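
        # Append the next batch of responses to the HTML accumulated so far.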
        def load_more(model, index, html_so_far, category):
            filtered_data = [ex for ex in evaluation_data if ex["category"] == category]
            new_html = display_model_responses_html(evaluation_data, responses, model, start_index=index, batch_size=5, category=category)
            updated_html = html_so_far + new_html
            new_index = index + 5
            has_more = new_index < len(filtered_data)
            return updated_html, new_index, updated_html, gr.update(visible=has_more)
        more_button = gr.Button("Load more")

        selected_model.change(
            load_initial,
            inputs=[selected_model, model_category],
            outputs=[model_output, current_index, current_html, more_button, model_info_box],
        )
        model_category.change(
            load_initial,
            inputs=[selected_model, model_category],
            outputs=[model_output, current_index, current_html, more_button, model_info_box],
        )
        demo.load(
            load_initial,
            inputs=[selected_model, model_category],
            outputs=[model_output, current_index, current_html, more_button, model_info_box],
        )
        more_button.click(
            load_more,
            inputs=[selected_model, current_index, current_html, model_category],
            outputs=[model_output, current_index, current_html, more_button],
        )
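
    # Second mode: compare every model's response on a single example.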
    with gr.Column(visible=False) as example_mode:
        category = gr.Dropdown(
            choices=list(set(ex["category"] for ex in evaluation_data)),
            label="Category",
            value=default_category,
        )
        example = gr.Dropdown(
            label="Example",
            value=default_example_id,
            choices=get_examples_by_category(evaluation_data, default_category),
        )
        example_display = gr.HTML()

        category.change(
            lambda c: gr.update(choices=get_examples_by_category(evaluation_data, c)),
            category,
            example,
        )
        example.change(
            fn=lambda ex_id: display_example_responses_html(evaluation_data, responses, models, ex_id),
            inputs=example,
            outputs=example_display,
        )
        demo.load(
            fn=lambda: display_example_responses_html(evaluation_data, responses, models, default_example_id),
            inputs=None,
            outputs=example_display,
        )
    def switch_mode(selected):
        return {
            model_mode: gr.update(visible=selected == "View model-wise responses"),
            example_mode: gr.update(visible=selected == "Compare model responses on a specific example"),
        }

    mode.change(switch_mode, mode, [model_mode, example_mode])
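
    # Modal overlay plus JS helpers (openImage / closeModal) for viewing images full-size.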
    gr.HTML(r"""
    <style>
        #image-modal {
            display: none;
            position: fixed;
            z-index: 999;
            left: 0; top: 0;
            width: 100%; height: 100%;
            background-color: rgba(0, 0, 0, 0.8);
            align-items: center;
            justify-content: center;
        }
        #image-modal img {
            max-width: 90%;
            max-height: 90%;
            border-radius: 8px;
            box-shadow: 0 0 20px rgba(255,255,255,0.3);
        }
        #image-modal .close {
            position: absolute;
            top: 20px; right: 30px;
            font-size: 32px;
            color: #fff;
            cursor: pointer;
            font-weight: bold;
        }
    </style>

    <div id="image-modal" onclick="closeModal(event)">
        <span class="close" onclick="closeModal(event)">&times;</span>
        <img id="modal-img" src="" alt="Enlarged Image" />
    </div>

    <script>
        function openImage(src) {
            const modal = document.getElementById('image-modal');
            const img = document.getElementById('modal-img');
            img.src = src;
            modal.style.display = 'flex';
        }

        function closeModal(event) {
            if (event.target.id === 'image-modal' || event.target.classList.contains('close')) {
                document.getElementById('image-modal').style.display = 'none';
            }
        }

        // Optional: close on ESC key
        document.addEventListener('keydown', function(e) {
            if (e.key === "Escape") {
                document.getElementById('image-modal').style.display = 'none';
            }
        });
    </script>
    """)

demo.launch()