import os
import gradio as gr
from transformers import pipeline
import spaces  # Available when deploying on HF Spaces with ZeroGPU
import multiprocessing

# ZeroGPU workers run in subprocesses; "spawn" avoids CUDA re-initialization
# errors that can occur with the default "fork" start method.
multiprocessing.set_start_method("spawn", force=True)
# --- Trending models for image text-to-text tasks ---
TRENDING_MODELS = [
    "Salesforce/blip2-opt-2.7b",
    "Salesforce/blip2-flan-t5-xl",
    "Salesforce/blip-image-captioning-base",
    "Salesforce/blip-image-captioning-large",
    "nlpconnect/vit-gpt2-image-captioning",
    "OFA-Sys/OFA-base",
    "OFA-Sys/OFA-large",
    "dandelin/vilt-b32-finetuned-vqa",
    "dandelin/vilt-b32-mlm",
    "uclanlp/visualbert-vqa-coco-pre",
]
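
# Note: not every checkpoint above targets the "image-to-text" pipeline task.
# The ViLT and VisualBERT entries are VQA/MLM checkpoints and may need the
# "visual-question-answering" task (or custom handling) instead; treat failures
# with them as a model/task mismatch rather than a bug in this app.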
# --- Helper: if the user selects "Custom", use the free-form model identifier instead ---
def resolve_model(chosen, custom):
    if chosen == "Custom":
        return custom.strip()
    return chosen
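# Example (hypothetical values):
#   resolve_model("Custom", "  username/my-model  ")  -> "username/my-model"
#   resolve_model("Salesforce/blip2-opt-2.7b", "")    -> "Salesforce/blip2-opt-2.7b"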
# --- Main inference function ---
# When using ZeroGPU on Hugging Face Spaces, set the environment variable USE_GPU=1.
# The @spaces.GPU() decorator requests a GPU for the duration of the call in a ZeroGPU Space.
@spaces.GPU()
def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom):
    # Determine which model identifiers to use.
    model1_name = resolve_model(model1_choice, model1_custom)
    model2_name = resolve_model(model2_choice, model2_custom)

    # Use GPU (device 0) if USE_GPU is enabled; otherwise fall back to CPU (-1).
    device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1
    # Create image-to-text pipelines.
    # Note: many instruction-following image models (e.g. BLIP-2) accept a text prompt
    # along with the image; the "image-to-text" task forwards it via the `prompt` kwarg.
    pipe1 = pipeline("image-to-text", model=model1_name, device=device)
    pipe2 = pipeline("image-to-text", model=model2_name, device=device)

    # Run inference on the image with the provided prompt. The pipeline expects the
    # prompt as a keyword argument; exact prompt support varies by model.
    output1 = pipe1(image, prompt=prompt)
    output2 = pipe2(image, prompt=prompt)
    # Extract the generated text.
    # (Most image-to-text pipelines return a list of dicts keyed by 'generated_text';
    # anything else is stringified as a fallback.)
    def extract_text(output):
        if isinstance(output, list) and output and isinstance(output[0], dict) and "generated_text" in output[0]:
            return output[0]["generated_text"]
        return str(output)
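    # Typical shape (hypothetical output):
    #   [{'generated_text': 'a cat sitting on a red couch'}] -> 'a cat sitting on a red couch'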
    result1 = extract_text(output1)
    result2 = extract_text(output2)
    # Format results as chat conversations.
    # gr.Chatbot expects a list of (user_message, bot_message) pairs, so each
    # conversation is a single exchange: the prompt and the model's reply.
    chat1 = [(prompt, result1)]
    chat2 = [(prompt, result2)]
    return chat1, chat2
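
# Rebuilding both pipelines on every button click reloads the model weights each time.
# A minimal caching sketch (an assumption, not part of the original app: it trades the
# reload cost for keeping up to four models in memory) could wrap pipeline creation:
from functools import lru_cache

@lru_cache(maxsize=4)
def get_cached_pipeline(model_name, device):
    # Cache key is (model_name, device); repeated comparisons with the same
    # models then reuse the already-loaded pipeline objects.
    return pipeline("image-to-text", model=model_name, device=device)

# To use it, the two pipeline(...) calls above would become
# get_cached_pipeline(model1_name, device) and get_cached_pipeline(model2_name, device).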
# --- Build the Gradio interface ---
# Pre-populated sample prompt.
sample_prompt = "Describe the image in explicit detail. Return a nested JSON object in response."

with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo:
    gr.Markdown(
        """
        # Image Text-to-Text Comparison Tool
        Compare two trending image text-to-text (instruction-following) models side by side.
        Select a model from each dropdown (or choose Custom and enter your own model
        identifier) to see how each model describes the image.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input")
            image_input = gr.Image(label="Upload an Image", type="pil")
            prompt_input = gr.Textbox(label="Text Prompt", value=sample_prompt, lines=3)
        with gr.Column(scale=1):
            gr.Markdown("## Model Selection")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Model 1")
                    model1_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[0],
                        label="Select Model 1"
                    )
                    model1_custom = gr.Textbox(label="Custom Model 1", placeholder="e.g., username/model_name")
                with gr.Column():
                    gr.Markdown("### Model 2")
                    model2_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[1],
                        label="Select Model 2"
                    )
                    model2_custom = gr.Textbox(label="Custom Model 2", placeholder="e.g., username/model_name")

    compare_button = gr.Button("Compare Models")

    gr.Markdown("## Chatbot Outputs (Side-by-Side)")
    with gr.Row():
        chatbot1 = gr.Chatbot(label="Model 1 Chatbot")
        chatbot2 = gr.Chatbot(label="Model 2 Chatbot")
    compare_button.click(
        fn=compare_image_to_text_models,
        inputs=[image_input, prompt_input, model1_choice, model1_custom, model2_choice, model2_custom],
        outputs=[chatbot1, chatbot2]
    )

demo.launch()
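
# Assumed dependencies (a requirements.txt sketch for this Space; the original code
# does not pin versions, so these are illustrative):
#   gradio
#   transformers
#   torch
#   spaces  # ZeroGPU helper package, typically preinstalled on ZeroGPU Spaces
# Locally, the script runs as an ordinary Gradio app, e.g.: python app.py
# (app.py is the conventional entry-point name for a Space, assumed here).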