import os
import gradio as gr
from transformers import pipeline
import spaces  # This module is available when deploying on HF Spaces with ZeroGPU

# --- Trending models for image text-to-text tasks ---
# Note: not every entry below is compatible with the "image-to-text" pipeline
# (e.g. the ViLT/VisualBERT VQA checkpoints target other tasks); incompatible
# selections will raise an error when the pipeline is created.
TRENDING_MODELS = [
    "Salesforce/blip2-opt-2.7b",
    "Salesforce/blip2-flan-t5-xl",
    "Salesforce/blip-image-captioning-base",
    "Salesforce/blip-image-captioning-large",
    "nlpconnect/vit-gpt2-image-captioning",
    "OFA-Sys/OFA-base",
    "OFA-Sys/OFA-large",
    "dandelin/vilt-b32-finetuned-vqa",
    "dandelin/vilt-b32-mlm",
    "uclanlp/visualbert-vqa-coco-pre"
]

# --- Helper: resolve the model identifier ("Custom" means: use the free-text field) ---
def resolve_model(chosen, custom):
    if chosen == "Custom":
        return custom.strip()
    else:
        return chosen
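
# Illustrative behaviour of resolve_model (example values, not from the app):
#   resolve_model("Salesforce/blip2-opt-2.7b", "")   -> "Salesforce/blip2-opt-2.7b"
#   resolve_model("Custom", "  username/my-model  ") -> "username/my-model"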

# --- Main inference function ---
# If you are running on a ZeroGPU Space, set the environment variable USE_GPU=1 so the
# pipelines are placed on the GPU. The @spaces.GPU() decorator requests a GPU for the
# duration of the call in a ZeroGPU Space.
@spaces.GPU()
def compare_image_to_text_models(image, prompt, model1_choice, model1_custom, model2_choice, model2_custom):
    # Determine which model identifiers to use.
    model1_name = resolve_model(model1_choice, model1_custom)
    model2_name = resolve_model(model2_choice, model2_custom)

    # Set device to GPU (0) if USE_GPU is enabled; otherwise use CPU (-1)
    device = 0 if os.environ.get("USE_GPU", "0") == "1" else -1

    # Create "image-to-text" pipelines for both models.
    # Note: many instruction-following image models (e.g. BLIP-2) accept a text prompt
    # along with the image; the pipeline forwards it via the `prompt` keyword at call time.
    pipe1 = pipeline("image-to-text", model=model1_name, device=device)
    pipe2 = pipeline("image-to-text", model=model2_name, device=device)
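    # Note: pipelines are constructed on every call, so each comparison pays the full
    # model download/load cost; caching them (e.g. in a module-level dict keyed by
    # model name) is a possible optimization, not something this app does.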

    # Run inference on the image with the provided prompt.
    # The prompt must be passed as a keyword argument; a bare positional argument is
    # ignored by the pipeline. Models without conditional generation simply caption the image.
    output1 = pipe1(image, prompt=prompt)
    output2 = pipe2(image, prompt=prompt)

    # Extract the generated text.
    # (Many pipelines return a list of dicts with key 'generated_text'; if not, we simply convert the output to a string.)
    def extract_text(output):
        if isinstance(output, list) and len(output) > 0 and isinstance(output[0], dict) and "generated_text" in output[0]:
            return output[0]["generated_text"]
        else:
            return str(output)
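    # Illustrative example (assumed output shape):
    #   extract_text([{"generated_text": "a dog on a bench"}]) -> "a dog on a bench"
    # Any other output shape simply falls back to str(output).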
    
    result1 = extract_text(output1)
    result2 = extract_text(output2)

    # Format results as chat conversations.
    # gr.Chatbot expects a list of (user_message, bot_message) pairs.
    chat1 = [(prompt, result1)]
    chat2 = [(prompt, result2)]
    return chat1, chat2

# --- Build the Gradio interface ---
# Pre-populated sample prompt.
sample_prompt = "Describe the image in explicit detail. Return a nested JSON object in response."

with gr.Blocks(title="Image Text-to-Text Comparison Tool") as demo:
    gr.Markdown(
        """
        # Image Text-to-Text Comparison Tool  
        Compare two trending image text-to-text (instruction-following) models side-by-side.  
        Select a model from the dropdown (or choose Custom to enter your own model identifier) and see how it describes the image.
        """
    )
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input")
            image_input = gr.Image(label="Upload an Image", type="pil")
            prompt_input = gr.Textbox(label="Text Prompt", value=sample_prompt, lines=3)
        with gr.Column(scale=1):
            gr.Markdown("## Model Selection")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Model 1")
                    model1_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[0],
                        label="Select Model 1"
                    )
                    model1_custom = gr.Textbox(label="Custom Model 1", placeholder="e.g., username/model_name")
                with gr.Column():
                    gr.Markdown("### Model 2")
                    model2_choice = gr.Dropdown(
                        choices=TRENDING_MODELS + ["Custom"],
                        value=TRENDING_MODELS[1],
                        label="Select Model 2"
                    )
                    model2_custom = gr.Textbox(label="Custom Model 2", placeholder="e.g., username/model_name")
    
    compare_button = gr.Button("Compare Models")
    
    gr.Markdown("## Chatbot Outputs (Side-by-Side)")
    with gr.Row():
        chatbot1 = gr.Chatbot(label="Model 1 Chatbot")
        chatbot2 = gr.Chatbot(label="Model 2 Chatbot")
    
    compare_button.click(
        fn=compare_image_to_text_models,
        inputs=[image_input, prompt_input, model1_choice, model1_custom, model2_choice, model2_custom],
        outputs=[chatbot1, chatbot2]
    )

demo.launch()