import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces  # Hugging Face Spaces helper used for the ZeroGPU decorator below

# Available CLIP variants: display name -> (Hugging Face checkpoint, input resolution in pixels)
CLIP_MODELS = {
    "ViT-B/32": ("openai/clip-vit-base-patch32", 224),
    "ViT-B/16": ("openai/clip-vit-base-patch16", 224),
    "ViT-L/14": ("openai/clip-vit-large-patch14", 224),
    "ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
}

# Load every model and processor once at startup and keep the models on the GPU
models = {}
processors = {}

for model_name, (model_path, _) in CLIP_MODELS.items():
    models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
    processors[model_name] = CLIPProcessor.from_pretrained(model_path)

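# On Hugging Face ZeroGPU Spaces, @spaces.GPU requests a GPU for the duration of each call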
@spaces.GPU
def calculate_score(image, text, model_name):
    labels = [label.strip() for label in text.split(";") if label.strip()]
    if not labels:
        return {}
    
    model = models[model_name]
    processor = processors[model_name]
    
    # The processor resizes inputs to the model's expected resolution
    # (224px, or 336px for ViT-L/14@336px), so no manual image sizing is needed
    
    # Preprocess the image and text, then move the tensors to the GPU
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    
    # Calculate scores
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits_per_image = outputs.logits_per_image.cpu().numpy()
    
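    # logits_per_image contains cosine similarities scaled by the model's learned
    # logit_scale (roughly 100 for the OpenAI checkpoints); dividing by 100 gives
    # an approximate per-label similarity for the gr.Label display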
    results_dict = {label: score / 100.0 for label, score in zip(labels, logits_per_image[0])}
    return results_dict

with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP Score")
    gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text using different CLIP model variants")
    
    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()
    
    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16")
    
    def process_inputs(image, text, model_name):
        if image is None or text.strip() == "":
            return None
        return calculate_score(image, text, model_name)
    
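    # Recompute scores whenever the image, the text, or the selected model changes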
    image_input.change(
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label
    )
    
    text_input.submit(
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label
    )
    
    model_dropdown.change(
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label
    )
    
    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
                "ViT-B/16"
            ]
        ],
        fn=process_inputs,
        inputs=[image_input, text_input, model_dropdown],
        outputs=output_label,
    )

demo.launch()