import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces
# Dictionary of available CLIP models with their image sizes
CLIP_MODELS = {
"ViT-B/32": ("openai/clip-vit-base-patch32", 224),
"ViT-B/16": ("openai/clip-vit-base-patch16", 224),
"ViT-L/14": ("openai/clip-vit-large-patch14", 224),
"ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
}
# Initialize models and processors
models = {}
processors = {}
for model_name, (model_path, _) in CLIP_MODELS.items():
    # Load each model/processor pair once at startup and move the model to the GPU
    models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
    processors[model_name] = CLIPProcessor.from_pretrained(model_path)
@spaces.GPU
def calculate_score(image, text, model_name):
    # Split the semicolon-separated descriptions into a clean list of labels
    labels = text.split(";")
    labels = [l.strip() for l in labels]
    labels = list(filter(None, labels))
    if len(labels) == 0:
        return dict()

    model = models[model_name]
    processor = processors[model_name]

    # Look up the model's expected image size; the processor resizes the image
    # to this resolution internally, so the value is kept here for reference only.
    _, image_size = CLIP_MODELS[model_name]

    # Preprocess the image and text
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Calculate image-text similarity scores
    with torch.no_grad():
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image.cpu().numpy()

    # The released OpenAI CLIP checkpoints use a logit scale of 100, so dividing
    # by 100 gives a score close to the raw image-text cosine similarity.
    results_dict = {label: float(score) / 100.0 for label, score in zip(labels, logits_per_image[0])}
    return results_dict
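
# Usage sketch (illustrative only, not part of the Gradio flow): calling the
# scoring function directly with a PIL image (`pil_image` is a hypothetical
# variable) returns a dict mapping each label to its score, e.g.
#   scores = calculate_score(pil_image, "a cat; a dog", "ViT-B/16")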
with gr.Blocks() as demo:
gr.Markdown("# Multi-Model CLIP Score")
gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text using different CLIP model variants")
with gr.Row():
image_input = gr.Image(type="pil")
output_label = gr.Label()
with gr.Row():
text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
model_dropdown = gr.Dropdown(choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16")
def process_inputs(image, text, model_name):
if image is None or text.strip() == "":
return None
return calculate_score(image, text, model_name)
image_input.change(
fn=process_inputs,
inputs=[image_input, text_input, model_dropdown],
outputs=output_label
)
text_input.submit(
fn=process_inputs,
inputs=[image_input, text_input, model_dropdown],
outputs=output_label
)
model_dropdown.change(
fn=process_inputs,
inputs=[image_input, text_input, model_dropdown],
outputs=output_label
)
gr.Examples(
examples=[
[
"cat.jpg",
"a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
"ViT-B/16"
]
],
fn=process_inputs,
inputs=[image_input, text_input, model_dropdown],
outputs=output_label,
)
demo.launch() |
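
# To run this file locally (assuming it is saved as app.py and that torch,
# transformers, gradio, and the Hugging Face `spaces` package are installed):
#   python app.py
# Gradio serves the demo on http://localhost:7860 by default; the spaces.GPU
# decorator is intended for ZeroGPU Spaces hardware.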