import gradio as gr
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
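
# Load the LAION OpenCLIP ViT-H/14 checkpoint and its matching preprocessor from the Hugging Face Hub.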
model = CLIPModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")


def calculate_similarity(image, text_prompt, similarity_type):
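    """Return a sentence describing how similar the image and text prompt are under OpenCLIP."""
    # Preprocess the image and tokenize the prompt into model-ready tensors.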
    inputs = processor(images=image, text=text_prompt, return_tensors="pt", padding=True)

    # A single forward pass yields the projected image and text embeddings; no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
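
    # L2-normalize both embeddings; their cosine similarity then lies in [-1, 1].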
    image_features = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
    text_features = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)
    cosine_similarity = torch.nn.functional.cosine_similarity(image_features, text_features)

    if similarity_type == "General Similarity (3x scaled)":
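        # Scale the raw cosine score by 3 into a percentage and cap the displayed value at 99.99%.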
        adjusted_similarity = cosine_similarity.item() * 3 * 100
        result_text = f"According to OpenCLIP, the image and the text prompt have a general similarity of {min(adjusted_similarity, 99.99):.2f}%."
    else:
        result_text = f"According to OpenCLIP, the image and the text prompt have a cosine similarity of {cosine_similarity.item() * 100:.2f}%."

    return result_text


# Build the Gradio UI: an image upload, a text prompt, and a dropdown selecting how the score is reported.
iface = gr.Interface(
    fn=calculate_similarity,
    inputs=[
        gr.Image(type="pil", label="Upload Image", height=512),
        gr.Textbox(label="Text Prompt"),
        gr.Dropdown(
            label="Similarity Type",
            choices=["General Similarity (3x scaled)", "Cosine Similarity (raw)"],
            value="General Similarity (3x scaled)",
        ),
    ],
    outputs=gr.Text(),
    allow_flagging="never",
    title="OpenCLIP Similarity Calculator",
    description="Upload an image and provide a text prompt to calculate the similarity.",
)
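
# share=True also serves a temporary public gradio.live link alongside the local URL.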
iface.launch(share=True)
|