File size: 1,494 Bytes
30d5af0 69aa3f2 30d5af0 cdab2a8 30d5af0 c55abad 30d5af0 69aa3f2 30d5af0 c652364 30d5af0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import numpy as np
import clip
import torch
import gradio as gr
import base64
from PIL import Image
from io import BytesIO
# Load the CLIP ViT-B/32 model directly onto the best available device.
# Passing device= to clip.load avoids materializing the weights on the
# default device first and then copying them with .to(device).
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()  # inference only: freeze dropout / batch-norm behavior
def find_similarity(image_base64, text_input):
    """Return the CLIP cosine similarity between an image and a text.

    Args:
        image_base64: Base64-encoded image bytes. A leading data-URI
            prefix ("data:image/...;base64,") is tolerated and stripped.
        text_input: Free-form text to compare against the image.

    Returns:
        float: cosine similarity of the L2-normalized CLIP embeddings,
        in [-1.0, 1.0].

    Raises:
        binascii.Error: if the base64 payload is malformed.
        PIL.UnidentifiedImageError: if the decoded bytes are not an image.
    """
    # Browsers and JS canvases commonly produce data URIs; keep only
    # the base64 payload after the first comma.
    if image_base64.lstrip().startswith("data:") and "," in image_base64:
        image_base64 = image_base64.split(",", 1)[1]
    # Decode the base64 image string to bytes.
    image_bytes = base64.b64decode(image_base64)
    # Force RGB so palette / RGBA / grayscale inputs match the 3-channel
    # tensor that the CLIP preprocess pipeline expects.
    image = Image.open(BytesIO(image_bytes)).convert("RGB")
    image = preprocess(image).unsqueeze(0).to(device)
    # Tokenize the input text for the CLIP text encoder.
    text_tokens = clip.tokenize([text_input]).to(device)
    # Encode both modalities without building autograd graphs.
    with torch.no_grad():
        image_features = model.encode_image(image).float()
        text_features = model.encode_text(text_tokens).float()
    # Normalize to unit length so the dot product equals cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (text_features @ image_features.T).cpu().numpy()
    # Return a plain Python float so Gradio's "number" output (and JSON
    # serialization generally) does not choke on a numpy scalar.
    return float(similarity[0, 0])
# Build and launch the web UI.
# NOTE: uses the modern Gradio component API — the gr.inputs.* namespace
# was removed in Gradio 3.x and the `interpretation` argument was removed
# in Gradio 4.x, so the legacy forms crash on current installs.
iface = gr.Interface(
    fn=find_similarity,
    inputs=[
        gr.Textbox(lines=3, label="Enter Base64 Image"),
        gr.Textbox(lines=3, label="Enter Text"),
    ],
    outputs="number",
    live=True,
    title="CLIP Model Image-Text Cosine Similarity",
    description="Enter a base64-encoded image and text to find their cosine similarity.",
)
iface.launch()
|