|
import numpy as np |
|
import clip |
|
import torch |
|
import gradio as gr |
|
from PIL import Image |
|
import os |
|
import base64 |
|
from io import BytesIO |
|
|
|
|
|
# Pick the compute device up front: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-B/32 network together with its matching image transform.
model, preprocess = clip.load("ViT-B/32")

# Place the network on the chosen device, then switch it to evaluation mode.
model.to(device)
model.eval()


# NOTE(review): not referenced anywhere in the visible code -- confirm it is still needed.
Business_Listing = "Air Guide"
|
|
|
def find_similarity(image_base64, text_input):
    """Return the CLIP cosine similarity between an image and a text prompt.

    Args:
        image_base64: The image as a base64 string. A browser-style data URI
            (``data:image/png;base64,<payload>``) is accepted as well; its
            header is dropped before decoding.
        text_input: The text to compare against the image.

    Returns:
        The cosine similarity of the normalized image and text embeddings,
        as a plain Python ``float``.

    Raises:
        binascii.Error: If the base64 payload is incorrectly padded.
        PIL.UnidentifiedImageError: If the decoded bytes are not an image
            that Pillow can identify.
    """
    payload = image_base64.strip()
    if payload.startswith("data:"):
        # Base64 never contains a comma, so everything after the first comma
        # of a data URI is the payload. Without this step the header
        # characters would be decoded as if they were image data.
        payload = payload.partition(",")[2]

    image_bytes = base64.b64decode(payload)

    # Decode and preprocess while the PIL image is open so it is released
    # as soon as the tensor has been built.
    with Image.open(BytesIO(image_bytes)) as pil_image:
        image_tensor = preprocess(pil_image).unsqueeze(0).to(device)

    text_tokens = clip.tokenize([text_input]).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_tensor).float()
        text_features = model.encode_text(text_tokens).float()

        # Scale both embeddings to unit length so their dot product is the
        # cosine similarity.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        similarity = text_features @ image_features.T

    # One image and one text give a 1x1 matrix; return its single entry as a
    # built-in float rather than a numpy/torch scalar.
    return similarity[0, 0].item()
|
|
|
|
|
# Web UI around find_similarity. Gradio passes one positional argument per
# input component, so the list must match the function's two parameters in
# order: the base64 image first, then the text prompt.
iface = gr.Interface(
    fn=find_similarity,
    inputs=[
        gr.inputs.Textbox(lines=3, label="Enter Base64 Image"),
        "text",
    ],
    outputs="number",
    live=True,
    interpretation="default",
    title="CLIP Model Image-Text Cosine Similarity",
    description="Enter a base64-encoded image and text to find their cosine similarity.",
)

iface.launch()
|
|