import gradio as gr
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def process_image_and_text(image, text):
    # The textbox provides a plain string; split comma-separated entries
    # into a list of candidate labels for CLIP
    text_list = [t.strip() for t in text.split(",") if t.strip()]

    # Preprocess the image
    image = preprocess(image).unsqueeze(0).to(device)

    # Tokenize the text
    text_tokens = clip.tokenize(text_list).to(device)

    with torch.no_grad():
        # Compute image-text similarity logits and convert them to probabilities
        logits_per_image, _ = model(image, text_tokens)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    return probs

demo = gr.Interface(
    fn=process_image_and_text,
    inputs=[gr.Image(type="pil"), gr.Textbox()],
    outputs="text",
)
demo.launch()