Spaces:

909ahmed
/

CLIP

Sleeping

909ahmed committed on Jul 30, 2024

Commit

c13ea66

verified ·

1 Parent(s): d0bdc68

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,24 +2,31 @@ import gradio as gr
 import torch
 import clip
 from PIL import Image
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model, preprocess = clip.load("ViT-B/32", device=device)
-def clip(image, text):
-    text = text.tobytes().decode('utf-8').split(',')
     image = preprocess(image).unsqueeze(0).to(device)
-    text = clip.tokenize(text).to(device)
     with torch.no_grad():
         image_features = model.encode_image(image)
-        text_features = model.encode_text(text)
-        logits_per_image, logits_per_text = model(image, text)
         probs = logits_per_image.softmax(dim=-1).cpu().numpy()
     return probs
-demo = gr.Interface(fn=clip, inputs=["text", "image"], outputs="text")
-demo.launch()

 import torch
 import clip
 from PIL import Image
+import numpy as np
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model, preprocess = clip.load("ViT-B/32", device=device)
+def process_image_and_text(image, text):
+    """Score an image against candidate captions with CLIP.
+
+    Args:
+        image: PIL image supplied by the Gradio image input.
+        text: Candidate captions. A Gradio textbox delivers a plain ``str``,
+            which is split on commas; a list/tuple of strings or an object
+            exposing ``tolist()`` (e.g. a NumPy array) is also accepted.
+
+    Returns:
+        NumPy array with the softmax of the image-to-text logits over the
+        captions (taken along the last axis).
+
+    Raises:
+        ValueError: If no image or no non-empty caption is provided.
+    """
+    if image is None:
+        raise ValueError("An image is required.")
+    # ``str`` has no ``tolist()``, so textbox input must be split explicitly.
+    if text is None:
+        text_list = []
+    elif isinstance(text, str):
+        text_list = text.split(",")
+    elif hasattr(text, "tolist"):
+        text_list = text.tolist()
+    else:
+        text_list = list(text)
+    # ``tolist()`` on a 0-d array yields a scalar rather than a list.
+    if not isinstance(text_list, (list, tuple)):
+        text_list = [text_list]
+    # Drop surrounding whitespace and empty entries such as a trailing comma.
+    text_list = [str(label).strip() for label in text_list]
+    text_list = [label for label in text_list if label]
+    if not text_list:
+        raise ValueError("Provide at least one caption, separated by commas.")
+    # Preprocess the image
     image = preprocess(image).unsqueeze(0).to(device)
+    # Tokenize the text
+    text_tokens = clip.tokenize(text_list).to(device)
     with torch.no_grad():
+        # The forward pass encodes both inputs itself, so separate
+        # encode_image/encode_text calls would only repeat the work.
+        logits_per_image, logits_per_text = model(image, text_tokens)
         probs = logits_per_image.softmax(dim=-1).cpu().numpy()
     return probs
+# ``gr.inputs`` was deprecated in Gradio 3 and removed in Gradio 4; the
+# top-level component classes are the supported way to declare inputs.
+demo = gr.Interface(
+    fn=process_image_and_text,
+    inputs=[
+        gr.Image(type="pil"),
+        gr.Textbox(label="Captions (comma-separated)"),
+    ],
+    outputs="text",
+)
+demo.launch()