Commit e59258f · Parent(s): 333423b

app.py CHANGED
@@ -8,15 +8,16 @@ from PIL import Image
 from io import BytesIO
 import requests
 import gradio as gr
-
-#model, preprocess = clip.load("ViT-B/32", jit=False)
-#display output photo
+from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
 
 # check if CUDA available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Load the openAI's CLIP model
-model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
+#model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
 # taking photo IDs
 photo_ids = pd.read_csv("./photo_ids.csv")
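The three from_pretrained calls above replace the single clip.load("ViT-B/32") call: CLIPModel carries the weights, while CLIPProcessor and CLIPTokenizer take over the image preprocessing and text tokenization that preprocess and clip.tokenize handled before. A minimal sketch of the same setup, with one assumption not in the commit: the diff still computes device but never moves the model onto it, so the .to(device) and .eval() calls here are added for illustration.

import torch
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# CLIPModel replaces the model returned by clip.load; the processor and
# tokenizer replace the old preprocess function and clip.tokenize.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)  # .to(device) is an assumption
model.eval()  # inference only, so disable dropout
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")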
@@ -44,13 +45,17 @@ def show_output_image(matched_images) :
         #img = Image.open('./photos/'+photo_jpg)
         image.append(img)
     return image
+
 # Encode and normalize the search query using CLIP
 def encode_search_query(search_query, model, device):
     with torch.no_grad():
-        text_encoded = model.encode_text(clip.tokenize(search_query).to(device))
-        text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
+        inputs = tokenizer([search_query], padding=True, return_tensors="pt")
+        #text_encoded = model.encode_text(clip.tokenize(search_query).to(device))
+        #text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
     # Retrieve the feature vector from the GPU and convert it to a numpy array
-    return text_encoded.cpu().numpy()
+    return model.get_text_features(**inputs).cpu().numpy()
+    #return text_encoded.cpu().numpy()
+
 # Find all matched photos
 def find_matches(text_features, photo_features, photo_ids, results_count=4):
     # Compute the similarity between the search query and each photo using the Cosine similarity
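One behavioral difference worth flagging: the clip-based version (kept above as comments) L2-normalized the text features before returning them, while the rewritten function returns model.get_text_features(**inputs) unnormalized, even though find_matches compares vectors by cosine similarity. A hedged sketch of the function with that normalization restored; passing the tokenizer explicitly and moving the inputs to device are additions for illustration, not part of the commit.

import torch

# Encode and normalize the search query using CLIP (transformers API)
def encode_search_query(search_query, model, tokenizer, device):
    with torch.no_grad():
        inputs = tokenizer([search_query], padding=True, return_tensors="pt").to(device)
        text_encoded = model.get_text_features(**inputs)
        # Unit-normalize, as the clip-based version did
        text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
    # Retrieve the feature vector from the GPU and convert it to a numpy array
    return text_encoded.cpu().numpy()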
@@ -84,8 +89,10 @@ def image_search(search_text, search_image, option):
     elif option == "Image-To-Image":
         # Input Image for Search
         with torch.no_grad():
-
-
+            processed_image = processor(text=None, images=search_image, return_tensors="pt", padding=True)["pixel_values"]
+            image_feature = model.get_image_features(processed_image.to(device))
+            image_feature /= image_feature.norm(dim=-1, keepdim=True)
+            image_feature = image_feature.cpu().numpy()
         # Find the matched Images
         matched_images = find_matches(image_feature, photo_features, photo_ids, 4)
         #is_input_image = True