RobotJelly committed on
Commit
fb23d7b
·
1 Parent(s): 547056a
Files changed (1)
  1. app.py +7 -24
app.py CHANGED
@@ -13,11 +13,12 @@ from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Load the openAI's CLIP model
-#model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
 model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
 processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
+model = model.to(device)
+
 # taking photo IDs
 photo_ids = pd.read_csv("./photo_ids.csv")
 photo_ids = list(photo_ids['photo_id'])
@@ -32,32 +33,18 @@ def show_output_image(matched_images) :
     image=[]
     for photo_id in matched_images:
         photo_image_url = f"https://unsplash.com/photos/{photo_id}/download?w=280"
-        #photo_image_url = f"https://unsplash.com/photos/{photo_id}?w=640"
-        #photo_image_url = f"https://unsplash.com/photos/{photo_id}?ixid=2yJhcHBfaWQiOjEyMDd9&fm=jpg"
-        #photo_found = photos[photos["photo_id"] == photo_id].iloc[0]
-        #response = requests.get(photo_found["photo_image_url"] + "?w=640")
         response = requests.get(photo_image_url, stream=True)
         img = Image.open(BytesIO(response.content))
-        #return img
-        #photo_jpg = photo_id + '.jpg'
-        #image_path = './photos/'
-        #img = Image.open('./photos/'+photo_jpg)
         image.append(img)
     return image
 
 # Encode and normalize the search query using CLIP
-def encode_search_query(search_query, model, device):
+def encode_search_query(search_query, model):
     with torch.no_grad():
-        inputs = tokenizer([search_query], padding=True, return_tensors="pt")
-        #text_encoded = model.encode_text(clip.tokenize(search_query).to(device))
-        #text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
-        # Retrieve the feature vector from the GPU and convert it to a numpy array
-        #text_features = model.get_text_features(**inputs).detach().numpy()
-        #text_features = model.get_text_features(**inputs).cpu().numpy()
+        #inputs = tokenizer([search_query], padding=True, return_tensors="pt")
+        inputs = processor(text=[search_query], images=None, return_tensors="pt", padding=True)
         text_features = model.get_text_features(**inputs).detach().numpy()
-    return np.array(text_features)
-    #return text_features
-    #return text_encoded.cpu().numpy()
+    return text_features
 
 # Find all matched photos
 def find_matches(text_features, photo_features, photo_ids, results_count=4):
@@ -70,14 +57,12 @@ def find_matches(text_features, photo_features, photo_ids, results_count=4):
 
 def image_search(search_text, search_image, option):
 
-    #model = model.to(device)
-
     # Input Text Query
     #search_query = "The feeling when your program finally works"
 
     if option == "Text-To-Image" :
         # Extracting text features
-        text_features = encode_search_query(search_text, model, device)
+        text_features = encode_search_query(search_text, model)
 
         # Find the matched Images
         matched_images = find_matches(text_features, photo_features, photo_ids, 4)
@@ -89,11 +74,9 @@ def image_search(search_text, search_image, option):
         processed_image = processor(text=None, images=search_image, return_tensors="pt", padding=True)["pixel_values"]
         image_feature = model.get_image_features(processed_image.to(device))
         image_feature /= image_feature.norm(dim=-1, keepdim=True)
-        #image_feature = image_feature.cpu().numpy()
         image_feature = image_feature.detach().numpy()
         # Find the matched Images
         matched_images = find_matches(image_feature, photo_features, photo_ids, 4)
-        #is_input_image = True
         return show_output_image(matched_images)
 
 gr.Interface(fn=image_search,
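
For reference, a minimal standalone sketch of the text-to-image query path after this change. It assumes the same openai/clip-vit-base-patch32 checkpoint; the encode_text helper, the "./features.npy" path, the sample query, and the cosine-similarity ranking are illustrative stand-ins, not the repository's actual find_matches implementation.

import numpy as np
import torch
from transformers import CLIPModel, CLIPProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def encode_text(query):
    # Tokenize the query with the processor (as in this commit) and keep tensors
    # on the same device as the model.
    inputs = processor(text=[query], images=None, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        features = model.get_text_features(**inputs)
    # Normalize so a dot product against normalized photo features is a cosine similarity.
    features = features / features.norm(dim=-1, keepdim=True)
    return features.cpu().numpy()

# Illustrative ranking against a hypothetical precomputed, normalized feature matrix
# of shape [num_photos, 512], aligned row-for-row with photo_ids.csv.
photo_features = np.load("./features.npy")
text_features = encode_text("two dogs playing in the snow")
scores = (photo_features @ text_features.T).squeeze(1)
best = np.argsort(-scores)[:4]  # indices of the top-4 matching photos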