# Import libraries
from pathlib import Path
import pickle

import torch
from PIL import Image
import gradio as gr
from sentence_transformers import SentenceTransformer, util

# Use CUDA if available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Directory with the locally stored photos referenced by the pre-computed embeddings
IMAGES_DIR = Path("./photos/")

# Load the CLIP model via sentence-transformers
model = SentenceTransformer('clip-ViT-B-32', device=device)

# Load the pre-computed image embeddings (file names + CLIP embeddings for the 25k Unsplash photos)
emb_filename = 'unsplash-25k-photos-embeddings.pkl'
with open(emb_filename, 'rb') as emb:
    img_names, img_emb = pickle.load(emb)
 
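# The embeddings file above ships with the app. As a minimal sketch (not part of the original
# app and never called here), an equivalent file could be rebuilt from the local photo folder,
# assuming the photos referenced by img_names live in IMAGES_DIR as plain .jpg files:
def build_image_embeddings(images_dir: Path, out_file: str, batch_size: int = 128):
    # Encode every .jpg with the same CLIP model and pickle (file_names, embeddings)
    # in the format expected by the loading code above. For 25k photos this loads all
    # images into memory at once; a real rebuild would process them in chunks.
    file_names = sorted(p.name for p in images_dir.glob("*.jpg"))
    images = [Image.open(images_dir / name) for name in file_names]
    embeddings = model.encode(images, batch_size=batch_size,
                              convert_to_tensor=True, show_progress_bar=True)
    with open(out_file, 'wb') as f:
        pickle.dump((file_names, embeddings), f)
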
def display_matches(similarity, topk):
    # Pick the indices of the top-k most similar photos and load them from the local folder
    best_matched_images = []
    top_k_indices = torch.topk(similarity, topk, 0).indices
    for matched_image in top_k_indices:
        img = Image.open(IMAGES_DIR / img_names[matched_image.item()])
        best_matched_images.append(img)
    return best_matched_images
  
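# Alternative ranking path, shown only for illustration: sentence-transformers also provides
# util.semantic_search, which returns the top-k corpus ids and scores directly. The helper
# below is an assumed, unused variant of display_matches built on that API.
def display_matches_semantic(query_emb, topk):
    # semantic_search returns one hit list per query; each hit carries 'corpus_id' and 'score'
    hits = util.semantic_search(query_emb, img_emb, top_k=topk)[0]
    return [Image.open(IMAGES_DIR / img_names[hit['corpus_id']]) for hit in hits]
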
def image_search(Option, topk, search_text, search_image):
    # The Top-K dropdown uses type="index", so its value is zero-based: 0 means "top 1 image"
    topk = topk + 1

    if Option == "Text-To-Image":
        # Embed the text query and rank all photos by cosine similarity
        text_emb = model.encode([search_text], convert_to_tensor=True)
        similarity = util.cos_sim(img_emb, text_emb)
        return display_matches(similarity, topk)
    elif Option == "Image-To-Image":
        # The uploaded image arrives as a PIL image (type="pil" in the interface)
        # and can be embedded directly with the same CLIP model
        image_emb = model.encode([search_image], convert_to_tensor=True)
        similarity = util.cos_sim(img_emb, image_emb)
        return display_matches(similarity, topk)
  
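# Quick programmatic smoke test (assumed usage; not executed automatically):
#   imgs = image_search("Text-To-Image", 0, "two dogs playing in the snow", None)  # index 0 -> top-1 image
#   imgs[0].show()
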
gr.Interface(fn=image_search, title="Search Image",
             description="Enter a text query or an image to find the most relevant matching images.",
             article="""
                     Instructions:
                      1. Select the option: `Text-To-Image` OR `Image-To-Image`.
                      2. Enter the text query or upload the image accordingly.
                      3. As soon as the text or image is entered, the matching images appear on the right side.

                      Note: after entering the text, a different/unexpected image may appear first, but after a moment the correct results are shown.
                     """,
             theme="huggingface",
             inputs=[gr.inputs.Dropdown(["Text-To-Image", "Image-To-Image"]),
                     gr.inputs.Dropdown(["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], type="index", default="1", label="Select Top K Images"),
                     gr.inputs.Textbox(lines=3, label="Input Text", placeholder="Enter the text..."),
                     gr.inputs.Image(type="pil", optional=True)
                     ],
             outputs=gr.outputs.Carousel([gr.outputs.Image(type="pil")]),
             enable_queue=True
             ).launch(debug=True, share=True)
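
# Launch notes (assumed local usage): running `python app.py` starts the demo; debug=True prints
# errors to the console, and share=True additionally creates a temporary public share link
# alongside the local URL.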