rfmantoan committed on
Commit 27b3217 · 1 Parent(s): 8b39863
utils/data_preprocessing.py ADDED
@@ -0,0 +1,29 @@
+ import pandas as pd
+
+ def load_data(path='catalog_1k.xlsx'):
+     # Read the product catalog from the Excel file
+     catalog = pd.read_excel(path)
+     return catalog
+
+ def preprocess_data(catalog):
+     # Clean description
+     catalog['Description'] = catalog['Description'].str.replace('\n', '')
+
+     # Id column to integer
+     catalog['Id'] = pd.to_numeric(catalog['Id'], errors='coerce').astype('Int64')
+
+     # Map gender
+     catalog['Gender'] = catalog['Gender'].map({1: 'Women', 2: 'Men', 3: 'Unisex'})
+
+     # Drop sub-sub-categories
+     catalog = catalog.drop(['L3'], axis=1)
+
+     # Drop items without gender
+     catalog = catalog.dropna(subset=['Gender'])
+
+     # Use best image link
+     catalog['Image'] = catalog['Image'].str.split(',').str[-1]
+
+     # Convert the columns to strings before joining them
+     catalog["SimpleMetadata"] = catalog[["L1", "L2", "Gender", "MaterialName", "BrandName", "Name"]].astype(str).agg(', '.join, axis=1)
+
+     return catalog
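A minimal usage sketch for this module; the only assumption is that catalog_1k.xlsx sits in the working directory, as in the path hard-coded above:

from utils.data_preprocessing import load_data, preprocess_data

# Load the raw spreadsheet, then apply the cleaning steps defined above
catalog = preprocess_data(load_data('catalog_1k.xlsx'))
print(catalog[['Id', 'Gender', 'SimpleMetadata']].head())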
utils/embedding_generation.py ADDED
@@ -0,0 +1,169 @@
+ import torch
+ from PIL import Image
+ from utils.load_models import fclip_model, fclip_processor
+ from utils.load_models import siglip_model, siglip_preprocess_train, siglip_preprocess_val, siglip_tokenizer
+
+ def get_info(catalog, column):
+     image_paths = []
+     text_descriptions = []
+
+     for index, row in catalog.iterrows():
+         path = "/content/drive/MyDrive/images/" + str(row["Id"]) + ".jpg"
+         image_paths.append(path)
+         text_descriptions.append(row[column])
+
+     return image_paths, text_descriptions
+
+ def normalize_embedding(embedding):
+     norm = torch.norm(embedding, p=2, dim=-1, keepdim=True).item()  # Get the norm before normalization
+     embedding = embedding / norm
+     return embedding.detach().cpu().numpy()
+
+ def normalize_embeddings(embeddings):
+     norm = torch.norm(embeddings, p=2, dim=-1, keepdim=True)
+     normalized_embeddings = embeddings / norm
+     return normalized_embeddings
+
+ def generate_fclip_embeddings(image_paths, texts, batch_size, alpha):
+     image_embeds_list = []
+     text_embeds_list = []
+
+     # Batch processing loop
+     for i in range(0, len(image_paths), batch_size):
+         batch_image_paths = image_paths[i:i + batch_size]
+         batch_texts = texts[i:i + batch_size]
+
+         # Load and preprocess batch of images and texts
+         images = [Image.open(path).convert("RGB") for path in batch_image_paths]
+
+         # Set the maximum sequence length to 77 to match the position embeddings
+         inputs = fclip_processor(text=batch_texts, images=images, return_tensors="pt", padding=True, truncation=True, max_length=77)
+
+         # Move inputs to the GPU
+         if torch.cuda.is_available():
+             inputs = {k: v.to("cuda") for k, v in inputs.items()}  # Move inputs to GPU
+
+         # Generate embeddings
+         with torch.no_grad():
+             outputs = fclip_model(**inputs)
+
+         image_embeds_list.append(outputs.image_embeds)
+         text_embeds_list.append(outputs.text_embeds)
+
+     # Concatenate all embeddings
+     image_embeds = torch.cat(image_embeds_list, dim=0)
+     text_embeds = torch.cat(text_embeds_list, dim=0)
+
+     # Normalize embeddings
+     image_embeds = normalize_embeddings(image_embeds)
+     text_embeds = normalize_embeddings(text_embeds)
+
+     # Average embeddings
+     avg_embeds = (image_embeds + text_embeds) / 2
+     weighted_avg_embeds = alpha * image_embeds + (1 - alpha) * text_embeds
+     avg_embeds = normalize_embeddings(avg_embeds)
+     weighted_avg_embeds = normalize_embeddings(weighted_avg_embeds)
+
+     return image_embeds.cpu().numpy(), text_embeds.cpu().numpy(), avg_embeds.cpu().numpy(), weighted_avg_embeds.cpu().numpy()
+
+ def generate_siglip_embeddings(image_paths, texts, batch_size, alpha):
+     image_embeds_list = []
+     text_embeds_list = []
+
+     # Batch processing loop
+     for i in range(0, len(image_paths), batch_size):
+         batch_image_paths = image_paths[i:i + batch_size]
+         batch_texts = texts[i:i + batch_size]
+
+         # Load and preprocess batch of images and texts
+         images = [siglip_preprocess_val(Image.open(image_path).convert('RGB')).unsqueeze(0) for image_path in batch_image_paths]
+         images = torch.cat(images)
+
+         tokens = siglip_tokenizer(batch_texts)
+
+         # Move images to the same device as the model weights (GPU if available)
+         if torch.cuda.is_available():
+             images = images.cuda()
+             tokens = tokens.cuda()
+
+         # Generate embeddings
+         with torch.no_grad():
+             image_embeddings_batch = siglip_model.encode_image(images)
+             text_embeddings_batch = siglip_model.encode_text(tokens)
+
+         # Store embeddings
+         image_embeds_list.append(image_embeddings_batch)
+         text_embeds_list.append(text_embeddings_batch)
+
+     # Concatenate all embeddings
+     image_embeds = torch.cat(image_embeds_list, dim=0)
+     text_embeds = torch.cat(text_embeds_list, dim=0)
+
+     # Normalize embeddings
+     image_embeds = normalize_embeddings(image_embeds)
+     text_embeds = normalize_embeddings(text_embeds)
+
+     # Average embeddings
+     avg_embeds = (image_embeds + text_embeds) / 2
+     weighted_avg_embeds = alpha * image_embeds + (1 - alpha) * text_embeds
+     avg_embeds = normalize_embeddings(avg_embeds)
+     weighted_avg_embeds = normalize_embeddings(weighted_avg_embeds)
+
+     return image_embeds.cpu().numpy(), text_embeds.cpu().numpy(), avg_embeds.cpu().numpy(), weighted_avg_embeds.cpu().numpy()
+
+ # Function to process text embedding for any model
+ def generate_text_embedding(model, tokenizer, query, model_type):
+     if model_type == "fashionCLIP":
+         # Process the text with the tokenizer and move to GPU
+         inputs = tokenizer(text=query, return_tensors="pt", padding=True, truncation=True, max_length=77)
+
+         if torch.cuda.is_available():
+             inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+         # Get text embedding from the model
+         text_embed = model.get_text_features(**inputs)
+     elif model_type == "fashionSigLIP":
+         # Tokenize text and move to GPU
+         tokens = tokenizer(query)
+
+         if torch.cuda.is_available():
+             tokens = tokens.to("cuda")
+
+         # Get text embedding from the model
+         text_embed = model.encode_text(tokens)
+     else:
+         raise ValueError("Invalid model type. Choose 'fashionCLIP' or 'fashionSigLIP'.")
+
+     return normalize_embedding(text_embed)
+
+ # Function to process image embedding for any model
+ def generate_image_embedding(model, processor, image_path, model_type):
+     image = Image.open(image_path).convert("RGB")
+
+     if model_type == "fashionCLIP":
+         # Preprocess image for FashionCLIP and move to GPU
+         inputs = processor(images=image, return_tensors="pt")
+
+         if torch.cuda.is_available():
+             inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+         # Get image embedding from the model
+         image_embed = model.get_image_features(**inputs)
+     elif model_type == "fashionSigLIP":
+         # Preprocess image for SigLIP and move to GPU
+         image_tensor = processor(image).unsqueeze(0)
+
+         if torch.cuda.is_available():
+             image_tensor = image_tensor.to("cuda")
+
+         # Get image embedding from the model
+         image_embed = model.encode_image(image_tensor)
+     else:
+         raise ValueError("Invalid model type. Choose 'fashionCLIP' or 'fashionSigLIP'.")
+
+     return normalize_embedding(image_embed)
+
+ # Unified function to generate embeddings for both models and query types
+ def generate_query_embedding(query, query_type, model, processor, tokenizer, model_type):
+     if query_type == "text":
+         return generate_text_embedding(model, tokenizer, query, model_type)
+     elif query_type == "image":
+         return generate_image_embedding(model, processor, query, model_type)
+     else:
+         raise ValueError("Invalid query type. Choose 'text' or 'image'.")
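A hedged sketch of how these batch helpers could be driven end to end; the batch size and alpha weighting below are illustrative choices, not values fixed by this commit:

from utils.data_preprocessing import load_data, preprocess_data
from utils.embedding_generation import get_info, generate_fclip_embeddings

catalog = preprocess_data(load_data())
image_paths, texts = get_info(catalog, "SimpleMetadata")

# 32-item batches and an even image/text weighting (alpha=0.5) are arbitrary here
img_emb, txt_emb, avg_emb, w_avg_emb = generate_fclip_embeddings(image_paths, texts, batch_size=32, alpha=0.5)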
utils/load_models.py ADDED
@@ -0,0 +1,20 @@
+ import torch
+ import open_clip
+ from transformers import CLIPProcessor, CLIPModel
+
+ fclip_model = None
+ fclip_processor = None
+ siglip_model = None
+ siglip_tokenizer = None
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ fclip_model = CLIPModel.from_pretrained("patrickjohncyh/fashion-clip")
+ fclip_processor = CLIPProcessor.from_pretrained("patrickjohncyh/fashion-clip")
+
+ siglip_model, siglip_preprocess_train, siglip_preprocess_val = open_clip.create_model_and_transforms('hf-hub:Marqo/marqo-fashionSigLIP')
+ siglip_tokenizer = open_clip.get_tokenizer('hf-hub:Marqo/marqo-fashionSigLIP')
+
+ if torch.cuda.is_available():
+     fclip_model.to(device)
+     siglip_model.to(device)
utils/refine_metadata.py ADDED
@@ -0,0 +1,72 @@
+ import torch
+ from PIL import Image
+ from transformers import BitsAndBytesConfig, pipeline
+
+ def ask_llava(metadata, image_path):
+     """
+     Get a refined item description from LLaVA, given the item metadata and its image.
+     """
+
+     # Unpack metadata
+     category = metadata.get('category', '')
+     subcategory = metadata.get('subcategory', '')
+     material = metadata.get('material', '')
+     gender = metadata.get('gender', '')
+     brand = metadata.get('brand', '')
+     name = metadata.get('name', '')
+
+     # Build the prompt for LLaVA
+     image = Image.open(image_path)
+     #prompt = f"""USER: <image>\nYou are an expert in fashion and visual analysis. Given the following metadata and an image, use your knowledge of fashion trends, styles, colors, gender preferences and brand information as well as your ability to describe, analyze and understand the image of the item to refine the metadata. Your goal is to improve the embedding process for models like FashionCLIP and Marqo-FashionSigLIP by creating a more nuanced and detailed description that would boost the performance of the models. Metadata Provided: - Category: {category} - Subcategory: {subcategory} - Material: {material} - Gender: {gender} - Brand: {brand} - Name: {name} - Description: {description} Refine and expand the metadata by incorporating information from the image and about the fashion item's style, cut, pattern, color scheme, brand, and any notable details. Include insights on current fashion trends and how the item fits within those trends. Be mindful that it should be around 77 tokens only; therefore, try to be concise and keep the description direct and useful for text-to-image and text-to-text search. Return the refined metadata as a single paragraph.\nASSISTANT:"""
+     prompt = f"""USER: <image>\nYou are an expert in fashion and visual analysis. Given the following metadata and an image, return the enhanced metadata structured in a single sentence with each field separated by a comma (do not include the field name, just use the same order). Keep it very concise and simple but make it more understandable for embedding models that will be used for search purposes. Also do a color analysis and add an extra field for the color of the item. Metadata Provided: - Category: {category} - Subcategory: {subcategory} - Material: {material} - Gender: {gender} - Brand: {brand} - Name: {name}.\nASSISTANT:"""
+
+     # Generate description
+     outputs = img2text_pipeline(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
+
+     description = outputs[0]["generated_text"]
+
+     # Keep only the model's answer after the "ASSISTANT: " marker
+     description = description.split("ASSISTANT: ")
+
+     return description[1]
+
+ def refine_metadata(catalog, column):
+     catalog[column] = ""
+
+     # Iterate over the DataFrame and process each item
+     for index, row in catalog.iterrows():
+
+         metadata = {
+             'category': row['L1'],
+             'subcategory': row['L2'],
+             'material': row['MaterialName'],
+             'gender': row['Gender'],
+             'brand': row['BrandName'],
+             'name': row['Name'],
+             'description': row['Description']
+         }
+
+         # Ensure the image ID is converted to a string
+         #image_path = "/content/drive/MyDrive/images/" + str(row["Id"]) + ".jpg"
+         image_path = "/images/" + str(row["Id"]) + ".jpg"
+
+         # Generate the image description using LLaVA
+         refined_metadata = ask_llava(metadata, image_path)
+
+         # Store results back in the DataFrame
+         catalog.at[index, column] = refined_metadata
+
+     return catalog
+
+ img2text_pipeline = None
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16
+ )
+
+ model_id = "llava-hf/llava-1.5-7b-hf"
+
+ if torch.cuda.is_available():
+     img2text_pipeline = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})
+ else:
+     img2text_pipeline = pipeline("image-to-text", model=model_id)
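A short usage sketch; the "RefinedMetadata" column name is a hypothetical choice, and images are expected at /images/<Id>.jpg as hard-coded above:

from utils.refine_metadata import refine_metadata

# Adds an LLaVA-refined description per row (one pipeline call per item, so this is slow)
catalog = refine_metadata(catalog, "RefinedMetadata")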
utils/search_functions.py ADDED
@@ -0,0 +1,80 @@
+ import pandas as pd
+ from utils.vector_database import search_in_milvus, fashionclip_collection, fashionsiglip_collection
+ from utils.embedding_generation import generate_query_embedding
+ from utils.load_models import fclip_model, fclip_processor
+ from utils.load_models import siglip_model, siglip_preprocess_val, siglip_tokenizer
+
+ # Function to dynamically select the Milvus collection and search field
+ def get_milvus_collection_and_field(model_type, embedding_type):
+     # Map the model type to its Milvus collection
+     if model_type == "fashionCLIP":
+         collection = fashionclip_collection
+     elif model_type == "fashionSigLIP":
+         collection = fashionsiglip_collection
+     else:
+         raise ValueError("Invalid model type. Choose 'fashionCLIP' or 'fashionSigLIP'.")
+
+     # Map the embedding type to the vector field stored in the collection
+     field_map = {
+         "text": "text_embedding",
+         "image": "image_embedding",
+         "average": "avg_embedding",
+         "weighted average": "weighted_avg_embedding",
+     }
+     if embedding_type not in field_map:
+         raise ValueError("Invalid embedding type. Choose 'text', 'image', 'average' or 'weighted average'.")
+     search_field = field_map[embedding_type]
+
+     return collection, search_field
+
+ # Function to handle the complete search flow
+ def search(query, query_type, model_type, embedding_type):
+     # Step 1: Generate the query embedding based on the user input and model type
+     if model_type == "fashionCLIP":
+         query_embedding = generate_query_embedding(query, query_type, fclip_model, fclip_processor, fclip_processor, "fashionCLIP")
+     elif model_type == "fashionSigLIP":
+         query_embedding = generate_query_embedding(query, query_type, siglip_model, siglip_preprocess_val, siglip_tokenizer, "fashionSigLIP")
+
+     # Step 2: Get the appropriate Milvus collection and search field
+     collection, search_field = get_milvus_collection_and_field(model_type, embedding_type)
+
+     # Step 3: Perform search in Milvus using the query embedding
+     search_results = search_in_milvus(collection, search_field, query_embedding, top_k=10)
+
+     # Step 4: Extract images, similarity scores, and metadata from the search results
+     images = [result['image'] for result in search_results]
+     scores = [result['similarity_score'] for result in search_results]
+     metadata = [result['metadata'] for result in search_results]
+
+     return images, scores, metadata
+
+ # Function to run the search and get results for both models
+ def run_search(query_type, embedding_type, query_input_text, query_input_image):
+     if query_type == "text":
+         query = query_input_text
+     else:
+         query = query_input_image
+
+     # Perform search for FashionCLIP
+     fclip_images, fclip_scores, fclip_metadata = search(query, query_type, "fashionCLIP", embedding_type)
+
+     # Perform search for Marqo-FashionSigLIP
+     siglip_images, siglip_scores, siglip_metadata = search(query, query_type, "fashionSigLIP", embedding_type)
+
+     # Convert scores and metadata into a pandas DataFrame for each model
+     fclip_results_df = pd.DataFrame({
+         "Score": fclip_scores,
+         "Metadata": fclip_metadata,
+     })
+
+     siglip_results_df = pd.DataFrame({
+         "Score": siglip_scores,
+         "Metadata": siglip_metadata,
+     })
+
+     return fclip_images, fclip_results_df, siglip_images, siglip_results_df
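A brief usage sketch for the search flow; the query string is purely illustrative and assumes the Milvus collections have already been populated with the embeddings generated above:

from utils.search_functions import run_search

# Text query against the weighted-average embeddings of both models
fclip_images, fclip_df, siglip_images, siglip_df = run_search(
    query_type="text",
    embedding_type="weighted average",
    query_input_text="red leather handbag",  # hypothetical query
    query_input_image=None,
)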