import torch
from PIL import Image
from utils.load_models import fclip_model, fclip_processor
from utils.load_models import siglip_model, siglip_preprocess_train, siglip_preprocess_val, siglip_tokenizer

def get_info(catalog, column):
    # Collect image paths and text descriptions from the catalog DataFrame.
    # Each row's "Id" is assumed to map to an image at /home/user/app/images/<Id>.jpg.
    image_paths = []
    text_descriptions = []

    for _, row in catalog.iterrows():
        path = "/home/user/app/images/" + str(row["Id"]) + ".jpg"
        image_paths.append(path)
        text_descriptions.append(row[column])

    return image_paths, text_descriptions

def normalize_embedding(embedding):
    # L2-normalize a single embedding of shape (1, D) and return it as a NumPy array
    norm = torch.norm(embedding, p=2, dim=-1, keepdim=True)
    embedding = embedding / norm
    return embedding.detach().cpu().numpy()

def normalize_embeddings(embeddings):
    norm = torch.norm(embeddings, p=2, dim=-1, keepdim=True)
    normalized_embeddings = embeddings / norm
    return normalized_embeddings

def generate_fclip_embeddings(image_paths, texts, batch_size, alpha):
    image_embeds_list = []
    text_embeds_list = []

    # Batch processing loop
    for i in range(0, len(image_paths), batch_size):
        batch_image_paths = image_paths[i:i + batch_size]
        batch_texts = texts[i:i + batch_size]

        # Load and preprocess batch of images and texts
        images = [Image.open(path).convert("RGB") for path in batch_image_paths]

        # Set the maximum sequence length to 77 to match the position embeddings
        inputs = fclip_processor(text=batch_texts, images=images, return_tensors="pt", padding=True, truncation=True, max_length=77)

        # Move inputs to the GPU if one is available
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        # Generate embeddings
        with torch.no_grad():
            outputs = fclip_model(**inputs)

        image_embeds_list.append(outputs.image_embeds)
        text_embeds_list.append(outputs.text_embeds)

    # Concatenate all embeddings
    image_embeds = torch.cat(image_embeds_list, dim=0)
    text_embeds = torch.cat(text_embeds_list, dim=0)

    # Normalize embeddings
    image_embeds = normalize_embeddings(image_embeds)
    text_embeds = normalize_embeddings(text_embeds)

    # Fuse image and text embeddings (simple and alpha-weighted averages), then re-normalize
    avg_embeds = (image_embeds + text_embeds) / 2
    weighted_avg_embeds = alpha * image_embeds + (1 - alpha) * text_embeds
    avg_embeds = normalize_embeddings(avg_embeds)
    weighted_avg_embeds = normalize_embeddings(weighted_avg_embeds)

    return image_embeds.cpu().numpy(), text_embeds.cpu().numpy(), avg_embeds.cpu().numpy(), weighted_avg_embeds.cpu().numpy()

def generate_siglip_embeddings(image_paths, texts, batch_size, alpha):
    image_embeds_list = []
    text_embeds_list = []

    # Batch processing loop
    for i in range(0, len(image_paths), batch_size):
        batch_image_paths = image_paths[i:i + batch_size]
        batch_texts = texts[i:i + batch_size]

        # Load and preprocess batch of images and texts
        images = [siglip_preprocess_val(Image.open(image_path).convert('RGB')).unsqueeze(0) for image_path in batch_image_paths]
        images = torch.cat(images)

        tokens = siglip_tokenizer(batch_texts)

        # Move images and tokens to the GPU if one is available
        if torch.cuda.is_available():
            images = images.cuda()
            tokens = tokens.cuda()

        # Generate embeddings
        with torch.no_grad():
            image_embeddings_batch = siglip_model.encode_image(images)
            text_embeddings_batch = siglip_model.encode_text(tokens)

        # Store embeddings
        image_embeds_list.append(image_embeddings_batch)
        text_embeds_list.append(text_embeddings_batch)
        
    # Concatenate all embeddings
    image_embeds = torch.cat(image_embeds_list, dim=0)
    text_embeds = torch.cat(text_embeds_list, dim=0)

    # Normalize embeddings
    image_embeds = normalize_embeddings(image_embeds)
    text_embeds = normalize_embeddings(text_embeds)

    # Fuse image and text embeddings (simple and alpha-weighted averages), then re-normalize
    avg_embeds = (image_embeds + text_embeds) / 2
    weighted_avg_embeds = alpha * image_embeds + (1 - alpha) * text_embeds
    avg_embeds = normalize_embeddings(avg_embeds)
    weighted_avg_embeds = normalize_embeddings(weighted_avg_embeds)

    return image_embeds.cpu().numpy(), text_embeds.cpu().numpy(), avg_embeds.cpu().numpy(), weighted_avg_embeds.cpu().numpy()
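
# Since all embeddings returned above are L2-normalized, cosine similarity between a
# query and the catalog reduces to a plain dot product. The helper below is an
# illustrative sketch only (not part of the original pipeline): `catalog_embeds` is
# assumed to be an (N, D) NumPy array from one of the generators above, and
# `query_embed` the (1, D) array produced by generate_query_embedding.
def top_k_matches(query_embed, catalog_embeds, k=5):
    import numpy as np  # local import keeps this optional helper self-contained

    scores = catalog_embeds @ np.asarray(query_embed).ravel()  # cosine similarities, shape (N,)
    return np.argsort(-scores)[:k]  # indices of the k most similar catalog items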

# Function to process text embedding for any model
def generate_text_embedding(model, tokenizer, query, model_type):
    if model_type == "fashionCLIP":
        # Tokenize the text and move the inputs to the GPU if one is available
        inputs = tokenizer(text=query, return_tensors="pt", padding=True, truncation=True, max_length=77)

        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        # Get the text embedding from the model
        with torch.no_grad():
            text_embed = model.get_text_features(**inputs)
    elif model_type == "fashionSigLIP":
        # Tokenize the text and move the tokens to the GPU if one is available
        tokens = tokenizer(query)

        if torch.cuda.is_available():
            tokens = tokens.to("cuda")

        # Get the text embedding from the model
        with torch.no_grad():
            text_embed = model.encode_text(tokens)
    else:
        raise ValueError("Invalid model type. Choose 'fashionCLIP' or 'fashionSigLIP'.")

    return normalize_embedding(text_embed)

# Function to process image embedding for any model
def generate_image_embedding(model, processor, image_path, model_type):
    image = Image.open(image_path).convert("RGB")

    if model_type == "fashionCLIP":
        # Preprocess the image for FashionCLIP and move the inputs to the GPU if one is available
        inputs = processor(images=image, return_tensors="pt")

        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        # Get the image embedding from the model
        with torch.no_grad():
            image_embed = model.get_image_features(**inputs)
    elif model_type == "fashionSigLIP":
        # Preprocess the image for SigLIP and move the tensor to the GPU if one is available
        image_tensor = processor(image).unsqueeze(0)

        if torch.cuda.is_available():
            image_tensor = image_tensor.to("cuda")

        # Get the image embedding from the model
        with torch.no_grad():
            image_embed = model.encode_image(image_tensor)
    else:
        raise ValueError("Invalid model type. Choose 'fashionCLIP' or 'fashionSigLIP'.")

    return normalize_embedding(image_embed)

# Unified function to generate embeddings for both models and query types
def generate_query_embedding(query, query_type, model, processor, tokenizer, model_type):
    if query_type == "text":
        return generate_text_embedding(model, tokenizer, query, model_type)
    elif query_type == "image":
        return generate_image_embedding(model, processor, query, model_type)
    else:
        raise ValueError("Invalid query type. Choose 'text' or 'image'.")
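
# The block below is a hypothetical end-to-end usage sketch, not part of the original
# module: the catalog path, column name, and hyperparameters (batch_size, alpha) are
# illustrative assumptions, and it reuses fclip_processor as the tokenizer argument,
# assuming it accepts text-only calls (as a transformers CLIPProcessor does).
if __name__ == "__main__":
    import pandas as pd

    # Catalog with an "Id" column matching the image filenames and a text column.
    catalog = pd.read_csv("catalog.csv")  # hypothetical path
    image_paths, texts = get_info(catalog, "Description")  # hypothetical column name

    # Catalog embeddings with FashionCLIP, weighting images slightly over text.
    img_emb, txt_emb, avg_emb, weighted_emb = generate_fclip_embeddings(
        image_paths, texts, batch_size=32, alpha=0.6
    )

    # Embed a text query and retrieve the closest catalog items.
    query_emb = generate_query_embedding(
        "red floral summer dress", "text",
        fclip_model, fclip_processor, fclip_processor, "fashionCLIP",
    )
    print(top_k_matches(query_emb, weighted_emb, k=5))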