import gradio as gr
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
import torch

# Load the pre-trained CLIP model and its tokenizer
model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)

# Load the fashion product images dataset from Hugging Face
dataset = load_dataset("ashraq/fashion-product-images-small")
deepfashion_database = dataset["train"]

# Define the preprocessing function for images
def preprocess_image(image):
    preprocess = Compose([
        Resize(256, interpolation=Image.BICUBIC),
        CenterCrop(224),
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
    return preprocess(image).unsqueeze(0)

# Define a function to process the image and text inputs
def initial_query(image, text):
    return process_query(image, text, deepfashion_database)

def process_query(image, text, database):
    image_tensor = preprocess_image(image)
    inputs = processor(text, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(input_ids=inputs["input_ids"], pixel_values=image_tensor)
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=-1)
        similarities = probs.squeeze()

    product_scores = []
    for product in database:
        product_image = Image.open(product["image_path"]).convert("RGB")
        product_image_tensor = preprocess_image(product_image)
        product_text = product["description"]
        product_inputs = processor(product_text, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            product_outputs = model(input_ids=product_inputs["input_ids"], pixel_values=product_image_tensor)
            product_logits_per_image = product_outputs.logits_per_image
            product_probs = product_logits_per_image.softmax(dim=-1)
            product_similarity = product_probs.squeeze().item()

        product_scores.append((product, product_similarity))

    top_3_products = sorted(product_scores, key=lambda x: x[1], reverse=True)[:3]
    return top_3_products

def refine_query(selected_product_index, additional_text, initial_results):
    selected_product = initial_results[selected_product_index]
    modified_description = selected_product["description"] + " " + additional_text
    refined_product = {"description": modified_description, "image_path": selected_product["image_path"]}
    refined_database = [product for i, product in enumerate(initial_results) if i != selected_product_index]
    refined_database.append(refined_product)
    return process_query(Image.open(selected_product["image_path"]).convert("RGB"), modified_description, refined_database)


def generate_output_html(products):
    html_output = "<ol>"
    for product in products:
        html_output += f'<li><img src="{product[0]["image_path"]}" width="100" height="100"><br>{product[0]["description"]}</li>'
    html_output += "</ol>"
    return html_output

def initial_query_wrapper(image, text):
    top_3_products = initial_query(image, text)
    return generate_output_html(top_3_products),

def refine_query_wrapper(selected_product_index, additional_text, initial_results):
    top_3_products = refine_query(selected_product_index, additional_text, initial_results)
    return generate_output_html(top_3_products),

def product_search_wrapper(image=None, text=None, selected_product_index=None, additional_text=None):
    if image is not None and text is not None:
        top_3_products = initial_query(image, text)
        return generate_output_html(top_3_products),
    elif selected_product_index is not None and additional_text is not None:
        top_3_products = refine_query(selected_product_index, additional_text)
        return generate_output_html(top_3_products),
    else:
        return "",

iface = gr.Interface(
    fn=product_search_wrapper,
    inputs=[
        gr.inputs.Image(optional=True),
        gr.inputs.Textbox(lines=3, label="Initial Text Query", optional=True),
        gr.inputs.Radio(["0", "1", "2"], label="Select Product Index", optional=True),
        gr.inputs.Textbox(lines=3, label="Additional Text Query", optional=True)
    ],
    outputs=[
        gr.outputs.HTML(label="Results")
    ],
    title="Product Search",
    description="Find the best matching products using images and text queries.",
    layout="vertical"
)

# iface = gr.Interface(
#     fn=[initial_query_wrapper, refine_query_wrapper],
#     inputs=[
#         [gr.inputs.Image(), gr.inputs.Textbox(lines=3, label="Initial Text Query")],
#         [gr.inputs.Radio(["0", "1", "2"], label="Select Product Index"), gr.inputs.Textbox(lines=3, label="Additional Text Query"), gr.inputs.Hidden(initial_results="initial_query")]
#     ],
#     outputs=[
#         gr.outputs.HTML(label="Top 3 Matches"),
#         gr.outputs.HTML(label="Refined Top 3 Matches")
#     ],
#     title="Product Search",
#     description="Find the best matching products using images and text queries.",
#     layout="vertical"
# )

iface.launch()