from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import gradio as gr

# Initialize CLIP model and processor
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")


def image_similarity(image: Image.Image, positive_prompt: str, negative_prompt: str):
    # Encode the image against both prompts in a single batch
    inputs = processor(
        text=[positive_prompt, negative_prompt],
        images=image,
        return_tensors="pt",
        padding=True
    )

    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)  # softmax over the two prompts to get label probabilities

    # The image matches the action if the positive prompt scores higher than the negative prompt
    result = probs[0][0] > probs[0][1]
    return bool(result), f"Probabilities: Positive {probs[0][0]:.4f}, Negative {probs[0][1]:.4f}"


interface = gr.Interface(
    fn=image_similarity,
    inputs=[
        gr.components.Image(type="pil"),
        gr.components.Text(label="Enter positive prompt e.g. 'a smiling face'"),
        gr.components.Text(label="Enter negative prompt e.g. 'a sad face'")
    ],
    outputs=[
        gr.components.Textbox(label="Result"),
        gr.components.Textbox(label="Probabilities")
    ],
    title="Engagify's Image Action Detection",
    description=(
        "[Author: Ibrahim Hasani] This method uses CLIP-ViT [Version: BASE-PATCH-32] "
        "to determine whether an action is being performed in an image (binary classifier). "
        "It contrasts an action against a negative label. Ensure the prompts accurately "
        "describe the desired detection."
    ),
    live=False,
    theme=gr.themes.Monochrome(),
)

interface.launch()