import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import gradio as gr

# Initialize the CLIP model and processor (ViT-B/16 checkpoint)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
def image_similarity(image: Image.Image, positive_prompt: str, negative_prompt: str):
    """Return whether the image matches the positive prompt more than the negative one."""
    inputs = processor(
        text=[positive_prompt, negative_prompt],
        images=image,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)      # softmax over the two prompts
    # True if the positive prompt is more likely than the negative prompt
    result = probs[0][0] > probs[0][1]
    return bool(result), f"Probabilities: Positive {probs[0][0]:.4f}, Negative {probs[0][1]:.4f}"
interface = gr.Interface(
    fn=image_similarity,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Positive prompt (e.g. 'a smiling face')"),
        gr.Textbox(label="Negative prompt (e.g. 'a sad face')"),
    ],
    outputs=[
        gr.Textbox(label="Result"),
        gr.Textbox(label="Probabilities"),
    ],
    title="Engagify's Image Action Detection",
    description=(
        "[Author: Ibrahim Hasani] This method uses CLIP (ViT-B/16) as a binary classifier "
        "to determine whether an action is being performed in an image. It contrasts an "
        "action prompt against a negative prompt, so make sure both prompts accurately "
        "describe the desired detection."
    ),
    live=False,
    theme=gr.themes.Monochrome(),
)
interface.launch()
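
# Minimal usage sketch (kept as comments so it does not interfere with the app):
# calling image_similarity directly on a local file. The filename "example.jpg"
# is a placeholder assumption, not part of this app.
#
#   img = Image.open("example.jpg")
#   is_positive, probs_text = image_similarity(img, "a smiling face", "a sad face")
#   print(is_positive, probs_text)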