from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import gradio as gr

# Initialize CLIP model and processor
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")


def image_similarity(image: Image.Image, positive_prompt: str, negative_prompt: str):
    # Encode the image against both prompts in a single batch
    inputs = processor(
        text=[positive_prompt, negative_prompt],
        images=image,
        return_tensors="pt",
        padding=True
    )

    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # image-text similarity scores
    probs = logits_per_image.softmax(dim=1)  # softmax over the two prompts to get label probabilities

    # The image matches the action if the positive prompt scores higher than the negative prompt
    result = probs[0][0] > probs[0][1]
    return bool(result), f"Probabilities: Positive {probs[0][0]:.4f}, Negative {probs[0][1]:.4f}"


interface = gr.Interface(
    fn=image_similarity,
    inputs=[
        gr.components.Image(type="pil"),
        gr.components.Text(label="Enter positive prompt e.g. 'a smiling face'"),
        gr.components.Text(label="Enter negative prompt e.g. 'a sad face'")
    ],
    outputs=[
        gr.components.Textbox(label="Result"),
        gr.components.Textbox(label="Probabilities")
    ],
    title="Engagify's Image Action Detection",
    description=(
        "[Author: Ibrahim Hasani] This method uses CLIP-ViT [Version: BASE-PATCH-32] "
        "to determine whether an action is being performed in an image (binary classifier). "
        "It contrasts an action against a negative label. Ensure the prompts accurately "
        "describe the desired detection."
    ),
    live=False,
    theme=gr.themes.Monochrome(),
)

interface.launch()