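# Hugging Face Space app: generates a multi-sentence description of an uploaded
# image by combining BLIP image captioning with DeepFace face analysis
# (age, gender, emotion) and a simple keyword-based clothing extractor.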
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import numpy as np
import cv2
from deepface import DeepFace
import gradio as gr
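# Dependency sketch (an assumption, not part of the original file): the app
# relies roughly on these packages, e.g. listed in a requirements.txt —
#   transformers, torch, deepface (which pulls in tensorflow),
#   opencv-python-headless, gradio, Pillow, numpy
# Exact packages and version pins depend on the Space's runtime environment.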
# Load BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
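# Optional sketch (assumption: torch is available, as the PyTorch BLIP weights
# require it): move the captioning model to GPU when one is present.
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device)
# (the processor's `inputs` would then also need .to(device) before generate())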
# Clothing extractor
def extract_clothing(text):
    colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
    patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
    items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
             'suit', 'sneakers', 'hat', 'scarf', 'uniform']
    found_colors = [c for c in colors if c in text.lower()]
    found_patterns = [p for p in patterns if p in text.lower()]
    found_items = [i for i in items if i in text.lower()]
    return found_colors, found_patterns, found_items
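# Illustrative example (not from the original file): for a caption such as
# "a woman in a red dress and white sneakers", extract_clothing returns
# (['red', 'white'], [], ['dress', 'sneakers']).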
# Main function
def analyze_image(image_pil):
    image_pil = image_pil.convert("RGB")
    image_np = np.array(image_pil)

    # Caption generation
    inputs = processor(image_pil, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    # Convert to BGR for DeepFace
    image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

    # Face detection using DeepFace with RetinaFace backend
    try:
        faces = DeepFace.extract_faces(img_path=image_bgr, detector_backend="retinaface", enforce_detection=False)
        print(f"DeepFace detected {len(faces)} face(s)")
    except Exception as e:
        print("DeepFace error:", e)
        faces = []
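    # Note: each entry returned by extract_faces is a dict with "face",
    # "facial_area" and "confidence" keys; "face" is the cropped face as an RGB
    # array (recent DeepFace releases may return it as floats in [0, 1]).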
    face_infos = []
    for face_data in faces:
        face_crop = face_data["face"]
        try:
            analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False)
            age = analysis[0]['age']
            # Recent DeepFace versions return 'gender' as a probability dict;
            # 'dominant_gender' holds the "Man"/"Woman" label used below.
            gender = analysis[0].get('dominant_gender', analysis[0]['gender'])
            emotion = analysis[0]['dominant_emotion']
            if age < 13:
                age_group = "child"
            elif age < 20:
                age_group = "teen"
            elif age < 60:
                age_group = "adult"
            else:
                age_group = "senior"
            face_infos.append({
                "age": age,
                "gender": gender,
                "age_group": age_group,
                "emotion": emotion
            })
        except Exception:
            continue
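    # Illustrative face_infos entry (example values, not real output):
    # {"age": 31, "gender": "Woman", "age_group": "adult", "emotion": "happy"}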
    # Summary stats
    num_faces = len(face_infos)
    gender_counts = {"Man": 0, "Woman": 0}
    age_summary = {}
    emotion_summary = {}
    for face in face_infos:
        gender = face['gender']
        age_group = face['age_group']
        emotion = face['emotion']
        # .get() avoids a KeyError if an unexpected gender label is returned
        gender_counts[gender] = gender_counts.get(gender, 0) + 1
        age_summary[age_group] = age_summary.get(age_group, 0) + 1
        emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1
    # Clothing info from caption
    colors, patterns, items = extract_clothing(caption)

    # Generate 15 sentences
    sentences = []
    sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".")
    sentences.append(f"The image contains {num_faces} visible face(s) detected using DeepFace (RetinaFace backend).")

    gender_desc = []
    if gender_counts["Man"] > 0:
        gender_desc.append(f"{gender_counts['Man']} male(s)")
    if gender_counts["Woman"] > 0:
        gender_desc.append(f"{gender_counts['Woman']} female(s)")
    if gender_desc:
        sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".")
    else:
        sentences.append("Gender analysis was inconclusive.")

    if age_summary:
        age_list = [f"{count} {group}(s)" for group, count in age_summary.items()]
        sentences.append("Age groups represented include " + ", ".join(age_list) + ".")
    else:
        sentences.append("No conclusive age groupings found.")

    if emotion_summary:
        emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()]
        sentences.append("Facial expressions include " + ", ".join(emo_list) + ".")
    else:
        sentences.append("Emotion detection yielded limited results.")

    if colors or patterns or items:
        cloth_parts = []
        if colors:
            cloth_parts.append(f"colors like {', '.join(colors)}")
        if patterns:
            cloth_parts.append(f"patterns such as {', '.join(patterns)}")
        if items:
            cloth_parts.append(f"items like {', '.join(items)}")
        sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".")
    else:
        sentences.append("Clothing details were not clearly identified.")

    if num_faces > 0:
        sentences.append("Faces are distributed naturally across the image.")
        sentences.append("Differences in face size suggest variation in distance from the camera.")
        sentences.append("Hairstyles appear diverse, from short to tied-back styles.")
        sentences.append("Lighting emphasizes certain facial features and expressions.")
        sentences.append("Some individuals face the camera while others look away.")
        sentences.append("Mood diversity is reflected in the variety of facial expressions.")
        sentences.append("The clothing style appears casual or semi-formal.")
    else:
        sentences.append("No visible faces were found to analyze further visual characteristics.")

    sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.")

    return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])
# Gradio Interface
demo = gr.Interface(
    fn=analyze_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="📝 15-Sentence Detailed Description"),
    title="🖼️ Image Analysis with BLIP + DeepFace",
    description="Upload an image to get a detailed 15-sentence description of facial features, age, gender, clothing, and more."
)

demo.launch()