from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import numpy as np
import cv2
from deepface import DeepFace
import gradio as gr
# Load BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
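# Both checkpoints are downloaded from the Hugging Face Hub on first run
# and cached locally for subsequent launches.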
# Clothing extractor
def extract_clothing(text):
    colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
    patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
    items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
             'suit', 'sneakers', 'hat', 'scarf', 'uniform']
    text = text.lower()
    found_colors = [c for c in colors if c in text]
    found_patterns = [p for p in patterns if p in text]
    found_items = [i for i in items if i in text]
    return found_colors, found_patterns, found_items
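# Illustrative example (not from the original source): for the caption
# "a man in a red striped shirt", extract_clothing returns
# (['red'], ['striped'], ['shirt']). Note the naive substring matching:
# a caption containing "t-shirt" also matches the plain 'shirt' entry.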
# Main function
def analyze_image(image_pil):
    image_pil = image_pil.convert("RGB")
    image_np = np.array(image_pil)

    # Caption generation
    inputs = processor(image_pil, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    # Convert to BGR for DeepFace (OpenCV channel order)
    image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

    # Face detection using DeepFace with the RetinaFace backend
    try:
        faces = DeepFace.extract_faces(img_path=image_bgr, detector_backend="retinaface", enforce_detection=False)
        print(f"DeepFace detected {len(faces)} face(s)")
    except Exception as e:
        print("DeepFace error:", e)
        faces = []
    face_infos = []
    for face_data in faces:
        # extract_faces returns the crop as floats in [0, 1] in recent
        # DeepFace releases; scale back to uint8 before analyzing
        face_crop = (face_data["face"] * 255).astype(np.uint8)
        try:
            analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False)
            age = analysis[0]['age']
            # 'dominant_gender' is the predicted label ("Man"/"Woman");
            # the 'gender' key holds the raw class probabilities
            gender = analysis[0]['dominant_gender']
            emotion = analysis[0]['dominant_emotion']
            if age < 13:
                age_group = "child"
            elif age < 20:
                age_group = "teen"
            elif age < 60:
                age_group = "adult"
            else:
                age_group = "senior"
            face_infos.append({
                "age": age,
                "gender": gender,
                "age_group": age_group,
                "emotion": emotion
            })
        except Exception:
            continue
    # Summary stats
    num_faces = len(face_infos)
    gender_counts = {"Man": 0, "Woman": 0}
    age_summary = {}
    emotion_summary = {}
    for face in face_infos:
        gender = face['gender']
        age_group = face['age_group']
        emotion = face['emotion']
        # .get() guards against an unexpected gender label from DeepFace
        gender_counts[gender] = gender_counts.get(gender, 0) + 1
        age_summary[age_group] = age_summary.get(age_group, 0) + 1
        emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1
    # Clothing info from the caption
    colors, patterns, items = extract_clothing(caption)

    # Build the numbered description (up to 14 sentences, depending on
    # how many faces were found)
    sentences = []
    sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".")
    sentences.append(f"The image contains {num_faces} visible face(s) detected using DeepFace (RetinaFace backend).")

    gender_desc = []
    if gender_counts["Man"] > 0:
        gender_desc.append(f"{gender_counts['Man']} male(s)")
    if gender_counts["Woman"] > 0:
        gender_desc.append(f"{gender_counts['Woman']} female(s)")
    if gender_desc:
        sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".")
    else:
        sentences.append("Gender analysis was inconclusive.")

    if age_summary:
        age_list = [f"{count} {group}(s)" for group, count in age_summary.items()]
        sentences.append("Age groups represented include " + ", ".join(age_list) + ".")
    else:
        sentences.append("No conclusive age groupings found.")

    if emotion_summary:
        emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()]
        sentences.append("Facial expressions include " + ", ".join(emo_list) + ".")
    else:
        sentences.append("Emotion detection yielded limited results.")

    if colors or patterns or items:
        cloth_parts = []
        if colors:
            cloth_parts.append(f"colors like {', '.join(colors)}")
        if patterns:
            cloth_parts.append(f"patterns such as {', '.join(patterns)}")
        if items:
            cloth_parts.append(f"items like {', '.join(items)}")
        sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".")
    else:
        sentences.append("Clothing details were not clearly identified.")

    if num_faces > 0:
        sentences.append("Faces are distributed naturally across the image.")
        sentences.append("Differences in face size suggest variation in distance from the camera.")
        sentences.append("Hairstyles appear diverse, from short to tied-back styles.")
        sentences.append("Lighting emphasizes certain facial features and expressions.")
        sentences.append("Some individuals face the camera while others look away.")
        sentences.append("Mood diversity is reflected in the variety of facial expressions.")
        sentences.append("The clothing style appears casual or semi-formal.")
    else:
        sentences.append("No visible faces were found to analyze further visual characteristics.")

    sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.")

    return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])
# Gradio interface
demo = gr.Interface(
    fn=analyze_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="📝 Detailed Description"),
    title="🖼️ Image Analysis with BLIP + DeepFace",
    description="Upload an image to get a detailed, sentence-by-sentence description of facial features, age, gender, emotion, clothing, and more."
)
demo.launch()
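# Quick sanity check without the web UI (hypothetical local path; run this
# instead of demo.launch() when testing):
#   print(analyze_image(Image.open("example.jpg")))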