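"""Describe an image with BLIP captioning plus DeepFace face analysis.

BLIP generates a scene caption, DeepFace (RetinaFace backend) detects faces
and estimates age, gender, and emotion, and simple keyword matching pulls
clothing hints out of the caption. The combined result is rendered as a
numbered description in a Gradio app.
"""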
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import numpy as np
import cv2
from deepface import DeepFace
import gradio as gr

# Load BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
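# Both models are fetched from the Hugging Face Hub on first run and execute on
# CPU by default; move them to a GPU with model.to("cuda") if one is available.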

# Clothing extractor
def extract_clothing(text):
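    """Scan a caption for clothing colors, patterns, and item keywords.

    Matching is naive substring search, so e.g. 'shirt' also fires on
    't-shirt'; adequate for short BLIP captions.
    """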
    colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
    patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
    items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
             'suit', 'sneakers', 'hat', 'scarf', 'uniform']

    text_lower = text.lower()
    found_colors = [c for c in colors if c in text_lower]
    found_patterns = [p for p in patterns if p in text_lower]
    found_items = [i for i in items if i in text_lower]

    return found_colors, found_patterns, found_items

# Main function
def analyze_image(image_pil):
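    """Caption a PIL image, analyze any faces, and return a numbered description."""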
    image_pil = image_pil.convert("RGB")
    image_np = np.array(image_pil)

    # Caption generation
    inputs = processor(image_pil, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    # Convert to BGR for DeepFace
    image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

    # Face detection using DeepFace with RetinaFace backend
    try:
        faces = DeepFace.extract_faces(img_path=image_bgr, detector_backend="retinaface", enforce_detection=False)
        print(f"DeepFace detected {len(faces)} face(s)")
    except Exception as e:
        print("DeepFace error:", e)
        faces = []

    face_infos = []
    for face_data in faces:
        # extract_faces returns each crop as a float RGB array in [0, 1];
        # DeepFace.analyze expects a uint8 BGR image, so convert first.
        face_crop = face_data["face"]
        if face_crop.dtype != np.uint8:
            face_crop = (face_crop * 255).clip(0, 255).astype(np.uint8)
        face_crop = cv2.cvtColor(face_crop, cv2.COLOR_RGB2BGR)
        try:
            analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False)
            age = analysis[0]['age']
            # In recent DeepFace releases 'gender' is a probability dict;
            # 'dominant_gender' holds the predicted label ("Man"/"Woman").
            gender = analysis[0]['dominant_gender']
            emotion = analysis[0]['dominant_emotion']

            if age < 13:
                age_group = "child"
            elif age < 20:
                age_group = "teen"
            elif age < 60:
                age_group = "adult"
            else:
                age_group = "senior"

            face_infos.append({
                "age": age,
                "gender": gender,
                "age_group": age_group,
                "emotion": emotion
            })
        except Exception:
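            # Skip faces whose attribute analysis fails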
            continue

    # Summary stats
    num_faces = len(face_infos)
    gender_counts = {"Man": 0, "Woman": 0}
    age_summary = {}
    emotion_summary = {}

    for face in face_infos:
        gender = face['gender']
        age_group = face['age_group']
        emotion = face['emotion']

        gender_counts[gender] = gender_counts.get(gender, 0) + 1
        age_summary[age_group] = age_summary.get(age_group, 0) + 1
        emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1

    # Clothing info from caption
    colors, patterns, items = extract_clothing(caption)

    # Generate the numbered description (up to 14 sentences)
    sentences = []
    sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".")
    sentences.append(f"The image contains {num_faces} visible face(s) detected using DeepFace (RetinaFace backend).")

    gender_desc = []
    if gender_counts["Man"] > 0:
        gender_desc.append(f"{gender_counts['Man']} male(s)")
    if gender_counts["Woman"] > 0:
        gender_desc.append(f"{gender_counts['Woman']} female(s)")
    if gender_desc:
        sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".")
    else:
        sentences.append("Gender analysis was inconclusive.")

    if age_summary:
        age_list = [f"{count} {group}(s)" for group, count in age_summary.items()]
        sentences.append("Age groups represented include " + ", ".join(age_list) + ".")
    else:
        sentences.append("No conclusive age groupings found.")

    if emotion_summary:
        emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()]
        sentences.append("Facial expressions include " + ", ".join(emo_list) + ".")
    else:
        sentences.append("Emotion detection yielded limited results.")

    if colors or patterns or items:
        cloth_parts = []
        if colors:
            cloth_parts.append(f"colors like {', '.join(colors)}")
        if patterns:
            cloth_parts.append(f"patterns such as {', '.join(patterns)}")
        if items:
            cloth_parts.append(f"items like {', '.join(items)}")
        sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".")
    else:
        sentences.append("Clothing details were not clearly identified.")

    if num_faces > 0:
        sentences.append("Faces are distributed naturally across the image.")
        sentences.append("Differences in face size suggest variation in distance from the camera.")
        sentences.append("Hairstyles appear diverse, from short to tied-back styles.")
        sentences.append("Lighting emphasizes certain facial features and expressions.")
        sentences.append("Some individuals face the camera while others look away.")
        sentences.append("Mood diversity is reflected in the variety of facial expressions.")
        sentences.append("The clothing style appears casual or semi-formal.")
    else:
        sentences.append("No visible faces were found to analyze further visual characteristics.")

    sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.")

    return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])

# Gradio Interface
demo = gr.Interface(
    fn=analyze_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="📝 Detailed Description"),
    title="🖼️ Image Analysis with BLIP + DeepFace",
    description="Upload an image to get a detailed, numbered description of facial features, age, gender, clothing, and more."
)

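# launch() serves the app locally; pass share=True for a temporary public URL.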
demo.launch()