from PIL import Image from transformers import BlipProcessor, BlipForConditionalGeneration import numpy as np import cv2 from deepface import DeepFace import gradio as gr # Load BLIP model processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") # Clothing extractor def extract_clothing(text): colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange'] patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral'] items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts', 'suit', 'sneakers', 'hat', 'scarf', 'uniform'] found_colors = [c for c in colors if c in text.lower()] found_patterns = [p for p in patterns if p in text.lower()] found_items = [i for i in items if i in text.lower()] return found_colors, found_patterns, found_items # Main function def analyze_image(image_pil): image_pil = image_pil.convert("RGB") image_np = np.array(image_pil) # Caption generation inputs = processor(image_pil, return_tensors="pt") out = model.generate(**inputs) caption = processor.decode(out[0], skip_special_tokens=True) # Convert to BGR for DeepFace image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR) # Face detection using DeepFace with RetinaFace backend try: faces = DeepFace.extract_faces(img_path=image_bgr, detector_backend="retinaface", enforce_detection=False) print(f"DeepFace detected {len(faces)} face(s)") except Exception as e: print("DeepFace error:", e) faces = [] face_infos = [] for face_data in faces: face_crop = face_data["face"] try: analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False) age = analysis[0]['age'] gender = analysis[0]['gender'] emotion = analysis[0]['dominant_emotion'] if age < 13: age_group = "child" elif age < 20: age_group = "teen" elif age < 60: age_group = "adult" else: age_group = "senior" face_infos.append({ "age": age, "gender": gender, "age_group": age_group, "emotion": emotion }) except Exception: continue # Summary stats num_faces = len(face_infos) gender_counts = {"Man": 0, "Woman": 0} age_summary = {} emotion_summary = {} for face in face_infos: gender = face['gender'] age_group = face['age_group'] emotion = face['emotion'] gender_counts[gender] += 1 age_summary[age_group] = age_summary.get(age_group, 0) + 1 emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1 # Clothing info from caption colors, patterns, items = extract_clothing(caption) # Generate 15 sentences sentences = [] sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".") sentences.append(f"The image contains {num_faces} visible face(s) detected using DeepFace (RetinaFace backend).") gender_desc = [] if gender_counts["Man"] > 0: gender_desc.append(f"{gender_counts['Man']} male(s)") if gender_counts["Woman"] > 0: gender_desc.append(f"{gender_counts['Woman']} female(s)") if gender_desc: sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".") else: sentences.append("Gender analysis was inconclusive.") if age_summary: age_list = [f"{count} {group}(s)" for group, count in age_summary.items()] sentences.append("Age groups represented include " + ", ".join(age_list) + ".") else: sentences.append("No conclusive age groupings found.") if emotion_summary: emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()] sentences.append("Facial expressions include " + ", ".join(emo_list) + ".") else: sentences.append("Emotion detection yielded limited results.") if colors or patterns or items: cloth_parts = [] if colors: cloth_parts.append(f"colors like {', '.join(colors)}") if patterns: cloth_parts.append(f"patterns such as {', '.join(patterns)}") if items: cloth_parts.append(f"items like {', '.join(items)}") sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".") else: sentences.append("Clothing details were not clearly identified.") if num_faces > 0: sentences.append("Faces are distributed naturally across the image.") sentences.append("Differences in face size suggest variation in distance from the camera.") sentences.append("Hairstyles appear diverse, from short to tied-back styles.") sentences.append("Lighting emphasizes certain facial features and expressions.") sentences.append("Some individuals face the camera while others look away.") sentences.append("Mood diversity is reflected in the variety of facial expressions.") sentences.append("The clothing style appears casual or semi-formal.") else: sentences.append("No visible faces were found to analyze further visual characteristics.") sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.") return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)]) # Gradio Interface demo = gr.Interface( fn=analyze_image, inputs=gr.Image(type="pil"), outputs=gr.Textbox(label="📝 15-Sentence Detailed Description"), title="🖼️ Image Analysis with BLIP + DeepFace", description ="Upload an image to get a detailed 15-sentence description of facial features, age, gender, clothing, and more." ) demo.launch()