# Hugging Face Spaces status banner captured with this script: Runtime error
from PIL import Image | |
from transformers import BlipProcessor, BlipForConditionalGeneration | |
import torch | |
import cv2 | |
import numpy as np | |
from deepface import DeepFace | |
import re | |
# Load the BLIP captioning processor and model from the same pretrained
# checkpoint so their vocabularies and preprocessing stay in sync.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
# Read the input image and keep it in two forms: a PIL image (fed to the
# BLIP processor below) and an RGB NumPy array (used by the OpenCV steps).
image_path = "your_image.jpg"  # Replace with your image path
source_image = Image.open(image_path)
image_pil = source_image.convert('RGB')
image_np = np.array(image_pil)

# Ask BLIP for a caption: preprocess to PyTorch tensors, generate token ids,
# then decode the first generated sequence without special tokens.
inputs = processor(images=image_pil, return_tensors="pt")
out = model.generate(**inputs)
first_sequence = out[0]
caption = processor.decode(first_sequence, skip_special_tokens=True)
# Detect frontal faces with the Haar cascade that ships with OpenCV.
# The cascade runs on a grayscale copy; image_np is RGB, hence RGB2GRAY.
cascade_file = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
face_cascade = cv2.CascadeClassifier(cascade_file)
gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
# Each detection is unpacked later as an (x, y, w, h) box.
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)
# Analyze each detected face with DeepFace and record its gender label and
# coarse age group. Faces whose analysis fails are reported and skipped.
face_infos = []
for (x, y, w, h) in faces:
    # image_np is RGB (it was built from a PIL image), but DeepFace treats
    # NumPy input as OpenCV-style BGR, so swap the channels before analysis.
    face_crop = cv2.cvtColor(image_np[y:y+h, x:x+w], cv2.COLOR_RGB2BGR)
    try:
        analysis = DeepFace.analyze(face_crop, actions=['age', 'gender'], enforce_detection=False)
        # Newer DeepFace releases return a list with one dict per face;
        # older releases return a single dict. Accept both.
        result = analysis[0] if isinstance(analysis, list) else analysis
        age = result['age']
        # Newer releases put the label in 'dominant_gender' and make 'gender'
        # a dict of per-class scores; older releases store the label in
        # 'gender' directly.
        gender = result.get('dominant_gender', result.get('gender'))
        if isinstance(gender, dict):
            gender = max(gender, key=gender.get)
    except Exception as e:
        # Best effort: one unreadable face must not abort the whole report,
        # but say so instead of dropping it silently.
        print(f"Skipping face at x={x}, y={y}: {e}")
        continue

    # Map age to range
    if age < 13:
        age_group = "child"
    elif age < 20:
        age_group = "teen"
    elif age < 60:
        age_group = "adult"
    else:
        age_group = "senior"

    face_infos.append({
        "age_group": age_group,
        "gender": gender,
    })
# Face count and age-group summary: count the analyzed faces and tally them
# per "<gender> <age_group>" label.
num_faces = len(face_infos)
age_summary = {}
for face in face_infos:
    key = "{} {}".format(face['gender'], face['age_group'])
    age_summary.setdefault(key, 0)
    age_summary[key] += 1
# Extract clothing details
def extract_clothing(text):
    """Find known clothing colors, patterns and items mentioned in *text*.

    Matching is case-insensitive and on whole words, so 'hat' is not
    reported for "that" and 'red' is not reported for "covered". A simple
    plural ending ("hats", "dresses") still counts as a mention.

    Args:
        text: Free-form description, e.g. an image caption.

    Returns:
        A tuple ``(found_colors, found_patterns, found_items)`` of lists,
        each in the order of the vocabulary lists below.
    """
    colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
    patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
    items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
             'suit', 'sneakers', 'hat', 'scarf', 'uniform']

    lowered = text.lower()

    def mentioned(term):
        # Sibilant endings pluralize with "es" (dress -> dresses); everything
        # else takes a plain "s". Allowing "es" everywhere would wrongly
        # accept e.g. "hates" for 'hat'.
        plural = "(?:es)?" if term.endswith(("s", "x", "z", "ch", "sh")) else "s?"
        return re.search(rf"\b{re.escape(term)}{plural}\b", lowered) is not None

    found_colors = [c for c in colors if mentioned(c)]
    found_patterns = [p for p in patterns if mentioned(p)]
    found_items = [i for i in items if mentioned(i)]
    return found_colors, found_patterns, found_items
# Clothing vocabulary found in the caption; read by clothing_sentence() below.
colors, patterns, items = extract_clothing(caption)


def clothing_sentence():
    """Build one sentence describing the clothing terms found in the caption.

    Reads the module-level ``colors``, ``patterns`` and ``items`` lists.
    Every non-empty list contributes one fragment; fragments are joined
    with " with ". If all three lists are empty, a fixed fallback sentence
    is returned instead.
    """
    labelled_groups = (
        ("colors such as", colors),
        ("patterns like", patterns),
        ("clothing items such as", items),
    )
    fragments = [
        f"{lead} {', '.join(found)}"
        for lead, found in labelled_groups
        if found
    ]
    if not fragments:
        return "Clothing is present but not clearly distinguishable."
    return "The clothing observed includes " + " with ".join(fragments) + "."
# Generate final 15-sentence description
def generate_15_sentences():
    """Return the 15 sentences of the image description as a list.

    Four sentences are filled from module-level state: ``caption``,
    ``num_faces``, ``age_summary`` and the result of ``clothing_sentence()``.
    The remaining eleven are fixed text that does not depend on the image.
    """
    # Sentence 4: per-group head count, or a fallback when nothing was tallied.
    if age_summary:
        breakdown = ", ".join(f"{count} {label}(s)" for label, count in age_summary.items())
        demographics = "The crowd includes " + breakdown + "."
    else:
        demographics = "No specific age or gender details were identified."

    return [
        f"The image presents the scene: {caption}.",
        "The visual tone combines human presence with context-rich elements.",
        f"A total of {num_faces} people with visible faces were detected.",
        demographics,
        clothing_sentence(),
        "Facial expressions range from neutral to slightly expressive, adding emotional context.",
        "Some individuals appear to be interacting with the environment or each other.",
        "Although specific facial shapes are not automatically classified here, a mix of face sizes and angles is present.",
        "Hairstyles vary, including short hair, longer cuts, and tied-back styles depending on individual orientation.",
        "The photo captures diversity not only in people but also in visual textures and tones.",
        "Clothing styles vary, suggesting informal or casual settings rather than formal events.",
        "The spatial arrangement of individuals indicates natural movement or candid posture.",
        "Background elements such as buildings or trees provide additional narrative depth.",
        "The lighting helps highlight human features and adds dimensionality to the scene.",
        "Overall, the image blends appearance, age, fashion, and emotion into a coherent story.",
    ]
# Output result: print the generated sentences as a numbered list.
final_description = generate_15_sentences()
# The header used to start with an emoji, but only a stray mis-decoded
# character survived an encoding round-trip. The original glyph cannot be
# recovered, so the residue is dropped rather than printed.
print("\nFull 15-Sentence Detailed Description:\n")
for i, s in enumerate(final_description, 1):
    print(f"{i}. {s}")