from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import numpy as np
import cv2
from deepface import DeepFace
import gradio as gr

# Load BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Clothing extractor
def extract_clothing(text):
    colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
    patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
    items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
             'suit', 'sneakers', 'hat', 'scarf', 'uniform']

    found_colors = [c for c in colors if c in text.lower()]
    found_patterns = [p for p in patterns if p in text.lower()]
    found_items = [i for i in items if i in text.lower()]

    return found_colors, found_patterns, found_items

# Main function
def analyze_image(image_pil):
    image_pil = image_pil.convert("RGB")
    image_np = np.array(image_pil)

    # Caption generation
    inputs = processor(image_pil, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    # Convert to BGR for DeepFace
    image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)

    # Face detection using DeepFace with RetinaFace backend
    try:
        faces = DeepFace.extract_faces(img_path=image_bgr, detector_backend="retinaface", enforce_detection=False)
        print(f"DeepFace detected {len(faces)} face(s)")
    except Exception as e:
        print("DeepFace error:", e)
        faces = []

    face_infos = []
    for face_data in faces:
        face_crop = face_data["face"]
        try:
            analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False)
            age = analysis[0]['age']
            gender = analysis[0]['gender']
            emotion = analysis[0]['dominant_emotion']

            if age < 13:
                age_group = "child"
            elif age < 20:
                age_group = "teen"
            elif age < 60:
                age_group = "adult"
            else:
                age_group = "senior"

            face_infos.append({
                "age": age,
                "gender": gender,
                "age_group": age_group,
                "emotion": emotion
            })
        except Exception:
            continue

    # Summary stats
    num_faces = len(face_infos)
    gender_counts = {"Man": 0, "Woman": 0}
    age_summary = {}
    emotion_summary = {}

    for face in face_infos:
        gender = face['gender']
        age_group = face['age_group']
        emotion = face['emotion']

        gender_counts[gender] += 1
        age_summary[age_group] = age_summary.get(age_group, 0) + 1
        emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1

    # Clothing info from caption
    colors, patterns, items = extract_clothing(caption)

    # Generate 15 sentences
    sentences = []
    sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".")
    sentences.append(f"The image contains {num_faces} visible face(s) detected using DeepFace (RetinaFace backend).")

    gender_desc = []
    if gender_counts["Man"] > 0:
        gender_desc.append(f"{gender_counts['Man']} male(s)")
    if gender_counts["Woman"] > 0:
        gender_desc.append(f"{gender_counts['Woman']} female(s)")
    if gender_desc:
        sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".")
    else:
        sentences.append("Gender analysis was inconclusive.")

    if age_summary:
        age_list = [f"{count} {group}(s)" for group, count in age_summary.items()]
        sentences.append("Age groups represented include " + ", ".join(age_list) + ".")
    else:
        sentences.append("No conclusive age groupings found.")

    if emotion_summary:
        emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()]
        sentences.append("Facial expressions include " + ", ".join(emo_list) + ".")
    else:
        sentences.append("Emotion detection yielded limited results.")

    if colors or patterns or items:
        cloth_parts = []
        if colors:
            cloth_parts.append(f"colors like {', '.join(colors)}")
        if patterns:
            cloth_parts.append(f"patterns such as {', '.join(patterns)}")
        if items:
            cloth_parts.append(f"items like {', '.join(items)}")
        sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".")
    else:
        sentences.append("Clothing details were not clearly identified.")

    if num_faces > 0:
        sentences.append("Faces are distributed naturally across the image.")
        sentences.append("Differences in face size suggest variation in distance from the camera.")
        sentences.append("Hairstyles appear diverse, from short to tied-back styles.")
        sentences.append("Lighting emphasizes certain facial features and expressions.")
        sentences.append("Some individuals face the camera while others look away.")
        sentences.append("Mood diversity is reflected in the variety of facial expressions.")
        sentences.append("The clothing style appears casual or semi-formal.")
    else:
        sentences.append("No visible faces were found to analyze further visual characteristics.")

    sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.")

    return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])

# Gradio Interface
demo = gr.Interface(
    fn=analyze_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="📝 15-Sentence Detailed Description"),
    title="🖼️ Image Analysis with BLIP + DeepFace",
    description ="Upload an image to get a detailed 15-sentence description of facial features, age, gender, clothing, and more."
)

demo.launch()