import streamlit as st
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image, ImageDraw, ImageFont
import torch


# Load model and processor once; st.cache_resource keeps them in memory across reruns
@st.cache_resource
def load_model():
    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
    return processor, model


def draw_boxes(image, results, labels):
    """Draw each detected bounding box and its label/confidence on the image."""
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        box = [round(i, 2) for i in box.tolist()]
        draw.rectangle(box, outline="red", width=2)
        label_text = f"{labels[label.item()]}: {score:.2f}"
        text_y = max(box[1] - 10, 0)  # keep the label visible if the box touches the top edge
        draw.text((box[0], text_y), label_text, fill="red", font=font)
    return image


def main():
    st.set_page_config(page_title="🔍 Object Detection Demo", layout="centered")
    st.markdown("**🎯 Object Detection using Transformers (DETR)**")
    st.write("Upload an image to detect objects using a pre-trained Transformer model: `facebook/detr-resnet-50`.")

    uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

    if uploaded_file:
        image = Image.open(uploaded_file).convert("RGB")

        # Create two columns for displaying images
        col1, col2 = st.columns(2)
        with col1:
            st.image(image, caption="Original Image", width=200)

        processor, model = load_model()

        # Preprocess and run inference (no gradients needed at inference time)
        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process: rescale boxes to the original image size and keep confident detections
        target_sizes = torch.tensor([image.size[::-1]])  # PIL size is (w, h); the model expects (h, w)
        results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

        st.markdown("**📦 Detected Objects**")
        if results["boxes"].shape[0] == 0:
            st.warning("No objects detected with confidence > 90%")
        else:
            labeled_image = image.copy()
            labeled_image = draw_boxes(labeled_image, results, model.config.id2label)
            with col2:
                st.image(labeled_image, caption="Detected Objects", width=200)
            for score, label in zip(results["scores"], results["labels"]):
                st.write(f"- **{model.config.id2label[label.item()]}** → Confidence: `{score:.2f}`")

    with st.expander("ℹ️ What is Pretraining and Which Model Are We Using?"):
        st.markdown("""
        **Pretraining** is like teaching a model some basic skills before asking it to do a specific task.

        Just like a child first learns shapes, colors, and objects before learning to name or sort them,
        a pre-trained model has already learned to recognize **general patterns** in thousands or millions of images.

        👇 In our app, we are using a pre-trained model called:

        ### 🔍 `facebook/detr-resnet-50`

        - **DETR** stands for *DEtection TRansformer*. It's a special kind of deep learning model made by Facebook AI.
        - It has been **trained on the COCO dataset**, which includes 80 common object types like people, cars, dogs, and chairs.
        - Because it's pre-trained, we **don't need to train it ourselves** — it already knows how to detect these objects!

        🧠 So when you upload an image, the model just applies what it has already learned during pretraining to spot things in your image.
        """)
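
        # Illustrative snippet (displayed to the reader, not executed on the image):
        # a quick way to peek at the label vocabulary this pretrained checkpoint
        # ships with, using the load_model() helper defined above. The COCO
        # id -> label map lives in model.config.id2label; some ids are unused
        # placeholders ("N/A").
        st.code(
            'processor, model = load_model()\n'
            'labels = [v for v in model.config.id2label.values() if v != "N/A"]\n'
            'print(labels[:5])  # e.g. person, bicycle, car, motorcycle, airplane\n',
            language="python",
        )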

    with st.expander("ℹ️ What is Object Detection?"):
        st.markdown("""
        Object detection is like playing "I spy with my little eye" — but using AI!

        Instead of just saying "there's a dog", object detection can also say **where** the dog is
        in the image using a **bounding box**.

        It helps in:
        - 🛻 Self-driving cars (detecting pedestrians and vehicles)
        - 📸 Security cameras (detecting intrusions)
        - 📦 Inventory systems (detecting objects on shelves)
        """)

    with st.expander("ℹ️ How Does the DETR Model Work?"):
        st.markdown("""
        **DETR** stands for **DEtection TRansformer**, a cutting-edge model developed by Facebook AI Research.

        It combines **Convolutional Neural Networks (CNNs)** and **Transformers** — the same architecture
        used in ChatGPT and BERT — to detect objects in images.

        ###### 🏗️ How is it Different?
        Most older object detection models work in stages:
        - First, they **generate regions of interest (ROIs)** (boxes where something might be).
        - Then they **classify** what's inside each box (cat, dog, etc.).

        DETR skips this multi-step process by using a **Transformer** to directly:
        - Look at the image
        - Predict **all objects and their locations at once** (end-to-end) — see the snippet below this list

        ###### ⚙️ Key Components of DETR:
        - **CNN Backbone (like ResNet):** Extracts visual features from the image (e.g., edges, textures)
        - **Transformer Encoder-Decoder:** Understands **global relationships** between features (e.g., where objects are in relation to each other)
        - **Prediction Heads:** Predict bounding boxes and labels

        ###### ✨ Why is DETR Special?
        - No need for complicated anchor boxes or region proposals
        - Handles overlapping or cluttered objects better
        - Learns in a more "human-like" way — understanding the **whole scene**, not just pieces

        ###### 📦 Pretrained Model in This App:
        In this app, we're using **`facebook/detr-resnet-50`**, a model trained on the **COCO dataset** (Common Objects in Context) with:
        - 80 object categories (like person, car, bottle, chair)
        - Over 100,000 training images

        It can detect things like:
        """)

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.write("🐶 Dogs")
            st.write("🐱 Cats")
            st.write("👧 People")
            st.write("🚗 Cars")
            st.write("🚌 Buses")
            st.write("🚴 Bicycles")
            st.write("🏍️ Motorcycles")
        with col2:
            st.write("✈️ Airplanes")
            st.write("🚤 Boats")
            st.write("🪑 Chairs")
            st.write("🛏️ Beds")
            st.write("📺 TVs")
            st.write("📱 Cell Phones")
            st.write("⌨️ Keyboards")
        with col3:
            st.write("🍎 Apples")
            st.write("🍌 Bananas")
            st.write("🍕 Pizzas")
            st.write("☕ Cups")
            st.write("🍽️ Dining Tables")
            st.write("🛋️ Couches")
            st.write("🧴 Bottles")
        with col4:
            st.write("👜 Handbags")
            st.write("🧳 Suitcases")
            st.write("☂️ Umbrellas")
            st.write("📚 Books")
            st.write("🚦 Traffic Lights")
            st.write("🛑 Stop Signs")
            st.write("🐄 Cows")
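
        # Hedged illustration of "predict all objects at once": DETR emits a
        # fixed set of object queries per image (100 for this checkpoint) and
        # post-processing keeps only the confident ones. The shapes shown assume
        # a single input image; num_labels + 1 includes the special "no object" class.
        st.code(
            'inputs = processor(images=image, return_tensors="pt")\n'
            'outputs = model(**inputs)\n'
            'outputs.logits.shape      # (1, 100, num_labels + 1): class scores per query\n'
            'outputs.pred_boxes.shape  # (1, 100, 4): a normalized box for every query\n',
            language="python",
        )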

    with st.expander("🌍 Real-World Use Cases of Object Detection"):
        st.markdown("""
        Object detection models like DETR are widely used across industries. Here are some practical examples:

        🔐 **Security & Surveillance**
        - Detecting people in restricted zones
        - Identifying abandoned objects in public places

        🏥 **Healthcare**
        - Analyzing X-rays and MRI scans to detect tumors or anomalies
        - Assisting doctors in surgical planning

        🚗 **Autonomous Vehicles**
        - Identifying pedestrians, vehicles, traffic lights, and road signs in real time

        🛍️ **Retail**
        - Automated checkout systems (e.g., Amazon Go)
        - Shelf inventory monitoring using cameras

        🏗️ **Construction & Safety**
        - Monitoring helmet usage and safety compliance on sites
        - Tracking equipment and workers

        🛸 **Aerial & Drone Imagery**
        - Detecting objects (cars, animals, buildings) in satellite or drone images

        📱 **Mobile Applications**
        - Real-time AR object tagging (e.g., identifying products in the camera view)

        🎮 **Gaming & Sports**
        - Player and object tracking in sports analytics
        - Enhanced real-time visuals in AR/VR environments
        """)

    with st.expander("🔍 Categories vs Real-World Use Cases"):
        st.markdown("""
        ###### 🎯 What DETR Can Detect (Pretrained Model)

        The base DETR model (`facebook/detr-resnet-50`) is trained on the **COCO dataset**. Its label map
        defines 91 category ids, of which 80 are actual object classes, such as:

        - 🧍 Person &nbsp; 🚗 Car &nbsp; 🚌 Bus &nbsp; 🏍️ Motorcycle
        - 🐶 Dog &nbsp; 🐱 Cat &nbsp; 🐄 Cow
        - 🍎 Apple &nbsp; 🍌 Banana
        - 🛋️ Couch &nbsp; 🪑 Chair &nbsp; 🛏️ Bed
        - 📺 TV &nbsp; 🖥️ Laptop &nbsp; 📱 Cell Phone
        - 🐦 Bird &nbsp; 🐴 Horse

        This is great for **general object detection**, but there are gaps when it comes to real-world applications.

        ---

        ###### ❌ What It Misses for Real-World Use Cases

        In specialized or industrial domains, we often need to detect:

        ###### 🏥 **Medical Imaging**
        - Tumors, organs (lungs, liver), anomalies
        > ⚠️ COCO doesn't have these.

        ###### 🛡️ **Security/Surveillance**
        - Weapons, intrusions, suspicious behavior
        > Not covered in COCO.

        ###### 🏭 **Manufacturing**
        - Machine parts, tools, defects
        > Also outside COCO's categories.

        ###### 🧪 **Scientific Research**
        - Cells, molecules, lab equipment

        ###### 📦 **Retail**
        - Brands, barcodes, product layouts

        ---

        ###### ✅ Solutions & Next Steps
        - **Fine-tune DETR** on custom datasets for your domain.
        - Use **domain-specific pretrained models** for your industry.
        - Try other models like **SAM**, **DINOv2**, or **GroundingDINO** for more advanced segmentation or open-vocabulary detection.
        """)

    with st.expander("🛠️ Fine-Tuning an Object Detection Model: Dos, Don'ts & Real Effort"):
        st.markdown("""
        Fine-tuning is the process of **adapting a pretrained model** (like DETR) to recognize
        **new or custom objects** from your own dataset. A minimal training-setup sketch follows this list.

        ---

        ###### ✅ What You *Should* Do
        - **Start with a pretrained model**
          Saves time and works well with smaller datasets.
        - **Prepare a clean, labeled dataset** ⚠️ *Labor-Intensive*
          You'll need hundreds or thousands of images **manually annotated with bounding boxes**.
          Tools: `LabelImg`, `Roboflow`, `CVAT`.
        - **Use transfer learning wisely**
          Freeze base layers and train higher layers first to avoid overfitting.
        - **Train in small batches initially**
          Helps you catch issues early (e.g., wrong labels, overfitting).
        - **Use data augmentation**
          Automatically increases variation using flips, crops, brightness changes, etc.

        ---

        ###### ❌ What You *Shouldn't* Do
        - **Don't use a high learning rate**
          It might "unlearn" everything from pretraining.
        - **Don't train from scratch** unless you have a huge dataset
          Pretrained models save compute and training time.
        - **Don't skip validation**
          Always use a validation set to evaluate generalization.
        - **Don't mismatch annotation formats**
          Your model might expect COCO-style annotations; ensure consistency.

        ---

        ###### 🧪 When Should You Fine-Tune?
        - You're detecting **custom objects** (e.g., tools, animals, X-ray anomalies).
        - Your domain is **very different** (e.g., drones, medical imaging).
        - You want **higher accuracy** for specific categories.

        ---

        ###### 🔍 Most Labor-Intensive Sub-Activity
        **✔️ Annotating the dataset** (images + bounding boxes + class labels)
        This step takes **significant manual effort** and often requires **domain experts**
        (e.g., doctors for medical images, engineers for defect detection).

        💡 *Tip:* Use small datasets to experiment, and crowdsource or semi-automate annotation for larger ones.
        """)
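
        # A minimal sketch only (an assumed recipe, not part of this app's
        # pipeline): one common transfer-learning setup for DETR freezes the CNN
        # backbone and trains the transformer and prediction heads with a small
        # learning rate. Dataset loading, annotations, and the training loop are
        # omitted here.
        st.code(
            'model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")\n'
            'for param in model.model.backbone.parameters():\n'
            '    param.requires_grad = False  # freeze base layers first\n'
            'optimizer = torch.optim.AdamW(\n'
            '    (p for p in model.parameters() if p.requires_grad),\n'
            '    lr=1e-5,  # small LR so we do not "unlearn" the pretraining\n'
            ')\n',
            language="python",
        )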

        image_url = "https://raw.githubusercontent.com/gridflowai/gridflowAI-datasets-icons/7ec17a8e039d53a1dac09d22270251e318649457/AI-icons-images/image_bounding_boxes.png"
        st.image(image_url, caption="Objects manually annotated with bounding boxes", width=300)


if __name__ == "__main__":
    main()