import streamlit as st
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image, ImageDraw, ImageFont
import torch


# Load model and processor once; st.cache_resource keeps them in memory across reruns
@st.cache_resource
def load_model():
    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
    return processor, model


def draw_boxes(image, results, labels):
    """Draw each detected bounding box and its label/confidence on the image."""
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        box = [round(i, 2) for i in box.tolist()]
        draw.rectangle(box, outline="red", width=2)
        label_text = f"{labels[label.item()]}: {score:.2f}"
        text_y = max(box[1] - 10, 0)  # keep the label visible if the box touches the top edge
        draw.text((box[0], text_y), label_text, fill="red", font=font)
    return image


def main():
    st.set_page_config(page_title="🔍 Object Detection Demo", layout="centered")
    st.markdown("**🎯 Object Detection using Transformers (DETR)**")
    st.write("Upload an image to detect objects using a pre-trained Transformer model: `facebook/detr-resnet-50`.")

    uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

    if uploaded_file:
        image = Image.open(uploaded_file).convert("RGB")

        # Create two columns for displaying images
        col1, col2 = st.columns(2)
        with col1:
            st.image(image, caption="Original Image", width=200)

        processor, model = load_model()

        # Preprocess and run inference (no gradients needed at inference time)
        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process: rescale boxes to the original image size and keep confident detections
        target_sizes = torch.tensor([image.size[::-1]])  # PIL size is (w, h); the model expects (h, w)
        results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

        st.markdown("**📦 Detected Objects**")
        if results["boxes"].shape[0] == 0:
            st.warning("No objects detected with confidence > 90%")
        else:
            labeled_image = image.copy()
            labeled_image = draw_boxes(labeled_image, results, model.config.id2label)
            with col2:
                st.image(labeled_image, caption="Detected Objects", width=200)
            for score, label in zip(results["scores"], results["labels"]):
                st.write(f"- **{model.config.id2label[label.item()]}** → Confidence: `{score:.2f}`")

    with st.expander("ℹ️ What is Pretraining and Which Model Are We Using?"):
        st.markdown("""
        **Pretraining** is like teaching a model some basic skills before asking it to do a specific task.

        Just like a child first learns shapes, colors, and objects before learning to name or sort them,
        a pre-trained model has already learned to recognize **general patterns** in thousands or millions of images.

        👇 In our app, we are using a pre-trained model called:

        ### 🔍 `facebook/detr-resnet-50`

        - **DETR** stands for *DEtection TRansformer*. It's a special kind of deep learning model made by Facebook AI.
        - It has been **trained on the COCO dataset**, which includes 80 common object types like people, cars, dogs, and chairs.
        - Because it's pre-trained, we **don't need to train it ourselves** — it already knows how to detect these objects!

        🧠 So when you upload an image, the model just applies what it has already learned during pretraining to spot things in your image.
        """)
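
        # Illustrative snippet (displayed to the reader, not executed on the image):
        # a quick way to peek at the label vocabulary this pretrained checkpoint
        # ships with, using the load_model() helper defined above. The COCO
        # id -> label map lives in model.config.id2label; some ids are unused
        # placeholders ("N/A").
        st.code(
            'processor, model = load_model()\n'
            'labels = [v for v in model.config.id2label.values() if v != "N/A"]\n'
            'print(labels[:5])  # e.g. person, bicycle, car, motorcycle, airplane\n',
            language="python",
        )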

    with st.expander("ℹ️ What is Object Detection?"):
        st.markdown("""
        Object detection is like playing "I spy with my little eye" — but using AI!

        Instead of just saying "there's a dog", object detection can also say **where** the dog is
        in the image using a **bounding box**.

        It helps in:
        - 🛻 Self-driving cars (detecting pedestrians and vehicles)
        - 📸 Security cameras (detecting intrusions)
        - 📦 Inventory systems (detecting objects on shelves)
        """)

    with st.expander("ℹ️ How Does the DETR Model Work?"):
        st.markdown("""
        **DETR** stands for **DEtection TRansformer**, a cutting-edge model developed by Facebook AI Research.

        It combines **Convolutional Neural Networks (CNNs)** and **Transformers** — the same architecture
        used in ChatGPT and BERT — to detect objects in images.

        ###### 🏗️ How is it Different?
        Most older object detection models work in stages:
        - First, they **generate regions of interest (ROIs)** (boxes where something might be).
        - Then they **classify** what's inside each box (cat, dog, etc.).

        DETR skips this multi-step process by using a **Transformer** to directly:
        - Look at the image
        - Predict **all objects and their locations at once** (end-to-end) — see the snippet below this list

        ###### ⚙️ Key Components of DETR:
        - **CNN Backbone (like ResNet):** Extracts visual features from the image (e.g., edges, textures)
        - **Transformer Encoder-Decoder:** Understands **global relationships** between features (e.g., where objects are in relation to each other)
        - **Prediction Heads:** Predict bounding boxes and labels

        ###### ✨ Why is DETR Special?
        - No need for complicated anchor boxes or region proposals
        - Handles overlapping or cluttered objects better
        - Learns in a more "human-like" way — understanding the **whole scene**, not just pieces

        ###### 📦 Pretrained Model in This App:
        In this app, we're using **`facebook/detr-resnet-50`**, a model trained on the **COCO dataset** (Common Objects in Context) with:
        - 80 object categories (like person, car, bottle, chair)
        - Over 100,000 training images

        It can detect things like:
        """)

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.write("🐶 Dogs")
            st.write("🐱 Cats")
            st.write("👧 People")
            st.write("🚗 Cars")
            st.write("🚌 Buses")
            st.write("🚴 Bicycles")
            st.write("🏍️ Motorcycles")
        with col2:
            st.write("✈️ Airplanes")
            st.write("🚤 Boats")
            st.write("🪑 Chairs")
            st.write("🛏️ Beds")
            st.write("📺 TVs")
            st.write("📱 Cell Phones")
            st.write("⌨️ Keyboards")
        with col3:
            st.write("🍎 Apples")
            st.write("🍌 Bananas")
            st.write("🍕 Pizzas")
            st.write("☕ Cups")
            st.write("🍽️ Dining Tables")
            st.write("🛋️ Couches")
            st.write("🧴 Bottles")
        with col4:
            st.write("👜 Handbags")
            st.write("🧳 Suitcases")
            st.write("☂️ Umbrellas")
            st.write("📚 Books")
            st.write("🚦 Traffic Lights")
            st.write("🛑 Stop Signs")
            st.write("🐄 Cows")
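
        # Hedged illustration of "predict all objects at once": DETR emits a
        # fixed set of object queries per image (100 for this checkpoint) and
        # post-processing keeps only the confident ones. The shapes shown assume
        # a single input image; num_labels + 1 includes the special "no object" class.
        st.code(
            'inputs = processor(images=image, return_tensors="pt")\n'
            'outputs = model(**inputs)\n'
            'outputs.logits.shape      # (1, 100, num_labels + 1): class scores per query\n'
            'outputs.pred_boxes.shape  # (1, 100, 4): a normalized box for every query\n',
            language="python",
        )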

    with st.expander("🌍 Real-World Use Cases of Object Detection"):
        st.markdown("""
        Object detection models like DETR are widely used across industries. Here are some practical examples:

        🔐 **Security & Surveillance**
        - Detecting people in restricted zones
        - Identifying abandoned objects in public places

        🏥 **Healthcare**
        - Analyzing X-rays and MRI scans to detect tumors or anomalies
        - Assisting doctors in surgical planning

        🚗 **Autonomous Vehicles**
        - Identifying pedestrians, vehicles, traffic lights, and road signs in real time

        🛍️ **Retail**
        - Automated checkout systems (e.g., Amazon Go)
        - Shelf inventory monitoring using cameras

        🏗️ **Construction & Safety**
        - Monitoring helmet usage and safety compliance on sites
        - Tracking equipment and workers

        🛸 **Aerial & Drone Imagery**
        - Detecting objects (cars, animals, buildings) in satellite or drone images

        📱 **Mobile Applications**
        - Real-time AR object tagging (e.g., identifying products in the camera view)

        🎮 **Gaming & Sports**
        - Player and object tracking in sports analytics
        - Enhanced real-time visuals in AR/VR environments
        """)

    with st.expander("🔍 Categories vs Real-World Use Cases"):
        st.markdown("""
        ###### 🎯 What DETR Can Detect (Pretrained Model)

        The base DETR model (`facebook/detr-resnet-50`) is trained on the **COCO dataset**. Its label map
        defines 91 category ids, of which 80 are actual object classes, such as:

        - 🧍 Person &nbsp; 🚗 Car &nbsp; 🚌 Bus &nbsp; 🏍️ Motorcycle
        - 🐶 Dog &nbsp; 🐱 Cat &nbsp; 🐄 Cow
        - 🍎 Apple &nbsp; 🍌 Banana
        - 🛋️ Couch &nbsp; 🪑 Chair &nbsp; 🛏️ Bed
        - 📺 TV &nbsp; 🖥️ Laptop &nbsp; 📱 Cell Phone
        - 🐦 Bird &nbsp; 🐴 Horse

        This is great for **general object detection**, but there are gaps when it comes to real-world applications.

        ---

        ###### ❌ What It Misses for Real-World Use Cases

        In specialized or industrial domains, we often need to detect:

        ###### 🏥 **Medical Imaging**
        - Tumors, organs (lungs, liver), anomalies
        > ⚠️ COCO doesn't have these.

        ###### 🛡️ **Security/Surveillance**
        - Weapons, intrusions, suspicious behavior
        > Not covered in COCO.

        ###### 🏭 **Manufacturing**
        - Machine parts, tools, defects
        > Also outside COCO's categories.

        ###### 🧪 **Scientific Research**
        - Cells, molecules, lab equipment

        ###### 📦 **Retail**
        - Brands, barcodes, product layouts

        ---

        ###### ✅ Solutions & Next Steps
        - **Fine-tune DETR** on custom datasets for your domain.
        - Use **domain-specific pretrained models** for your industry.
        - Try other models like **SAM**, **DINOv2**, or **GroundingDINO** for more advanced segmentation or open-vocabulary detection.
        """)

    with st.expander("🛠️ Fine-Tuning an Object Detection Model: Dos, Don'ts & Real Effort"):
        st.markdown("""
        Fine-tuning is the process of **adapting a pretrained model** (like DETR) to recognize
        **new or custom objects** from your own dataset. A minimal training-setup sketch follows this list.

        ---

        ###### ✅ What You *Should* Do
        - **Start with a pretrained model**
          Saves time and works well with smaller datasets.
        - **Prepare a clean, labeled dataset** ⚠️ *Labor-Intensive*
          You'll need hundreds or thousands of images **manually annotated with bounding boxes**.
          Tools: `LabelImg`, `Roboflow`, `CVAT`.
        - **Use transfer learning wisely**
          Freeze base layers and train higher layers first to avoid overfitting.
        - **Train in small batches initially**
          Helps you catch issues early (e.g., wrong labels, overfitting).
        - **Use data augmentation**
          Automatically increases variation using flips, crops, brightness changes, etc.

        ---

        ###### ❌ What You *Shouldn't* Do
        - **Don't use a high learning rate**
          It might "unlearn" everything from pretraining.
        - **Don't train from scratch** unless you have a huge dataset
          Pretrained models save compute and training time.
        - **Don't skip validation**
          Always use a validation set to evaluate generalization.
        - **Don't mismatch annotation formats**
          Your model might expect COCO-style annotations; ensure consistency.

        ---

        ###### 🧪 When Should You Fine-Tune?
        - You're detecting **custom objects** (e.g., tools, animals, X-ray anomalies).
        - Your domain is **very different** (e.g., drones, medical imaging).
        - You want **higher accuracy** for specific categories.

        ---

        ###### 🔍 Most Labor-Intensive Sub-Activity
        **✔️ Annotating the dataset** (images + bounding boxes + class labels)
        This step takes **significant manual effort** and often requires **domain experts**
        (e.g., doctors for medical images, engineers for defect detection).

        💡 *Tip:* Use small datasets to experiment, and crowdsource or semi-automate annotation for larger ones.
        """)
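
        # A minimal sketch only (an assumed recipe, not part of this app's
        # pipeline): one common transfer-learning setup for DETR freezes the CNN
        # backbone and trains the transformer and prediction heads with a small
        # learning rate. Dataset loading, annotations, and the training loop are
        # omitted here.
        st.code(
            'model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")\n'
            'for param in model.model.backbone.parameters():\n'
            '    param.requires_grad = False  # freeze base layers first\n'
            'optimizer = torch.optim.AdamW(\n'
            '    (p for p in model.parameters() if p.requires_grad),\n'
            '    lr=1e-5,  # small LR so we do not "unlearn" the pretraining\n'
            ')\n',
            language="python",
        )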

        image_url = "https://raw.githubusercontent.com/gridflowai/gridflowAI-datasets-icons/7ec17a8e039d53a1dac09d22270251e318649457/AI-icons-images/image_bounding_boxes.png"
        st.image(image_url, caption="Objects manually annotated with bounding boxes", width=300)


if __name__ == "__main__":
    main()