Bhupen committed · commit 7da6924
1 Parent(s): 984e513
Object Detection intro
Files changed:
- app.py +305 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,305 @@
import streamlit as st
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image, ImageDraw, ImageFont
import torch
import io

# Load model and processor
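# st.cache_resource keeps a single processor/model instance across Streamlit
# reruns, so the checkpoint is downloaded and loaded only once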
@st.cache_resource
def load_model():
    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
    return processor, model

def draw_boxes(image, results, labels):
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()

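    # post-processed boxes arrive as (xmin, ymin, xmax, ymax) in pixel coordinates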
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        box = [round(i, 2) for i in box.tolist()]
        draw.rectangle(box, outline="red", width=2)
        label_text = f"{labels[label.item()]}: {score:.2f}"
        draw.text((box[0], box[1] - 10), label_text, fill="red", font=font)
    return image

def main():
    st.set_page_config(page_title="🔍 Object Detection Demo", layout="centered")
    st.markdown("**🎯 Object Detection using Transformers (DETR)**")
    st.write("Upload an image to detect objects using a pre-trained Transformer model: `facebook/detr-resnet-50`.")

    uploaded_file = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

    if uploaded_file:
        image = Image.open(uploaded_file).convert("RGB")

        # Create two columns for displaying images
        col1, col2 = st.columns(2)

        with col1:
            st.image(image, caption="Original Image", width=200)

        processor, model = load_model()

        # Preprocess
        inputs = processor(images=image, return_tensors="pt")
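        # the processor resizes and normalizes the image into a model-ready pixel_values tensor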
        with torch.no_grad():  # inference only, so skip gradient tracking
            outputs = model(**inputs)

        # Post-process
        target_sizes = torch.tensor([image.size[::-1]])
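        # PIL's image.size is (width, height); DETR post-processing expects (height, width)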
        results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

        st.markdown("**📦 Detected Objects**")
        if results["boxes"].shape[0] == 0:
            st.warning("No objects detected with confidence > 90%")
        else:
            labeled_image = image.copy()
            labeled_image = draw_boxes(labeled_image, results, model.config.id2label)

            with col2:
                st.image(labeled_image, caption="Detected Objects", width=200)

            for score, label in zip(results["scores"], results["labels"]):
                st.write(f"- **{model.config.id2label[label.item()]}** → Confidence: `{score:.2f}`")

    with st.expander("ℹ️ What is Pretraining and Which Model Are We Using?"):
        st.markdown("""
**Pretraining** is like teaching a model basic skills before asking it to do a specific task.
Just as a child first learns shapes, colors, and objects before learning to name or sort them,
a pre-trained model has already learned to recognize **general patterns** across thousands or millions of images.

👉 In our app, we are using a pre-trained model called:

### 🤖 `facebook/detr-resnet-50`
- **DETR** stands for *DEtection TRansformer*, a deep learning model built by Facebook AI.
- It was **trained on the COCO dataset**, which includes 80 common object types such as people, cars, dogs, and chairs.
- Because it is pre-trained, we **don't need to train it ourselves**: it already knows how to detect these objects!

🧠 So when you upload an image, the model simply applies what it learned during pretraining to spot things in your image.
        """)

    with st.expander("ℹ️ What is Object Detection?"):
        st.markdown("""
Object detection is like playing "I spy with my little eye", but with AI!
Instead of just saying "there's a dog", object detection also tells you **where** the dog is in the image, using a **bounding box**.

It helps in:
- 🚗 Self-driving cars (detecting pedestrians and vehicles)
- 📸 Security cameras (detecting intrusions)
- 📦 Inventory systems (detecting objects on shelves)
        """)

    with st.expander("ℹ️ How Does the DETR Model Work?"):
        st.markdown("""
**DETR** stands for **DEtection TRansformer**, a model developed by Facebook AI Research. It combines **Convolutional Neural Networks (CNNs)** with **Transformers**, the same architecture family used in ChatGPT and BERT, to detect objects in images.

###### 🏗️ How is it Different?
Most older object detection models work in stages:
- First, they **generate regions of interest (ROIs)**: boxes where something might be.
- Then they **classify** what's inside each box (cat, dog, etc.).

DETR skips this multi-step process by using a **Transformer** to directly:
- Look at the whole image
- Predict **all objects and their locations at once** (end-to-end)

###### ⚙️ Key Components of DETR:
- **CNN backbone (like ResNet):** extracts visual features from the image (e.g., edges, textures)
- **Transformer encoder-decoder:** captures **global relationships** between features (e.g., where objects sit relative to each other)
- **Prediction heads:** predict bounding boxes and labels

###### ✨ Why is DETR Special?
- No need for hand-crafted anchor boxes or region proposals
- Handles overlapping or cluttered objects better
- Reasons about the **whole scene** at once, not just isolated pieces

###### 📦 Pretrained Model in This App:
This app uses **`facebook/detr-resnet-50`**, trained on the **COCO dataset** (Common Objects in Context) with:
- 80 object categories (like person, car, bottle, chair)
- Over 100,000 training images

It can detect things like:
        """)

        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.write("🐶 Dogs")
            st.write("🐱 Cats")
            st.write("🧑 People")
            st.write("🚗 Cars")
            st.write("🚌 Buses")
            st.write("🚴 Bicycles")
            st.write("🏍️ Motorcycles")

        with col2:
            st.write("✈️ Airplanes")
            st.write("🚤 Boats")
            st.write("🪑 Chairs")
            st.write("🛏️ Beds")
            st.write("📺 TVs")
            st.write("📱 Cell Phones")
            st.write("⌨️ Keyboards")

        with col3:
            st.write("🍎 Apples")
            st.write("🍌 Bananas")
            st.write("🍕 Pizzas")
            st.write("🥣 Bowls")
            st.write("🍽️ Dining Tables")
            st.write("🛋️ Couches")
            st.write("🧴 Bottles")

        with col4:
            st.write("👜 Handbags")
            st.write("🧳 Suitcases")
            st.write("☂️ Umbrellas")
            st.write("🕰️ Clocks")
            st.write("🚦 Traffic Lights")
            st.write("🛑 Stop Signs")
            st.write("🐄 Cows")

    with st.expander("🌍 Real-World Use Cases of Object Detection"):
        st.markdown("""
Object detection models like DETR are used across many industries. Here are some practical examples:

🔒 **Security & Surveillance**
- Detecting people in restricted zones
- Identifying abandoned objects in public places

🏥 **Healthcare**
- Analyzing X-rays and MRI scans to detect tumors or anomalies
- Assisting doctors in surgical planning

🚗 **Autonomous Vehicles**
- Identifying pedestrians, vehicles, traffic lights, and road signs in real time

🛍️ **Retail**
- Automated checkout systems (e.g., Amazon Go)
- Shelf inventory monitoring using cameras

🏗️ **Construction & Safety**
- Monitoring helmet usage and safety compliance on sites
- Tracking equipment and workers

🛸 **Aerial & Drone Imagery**
- Detecting objects (cars, animals, buildings) in satellite or drone images

📱 **Mobile Applications**
- Real-time AR object tagging (e.g., identifying products in the camera view)

🎮 **Gaming & Sports**
- Player and object tracking in sports analytics
- Enhanced real-time visuals in AR/VR environments
        """)

    with st.expander("📊 Categories vs Real-World Use Cases"):
        st.markdown("""
###### 🎯 What DETR Can Detect (Pretrained Model)
The base DETR model (`facebook/detr-resnet-50`) is trained on the **COCO dataset**, whose detection task covers 80 commonly used object categories (of 91 ids in the full annotation scheme), such as:

- 🧑 Person 🚗 Car 🚌 Bus 🏍️ Motorcycle
- 🐶 Dog 🐱 Cat 🐄 Cow
- 🍎 Apple 🍌 Banana
- 🛋️ Sofa 🪑 Chair 🛏️ Bed
- 📺 TV 💻 Laptop 📱 Cell Phone
- 🐦 Bird 🐘 Elephant

This is great for **general object detection**, but it leaves gaps for real-world applications.

---

###### ❌ What It Misses for Real-World Use Cases

Specialized or industrial domains often need to detect:

###### 🏥 **Medical Imaging**
- Tumors, organs (lungs, liver), anomalies
> ⚠️ COCO doesn't have these.

###### 🛡️ **Security/Surveillance**
- Weapons, intrusions, suspicious behavior
> Not covered in COCO.

###### 🏭 **Manufacturing**
- Machine parts, tools, defects
> Also outside COCO's categories.

###### 🧪 **Scientific Research**
- Cells, molecules, lab equipment

###### 📦 **Retail**
- Brands, barcodes, product layouts

---

###### ✅ Solutions & Next Steps

- **Fine-tune DETR** on a custom dataset for your domain.
- Use **domain-specific pretrained models** (e.g., medical-imaging or retail-specific detector variants).
- Try models like **SAM**, **DINOv2**, or **Grounding DINO** for more advanced segmentation or open-vocabulary detection.
        """)

    with st.expander("🛠️ Fine-Tuning an Object Detection Model: Dos, Don'ts & Real Effort"):
        st.markdown("""
Fine-tuning is the process of **adapting a pretrained model** (like DETR) to recognize **new or custom objects** from your own dataset.

---

###### ✅ What You *Should* Do

- **Start with a pretrained model**
  It saves time and works well with smaller datasets.

- **Prepare a clean, labeled dataset** ⚠️ *Labor-Intensive*
  You'll need hundreds or thousands of images **manually annotated with bounding boxes**.
  Tools: `LabelImg`, `Roboflow`, `CVAT`.

- **Use transfer learning wisely**
  Freeze the base layers and train the higher layers first to avoid overfitting.

- **Train in small batches initially**
  This helps you catch issues early (e.g., wrong labels, overfitting).

- **Use data augmentation**
  It automatically increases variation through flips, crops, brightness changes, etc.

---

###### ❌ What You *Shouldn't* Do

- **Don't use a high learning rate**
  It can "unlearn" everything gained during pretraining.

- **Don't train from scratch** unless you have a huge dataset
  Pretrained models save compute and training time.

- **Don't skip validation**
  Always use a validation set to evaluate generalization.

- **Don't mismatch annotation formats**
  Your model may expect COCO-style annotations; keep the format consistent.

---

###### 🧪 When Should You Fine-Tune?

- You're detecting **custom objects** (e.g., tools, animals, X-ray anomalies).
- Your domain is **very different** (e.g., drones, medical imaging).
- You want **higher accuracy** for specific categories.

---

###### 📌 Most Labor-Intensive Sub-Activity

**✍️ Annotating the dataset** (images + bounding boxes + class labels).
This step takes **significant manual effort** and often requires **domain experts** (e.g., doctors for medical images, engineers for defect detection).

💡 *Tip:* Use small datasets to experiment, and crowdsource or semi-automate annotation for larger ones.
        """)

    st.image("image3.png", caption="Objects manually annotated with bounding boxes", width=300)


if __name__ == "__main__":
    main()
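The fine-tuning expander above stays at the conceptual level. As a rough sketch, not part of this Space, adapting `facebook/detr-resnet-50` to custom classes with the same `transformers` API might look like the following; the dummy image, the two-class head, and the learning rate are placeholder assumptions, and a real run would iterate over a properly annotated dataset instead:

```python
import torch
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection

processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained(
    "facebook/detr-resnet-50",
    num_labels=2,                  # e.g., two custom classes instead of COCO's
    ignore_mismatched_sizes=True,  # re-initialize the classification head
)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # low LR, per the dos above
model.train()

# One dummy example so the sketch runs end-to-end; replace with real annotated
# data. Boxes are normalized (center_x, center_y, width, height) in [0, 1].
image = Image.new("RGB", (480, 360), "gray")
target = {"class_ids": [0], "boxes": [[0.5, 0.5, 0.4, 0.3]]}

for step in range(2):
    inputs = processor(images=image, return_tensors="pt")
    labels = [{  # one dict per image, as DETR's forward expects
        "class_labels": torch.tensor(target["class_ids"], dtype=torch.long),
        "boxes": torch.tensor(target["boxes"], dtype=torch.float),
    }]
    outputs = model(pixel_values=inputs["pixel_values"], labels=labels)
    outputs.loss.backward()   # bipartite-matching loss computed by the model
    optimizer.step()
    optimizer.zero_grad()
```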
requirements.txt
ADDED
@@ -0,0 +1,4 @@
torch>=1.13.0
transformers>=4.25.0
Pillow>=9.3.0
numpy>=1.23.0
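Note that `streamlit` itself is not pinned here; a Streamlit-SDK Space supplies it automatically. For a local run, assuming a standard setup, you would install it alongside the listed packages (`pip install -r requirements.txt streamlit`) and start the app with `streamlit run app.py`.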