Create app.py
app.py
ADDED
@@ -0,0 +1,288 @@
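# Runtime dependencies, inferred from the imports below (a sketch only —
# this Space's requirements.txt is not shown here):
#   pip install gradio ultralytics opencv-python numpy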
import gradio as gr
import cv2
import json
import tempfile
import os
from ultralytics import YOLO
import numpy as np
from collections import defaultdict
from typing import Dict, List, Tuple, Any


class HumanTracker:
    def __init__(self):
        # Load YOLOv11 model - using the nano version for faster processing
        # You can change to yolo11s.pt, yolo11m.pt, yolo11l.pt, or yolo11x.pt for better accuracy
        self.model = YOLO("yolo11n.pt")

    def calculate_center(self, x1: float, y1: float, x2: float, y2: float) -> Tuple[float, float]:
        """Calculate center coordinates from bounding box coordinates."""
        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2
        return center_x, center_y

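    # Worked example (illustrative values only): for a bounding box with
    # corners (x1, y1) = (100, 50) and (x2, y2) = (300, 250), calculate_center
    # returns ((100 + 300) / 2, (50 + 250) / 2) = (200.0, 150.0) — the
    # midpoint later used as the text-overlay anchor for that person.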
    def process_video(self, video_path: str, progress_callback=None) -> Dict[str, Any]:
        """
        Process video file and extract human tracking data.

        Args:
            video_path: Path to the input video file
            progress_callback: Optional callback function for progress updates

        Returns:
            Dictionary containing processed tracking data in the required JSON format
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        frame_data = {}
        id_mapping = {}  # Maps original YOLO IDs to simplified sequential IDs
        next_person_id = 1

        print(f"Processing video: {total_frames} frames at {fps} FPS")

        # Process video with YOLO tracking
        # Using stream=True for memory efficiency with large videos
        results = self.model.track(
            video_path,
            classes=[0],  # Only detect humans (class 0)
            persist=True,  # Enable tracking
            stream=True,
            verbose=False
        )

        frame_count = 0
        for result in results:
            if progress_callback:
                # max() guards against videos that report zero frames
                progress = (frame_count + 1) / max(total_frames, 1)
                progress_callback(progress, f"Processing frame {frame_count + 1}/{total_frames}")

            # Check if any detections exist
            if result.boxes is not None and len(result.boxes) > 0:
                # Extract bounding boxes, track IDs, and confidences
                boxes = result.boxes.xyxy.cpu().numpy()  # x1, y1, x2, y2 format
                track_ids = result.boxes.id
                confidences = result.boxes.conf.cpu().numpy()

                # The tracker may not assign IDs on some frames; only iterate
                # detections when IDs exist, rather than zipping over None
                if track_ids is not None:
                    track_ids = track_ids.int().cpu().numpy()

                    people_in_frame = []

                    for box, track_id, confidence in zip(boxes, track_ids, confidences):
                        x1, y1, x2, y2 = box

                        # Map original YOLO ID to simplified sequential ID
                        if track_id not in id_mapping:
                            id_mapping[track_id] = next_person_id
                            next_person_id += 1

                        person_id = id_mapping[track_id]

                        # Calculate center coordinates
                        center_x, center_y = self.calculate_center(x1, y1, x2, y2)

                        # Create person data
                        person_data = {
                            "person_id": person_id,
                            "center_x": float(center_x),
                            "center_y": float(center_y),
                            "confidence": float(confidence),
                            "bbox": {
                                "x1": float(x1),
                                "y1": float(y1),
                                "x2": float(x2),
                                "y2": float(y2)
                            }
                        }
                        people_in_frame.append(person_data)

                    if people_in_frame:
                        # Sort people by person_id for consistency
                        people_in_frame.sort(key=lambda x: x["person_id"])
                        frame_data[frame_count] = people_in_frame

            frame_count += 1

        cap.release()

        # Convert to the required JSON format
        frames_list = []
        sorted_frames = sorted(frame_data.keys())

        for frame_num in sorted_frames:
            frames_list.append({
                "frame": frame_num,
                "people": frame_data[frame_num]
            })

        # Create the final output structure
        output = {
            "metadata": {
                "total_frames": len(frames_list),
                "total_people": len(id_mapping),
                "video_info": {
                    "fps": float(fps),
                    "total_video_frames": total_frames
                },
                "id_mapping": {str(original_id): simplified_id for original_id, simplified_id in id_mapping.items()}
            },
            "frames": frames_list
        }

        return output

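# For reference, the dictionary returned by process_video serializes to JSON
# shaped like the following (values here are illustrative, not real output):
#
# {
#   "metadata": {
#     "total_frames": 120,
#     "total_people": 3,
#     "video_info": {"fps": 29.97, "total_video_frames": 450},
#     "id_mapping": {"1": 1, "4": 2, "7": 3}
#   },
#   "frames": [
#     {
#       "frame": 0,
#       "people": [
#         {
#           "person_id": 1,
#           "center_x": 200.0,
#           "center_y": 150.0,
#           "confidence": 0.91,
#           "bbox": {"x1": 100.0, "y1": 50.0, "x2": 300.0, "y2": 250.0}
#         }
#       ]
#     }
#   ]
# }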
def process_video_gradio(video_file, progress=gr.Progress()):
    """
    Gradio interface function for processing videos.

    Args:
        video_file: Uploaded video file from Gradio
        progress: Gradio progress tracker

    Returns:
        Tuple of (JSON file path, status message, preview of results)
    """
    if video_file is None:
        return None, "❌ Please upload a video file", "No video uploaded"

    try:
        # Initialize the tracker
        tracker = HumanTracker()

        # Create progress callback
        def update_progress(prog, msg):
            progress(prog, desc=msg)

        # Process the video
        progress(0.1, desc="Starting video processing...")
        results = tracker.process_video(video_file, update_progress)

        progress(0.9, desc="Generating JSON output...")

        # Create temporary JSON file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(results, f, indent=2)
            json_path = f.name

        # Create a preview of the results
        metadata = results["metadata"]
        total_frames = metadata["total_frames"]
        total_people = metadata["total_people"]

        # video_info is always populated by process_video, so index directly
        # (a .get() fallback string would break the :.2f format below anyway)
        preview = f"""
📊 **Processing Results:**
- **Total frames with detections:** {total_frames}
- **Unique people detected:** {total_people}
- **Original video frames:** {metadata['video_info']['total_video_frames']}
- **Video FPS:** {metadata['video_info']['fps']:.2f}

🆔 **ID Mapping:**
{json.dumps(metadata["id_mapping"], indent=2)}

🎬 **Sample Frame Data (first frame):**
{json.dumps(results["frames"][:1] if results["frames"] else [], indent=2)}
"""

        progress(1.0, desc="✅ Processing complete!")

        return (
            json_path,
            f"✅ Successfully processed video! Detected {total_people} unique people across {total_frames} frames.",
            preview
        )

    except Exception as e:
        error_msg = f"❌ Error processing video: {str(e)}"
        print(error_msg)
        return None, error_msg, f"Error details: {str(e)}"

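# A minimal headless usage sketch, bypassing the Gradio UI entirely. The
# "input.mp4" / "tracks.json" paths are placeholders, not files shipped with
# this Space:
#
#   tracker = HumanTracker()
#   data = tracker.process_video("input.mp4")
#   with open("tracks.json", "w") as f:
#       json.dump(data, f, indent=2)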
# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        title="YOLOv11 Human Tracking & Coordinate Extraction",
        theme=gr.themes.Soft()
    ) as demo:

        gr.Markdown("""
        # 🎯 YOLOv11 Human Tracking & Coordinate Extraction

        Upload a video to detect and track humans using YOLOv11. The app will:
        - 🔍 Detect humans in each frame
        - 🎯 Track individuals across frames with unique IDs
        - 📐 Extract bounding box coordinates and center points
        - 📄 Generate JSON output for text overlay positioning

        **Supported formats:** MP4, AVI, MOV, WEBM
        """)

        with gr.Row():
            with gr.Column(scale=1):
                video_input = gr.Video(
                    label="📹 Upload Video",
                    height=400
                )

                process_btn = gr.Button(
                    "🚀 Process Video",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                json_output = gr.File(
                    label="📄 Download JSON Results",
                    file_count="single"
                )

                status_output = gr.Textbox(
                    label="📋 Status",
                    value="Ready to process video...",
                    interactive=False
                )

        with gr.Row():
            preview_output = gr.Textbox(
                label="👁️ Results Preview",
                lines=15,
                interactive=False,
                placeholder="Results preview will appear here after processing..."
            )

        # Event handlers
        process_btn.click(
            fn=process_video_gradio,
            inputs=[video_input],
            outputs=[json_output, status_output, preview_output],
            show_progress=True
        )

        # Example section
        gr.Markdown("""
        ## 📋 Output Format

        The generated JSON file contains:
        - **metadata**: Video info, total people count, ID mappings
        - **frames**: Array of frame data with person detections

        Each person detection includes:
        - `person_id`: Unique identifier for tracking
        - `center_x`, `center_y`: Center coordinates for text overlay positioning
        - `confidence`: Detection confidence score
        - `bbox`: Full bounding box coordinates (x1, y1, x2, y2)
        """)

    return demo

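# Sketch of the downstream use the UI text describes: positioning a text
# overlay at each tracked person's center with OpenCV. draw_label is a
# hypothetical helper, not part of this app; it assumes a decoded frame and
# one entry from the JSON "people" array:
#
#   def draw_label(frame, person, text):
#       # Anchor the label at the detection's center point
#       x, y = int(person["center_x"]), int(person["center_y"])
#       cv2.putText(frame, text, (x, y), cv2.FONT_HERSHEY_SIMPLEX,
#                   0.8, (0, 255, 0), 2)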
if __name__ == "__main__":
    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",  # Allow external access
        server_port=7860,
        share=False,  # Set to True if you want a public link
        show_error=True
    )