Taino committed
Commit 783b779 · verified · 1 Parent(s): f24c883

Create app.py

Files changed (1)
app.py +288 -0
app.py ADDED
@@ -0,0 +1,288 @@
import gradio as gr
import cv2
import json
import tempfile
import os
from ultralytics import YOLO
import numpy as np
from collections import defaultdict
from typing import Dict, List, Tuple, Any


class HumanTracker:
    def __init__(self):
        # Load YOLOv11 model - using the nano version for faster processing
        # You can change to yolo11s.pt, yolo11m.pt, yolo11l.pt, or yolo11x.pt for better accuracy
        self.model = YOLO("yolo11n.pt")

    def calculate_center(self, x1: float, y1: float, x2: float, y2: float) -> Tuple[float, float]:
        """Calculate center coordinates from bounding box coordinates."""
        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2
        return center_x, center_y

    def process_video(self, video_path: str, progress_callback=None) -> Dict[str, Any]:
        """
        Process video file and extract human tracking data.

        Args:
            video_path: Path to the input video file
            progress_callback: Optional callback function for progress updates

        Returns:
            Dictionary containing processed tracking data in the required JSON format
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        frame_data = {}
        id_mapping = {}  # Maps original YOLO IDs to simplified sequential IDs
        next_person_id = 1

        print(f"Processing video: {total_frames} frames at {fps} FPS")

        # Process video with YOLO tracking
        # Using stream=True for memory efficiency with large videos
        results = self.model.track(
            video_path,
            classes=[0],   # Only detect humans (class 0)
            persist=True,  # Enable tracking
            stream=True,
            verbose=False
        )

        frame_count = 0
        for result in results:
            if progress_callback:
                progress = (frame_count + 1) / total_frames
                progress_callback(progress, f"Processing frame {frame_count + 1}/{total_frames}")

            # Check if any detections exist
            if result.boxes is not None and len(result.boxes) > 0:
                # Extract bounding boxes, track IDs, and confidences
                boxes = result.boxes.xyxy.cpu().numpy()  # x1, y1, x2, y2 format
                track_ids = result.boxes.id
                confidences = result.boxes.conf.cpu().numpy()

                if track_ids is not None:
                    track_ids = track_ids.int().cpu().numpy()

                    people_in_frame = []

                    for box, track_id, confidence in zip(boxes, track_ids, confidences):
                        x1, y1, x2, y2 = box

                        # Map original YOLO ID to simplified sequential ID
                        if track_id not in id_mapping:
                            id_mapping[track_id] = next_person_id
                            next_person_id += 1

                        person_id = id_mapping[track_id]

                        # Calculate center coordinates
                        center_x, center_y = self.calculate_center(x1, y1, x2, y2)

                        # Create person data
                        person_data = {
                            "person_id": person_id,
                            "center_x": float(center_x),
                            "center_y": float(center_y),
                            "confidence": float(confidence),
                            "bbox": {
                                "x1": float(x1),
                                "y1": float(y1),
                                "x2": float(x2),
                                "y2": float(y2)
                            }
                        }
                        people_in_frame.append(person_data)

                    if people_in_frame:
                        # Sort people by person_id for consistency
                        people_in_frame.sort(key=lambda x: x["person_id"])
                        frame_data[frame_count] = people_in_frame

            frame_count += 1

        cap.release()

        # Convert to the required JSON format
        frames_list = []
        sorted_frames = sorted(frame_data.keys())

        for frame_num in sorted_frames:
            frames_list.append({
                "frame": frame_num,
                "people": frame_data[frame_num]
            })

        # Create the final output structure
        output = {
            "metadata": {
                "total_frames": len(frames_list),
                "total_people": len(id_mapping),
                "video_info": {
                    "fps": float(fps),
                    "total_video_frames": total_frames
                },
                "id_mapping": {str(original_id): simplified_id for original_id, simplified_id in id_mapping.items()}
            },
            "frames": frames_list
        }

        return output


def process_video_gradio(video_file, progress=gr.Progress()):
    """
    Gradio interface function for processing videos.

    Args:
        video_file: Uploaded video file from Gradio
        progress: Gradio progress tracker

    Returns:
        Tuple of (JSON file path, status message, preview of results)
    """
    if video_file is None:
        return None, "❌ Please upload a video file", "No video uploaded"

    try:
        # Initialize the tracker
        tracker = HumanTracker()

        # Create progress callback
        def update_progress(prog, msg):
            progress(prog, desc=msg)

        # Process the video
        progress(0.1, desc="Starting video processing...")
        results = tracker.process_video(video_file, update_progress)

        progress(0.9, desc="Generating JSON output...")

        # Create temporary JSON file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            json.dump(results, f, indent=2)
            json_path = f.name

        # Create a preview of the results
        metadata = results["metadata"]
        total_frames = metadata["total_frames"]
        total_people = metadata["total_people"]

        preview = f"""
📊 **Processing Results:**
- **Total frames with detections:** {total_frames}
- **Unique people detected:** {total_people}
- **Original video frames:** {metadata.get('video_info', {}).get('total_video_frames', 'N/A')}
- **Video FPS:** {metadata.get('video_info', {}).get('fps', 'N/A'):.2f}

🆔 **ID Mapping:**
{json.dumps(metadata["id_mapping"], indent=2)}

📋 **Sample Frame Data (first frame):**
{json.dumps(results["frames"][:1] if results["frames"] else [], indent=2)}
"""

        progress(1.0, desc="✅ Processing complete!")

        return (
            json_path,
            f"✅ Successfully processed video! Detected {total_people} unique people across {total_frames} frames.",
            preview
        )

    except Exception as e:
        error_msg = f"❌ Error processing video: {str(e)}"
        print(error_msg)
        return None, error_msg, f"Error details: {str(e)}"


# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        title="YOLOv11 Human Tracking & Coordinate Extraction",
        theme=gr.themes.Soft()
    ) as demo:

        gr.Markdown("""
        # 🎯 YOLOv11 Human Tracking & Coordinate Extraction

        Upload a video to detect and track humans using YOLOv11. The app will:
        - 🔍 Detect humans in each frame
        - 🎯 Track individuals across frames with unique IDs
        - 📍 Extract bounding box coordinates and center points
        - 📝 Generate JSON output for text overlay positioning

        **Supported formats:** MP4, AVI, MOV, WEBM
        """)

        with gr.Row():
            with gr.Column(scale=1):
                video_input = gr.Video(
                    label="📹 Upload Video",
                    height=400
                )

                process_btn = gr.Button(
                    "🚀 Process Video",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                json_output = gr.File(
                    label="📁 Download JSON Results",
                    file_count="single"
                )

                status_output = gr.Textbox(
                    label="📊 Status",
                    value="Ready to process video...",
                    interactive=False
                )

        with gr.Row():
            preview_output = gr.Textbox(
                label="👁️ Results Preview",
                lines=15,
                interactive=False,
                placeholder="Results preview will appear here after processing..."
            )

        # Event handlers
        process_btn.click(
            fn=process_video_gradio,
            inputs=[video_input],
            outputs=[json_output, status_output, preview_output],
            show_progress=True
        )

        # Example section
        gr.Markdown("""
        ## 📋 Output Format

        The generated JSON file contains:
        - **metadata**: Video info, total people count, ID mappings
        - **frames**: Array of frame data with person detections

        Each person detection includes:
        - `person_id`: Unique identifier for tracking
        - `center_x`, `center_y`: Center coordinates for text overlay positioning
        - `confidence`: Detection confidence score
        - `bbox`: Full bounding box coordinates (x1, y1, x2, y2)
        """)

    return demo


if __name__ == "__main__":
    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",  # Allow external access
        server_port=7860,
        share=False,  # Set to True if you want a public link
        show_error=True
    )
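
For reference, here is a minimal sketch of how the generated JSON could be consumed for the text-overlay positioning this app targets. It is not part of the commit; the file names ("output.json", "input.mp4", "labeled.mp4") and the label style are illustrative assumptions.

import cv2
import json

# Load the tracking data produced by the app above (hypothetical file name)
with open("output.json") as f:
    data = json.load(f)

# Index detections by frame number for constant-time lookup per frame
people_by_frame = {entry["frame"]: entry["people"] for entry in data["frames"]}

cap = cv2.VideoCapture("input.mp4")
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
writer = cv2.VideoWriter("labeled.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))

frame_idx = 0
while True:
    ok, frame = cap.read()
    if not ok:
        break
    # Draw each tracked person's ID at their center point
    for person in people_by_frame.get(frame_idx, []):
        x, y = int(person["center_x"]), int(person["center_y"])
        cv2.putText(frame, f"ID {person['person_id']}", (x, y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
    writer.write(frame)
    frame_idx += 1

cap.release()
writer.release()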