Segizu committed on
Commit b518740 · 1 Parent(s): 8ac08dc
Files changed (2)
  1. app.py +158 -80
  2. requirements.txt +1 -5
app.py CHANGED
@@ -1,87 +1,165 @@
- import cv2
  import gradio as gr
+ from PIL import Image, ImageDraw, ImageFont
  from ultralytics import YOLO
- from PIL import Image
+ import spaces
+ import cv2
+ import numpy as np
  import tempfile

- # Load the YOLOv8 model (you can use yolov8n.pt, yolov8s.pt, etc.)
- model = YOLO("yolov8n.pt")
-
- def process_video(video_path):
+ @spaces.GPU
+ def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
+     if input_type == "Image":
+         if image is None:
+             width, height = 640, 480
+             blank_image = Image.new("RGB", (width, height), color="white")
+             draw = ImageDraw.Draw(blank_image)
+             message = "No image provided"
+             font = ImageFont.load_default(size=40)
+             bbox = draw.textbbox((0, 0), message, font=font)
+             text_width = bbox[2] - bbox[0]
+             text_height = bbox[3] - bbox[1]
+             text_x = (width - text_width) / 2
+             text_y = (height - text_height) / 2
+             draw.text((text_x, text_y), message, fill="black", font=font)
+             return blank_image, None
+
+         model = YOLO(model_id)
+         results = model.predict(
+             source=image,
+             conf=conf_threshold,
+             iou=iou_threshold,
+             imgsz=640,
+             max_det=max_detection,
+             show_labels=True,
+             show_conf=True,
+         )
+         for r in results:
+             image_array = r.plot()
+             annotated_image = Image.fromarray(image_array[..., ::-1])
+         return annotated_image, None
+
+     elif input_type == "Video":
+         if video is None:
+             width, height = 640, 480
+             blank_image = Image.new("RGB", (width, height), color="white")
+             draw = ImageDraw.Draw(blank_image)
+             message = "No video provided"
+             font = ImageFont.load_default(size=40)
+             bbox = draw.textbbox((0, 0), message, font=font)
+             text_width = bbox[2] - bbox[0]
+             text_height = bbox[3] - bbox[1]
+             text_x = (width - text_width) / 2
+             text_y = (height - text_height) / 2
+             draw.text((text_x, text_y), message, fill="black", font=font)
+             temp_video_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+             fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+             out = cv2.VideoWriter(temp_video_file, fourcc, 1, (width, height))
+             frame = cv2.cvtColor(np.array(blank_image), cv2.COLOR_RGB2BGR)
+             out.write(frame)
+             out.release()
+             return None, temp_video_file
+
+         model = YOLO(model_id)
+         cap = cv2.VideoCapture(video)
+         fps = cap.get(cv2.CAP_PROP_FPS) if cap.get(cv2.CAP_PROP_FPS) > 0 else 25
+         frames = []
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+             pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+             results = model.predict(
+                 source=pil_frame,
+                 conf=conf_threshold,
+                 iou=iou_threshold,
+                 imgsz=640,
+                 max_det=max_detection,
+                 show_labels=True,
+                 show_conf=True,
+             )
+             for r in results:
+                 annotated_frame_array = r.plot()
+                 annotated_frame = cv2.cvtColor(annotated_frame_array, cv2.COLOR_BGR2RGB)
+                 frames.append(annotated_frame)
+         cap.release()
+         if len(frames) == 0:
+             return None, None
+
+         height_out, width_out, _ = frames[0].shape
+         temp_video_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+         out = cv2.VideoWriter(temp_video_file, fourcc, fps, (width_out, height_out))
+         for f in frames:
+             f_bgr = cv2.cvtColor(f, cv2.COLOR_RGB2BGR)
+             out.write(f_bgr)
+         out.release()
+         return None, temp_video_file
+
+     else:
+         return None, None
+
+ def update_visibility(input_type):
      """
-     Processes a video, detects people, bicycles and motorcycles with YOLOv8,
-     and draws the bounding boxes and labels on each frame. Returns an annotated .mp4.
+     Show/hide image/video input and output depending on input_type.
      """
-     cap = cv2.VideoCapture(video_path)
-     if not cap.isOpened():
-         return None
-
-     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-     fps = cap.get(cv2.CAP_PROP_FPS)
-
-     # Create a temporary file to store the result
-     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-     output_path = tmp_file.name
-     tmp_file.close()
-
-     # Use a browser-compatible codec (H.264 / avc1)
-     fourcc = cv2.VideoWriter_fourcc(*'avc1')
-     out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-
-     # Classes we care about
-     valid_classes = ["person", "bicycle", "motorcycle"]
-
-     while True:
-         ret, frame = cap.read()
-         if not ret:
-             break
-
-         # Convert BGR -> RGB before predicting with YOLO
-         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+     if input_type == "Image":
+         # image, video, output_image, output_video
+         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
+     else:
+         return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)

-         # Run inference with a confidence threshold of 0.5
-         results = model.predict(frame_rgb, conf=0.5)
-         # results is a list; take the first prediction
-         boxes = results[0].boxes
-
-         # Draw each bounding box
-         for box in boxes:
-             # box.cls, box.conf and box.xyxy are tensors, so convert to Python float/int
-             cls_id = int(box.cls[0].item())  # Class index
-             conf = float(box.conf[0].item())  # Confidence
-             x1, y1, x2, y2 = box.xyxy[0]  # Coordinates [xmin, ymin, xmax, ymax]
-
-             class_name = model.names[cls_id]
-             if class_name in valid_classes:
-                 # Draw the rectangle
-                 cv2.rectangle(frame,
-                               (int(x1), int(y1)),
-                               (int(x2), int(y2)),
-                               (0, 255, 0), 2)
-
-                 text = f"{class_name} {conf:.2f}"
-                 cv2.putText(frame, text,
-                             (int(x1), int(y1) - 10),
-                             cv2.FONT_HERSHEY_SIMPLEX, 0.5,
-                             (0, 255, 0), 2)
-
-         # Write the annotated frame to the output video
-         out.write(frame)
-
-     cap.release()
-     out.release()
-     return output_path
-
- # Gradio interface
- iface = gr.Interface(
-     fn=process_video,
-     inputs=gr.Video(label="Upload your video"),
-     outputs=gr.Video(label="Processed video"),
-     title="Object Detection with YOLOv8",
-     description="Upload a video and people, bicycles and motorcycles are detected with YOLOv8. "
-                 "Objects are boxed and labeled in the resulting video."
- )
-
- if __name__ == "__main__":
-     iface.launch()
+ def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
+     """
+     This is called by gr.Examples. We force the radio to 'Image'
+     and then do a standard image inference, returning both updated radio
+     value and the annotated image.
+     """
+     annotated_image, _ = yolo_inference(
+         input_type="Image",
+         image=image,
+         video=None,
+         model_id=model_id,
+         conf_threshold=conf_threshold,
+         iou_threshold=iou_threshold,
+         max_detection=max_detection
+     )
+     return gr.update(value="Image"), annotated_image
+
+ with gr.Blocks() as app:
+     gr.Markdown("# Yolo11: Object Detection, Instance Segmentation, Pose/Keypoints, Oriented Detection, Classification")
+     gr.Markdown("Upload image(s) or video(s) for inference using the latest Ultralytics YOLO11 models.")
+
+     with gr.Row():
+         with gr.Column():
+             image = gr.Image(type="pil", label="Image", visible=True)
+             video = gr.Video(label="Video", visible=False)
+             input_type = gr.Radio(
+                 choices=["Image", "Video"],
+                 value="Image",
+                 label="Input Type",
+             )
+             conf_threshold = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")
+             iou_threshold = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
+             max_detection = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
+             infer_button = gr.Button("Detect Objects")
+         with gr.Column():
+             output_image = gr.Image(type="pil", label="Annotated Image", visible=True)
+             output_video = gr.Video(label="Annotated Video", visible=False)
+
+     # Toggle input/output visibility
+     input_type.change(
+         fn=update_visibility,
+         inputs=input_type,
+         outputs=[image, video, output_image, output_video],
+     )
+
+     # Main inference for button click
+     infer_button.click(
+         fn=yolo_inference,
+         inputs=[input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection],
+         outputs=[output_image, output_video],
+     )
+
+
+ if __name__ == '__main__':
+     app.launch()
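
Note: the infer_button.click(...) call above lists model_id among its inputs, but no model_id component is defined anywhere in this hunk, so the app as committed would raise a NameError when the Blocks layout is built. A minimal sketch of a dropdown that could supply it, placed alongside the other controls; the YOLO11 weight names below are assumptions, not part of the commit:

    # Hypothetical component, not in this commit: supplies the model_id input
    # referenced by infer_button.click(). Weight file names are assumed.
    model_id = gr.Dropdown(
        label="Model",
        choices=["yolo11n.pt", "yolo11s.pt", "yolo11m.pt", "yolo11l.pt", "yolo11x.pt"],
        value="yolo11n.pt",
    )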
requirements.txt CHANGED
@@ -1,9 +1,5 @@
- gradio
- opencv-python
- transformers
+ spaces
  torch
- tensorflow
  torchvision
- timm
  ultralytics
  Pillow
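
The image branch of yolo_inference relies on Ultralytics returning a BGR array from r.plot(), which is then flipped to RGB for PIL. A standalone sketch of that round trip, using an assumed weight file and image path:

    from PIL import Image
    from ultralytics import YOLO

    # Same pattern as the new image branch: predict, plot (BGR array), flip to RGB, wrap in PIL.
    model = YOLO("yolo11n.pt")  # assumed weights
    results = model.predict(source="sample.jpg", conf=0.25, iou=0.45, imgsz=640, max_det=300)
    annotated = Image.fromarray(results[0].plot()[..., ::-1])
    annotated.save("annotated.jpg")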