Spaces:

Segizu
/

Computer_Vision

Sleeping

App Files Files Community

Segizu committed on Mar 14

Commit

8ac08dc

1 Parent(s): 27e96af

yolov8

Browse files

Files changed (2) hide show

app.py +50 -48
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -1,84 +1,86 @@
 import cv2
 import gradio as gr
-from transformers import pipeline
 from PIL import Image
 import tempfile
-# Load the object-detection model using the CPU
-detector = pipeline("object-detection", model="facebook/detr-resnet-50", device=-1)
 def process_video(video_path):
     """
-    Process a video, detect objects, and draw boxes and labels over them.
-    Only detections of people, bicycles and motorcycles are processed.
-    Returns the path of the annotated video.
     """
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
         return None
-    # Get the video properties
-    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps    = cap.get(cv2.CAP_PROP_FPS)
-    # Create a temporary file to hold the output video
     tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
     output_path = tmp_file.name
-    tmp_file.close()  # Closed so that VideoWriter can write to it
-    # Configure the VideoWriter (we use the mp4v codec)
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-    # Define the classes of interest
-    valid_labels = {"person", "bicycle", "motorcycle"}
-    threshold = 0.7  # Confidence threshold
     while True:
         ret, frame = cap.read()
         if not ret:
             break
-        # Convert the frame from BGR to RGB and then to a PIL image
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        pil_image = Image.fromarray(frame_rgb)
-        # Get detections from the pipeline
-        results = detector(pil_image)
-        # Draw each valid detection on the frame
-        for detection in results:
-            score = detection["score"]
-            label = detection["label"].lower()
-            if score < threshold or label not in valid_labels:
-                continue
-            # Extract the object's box (it is a dictionary)
-            box = detection["box"]
-            xmin = box["xmin"]
-            ymin = box["ymin"]
-            xmax = box["xmax"]
-            ymax = box["ymax"]
-            # Draw the rectangle and the label on the frame
-            cv2.rectangle(frame, (int(xmin), int(ymin)), (int(xmax), int(ymax)), color=(0, 255, 0), thickness=2)
-            text = f"{label}: {score:.2f}"
-            cv2.putText(frame, text, (int(xmin), int(ymin)-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
-        # Write the annotated frame to the output video
         out.write(frame)
     cap.release()
     out.release()
     return output_path
 # Gradio interface: the uploaded video is passed to process_video and its result is shown as a video
 iface = gr.Interface(
     fn=process_video,
     inputs=gr.Video(label="Sube tu video"),
     outputs=gr.Video(label="Video procesado"),
-    title="Detección y Visualización de Objetos en Video",
-    description="Carga un video y se detectan personas, bicicletas y motos. Los objetos se enmarcan y etiquetan en tiempo real."
 )
 if __name__ == "__main__":

 import cv2
 import gradio as gr
+from ultralytics import YOLO
 from PIL import Image
 import tempfile
+# Load the YOLOv8 model (yolov8n.pt, yolov8s.pt, etc. can be used here)
+model = YOLO("yolov8n.pt")
 def process_video(video_path):
     """
     Detect people, bicycles and motorcycles in a video with YOLOv8.

     Every frame is run through the module-level ``model``. Detections of the
     wanted classes are drawn as green boxes with a "<class> <confidence>"
     label, and the annotated frames are written to a temporary .mp4 file.

     Args:
         video_path: Path of the input video.

     Returns:
         Path of the annotated .mp4 file, or None when the input cannot be
         opened or no video writer could be created.
     """
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
         cap.release()
         return None

     out = None
     try:
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         fps = cap.get(cv2.CAP_PROP_FPS)
         # Some containers report 0 (or NaN) FPS, which VideoWriter cannot use.
         if not fps or fps != fps:
             fps = 30.0

         # Only the file name is needed; the handle is closed on leaving the
         # ``with`` so that VideoWriter can open the path itself.
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_file:
             output_path = tmp_file.name

         # Prefer H.264 (avc1) because browsers can play it; fall back to mp4v
         # when the local OpenCV build ships without an H.264 encoder.
         for codec in ("avc1", "mp4v"):
             out = cv2.VideoWriter(
                 output_path,
                 cv2.VideoWriter_fourcc(*codec),
                 fps,
                 (width, height),
             )
             if out.isOpened():
                 break
             out.release()
         else:
             # No usable encoder: report failure instead of returning an empty file.
             return None

         valid_classes = {"person", "bicycle", "motorcycle"}
         green = (0, 255, 0)

         while True:
             ret, frame = cap.read()
             if not ret:
                 break

             # OpenCV frames are BGR, which is the channel order Ultralytics
             # expects for numpy input, so the frame is passed as-is (converting
             # to RGB first would swap the red and blue channels at inference).
             # verbose=False silences the per-frame log line.
             results = model.predict(frame, conf=0.5, verbose=False)

             # results is a list with one entry per input image.
             for box in results[0].boxes:
                 class_name = model.names[int(box.cls[0].item())]
                 if class_name not in valid_classes:
                     continue
                 conf = float(box.conf[0].item())
                 # xyxy holds [xmin, ymin, xmax, ymax]; OpenCV needs plain ints.
                 x1, y1, x2, y2 = (int(v) for v in box.xyxy[0].tolist())
                 cv2.rectangle(frame, (x1, y1), (x2, y2), green, 2)
                 # Keep the label inside the frame for boxes touching the top edge.
                 cv2.putText(
                     frame,
                     f"{class_name} {conf:.2f}",
                     (x1, max(y1 - 10, 10)),
                     cv2.FONT_HERSHEY_SIMPLEX,
                     0.5,
                     green,
                     2,
                 )

             out.write(frame)
     finally:
         # Always free the capture and the writer, even if inference raises.
         cap.release()
         if out is not None:
             out.release()

     return output_path
+# Gradio interface: the uploaded video is passed to process_video and its result is shown as a video
 iface = gr.Interface(
     fn=process_video,
     inputs=gr.Video(label="Sube tu video"),
     outputs=gr.Video(label="Video procesado"),
+    title="Detección de Objetos con YOLOv8",
+    description="Sube un video y se detectan personas, bicicletas y motos con YOLOv8. "
+                "Los objetos se enmarcan y etiquetan en el video resultante."
 )
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -4,4 +4,6 @@ transformers
 torch
 tensorflow
 torchvision
-timm

 torch
 tensorflow
 torchvision
+timm
+ultralytics
+Pillow