Spaces:

Segizu
/

Computer_Vision

Running on Zero

File size: 3,599 Bytes

7d1ac51
 
 
a2562c0
de9232e
456adb5
de9232e
456adb5
daa16ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d1ac51
 
 
de9232e
 
 
7d1ac51
 
 
de9232e
 
 
 
 
 
7d1ac51
de9232e
 
 
 
 
 
 
 
7d1ac51
de9232e
 
 
 
7d1ac51
 
 
 
 
de9232e
7d1ac51
a2562c0
 
de9232e
a2562c0
7d1ac51
de9232e
7d1ac51
de9232e
7d1ac51
de9232e
 
 
 
 
 
 
 
 
 
 
 
 
7d1ac51
de9232e
 
7d1ac51
 
de9232e
 
7d1ac51
 
 
 
de9232e
 
 
7d1ac51

import cv2
import gradio as gr
from transformers import pipeline
from PIL import Image
import tempfile
import torch

# Pick the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the object detection model. If the first attempt fails for any reason,
# report the error and retry once with the CPU explicitly selected.
try:
    detector = pipeline(
        "object-detection",
        model="facebook/detr-resnet-50",
        framework="pt",  # use the PyTorch backend
        device=-1 if device != "cuda" else 0,  # 0 selects the GPU, -1 the CPU
    )
    print("Model loaded successfully on", device)
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    print("Falling back to CPU")
    detector = pipeline(
        "object-detection",
        model="facebook/detr-resnet-50",
        framework="pt",
        device=-1,
    )

def _annotate_frame(frame, detections, valid_labels, threshold):
    """Draw a labelled green box on ``frame`` for every accepted detection.

    A detection is accepted when its score is at least ``threshold`` and its
    lower-cased label is in ``valid_labels``. ``frame`` (a BGR image array as
    read by OpenCV) is modified in place.
    """
    for detection in detections:
        score = detection["score"]
        label = detection["label"].lower()
        if score < threshold or label not in valid_labels:
            continue

        # The object-detection pipeline reports each box as a dict of corner
        # coordinates (xmin, ymin, xmax, ymax), not as [x, y, width, height].
        box = detection["box"]
        xmin, ymin = int(box["xmin"]), int(box["ymin"])
        xmax, ymax = int(box["xmax"]), int(box["ymax"])

        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color=(0, 255, 0), thickness=2)

        # Caption goes just above the box; when the box touches the top edge
        # the caption would fall outside the frame, so draw it inside instead.
        text_y = ymin - 10 if ymin >= 20 else ymin + 15
        cv2.putText(
            frame,
            f"{label}: {score:.2f}",
            (xmin, text_y),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 255, 0),
            2,
        )


def process_video(video_path):
    """Detect people, bicycles and motorcycles in a video and annotate them.

    Every frame is run through the module-level ``detector`` pipeline; each
    detection of an accepted class scoring at least 0.7 is outlined and
    captioned with its label and score. The annotated frames are written to a
    new temporary ``.mp4`` file.

    Args:
        video_path: Path of the input video. May be ``None`` or empty when no
            video was supplied.

    Returns:
        The path of the annotated video, or ``None`` when no input was given,
        the input cannot be opened, or the output file cannot be created.
    """
    import os  # local import: only needed to clean up a failed output file

    # Nothing uploaded: bail out before handing an invalid path to OpenCV.
    if not video_path:
        return None

    valid_labels = {"person", "bicycle", "motorcycle"}
    threshold = 0.7  # minimum confidence for a detection to be drawn
    default_fps = 30.0

    cap = cv2.VideoCapture(video_path)
    out = None
    output_path = None
    succeeded = False
    try:
        if not cap.isOpened():
            return None

        # Read the properties the writer has to reproduce.
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Some containers report 0 (or NaN) for the frame rate, which would
        # produce an unplayable output; fall back to a sane default.
        if not fps > 0:
            fps = default_fps

        # Reserve a temporary file name, then close the handle so that
        # VideoWriter can open the path itself.
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        output_path = tmp_file.name
        tmp_file.close()

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            # Codec/size not supported: do not return a path to an empty file.
            return None

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV frames are BGR; the pipeline expects an RGB PIL image.
            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Pass the threshold through so the pipeline's own default cut-off
            # cannot discard detections this function is meant to keep.
            results = detector(pil_image, threshold=threshold)

            _annotate_frame(frame, results, valid_labels, threshold)
            out.write(frame)

        succeeded = True
        return output_path
    finally:
        # Always release the native handles, even if inference raised.
        cap.release()
        if out is not None:
            out.release()
        if not succeeded and output_path is not None:
            # Best-effort cleanup of the unused temporary file.
            try:
                os.remove(output_path)
            except OSError:
                pass

# Build the two video widgets first, then wire them to the processing function.
video_input = gr.Video(label="Sube tu video")
video_output = gr.Video(label="Video procesado")

iface = gr.Interface(
    fn=process_video,
    inputs=video_input,
    outputs=video_output,
    title="Detección y Visualización de Objetos en Video",
    description="Carga un video y se detectan personas, bicicletas y motos. Los objetos se enmarcan y etiquetan, mostrando la detección en tiempo real.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()