import os
import tempfile

import cv2
import gradio as gr
from PIL import Image
from transformers import pipeline

# Load the object-detection model once at import time; device=-1 selects the CPU
detector = pipeline(
    task="object-detection",
    model="facebook/detr-resnet-50",
    device=-1,
)

def _box_corners(box):
    """Return ``(xmin, ymin, xmax, ymax)`` as ints for a detection box.

    The Transformers object-detection pipeline reports each box as a dict
    with ``xmin``/``ymin``/``xmax``/``ymax`` keys. A plain
    ``[xmin, ymin, width, height]`` sequence is also accepted so a detector
    using that layout keeps working.
    """
    if isinstance(box, dict):
        return (
            int(box["xmin"]),
            int(box["ymin"]),
            int(box["xmax"]),
            int(box["ymax"]),
        )
    xmin, ymin, box_w, box_h = box
    return int(xmin), int(ymin), int(xmin + box_w), int(ymin + box_h)


def process_video(video_path):
    """Annotate people, bicycles and motorcycles in a video.

    Every frame is run through the module-level ``detector``; detections of
    the classes of interest that reach the confidence threshold are drawn on
    the frame as a green rectangle with a ``label: score`` caption. The
    annotated frames are written to a temporary ``.mp4`` file.

    Args:
        video_path: Path of the input video. ``None`` or an empty string
            (nothing uploaded) is accepted.

    Returns:
        The path of the annotated video, or ``None`` when no path was given,
        the input could not be opened, or the output writer could not be
        created.

    Raises:
        Any exception raised by the detector or by OpenCV while processing
        propagates to the caller; the capture/writer are released and the
        partial output file is removed first.
    """
    # Gradio calls the function with None when no video was uploaded.
    if not video_path:
        return None

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        return None

    out = None
    output_path = None
    completed = False
    try:
        # Read the properties of the input video
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Some containers report 0 (or NaN) FPS, which VideoWriter cannot use;
        # 30 is an arbitrary but common fallback.
        if not fps or fps != fps:
            fps = 30.0

        # Reserve a temporary file name for the output video; the handle is
        # closed right away so VideoWriter can open the path itself.
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        output_path = tmp_file.name
        tmp_file.close()

        # Configure the VideoWriter (mp4v codec)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            return None

        # Classes to annotate and minimum confidence
        valid_labels = {"person", "bicycle", "motorcycle"}
        threshold = 0.7

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV frames are BGR; the pipeline expects an RGB PIL image
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame_rgb)

            # Pass the threshold explicitly so the pipeline's own default
            # cut-off does not drop detections we want to keep.
            results = detector(pil_image, threshold=threshold)

            # Draw every valid detection on the original (BGR) frame
            for detection in results:
                score = detection["score"]
                label = detection["label"].lower()
                if score < threshold or label not in valid_labels:
                    continue

                xmin, ymin, xmax, ymax = _box_corners(detection["box"])

                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color=(0, 255, 0), thickness=2)
                text = f"{label}: {score:.2f}"
                # Put the caption above the box, or just inside it when the
                # box touches the top edge and the text would fall off-frame.
                text_y = ymin - 10 if ymin >= 20 else ymin + 15
                cv2.putText(frame, text, (xmin, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Append the annotated frame to the output video
            out.write(frame)

        completed = True
        return output_path
    finally:
        cap.release()
        if out is not None:
            out.release()
        # The temp file is created with delete=False, so remove it ourselves
        # when no usable video was produced.
        if not completed and output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass  # best-effort cleanup; the original error matters more

# Gradio UI: one uploaded video goes in, the annotated video comes out
iface = gr.Interface(
    title="Detección y Visualización de Objetos en Video",
    description=(
        "Carga un video y se detectan personas, bicicletas y motos. "
        "Los objetos se enmarcan y etiquetan, mostrando la detección en tiempo real."
    ),
    fn=process_video,
    inputs=gr.Video(label="Sube tu video"),
    outputs=gr.Video(label="Video procesado"),
)

# Start the web app only when this file is executed as a script
if __name__ == "__main__":
    iface.launch()