Spaces:

Segizu
/

Computer_Vision

Sleeping

Computer_Vision / app.py

de9232e 4 months ago

3.07 kB

	import cv2
	import gradio as gr
	from transformers import pipeline
	from PIL import Image
	import tempfile

	# Build the object-detection pipeline once, at import time, so every request
	# reuses the same loaded model. device=-1 keeps inference on the CPU.
	detector = pipeline(
	    task="object-detection",
	    model="facebook/detr-resnet-50",
	    device=-1,
	)

	def _annotate_frame(frame, detections, valid_labels, threshold):
	    """Draw a green box and a "label: score" caption for each accepted detection.

	    A detection is accepted when its score is at least ``threshold`` and its
	    lower-cased label is in ``valid_labels``. ``frame`` (a BGR image array) is
	    modified in place.
	    """
	    for detection in detections:
	        score = detection["score"]
	        label = detection["label"].lower()
	        if score < threshold or label not in valid_labels:
	            continue

	        # The object-detection pipeline reports the box as a dict of corner
	        # coordinates (xmin, ymin, xmax, ymax), not as [x, y, width, height].
	        box = detection["box"]
	        xmin, ymin = int(box["xmin"]), int(box["ymin"])
	        xmax, ymax = int(box["xmax"]), int(box["ymax"])

	        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color=(0, 255, 0), thickness=2)
	        text = f"{label}: {score:.2f}"
	        # Keep the caption inside the image when the box touches the top edge.
	        text_y = max(ymin - 10, 15)
	        cv2.putText(frame, text, (xmin, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)


	def process_video(video_path):
	    """Annotate a video with detected people, bicycles and motorcycles.

	    Every frame is run through the module-level ``detector`` pipeline; each
	    detection of an accepted class with a score of at least 0.7 gets a
	    bounding box and a caption drawn on the frame. The annotated frames are
	    written to a temporary ``.mp4`` file encoded with the ``mp4v`` codec.

	    Args:
	        video_path: Path of the input video file.

	    Returns:
	        The path of the annotated video, or ``None`` when the input video
	        cannot be opened.
	    """
	    cap = cv2.VideoCapture(video_path)
	    if not cap.isOpened():
	        return None

	    out = None
	    try:
	        # Video properties needed to configure the writer.
	        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
	        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
	        fps = cap.get(cv2.CAP_PROP_FPS)
	        if not fps > 0:
	            # Some inputs report 0 (or NaN) FPS, which is not a usable
	            # frame rate for the writer; fall back to a common default.
	            fps = 30.0

	        # Reserve a temporary output path. The handle is closed right away so
	        # that VideoWriter can open the file itself.
	        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
	        output_path = tmp_file.name
	        tmp_file.close()

	        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
	        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

	        # Classes to annotate and the minimum confidence required.
	        valid_labels = {"person", "bicycle", "motorcycle"}
	        threshold = 0.7

	        while True:
	            ret, frame = cap.read()
	            if not ret:
	                break

	            # OpenCV frames are BGR; the pipeline is fed an RGB PIL image.
	            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	            pil_image = Image.fromarray(frame_rgb)

	            # Pass the threshold explicitly so the pipeline's own default
	            # cut-off cannot discard detections before the filter below.
	            results = detector(pil_image, threshold=threshold)

	            _annotate_frame(frame, results, valid_labels, threshold)
	            out.write(frame)
	    finally:
	        # Release the capture and the writer even if a frame fails.
	        cap.release()
	        if out is not None:
	            out.release()

	    return output_path

	# Gradio UI: a single video upload as input, the annotated video as output.
	iface = gr.Interface(
	    title="Detección y Visualización de Objetos en Video",
	    description=(
	        "Carga un video y se detectan personas, bicicletas y motos. "
	        "Los objetos se enmarcan y etiquetan, mostrando la detección en tiempo real."
	    ),
	    fn=process_video,
	    inputs=gr.Video(label="Sube tu video"),
	    outputs=gr.Video(label="Video procesado"),
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()