# Author: Marcus Vinicius Zerbini Canhaço
# feat: atualização do detector com otimizações para GPU T4
# commit: 2dfae6f
import torch
from transformers import Owlv2Processor, Owlv2ForObjectDetection
from PIL import Image
import numpy as np
import cv2
import time
from typing import List, Dict, Tuple, Optional, Union
import os
from tqdm import tqdm
import json
from pathlib import Path
from contextlib import nullcontext
import threading
import hashlib
import pickle
from datetime import datetime
from dotenv import load_dotenv
import tempfile
import subprocess
import shutil
import traceback
import psutil
import logging
import gc
import sys
from concurrent.futures import ThreadPoolExecutor
import torch.nn.functional as F
from .base import BaseDetector, BaseCache
# Configure module-level logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Load environment variables from a .env file, if present.
load_dotenv()
class CPUCache(BaseCache):
    """Result cache specialized for CPU execution.

    Identical to the base cache except that entries are pinned to the
    CPU torch device.
    """

    def __init__(self, max_size: int = 1000):
        """Create a cache holding at most ``max_size`` entries on the CPU device."""
        super().__init__(max_size)
        self.device = torch.device('cpu')
class WeaponDetectorCPU(BaseDetector):
    """CPU implementation of the weapon detector.

    Wraps an OWLv2 grounded object-detection model and exposes single-image
    detection plus early-stopping video processing, all on the CPU device.

    NOTE(review): the original file defined every method of this class twice;
    the second copies silently shadowed the first, and the shadowing
    ``clear_cache`` lacked the defensive checks of the first copy. The
    duplicates were removed, keeping the more robust variants.
    """

    def __init__(self):
        """Initialize lightweight attributes; model loading happens in _initialize()."""
        super().__init__()
        self.default_resolution = 640
        self.device = torch.device('cpu')

    def _get_best_device(self):
        """Return the device this implementation runs on (always CPU)."""
        return torch.device('cpu')

    def _initialize(self):
        """Load and configure the OWLv2 model and processor for CPU execution.

        Side effects: sets ``owlv2_processor``, ``owlv2_model``, ``text_queries``,
        ``processed_text``, ``result_cache`` and ``_initialized``.

        Raises:
            Exception: re-raised after logging if any initialization step fails.
        """
        try:
            # Thread settings tuned for CPU inference.
            torch.set_num_threads(min(8, os.cpu_count()))
            torch.set_num_interop_threads(min(8, os.cpu_count()))
            # Cache model weights under the system temp directory.
            cache_dir = os.path.join(tempfile.gettempdir(), 'weapon_detection_cache')
            os.makedirs(cache_dir, exist_ok=True)
            model_name = "google/owlv2-base-patch16"
            logger.info("Carregando modelo e processador...")
            self.owlv2_processor = Owlv2Processor.from_pretrained(
                model_name,
                cache_dir=cache_dir
            )
            self.owlv2_model = Owlv2ForObjectDetection.from_pretrained(
                model_name,
                cache_dir=cache_dir,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True
            ).to(self.device)
            self.owlv2_model.eval()
            # Text queries come from the base class and are tokenized only once,
            # then reused for every frame/image.
            self.text_queries = self._get_detection_queries()
            logger.info(f"Total de queries carregadas: {len(self.text_queries)}")
            logger.info("Processando queries...")
            self.processed_text = self.owlv2_processor(
                text=self.text_queries,
                return_tensors="pt",
                padding=True
            ).to(self.device)
            # Result cache size is configurable through the environment.
            cache_size = int(os.getenv('RESULT_CACHE_SIZE', '1000'))
            self.result_cache = CPUCache(max_size=cache_size)
            logger.info("Inicialização CPU completa!")
            self._initialized = True
        except Exception as e:
            logger.error(f"Erro na inicialização CPU: {str(e)}")
            raise

    def _apply_nms(self, detections: list, iou_threshold: float = 0.5) -> list:
        """Apply non-maximum suppression over detections using CPU tensor ops.

        Args:
            detections: List of dicts with "box" ([x1, y1, x2, y2]),
                "confidence" and "label" keys.
            iou_threshold: Boxes overlapping a kept box above this IoU are dropped.

        Returns:
            Filtered list of detection dicts; empty list on error or empty input.
        """
        try:
            if not detections:
                return []
            boxes = torch.tensor([[d["box"][0], d["box"][1], d["box"][2], d["box"][3]] for d in detections])
            scores = torch.tensor([d["confidence"] for d in detections])
            labels = [d["label"] for d in detections]
            area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
            # Greedy NMS: repeatedly keep the highest-scoring box and discard
            # the remaining boxes that overlap it too much.
            _, order = scores.sort(descending=True)
            keep = []
            while order.numel() > 0:
                if order.numel() == 1:
                    keep.append(order.item())
                    break
                i = order[0]
                keep.append(i.item())
                xx1 = torch.max(boxes[i, 0], boxes[order[1:], 0])
                yy1 = torch.max(boxes[i, 1], boxes[order[1:], 1])
                xx2 = torch.min(boxes[i, 2], boxes[order[1:], 2])
                yy2 = torch.min(boxes[i, 3], boxes[order[1:], 3])
                w = torch.clamp(xx2 - xx1, min=0)
                h = torch.clamp(yy2 - yy1, min=0)
                inter = w * h
                ovr = inter / (area[i] + area[order[1:]] - inter)
                ids = (ovr <= iou_threshold).nonzero().squeeze()
                if ids.numel() == 0:
                    break
                # +1 compensates for the order[1:] offset used above.
                order = order[ids + 1]
            filtered_detections = []
            for idx in keep:
                filtered_detections.append({
                    "confidence": scores[idx].item(),
                    "box": boxes[idx].tolist(),
                    "label": labels[idx]
                })
            return filtered_detections
        except Exception as e:
            logger.error(f"Erro ao aplicar NMS: {str(e)}")
            return []

    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Letterbox the image to 640x640 RGB.

        Preserves aspect ratio via Lanczos resize, then pads with black to a
        square canvas. Returns the original image unchanged on error.
        """
        try:
            target_size = (640, 640)
            if image.mode != 'RGB':
                image = image.convert('RGB')
            if image.size != target_size:
                ratio = min(target_size[0] / image.size[0], target_size[1] / image.size[1])
                new_size = tuple(int(dim * ratio) for dim in image.size)
                image = image.resize(new_size, Image.LANCZOS)
                if new_size != target_size:
                    # Center the resized image on a black square canvas.
                    new_image = Image.new('RGB', target_size, (0, 0, 0))
                    paste_x = (target_size[0] - new_size[0]) // 2
                    paste_y = (target_size[1] - new_size[1]) // 2
                    new_image.paste(image, (paste_x, paste_y))
                    image = new_image
            return image
        except Exception as e:
            logger.error(f"Erro no pré-processamento: {str(e)}")
            return image

    def detect_objects(self, image: Image.Image, threshold: float = 0.3) -> list:
        """Detect objects in a single image on the CPU.

        Args:
            image: Input PIL image (any mode/size; preprocessed internally).
            threshold: Minimum confidence for a detection.

        Returns:
            NMS-filtered list of detection dicts ("confidence", "box", "label");
            empty list on error.
        """
        try:
            image = self._preprocess_image(image)
            with torch.no_grad():
                image_inputs = self.owlv2_processor(
                    images=image,
                    return_tensors="pt"
                ).to(self.device)
                # Reuse the text embeddings tokenized once in _initialize().
                inputs = {**image_inputs, **self.processed_text}
                outputs = self.owlv2_model(**inputs)
                # PIL size is (w, h); the post-processor expects (h, w).
                target_sizes = torch.tensor([image.size[::-1]])
                results = self.owlv2_processor.post_process_grounded_object_detection(
                    outputs=outputs,
                    target_sizes=target_sizes,
                    threshold=threshold
                )[0]
            detections = []
            for score, box, label in zip(results["scores"], results["boxes"], results["labels"]):
                x1, y1, x2, y2 = box.tolist()
                detections.append({
                    "confidence": score.item(),
                    "box": [int(x1), int(y1), int(x2), int(y2)],
                    "label": self.text_queries[label]
                })
            return self._apply_nms(detections)
        except Exception as e:
            logger.error(f"Erro em detect_objects: {str(e)}")
            return []

    def process_video(self, video_path: str, fps: int = None, threshold: float = 0.3, resolution: int = 640) -> tuple:
        """Process a video on the CPU, stopping at the first detection.

        Args:
            video_path: Path to the input video file.
            fps: Frame sampling rate; None uses the extractor default of 2.
            threshold: Minimum confidence for a detection.
            resolution: Square resolution frames are scaled/padded to.

        Returns:
            Tuple of (video_path, metrics dict). The metrics dict is empty on
            failure; otherwise it includes timing, frame counts and detections.
        """
        try:
            metrics = {
                "total_time": 0,
                "frame_extraction_time": 0,
                "analysis_time": 0,
                "frames_analyzed": 0,
                "video_duration": 0,
                "device_type": self.device.type,
                "detections": [],
                "technical": {
                    # Kept consistent with the checkpoint actually loaded in
                    # _initialize() (was wrongly reported as "-ensemble").
                    "model": "owlv2-base-patch16",
                    "input_size": f"{resolution}x{resolution}",
                    "nms_threshold": 0.5,
                    "preprocessing": "basic",
                    "early_stop": True
                },
            }
            start_time = time.time()
            t0 = time.time()
            frames = self.extract_frames(video_path, fps, resolution)
            metrics["frame_extraction_time"] = time.time() - t0
            metrics["frames_analyzed"] = len(frames)
            if not frames:
                logger.warning("Nenhum frame extraído do vídeo")
                return video_path, metrics
            # Approximate duration from the effective sampling rate.
            metrics["video_duration"] = len(frames) / (fps or 2)
            t0 = time.time()
            detections = []
            frames_processed = 0
            # Process one frame at a time to bound memory and allow early stop.
            for frame_idx, frame in enumerate(frames):
                frames_processed += 1
                # OpenCV delivers BGR; the model expects RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image = Image.fromarray(frame_rgb)
                image = self._preprocess_image(image)
                with torch.no_grad():
                    image_inputs = self.owlv2_processor(
                        images=image,
                        return_tensors="pt"
                    ).to(self.device)
                    inputs = {**image_inputs, **self.processed_text}
                    outputs = self.owlv2_model(**inputs)
                    target_sizes = torch.tensor([image.size[::-1]])
                    results = self.owlv2_processor.post_process_grounded_object_detection(
                        outputs=outputs,
                        target_sizes=target_sizes,
                        threshold=threshold  # threshold applied directly
                    )[0]
                if len(results["scores"]) > 0:
                    # Record only the highest-confidence detection, then stop.
                    max_score_idx = torch.argmax(results["scores"])
                    score = results["scores"][max_score_idx].item()
                    box = results["boxes"][max_score_idx].tolist()
                    label = results["labels"][max_score_idx].item()
                    detections.append({
                        "frame": frame_idx,
                        "confidence": score,
                        "box": [int(x) for x in box],
                        "label": self.text_queries[label]
                    })
                    metrics["frames_processed_until_detection"] = frames_processed
                    metrics["analysis_time"] = time.time() - t0
                    metrics["total_time"] = time.time() - start_time
                    metrics["detections"] = detections
                    logger.info(f"Detecção encontrada após processar {frames_processed} frames")
                    return video_path, metrics
                # Periodically release memory.
                if frames_processed % 10 == 0:
                    gc.collect()
            # Reached only when no frame produced a detection.
            metrics["analysis_time"] = time.time() - t0
            metrics["total_time"] = time.time() - start_time
            metrics["frames_processed_until_detection"] = frames_processed
            metrics["detections"] = detections
            return video_path, metrics
        except Exception as e:
            logger.error(f"Erro ao processar vídeo: {str(e)}")
            return video_path, {}

    def extract_frames(self, video_path: str, fps: int = 2, resolution: int = 480) -> list:
        """Extract frames from a video using ffmpeg.

        Args:
            video_path: Path to the video file.
            fps: Sampling rate; falsy values (process_video forwards None)
                fall back to 2.
            resolution: Square output size (aspect-preserving scale + pad).

        Returns:
            List of BGR frames as loaded by cv2.imread.
        """
        # Bug fix: process_video forwards fps=None, which previously produced
        # an invalid 'fps=None' ffmpeg filter string; normalize it here.
        if not fps:
            fps = 2
        frames = []
        temp_dir = Path(tempfile.mkdtemp())
        try:
            threads = min(os.cpu_count(), 4)  # conservative thread count for CPU
            cmd = [
                'ffmpeg', '-i', video_path,
                '-threads', str(threads),
                '-vf', (f'fps={fps},'
                        f'scale={resolution}:{resolution}:force_original_aspect_ratio=decrease:flags=lanczos,'
                        f'pad={resolution}:{resolution}:(ow-iw)/2:(oh-ih)/2'),
                '-frame_pts', '1',
                f'{temp_dir}/%d.jpg'
            ]
            subprocess.run(cmd, check=True, capture_output=True)
            # ffmpeg names frames 1.jpg, 2.jpg, ...; sort numerically by stem.
            frame_files = sorted(temp_dir.glob('*.jpg'), key=lambda x: int(x.stem))
            chunk_size = 50  # smaller chunks to limit peak memory on CPU
            with ThreadPoolExecutor(max_workers=threads) as executor:
                for i in range(0, len(frame_files), chunk_size):
                    chunk = frame_files[i:i + chunk_size]
                    chunk_frames = list(tqdm(
                        executor.map(lambda f: cv2.imread(str(f)), chunk),
                        desc=f"Carregando frames {i+1}-{min(i+chunk_size, len(frame_files))}",
                        total=len(chunk)
                    ))
                    frames.extend(chunk_frames)
                    if i % (chunk_size * 5) == 0:
                        gc.collect()
        finally:
            # Always remove the temporary frame directory.
            shutil.rmtree(temp_dir)
        return frames

    def clear_cache(self):
        """Clear the result cache and release memory.

        Safe to call before _initialize(): guards against a missing cache and
        logs (rather than propagates) any failure.
        """
        try:
            if hasattr(self, 'result_cache'):
                self.result_cache.clear()
            gc.collect()
            logger.info("Cache CPU limpo com sucesso")
        except Exception as e:
            logger.error(f"Erro ao limpar cache CPU: {str(e)}")