Commit a83ece3 by Marcus Vinicius Zerbini Canhaço
Parent(s): 479ff3a
feat: update the detector with optimizations for the T4 GPU
Files changed: src/domain/detectors/gpu.py (+60 -107)
src/domain/detectors/gpu.py
CHANGED
@@ -149,6 +149,10 @@ class WeaponDetectorGPU(BaseDetector):
        try:
            start_time = time.time()

+            # Clear the GPU cache before starting
+            torch.cuda.empty_cache()
+            gc.collect()
+
            # Extract frames
            t0 = time.time()
            frames = self.extract_frames(video_path, fps or 2, resolution)
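The commit starts each run with torch.cuda.empty_cache() plus gc.collect() so the T4's memory is as free as possible before frames are loaded. A minimal sketch of the same idea wrapped in a reusable helper; the helper name and the usage logging are illustrative, not part of the commit:

import gc
import logging
import torch

logger = logging.getLogger(__name__)

def reset_gpu_memory(tag: str = "detector") -> None:
    """Drop unreferenced Python objects and return cached CUDA blocks to the driver."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        allocated_mb = torch.cuda.memory_allocated() / 1024 ** 2
        reserved_mb = torch.cuda.memory_reserved() / 1024 ** 2
        logger.info("[%s] allocated=%.0f MiB reserved=%.0f MiB", tag, allocated_mb, reserved_mb)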
@@ -164,115 +168,51 @@ class WeaponDetectorGPU(BaseDetector):

            # Process frames in batches
            t0 = time.time()
-            batch_size =
+            batch_size = 1  # Process one frame at a time to guarantee compatibility
            detections_by_frame = []

-            for i in range(0, len(frames), batch_size):
+            # Pre-allocate tensors to avoid frequent allocations
+            with torch.cuda.device(self.device):
+                torch.cuda.empty_cache()  # Clear memory before starting
+
+                for i in range(0, len(frames)):
                    try:
-                        batch_frames = frames[i:i + batch_size]
-                        batch_pil_frames = []
-
-                        # Prepare the batch
-                        for frame in batch_frames:
+                        # Prepare the frame with memory optimization
+                        frame = frames[i]
+                        if isinstance(frame, np.ndarray):
                            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                            frame_pil = Image.fromarray(frame_rgb)
-                            frame_pil = self._preprocess_image(frame_pil)
-                            batch_pil_frames.append(frame_pil)
+                        else:
+                            frame_pil = frame
+                        frame_pil = self._preprocess_image(frame_pil)

-                        # Process the batch
-                        batch_inputs = self.owlv2_processor(
-                            images=batch_pil_frames,
+                        # Process the frame
+                        inputs = self.owlv2_processor(
+                            images=frame_pil,
                            return_tensors="pt"
-                            padding=True
                        )
-
-                        # Validate shapes before inference
-                        if not self._validate_batch_shapes(batch_inputs):
-                            logger.warning(f"Shape inválido detectado no batch {i}, processando frames individualmente...")
-                            # Process the frames individually
-                            for frame_idx, frame_pil in enumerate(batch_pil_frames):
-                                try:
-                                    single_input = self.owlv2_processor(
-                                        images=frame_pil,
-                                        return_tensors="pt"
-                                    )
-                                    single_input = {
-                                        key: val.to(self.device)
-                                        for key, val in single_input.items()
-                                    }
-
-                                    with torch.no_grad():
-                                        inputs = {**single_input, **self.processed_text}
-                                        outputs = self.owlv2_model(**inputs)
-
-                                        target_sizes = torch.tensor([frame_pil.size[::-1]], device=self.device)
-                                        results = self.owlv2_processor.post_process_grounded_object_detection(
-                                            outputs=outputs,
-                                            target_sizes=target_sizes,
-                                            threshold=threshold
-                                        )
-
-                                        if len(results[0]["scores"]) > 0:
-                                            scores = results[0]["scores"]
-                                            boxes = results[0]["boxes"]
-                                            labels = results[0]["labels"]
-
-                                            frame_detections = []
-                                            for score, box, label in zip(scores, boxes, labels):
-                                                score_val = score.item()
-                                                if score_val >= threshold:
-                                                    label_idx = min(label.item(), len(self.text_queries) - 1)
-                                                    label_text = self.text_queries[label_idx]
-                                                    frame_detections.append({
-                                                        "confidence": round(score_val * 100, 2),
-                                                        "box": [int(x) for x in box.tolist()],
-                                                        "label": label_text,
-                                                        "frame": i + frame_idx,
-                                                        "timestamp": (i + frame_idx) / (fps or 2)
-                                                    })
-
-                                            if frame_detections:
-                                                frame_detections = self._apply_nms(frame_detections)
-                                                detections_by_frame.extend(frame_detections)
-
-                                except Exception as e:
-                                    logger.error(f"Erro ao processar frame individual {i + frame_idx}: {str(e)}")
-                                    continue
-
-                                finally:
-                                    if 'single_input' in locals():
-                                        del single_input
-                                    if 'outputs' in locals():
-                                        del outputs
-                                    torch.cuda.empty_cache()
-                            continue
-
-                        # Process the batch normally
-                        batch_inputs = {
+                        inputs = {
                            key: val.to(self.device)
-                            for key, val in batch_inputs.items()
+                            for key, val in inputs.items()
                        }

+                        # Inference
                        with torch.no_grad():
-                            inputs = {**batch_inputs, **self.processed_text}
-                            outputs = self.owlv2_model(**inputs)
+                            model_inputs = {**inputs, **self.processed_text}
+                            outputs = self.owlv2_model(**model_inputs)

-                            target_sizes = torch.tensor(
-                                [frame.size[::-1] for frame in batch_pil_frames],
-                                device=self.device
-                            )
+                            target_sizes = torch.tensor([frame_pil.size[::-1]], device=self.device)
                            results = self.owlv2_processor.post_process_grounded_object_detection(
                                outputs=outputs,
                                target_sizes=target_sizes,
                                threshold=threshold
                            )
-
-                            for frame_idx, frame_results in enumerate(results):
-                                ...
-                                scores = frame_results["scores"]
-                                boxes = frame_results["boxes"]
-                                labels = frame_results["labels"]
+
+                            # Process the results
+                            if len(results[0]["scores"]) > 0:
+                                scores = results[0]["scores"]
+                                boxes = results[0]["boxes"]
+                                labels = results[0]["labels"]

                                frame_detections = []
                                for score, box, label in zip(scores, boxes, labels):
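Each frame's detections are finally filtered through self._apply_nms, whose implementation is not shown in this diff. A minimal sketch of what such a step could look like with torchvision.ops.nms, assuming the detection dicts built above ("box" as [x1, y1, x2, y2], "confidence" as a percentage); the 0.5 IoU threshold is an assumption:

import torch
from torchvision.ops import nms

def apply_nms(detections: list[dict], iou_threshold: float = 0.5) -> list[dict]:
    """Keep the highest-confidence box among heavily overlapping detections."""
    if not detections:
        return detections
    boxes = torch.tensor([d["box"] for d in detections], dtype=torch.float32)
    scores = torch.tensor([d["confidence"] for d in detections], dtype=torch.float32)
    keep = nms(boxes, scores, iou_threshold)  # indices of the boxes to keep, sorted by score
    return [detections[idx] for idx in keep.tolist()]

Note that torchvision.ops.nms is class-agnostic; a per-label variant would group the detections by "label" first.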
@@ -284,27 +224,29 @@ class WeaponDetectorGPU(BaseDetector):
                                            "confidence": round(score_val * 100, 2),
                                            "box": [int(x) for x in box.tolist()],
                                            "label": label_text,
-                                            "frame": i + frame_idx,
-                                            "timestamp": (i + frame_idx) / (fps or 2)
+                                            "frame": i,
+                                            "timestamp": i / (fps or 2)
                                        })

                                if frame_detections:
                                    frame_detections = self._apply_nms(frame_detections)
                                    detections_by_frame.extend(frame_detections)

-                    except
-                        logger.error(f"Erro
-                        if "out of memory" in str(e):
-                            torch.cuda.empty_cache()
-                            gc.collect()
+                    except Exception as e:
+                        logger.error(f"Erro ao processar frame {i}: {str(e)}")
                        continue

                    finally:
-                        # Free memory
-                        ...
+                        # Free memory
+                        if 'inputs' in locals():
+                            del inputs
                        if 'outputs' in locals():
                            del outputs
                        torch.cuda.empty_cache()
+
+                        # Progress log
+                        if i % 10 == 0:
+                            logger.info(f"Processados {i}/{len(frames)} frames")

            # Update final metrics
            metrics["analysis_time"] = time.time() - t0
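The removed handler looked for "out of memory" in the exception text before clearing the cache; the new code simply logs, continues, and relies on the finally block to release tensors. If targeted OOM recovery is still wanted, newer PyTorch releases expose a dedicated exception type; a sketch of that pattern (the single-retry policy is an assumption, not behaviour of this commit):

import gc
import torch

def run_with_oom_recovery(step, *args, retries: int = 1):
    """Call `step`, freeing CUDA memory and retrying when the GPU runs out of memory."""
    for attempt in range(retries + 1):
        try:
            return step(*args)
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            gc.collect()
            if attempt == retries:
                raise  # give up after the last retry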
@@ -342,26 +284,37 @@ class WeaponDetectorGPU(BaseDetector):
    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """Pre-process the image into the format expected by the model."""
        try:
-            # Convert to RGB if needed
+            # Size cache to avoid unnecessary resizing
+            if hasattr(self, '_last_size') and self._last_size == image.size:
+                return image
+
+            # Convert to RGB if needed, using direct conversion
            if image.mode != 'RGB':
                image = image.convert('RGB')

-            # Resize keeping the aspect ratio
+            # Resize keeping the aspect ratio, with memory optimization
            target_size = (self.default_resolution, self.default_resolution)
            if image.size != target_size:
+                # Compute the new size only once
                ratio = min(target_size[0] / image.size[0], target_size[1] / image.size[1])
                new_size = tuple(int(dim * ratio) for dim in image.size)
-
-                ...
+
+                # Resize directly to the final size when possible
+                if new_size == target_size:
+                    image = image.resize(target_size, Image.Resampling.BILINEAR)
+                else:
+                    # Create the padded image in a single operation
                    new_image = Image.new('RGB', target_size, (0, 0, 0))
+                    image = image.resize(new_size, Image.Resampling.BILINEAR)
                    paste_x = (target_size[0] - new_size[0]) // 2
                    paste_y = (target_size[1] - new_size[1]) // 2
                    new_image.paste(image, (paste_x, paste_y))
                    image = new_image

+            # Store the size for the cache
+            self._last_size = image.size
            return image
+
        except Exception as e:
            logger.error(f"Erro no pré-processamento: {str(e)}")
            return image
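The rewritten _preprocess_image is essentially a letterbox resize: scale by the limiting ratio, then paste onto a black square canvas. The same geometry as a standalone function, handy for testing it in isolation; the function name and the 640-pixel default side are illustrative, not taken from the repository:

from PIL import Image

def letterbox(image: Image.Image, side: int = 640) -> Image.Image:
    """Resize keeping the aspect ratio, then pad with black to a side x side square."""
    if image.mode != 'RGB':
        image = image.convert('RGB')
    ratio = min(side / image.width, side / image.height)
    new_size = (int(image.width * ratio), int(image.height * ratio))
    resized = image.resize(new_size, Image.Resampling.BILINEAR)
    canvas = Image.new('RGB', (side, side), (0, 0, 0))
    canvas.paste(resized, ((side - new_size[0]) // 2, (side - new_size[1]) // 2))
    return canvas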