Spaces:

marcuscanhaco
/

weapon-detection-app

Runtime error

App Files Files Community

Marcus Vinicius Zerbini Canhaço committed on Feb 12

Commit

3374810

1 Parent(s): 178f6aa

feat: atualização do detector com otimizações para GPU T4

Browse files

Files changed (2) hide show

src/domain/detectors/gpu.py +11 -8
src/main.py +20 -4

src/domain/detectors/gpu.py CHANGED Viewed

@@ -21,10 +21,13 @@ torch.backends.cudnn.benchmark = True
 torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
 torch._dynamo.config.suppress_errors = True
 class GPUCache(BaseCache):
     """Cache otimizado para GPU."""
-    def __init__(self, max_size: int = 1000):
         super().__init__(max_size)
         self.device = torch.device('cuda')
@@ -35,12 +38,12 @@ class WeaponDetectorGPU(BaseDetector):
     def __init__(self):
         """Inicializa variáveis básicas."""
         super().__init__()
-        self.default_resolution = 640
         self.amp_dtype = torch.float16
         self.preprocess_stream = torch.cuda.Stream()
-        self.max_batch_size = 16  # Aumentado para 16
-        self.current_batch_size = 8  # Aumentado para 8
-        self.min_batch_size = 2
     def _initialize(self):
         """Inicializa o modelo e o processador para execução exclusiva em GPU."""
@@ -64,18 +67,18 @@ class WeaponDetectorGPU(BaseDetector):
                 cache_dir=cache_dir
             )
-            # Configurações otimizadas para T4
             self.owlv2_model = Owlv2ForObjectDetection.from_pretrained(
                 model_name,
                 cache_dir=cache_dir,
                 torch_dtype=self.amp_dtype,
                 device_map="auto",
-                low_cpu_mem_usage=True
             ).to(self.device)
             # Otimizar modelo para inferência
             self.owlv2_model.eval()
-            torch.compile(self.owlv2_model)  # Usar torch.compile para otimização
             # Usar queries do método base
             self.text_queries = self._get_detection_queries()

 torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
 torch._dynamo.config.suppress_errors = True
+# Configurações para Zero-GPU
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
 class GPUCache(BaseCache):
     """Cache otimizado para GPU."""
+    def __init__(self, max_size: int = 100):  # Reduzido para economizar memória
         super().__init__(max_size)
         self.device = torch.device('cuda')
     def __init__(self):
         """Inicializa variáveis básicas."""
         super().__init__()
+        self.default_resolution = 512  # Reduzido para economizar memória
         self.amp_dtype = torch.float16
         self.preprocess_stream = torch.cuda.Stream()
+        self.max_batch_size = 4  # Reduzido para Zero-GPU
+        self.current_batch_size = 2  # Reduzido para Zero-GPU
+        self.min_batch_size = 1
     def _initialize(self):
         """Inicializa o modelo e o processador para execução exclusiva em GPU."""
                 cache_dir=cache_dir
             )
+            # Configurações otimizadas para Zero-GPU
             self.owlv2_model = Owlv2ForObjectDetection.from_pretrained(
                 model_name,
                 cache_dir=cache_dir,
                 torch_dtype=self.amp_dtype,
                 device_map="auto",
+                low_cpu_mem_usage=True,
+                max_memory={'cuda:0': '10GB'}  # Limitar uso de memória
             ).to(self.device)
             # Otimizar modelo para inferência
             self.owlv2_model.eval()
             # Usar queries do método base
             self.text_queries = self._get_detection_queries()

src/main.py CHANGED Viewed

@@ -3,6 +3,7 @@ from dotenv import load_dotenv
 from src.presentation.web.gradio_interface import GradioInterface
 import logging
 import torch
 # Configurar logging
 logging.basicConfig(
@@ -11,6 +12,19 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 def main():
     """Função principal que inicia a aplicação."""
     try:
@@ -21,6 +35,7 @@ def main():
         if IS_HUGGINGFACE:
             load_dotenv('.env.huggingface')
             logger.info("Ambiente HuggingFace detectado")
         else:
             load_dotenv('.env')
             logger.info("Ambiente local detectado")
@@ -33,22 +48,23 @@ def main():
             # Calcular número ideal de workers baseado na GPU
             if torch.cuda.is_available():
                 gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # em GB
-                max_concurrent = min(2, int(gpu_mem / 8))  # 8GB por worker
                 logger.info(f"GPU Memory: {gpu_mem:.1f}GB, Max Concurrent: {max_concurrent}")
             else:
                 max_concurrent = 1
             # Primeiro configurar a fila
             demo = demo.queue(
-                concurrency_limit=max_concurrent,  # Simplificando para um worker
                 api_open=False,
-                status_update_rate="auto"
             )
             # Depois fazer o launch
             demo.launch(
                 server_name="0.0.0.0",
                 server_port=7860,
-                share=False
             )
         else:
             # Ambiente local - apenas launch direto

 from src.presentation.web.gradio_interface import GradioInterface
 import logging
 import torch
+import gc
 # Configurar logging
 logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
+def setup_zero_gpu():
+    """Configurações otimizadas para Zero-GPU."""
+    # Limpar cache CUDA
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        gc.collect()
+    # Configurações para otimizar memória
+    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cudnn.allow_tf32 = True
 def main():
     """Função principal que inicia a aplicação."""
     try:
         if IS_HUGGINGFACE:
             load_dotenv('.env.huggingface')
             logger.info("Ambiente HuggingFace detectado")
+            setup_zero_gpu()
         else:
             load_dotenv('.env')
             logger.info("Ambiente local detectado")
             # Calcular número ideal de workers baseado na GPU
             if torch.cuda.is_available():
                 gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # em GB
+                max_concurrent = 1  # Forçar single worker para Zero-GPU
                 logger.info(f"GPU Memory: {gpu_mem:.1f}GB, Max Concurrent: {max_concurrent}")
             else:
                 max_concurrent = 1
             # Primeiro configurar a fila
             demo = demo.queue(
                 api_open=False,
+                status_update_rate="auto",
+                max_size=5  # Reduzir tamanho da fila para economizar memória
             )
             # Depois fazer o launch
             demo.launch(
                 server_name="0.0.0.0",
                 server_port=7860,
+                share=False,
+                max_threads=2  # Reduzir número de threads
             )
         else:
             # Ambiente local - apenas launch direto