Memory Optimizations
Browse files- app/main.py +35 -5
app/main.py
CHANGED
@@ -418,12 +418,30 @@ async def startup_event():
|
|
418 |
logger = logging.getLogger(__name__)
|
419 |
logger.info("Starting application initialization...")
|
420 |
|
421 |
-
#
|
|
|
|
|
|
|
422 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
423 |
logger.info(f"Using device: {device}")
|
424 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
425 |
if device == "cpu":
|
426 |
-
logger.warning("
|
427 |
|
428 |
# Set NLTK data path
|
429 |
nltk_data_dir = os.environ.get('NLTK_DATA', os.path.join(os.path.expanduser('~'), 'nltk_data'))
|
@@ -438,15 +456,27 @@ async def startup_event():
|
|
438 |
logger.error(f"Error downloading NLTK data: {str(e)}")
|
439 |
raise Exception(f"Failed to initialize application: {str(e)}")
|
440 |
|
441 |
-
# Initialize the model and index
|
442 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
app.state.pipe = pipeline(
|
444 |
"text-generation",
|
445 |
model=MODEL_ID,
|
446 |
trust_remote_code=True,
|
447 |
token=HUGGINGFACE_TOKEN,
|
448 |
-
|
449 |
-
torch_dtype=torch.float16 if device == "cuda" else torch.float32
|
450 |
)
|
451 |
|
452 |
faiss_index, documents, embedding_model = await load_or_create_index()
|
|
|
418 |
logger = logging.getLogger(__name__)
|
419 |
logger.info("Starting application initialization...")
|
420 |
|
421 |
+
# Set PyTorch memory management settings
|
422 |
+
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
423 |
+
|
424 |
+
# Check if CUDA is available and has enough memory
|
425 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
426 |
logger.info(f"Using device: {device}")
|
427 |
|
428 |
+
if device == "cuda":
|
429 |
+
try:
|
430 |
+
# Try to estimate available GPU memory
|
431 |
+
torch.cuda.empty_cache()
|
432 |
+
total_memory = torch.cuda.get_device_properties(0).total_memory
|
433 |
+
free_memory = torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0)  # NOTE(review): reserved-allocated is unused *reserved* memory, not free VRAM — right after startup both are ~0, so this check likely always triggers the CPU fallback; consider torch.cuda.mem_get_info() instead — verify
|
434 |
+
logger.info(f"GPU Memory - Total: {total_memory/1e9:.2f}GB, Free: {free_memory/1e9:.2f}GB")
|
435 |
+
|
436 |
+
if free_memory < 1e9: # If less than 1GB free
|
437 |
+
logger.warning("Low GPU memory detected, falling back to CPU")
|
438 |
+
device = "cpu"
|
439 |
+
except Exception as e:
|
440 |
+
logger.warning(f"Error checking GPU memory: {e}. Falling back to CPU")
|
441 |
+
device = "cpu"
|
442 |
+
|
443 |
if device == "cpu":
|
444 |
+
logger.warning("Using CPU. Model will run slower.")
|
445 |
|
446 |
# Set NLTK data path
|
447 |
nltk_data_dir = os.environ.get('NLTK_DATA', os.path.join(os.path.expanduser('~'), 'nltk_data'))
|
|
|
456 |
logger.error(f"Error downloading NLTK data: {str(e)}")
|
457 |
raise Exception(f"Failed to initialize application: {str(e)}")
|
458 |
|
459 |
+
# Initialize the model and index with memory optimizations
|
460 |
try:
|
461 |
+
model_kwargs = {
|
462 |
+
"device_map": "auto" if device == "cuda" else "cpu",
|
463 |
+
"torch_dtype": torch.float16 if device == "cuda" else torch.float32,
|
464 |
+
"low_cpu_mem_usage": True,
|
465 |
+
}
|
466 |
+
|
467 |
+
if device == "cpu":
|
468 |
+
# Additional CPU optimizations
|
469 |
+
model_kwargs.update({
|
470 |
+
"offload_folder": "offload",
|
471 |
+
"offload_state_dict": True
|
472 |
+
})
|
473 |
+
|
474 |
app.state.pipe = pipeline(
|
475 |
"text-generation",
|
476 |
model=MODEL_ID,
|
477 |
trust_remote_code=True,
|
478 |
token=HUGGINGFACE_TOKEN,
|
479 |
+
**model_kwargs  # NOTE(review): keys like low_cpu_mem_usage/offload_folder are from_pretrained kwargs; transformers.pipeline usually expects them inside its model_kwargs= dict parameter, not spread at top level — confirm against the installed transformers version
|
|
|
480 |
)
|
481 |
|
482 |
faiss_index, documents, embedding_model = await load_or_create_index()
|