Memory Optimizations

app/main.py (changed: +15 -38)
@@ -1,5 +1,5 @@
 import os
-from transformers import pipeline
+from transformers import pipeline, BitsAndBytesConfig
 import torch
 import nltk
 from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
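Note on the new import: BitsAndBytesConfig ships with transformers, but the int8 kernels it configures come from the separate bitsandbytes package, which must be installed for load_in_8bit=True to work. A minimal fail-fast check that could sit next to the imports (the error message is illustrative):

    try:
        import bitsandbytes  # noqa: F401  # provides the int8 kernels behind load_in_8bit
    except ImportError as e:
        raise RuntimeError("load_in_8bit=True requires the bitsandbytes package") from e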
@@ -425,19 +425,8 @@ async def startup_event():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")

-    if device == "cuda":
-        try:
-            # Try to estimate available GPU memory
-            torch.cuda.empty_cache()
-            total_memory = torch.cuda.get_device_properties(0).total_memory
-            free_memory = torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0)
-            logger.info(f"GPU Memory - Total: {total_memory/1e9:.2f}GB, Free: {free_memory/1e9:.2f}GB")
-        except Exception as e:
-            logger.warning(f"Error checking GPU memory: {e}. Falling back to CPU")
-            device = "cpu"
-
     if device == "cpu":
-        logger.warning("
+        logger.warning("GPU not detected. Model will run slower on CPU.")

     # Set NLTK data path
     nltk_data_dir = os.environ.get('NLTK_DATA', os.path.join(os.path.expanduser('~'), 'nltk_data'))
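Dropping the old GPU probe is sound beyond the line savings: memory_reserved(0) - memory_allocated(0) measures slack inside PyTorch's caching allocator rather than free device memory, and right after empty_cache() it is close to zero, so the logged "Free" figure was misleading. If a startup probe is ever wanted back, torch exposes the driver's own numbers; a minimal sketch (logger setup is illustrative):

    import logging

    import torch

    logger = logging.getLogger("startup")

    def log_gpu_memory() -> None:
        # mem_get_info() wraps cudaMemGetInfo and returns (free, total) in bytes
        # for the whole device, not just PyTorch's allocator bookkeeping.
        if torch.cuda.is_available():
            free, total = torch.cuda.mem_get_info()
            logger.info(f"GPU Memory - Total: {total/1e9:.2f}GB, Free: {free/1e9:.2f}GB")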
@@ -452,36 +441,24 @@ async def startup_event():
         logger.error(f"Error downloading NLTK data: {str(e)}")
         raise Exception(f"Failed to initialize application: {str(e)}")

-    # Initialize the model and index with
+    # Initialize the model and index with quantization
     try:
-
-
-
-
-
-        model_kwargs = {
-            "load_in_8bit": True,  # Enable 8-bit quantization
-            "torch_dtype": torch.float16,
-            "low_cpu_mem_usage": True,
-        }
-
-        # Initialize tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HUGGINGFACE_TOKEN)
-
-        # Load model with 8-bit quantization
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            token=HUGGINGFACE_TOKEN,
-            trust_remote_code=True,
-            **model_kwargs
+        # Configure 8-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+            llm_int8_threshold=6.0,  # Default threshold for good balance of performance/accuracy
+            llm_int8_skip_modules=None,  # No modules to skip
+            llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading if needed
         )
-
-        # Create pipeline with quantized model
+
         app.state.pipe = pipeline(
             "text-generation",
-            model=model,
-            tokenizer=tokenizer,
+            model=MODEL_ID,
             trust_remote_code=True,
+            token=HUGGINGFACE_TOKEN,
+            device_map="auto",
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            quantization_config=quantization_config  # Add quantization config
         )

         faiss_index, documents, embedding_model = await load_or_create_index()
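One caveat on the new pipeline() call: the factory recognizes device_map, torch_dtype, token, and trust_remote_code as named parameters, but depending on the installed transformers version a bare quantization_config keyword may be routed into the pipeline's generation kwargs instead of reaching from_pretrained. The documented way to hand loading options to the underlying model is model_kwargs; a sketch of that variant, with placeholders standing in for the app's MODEL_ID and HUGGINGFACE_TOKEN:

    import torch
    from transformers import BitsAndBytesConfig, pipeline

    MODEL_ID = "your-org/your-model"  # placeholder for the app's constant
    HUGGINGFACE_TOKEN = None          # placeholder for the app's constant

    pipe = pipeline(
        "text-generation",
        model=MODEL_ID,
        token=HUGGINGFACE_TOKEN,
        trust_remote_code=True,
        device_map="auto",            # needs the accelerate package
        torch_dtype=torch.float16,
        # model_kwargs is forwarded to AutoModel.from_pretrained, which is
        # where quantization_config is actually consumed.
        model_kwargs={"quantization_config": BitsAndBytesConfig(load_in_8bit=True)},
    )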
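Two smaller follow-ups. First, the quantization config is now attached unconditionally, but bitsandbytes' int8 path expects a CUDA device, so a CPU-only host may fail at load time rather than fall back the way the deleted code did; guarding on the detected device would preserve the old behavior. Second, once loading succeeds the savings are easy to confirm from the model's reported footprint. A sketch of both, with the guard labeled hypothetical:

    from typing import Optional

    from transformers import BitsAndBytesConfig, Pipeline

    def quantization_for(device: str) -> Optional[BitsAndBytesConfig]:
        # Hypothetical guard: request int8 weights only when CUDA is present,
        # so CPU-only hosts load unquantized instead of erroring at startup.
        return BitsAndBytesConfig(load_in_8bit=True) if device == "cuda" else None

    def footprint_gb(pipe: Pipeline) -> float:
        # get_memory_footprint() is defined on PreTrainedModel and returns the
        # parameter/buffer memory in bytes; int8 weights should land at roughly
        # half of the float16 figure.
        return pipe.model.get_memory_footprint() / 1e9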