Using bitsandbytes
app/main.py (+20 -15)
```diff
@@ -432,10 +432,6 @@ async def startup_event():
         total_memory = torch.cuda.get_device_properties(0).total_memory
         free_memory = torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0)
         logger.info(f"GPU Memory - Total: {total_memory/1e9:.2f}GB, Free: {free_memory/1e9:.2f}GB")
-
-        if free_memory < 1e9:  # If less than 2GB free
-            logger.warning("Low GPU memory detected, falling back to CPU")
-            device = "cpu"
     except Exception as e:
         logger.warning(f"Error checking GPU memory: {e}. Falling back to CPU")
         device = "cpu"
```
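The deleted guard computed "free" memory as `memory_reserved - memory_allocated`, which only measures slack inside PyTorch's caching allocator, not memory actually available on the device (its comment also says 2 GB while `1e9` is 1 GB). If a similar fallback is ever reinstated, `torch.cuda.mem_get_info()` asks the CUDA driver for device-wide free/total bytes instead. A minimal sketch, not part of this commit; the 1 GB threshold simply mirrors the deleted check:

```python
import torch

def pick_device(min_free_bytes: int = 1_000_000_000) -> str:
    """Fall back to CPU unless the GPU has enough truly free memory."""
    if not torch.cuda.is_available():
        return "cpu"
    # mem_get_info() queries the driver, so it also accounts for memory
    # held by other processes, unlike memory_reserved()/memory_allocated().
    free, total = torch.cuda.mem_get_info(0)
    print(f"GPU Memory - Total: {total/1e9:.2f}GB, Free: {free/1e9:.2f}GB")
    return "cuda" if free >= min_free_bytes else "cpu"
```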
```diff
@@ -458,25 +454,34 @@ async def startup_event():
 
     # Initialize the model and index with memory optimizations
     try:
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        import bitsandbytes as bnb
+
+        logger.info("Loading model with 8-bit quantization...")
         model_kwargs = {
-            "device_map": "auto"
-            "
+            "device_map": "auto",
+            "load_in_8bit": True,  # Enable 8-bit quantization
+            "torch_dtype": torch.float16,
             "low_cpu_mem_usage": True,
         }
 
-
-
-        model_kwargs.update({
-            "offload_folder": "offload",
-            "offload_state_dict": True
-        })
+        # Initialize tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HUGGINGFACE_TOKEN)
 
+        # Load model with 8-bit quantization
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            token=HUGGINGFACE_TOKEN,
+            trust_remote_code=True,
+            **model_kwargs
+        )
+
+        # Create pipeline with quantized model
        app.state.pipe = pipeline(
             "text-generation",
-            model=
+            model=model,
+            tokenizer=tokenizer,
             trust_remote_code=True,
-            token=HUGGINGFACE_TOKEN,
-            **model_kwargs
         )
 
         faiss_index, documents, embedding_model = await load_or_create_index()
```
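A note on the `load_in_8bit=True` kwarg: it requires the `bitsandbytes` and `accelerate` packages to be installed (`pip install bitsandbytes accelerate`), and the explicit `import bitsandbytes as bnb` in the diff is not needed for it to take effect, since transformers discovers the library on its own. Recent transformers releases also deprecate passing `load_in_8bit` directly to `from_pretrained` in favor of a `BitsAndBytesConfig`. An equivalent load under that newer API, sketched with placeholder values for `MODEL_ID` and `HUGGINGFACE_TOKEN` (both defined elsewhere in app/main.py):

```python
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

MODEL_ID = "some-org/some-model"  # placeholder; real value lives in app/main.py
HUGGINGFACE_TOKEN = "hf_..."      # placeholder; real value lives in app/main.py

# BitsAndBytesConfig is the non-deprecated way to request 8-bit weights.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HUGGINGFACE_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HUGGINGFACE_TOKEN,
    trust_remote_code=True,
    device_map="auto",                 # let accelerate place layers
    quantization_config=quant_config,  # replaces load_in_8bit=True
    torch_dtype=torch.float16,         # dtype for the non-quantized parts
    low_cpu_mem_usage=True,
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
```

Once loaded, the pipeline is called exactly as before, e.g. `pipe("Hello", max_new_tokens=32)`; 8-bit weights roughly halve the model's memory footprint relative to float16, which is what lets this startup path avoid the removed disk-offload settings.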