AK1239 committed
Commit b2fe716 · 1 Parent(s): ee55a04

Memory Optimizations

Files changed (1)
  1. app/main.py +15 -38
app/main.py CHANGED
@@ -1,5 +1,5 @@
 import os
-from transformers import pipeline
+from transformers import pipeline, BitsAndBytesConfig
 import torch
 import nltk
 from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
@@ -425,19 +425,8 @@ async def startup_event():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")
 
-    if device == "cuda":
-        try:
-            # Try to estimate available GPU memory
-            torch.cuda.empty_cache()
-            total_memory = torch.cuda.get_device_properties(0).total_memory
-            free_memory = torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0)
-            logger.info(f"GPU Memory - Total: {total_memory/1e9:.2f}GB, Free: {free_memory/1e9:.2f}GB")
-        except Exception as e:
-            logger.warning(f"Error checking GPU memory: {e}. Falling back to CPU")
-            device = "cpu"
-
     if device == "cpu":
-        logger.warning("Using CPU. Model will run slower.")
+        logger.warning("GPU not detected. Model will run slower on CPU.")
 
     # Set NLTK data path
     nltk_data_dir = os.environ.get('NLTK_DATA', os.path.join(os.path.expanduser('~'), 'nltk_data'))
@@ -452,36 +441,24 @@ async def startup_event():
         logger.error(f"Error downloading NLTK data: {str(e)}")
         raise Exception(f"Failed to initialize application: {str(e)}")
 
-    # Initialize the model and index with memory optimizations
+    # Initialize the model and index with quantization
    try:
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-        import bitsandbytes as bnb
-
-        logger.info("Loading model with 8-bit quantization...")
-        model_kwargs = {
-            "device_map": "auto",
-            "load_in_8bit": True,  # Enable 8-bit quantization
-            "torch_dtype": torch.float16,
-            "low_cpu_mem_usage": True,
-        }
-
-        # Initialize tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HUGGINGFACE_TOKEN)
-
-        # Load model with 8-bit quantization
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            token=HUGGINGFACE_TOKEN,
-            trust_remote_code=True,
-            **model_kwargs
+        # Configure 8-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+            llm_int8_threshold=6.0,  # Default threshold for good balance of performance/accuracy
+            llm_int8_skip_modules=None,  # No modules to skip
+            llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading if needed
         )
-
-        # Create pipeline with quantized model
+
         app.state.pipe = pipeline(
             "text-generation",
-            model=model,
-            tokenizer=tokenizer,
+            model=MODEL_ID,
             trust_remote_code=True,
+            token=HUGGINGFACE_TOKEN,
+            device_map="auto",
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            quantization_config=quantization_config  # Add quantization config
         )
 
         faiss_index, documents, embedding_model = await load_or_create_index()
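
For readers who want to try the new loading path outside the app, the sketch below reproduces it in isolation. It is a minimal sketch, not the app's exact code: the model id and HF_TOKEN environment variable are placeholders for MODEL_ID and HUGGINGFACE_TOKEN, and quantization_config is passed through model_kwargs, the documented route into from_pretrained when going through pipeline(). 8-bit loading also assumes a CUDA GPU with the bitsandbytes and accelerate packages installed.

    # Minimal sketch of the loading path this commit switches to.
    # Assumptions (not from the diff): "org/model-name" stands in for
    # MODEL_ID, and the token is read from the HF_TOKEN env variable.
    import os

    import torch
    from transformers import BitsAndBytesConfig, pipeline

    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,                      # 8-bit weights via bitsandbytes
        llm_int8_threshold=6.0,                 # default outlier threshold
        llm_int8_enable_fp32_cpu_offload=True,  # allow spill-over to CPU in fp32
    )

    pipe = pipeline(
        "text-generation",
        model="org/model-name",                 # placeholder for MODEL_ID
        token=os.environ.get("HF_TOKEN"),       # placeholder for HUGGINGFACE_TOKEN
        device_map="auto",                      # let accelerate place the shards
        torch_dtype=torch.float16,              # fp16 for non-quantized parts
        model_kwargs={"quantization_config": quantization_config},
    )

    print(pipe("Hello", max_new_tokens=20)[0]["generated_text"])

Note that the commit passes quantization_config directly to pipeline(); whether pipeline() forwards arbitrary extra kwargs to from_pretrained varies across transformers versions, so model_kwargs is the safer spelling in a standalone script.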
 
1
  import os
2
+ from transformers import pipeline, BitsAndBytesConfig
3
  import torch
4
  import nltk
5
  from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
 
425
  device = "cuda" if torch.cuda.is_available() else "cpu"
426
  logger.info(f"Using device: {device}")
427
 
 
 
 
 
 
 
 
 
 
 
 
428
  if device == "cpu":
429
+ logger.warning("GPU not detected. Model will run slower on CPU.")
430
 
431
  # Set NLTK data path
432
  nltk_data_dir = os.environ.get('NLTK_DATA', os.path.join(os.path.expanduser('~'), 'nltk_data'))
 
441
  logger.error(f"Error downloading NLTK data: {str(e)}")
442
  raise Exception(f"Failed to initialize application: {str(e)}")
443
 
444
+ # Initialize the model and index with quantization
445
  try:
446
+ # Configure 8-bit quantization
447
+ quantization_config = BitsAndBytesConfig(
448
+ load_in_8bit=True,
449
+ llm_int8_threshold=6.0, # Default threshold for good balance of performance/accuracy
450
+ llm_int8_skip_modules=None, # No modules to skip
451
+ llm_int8_enable_fp32_cpu_offload=True # Enable CPU offloading if needed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  )
453
+
 
454
  app.state.pipe = pipeline(
455
  "text-generation",
456
+ model=MODEL_ID,
 
457
  trust_remote_code=True,
458
+ token=HUGGINGFACE_TOKEN,
459
+ device_map="auto",
460
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
461
+ quantization_config=quantization_config # Add quantization config
462
  )
463
 
464
  faiss_index, documents, embedding_model = await load_or_create_index()
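To see what "Memory Optimizations" buys, a rough footprint check, continuing from the sketch above; both calls are standard APIs (PreTrainedModel.get_memory_footprint and torch.cuda.memory_allocated).

    # Rough footprint check, continuing from the sketch above.
    import torch

    # Size of the loaded weights (buffers included), in GB.
    print(f"weights: {pipe.model.get_memory_footprint() / 1e9:.2f} GB")

    # What PyTorch has actually allocated on the GPU, in GB.
    if torch.cuda.is_available():
        print(f"cuda allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

With load_in_8bit=True the linear-layer weights are stored in int8, so the footprint should land near half the fp16 figure; exact numbers depend on the model and on which modules bitsandbytes keeps in higher precision.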