Memory Optimizations

app/main.py (changed: +15 -38)
@@ -1,5 +1,5 @@
 import os
-from transformers import pipeline
+from transformers import pipeline, BitsAndBytesConfig
 import torch
 import nltk
 from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
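Note on the new import: BitsAndBytesConfig ships with transformers, but the int8 kernels it configures come from the separate bitsandbytes package, which must be installed for load_in_8bit=True to work. A minimal fail-fast check that could sit next to the imports (the error message is illustrative):

    try:
        import bitsandbytes  # noqa: F401  # provides the int8 kernels behind load_in_8bit
    except ImportError as e:
        raise RuntimeError("load_in_8bit=True requires the bitsandbytes package") from e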
@@ -425,19 +425,8 @@ async def startup_event():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")

-    if device == "cuda":
-        try:
-            # Try to estimate available GPU memory
-            torch.cuda.empty_cache()
-            total_memory = torch.cuda.get_device_properties(0).total_memory
-            free_memory = torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0)
-            logger.info(f"GPU Memory - Total: {total_memory/1e9:.2f}GB, Free: {free_memory/1e9:.2f}GB")
-        except Exception as e:
-            logger.warning(f"Error checking GPU memory: {e}. Falling back to CPU")
-            device = "cpu"
-
     if device == "cpu":
-        logger.warning("
+        logger.warning("GPU not detected. Model will run slower on CPU.")

     # Set NLTK data path
     nltk_data_dir = os.environ.get('NLTK_DATA', os.path.join(os.path.expanduser('~'), 'nltk_data'))
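Dropping the old GPU probe is sound beyond the line savings: memory_reserved(0) - memory_allocated(0) measures slack inside PyTorch's caching allocator rather than free device memory, and right after empty_cache() it is close to zero, so the logged "Free" figure was misleading. If a startup probe is ever wanted back, torch exposes the driver's own numbers; a minimal sketch (logger setup is illustrative):

    import logging

    import torch

    logger = logging.getLogger("startup")

    def log_gpu_memory() -> None:
        # mem_get_info() wraps cudaMemGetInfo and returns (free, total) in bytes
        # for the whole device, not just PyTorch's allocator bookkeeping.
        if torch.cuda.is_available():
            free, total = torch.cuda.mem_get_info()
            logger.info(f"GPU Memory - Total: {total/1e9:.2f}GB, Free: {free/1e9:.2f}GB")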
@@ -452,36 +441,24 @@ async def startup_event():
         logger.error(f"Error downloading NLTK data: {str(e)}")
         raise Exception(f"Failed to initialize application: {str(e)}")

-    # Initialize the model and index with
+    # Initialize the model and index with quantization
     try:
-
-
-
-
-
-        model_kwargs = {
-            "load_in_8bit": True,  # Enable 8-bit quantization
-            "torch_dtype": torch.float16,
-            "low_cpu_mem_usage": True,
-        }
-
-        # Initialize tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HUGGINGFACE_TOKEN)
-
-        # Load model with 8-bit quantization
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            token=HUGGINGFACE_TOKEN,
-            trust_remote_code=True,
-            **model_kwargs
+        # Configure 8-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+            llm_int8_threshold=6.0,  # Default threshold for good balance of performance/accuracy
+            llm_int8_skip_modules=None,  # No modules to skip
+            llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading if needed
         )
-
-        # Create pipeline with quantized model
+
         app.state.pipe = pipeline(
             "text-generation",
-            model=model,
-            tokenizer=tokenizer,
+            model=MODEL_ID,
             trust_remote_code=True,
+            token=HUGGINGFACE_TOKEN,
+            device_map="auto",
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            quantization_config=quantization_config  # Add quantization config
         )

         faiss_index, documents, embedding_model = await load_or_create_index()
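One caveat on the new pipeline() call: the factory recognizes device_map, torch_dtype, token, and trust_remote_code as named parameters, but depending on the installed transformers version a bare quantization_config keyword may be routed into the pipeline's generation kwargs instead of reaching from_pretrained. The documented way to hand loading options to the underlying model is model_kwargs; a sketch of that variant, with placeholders standing in for the app's MODEL_ID and HUGGINGFACE_TOKEN:

    import torch
    from transformers import BitsAndBytesConfig, pipeline

    MODEL_ID = "your-org/your-model"  # placeholder for the app's constant
    HUGGINGFACE_TOKEN = None          # placeholder for the app's constant

    pipe = pipeline(
        "text-generation",
        model=MODEL_ID,
        token=HUGGINGFACE_TOKEN,
        trust_remote_code=True,
        device_map="auto",            # needs the accelerate package
        torch_dtype=torch.float16,
        # model_kwargs is forwarded to AutoModel.from_pretrained, which is
        # where quantization_config is actually consumed.
        model_kwargs={"quantization_config": BitsAndBytesConfig(load_in_8bit=True)},
    )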
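Two smaller follow-ups. First, the quantization config is now attached unconditionally, but bitsandbytes' int8 path expects a CUDA device, so a CPU-only host may fail at load time rather than fall back the way the deleted code did; guarding on the detected device would preserve the old behavior. Second, once loading succeeds the savings are easy to confirm from the model's reported footprint. A sketch of both, with the guard labeled hypothetical:

    from typing import Optional

    from transformers import BitsAndBytesConfig, Pipeline

    def quantization_for(device: str) -> Optional[BitsAndBytesConfig]:
        # Hypothetical guard: request int8 weights only when CUDA is present,
        # so CPU-only hosts load unquantized instead of erroring at startup.
        return BitsAndBytesConfig(load_in_8bit=True) if device == "cuda" else None

    def footprint_gb(pipe: Pipeline) -> float:
        # get_memory_footprint() is defined on PreTrainedModel and returns the
        # parameter/buffer memory in bytes; int8 weights should land at roughly
        # half of the float16 figure.
        return pipe.model.get_memory_footprint() / 1e9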