Reverted to old version
Browse files
- app/main.py +4 -16
app/main.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import os
|
2 |
-
from transformers import pipeline
|
3 |
import torch
|
4 |
import nltk
|
5 |
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
@@ -418,10 +418,7 @@ async def startup_event():
|
|
418 |
logger = logging.getLogger(__name__)
|
419 |
logger.info("Starting application initialization...")
|
420 |
|
421 |
-
#
|
422 |
-
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
|
423 |
-
|
424 |
-
# Check if CUDA is available and has enough memory
|
425 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
426 |
logger.info(f"Using device: {device}")
|
427 |
|
@@ -441,24 +438,15 @@ async def startup_event():
|
|
441 |
logger.error(f"Error downloading NLTK data: {str(e)}")
|
442 |
raise Exception(f"Failed to initialize application: {str(e)}")
|
443 |
|
444 |
-
# Initialize the model and index
|
445 |
try:
|
446 |
-
# Configure 8-bit quantization
|
447 |
-
quantization_config = BitsAndBytesConfig(
|
448 |
-
load_in_8bit=True,
|
449 |
-
llm_int8_threshold=6.0, # Default threshold for good balance of performance/accuracy
|
450 |
-
llm_int8_skip_modules=None, # No modules to skip
|
451 |
-
llm_int8_enable_fp32_cpu_offload=True # Enable CPU offloading if needed
|
452 |
-
)
|
453 |
-
|
454 |
app.state.pipe = pipeline(
|
455 |
"text-generation",
|
456 |
model=MODEL_ID,
|
457 |
trust_remote_code=True,
|
458 |
token=HUGGINGFACE_TOKEN,
|
459 |
device_map="auto",
|
460 |
-
torch_dtype=torch.float16 if device == "cuda" else torch.float32
|
461 |
-
quantization_config=quantization_config # Add quantization config
|
462 |
)
|
463 |
|
464 |
faiss_index, documents, embedding_model = await load_or_create_index()
|
|
|
1 |
import os
|
2 |
+
from transformers import pipeline
|
3 |
import torch
|
4 |
import nltk
|
5 |
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
|
|
418 |
logger = logging.getLogger(__name__)
|
419 |
logger.info("Starting application initialization...")
|
420 |
|
421 |
+
# Check if CUDA is available
|
|
|
|
|
|
|
422 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
423 |
logger.info(f"Using device: {device}")
|
424 |
|
|
|
438 |
logger.error(f"Error downloading NLTK data: {str(e)}")
|
439 |
raise Exception(f"Failed to initialize application: {str(e)}")
|
440 |
|
441 |
+
# Initialize the model and index
|
442 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
app.state.pipe = pipeline(
|
444 |
"text-generation",
|
445 |
model=MODEL_ID,
|
446 |
trust_remote_code=True,
|
447 |
token=HUGGINGFACE_TOKEN,
|
448 |
device_map="auto",
|
449 |
+
torch_dtype=torch.float16 if device == "cuda" else torch.float32
|
|
|
450 |
)
|
451 |
|
452 |
faiss_index, documents, embedding_model = await load_or_create_index()
|