Update services/model_service.py
services/model_service.py +10 -7
@@ -1,8 +1,7 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig
 from sentence_transformers import SentenceTransformer
 import torch
-from functools import lru_cache
 from config.config import settings
 import logging
 
 logger = logging.getLogger(__name__)
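For reference, the `settings` object imported from `config.config` is expected to expose at least `MODEL_NAME`, `EMBEDDER_MODEL`, and `DEVICE`. A minimal sketch of such a module, with illustrative placeholders rather than the project's real defaults:

# config/config.py -- hypothetical sketch; only the three attributes
# referenced by model_service.py are included, and the default values
# below are placeholders, not the project's actual configuration.
from dataclasses import dataclass

@dataclass(frozen=True)
class Settings:
    MODEL_NAME: str = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # placeholder
    EMBEDDER_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"  # placeholder
    DEVICE: str = "cuda"  # or "cpu"

settings = Settings()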
@@ -25,10 +24,10 @@ class ModelService:
         try:
             # Load tokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(settings.MODEL_NAME)
-
+
             # Load model configuration
             config = LlamaConfig.from_pretrained(settings.MODEL_NAME)
-
+
             # Check and update rope_scaling if necessary
             if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
                 logger.info("Updating rope_scaling in configuration...")
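Context for the rope_scaling override handled below: recent Llama checkpoints ship an extended rope_scaling dict (e.g. {"rope_type": "llama3", "factor": 8.0, ...}), which older transformers releases reject because they validate for exactly the two keys, type and factor. A standalone sketch of the same normalization, assuming rope_scaling is a plain dict as loaded from config.json:

def normalize_rope_scaling(rope_scaling):
    # Collapse an extended rope_scaling dict to the two keys that older
    # transformers releases validate for, keeping the original factor.
    if rope_scaling is None:
        return None
    return {"type": "linear", "factor": rope_scaling.get("factor", 1.0)}

print(normalize_rope_scaling({"rope_type": "llama3", "factor": 8.0}))
# -> {'type': 'linear', 'factor': 8.0}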
@@ -36,7 +35,12 @@
                     "type": "linear",  # Ensure this matches the expected type
                     "factor": config.rope_scaling.get('factor', 1.0)  # Use existing factor or default to 1.0
                 }
-
+
+            # Check quantization type and adjust accordingly
+            if (getattr(config, 'quantization_config', None) or {}).get('type', '') == 'compressed-tensors':
+                logger.warning("Quantization type 'compressed-tensors' is not supported. Switching to 'bitsandbytes_8bit'.")
+                config.quantization_config['type'] = 'bitsandbytes_8bit'
+
             # Load model with the updated configuration
             self.model = AutoModelForCausalLM.from_pretrained(
                 settings.MODEL_NAME,
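On the quantization fallback added above: rewriting the type field inside the checkpoint's quantization_config dict is one approach; a more explicit alternative is to request 8-bit weights via BitsAndBytesConfig at load time. A sketch under the assumptions that bitsandbytes is installed and the target checkpoint is unquantized (the model name is a placeholder for settings.MODEL_NAME):

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Hypothetical alternative: ask transformers for 8-bit weights directly
# instead of mutating the loaded config's quantization_config dict.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder for settings.MODEL_NAME
    quantization_config=bnb_config,
    device_map="auto",
)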
@@ -44,14 +48,13 @@
                 device_map="auto" if settings.DEVICE == "cuda" else None,
                 config=config
             )
-
+
             # Load sentence embedder
             self.embedder = SentenceTransformer(settings.EMBEDDER_MODEL)
-
+
         except Exception as e:
             logger.error(f"Error loading models: {e}")
             raise
 
-
     def get_models(self):
         return self.tokenizer, self.model, self.embedder
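Typical downstream usage of the service after this change; ModelService's constructor is not shown in the diff, so the zero-argument construction here is an assumption:

from services.model_service import ModelService

service = ModelService()  # assumed zero-argument constructor
tokenizer, model, embedder = service.get_models()

# Quick smoke test of the generation path.
inputs = tokenizer("Hello, world!", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))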