Chris4K committed on
Commit 32a90bc · verified · 1 Parent(s): a524d94

Update services/model_service.py

Files changed (1):
  services/model_service.py  +10 -8
services/model_service.py CHANGED
@@ -1,8 +1,6 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig
 from sentence_transformers import SentenceTransformer
 import torch
-from functools import lru_cache
-from config.config import settings
 import logging
 
 logger = logging.getLogger(__name__)
@@ -25,10 +23,10 @@ class ModelService:
         try:
             # Load tokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(settings.MODEL_NAME)
-
+
             # Load model configuration
             config = LlamaConfig.from_pretrained(settings.MODEL_NAME)
-
+
             # Check and update rope_scaling if necessary
             if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
                 logger.info("Updating rope_scaling in configuration...")
@@ -36,7 +34,12 @@ class ModelService:
                     "type": "linear",  # Ensure this matches the expected type
                     "factor": config.rope_scaling.get('factor', 1.0)  # Use existing factor or default to 1.0
                 }
-
+
+            # Check quantization type and adjust accordingly
+            if config.get('quantization_config', {}).get('type', '') == 'compressed-tensors':
+                logger.warning("Quantization type 'compressed-tensors' is not supported. Switching to 'bitsandbytes_8bit'.")
+                config.quantization_config['type'] = 'bitsandbytes_8bit'
+
             # Load model with the updated configuration
             self.model = AutoModelForCausalLM.from_pretrained(
                 settings.MODEL_NAME,
@@ -44,14 +47,13 @@ class ModelService:
                 device_map="auto" if settings.DEVICE == "cuda" else None,
                 config=config
             )
-
+
             # Load sentence embedder
             self.embedder = SentenceTransformer(settings.EMBEDDER_MODEL)
-
+
         except Exception as e:
             logger.error(f"Error loading models: {e}")
             raise
 
-
     def get_models(self):
         return self.tokenizer, self.model, self.embedder
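
Two things in this diff are worth flagging. First, `LlamaConfig` is a `PretrainedConfig` object, not a dict, and dict-style `.get()` is not part of that API, so the new `config.get('quantization_config', {})` check would likely raise an `AttributeError` at runtime; likewise, `'bitsandbytes_8bit'` does not appear to be a quantization method transformers recognizes. Second, the first hunk removes `from config.config import settings` even though `settings` is still referenced throughout the file. Below is a minimal sketch of the same load path with those issues worked around; it assumes a serialized quantization config stores its method under a `quant_method` key, substitutes the documented `BitsAndBytesConfig(load_in_8bit=True)` path for the unrecognized string, and guesses the `torch_dtype` line, which sits outside the diff context above.

import logging

import torch
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    LlamaConfig,
)

from config.config import settings  # still required: settings is used below

logger = logging.getLogger(__name__)


def load_models():
    """Sketch of ModelService's load path with the issues above worked around."""
    tokenizer = AutoTokenizer.from_pretrained(settings.MODEL_NAME)

    config = LlamaConfig.from_pretrained(settings.MODEL_NAME)

    # rope_scaling override, as in the diff (rope_scaling is a plain dict here)
    if getattr(config, "rope_scaling", None) is not None:
        logger.info("Updating rope_scaling in configuration...")
        config.rope_scaling = {
            "type": "linear",
            "factor": config.rope_scaling.get("factor", 1.0),
        }

    # PretrainedConfig has no dict-style .get(); read the attribute instead.
    # Depending on how the checkpoint was saved, quantization_config may be a
    # dict or a config object, so handle both.
    quant_cfg = getattr(config, "quantization_config", None)
    if isinstance(quant_cfg, dict):
        method = quant_cfg.get("quant_method", "")
    else:
        method = getattr(quant_cfg, "quant_method", "")

    extra = {}
    if method == "compressed-tensors":
        logger.warning(
            "compressed-tensors checkpoints are not supported here; "
            "falling back to bitsandbytes 8-bit loading."
        )
        # Documented transformers API for 8-bit loading, in place of the
        # unrecognized string 'bitsandbytes_8bit'.
        extra["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
        # Drop the stale entry so it does not conflict with the override.
        config.quantization_config = None

    model = AutoModelForCausalLM.from_pretrained(
        settings.MODEL_NAME,
        # Assumption: dtype selection mirrors the device check; the original
        # line is not shown in the diff context above.
        torch_dtype=torch.float16 if settings.DEVICE == "cuda" else torch.float32,
        device_map="auto" if settings.DEVICE == "cuda" else None,
        config=config,
        **extra,
    )

    embedder = SentenceTransformer(settings.EMBEDDER_MODEL)
    return tokenizer, model, embedder

Passing the quantization override as a `from_pretrained` keyword argument rather than mutating the loaded config avoids writing an unsupported value back into the serialized configuration.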