Chris4K committed on
Commit
16f48ef
·
verified ·
1 Parent(s): 8c9754a

Update services/model_service.py

Browse files
Files changed (1) hide show
  1. services/model_service.py +3 -14
services/model_service.py CHANGED
@@ -25,16 +25,7 @@ class ModelService:
25
  # Load tokenizer
26
  self.tokenizer = AutoTokenizer.from_pretrained(settings.MODEL_NAME)
27
 
28
- # Load model configuration
29
- config = LlamaConfig.from_pretrained(settings.MODEL_NAME)
30
-
31
- # Check and update rope_scaling if necessary
32
- if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
33
- logger.info("Updating rope_scaling in configuration...")
34
- config.rope_scaling = {
35
- "type": "linear", # Ensure this matches the expected type
36
- "factor": config.rope_scaling.get('factor', 1.0) # Use existing factor or default to 1.0
37
- }
38
 
39
  # Check quantization type and adjust accordingly
40
  if config.get('quantization_config', {}).get('type', '') == 'compressed-tensors':
@@ -43,11 +34,9 @@ class ModelService:
43
 
44
  # Load model with the updated configuration
45
  self.model = AutoModelForCausalLM.from_pretrained(
46
- settings.MODEL_NAME,
47
- model_type = "llama",
48
  torch_dtype=torch.float16 if settings.DEVICE == "cuda" else torch.float32,
49
- device_map="auto" if settings.DEVICE == "cuda" else None,
50
- config=config
51
  )
52
 
53
  # Load sentence embedder
 
25
  # Load tokenizer
26
  self.tokenizer = AutoTokenizer.from_pretrained(settings.MODEL_NAME)
27
 
28
+
 
 
 
 
 
 
 
 
 
29
 
30
  # Check quantization type and adjust accordingly
31
  if config.get('quantization_config', {}).get('type', '') == 'compressed-tensors':
 
34
 
35
  # Load model with the updated configuration
36
  self.model = AutoModelForCausalLM.from_pretrained(
37
+ settings.MODEL_NAME,
 
38
  torch_dtype=torch.float16 if settings.DEVICE == "cuda" else torch.float32,
39
+ device_map="auto" if settings.DEVICE == "cuda" else None
 
40
  )
41
 
42
  # Load sentence embedder