|
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig |
|
from sentence_transformers import SentenceTransformer |
|
import torch |
|
from functools import lru_cache |
|
from config.config import settings |
|
import logging |
|
|
|
# Module-level logger, named after this module, used for model-loading messages.
logger = logging.getLogger(__name__)
|
|
|
class ModelService:
    """Singleton that loads and holds the tokenizer, causal LM and embedder.

    Every ``ModelService()`` call returns the same instance. The models are
    loaded the first time construction succeeds; if loading raises, the
    instance stays uninitialised so that the next construction retries
    instead of handing out an object with no models attached.
    """

    # The single shared instance; created by the first ``ModelService()`` call.
    _instance = None

    def __new__(cls):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            instance = super().__new__(cls)
            # ``__init__`` runs on every ``ModelService()`` call, so the
            # instance carries its own flag saying whether models are loaded.
            instance._initialized = False
            cls._instance = instance
        return cls._instance

    def __init__(self):
        """Load the models unless a previous construction already did.

        Raises:
            Exception: Whatever ``_load_models`` raises is propagated.
        """
        if self._initialized:
            return
        self._load_models()
        # Mark the instance as ready only after loading succeeded; setting
        # the flag first would make a failed load permanent for the process.
        self._initialized = True

    def _load_models(self):
        """Load the tokenizer, language model and sentence embedder.

        Reads ``settings.MODEL_NAME``, ``settings.DEVICE`` and
        ``settings.EMBEDDER_MODEL``. On success sets ``self.tokenizer``,
        ``self.model`` and ``self.embedder``; on failure none of them is
        modified.

        Raises:
            Exception: Any loading error is logged and re-raised unchanged.
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(settings.MODEL_NAME)

            config = LlamaConfig.from_pretrained(settings.MODEL_NAME)
            rope_scaling = getattr(config, "rope_scaling", None)
            if rope_scaling is not None:
                # Replace whatever scaling dict the checkpoint ships with a
                # two-key "linear" one, keeping the original factor.
                # NOTE(review): presumably a workaround for transformers
                # versions that reject other rope_scaling layouts - confirm.
                logger.info("Updating rope_scaling in configuration...")
                config.rope_scaling = {
                    "type": "linear",
                    "factor": rope_scaling.get("factor", 1.0),
                }

            use_cuda = settings.DEVICE == "cuda"
            model = AutoModelForCausalLM.from_pretrained(
                settings.MODEL_NAME,
                torch_dtype=torch.float16 if use_cuda else torch.float32,
                device_map="auto" if use_cuda else None,
                config=config,
            )

            embedder = SentenceTransformer(settings.EMBEDDER_MODEL)
        except Exception as e:
            logger.error(f"Error loading models: {e}")
            raise

        # Publish all three together so a failure part-way through never
        # leaves the instance with only some of its models attached.
        self.tokenizer = tokenizer
        self.model = model
        self.embedder = embedder

    def get_models(self):
        """Return the loaded ``(tokenizer, model, embedder)`` tuple."""
        return self.tokenizer, self.model, self.embedder
|
|