chatbot-backend / src /vectorstores /optimized_vectorstore.py
TalatMasood's picture
Updating chroma db to be singleton class
6082154
raw
history blame
7.33 kB
# src/vectorstores/optimized_vectorstore.py
import asyncio
from typing import Tuple, Optional, List, Dict, Any, Callable
import concurrent.futures
from functools import lru_cache
import chromadb
from chromadb.config import Settings
import shutil
import os
from .base_vectorstore import BaseVectorStore
from .chroma_vectorstore import ChromaVectorStore
from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
from src.utils.logger import logger
from config.config import settings
from src.vectorstores.chroma_settings import get_chroma_settings
class OptimizedVectorStore(ChromaVectorStore):
    """Singleton Chroma-backed vector store with lazy, asynchronous setup.

    Constructing the class only records configuration. The embedding model
    and the ChromaDB client are created by ``_initialize``, normally through
    the ``create`` coroutine. Until that has completed, the overridden data
    methods raise ``RuntimeError``.
    """

    _instance: Optional['OptimizedVectorStore'] = None
    # Held by create() so that concurrent callers initialise the store once.
    _lock = asyncio.Lock()
    # Class-level default; _initialize() sets the flag on the instance.
    _initialized = False
    _embedding_model: Optional[HuggingFaceEmbedding] = None
    # Single worker used to keep model loading/probing off the event loop.
    _executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, allocating it on first use."""
        # Identity check instead of truthiness: an instance whose class
        # defines __len__/__bool__ must not be re-allocated when it is falsy.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(
        self,
        embedding_function: Optional[Callable] = None,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ):
        """Record configuration for a later ``_initialize`` call.

        The parent constructor is intentionally not called here; it is
        invoked from ``_initialize`` once the embedding model and the
        ChromaDB client exist. Arguments are ignored once the instance has
        been initialised.

        Args:
            embedding_function: Stored as-is; ``_initialize`` passes the
                loaded model's ``embed_documents`` to the parent instead.
            persist_directory: Directory holding the ChromaDB files.
            collection_name: Name of the collection to use.
            client_settings: Stored as-is; not read elsewhere in this class.
        """
        if not self._initialized:
            self._persist_directory = persist_directory
            self._collection_name = collection_name
            self._client_settings = client_settings
            self._embedding_function = embedding_function

    def _ensure_initialized(self) -> None:
        """Raise ``RuntimeError`` unless ``_initialize`` has completed."""
        if not self._initialized:
            raise RuntimeError("Vector store not initialized")

    async def _initialize(self) -> None:
        """Initialize the vector store and embedding model.

        Loads the embedding model, probes its output dimension, cleans up an
        incompatible on-disk database, then initialises the parent class with
        a persistent ChromaDB client. Does nothing if already initialised.

        Raises:
            Exception: Any error raised during setup is logged and re-raised.
        """
        if self._initialized:
            return
        try:
            # Load embedding model in background thread
            self._embedding_model = await self._load_embedding_model()

            # Probe the embedding dimension on the same executor so the
            # model's forward pass does not block the event loop either.
            loop = asyncio.get_running_loop()
            test_embedding = await loop.run_in_executor(
                self._executor,
                self._embedding_model.embed_query,
                "test"
            )
            required_dim = len(test_embedding)

            # Clean up existing database if dimensions don't match
            await self._cleanup_if_needed(required_dim)

            # Create ChromaDB client with consistent settings
            client = chromadb.PersistentClient(
                path=self._persist_directory,
                settings=get_chroma_settings()  # Use shared settings function
            )

            # Initialize parent class
            super().__init__(
                embedding_function=self._embedding_model.embed_documents,
                persist_directory=self._persist_directory,
                collection_name=self._collection_name,
                client=client  # Pass the existing client
            )
            self._initialized = True
            logger.info(
                f"Successfully initialized vector store with dimension {required_dim}")
        except Exception as e:
            logger.error(f"Error initializing vector store: {str(e)}")
            raise

    @staticmethod
    def _stored_dimension(collection) -> Optional[int]:
        """Return the embedding dimension recorded for ``collection``.

        Uses the ``"hnsw:dim"`` metadata entry when present; otherwise
        measures one stored embedding. Returns ``None`` when neither source
        is available (no metadata entry and no stored vectors).
        """
        metadata = collection.metadata or {}
        recorded = metadata.get("hnsw:dim")
        if recorded is not None:
            return recorded
        sample = collection.get(limit=1, include=["embeddings"])
        vectors = sample.get("embeddings")
        # len() works for both list and array results.
        if vectors is not None and len(vectors) > 0:
            return len(vectors[0])
        return None

    async def _cleanup_if_needed(self, required_dim: int) -> None:
        """Clean up existing database if dimensions don't match.

        The persist directory is removed and recreated only when the
        existing collection's dimension is known and differs from
        ``required_dim``. A missing collection, or one whose dimension
        cannot be determined, is left untouched so stored data is never
        deleted on a guess.

        Args:
            required_dim: Dimension produced by the current embedding model.

        Raises:
            Exception: Any error during the check or removal is logged and
                re-raised.
        """
        try:
            # Create temporary client to check existing collection
            # NOTE(review): these settings differ from get_chroma_settings()
            # used by _initialize; chromadb may refuse two clients on the
            # same path with different settings - confirm they agree.
            temp_client = chromadb.PersistentClient(
                path=self._persist_directory,
                settings=Settings(allow_reset=True, is_persistent=True)
            )
            try:
                # Only the lookup is guarded, so a ValueError raised by the
                # later reset/removal steps is not mistaken for "missing".
                collection = temp_client.get_collection(self._collection_name)
            except ValueError:
                # Collection doesn't exist, no cleanup needed
                return

            current_dim = self._stored_dimension(collection)
            if current_dim is None:
                logger.info(
                    "Existing collection has no recorded dimension and no "
                    "stored embeddings; skipping cleanup")
                return
            if current_dim == required_dim:
                return

            logger.info(
                f"Dimension mismatch: current={current_dim}, required={required_dim}")
            # reset() empties the database (permitted by allow_reset=True)
            # before the files themselves are removed.
            temp_client.reset()
            # Remove the entire directory
            if os.path.exists(self._persist_directory):
                shutil.rmtree(self._persist_directory)
                logger.info(
                    f"Removed existing database at {self._persist_directory}")
            # Recreate empty directory
            os.makedirs(self._persist_directory, exist_ok=True)
        except Exception as e:
            logger.error(f"Error during cleanup: {str(e)}")
            raise

    async def _load_embedding_model(self) -> HuggingFaceEmbedding:
        """Load embedding model in background thread.

        Raises:
            Exception: Any loading error is logged and re-raised.
        """
        try:
            # Called from a coroutine, so the running loop is the right one.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                self._executor,
                self._create_embedding_model
            )
        except Exception as e:
            logger.error(f"Error loading embedding model: {str(e)}")
            raise

    @staticmethod
    @lru_cache(maxsize=1)
    def _create_embedding_model() -> HuggingFaceEmbedding:
        """Create and cache embedding model"""
        return HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)

    @classmethod
    async def create(
        cls,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ) -> Tuple['OptimizedVectorStore', HuggingFaceEmbedding]:
        """Asynchronously create or get instance.

        Args:
            persist_directory: Directory holding the ChromaDB files.
            collection_name: Name of the collection to use.
            client_settings: Forwarded to the constructor.

        Returns:
            The shared store instance and its embedding model.
        """
        async with cls._lock:
            instance = cls._instance
            # The flag is set on the instance by _initialize(), so it must
            # be read from the instance; the class attribute stays False.
            if instance is None or not instance._initialized:
                instance = cls(
                    persist_directory=persist_directory,
                    collection_name=collection_name,
                    client_settings=client_settings
                )
                await instance._initialize()
                cls._instance = instance
            return instance, instance._embedding_model

    # Override parent class methods to ensure initialization
    def add_documents(self, *args, **kwargs):
        """Delegate to the parent after checking initialisation."""
        self._ensure_initialized()
        return super().add_documents(*args, **kwargs)

    def similarity_search(self, *args, **kwargs):
        """Delegate to the parent after checking initialisation."""
        self._ensure_initialized()
        return super().similarity_search(*args, **kwargs)

    def get_document_chunks(self, *args, **kwargs):
        """Delegate to the parent after checking initialisation."""
        self._ensure_initialized()
        return super().get_document_chunks(*args, **kwargs)

    def delete_document(self, *args, **kwargs):
        """Delegate to the parent after checking initialisation."""
        self._ensure_initialized()
        return super().delete_document(*args, **kwargs)

    def get_all_documents(self, *args, **kwargs):
        """Delegate to the parent after checking initialisation."""
        self._ensure_initialized()
        return super().get_all_documents(*args, **kwargs)
async def get_optimized_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
    """Return the result of ``OptimizedVectorStore.create()`` with defaults.

    Returns:
        The tuple produced by ``create``: the store and its embedding model.
    """
    store_and_model = await OptimizedVectorStore.create()
    return store_and_model