# src/vectorstores/optimized_vectorstore.py
"""Singleton Chroma vector store whose embedding model is loaded off-loop."""
import asyncio
import concurrent.futures
import os
import shutil
from functools import lru_cache
from typing import Tuple, Optional, List, Dict, Any, Callable

import chromadb
from chromadb.config import Settings

from .base_vectorstore import BaseVectorStore
from .chroma_vectorstore import ChromaVectorStore
from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
from src.utils.logger import logger
from config.config import settings
from src.vectorstores.chroma_settings import get_chroma_settings


class OptimizedVectorStore(ChromaVectorStore):
    """Singleton ``ChromaVectorStore`` with two-phase, asynchronous set-up.

    ``__init__`` only records configuration; the parent class is initialised
    later by the coroutine ``_initialize``, which loads the embedding model in
    a worker thread first. Obtain a ready-to-use instance through
    ``await OptimizedVectorStore.create()``; the overridden data methods raise
    ``RuntimeError`` until initialisation has completed.
    """

    _instance: Optional['OptimizedVectorStore'] = None
    _lock = asyncio.Lock()
    _initialized = False
    _embedding_model: Optional[HuggingFaceEmbedding] = None
    # A single worker thread is used for loading the embedding model.
    _executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, allocating it on first use."""
        # Compare against None explicitly: a truthiness test would misfire if
        # a base class ever defines __len__/__bool__.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(
        self,
        embedding_function: Optional[Callable] = None,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ):
        """Record configuration only; the parent class is set up in ``_initialize``.

        Args:
            embedding_function: Stored on the instance as ``_embedding_function``.
            persist_directory: Directory of the persistent Chroma database.
            collection_name: Name of the Chroma collection to use.
            client_settings: Stored on the instance as ``_client_settings``.

        Once the store is initialised, further constructor calls leave the
        stored configuration untouched.
        """
        if not self._initialized:
            self._persist_directory = persist_directory
            self._collection_name = collection_name
            self._client_settings = client_settings
            self._embedding_function = embedding_function

    async def _initialize(self) -> None:
        """Initialize the vector store and embedding model.

        Loads the embedding model, probes its output dimension, removes an
        existing database whose recorded dimension differs, then initialises
        the parent class with a persistent Chroma client.

        Raises:
            Exception: Any error raised during set-up is logged and re-raised.
        """
        if self._initialized:
            return

        try:
            # Load embedding model in background thread
            self._embedding_model = await self._load_embedding_model()

            # Embed a probe string to discover the model's output dimension
            test_embedding = self._embedding_model.embed_query("test")
            required_dim = len(test_embedding)

            # Clean up existing database if dimensions don't match
            await self._cleanup_if_needed(required_dim)

            # Create ChromaDB client with the shared settings function
            client = chromadb.PersistentClient(
                path=self._persist_directory,
                settings=get_chroma_settings()
            )

            # Initialize parent class, handing over the existing client
            super().__init__(
                embedding_function=self._embedding_model.embed_documents,
                persist_directory=self._persist_directory,
                collection_name=self._collection_name,
                client=client
            )

            # Record completion on the class as well as on the instance:
            # create() tests cls._initialized, which an instance-only
            # assignment never updates.
            self._initialized = True
            type(self)._initialized = True
            logger.info(
                f"Successfully initialized vector store with dimension {required_dim}")

        except Exception as e:
            logger.error(f"Error initializing vector store: {e}")
            raise

    async def _cleanup_if_needed(self, required_dim: int) -> None:
        """Delete the on-disk database if its recorded dimension differs.

        Args:
            required_dim: Embedding dimension produced by the current model.

        Raises:
            Exception: Any error other than the ``ValueError`` handled below is
                logged and re-raised.
        """
        try:
            # Create temporary client to check existing collection
            temp_client = chromadb.PersistentClient(
                path=self._persist_directory,
                settings=Settings(allow_reset=True, is_persistent=True)
            )

            try:
                collection = temp_client.get_collection(self._collection_name)
                # NOTE(review): metadata without an "hnsw:dim" entry yields
                # current_dim=None, which compares unequal to required_dim and
                # therefore triggers the wipe below - confirm that the
                # collection is created with this key recorded.
                current_dim = collection.metadata.get(
                    "hnsw:dim") if collection.metadata else None

                if current_dim != required_dim:
                    logger.info(
                        f"Dimension mismatch: current={current_dim}, required={required_dim}")

                    # chromadb documents reset() as deleting all collections
                    # and entries; it is not merely a connection close.
                    temp_client.reset()

                    # Remove the entire directory
                    if os.path.exists(self._persist_directory):
                        shutil.rmtree(self._persist_directory)
                        logger.info(
                            f"Removed existing database at {self._persist_directory}")

                    # Recreate empty directory
                    os.makedirs(self._persist_directory, exist_ok=True)

            except ValueError:
                # Treated as "collection doesn't exist, no cleanup needed".
                # NOTE(review): assumes get_collection signals a missing
                # collection with ValueError - TODO confirm for the installed
                # chromadb version.
                pass

        except Exception as e:
            logger.error(f"Error during cleanup: {e}")
            raise

    async def _load_embedding_model(self) -> HuggingFaceEmbedding:
        """Load embedding model in background thread.

        Returns:
            The embedding model produced by ``_create_embedding_model``.

        Raises:
            Exception: Any loading error is logged and re-raised.
        """
        try:
            # Inside a coroutine the running loop is the one to schedule on.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                self._executor,
                self._create_embedding_model
            )
        except Exception as e:
            logger.error(f"Error loading embedding model: {e}")
            raise

    @staticmethod
    @lru_cache(maxsize=1)
    def _create_embedding_model() -> HuggingFaceEmbedding:
        """Create and cache embedding model for ``settings.EMBEDDING_MODEL``."""
        return HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)

    @classmethod
    async def create(
        cls,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ) -> Tuple['OptimizedVectorStore', HuggingFaceEmbedding]:
        """Asynchronously create or get the initialised singleton.

        Args:
            persist_directory: Directory of the persistent Chroma database.
            collection_name: Name of the Chroma collection to use.
            client_settings: Forwarded to the constructor.

        Returns:
            A ``(store, embedding_model)`` tuple.
        """
        async with cls._lock:
            if cls._instance is None or not cls._initialized:
                instance = cls(
                    persist_directory=persist_directory,
                    collection_name=collection_name,
                    client_settings=client_settings
                )
                await instance._initialize()
                cls._instance = instance
            return cls._instance, cls._instance._embedding_model

    def _ensure_initialized(self) -> None:
        """Raise ``RuntimeError`` unless ``_initialize`` has completed."""
        if not self._initialized:
            raise RuntimeError("Vector store not initialized")

    # Override parent class methods to ensure initialization
    def add_documents(self, *args, **kwargs):
        """Delegate to the parent ``add_documents`` once initialised."""
        self._ensure_initialized()
        return super().add_documents(*args, **kwargs)

    def similarity_search(self, *args, **kwargs):
        """Delegate to the parent ``similarity_search`` once initialised."""
        self._ensure_initialized()
        return super().similarity_search(*args, **kwargs)

    def get_document_chunks(self, *args, **kwargs):
        """Delegate to the parent ``get_document_chunks`` once initialised."""
        self._ensure_initialized()
        return super().get_document_chunks(*args, **kwargs)

    def delete_document(self, *args, **kwargs):
        """Delegate to the parent ``delete_document`` once initialised."""
        self._ensure_initialized()
        return super().delete_document(*args, **kwargs)

    def get_all_documents(self, *args, **kwargs):
        """Delegate to the parent ``get_all_documents`` once initialised."""
        self._ensure_initialized()
        return super().get_all_documents(*args, **kwargs)


async def get_optimized_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
    """Get or create an optimized vector store instance."""
    return await OptimizedVectorStore.create()