Spaces:
Running
Running
# src/vectorstores/optimized_vectorstore.py
import asyncio | |
from typing import Tuple, Optional, List, Dict, Any, Callable | |
import concurrent.futures | |
from functools import lru_cache | |
import chromadb | |
from chromadb.config import Settings | |
import shutil | |
import os | |
from .base_vectorstore import BaseVectorStore | |
from .chroma_vectorstore import ChromaVectorStore | |
from src.embeddings.huggingface_embedding import HuggingFaceEmbedding | |
from src.utils.logger import logger | |
from config.config import settings | |
from src.vectorstores.chroma_settings import get_chroma_settings | |
class OptimizedVectorStore(ChromaVectorStore):
    """Singleton Chroma-backed vector store that is set up asynchronously.

    ``__new__`` hands back one shared instance and ``__init__`` only records
    configuration.  The expensive work (loading the embedding model, checking
    the persisted dimension, opening the Chroma client and running the parent
    initializer) happens in ``_initialize``.  Use the ``create`` classmethod
    to obtain a ready instance; the data-access overrides raise
    ``RuntimeError`` until initialization has completed.
    """

    # The one shared instance handed out by __new__ / create().
    _instance: Optional['OptimizedVectorStore'] = None
    # Serializes create() calls.
    # NOTE(review): this lock is built at import time; on Python < 3.10 an
    # asyncio.Lock binds to the event loop current at construction - confirm
    # the supported Python version.
    _lock = asyncio.Lock()
    # Class-level default; _initialize() sets it to True on the instance.
    _initialized = False
    _embedding_model: Optional[HuggingFaceEmbedding] = None
    # Single worker thread used to load the embedding model off the event loop.
    _executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, allocating it on first use."""
        # Identity check rather than truthiness, so an instance that happens
        # to evaluate as falsy is never replaced by a second one.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(
        self,
        embedding_function: Optional[Callable] = None,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ):
        """Record configuration; no model or database work happens here.

        Because ``__new__`` returns a shared instance, ``__init__`` can run
        more than once on it.  The arguments are ignored once the instance
        has been initialized.

        Args:
            embedding_function: Stored on the instance.  ``_initialize``
                passes the loaded model's ``embed_documents`` to the parent
                initializer instead.
            persist_directory: Directory of the persistent Chroma database.
            collection_name: Name of the collection to use.
            client_settings: Stored on the instance; not read elsewhere in
                this class.
        """
        if not self._initialized:
            self._persist_directory = persist_directory
            self._collection_name = collection_name
            self._client_settings = client_settings
            self._embedding_function = embedding_function

    def _ensure_initialized(self) -> None:
        """Raise ``RuntimeError`` unless ``_initialize`` has completed."""
        if not self._initialized:
            raise RuntimeError("Vector store not initialized")

    async def _initialize(self) -> None:
        """Initialize the vector store and embedding model.

        Does nothing when the instance is already initialized.

        Raises:
            Exception: Any error raised during setup is logged and re-raised.
        """
        if self._initialized:
            return
        try:
            # Load embedding model in background thread
            self._embedding_model = await self._load_embedding_model()

            # Embed a probe string to learn the model's output dimension
            test_embedding = self._embedding_model.embed_query("test")
            required_dim = len(test_embedding)

            # Clean up existing database if dimensions don't match
            await self._cleanup_if_needed(required_dim)

            # Create ChromaDB client with the shared settings function
            client = chromadb.PersistentClient(
                path=self._persist_directory,
                settings=get_chroma_settings()
            )

            # Initialize parent class, handing it the client created above
            super().__init__(
                embedding_function=self._embedding_model.embed_documents,
                persist_directory=self._persist_directory,
                collection_name=self._collection_name,
                client=client
            )
            self._initialized = True
            logger.info(
                f"Successfully initialized vector store with dimension {required_dim}")
        except Exception as e:
            logger.error(f"Error initializing vector store: {e}")
            raise

    async def _cleanup_if_needed(self, required_dim: int) -> None:
        """Delete the persisted database when its dimension doesn't match.

        Opens a temporary client on the persist directory and reads the
        collection's ``"hnsw:dim"`` metadata entry.  On a mismatch the client
        is reset, the whole persist directory is removed and an empty
        directory is recreated.

        Args:
            required_dim: Embedding dimension produced by the current model.

        Raises:
            Exception: Anything other than the ``ValueError`` handled below
                is logged and re-raised.
        """
        try:
            # Create temporary client to check existing collection
            temp_client = chromadb.PersistentClient(
                path=self._persist_directory,
                settings=Settings(allow_reset=True, is_persistent=True)
            )
            try:
                # Try to get existing collection
                collection = temp_client.get_collection(self._collection_name)
                current_dim = collection.metadata.get(
                    "hnsw:dim") if collection.metadata else None
                # NOTE(review): a collection without an "hnsw:dim" metadata
                # entry yields None here, which also counts as a mismatch and
                # triggers the wipe below - confirm that is intended.
                if current_dim != required_dim:
                    logger.info(
                        f"Dimension mismatch: current={current_dim}, required={required_dim}")
                    # Reset the database through the temporary client (this is
                    # why it is created with allow_reset=True) before the
                    # files are removed from disk.
                    temp_client.reset()
                    # Remove the entire directory
                    if os.path.exists(self._persist_directory):
                        shutil.rmtree(self._persist_directory)
                        logger.info(
                            f"Removed existing database at {self._persist_directory}")
                    # Recreate empty directory
                    os.makedirs(self._persist_directory, exist_ok=True)
            except ValueError:
                # Treated as "collection doesn't exist": no cleanup needed.
                pass
        except Exception as e:
            logger.error(f"Error during cleanup: {e}")
            raise

    async def _load_embedding_model(self) -> HuggingFaceEmbedding:
        """Load the embedding model on the class executor thread.

        Returns:
            The model built by ``_create_embedding_model``.

        Raises:
            Exception: Any loading error is logged and re-raised.
        """
        try:
            # Inside a coroutine the running loop is the one to schedule on.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                self._executor,
                self._create_embedding_model
            )
        except Exception as e:
            logger.error(f"Error loading embedding model: {e}")
            raise

    @staticmethod
    @lru_cache(maxsize=1)
    def _create_embedding_model() -> HuggingFaceEmbedding:
        """Create and cache embedding model.

        Takes no arguments, so it is a static method; ``lru_cache`` makes
        repeated calls return the same model object.
        """
        return HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)

    @classmethod
    async def create(
        cls,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ) -> Tuple['OptimizedVectorStore', HuggingFaceEmbedding]:
        """Asynchronously create or get the shared instance.

        Args:
            persist_directory: Directory of the persistent Chroma database.
            collection_name: Name of the collection to use.
            client_settings: Forwarded to the constructor.

        Returns:
            A ``(store, embedding_model)`` tuple for the shared instance.
        """
        async with cls._lock:
            existing = cls._instance
            # The initialized flag is set on the instance by _initialize(),
            # so it has to be read from the instance, not from the class.
            if existing is None or not existing._initialized:
                instance = cls(
                    persist_directory=persist_directory,
                    collection_name=collection_name,
                    client_settings=client_settings
                )
                await instance._initialize()
                cls._instance = instance
            return cls._instance, cls._instance._embedding_model

    # Override parent class methods to ensure initialization
    def add_documents(self, *args, **kwargs):
        """Forward to the parent ``add_documents`` once initialized."""
        self._ensure_initialized()
        return super().add_documents(*args, **kwargs)

    def similarity_search(self, *args, **kwargs):
        """Forward to the parent ``similarity_search`` once initialized."""
        self._ensure_initialized()
        return super().similarity_search(*args, **kwargs)

    def get_document_chunks(self, *args, **kwargs):
        """Forward to the parent ``get_document_chunks`` once initialized."""
        self._ensure_initialized()
        return super().get_document_chunks(*args, **kwargs)

    def delete_document(self, *args, **kwargs):
        """Forward to the parent ``delete_document`` once initialized."""
        self._ensure_initialized()
        return super().delete_document(*args, **kwargs)

    def get_all_documents(self, *args, **kwargs):
        """Forward to the parent ``get_all_documents`` once initialized."""
        self._ensure_initialized()
        return super().get_all_documents(*args, **kwargs)
async def get_optimized_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
    """Return the result of ``OptimizedVectorStore.create()``.

    Module-level convenience wrapper: it awaits ``create`` with that
    method's default arguments and hands its result back unchanged.
    """
    store_and_model = await OptimizedVectorStore.create()
    return store_and_model