File size: 7,487 Bytes
b953016
 
 
 
 
b08d8ce
 
 
 
b953016
 
 
 
 
 
 
b08d8ce
b953016
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b08d8ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b953016
 
 
b08d8ce
b953016
b08d8ce
b953016
b08d8ce
 
 
b953016
 
 
 
b08d8ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b953016
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b08d8ce
 
 
 
 
 
 
 
 
 
 
 
 
 
b953016
b08d8ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b953016
 
b08d8ce
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# src/vectorstores/optimized_vectorstore.py
import asyncio
from typing import Tuple, Optional, List, Dict, Any, Callable
import concurrent.futures
from functools import lru_cache
import chromadb
from chromadb.config import Settings
import shutil
import os

from .base_vectorstore import BaseVectorStore
from .chroma_vectorstore import ChromaVectorStore
from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
from src.utils.logger import logger
from config.config import settings


class OptimizedVectorStore(ChromaVectorStore):
    """Singleton Chroma-backed vector store with asynchronous, lazy setup.

    Construction only records configuration. The embedding model is loaded,
    the on-disk collection is validated, and the parent ``ChromaVectorStore``
    is initialised inside ``_initialize()``, which ``create()`` awaits under a
    class-level lock. The document-level methods inherited from the parent
    raise ``RuntimeError`` until that setup has completed.
    """

    # The one shared instance handed out by ``__new__`` / ``create()``.
    _instance: Optional['OptimizedVectorStore'] = None
    # Serialises concurrent ``create()`` calls.
    _lock = asyncio.Lock()
    # True once ``_initialize()`` has completed for the current instance.
    _initialized = False
    _embedding_model: Optional[HuggingFaceEmbedding] = None
    # Single worker thread used for embedding-model work so the event loop
    # is not blocked while the model loads or runs.
    _executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, allocating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # A freshly allocated instance has not been set up yet, whatever
            # state a previous instance left on the class.
            cls._initialized = False
        return cls._instance

    def __init__(
        self,
        embedding_function: Optional[Callable] = None,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ):
        """Record configuration; the heavy setup happens in ``_initialize()``.

        Args:
            embedding_function: Stored on the instance. ``_initialize()``
                passes the loaded model's ``embed_documents`` to the parent
                class instead of this value.
            persist_directory: Directory of the persistent Chroma database.
            collection_name: Name of the collection to open or create.
            client_settings: Stored on the instance; not applied to the
                Chroma client by this class.
        """
        # Re-running __init__ on the already configured singleton must not
        # overwrite the configuration it was initialised with.
        if self._initialized:
            return
        self._persist_directory = persist_directory
        self._collection_name = collection_name
        self._client_settings = client_settings
        self._embedding_function = embedding_function

    @staticmethod
    def _chroma_settings() -> Settings:
        """Build the Chroma settings used for every client this class opens.

        NOTE(review): Chroma is understood to reject a second client for the
        same path when its settings differ from the first, so both clients
        created here share one definition - confirm against the pinned
        chromadb version.
        """
        return Settings(
            allow_reset=True,
            is_persistent=True,
            anonymized_telemetry=False
        )

    def _ensure_initialized(self) -> None:
        """Raise ``RuntimeError`` unless ``_initialize()`` has completed."""
        if not self._initialized:
            raise RuntimeError("Vector store not initialized")

    async def _initialize(self) -> None:
        """Initialize the vector store and embedding model.

        Loads the embedding model, derives the embedding dimension from a
        probe query, wipes the on-disk database if its recorded dimension
        differs, makes sure the collection exists, and finally runs the
        parent class initialiser. Does nothing if setup already completed.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        if self._initialized:
            return

        try:
            # Load embedding model in background thread
            self._embedding_model = await self._load_embedding_model()

            # Probe the embedding dimension on the same worker thread so the
            # model inference does not block the event loop either.
            loop = asyncio.get_running_loop()
            test_embedding = await loop.run_in_executor(
                self._executor,
                self._embedding_model.embed_query,
                "test"
            )
            required_dim = len(test_embedding)

            # Clean up existing database if dimensions don't match
            await self._cleanup_if_needed(required_dim)

            client = chromadb.PersistentClient(
                path=self._persist_directory,
                settings=self._chroma_settings()
            )

            # get_or_create: after a cleanup that found matching dimensions
            # the collection still exists, and create_collection would raise
            # on it, breaking every start-up against an existing database.
            client.get_or_create_collection(
                name=self._collection_name,
                metadata={
                    "hnsw:space": "cosine",
                    # Read back by _cleanup_if_needed on later start-ups.
                    "hnsw:dim": required_dim
                }
            )

            # Initialize parent class
            super().__init__(
                embedding_function=self._embedding_model.embed_documents,
                persist_directory=self._persist_directory,
                collection_name=self._collection_name
            )

            # Set the flag on the class: create() tests cls._initialized, so
            # an instance-only flag would never be seen there.
            type(self)._initialized = True
            logger.info(
                f"Successfully initialized vector store with dimension {required_dim}")

        except Exception as e:
            logger.error(f"Error initializing vector store: {str(e)}")
            raise

    async def _cleanup_if_needed(self, required_dim: int) -> None:
        """Clean up existing database if dimensions don't match.

        Compares the ``hnsw:dim`` value stored in the existing collection's
        metadata with ``required_dim``. On a mismatch (including a collection
        without that metadata key) the database is reset and the whole
        persist directory is removed and recreated empty.

        Args:
            required_dim: Dimension produced by the current embedding model.

        Raises:
            Exception: Any failure other than a missing collection is logged
                and re-raised unchanged.
        """
        try:
            # Create temporary client to check existing collection
            temp_client = chromadb.PersistentClient(
                path=self._persist_directory,
                settings=self._chroma_settings()
            )

            try:
                collection = temp_client.get_collection(self._collection_name)
            except ValueError:
                # Collection doesn't exist, no cleanup needed.
                # NOTE(review): the exception type raised for a missing
                # collection may differ between chromadb releases - confirm
                # against the pinned version.
                return

            current_dim = (collection.metadata or {}).get("hnsw:dim")
            if current_dim == required_dim:
                return

            logger.info(
                f"Dimension mismatch: current={current_dim}, required={required_dim}")
            # reset() empties the database (allowed because the client was
            # created with allow_reset=True); it does not merely close the
            # connection.
            temp_client.reset()

            # Remove the entire directory
            if os.path.exists(self._persist_directory):
                shutil.rmtree(self._persist_directory)
                logger.info(
                    f"Removed existing database at {self._persist_directory}")

            # Recreate empty directory
            os.makedirs(self._persist_directory, exist_ok=True)

        except Exception as e:
            logger.error(f"Error during cleanup: {str(e)}")
            raise

    async def _load_embedding_model(self) -> HuggingFaceEmbedding:
        """Load embedding model in background thread.

        Returns:
            The model built (or returned from cache) by
            ``_create_embedding_model``.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            # We are inside a coroutine, so a running loop always exists.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                self._executor,
                self._create_embedding_model
            )
        except Exception as e:
            logger.error(f"Error loading embedding model: {str(e)}")
            raise

    @staticmethod
    @lru_cache(maxsize=1)
    def _create_embedding_model() -> HuggingFaceEmbedding:
        """Create and cache the embedding model named by ``settings.EMBEDDING_MODEL``."""
        return HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)

    @classmethod
    async def create(
        cls,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ) -> Tuple['OptimizedVectorStore', HuggingFaceEmbedding]:
        """Asynchronously create or get the shared instance.

        Args:
            persist_directory: Directory of the persistent Chroma database.
            collection_name: Name of the collection to open or create.
            client_settings: Forwarded to the constructor, which stores it.

        Returns:
            A ``(store, embedding_model)`` tuple for the initialised
            singleton. The arguments only take effect on the call that
            performs the initialisation.

        Raises:
            Exception: Whatever ``_initialize()`` raises; a later call then
                retries the setup.
        """
        async with cls._lock:
            if cls._instance is None or not cls._initialized:
                instance = cls(
                    persist_directory=persist_directory,
                    collection_name=collection_name,
                    client_settings=client_settings
                )
                await instance._initialize()
                cls._instance = instance
            return cls._instance, cls._instance._embedding_model

    # Override parent class methods to ensure initialization
    def add_documents(self, *args, **kwargs):
        """Forward to the parent ``add_documents`` once the store is initialised."""
        self._ensure_initialized()
        return super().add_documents(*args, **kwargs)

    def similarity_search(self, *args, **kwargs):
        """Forward to the parent ``similarity_search`` once the store is initialised."""
        self._ensure_initialized()
        return super().similarity_search(*args, **kwargs)

    def get_document_chunks(self, *args, **kwargs):
        """Forward to the parent ``get_document_chunks`` once the store is initialised."""
        self._ensure_initialized()
        return super().get_document_chunks(*args, **kwargs)

    def delete_document(self, *args, **kwargs):
        """Forward to the parent ``delete_document`` once the store is initialised."""
        self._ensure_initialized()
        return super().delete_document(*args, **kwargs)

    def get_all_documents(self, *args, **kwargs):
        """Forward to the parent ``get_all_documents`` once the store is initialised."""
        self._ensure_initialized()
        return super().get_all_documents(*args, **kwargs)


async def get_optimized_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
    """Return the shared vector store together with its embedding model.

    Delegates to ``OptimizedVectorStore.create()`` using that method's
    default arguments.
    """
    store, embedding_model = await OptimizedVectorStore.create()
    return store, embedding_model