TalatMasood committed on
Commit
b08d8ce
·
1 Parent(s): 37a7e05

1) Log Google Drive document entries in MongoDB

Browse files

2) Changed the embedding model to "BAAI/bge-large-en-v1.5"

config/__pycache__/config.cpython-312.pyc CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
 
config/config.py CHANGED
@@ -27,15 +27,16 @@ class Settings:
27
  # Environment Configuration
28
  ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
29
 
30
- # Embedding Configuration
31
- @property
32
- def EMBEDDING_MODEL(self):
33
- if self.ENVIRONMENT == 'production':
34
- # Better model for demos
35
- # return os.getenv('EMBEDDING_MODEL', 'openai/text-embedding-3-large')
36
- return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
37
- # Better for development purposes.
38
- return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
 
39
 
40
  # MongoDB Configuration
41
  # MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017')
 
27
  # Environment Configuration
28
  ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
29
 
30
+ # Embedding Configuration - Updated for BAAI model
31
+ EMBEDDING_MODEL = 'BAAI/bge-large-en-v1.5'
32
+ # @property
33
+ # def EMBEDDING_MODEL(self):
34
+ # if self.ENVIRONMENT == 'production':
35
+ # # Better model for demos
36
+ # return os.getenv('EMBEDDING_MODEL', 'openai/text-embedding-3-large')
37
+ # #return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
38
+ # # Better for development purposes.
39
+ # return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
40
 
41
  # MongoDB Configuration
42
  # MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017')
src/__pycache__/main.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
 
src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc differ
 
src/vectorstores/optimized_vectorstore.py CHANGED
@@ -3,6 +3,10 @@ import asyncio
3
  from typing import Tuple, Optional, List, Dict, Any, Callable
4
  import concurrent.futures
5
  from functools import lru_cache
 
 
 
 
6
 
7
  from .base_vectorstore import BaseVectorStore
8
  from .chroma_vectorstore import ChromaVectorStore
@@ -10,11 +14,8 @@ from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
10
  from src.utils.logger import logger
11
  from config.config import settings
12
 
 
13
  class OptimizedVectorStore(ChromaVectorStore):
14
- """
15
- Optimized vector store that maintains ChromaVectorStore compatibility
16
- while adding caching and async initialization
17
- """
18
  _instance: Optional['OptimizedVectorStore'] = None
19
  _lock = asyncio.Lock()
20
  _initialized = False
@@ -33,41 +34,11 @@ class OptimizedVectorStore(ChromaVectorStore):
33
  collection_name: str = "documents",
34
  client_settings: Optional[Dict[str, Any]] = None
35
  ):
36
- """
37
- Initialize the optimized vector store
38
- Note: The actual initialization is deferred until needed
39
- """
40
  if not self._initialized:
41
  self._persist_directory = persist_directory
42
  self._collection_name = collection_name
43
  self._client_settings = client_settings
44
  self._embedding_function = embedding_function
45
- # Don't call super().__init__() here - we'll do it in _initialize()
46
-
47
- @classmethod
48
- async def create(
49
- cls,
50
- persist_directory: str = settings.CHROMA_PATH,
51
- collection_name: str = "documents",
52
- client_settings: Optional[Dict[str, Any]] = None
53
- ) -> Tuple['OptimizedVectorStore', HuggingFaceEmbedding]:
54
- """
55
- Asynchronously create or get instance
56
-
57
- Returns:
58
- Tuple[OptimizedVectorStore, HuggingFaceEmbedding]:
59
- The vector store instance and embedding model
60
- """
61
- async with cls._lock:
62
- if not cls._instance or not cls._initialized:
63
- instance = cls(
64
- persist_directory=persist_directory,
65
- collection_name=collection_name,
66
- client_settings=client_settings
67
- )
68
- await instance._initialize()
69
- cls._instance = instance
70
- return cls._instance, cls._instance._embedding_model
71
 
72
  async def _initialize(self) -> None:
73
  """Initialize the vector store and embedding model"""
@@ -77,21 +48,86 @@ class OptimizedVectorStore(ChromaVectorStore):
77
  try:
78
  # Load embedding model in background thread
79
  self._embedding_model = await self._load_embedding_model()
80
-
81
- # Initialize ChromaVectorStore with the loaded model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  super().__init__(
83
  embedding_function=self._embedding_model.embed_documents,
84
  persist_directory=self._persist_directory,
85
- collection_name=self._collection_name,
86
- client_settings=self._client_settings
87
  )
88
-
89
  self._initialized = True
90
-
 
 
91
  except Exception as e:
92
  logger.error(f"Error initializing vector store: {str(e)}")
93
  raise
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  async def _load_embedding_model(self) -> HuggingFaceEmbedding:
96
  """Load embedding model in background thread"""
97
  try:
@@ -110,28 +146,52 @@ class OptimizedVectorStore(ChromaVectorStore):
110
  """Create and cache embedding model"""
111
  return HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)
112
 
113
- def __getattribute__(self, name):
114
- """
115
- Ensure initialization before accessing any ChromaVectorStore methods
116
- """
117
- # Get the attribute from the class
118
- attr = super().__getattribute__(name)
119
-
120
- # If it's a method from ChromaVectorStore, ensure initialization
121
- if callable(attr) and name in ChromaVectorStore.__dict__:
122
- if not self._initialized:
123
- raise RuntimeError(
124
- "Vector store not initialized. Please use 'await OptimizedVectorStore.create()'"
 
 
125
  )
126
- return attr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- # Factory function for getting optimized vector store
129
  async def get_optimized_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
130
- """
131
- Get or create an optimized vector store instance
132
-
133
- Returns:
134
- Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
135
- The vector store and embedding model instances
136
- """
137
- return await OptimizedVectorStore.create()
 
3
  from typing import Tuple, Optional, List, Dict, Any, Callable
4
  import concurrent.futures
5
  from functools import lru_cache
6
+ import chromadb
7
+ from chromadb.config import Settings
8
+ import shutil
9
+ import os
10
 
11
  from .base_vectorstore import BaseVectorStore
12
  from .chroma_vectorstore import ChromaVectorStore
 
14
  from src.utils.logger import logger
15
  from config.config import settings
16
 
17
+
18
  class OptimizedVectorStore(ChromaVectorStore):
 
 
 
 
19
  _instance: Optional['OptimizedVectorStore'] = None
20
  _lock = asyncio.Lock()
21
  _initialized = False
 
34
  collection_name: str = "documents",
35
  client_settings: Optional[Dict[str, Any]] = None
36
  ):
 
 
 
 
37
  if not self._initialized:
38
  self._persist_directory = persist_directory
39
  self._collection_name = collection_name
40
  self._client_settings = client_settings
41
  self._embedding_function = embedding_function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  async def _initialize(self) -> None:
44
  """Initialize the vector store and embedding model"""
 
48
  try:
49
  # Load embedding model in background thread
50
  self._embedding_model = await self._load_embedding_model()
51
+
52
+ # Get embedding dimension
53
+ test_embedding = self._embedding_model.embed_query("test")
54
+ required_dim = len(test_embedding)
55
+
56
+ # Clean up existing database if dimensions don't match
57
+ await self._cleanup_if_needed(required_dim)
58
+
59
+ # Create ChromaDB client with fresh settings
60
+ client = chromadb.PersistentClient(
61
+ path=self._persist_directory,
62
+ settings=Settings(
63
+ allow_reset=True,
64
+ is_persistent=True,
65
+ anonymized_telemetry=False
66
+ )
67
+ )
68
+
69
+ # Create new collection with correct dimensions
70
+ collection = client.create_collection(
71
+ name=self._collection_name,
72
+ metadata={
73
+ "hnsw:space": "cosine",
74
+ "hnsw:dim": required_dim
75
+ }
76
+ )
77
+
78
+ # Initialize parent class
79
  super().__init__(
80
  embedding_function=self._embedding_model.embed_documents,
81
  persist_directory=self._persist_directory,
82
+ collection_name=self._collection_name
 
83
  )
84
+
85
  self._initialized = True
86
+ logger.info(
87
+ f"Successfully initialized vector store with dimension {required_dim}")
88
+
89
  except Exception as e:
90
  logger.error(f"Error initializing vector store: {str(e)}")
91
  raise
92
 
93
+ async def _cleanup_if_needed(self, required_dim: int) -> None:
94
+ """Clean up existing database if dimensions don't match"""
95
+ try:
96
+ # Create temporary client to check existing collection
97
+ temp_client = chromadb.PersistentClient(
98
+ path=self._persist_directory,
99
+ settings=Settings(allow_reset=True, is_persistent=True)
100
+ )
101
+
102
+ try:
103
+ # Try to get existing collection
104
+ collection = temp_client.get_collection(self._collection_name)
105
+ current_dim = collection.metadata.get(
106
+ "hnsw:dim") if collection.metadata else None
107
+
108
+ if current_dim != required_dim:
109
+ logger.info(
110
+ f"Dimension mismatch: current={current_dim}, required={required_dim}")
111
+ # Close client connection
112
+ temp_client.reset()
113
+
114
+ # Remove the entire directory
115
+ if os.path.exists(self._persist_directory):
116
+ shutil.rmtree(self._persist_directory)
117
+ logger.info(
118
+ f"Removed existing database at {self._persist_directory}")
119
+
120
+ # Recreate empty directory
121
+ os.makedirs(self._persist_directory, exist_ok=True)
122
+
123
+ except ValueError:
124
+ # Collection doesn't exist, no cleanup needed
125
+ pass
126
+
127
+ except Exception as e:
128
+ logger.error(f"Error during cleanup: {str(e)}")
129
+ raise
130
+
131
  async def _load_embedding_model(self) -> HuggingFaceEmbedding:
132
  """Load embedding model in background thread"""
133
  try:
 
146
  """Create and cache embedding model"""
147
  return HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)
148
 
149
+ @classmethod
150
+ async def create(
151
+ cls,
152
+ persist_directory: str = settings.CHROMA_PATH,
153
+ collection_name: str = "documents",
154
+ client_settings: Optional[Dict[str, Any]] = None
155
+ ) -> Tuple['OptimizedVectorStore', HuggingFaceEmbedding]:
156
+ """Asynchronously create or get instance"""
157
+ async with cls._lock:
158
+ if not cls._instance or not cls._initialized:
159
+ instance = cls(
160
+ persist_directory=persist_directory,
161
+ collection_name=collection_name,
162
+ client_settings=client_settings
163
  )
164
+ await instance._initialize()
165
+ cls._instance = instance
166
+ return cls._instance, cls._instance._embedding_model
167
+
168
+ # Override parent class methods to ensure initialization
169
+ def add_documents(self, *args, **kwargs):
170
+ if not self._initialized:
171
+ raise RuntimeError("Vector store not initialized")
172
+ return super().add_documents(*args, **kwargs)
173
+
174
+ def similarity_search(self, *args, **kwargs):
175
+ if not self._initialized:
176
+ raise RuntimeError("Vector store not initialized")
177
+ return super().similarity_search(*args, **kwargs)
178
+
179
+ def get_document_chunks(self, *args, **kwargs):
180
+ if not self._initialized:
181
+ raise RuntimeError("Vector store not initialized")
182
+ return super().get_document_chunks(*args, **kwargs)
183
+
184
+ def delete_document(self, *args, **kwargs):
185
+ if not self._initialized:
186
+ raise RuntimeError("Vector store not initialized")
187
+ return super().delete_document(*args, **kwargs)
188
+
189
+ def get_all_documents(self, *args, **kwargs):
190
+ if not self._initialized:
191
+ raise RuntimeError("Vector store not initialized")
192
+ return super().get_all_documents(*args, **kwargs)
193
+
194
 
 
195
  async def get_optimized_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
196
+ """Get or create an optimized vector store instance"""
197
+ return await OptimizedVectorStore.create()