Spaces:
Running
Running
Commit
·
b08d8ce
1
Parent(s):
37a7e05
1) Log Google Drive document entries in MongoDB
Browse files
2) Changed the embedding model to "BAAI/bge-large-en-v1.5"
config/__pycache__/config.cpython-312.pyc
CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
|
|
config/config.py
CHANGED
@@ -27,15 +27,16 @@ class Settings:
|
|
27 |
# Environment Configuration
|
28 |
ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
|
29 |
|
30 |
-
# Embedding Configuration
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
39 |
|
40 |
# MongoDB Configuration
|
41 |
# MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017')
|
|
|
27 |
# Environment Configuration
|
28 |
ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
|
29 |
|
30 |
+
# Embedding Configuration - Updated for BAAI model
|
31 |
+
EMBEDDING_MODEL = 'BAAI/bge-large-en-v1.5'
|
32 |
+
# @property
|
33 |
+
# def EMBEDDING_MODEL(self):
|
34 |
+
# if self.ENVIRONMENT == 'production':
|
35 |
+
# # Better model for demos
|
36 |
+
# return os.getenv('EMBEDDING_MODEL', 'openai/text-embedding-3-large')
|
37 |
+
# #return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
|
38 |
+
# # Better for development purposes.
|
39 |
+
# return os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
|
40 |
|
41 |
# MongoDB Configuration
|
42 |
# MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017')
|
src/__pycache__/main.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
|
|
src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc
CHANGED
Binary files a/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc differ
|
|
src/vectorstores/optimized_vectorstore.py
CHANGED
@@ -3,6 +3,10 @@ import asyncio
|
|
3 |
from typing import Tuple, Optional, List, Dict, Any, Callable
|
4 |
import concurrent.futures
|
5 |
from functools import lru_cache
|
|
|
|
|
|
|
|
|
6 |
|
7 |
from .base_vectorstore import BaseVectorStore
|
8 |
from .chroma_vectorstore import ChromaVectorStore
|
@@ -10,11 +14,8 @@ from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
|
|
10 |
from src.utils.logger import logger
|
11 |
from config.config import settings
|
12 |
|
|
|
13 |
class OptimizedVectorStore(ChromaVectorStore):
|
14 |
-
"""
|
15 |
-
Optimized vector store that maintains ChromaVectorStore compatibility
|
16 |
-
while adding caching and async initialization
|
17 |
-
"""
|
18 |
_instance: Optional['OptimizedVectorStore'] = None
|
19 |
_lock = asyncio.Lock()
|
20 |
_initialized = False
|
@@ -33,41 +34,11 @@ class OptimizedVectorStore(ChromaVectorStore):
|
|
33 |
collection_name: str = "documents",
|
34 |
client_settings: Optional[Dict[str, Any]] = None
|
35 |
):
|
36 |
-
"""
|
37 |
-
Initialize the optimized vector store
|
38 |
-
Note: The actual initialization is deferred until needed
|
39 |
-
"""
|
40 |
if not self._initialized:
|
41 |
self._persist_directory = persist_directory
|
42 |
self._collection_name = collection_name
|
43 |
self._client_settings = client_settings
|
44 |
self._embedding_function = embedding_function
|
45 |
-
# Don't call super().__init__() here - we'll do it in _initialize()
|
46 |
-
|
47 |
-
@classmethod
|
48 |
-
async def create(
|
49 |
-
cls,
|
50 |
-
persist_directory: str = settings.CHROMA_PATH,
|
51 |
-
collection_name: str = "documents",
|
52 |
-
client_settings: Optional[Dict[str, Any]] = None
|
53 |
-
) -> Tuple['OptimizedVectorStore', HuggingFaceEmbedding]:
|
54 |
-
"""
|
55 |
-
Asynchronously create or get instance
|
56 |
-
|
57 |
-
Returns:
|
58 |
-
Tuple[OptimizedVectorStore, HuggingFaceEmbedding]:
|
59 |
-
The vector store instance and embedding model
|
60 |
-
"""
|
61 |
-
async with cls._lock:
|
62 |
-
if not cls._instance or not cls._initialized:
|
63 |
-
instance = cls(
|
64 |
-
persist_directory=persist_directory,
|
65 |
-
collection_name=collection_name,
|
66 |
-
client_settings=client_settings
|
67 |
-
)
|
68 |
-
await instance._initialize()
|
69 |
-
cls._instance = instance
|
70 |
-
return cls._instance, cls._instance._embedding_model
|
71 |
|
72 |
async def _initialize(self) -> None:
|
73 |
"""Initialize the vector store and embedding model"""
|
@@ -77,21 +48,86 @@ class OptimizedVectorStore(ChromaVectorStore):
|
|
77 |
try:
|
78 |
# Load embedding model in background thread
|
79 |
self._embedding_model = await self._load_embedding_model()
|
80 |
-
|
81 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
super().__init__(
|
83 |
embedding_function=self._embedding_model.embed_documents,
|
84 |
persist_directory=self._persist_directory,
|
85 |
-
collection_name=self._collection_name
|
86 |
-
client_settings=self._client_settings
|
87 |
)
|
88 |
-
|
89 |
self._initialized = True
|
90 |
-
|
|
|
|
|
91 |
except Exception as e:
|
92 |
logger.error(f"Error initializing vector store: {str(e)}")
|
93 |
raise
|
94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
async def _load_embedding_model(self) -> HuggingFaceEmbedding:
|
96 |
"""Load embedding model in background thread"""
|
97 |
try:
|
@@ -110,28 +146,52 @@ class OptimizedVectorStore(ChromaVectorStore):
|
|
110 |
"""Create and cache embedding model"""
|
111 |
return HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)
|
112 |
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
if not
|
123 |
-
|
124 |
-
|
|
|
|
|
125 |
)
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
-
# Factory function for getting optimized vector store
|
129 |
async def get_optimized_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
|
130 |
-
"""
|
131 |
-
|
132 |
-
|
133 |
-
Returns:
|
134 |
-
Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
|
135 |
-
The vector store and embedding model instances
|
136 |
-
"""
|
137 |
-
return await OptimizedVectorStore.create()
|
|
|
3 |
from typing import Tuple, Optional, List, Dict, Any, Callable
|
4 |
import concurrent.futures
|
5 |
from functools import lru_cache
|
6 |
+
import chromadb
|
7 |
+
from chromadb.config import Settings
|
8 |
+
import shutil
|
9 |
+
import os
|
10 |
|
11 |
from .base_vectorstore import BaseVectorStore
|
12 |
from .chroma_vectorstore import ChromaVectorStore
|
|
|
14 |
from src.utils.logger import logger
|
15 |
from config.config import settings
|
16 |
|
17 |
+
|
18 |
class OptimizedVectorStore(ChromaVectorStore):
|
|
|
|
|
|
|
|
|
19 |
_instance: Optional['OptimizedVectorStore'] = None
|
20 |
_lock = asyncio.Lock()
|
21 |
_initialized = False
|
|
|
34 |
collection_name: str = "documents",
|
35 |
client_settings: Optional[Dict[str, Any]] = None
|
36 |
):
|
|
|
|
|
|
|
|
|
37 |
if not self._initialized:
|
38 |
self._persist_directory = persist_directory
|
39 |
self._collection_name = collection_name
|
40 |
self._client_settings = client_settings
|
41 |
self._embedding_function = embedding_function
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
async def _initialize(self) -> None:
|
44 |
"""Initialize the vector store and embedding model"""
|
|
|
48 |
try:
|
49 |
# Load embedding model in background thread
|
50 |
self._embedding_model = await self._load_embedding_model()
|
51 |
+
|
52 |
+
# Get embedding dimension
|
53 |
+
test_embedding = self._embedding_model.embed_query("test")
|
54 |
+
required_dim = len(test_embedding)
|
55 |
+
|
56 |
+
# Clean up existing database if dimensions don't match
|
57 |
+
await self._cleanup_if_needed(required_dim)
|
58 |
+
|
59 |
+
# Create ChromaDB client with fresh settings
|
60 |
+
client = chromadb.PersistentClient(
|
61 |
+
path=self._persist_directory,
|
62 |
+
settings=Settings(
|
63 |
+
allow_reset=True,
|
64 |
+
is_persistent=True,
|
65 |
+
anonymized_telemetry=False
|
66 |
+
)
|
67 |
+
)
|
68 |
+
|
69 |
+
# Create new collection with correct dimensions
|
70 |
+
collection = client.create_collection(
|
71 |
+
name=self._collection_name,
|
72 |
+
metadata={
|
73 |
+
"hnsw:space": "cosine",
|
74 |
+
"hnsw:dim": required_dim
|
75 |
+
}
|
76 |
+
)
|
77 |
+
|
78 |
+
# Initialize parent class
|
79 |
super().__init__(
|
80 |
embedding_function=self._embedding_model.embed_documents,
|
81 |
persist_directory=self._persist_directory,
|
82 |
+
collection_name=self._collection_name
|
|
|
83 |
)
|
84 |
+
|
85 |
self._initialized = True
|
86 |
+
logger.info(
|
87 |
+
f"Successfully initialized vector store with dimension {required_dim}")
|
88 |
+
|
89 |
except Exception as e:
|
90 |
logger.error(f"Error initializing vector store: {str(e)}")
|
91 |
raise
|
92 |
|
93 |
+
async def _cleanup_if_needed(self, required_dim: int) -> None:
|
94 |
+
"""Clean up existing database if dimensions don't match"""
|
95 |
+
try:
|
96 |
+
# Create temporary client to check existing collection
|
97 |
+
temp_client = chromadb.PersistentClient(
|
98 |
+
path=self._persist_directory,
|
99 |
+
settings=Settings(allow_reset=True, is_persistent=True)
|
100 |
+
)
|
101 |
+
|
102 |
+
try:
|
103 |
+
# Try to get existing collection
|
104 |
+
collection = temp_client.get_collection(self._collection_name)
|
105 |
+
current_dim = collection.metadata.get(
|
106 |
+
"hnsw:dim") if collection.metadata else None
|
107 |
+
|
108 |
+
if current_dim != required_dim:
|
109 |
+
logger.info(
|
110 |
+
f"Dimension mismatch: current={current_dim}, required={required_dim}")
|
111 |
+
# Reset the database: chromadb's reset() deletes all collections and entries (it does not merely close the connection)
|
112 |
+
temp_client.reset()
|
113 |
+
|
114 |
+
# Remove the entire directory
|
115 |
+
if os.path.exists(self._persist_directory):
|
116 |
+
shutil.rmtree(self._persist_directory)
|
117 |
+
logger.info(
|
118 |
+
f"Removed existing database at {self._persist_directory}")
|
119 |
+
|
120 |
+
# Recreate empty directory
|
121 |
+
os.makedirs(self._persist_directory, exist_ok=True)
|
122 |
+
|
123 |
+
except ValueError:
|
124 |
+
# Collection doesn't exist, no cleanup needed
|
125 |
+
pass
|
126 |
+
|
127 |
+
except Exception as e:
|
128 |
+
logger.error(f"Error during cleanup: {str(e)}")
|
129 |
+
raise
|
130 |
+
|
131 |
async def _load_embedding_model(self) -> HuggingFaceEmbedding:
|
132 |
"""Load embedding model in background thread"""
|
133 |
try:
|
|
|
146 |
"""Create and cache embedding model"""
|
147 |
return HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)
|
148 |
|
149 |
+
@classmethod
|
150 |
+
async def create(
|
151 |
+
cls,
|
152 |
+
persist_directory: str = settings.CHROMA_PATH,
|
153 |
+
collection_name: str = "documents",
|
154 |
+
client_settings: Optional[Dict[str, Any]] = None
|
155 |
+
) -> Tuple['OptimizedVectorStore', HuggingFaceEmbedding]:
|
156 |
+
"""Asynchronously create or get instance"""
|
157 |
+
async with cls._lock:
|
158 |
+
if not cls._instance or not cls._initialized:
|
159 |
+
instance = cls(
|
160 |
+
persist_directory=persist_directory,
|
161 |
+
collection_name=collection_name,
|
162 |
+
client_settings=client_settings
|
163 |
)
|
164 |
+
await instance._initialize()
|
165 |
+
cls._instance = instance
|
166 |
+
return cls._instance, cls._instance._embedding_model
|
167 |
+
|
168 |
+
# Override parent class methods to ensure initialization
|
169 |
+
def add_documents(self, *args, **kwargs):
|
170 |
+
if not self._initialized:
|
171 |
+
raise RuntimeError("Vector store not initialized")
|
172 |
+
return super().add_documents(*args, **kwargs)
|
173 |
+
|
174 |
+
def similarity_search(self, *args, **kwargs):
|
175 |
+
if not self._initialized:
|
176 |
+
raise RuntimeError("Vector store not initialized")
|
177 |
+
return super().similarity_search(*args, **kwargs)
|
178 |
+
|
179 |
+
def get_document_chunks(self, *args, **kwargs):
|
180 |
+
if not self._initialized:
|
181 |
+
raise RuntimeError("Vector store not initialized")
|
182 |
+
return super().get_document_chunks(*args, **kwargs)
|
183 |
+
|
184 |
+
def delete_document(self, *args, **kwargs):
|
185 |
+
if not self._initialized:
|
186 |
+
raise RuntimeError("Vector store not initialized")
|
187 |
+
return super().delete_document(*args, **kwargs)
|
188 |
+
|
189 |
+
def get_all_documents(self, *args, **kwargs):
|
190 |
+
if not self._initialized:
|
191 |
+
raise RuntimeError("Vector store not initialized")
|
192 |
+
return super().get_all_documents(*args, **kwargs)
|
193 |
+
|
194 |
|
|
|
195 |
async def get_optimized_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
|
196 |
+
"""Get or create an optimized vector store instance"""
|
197 |
+
return await OptimizedVectorStore.create()
|
|
|
|
|
|
|
|
|
|
|
|