TalatMasood committed on
Commit
6082154
·
1 Parent(s): 7e7ab71

Updating chroma db to be singleton class

Browse files
src/__pycache__/main.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
 
src/main.py CHANGED
@@ -46,6 +46,8 @@ import random
46
  from typing import List
47
  from src.utils.logger import logger
48
  from config.config import settings
 
 
49
 
50
  os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
51
  # os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
@@ -678,6 +680,33 @@ async def debug_config():
678
  return debug_info
679
 
680
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
  @app.post("/admin/cleanup")
682
  async def cleanup_databases(
683
  include_files: bool = True,
 
46
  from typing import List
47
  from src.utils.logger import logger
48
  from config.config import settings
49
+ from src.vectorstores.chroma_manager import ChromaManager
50
+ from src.utils.llm_utils import cleanup_vectorstore
51
 
52
  os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
53
  # os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
 
680
  return debug_info
681
 
682
 
683
@app.post("/admin/reset-chroma")
async def reset_chroma_db(api_key: str = Depends(verify_api_key)):
    """
    Reset ChromaDB completely - use with caution.

    Delegates to ``cleanup_vectorstore`` to clean up all vector store
    resources and reset ChromaDB.

    Args:
        api_key (str): Value produced by the ``verify_api_key`` dependency.

    Returns:
        dict: Status payload that echoes ``settings.CHROMA_PATH``.

    Raises:
        HTTPException: 500 when the reset fails; the original error is kept
            as the exception cause.
    """
    # Keep the try body to the one call that can fail so that building the
    # response can never be misreported as a reset failure.
    try:
        await cleanup_vectorstore()
    except Exception as e:
        logger.error(f"Error resetting ChromaDB: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to reset ChromaDB: {str(e)}"
        ) from e

    return {
        "status": "success",
        "message": "ChromaDB reset complete. You may need to restart the application for changes to take effect.",
        "details": {
            "chroma_path": settings.CHROMA_PATH
        }
    }
708
+
709
+
710
  @app.post("/admin/cleanup")
711
  async def cleanup_databases(
712
  include_files: bool = True,
src/utils/__pycache__/drive_document_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/drive_document_processor.cpython-312.pyc and b/src/utils/__pycache__/drive_document_processor.cpython-312.pyc differ
 
src/utils/__pycache__/llm_utils.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/llm_utils.cpython-312.pyc and b/src/utils/__pycache__/llm_utils.cpython-312.pyc differ
 
src/utils/llm_utils.py CHANGED
@@ -2,6 +2,9 @@
2
  from fastapi import HTTPException
3
  from typing import Tuple
4
 
 
 
 
5
  from src.llms.openai_llm import OpenAILanguageModel
6
  from src.llms.ollama_llm import OllamaLanguageModel
7
  from src.llms.bert_llm import BERTLanguageModel
@@ -9,20 +12,26 @@ from src.llms.falcon_llm import FalconLanguageModel
9
  from src.llms.llama_llm import LlamaLanguageModel
10
  from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
11
  from src.vectorstores.chroma_vectorstore import ChromaVectorStore
12
- from src.vectorstores.optimized_vectorstore import get_optimized_vector_store
13
  from src.utils.logger import logger
14
  from config.config import settings
15
 
 
 
 
 
 
 
16
  def get_llm_instance(provider: str):
17
  """
18
  Get LLM instance based on provider
19
-
20
  Args:
21
  provider (str): Name of the LLM provider
22
-
23
  Returns:
24
  BaseLLM: Instance of the LLM
25
-
26
  Raises:
27
  ValueError: If provider is not supported
28
  """
@@ -33,29 +42,84 @@ def get_llm_instance(provider: str):
33
  'falcon': lambda: FalconLanguageModel(),
34
  'llama': lambda: LlamaLanguageModel(),
35
  }
36
-
37
  if provider not in llm_map:
38
  raise ValueError(f"Unsupported LLM provider: {provider}")
39
  return llm_map[provider]()
40
 
 
41
  async def get_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
42
  """
43
- Get vector store and embedding model instances
44
- Uses optimized implementation while maintaining backward compatibility
45
-
46
  Returns:
47
  Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
48
  Vector store and embedding model instances
49
  """
50
- try:
51
- return await get_optimized_vector_store()
52
- except Exception as e:
53
- logger.error(f"Error getting optimized vector store: {str(e)}")
54
- # Fallback to original implementation if optimization fails
55
- logger.warning("Falling back to standard vector store implementation")
56
- embedding = HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)
57
- vector_store = ChromaVectorStore(
58
- embedding_function=embedding.embed_documents,
59
- persist_directory=settings.CHROMA_PATH
60
- )
61
- return vector_store, embedding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from fastapi import HTTPException
3
  from typing import Tuple
4
 
5
+ import asyncio
6
+ import logging
7
+
8
  from src.llms.openai_llm import OpenAILanguageModel
9
  from src.llms.ollama_llm import OllamaLanguageModel
10
  from src.llms.bert_llm import BERTLanguageModel
 
12
  from src.llms.llama_llm import LlamaLanguageModel
13
  from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
14
  from src.vectorstores.chroma_vectorstore import ChromaVectorStore
15
+ from src.vectorstores.chroma_manager import ChromaManager
16
  from src.utils.logger import logger
17
  from config.config import settings
18
 
19
+ # Global vector store instance for reuse
20
+ _vector_store = None
21
+ _embedding_model = None
22
+ _vs_lock = asyncio.Lock()
23
+
24
+
25
  def get_llm_instance(provider: str):
26
  """
27
  Get LLM instance based on provider
28
+
29
  Args:
30
  provider (str): Name of the LLM provider
31
+
32
  Returns:
33
  BaseLLM: Instance of the LLM
34
+
35
  Raises:
36
  ValueError: If provider is not supported
37
  """
 
42
  'falcon': lambda: FalconLanguageModel(),
43
  'llama': lambda: LlamaLanguageModel(),
44
  }
45
+
46
  if provider not in llm_map:
47
  raise ValueError(f"Unsupported LLM provider: {provider}")
48
  return llm_map[provider]()
49
 
50
+
51
async def get_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
    """
    Get the shared vector store and embedding model, creating them on first use.

    The pair is cached in module-level globals and guarded by ``_vs_lock``.
    The globals are only assigned after the store has been fully initialized,
    so a failed attempt never leaves a half-built store behind for the next
    caller to pick up.

    Returns:
        Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
            Vector store and embedding model instances

    Raises:
        HTTPException: 500 when the embedding model, the ChromaDB client or
            the vector store cannot be initialized.
    """
    global _vector_store, _embedding_model

    async with _vs_lock:
        if _vector_store is not None and _embedding_model is not None:
            return _vector_store, _embedding_model

        try:
            # Load embedding model
            embedding_model = HuggingFaceEmbedding(
                model_name=settings.EMBEDDING_MODEL)
            logger.info(f"Loaded embedding model: {settings.EMBEDDING_MODEL}")

            # Get ChromaDB client through the manager
            try:
                client = await ChromaManager.get_client(
                    persist_directory=settings.CHROMA_PATH,
                    reset_if_needed=True
                )
                logger.info("Successfully initialized ChromaDB client")
            except Exception as e:
                logger.error(f"Error getting ChromaDB client: {str(e)}")

                # Last resort: reset_chroma removes the persist directory,
                # so everything stored there is lost before the retry.
                await ChromaManager.reset_chroma(settings.CHROMA_PATH)
                client = await ChromaManager.get_client(
                    persist_directory=settings.CHROMA_PATH
                )
                logger.info("Recreated ChromaDB client after reset")

            # Create and initialize the vector store in a local first
            vector_store = ChromaVectorStore(
                embedding_function=embedding_model.embed_documents,
                persist_directory=settings.CHROMA_PATH,
                collection_name="documents",
                client=client
            )
            await vector_store.initialize()
            logger.info("Vector store successfully initialized")

        except Exception as e:
            logger.error(f"Error initializing vector store: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to initialize vector store: {str(e)}"
            ) from e

        # Publish only a fully initialized pair.
        _vector_store = vector_store
        _embedding_model = embedding_model
        return _vector_store, _embedding_model
108
+
109
+
110
async def cleanup_vectorstore():
    """
    Cleanup and reset vector store resources.

    Drops the cached vector store and embedding model, forces a garbage
    collection pass, and resets ChromaDB at ``settings.CHROMA_PATH``. The
    whole sequence runs while holding ``_vs_lock`` so that
    ``get_vector_store`` cannot rebuild the store in the middle of a reset.

    Raises:
        Exception: Whatever ``ChromaManager.reset_chroma`` raises.
    """
    global _vector_store, _embedding_model

    import gc

    async with _vs_lock:
        _vector_store = None
        _embedding_model = None

        # Force garbage collection
        gc.collect()

        # Reset ChromaDB completely
        await ChromaManager.reset_chroma(settings.CHROMA_PATH)
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
 
src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc differ
 
src/vectorstores/chroma_manager.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/vectorstores/chroma_manager.py
2
+ """
3
+ ChromaDB connection manager to ensure consistent settings and connection handling
4
+ """
5
+
6
+ import os
7
+ import shutil
8
+ import asyncio
9
+ import logging
10
+ import chromadb
11
+ from chromadb.config import Settings
12
+ from typing import Optional, Dict, Any
13
+
14
+ # Global connection lock and instance
15
+ _instance_lock = asyncio.Lock()
16
+ _chroma_instance = None
17
+ _collection_lock = asyncio.Lock()
18
+ _collection_instances = {}
19
+
20
+
21
class ChromaManager:
    """Singleton manager for ChromaDB connections.

    All state lives in module-level globals: one shared client
    (``_chroma_instance``) and a cache of collections keyed by name
    (``_collection_instances``), each guarded by its own ``asyncio.Lock``.
    """

    @staticmethod
    async def get_client(
        persist_directory: str,
        reset_if_needed: bool = False
    ) -> chromadb.PersistentClient:
        """
        Get a shared ChromaDB client with consistent settings

        Once a client has been created it is returned for every later call,
        whatever ``persist_directory`` is passed.

        Args:
            persist_directory (str): Directory to persist ChromaDB
            reset_if_needed (bool): Whether to reset the database if connection fails

        Returns:
            chromadb.PersistentClient: Shared client instance

        Raises:
            ValueError: If the client cannot be created and no reset was
                requested (or the error is not an "already exists" conflict).
        """
        global _chroma_instance

        async with _instance_lock:
            if _chroma_instance is not None:
                return _chroma_instance

            # Built outside the try so the retry path can always reuse it.
            client_settings = Settings(
                allow_reset=True,
                anonymized_telemetry=False,
                is_persistent=True
            )

            try:
                _chroma_instance = chromadb.PersistentClient(
                    path=persist_directory,
                    settings=client_settings
                )
            except ValueError as e:
                if not ("already exists" in str(e) and reset_if_needed):
                    raise

                logging.warning(
                    f"ChromaDB instance exists with different settings. Attempting reset: {str(e)}")
                # reset_chroma deletes the persist directory: stored data is lost.
                await ChromaManager.reset_chroma(persist_directory)

                # Try again after reset
                _chroma_instance = chromadb.PersistentClient(
                    path=persist_directory,
                    settings=client_settings
                )
                return _chroma_instance

            logging.info(
                f"Successfully created ChromaDB client at {persist_directory}")
            return _chroma_instance

    @staticmethod
    async def get_or_create_collection(
        client: chromadb.PersistentClient,
        collection_name: str,
        embedding_dimension: int = 1024
    ):
        """
        Get or create a collection with proper error handling

        Collections are cached by name only, so the first client used for a
        given name determines the cached collection until ``reset_chroma``
        clears the cache.

        Args:
            client (chromadb.PersistentClient): ChromaDB client
            collection_name (str): Name of the collection
            embedding_dimension (int): Dimension of embeddings. Accepted for
                interface compatibility; it is not used when the collection
                is created.

        Returns:
            Collection: ChromaDB collection
        """
        async with _collection_lock:
            cached = _collection_instances.get(collection_name)
            if cached is not None:
                return cached

            try:
                # Try to get existing collection
                collection = client.get_collection(
                    name=collection_name,
                    embedding_function=None
                )
                logging.info(f"Found existing collection: {collection_name}")
            except Exception as e:
                logging.info(
                    f"Collection {collection_name} does not exist, creating new one: {str(e)}")

                # Create new collection with minimal metadata
                # (the "hnsw:dim" parameter is deliberately not passed)
                try:
                    collection = client.create_collection(
                        name=collection_name,
                        metadata={"hnsw:space": "cosine"}
                    )
                except Exception as create_error:
                    # If that fails too, try with no metadata
                    logging.warning(
                        f"Error creating collection with metadata: {str(create_error)}")
                    collection = client.create_collection(
                        name=collection_name
                    )

            _collection_instances[collection_name] = collection
            return collection

    @staticmethod
    def _clear_chroma_system_cache() -> None:
        """Best-effort drop of chromadb's in-process client/system cache.

        NOTE(review): relies on ``SharedSystemClient.clear_system_cache``;
        confirm it exists in the pinned chromadb version. When it is missing
        the reset continues without it.
        """
        try:
            try:
                from chromadb.api.shared_system_client import SharedSystemClient
            except ImportError:
                from chromadb.api.client import SharedSystemClient
            SharedSystemClient.clear_system_cache()
        except (ImportError, AttributeError) as cache_error:
            logging.warning(
                f"Could not clear ChromaDB system cache: {str(cache_error)}")

    @staticmethod
    async def reset_chroma(persist_directory: str):
        """
        Reset ChromaDB completely by removing the directory

        This does not take ``_instance_lock``: ``get_client`` calls it while
        already holding that lock, and ``asyncio.Lock`` is not re-entrant.

        Args:
            persist_directory (str): Directory to remove

        Raises:
            Exception: Re-raised after logging if the directory cannot be
                removed or recreated.
        """
        global _chroma_instance, _collection_instances

        # Clear global instances first
        _chroma_instance = None
        _collection_instances = {}

        try:
            # Presumably a client created after the reset would otherwise be
            # served from chromadb's cache and keep pointing at deleted
            # files - verify against the chromadb version in use.
            ChromaManager._clear_chroma_system_cache()

            # Force garbage collection to release file handles
            import gc
            gc.collect()

            # Remove the entire directory
            if os.path.exists(persist_directory):
                shutil.rmtree(persist_directory)
                logging.info(
                    f"Removed ChromaDB directory: {persist_directory}")

            # Recreate empty directory
            os.makedirs(persist_directory, exist_ok=True)
            logging.info(
                f"Created fresh ChromaDB directory: {persist_directory}")

        except Exception as e:
            logging.error(f"Error resetting ChromaDB: {str(e)}")
            raise
src/vectorstores/chroma_settings.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/vectorstores/chroma_settings.py
2
+ """
3
+ Shared settings for ChromaDB to ensure consistency across the application
4
+ """
5
+
6
+ import chromadb
7
+
8
+
9
def get_chroma_settings():
    """Build the ChromaDB settings object shared across the application."""
    options = {
        "allow_reset": True,
        "is_persistent": True,
        "anonymized_telemetry": False,
    }
    return chromadb.Settings(**options)
src/vectorstores/chroma_vectorstore.py CHANGED
@@ -1,10 +1,12 @@
1
  # src/vectorstores/chroma_vectorstore.py
 
2
  import chromadb
3
  from typing import List, Callable, Any, Dict, Optional
4
- from chromadb.config import Settings
5
  import logging
 
6
 
7
  from .base_vectorstore import BaseVectorStore
 
8
 
9
 
10
  class ChromaVectorStore(BaseVectorStore):
@@ -13,7 +15,8 @@ class ChromaVectorStore(BaseVectorStore):
13
  embedding_function: Callable[[List[str]], List[List[float]]],
14
  persist_directory: str = './chroma_db',
15
  collection_name: str = "documents",
16
- client_settings: Optional[Dict[str, Any]] = None
 
17
  ):
18
  """
19
  Initialize Chroma Vector Store
@@ -23,25 +26,78 @@ class ChromaVectorStore(BaseVectorStore):
23
  persist_directory (str): Directory to persist the vector store
24
  collection_name (str): Name of the collection to use
25
  client_settings (Optional[Dict[str, Any]]): Additional settings for ChromaDB client
 
26
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  try:
28
- settings = Settings(
29
- persist_directory=persist_directory,
30
- **(client_settings or {})
31
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- self.client = chromadb.PersistentClient(settings=settings)
34
- self.collection = self.client.get_or_create_collection(
35
- name=collection_name,
36
- # Using cosine similarity by default
37
- metadata={"hnsw:space": "cosine"}
38
- )
39
- self.embedding_function = embedding_function
40
  except Exception as e:
41
- logging.error(f"Error initializing ChromaDB: {str(e)}")
 
 
 
 
42
  raise
43
 
44
- def add_documents(
 
 
 
 
 
45
  self,
46
  documents: List[str],
47
  embeddings: Optional[List[List[float]]] = None,
@@ -49,7 +105,7 @@ class ChromaVectorStore(BaseVectorStore):
49
  ids: Optional[List[str]] = None
50
  ) -> None:
51
  """
52
- Add documents to the vector store
53
 
54
  Args:
55
  documents (List[str]): List of document texts
@@ -57,48 +113,141 @@ class ChromaVectorStore(BaseVectorStore):
57
  metadatas (Optional[List[Dict[str, Any]]]): Metadata for each document
58
  ids (Optional[List[str]]): Custom IDs for the documents
59
  """
60
- try:
61
- if not documents:
62
- logging.warning("No documents provided to add_documents")
63
- return
 
 
 
 
 
64
 
65
- if not embeddings:
 
 
 
 
 
66
  embeddings = self.embedding_function(documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- if len(documents) != len(embeddings):
69
- raise ValueError(
70
- "Number of documents and embeddings must match")
71
-
72
- # Use provided IDs or generate them
73
- doc_ids = ids if ids is not None else [
74
- f"doc_{i}" for i in range(len(documents))]
75
-
76
- # Prepare add parameters
77
- add_params = {
78
- "documents": documents,
79
- "embeddings": embeddings,
80
- "ids": doc_ids
81
- }
82
-
83
- # Only include metadatas if provided
84
- if metadatas is not None:
85
- if len(metadatas) != len(documents):
86
- raise ValueError(
87
- "Number of documents and metadatas must match")
88
- add_params["metadatas"] = metadatas
89
-
90
- self.collection.add(**add_params)
91
  except Exception as e:
92
- logging.error(f"Error adding documents to ChromaDB: {str(e)}")
93
- raise
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- def similarity_search(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  self,
97
  query_embedding: List[float],
98
  top_k: int = 3,
99
  **kwargs
100
  ) -> List[Dict[str, Any]]:
101
- """Perform similarity search with improved chunk handling"""
 
 
 
 
102
  try:
103
  # Get more initial results to account for sequential chunks
104
  results = self.collection.query(
@@ -177,19 +326,122 @@ class ChromaVectorStore(BaseVectorStore):
177
  logging.error(f"Error in similarity search: {str(e)}")
178
  raise
179
 
180
- def get_all_documents(
181
  self,
182
- include_embeddings: bool = False
 
 
183
  ) -> List[Dict[str, Any]]:
184
  """
185
- Retrieve all documents from the vector store
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- Args:
188
- include_embeddings (bool): Whether to include embeddings in the response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
- Returns:
191
- List[Dict[str, Any]]: List of documents with their IDs and optionally embeddings
 
 
192
  """
 
 
 
 
193
  try:
194
  include = ["documents", "metadatas"]
195
  if include_embeddings:
@@ -227,16 +479,70 @@ class ChromaVectorStore(BaseVectorStore):
227
  f"Error retrieving documents from ChromaDB: {str(e)}")
228
  raise
229
 
230
- def get_document_chunks(self, document_id: str) -> List[Dict[str, Any]]:
 
 
 
231
  """
232
- Retrieve all chunks for a specific document
233
-
234
- Args:
235
- document_id (str): ID of the document to retrieve chunks for
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- Returns:
238
- List[Dict[str, Any]]: List of document chunks with their metadata
239
  """
 
 
 
 
240
  try:
241
  results = self.collection.get(
242
  where={"document_id": document_id},
@@ -263,18 +569,53 @@ class ChromaVectorStore(BaseVectorStore):
263
  logging.error(f"Error retrieving document chunks: {str(e)}")
264
  raise
265
 
266
- def delete_document(self, document_id: str) -> None:
 
 
267
  """
268
- Delete all chunks associated with a document_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
- Args:
271
- document_id (str): ID of the document to delete
272
  """
 
 
 
 
273
  try:
274
  # Get all chunks with the given document_id
275
  results = self.collection.get(
276
  where={"document_id": document_id},
277
- include=["metadatas"]
278
  )
279
 
280
  if not results or 'ids' not in results:
@@ -282,11 +623,28 @@ class ChromaVectorStore(BaseVectorStore):
282
  return
283
 
284
  # Delete all chunks associated with the document
285
- chunk_ids = [
286
- f"{document_id}-chunk-{i}" for i in range(len(results['metadatas']))]
287
- self.collection.delete(ids=chunk_ids)
288
 
289
  except Exception as e:
290
  logging.error(
291
  f"Error deleting document {document_id} from ChromaDB: {str(e)}")
292
  raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # src/vectorstores/chroma_vectorstore.py
2
+ from pathlib import Path
3
  import chromadb
4
  from typing import List, Callable, Any, Dict, Optional
 
5
  import logging
6
+ import asyncio
7
 
8
  from .base_vectorstore import BaseVectorStore
9
+ from .chroma_manager import ChromaManager
10
 
11
 
12
  class ChromaVectorStore(BaseVectorStore):
 
15
  embedding_function: Callable[[List[str]], List[List[float]]],
16
  persist_directory: str = './chroma_db',
17
  collection_name: str = "documents",
18
+ client_settings: Optional[Dict[str, Any]] = None,
19
+ client=None # Allow passing an existing client
20
  ):
21
  """
22
  Initialize Chroma Vector Store
 
26
  persist_directory (str): Directory to persist the vector store
27
  collection_name (str): Name of the collection to use
28
  client_settings (Optional[Dict[str, Any]]): Additional settings for ChromaDB client
29
+ client: Optional existing ChromaDB client to use
30
  """
31
+ self.embedding_function = embedding_function
32
+ self.persist_directory = persist_directory
33
+ self.collection_name = collection_name
34
+ self.client = client # Store client for later initialization
35
+
36
+ # Will be populated during async initialization
37
+ self.collection = None
38
+ self.initialized = False
39
+
40
async def initialize(self):
    """Asynchronously initialize the vector store with enhanced error handling.

    Obtains a client through ``ChromaManager`` when none was supplied, then
    gets or creates the collection. If that fails, the collection is deleted
    (best effort) and recreated directly on the client. Does nothing when the
    store is already initialized.

    Raises:
        ValueError: If no client or collection could be obtained.
        Exception: Any other failure is logged and re-raised after the
            store has been marked as uninitialized.
    """
    if self.initialized:
        return

    try:
        # Get client via manager if not provided
        if self.client is None:
            self.client = await ChromaManager.get_client(self.persist_directory)

        # Validate client (identity check: never rely on object truthiness)
        if self.client is None:
            raise ValueError("Failed to obtain ChromaDB client")

        # Get or create collection with more robust handling
        try:
            self.collection = await ChromaManager.get_or_create_collection(
                client=self.client,
                collection_name=self.collection_name,
                embedding_dimension=1024  # Default for most models
            )
        except Exception as collection_error:
            logging.error(
                f"Error creating collection: {str(collection_error)}")

            # Try to reset and recreate. Deleting is best effort (the
            # collection may simply not exist), but the reason is logged
            # instead of being silently discarded.
            try:
                self.client.delete_collection(self.collection_name)
            except Exception as delete_error:
                logging.warning(
                    f"Could not delete collection {self.collection_name}: {str(delete_error)}")

            # Recreate collection
            self.collection = self.client.create_collection(
                name=self.collection_name,
                metadata={"hnsw:space": "cosine"}
            )

        # Additional validation
        if self.collection is None:
            raise ValueError(
                "Failed to create or obtain ChromaDB collection")

        self.initialized = True
        logging.info(
            f"ChromaVectorStore initialized with collection: {self.collection_name}")

    except Exception as e:
        logging.error(
            f"Critical error initializing ChromaVectorStore: {str(e)}")
        # Reset initialization state
        self.initialized = False
        self.collection = None
        raise
94
 
95
async def _ensure_initialized(self):
    """Run ``initialize`` first if the store has not been initialized yet."""
    if self.initialized:
        return
    await self.initialize()
99
+
100
async def add_documents_async(
    self,
    documents: List[str],
    embeddings: Optional[List[List[float]]] = None,
    metadatas: Optional[List[Dict[str, Any]]] = None,
    ids: Optional[List[str]] = None
) -> None:
    """
    Add documents asynchronously with enhanced error handling

    Args:
        documents (List[str]): List of document texts
        embeddings (Optional[List[List[float]]]): Pre-computed embeddings;
            computed with ``self.embedding_function`` when omitted
        metadatas (Optional[List[Dict[str, Any]]]): Metadata for each document
        ids (Optional[List[str]]): Custom IDs for the documents

    Raises:
        ValueError: If the embeddings or metadatas do not match the number
            of documents.
        Exception: Any failure while embedding or adding documents is logged
            and re-raised, so callers never mistake a failed insert for a
            successful one.
    """
    await self._ensure_initialized()

    if not documents:
        logging.warning("No documents provided to add_documents")
        return

    # Validate input lists
    if embeddings and len(documents) != len(embeddings):
        raise ValueError("Number of documents and embeddings must match")

    if metadatas and len(documents) != len(metadatas):
        raise ValueError("Number of documents and metadatas must match")

    # Generate embeddings if not provided
    if not embeddings:
        try:
            embeddings = self.embedding_function(documents)
        except Exception as e:
            logging.error(f"Error generating embeddings: {str(e)}")
            raise
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match")

    # Use provided IDs or generate them
    # NOTE(review): generated IDs restart at doc_0 on every call, so separate
    # batches added without explicit ids share the same IDs - confirm that
    # callers always pass ids.
    if not ids:
        ids = [f"doc_{i}" for i in range(len(documents))]

    # Ensure collection exists and is usable
    if self.collection is None:
        logging.error("ChromaDB collection is not initialized")
        # initialize() returns early while the flag is set, so clear it to
        # force a real re-initialization.
        self.initialized = False
        await self.initialize()

    # Prepare add parameters
    add_params = {
        "documents": documents,
        "embeddings": embeddings,
        "ids": ids
    }

    # Add metadatas if provided
    if metadatas is not None:
        add_params["metadatas"] = metadatas

    max_retries = 3
    try:
        # Add documents to collection with retry mechanism
        for attempt in range(1, max_retries + 1):
            try:
                # Clear any cached state
                import gc
                gc.collect()

                # Attempt to add documents
                self.collection.add(**add_params)
            except (StopIteration, RuntimeError) as retry_error:
                if attempt == max_retries:
                    logging.error(
                        f"Failed to add documents after {max_retries} attempts")
                    raise
                logging.warning(
                    f"Retry attempt {attempt}: {str(retry_error)}")
                # Small delay between retries
                await asyncio.sleep(0.5)
            else:
                logging.info(
                    f"Successfully added {len(documents)} documents")
                return

    except Exception as e:
        logging.error(
            f"Unexpected error adding documents to ChromaDB: {str(e)}")

        # Additional debugging information
        try:
            logging.info(f"Collection status: {self.collection}")
            logging.info(f"Documents count: {len(documents)}")
            logging.info(
                f"Embeddings count: {len(add_params.get('embeddings', []))}")
            logging.info(
                f"Metadatas count: {len(add_params.get('metadatas', []))}")
            logging.info(f"IDs count: {len(add_params.get('ids', []))}")
        except Exception as debug_error:
            logging.error(f"Error during debugging: {str(debug_error)}")

        # Propagate the failure instead of swallowing it.
        raise
208
 
209
def add_documents(
    self,
    documents: List[str],
    embeddings: Optional[List[List[float]]] = None,
    metadatas: Optional[List[Dict[str, Any]]] = None,
    ids: Optional[List[str]] = None
) -> None:
    """
    Synchronous wrapper for add_documents_async

    Without a running event loop the call blocks until the documents have
    been added and any error propagates to the caller. When called from
    inside a running loop it cannot block, so the work is scheduled as a
    background task: the call returns immediately and failures are logged
    rather than raised. Async callers should await ``add_documents_async``.

    Args:
        documents (List[str]): List of document texts
        embeddings (Optional[List[List[float]]]): Pre-computed embeddings
        metadatas (Optional[List[Dict[str, Any]]]): Metadata for each document
        ids (Optional[List[str]]): Custom IDs for the documents
    """
    # Only the loop lookup is guarded: a RuntimeError raised by the insert
    # itself must not be mistaken for "no event loop" and trigger a rerun.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    if running_loop is None:
        asyncio.run(self.add_documents_async(
            documents, embeddings, metadatas, ids
        ))
        return

    task = running_loop.create_task(self.add_documents_async(
        documents, embeddings, metadatas, ids
    ))

    # The event loop only keeps weak references to tasks; hold a strong one
    # until the task finishes so it cannot be garbage-collected mid-flight.
    pending = getattr(self, "_pending_add_tasks", None)
    if pending is None:
        pending = set()
        self._pending_add_tasks = pending
    pending.add(task)

    def _on_done(finished):
        pending.discard(finished)
        if finished.cancelled():
            return
        error = finished.exception()
        if error is not None:
            logging.error(
                f"Background add_documents task failed: {str(error)}")

    task.add_done_callback(_on_done)
239
+
240
+ async def similarity_search_async(
241
  self,
242
  query_embedding: List[float],
243
  top_k: int = 3,
244
  **kwargs
245
  ) -> List[Dict[str, Any]]:
246
+ """
247
+ Perform similarity search asynchronously
248
+ """
249
+ await self._ensure_initialized()
250
+
251
  try:
252
  # Get more initial results to account for sequential chunks
253
  results = self.collection.query(
 
326
  logging.error(f"Error in similarity search: {str(e)}")
327
  raise
328
 
329
def similarity_search(
    self,
    query_embedding: List[float],
    top_k: int = 3,
    **kwargs
) -> List[Dict[str, Any]]:
    """
    Synchronous wrapper for similarity_search_async

    Without a running event loop this runs ``similarity_search_async`` to
    completion and lets its errors propagate. Inside a running loop it
    cannot block on a coroutine, so it queries ``self.collection`` directly;
    in that mode ``kwargs`` are ignored and any failure is logged and
    reported as an empty result.

    Args:
        query_embedding (List[float]): Embedding of the query
        top_k (int): Maximum number of results to return
        **kwargs: Forwarded to ``similarity_search_async``

    Returns:
        List[Dict[str, Any]]: Results with ``text``, ``metadata`` and
            ``score`` keys, best score first. Consecutive chunks of the same
            document are merged into one result.
    """
    # Only the loop lookup is guarded: a RuntimeError raised by the search
    # itself must not be mistaken for "no event loop" and trigger a rerun.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        return asyncio.run(
            self.similarity_search_async(query_embedding, top_k, **kwargs)
        )

    # We're in an async context, but need to process directly
    try:
        if self.collection is None:
            raise ValueError("ChromaDB collection is not initialized")

        # Get more initial results to account for sequential chunks
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=max(top_k * 2, 10),
            include=['documents', 'metadatas', 'distances']
        )

        if not results or 'documents' not in results or not results['documents']:
            return []

        documents = results['documents'][0]
        metadatas = results['metadatas'][0]
        distances = results['distances'][0]

        # Group chunks by document_id
        doc_chunks = {}
        for doc, meta, dist in zip(documents, metadatas, distances):
            # A metadata entry may be None; treat it as empty for lookups.
            meta_view = meta or {}
            doc_chunks.setdefault(meta_view.get('document_id'), []).append({
                'text': doc,
                'metadata': meta,
                'score': 1.0 - dist,
                'chunk_index': meta_view.get('chunk_index', 0)
            })

        formatted_results = []

        # Process each document's chunks
        for chunks in doc_chunks.values():
            # Sort chunks by index
            chunks.sort(key=lambda x: x['chunk_index'])

            # Find sequences of consecutive chunks with good scores
            good_sequences = []
            current_sequence = []

            for chunk in chunks:
                if chunk['score'] > 0.3:  # Adjust threshold as needed
                    continues_run = (
                        not current_sequence
                        or chunk['chunk_index'] == current_sequence[-1]['chunk_index'] + 1
                    )
                    if continues_run:
                        current_sequence.append(chunk)
                    else:
                        good_sequences.append(current_sequence)
                        current_sequence = [chunk]
                else:
                    if current_sequence:
                        good_sequences.append(current_sequence)
                    current_sequence = []

            if current_sequence:
                good_sequences.append(current_sequence)

            # Add best sequences to results
            for sequence in good_sequences:
                avg_score = sum(c['score'] for c in sequence) / len(sequence)
                combined_text = ' '.join(c['text'] for c in sequence)

                formatted_results.append({
                    'text': combined_text,
                    'metadata': sequence[0]['metadata'],
                    'score': avg_score
                })

        # Sort by score and return top_k
        formatted_results.sort(key=lambda x: x['score'], reverse=True)
        return formatted_results[:top_k]

    except Exception as e:
        logging.error(
            f"Error in direct similarity search: {str(e)}")
        return []
435
 
436
+ async def get_all_documents_async(
437
+ self,
438
+ include_embeddings: bool = False
439
+ ) -> List[Dict[str, Any]]:
440
  """
441
+ Retrieve all documents asynchronously
442
+ """
443
+ await self._ensure_initialized()
444
+
445
  try:
446
  include = ["documents", "metadatas"]
447
  if include_embeddings:
 
479
  f"Error retrieving documents from ChromaDB: {str(e)}")
480
  raise
481
 
482
def get_all_documents(
    self,
    include_embeddings: bool = False
) -> List[Dict[str, Any]]:
    """
    Synchronous wrapper for get_all_documents_async.

    When no event loop is running, the async implementation is driven to
    completion on the current (or a freshly created) loop. When called
    from inside a running loop the coroutine cannot be awaited here, so
    the collection is queried directly instead; that path does not call
    ``_ensure_initialized`` and is best-effort: any failure is logged and
    an empty list is returned.

    Args:
        include_embeddings: Also request embeddings and attach them to
            each returned document under the ``'embedding'`` key.

    Returns:
        A list of dicts with ``'id'`` and ``'text'`` keys, plus
        ``'embedding'`` / ``'metadata'`` when available.
    """
    # Only the loop lookup is guarded, so a RuntimeError raised by the
    # query itself is not mistaken for "no event loop".
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None

    if loop is None or loop.is_closed():
        # No usable event loop, create a new one
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    if not loop.is_running():
        return loop.run_until_complete(
            self.get_all_documents_async(include_embeddings)
        )

    # We're in an async context, but need to return synchronously:
    # process the results just like in the async version.
    try:
        include = ["documents", "metadatas"]
        if include_embeddings:
            include.append("embeddings")

        results = self.collection.get(include=include)

        if not results or 'documents' not in results:
            return []

        metadatas = results.get('metadatas')
        embeddings = results.get('embeddings') if include_embeddings else None

        documents = []
        for index, text in enumerate(results['documents']):
            doc = {
                'id': str(index),  # Sequential fallback ID
                'text': text,
            }

            # Compare against None: embeddings may be an array type whose
            # truth value is ambiguous.
            if embeddings is not None:
                doc['embedding'] = embeddings[index]

            meta = metadatas[index] if metadatas else None
            if meta:
                doc['metadata'] = meta
                # Use document_id from metadata if available
                if 'document_id' in meta:
                    doc['id'] = meta['document_id']

            documents.append(doc)

        return documents
    except Exception as e:
        # Best-effort path: keep returning [] but leave a trace of why.
        logging.error(
            f"Error retrieving documents from ChromaDB: {str(e)}")
        return []
539
 
540
+ async def get_document_chunks_async(self, document_id: str) -> List[Dict[str, Any]]:
 
541
  """
542
+ Retrieve all chunks for a specific document asynchronously
543
+ """
544
+ await self._ensure_initialized()
545
+
546
  try:
547
  results = self.collection.get(
548
  where={"document_id": document_id},
 
569
  logging.error(f"Error retrieving document chunks: {str(e)}")
570
  raise
571
 
572
def get_document_chunks(self, document_id: str) -> List[Dict[str, Any]]:
    """
    Synchronous wrapper for get_document_chunks_async.

    When no event loop is running, the async implementation is driven to
    completion on the current (or a freshly created) loop. When called
    from inside a running loop the coroutine cannot be awaited here, so
    the collection is queried directly instead; that path does not call
    ``_ensure_initialized`` and is best-effort: any failure is logged and
    an empty list is returned.

    Args:
        document_id: Value matched against the ``document_id`` metadata
            field of the stored chunks.

    Returns:
        A list of dicts with ``'text'`` and ``'metadata'`` keys
        (``'metadata'`` is None when the store returned no metadatas).
    """
    # Only the loop lookup is guarded, so a RuntimeError raised by the
    # query itself is not mistaken for "no event loop".
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None

    if loop is None or loop.is_closed():
        # No usable event loop, create a new one
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    if not loop.is_running():
        return loop.run_until_complete(
            self.get_document_chunks_async(document_id)
        )

    # Running loop: fall back to a direct query, which may fail.
    try:
        results = self.collection.get(
            where={"document_id": document_id},
            include=["documents", "metadatas"]
        )

        metadatas = results.get('metadatas')
        return [
            {
                'text': text,
                'metadata': metadatas[index] if metadatas else None
            }
            for index, text in enumerate(results['documents'])
        ]
    except Exception as e:
        # Best-effort path: keep returning [] but leave a trace of why.
        logging.error(f"Error retrieving document chunks: {str(e)}")
        return []
607
 
608
+ async def delete_document_async(self, document_id: str) -> None:
 
609
  """
610
+ Delete all chunks associated with a document_id asynchronously
611
+ """
612
+ await self._ensure_initialized()
613
+
614
  try:
615
  # Get all chunks with the given document_id
616
  results = self.collection.get(
617
  where={"document_id": document_id},
618
+ include=["ids"]
619
  )
620
 
621
  if not results or 'ids' not in results:
 
623
  return
624
 
625
  # Delete all chunks associated with the document
626
+ self.collection.delete(ids=results['ids'])
 
 
627
 
628
  except Exception as e:
629
  logging.error(
630
  f"Error deleting document {document_id} from ChromaDB: {str(e)}")
631
  raise
632
+
633
def delete_document(self, document_id: str) -> None:
    """
    Synchronous wrapper for delete_document_async.

    When no event loop is running, the deletion is driven to completion
    before returning. When called from inside a running loop, the
    deletion is scheduled as a background task and this method returns
    immediately, i.e. the document may not be deleted yet on return.

    Args:
        document_id: ID of the document whose chunks should be deleted.
    """
    # Only the loop lookup is guarded, so a RuntimeError raised by the
    # deletion itself is not mistaken for "no event loop".
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None

    if loop is None or loop.is_closed():
        # No usable event loop, create a new one
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    if not loop.is_running():
        loop.run_until_complete(self.delete_document_async(document_id))
        return

    # The event loop only keeps weak references to tasks, so hold a strong
    # reference until the task finishes to stop it being garbage-collected
    # mid-execution.
    pending = getattr(self, '_pending_delete_tasks', None)
    if pending is None:
        pending = set()
        self._pending_delete_tasks = pending

    def _on_done(finished: "asyncio.Task") -> None:
        pending.discard(finished)
        if finished.cancelled():
            return
        # Retrieve the exception so it is reported here instead of as
        # "Task exception was never retrieved".
        error = finished.exception()
        if error is not None:
            logging.error(
                f"Background deletion of document {document_id} failed: {str(error)}")

    task = loop.create_task(self.delete_document_async(document_id))
    pending.add(task)
    task.add_done_callback(_on_done)
src/vectorstores/optimized_vectorstore.py CHANGED
@@ -13,6 +13,7 @@ from .chroma_vectorstore import ChromaVectorStore
13
  from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
14
  from src.utils.logger import logger
15
  from config.config import settings
 
16
 
17
 
18
  class OptimizedVectorStore(ChromaVectorStore):
@@ -56,30 +57,21 @@ class OptimizedVectorStore(ChromaVectorStore):
56
  # Clean up existing database if dimensions don't match
57
  await self._cleanup_if_needed(required_dim)
58
 
59
- # Create ChromaDB client with fresh settings
60
  client = chromadb.PersistentClient(
61
  path=self._persist_directory,
62
- settings=Settings(
63
- allow_reset=True,
64
- is_persistent=True,
65
- anonymized_telemetry=False
66
- )
67
  )
68
 
69
  # Create new collection with correct dimensions
70
- collection = client.create_collection(
71
- name=self._collection_name,
72
- metadata={
73
- "hnsw:space": "cosine",
74
- "hnsw:dim": required_dim
75
- }
76
- )
77
 
78
  # Initialize parent class
79
  super().__init__(
80
  embedding_function=self._embedding_model.embed_documents,
81
  persist_directory=self._persist_directory,
82
- collection_name=self._collection_name
 
83
  )
84
 
85
  self._initialized = True
 
13
  from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
14
  from src.utils.logger import logger
15
  from config.config import settings
16
+ from src.vectorstores.chroma_settings import get_chroma_settings
17
 
18
 
19
  class OptimizedVectorStore(ChromaVectorStore):
 
57
  # Clean up existing database if dimensions don't match
58
  await self._cleanup_if_needed(required_dim)
59
 
60
+ # Create ChromaDB client with consistent settings
61
  client = chromadb.PersistentClient(
62
  path=self._persist_directory,
63
+ settings=get_chroma_settings() # Use shared settings function
 
 
 
 
64
  )
65
 
66
  # Create new collection with correct dimensions
67
+ # collection = self._setup_collection(client)
 
 
 
 
 
 
68
 
69
  # Initialize parent class
70
  super().__init__(
71
  embedding_function=self._embedding_model.embed_documents,
72
  persist_directory=self._persist_directory,
73
+ collection_name=self._collection_name,
74
+ client=client # Pass the existing client
75
  )
76
 
77
  self._initialized = True