TalatMasood committed on
Commit acdfaa9 · 1 Parent(s): 1a54bda

Log Google Drive documents in MongoDB, add the source of each document, and make chunks overlap.

Install your driver.txt ADDED
@@ -0,0 +1,19 @@
1
+ 2. Install your driver
2
+ Run the following on the command line
3
+ Note: Use the appropriate Python 3 executable
4
+ python -m pip install "pymongo[srv]"==3.12
5
+
6
+
7
+ View MongoDB Python Driver installation instructions.
8
+ 3. Add your connection string into your application code
9
+ Use this connection string in your application
10
+
11
+
12
+ View full code sample
13
+
14
+
15
+ Show Password
16
+
17
+ mongodb+srv://talat:[email protected]/?retryWrites=true&w=majority&appName=Chatbot
18
+
19
+ The password for talat is included in the connection string for your first-time setup. It will not be shown again after you exit this connect flow.
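
A minimal connection sketch for steps 2–3 above, assuming pymongo[srv] is installed and using an obvious placeholder URI rather than the real credentials:

from pymongo import MongoClient

# Placeholder URI; substitute the Atlas connection string from the connect flow above
uri = "mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority&appName=Chatbot"
client = MongoClient(uri)

# Ping the deployment to confirm the connection works
client.admin.command("ping")
print("Connected to MongoDB Atlas")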
config/__pycache__/config.cpython-312.pyc CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
 
config/config.py CHANGED
@@ -22,6 +22,8 @@ class Settings:
22
  # Anthropic Configuration
23
  ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
24
 
 
 
25
  # Environment Configuration
26
  ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
27
 
 
22
  # Anthropic Configuration
23
  ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
24
 
25
+ # Number of top-ranked chunks to retrieve from the vector store.
26
+ TOP_CHUNKS = int(os.getenv('TOP_CHUNKS', '10'))
27
  # Environment Configuration
28
  ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
29
 
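The new TOP_CHUNKS setting falls back to 10 when the environment variable is unset; a tiny sketch of that behaviour (the value 5 is arbitrary):

import os

os.environ['TOP_CHUNKS'] = '5'                    # e.g. export TOP_CHUNKS=5 before starting the app
assert int(os.getenv('TOP_CHUNKS', '10')) == 5    # falls back to 10 when the variable is unset
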
src/__pycache__/main.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
 
src/agents/__pycache__/rag_agent.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
 
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
 
src/agents/rag_agent.py CHANGED
@@ -10,6 +10,7 @@ from src.utils.conversation_manager import ConversationManager
10
  from src.db.mongodb_store import MongoDBStore
11
  from src.models.rag import RAGResponse
12
  from src.utils.logger import logger
 
13
 
14
 
15
  class RAGAgent(ExcelAwareRAGAgent):
@@ -43,6 +44,50 @@ class RAGAgent(ExcelAwareRAGAgent):
43
  max_messages=max_history_messages
44
  )
45
 
 
 
 
 
 
 
 
 
46
  async def generate_response(
47
  self,
48
  query: str,
@@ -51,9 +96,9 @@ class RAGAgent(ExcelAwareRAGAgent):
51
  max_tokens: Optional[int] = None,
52
  context_docs: Optional[List[str]] = None
53
  ) -> RAGResponse:
54
- """Generate response with specific handling for different query types"""
55
  try:
56
- # First, check if this is an introduction/welcome message query
57
  is_introduction = (
58
  "wants support" in query and
59
  "This is Introduction" in query and
@@ -61,7 +106,6 @@ class RAGAgent(ExcelAwareRAGAgent):
61
  )
62
 
63
  if is_introduction:
64
- # Handle introduction message - no context needed
65
  welcome_message = self._handle_contact_query(query)
66
  return RAGResponse(
67
  response=welcome_message,
@@ -77,8 +121,6 @@ class RAGAgent(ExcelAwareRAGAgent):
77
  conversation_id,
78
  limit=self.conversation_manager.max_messages
79
  )
80
-
81
- # Get relevant history within token limits
82
  history = self.conversation_manager.get_relevant_history(
83
  messages=history,
84
  current_query=query
@@ -94,6 +136,21 @@ class RAGAgent(ExcelAwareRAGAgent):
94
  sources = None
95
  scores = None
96
 
 
 
 
 
 
97
  # Check if we have any relevant context
98
  if not context_docs:
99
  return RAGResponse(
@@ -103,15 +160,6 @@ class RAGAgent(ExcelAwareRAGAgent):
103
  scores=None
104
  )
105
 
106
- # Check if this is an Excel-related query
107
- has_excel_content = any('Sheet:' in doc for doc in context_docs)
108
- if has_excel_content:
109
- try:
110
- context_docs = self._process_excel_context(
111
- context_docs, query)
112
- except Exception as e:
113
- logger.warning(f"Error processing Excel context: {str(e)}")
114
-
115
  # Generate prompt with context and history
116
  augmented_prompt = self.conversation_manager.generate_prompt_with_history(
117
  current_query=query,
@@ -119,7 +167,7 @@ class RAGAgent(ExcelAwareRAGAgent):
119
  context_docs=context_docs
120
  )
121
 
122
- # Generate initial response
123
  response = self.llm.generate(
124
  prompt=augmented_prompt,
125
  temperature=temperature,
@@ -129,19 +177,6 @@ class RAGAgent(ExcelAwareRAGAgent):
129
  # Clean the response
130
  cleaned_response = self._clean_response(response)
131
 
132
- # For Excel queries, enhance the response
133
- if has_excel_content:
134
- try:
135
- enhanced_response = await self.enhance_excel_response(
136
- query=query,
137
- response=cleaned_response,
138
- context_docs=context_docs
139
- )
140
- if enhanced_response:
141
- cleaned_response = enhanced_response
142
- except Exception as e:
143
- logger.warning(f"Error enhancing Excel response: {str(e)}")
144
-
145
  # Return the final response
146
  return RAGResponse(
147
  response=cleaned_response,
@@ -151,7 +186,7 @@ class RAGAgent(ExcelAwareRAGAgent):
151
  )
152
 
153
  except Exception as e:
154
- logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
155
  raise
156
 
157
  def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
@@ -201,8 +236,7 @@ class RAGAgent(ExcelAwareRAGAgent):
201
  async def retrieve_context(
202
  self,
203
  query: str,
204
- conversation_history: Optional[List[Dict]] = None,
205
- top_k: int = 3
206
  ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
207
  """
208
  Retrieve context with conversation history enhancement
@@ -229,7 +263,7 @@ class RAGAgent(ExcelAwareRAGAgent):
229
  # Retrieve similar documents
230
  results = self.vector_store.similarity_search(
231
  query_embedding,
232
- top_k=top_k
233
  )
234
 
235
  # Debug log search results
 
10
  from src.db.mongodb_store import MongoDBStore
11
  from src.models.rag import RAGResponse
12
  from src.utils.logger import logger
13
+ from config.config import settings
14
 
15
 
16
  class RAGAgent(ExcelAwareRAGAgent):
 
44
  max_messages=max_history_messages
45
  )
46
 
47
+ def _extract_markdown_section(self, docs: List[str], section_header: str) -> str:
48
+ """Extract complete section content from markdown documents"""
49
+ combined_text = '\n'.join(docs)
50
+
51
+ section_start = combined_text.find(section_header)
52
+ if section_start == -1:
53
+ return ""
54
+
55
+ next_section = combined_text.find(
56
+ "\n\n**", section_start + len(section_header))
57
+ if next_section == -1:
58
+ section_content = combined_text[section_start:]
59
+ else:
60
+ section_content = combined_text[section_start:next_section]
61
+
62
+ return self._clean_markdown_content(section_content)
63
+
64
+ def _clean_markdown_content(self, content: str) -> str:
65
+ """Clean and format markdown content"""
66
+ lines = content.split('\n')
67
+ seen_lines = set()
68
+ cleaned_lines = []
69
+
70
+ for line in lines:
71
+ # Always keep headers and table formatting
72
+ if '| :----' in line or line.startswith('**'):
73
+ if line not in seen_lines:
74
+ cleaned_lines.append(line)
75
+ seen_lines.add(line)
76
+ continue
77
+
78
+ # Keep table rows and list items
79
+ if line.strip().startswith('|') or line.strip().startswith('-'):
80
+ cleaned_lines.append(line)
81
+ continue
82
+
83
+ # Remove duplicates for other content
84
+ stripped = line.strip()
85
+ if stripped and stripped not in seen_lines:
86
+ cleaned_lines.append(line)
87
+ seen_lines.add(stripped)
88
+
89
+ return '\n'.join(cleaned_lines)
90
+
91
  async def generate_response(
92
  self,
93
  query: str,
 
96
  max_tokens: Optional[int] = None,
97
  context_docs: Optional[List[str]] = None
98
  ) -> RAGResponse:
99
+ """Generate response with improved markdown and conversation handling"""
100
  try:
101
+ # Handle introduction/welcome message queries
102
  is_introduction = (
103
  "wants support" in query and
104
  "This is Introduction" in query and
 
106
  )
107
 
108
  if is_introduction:
 
109
  welcome_message = self._handle_contact_query(query)
110
  return RAGResponse(
111
  response=welcome_message,
 
121
  conversation_id,
122
  limit=self.conversation_manager.max_messages
123
  )
 
 
124
  history = self.conversation_manager.get_relevant_history(
125
  messages=history,
126
  current_query=query
 
136
  sources = None
137
  scores = None
138
 
139
+ # Special handling for markdown section queries
140
+ if "DISCUSSIONS AND ACTION ITEMS" in query.upper():
141
+ section_content = self._extract_markdown_section(
142
+ context_docs,
143
+ "**DISCUSSIONS AND ACTION ITEMS**"
144
+ )
145
+
146
+ if section_content:
147
+ return RAGResponse(
148
+ response=section_content.strip(),
149
+ context_docs=context_docs,
150
+ sources=sources,
151
+ scores=scores
152
+ )
153
+
154
  # Check if we have any relevant context
155
  if not context_docs:
156
  return RAGResponse(
 
160
  scores=None
161
  )
162
 
 
 
 
 
 
163
  # Generate prompt with context and history
164
  augmented_prompt = self.conversation_manager.generate_prompt_with_history(
165
  current_query=query,
 
167
  context_docs=context_docs
168
  )
169
 
170
+ # Generate response
171
  response = self.llm.generate(
172
  prompt=augmented_prompt,
173
  temperature=temperature,
 
177
  # Clean the response
178
  cleaned_response = self._clean_response(response)
179
 
 
 
 
 
 
 
180
  # Return the final response
181
  return RAGResponse(
182
  response=cleaned_response,
 
186
  )
187
 
188
  except Exception as e:
189
+ logger.error(f"Error in RAGAgent: {str(e)}")
190
  raise
191
 
192
  def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
 
236
  async def retrieve_context(
237
  self,
238
  query: str,
239
+ conversation_history: Optional[List[Dict]] = None
 
240
  ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
241
  """
242
  Retrieve context with conversation history enhancement
 
263
  # Retrieve similar documents
264
  results = self.vector_store.similarity_search(
265
  query_embedding,
266
+ top_k=settings.TOP_CHUNKS
267
  )
268
 
269
  # Debug log search results
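
As a standalone illustration of the lookup that _extract_markdown_section performs (the sample text below is made up, and the real method also de-duplicates lines via _clean_markdown_content):

combined = "\n".join([
    "**AGENDA**",
    "| No | Item |",
    "| :---- | :---- |",
    "| 1 | Budget review |",
    "",
    "**DISCUSSIONS AND ACTION ITEMS**",
    "- Review vendor quotes",
    "- Assign owners by Friday",
])

header = "**DISCUSSIONS AND ACTION ITEMS**"
start = combined.find(header)
end = combined.find("\n\n**", start + len(header))
section = combined[start:] if end == -1 else combined[start:end]
print(section)  # the header plus the list that follows it, up to the next '**' section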
src/db/__pycache__/mongodb_store.cpython-312.pyc CHANGED
Binary files a/src/db/__pycache__/mongodb_store.cpython-312.pyc and b/src/db/__pycache__/mongodb_store.cpython-312.pyc differ
 
src/db/mongodb_store.py CHANGED
@@ -4,6 +4,7 @@ from datetime import datetime
4
  from typing import List, Dict, Optional, Any
5
  from bson import ObjectId
6
 
 
7
  class MongoDBStore:
8
  def __init__(self, mongo_uri: str = "mongodb://localhost:27017"):
9
  """Initialize MongoDB connection"""
@@ -20,7 +21,8 @@ class MongoDBStore:
20
  filename: str,
21
  content_type: str,
22
  file_size: int,
23
- url_path: str
 
24
  ) -> str:
25
  """Store document metadata in MongoDB"""
26
  document = {
@@ -29,9 +31,10 @@ class MongoDBStore:
29
  "content_type": content_type,
30
  "file_size": file_size,
31
  "url_path": url_path,
 
32
  "upload_timestamp": datetime.now()
33
  }
34
-
35
  await self.documents.insert_one(document)
36
  return document_id
37
 
@@ -53,7 +56,8 @@ class MongoDBStore:
53
  "content_type": 1,
54
  "file_size": 1,
55
  "url_path": 1,
56
- "upload_timestamp": 1
 
57
  }
58
  )
59
  return await cursor.to_list(length=None)
@@ -62,7 +66,7 @@ class MongoDBStore:
62
  """Delete document from MongoDB"""
63
  result = await self.documents.delete_one({"document_id": document_id})
64
  return result.deleted_count > 0
65
-
66
  async def find_existing_user(
67
  self,
68
  email: str,
@@ -70,11 +74,11 @@ class MongoDBStore:
70
  ) -> Optional[str]:
71
  """
72
  Find existing user by email or phone number
73
-
74
  Args:
75
  email (str): User's email
76
  phone_number (str): User's phone number
77
-
78
  Returns:
79
  Optional[str]: Conversation ID if found, None otherwise
80
  """
@@ -84,7 +88,7 @@ class MongoDBStore:
84
  {"phone_number": phone_number}
85
  ]
86
  })
87
-
88
  return result["conversation_id"] if result else None
89
 
90
  # Conversation and chat history methods
@@ -105,7 +109,7 @@ class MongoDBStore:
105
  full_name (Optional[str]): User's full name
106
  email (Optional[str]): User's email
107
  phone_number (Optional[str]): User's phone number
108
-
109
  Returns:
110
  str: Conversation ID
111
  """
@@ -124,7 +128,7 @@ class MongoDBStore:
124
  conversation["email"] = email
125
  if phone_number:
126
  conversation["phone_number"] = phone_number
127
-
128
  await self.conversations.insert_one(conversation)
129
  return conversation_id
130
 
@@ -200,30 +204,31 @@ class MongoDBStore:
200
  "rating": None
201
  }
202
  result = await self.chat_history.insert_one(assistant_message)
203
-
204
  # Update conversation metadata
205
  await self.conversations.update_one(
206
  {"conversation_id": conversation_id},
207
  {
208
  "$set": {"last_updated": datetime.now()},
209
- "$inc": {"message_count": 2} # Increment by 2 since we store both messages
 
210
  },
211
  upsert=True
212
  )
213
-
214
  return str(result.inserted_id)
215
-
216
  async def get_conversation_history(self, conversation_id: str) -> List[Dict]:
217
  """Retrieve complete conversation history"""
218
  cursor = self.chat_history.find(
219
  {"conversation_id": conversation_id}
220
  ).sort("timestamp", 1)
221
-
222
  history = []
223
  async for document in cursor:
224
  document["_id"] = str(document["_id"])
225
  history.append(document)
226
-
227
  return history
228
 
229
  async def get_recent_messages(
@@ -234,14 +239,15 @@ class MongoDBStore:
234
  """Get most recent messages from conversation"""
235
  cursor = self.chat_history.find(
236
  {"conversation_id": conversation_id}
237
- ).sort("timestamp", -1).limit(limit * 2) # Multiply limit by 2 to account for user-assistant pairs
238
-
 
239
  messages = []
240
  async for doc in cursor:
241
  messages.append(self._format_message(doc))
242
-
243
  return list(reversed(messages))
244
-
245
  async def update_feedback(
246
  self,
247
  conversation_id: str,
@@ -250,20 +256,20 @@ class MongoDBStore:
250
  ) -> bool:
251
  """
252
  Update feedback for a conversation
253
-
254
  Args:
255
  conversation_id (str): Conversation ID
256
  feedback (Optional[str]): Feedback text
257
  rating (Optional[int]): Numeric rating
258
-
259
  Returns:
260
  bool: True if update successful
261
  """
262
  update_fields = {}
263
-
264
  if feedback is not None:
265
  update_fields["feedback"] = feedback
266
-
267
  if rating is not None:
268
  from config.config import settings
269
  formatted_rating = f"{rating}/{settings.MAX_RATING}"
@@ -271,7 +277,7 @@ class MongoDBStore:
271
  "rating": rating, # Store numeric value
272
  "formatted_rating": formatted_rating # Store formatted string
273
  })
274
-
275
  if not update_fields:
276
  return False
277
 
@@ -279,7 +285,7 @@ class MongoDBStore:
279
  {"conversation_id": conversation_id},
280
  {"$set": update_fields}
281
  )
282
-
283
  # Also update conversation metadata
284
  if result.modified_count > 0:
285
  await self.update_conversation_metadata(
@@ -290,7 +296,7 @@ class MongoDBStore:
290
  "formatted_rating": formatted_rating if rating is not None else None
291
  }
292
  )
293
-
294
  return result.modified_count > 0
295
 
296
  async def get_messages_for_summary(
@@ -301,7 +307,7 @@ class MongoDBStore:
301
  cursor = self.chat_history.find(
302
  {"conversation_id": conversation_id}
303
  ).sort("timestamp", 1)
304
-
305
  messages = []
306
  async for doc in cursor:
307
  formatted = self._format_message(doc)
@@ -312,10 +318,9 @@ class MongoDBStore:
312
  'timestamp': formatted['timestamp'],
313
  'sources': formatted['sources']
314
  })
315
-
316
  return messages
317
-
318
-
319
  def _format_message(self, doc: Dict) -> Dict:
320
  """Helper method to format message documents consistently"""
321
  return {
@@ -330,7 +335,7 @@ class MongoDBStore:
330
  "feedback": doc.get("feedback"),
331
  "rating": doc.get("rating")
332
  }
333
-
334
  # Vector store related methods
335
  async def store_vector_metadata(
336
  self,
@@ -345,7 +350,7 @@ class MongoDBStore:
345
  "metadata": metadata,
346
  "created_at": datetime.now()
347
  }
348
-
349
  result = await self.db.vector_metadata.insert_one(vector_metadata)
350
  return str(result.inserted_id)
351
 
@@ -367,4 +372,4 @@ class MongoDBStore:
367
  result = await self.db.vector_metadata.delete_many(
368
  {"document_id": document_id}
369
  )
370
- return result.deleted_count > 0
 
4
  from typing import List, Dict, Optional, Any
5
  from bson import ObjectId
6
 
7
+
8
  class MongoDBStore:
9
  def __init__(self, mongo_uri: str = "mongodb://localhost:27017"):
10
  """Initialize MongoDB connection"""
 
21
  filename: str,
22
  content_type: str,
23
  file_size: int,
24
+ url_path: str,
25
+ source: str
26
  ) -> str:
27
  """Store document metadata in MongoDB"""
28
  document = {
 
31
  "content_type": content_type,
32
  "file_size": file_size,
33
  "url_path": url_path,
34
+ "source": source,
35
  "upload_timestamp": datetime.now()
36
  }
37
+
38
  await self.documents.insert_one(document)
39
  return document_id
40
 
 
56
  "content_type": 1,
57
  "file_size": 1,
58
  "url_path": 1,
59
+ "upload_timestamp": 1,
60
+ "source": 1
61
  }
62
  )
63
  return await cursor.to_list(length=None)
 
66
  """Delete document from MongoDB"""
67
  result = await self.documents.delete_one({"document_id": document_id})
68
  return result.deleted_count > 0
69
+
70
  async def find_existing_user(
71
  self,
72
  email: str,
 
74
  ) -> Optional[str]:
75
  """
76
  Find existing user by email or phone number
77
+
78
  Args:
79
  email (str): User's email
80
  phone_number (str): User's phone number
81
+
82
  Returns:
83
  Optional[str]: Conversation ID if found, None otherwise
84
  """
 
88
  {"phone_number": phone_number}
89
  ]
90
  })
91
+
92
  return result["conversation_id"] if result else None
93
 
94
  # Conversation and chat history methods
 
109
  full_name (Optional[str]): User's full name
110
  email (Optional[str]): User's email
111
  phone_number (Optional[str]): User's phone number
112
+
113
  Returns:
114
  str: Conversation ID
115
  """
 
128
  conversation["email"] = email
129
  if phone_number:
130
  conversation["phone_number"] = phone_number
131
+
132
  await self.conversations.insert_one(conversation)
133
  return conversation_id
134
 
 
204
  "rating": None
205
  }
206
  result = await self.chat_history.insert_one(assistant_message)
207
+
208
  # Update conversation metadata
209
  await self.conversations.update_one(
210
  {"conversation_id": conversation_id},
211
  {
212
  "$set": {"last_updated": datetime.now()},
213
+ # Increment by 2 since we store both messages
214
+ "$inc": {"message_count": 2}
215
  },
216
  upsert=True
217
  )
218
+
219
  return str(result.inserted_id)
220
+
221
  async def get_conversation_history(self, conversation_id: str) -> List[Dict]:
222
  """Retrieve complete conversation history"""
223
  cursor = self.chat_history.find(
224
  {"conversation_id": conversation_id}
225
  ).sort("timestamp", 1)
226
+
227
  history = []
228
  async for document in cursor:
229
  document["_id"] = str(document["_id"])
230
  history.append(document)
231
+
232
  return history
233
 
234
  async def get_recent_messages(
 
239
  """Get most recent messages from conversation"""
240
  cursor = self.chat_history.find(
241
  {"conversation_id": conversation_id}
242
+ # Multiply limit by 2 to account for user-assistant pairs
243
+ ).sort("timestamp", -1).limit(limit * 2)
244
+
245
  messages = []
246
  async for doc in cursor:
247
  messages.append(self._format_message(doc))
248
+
249
  return list(reversed(messages))
250
+
251
  async def update_feedback(
252
  self,
253
  conversation_id: str,
 
256
  ) -> bool:
257
  """
258
  Update feedback for a conversation
259
+
260
  Args:
261
  conversation_id (str): Conversation ID
262
  feedback (Optional[str]): Feedback text
263
  rating (Optional[int]): Numeric rating
264
+
265
  Returns:
266
  bool: True if update successful
267
  """
268
  update_fields = {}
269
+
270
  if feedback is not None:
271
  update_fields["feedback"] = feedback
272
+
273
  if rating is not None:
274
  from config.config import settings
275
  formatted_rating = f"{rating}/{settings.MAX_RATING}"
 
277
  "rating": rating, # Store numeric value
278
  "formatted_rating": formatted_rating # Store formatted string
279
  })
280
+
281
  if not update_fields:
282
  return False
283
 
 
285
  {"conversation_id": conversation_id},
286
  {"$set": update_fields}
287
  )
288
+
289
  # Also update conversation metadata
290
  if result.modified_count > 0:
291
  await self.update_conversation_metadata(
 
296
  "formatted_rating": formatted_rating if rating is not None else None
297
  }
298
  )
299
+
300
  return result.modified_count > 0
301
 
302
  async def get_messages_for_summary(
 
307
  cursor = self.chat_history.find(
308
  {"conversation_id": conversation_id}
309
  ).sort("timestamp", 1)
310
+
311
  messages = []
312
  async for doc in cursor:
313
  formatted = self._format_message(doc)
 
318
  'timestamp': formatted['timestamp'],
319
  'sources': formatted['sources']
320
  })
321
+
322
  return messages
323
+
 
324
  def _format_message(self, doc: Dict) -> Dict:
325
  """Helper method to format message documents consistently"""
326
  return {
 
335
  "feedback": doc.get("feedback"),
336
  "rating": doc.get("rating")
337
  }
338
+
339
  # Vector store related methods
340
  async def store_vector_metadata(
341
  self,
 
350
  "metadata": metadata,
351
  "created_at": datetime.now()
352
  }
353
+
354
  result = await self.db.vector_metadata.insert_one(vector_metadata)
355
  return str(result.inserted_id)
356
 
 
372
  result = await self.db.vector_metadata.delete_many(
373
  {"document_id": document_id}
374
  )
375
+ return result.deleted_count > 0
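
A hedged usage sketch of the updated store_document signature, assuming a MongoDB instance reachable at the default URI and this repository's modules on the import path (all values are placeholders):

import asyncio
from src.db.mongodb_store import MongoDBStore

async def main():
    store = MongoDBStore("mongodb://localhost:27017")
    await store.store_document(
        document_id="example-id",            # placeholder identifier
        filename="meeting_notes.docx",
        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        file_size=0,                          # Drive documents pass 0; uploads pass the real size
        url_path="https://drive.google.com/file/d/example-id/view",
        source="google_drive",                # "user_upload" for files sent to the upload endpoint
    )

asyncio.run(main())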
src/implementations/__pycache__/document_service.cpython-312.pyc CHANGED
Binary files a/src/implementations/__pycache__/document_service.cpython-312.pyc and b/src/implementations/__pycache__/document_service.cpython-312.pyc differ
 
src/implementations/document_service.py CHANGED
@@ -13,9 +13,10 @@ from src.models import DocumentResponse, DocumentInfo, BatchUploadResponse
13
  from src.utils.logger import logger
14
  from src.db.mongodb_store import MongoDBStore
15
 
 
16
  class DocumentService:
17
  def __init__(
18
- self,
19
  doc_processor: DocumentProcessor,
20
  mongodb: MongoDBStore
21
  ):
@@ -27,10 +28,10 @@ class DocumentService:
27
  async def check_duplicate_filename(self, filename: str) -> bool:
28
  """
29
  Check if a file with the same name exists
30
-
31
  Args:
32
  filename (str): Original filename to check
33
-
34
  Returns:
35
  bool: True if duplicate exists, False otherwise
36
  """
@@ -45,8 +46,8 @@ class DocumentService:
45
  ) -> BatchUploadResponse:
46
  """Process multiple document uploads"""
47
  processed_files, failed_files = await self._handle_file_uploads(
48
- files,
49
- vector_store,
50
  background_tasks
51
  )
52
 
@@ -78,22 +79,23 @@ class DocumentService:
78
 
79
  if not self._is_supported_format(file.filename):
80
  failed_files.append(self._create_failed_file_entry(
81
- file.filename,
82
  "Unsupported file format"
83
  ))
84
  continue
85
 
86
  document_response = await self._process_single_file(
87
- file,
88
- vector_store,
89
  background_tasks
90
  )
91
  processed_files.append(document_response)
92
 
93
  except Exception as e:
94
- logger.error(f"Error processing file {file.filename}: {str(e)}")
 
95
  failed_files.append(self._create_failed_file_entry(
96
- file.filename,
97
  str(e)
98
  ))
99
 
@@ -110,7 +112,7 @@ class DocumentService:
110
  filename = f"{document_id}_{file.filename}"
111
  file_path = self.permanent_dir / filename
112
  url_path = f"/docs/{filename}"
113
-
114
  try:
115
  # Save file to permanent location using a context manager
116
  with open(file_path, "wb") as buffer:
@@ -122,12 +124,12 @@ class DocumentService:
122
  # Process document with proper cleanup for Excel files
123
  try:
124
  processed_doc = await self.doc_processor.process_document(file_path)
125
-
126
  # For Excel files, ensure pandas closes the file
127
  if file_path.suffix.lower() in ['.xlsx', '.xls']:
128
  import gc
129
  gc.collect() # Help cleanup any lingering file handles
130
-
131
  except Exception as proc_error:
132
  logger.error(f"Error processing document: {str(proc_error)}")
133
  raise
@@ -138,7 +140,8 @@ class DocumentService:
138
  filename=file.filename,
139
  content_type=file.content_type,
140
  file_size=os.path.getsize(file_path),
141
- url_path=url_path
 
142
  )
143
 
144
  # Process for vector store in background
@@ -161,21 +164,23 @@ class DocumentService:
161
  url_path=url_path
162
  )
163
  )
164
-
165
  except Exception as e:
166
  # Clean up file if it was created
167
  if file_path.exists():
168
  try:
169
  file_path.unlink()
170
  except Exception as cleanup_error:
171
- logger.error(f"Error cleaning up file {file_path}: {str(cleanup_error)}")
172
-
 
173
  # Clean up from MongoDB if document was created
174
  try:
175
  await self.mongodb.delete_document(document_id)
176
  except Exception as db_cleanup_error:
177
- logger.error(f"Error cleaning up MongoDB document {document_id}: {str(db_cleanup_error)}")
178
-
 
179
  logger.error(f"Error processing file {file.filename}: {str(e)}")
180
  raise
181
 
@@ -189,11 +194,12 @@ class DocumentService:
189
  """Process document content for vector store"""
190
  try:
191
  # Generate chunk IDs using document_id
192
- chunk_ids = [f"{document_id}-chunk-{i}" for i in range(len(chunks))]
193
-
 
194
  # Get embeddings
195
  embeddings = vector_store.embedding_function(chunks)
196
-
197
  # Prepare metadata for each chunk
198
  metadatas = [{
199
  'document_id': document_id,
@@ -201,7 +207,7 @@ class DocumentService:
201
  'chunk_index': i,
202
  'total_chunks': len(chunks)
203
  } for i in range(len(chunks))]
204
-
205
  # Store in vector store
206
  vector_store.add_documents(
207
  documents=chunks,
@@ -209,17 +215,19 @@ class DocumentService:
209
  metadatas=metadatas,
210
  ids=chunk_ids
211
  )
212
-
213
- logger.info(f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
214
-
 
215
  except Exception as e:
216
- logger.error(f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
 
217
  raise
218
 
219
  def _is_supported_format(self, filename: str) -> bool:
220
  """Check if file format is supported"""
221
- return any(filename.lower().endswith(ext)
222
- for ext in self.doc_processor.supported_formats)
223
 
224
  def _create_failed_file_entry(self, filename: str, error: str) -> dict:
225
  """Create a failed file entry"""
@@ -237,15 +245,15 @@ class DocumentService:
237
  # Get filename from url_path
238
  filename = doc['url_path'].split('/')[-1]
239
  file_path = self.permanent_dir / filename
240
-
241
  # Delete physical file if it exists
242
  if file_path.exists():
243
  file_path.unlink()
244
-
245
  # Delete from MongoDB
246
  return await self.mongodb.delete_document(document_id)
247
  return False
248
-
249
  except Exception as e:
250
  logger.error(f"Error deleting document: {str(e)}")
251
  raise
@@ -253,4 +261,4 @@ class DocumentService:
253
  def cleanup(self):
254
  """Clean up permanent directory if empty"""
255
  if self.permanent_dir.exists() and not any(self.permanent_dir.iterdir()):
256
- self.permanent_dir.rmdir()
 
13
  from src.utils.logger import logger
14
  from src.db.mongodb_store import MongoDBStore
15
 
16
+
17
  class DocumentService:
18
  def __init__(
19
+ self,
20
  doc_processor: DocumentProcessor,
21
  mongodb: MongoDBStore
22
  ):
 
28
  async def check_duplicate_filename(self, filename: str) -> bool:
29
  """
30
  Check if a file with the same name exists
31
+
32
  Args:
33
  filename (str): Original filename to check
34
+
35
  Returns:
36
  bool: True if duplicate exists, False otherwise
37
  """
 
46
  ) -> BatchUploadResponse:
47
  """Process multiple document uploads"""
48
  processed_files, failed_files = await self._handle_file_uploads(
49
+ files,
50
+ vector_store,
51
  background_tasks
52
  )
53
 
 
79
 
80
  if not self._is_supported_format(file.filename):
81
  failed_files.append(self._create_failed_file_entry(
82
+ file.filename,
83
  "Unsupported file format"
84
  ))
85
  continue
86
 
87
  document_response = await self._process_single_file(
88
+ file,
89
+ vector_store,
90
  background_tasks
91
  )
92
  processed_files.append(document_response)
93
 
94
  except Exception as e:
95
+ logger.error(
96
+ f"Error processing file {file.filename}: {str(e)}")
97
  failed_files.append(self._create_failed_file_entry(
98
+ file.filename,
99
  str(e)
100
  ))
101
 
 
112
  filename = f"{document_id}_{file.filename}"
113
  file_path = self.permanent_dir / filename
114
  url_path = f"/docs/{filename}"
115
+
116
  try:
117
  # Save file to permanent location using a context manager
118
  with open(file_path, "wb") as buffer:
 
124
  # Process document with proper cleanup for Excel files
125
  try:
126
  processed_doc = await self.doc_processor.process_document(file_path)
127
+
128
  # For Excel files, ensure pandas closes the file
129
  if file_path.suffix.lower() in ['.xlsx', '.xls']:
130
  import gc
131
  gc.collect() # Help cleanup any lingering file handles
132
+
133
  except Exception as proc_error:
134
  logger.error(f"Error processing document: {str(proc_error)}")
135
  raise
 
140
  filename=file.filename,
141
  content_type=file.content_type,
142
  file_size=os.path.getsize(file_path),
143
+ url_path=url_path,
144
+ source="user_upload"
145
  )
146
 
147
  # Process for vector store in background
 
164
  url_path=url_path
165
  )
166
  )
167
+
168
  except Exception as e:
169
  # Clean up file if it was created
170
  if file_path.exists():
171
  try:
172
  file_path.unlink()
173
  except Exception as cleanup_error:
174
+ logger.error(
175
+ f"Error cleaning up file {file_path}: {str(cleanup_error)}")
176
+
177
  # Clean up from MongoDB if document was created
178
  try:
179
  await self.mongodb.delete_document(document_id)
180
  except Exception as db_cleanup_error:
181
+ logger.error(
182
+ f"Error cleaning up MongoDB document {document_id}: {str(db_cleanup_error)}")
183
+
184
  logger.error(f"Error processing file {file.filename}: {str(e)}")
185
  raise
186
 
 
194
  """Process document content for vector store"""
195
  try:
196
  # Generate chunk IDs using document_id
197
+ chunk_ids = [
198
+ f"{document_id}-chunk-{i}" for i in range(len(chunks))]
199
+
200
  # Get embeddings
201
  embeddings = vector_store.embedding_function(chunks)
202
+
203
  # Prepare metadata for each chunk
204
  metadatas = [{
205
  'document_id': document_id,
 
207
  'chunk_index': i,
208
  'total_chunks': len(chunks)
209
  } for i in range(len(chunks))]
210
+
211
  # Store in vector store
212
  vector_store.add_documents(
213
  documents=chunks,
 
215
  metadatas=metadatas,
216
  ids=chunk_ids
217
  )
218
+
219
+ logger.info(
220
+ f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
221
+
222
  except Exception as e:
223
+ logger.error(
224
+ f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
225
  raise
226
 
227
  def _is_supported_format(self, filename: str) -> bool:
228
  """Check if file format is supported"""
229
+ return any(filename.lower().endswith(ext)
230
+ for ext in self.doc_processor.supported_formats)
231
 
232
  def _create_failed_file_entry(self, filename: str, error: str) -> dict:
233
  """Create a failed file entry"""
 
245
  # Get filename from url_path
246
  filename = doc['url_path'].split('/')[-1]
247
  file_path = self.permanent_dir / filename
248
+
249
  # Delete physical file if it exists
250
  if file_path.exists():
251
  file_path.unlink()
252
+
253
  # Delete from MongoDB
254
  return await self.mongodb.delete_document(document_id)
255
  return False
256
+
257
  except Exception as e:
258
  logger.error(f"Error deleting document: {str(e)}")
259
  raise
 
261
  def cleanup(self):
262
  """Clean up permanent directory if empty"""
263
  if self.permanent_dir.exists() and not any(self.permanent_dir.iterdir()):
264
+ self.permanent_dir.rmdir()
src/main.py CHANGED
@@ -123,7 +123,8 @@ async def get_all_documents():
123
  "content_type": doc.get("content_type"),
124
  "file_size": doc.get("file_size"),
125
  "url_path": doc.get("url_path"),
126
- "upload_timestamp": doc.get("upload_timestamp")
 
127
  }
128
  formatted_documents.append(formatted_doc)
129
  except Exception as e:
@@ -334,7 +335,8 @@ async def process_drive_documents():
334
  google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
335
  folder_id=settings.GOOGLE_DRIVE_FOLDER_ID,
336
  temp_dir=settings.TEMP_DOWNLOAD_DIR,
337
- doc_processor=doc_processor
 
338
  )
339
 
340
  # Process documents
 
123
  "content_type": doc.get("content_type"),
124
  "file_size": doc.get("file_size"),
125
  "url_path": doc.get("url_path"),
126
+ "upload_timestamp": doc.get("upload_timestamp"),
127
+ "source": doc.get("source")
128
  }
129
  formatted_documents.append(formatted_doc)
130
  except Exception as e:
 
335
  google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
336
  folder_id=settings.GOOGLE_DRIVE_FOLDER_ID,
337
  temp_dir=settings.TEMP_DOWNLOAD_DIR,
338
+ doc_processor=doc_processor,
339
+ mongodb=mongodb # Add MongoDB instance
340
  )
341
 
342
  # Process documents
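
With the two extra fields added above, an entry returned by the documents listing endpoint would look roughly like the dictionary below (all values are illustrative placeholders):

formatted_doc = {
    "document_id": "example-id",
    "filename": "meeting_notes.pdf",
    "content_type": "application/pdf",
    "file_size": 52431,
    "url_path": "/docs/example-id_meeting_notes.pdf",
    "upload_timestamp": "2024-01-01T00:00:00",
    "source": "user_upload",      # "google_drive" for documents synced from Drive
}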
src/utils/__pycache__/document_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
 
src/utils/__pycache__/drive_document_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/drive_document_processor.cpython-312.pyc and b/src/utils/__pycache__/drive_document_processor.cpython-312.pyc differ
 
src/utils/document_processor.py CHANGED
@@ -116,124 +116,6 @@ class DocumentProcessor:
116
  strip_whitespace=False # Keep whitespace to maintain markdown formatting
117
  )
118
 
119
- def split_text(self, text: str) -> List[str]:
120
- """Split text with enforced overlap while preserving structure"""
121
- try:
122
- # Get initial split using RecursiveCharacterTextSplitter
123
- initial_chunks = self.text_splitter.split_text(text)
124
- if len(initial_chunks) <= 1:
125
- return initial_chunks
126
-
127
- # Process chunks with enforced overlap
128
- final_chunks = []
129
-
130
- for i, current_chunk in enumerate(initial_chunks):
131
- if i == 0:
132
- final_chunks.append(current_chunk)
133
- continue
134
-
135
- prev_chunk = final_chunks[-1]
136
-
137
- # Get the last part of previous chunk for overlap
138
- overlap_size = min(self.chunk_overlap, len(prev_chunk))
139
- overlap_text = prev_chunk[-overlap_size:]
140
-
141
- # For tables, include the header row
142
- if '|' in current_chunk and '\n' in current_chunk:
143
- table_lines = current_chunk.split('\n')
144
- header_lines = []
145
- for line in table_lines:
146
- if line.strip().startswith('|'):
147
- header_lines.append(line)
148
- else:
149
- break
150
- if header_lines:
151
- header_text = '\n'.join(header_lines) + '\n'
152
- overlap_text = header_text + overlap_text
153
-
154
- # Create new chunk with overlap
155
- new_chunk = overlap_text + current_chunk
156
-
157
- # Ensure we don't have duplicate content at the overlap point
158
- if current_chunk.startswith(overlap_text):
159
- new_chunk = current_chunk
160
-
161
- # Add context from previous chunk when needed
162
- if not any(marker in new_chunk for marker in ['**AGENDA**', '**DISCUSSIONS**', '| No |']):
163
- context_markers = ['**AGENDA**',
164
- '**DISCUSSIONS**', '| No |']
165
- for marker in context_markers:
166
- if marker in prev_chunk and marker not in new_chunk:
167
- new_chunk = marker + "\n" + new_chunk
168
- break
169
-
170
- final_chunks.append(new_chunk)
171
-
172
- # Validate and log overlaps
173
- for i in range(len(final_chunks)-1):
174
- actual_overlap = self._find_actual_overlap(
175
- final_chunks[i], final_chunks[i+1])
176
- logging.debug(
177
- f"Overlap between chunks {i} and {i+1}: {len(actual_overlap)} characters")
178
- if len(actual_overlap) < self.chunk_overlap:
179
- logging.warning(
180
- f"Insufficient overlap between chunks {i} and {i+1}")
181
-
182
- return final_chunks
183
-
184
- for start, end in table_sections:
185
- # Process text before table if exists
186
- if start > current_position:
187
- non_table_text = text[current_position:start]
188
- if non_table_text.strip():
189
- text_chunks = self.text_splitter.split_text(
190
- non_table_text)
191
- if chunks and text_chunks:
192
- # Ensure overlap with previous chunk
193
- prev_chunk = chunks[-1]
194
- overlap = self._get_overlap_text(prev_chunk)
195
- text_chunks[0] = overlap + text_chunks[0]
196
- chunks.extend(text_chunks)
197
-
198
- # Process table as a single chunk with overlap
199
- table_text = text[start:end]
200
- if chunks:
201
- prev_chunk = chunks[-1]
202
- overlap = self._get_overlap_text(prev_chunk)
203
- table_text = overlap + table_text
204
- chunks.append(table_text)
205
- current_position = end
206
-
207
- # Process remaining text after last table
208
- if current_position < len(text):
209
- remaining_text = text[current_position:]
210
- if remaining_text.strip():
211
- text_chunks = self.text_splitter.split_text(remaining_text)
212
- if chunks and text_chunks:
213
- # Ensure overlap with previous chunk
214
- prev_chunk = chunks[-1]
215
- overlap = self._get_overlap_text(prev_chunk)
216
- text_chunks[0] = overlap + text_chunks[0]
217
- chunks.extend(text_chunks)
218
-
219
- # Validate and adjust overlaps
220
- chunks = self._ensure_minimum_overlap(chunks)
221
-
222
- # Log chunk details for debugging
223
- for i in range(len(chunks)-1):
224
- overlap = self._find_actual_overlap(chunks[i], chunks[i+1])
225
- logging.debug(
226
- f"Overlap between chunks {i} and {i+1}: {len(overlap)} characters")
227
- logging.debug(f"End of chunk {i}: {chunks[i][-50:]}")
228
- logging.debug(f"Start of chunk {i+1}: {chunks[i+1][:50]}")
229
-
230
- return chunks
231
-
232
- except Exception as e:
233
- logging.error(f"Error in split_text: {str(e)}")
234
- # Fallback to original text splitter
235
- return self.text_splitter.split_text(text)
236
-
237
  def _find_break_point(self, text: str, prev_chunk: str) -> int:
238
  """
239
  Find suitable breaking point that maintains document structure
@@ -630,38 +512,42 @@ class DocumentProcessor:
630
  """Calculate SHA-256 hash of text"""
631
  return hashlib.sha256(text.encode()).hexdigest()
632
 
633
- async def process_document(self, file_path: Union[str, Path], metadata: Optional[Dict] = None) -> Dict:
634
- """Process a document with metadata and content extraction"""
 
 
 
 
 
 
635
  file_path = Path(file_path)
636
 
637
  if not self._validate_file(file_path):
638
  raise ValueError(f"Invalid file: {file_path}")
639
 
640
  content = self._extract_content(file_path)
641
- doc_metadata = self._generate_metadata(file_path, content, metadata)
642
-
643
- # Try enhanced splitting with validation
644
- chunks = self.split_text(content)
645
- if not self._validate_chunks(content, chunks):
646
- logging.warning(
647
- "Enhanced splitting failed validation, falling back to original splitter")
648
- chunks = self.text_splitter.split_text(content)
649
-
650
- # Add logging to verify chunk overlap
651
- for i in range(len(chunks)-1):
652
- logging.debug(f"Chunk {i} ends with: {chunks[i][-50:]}")
653
- logging.debug(f"Chunk {i+1} starts with: {chunks[i+1][:50]}")
654
- logging.debug(
655
- f"Overlap size: {self._calculate_overlap_size(chunks[i], chunks[i+1])} characters")
656
-
657
- chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
658
 
659
  return {
660
  'content': content,
661
  'chunks': chunks,
662
- 'chunk_hashes': chunk_hashes,
663
- 'metadata': doc_metadata,
664
- 'statistics': self._generate_statistics(content, chunks)
665
  }
666
 
667
  def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
 
116
  strip_whitespace=False # Keep whitespace to maintain markdown formatting
117
  )
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  def _find_break_point(self, text: str, prev_chunk: str) -> int:
120
  """
121
  Find suitable breaking point that maintains document structure
 
512
  """Calculate SHA-256 hash of text"""
513
  return hashlib.sha256(text.encode()).hexdigest()
514
 
515
+ def _process_chunks(self, text: str) -> List[str]:
516
+ """Process text into chunks with proper overlap"""
517
+ chunks = self.text_splitter.split_text(text)
518
+
519
+ # Ensure minimum chunk size and handle overlaps
520
+ processed_chunks = []
521
+ for i, chunk in enumerate(chunks):
522
+ if i > 0:
523
+ # Add overlap from previous chunk
524
+ overlap_start = max(
525
+ 0, len(processed_chunks[-1]) - self.chunk_overlap)
526
+ chunk = processed_chunks[-1][overlap_start:] + chunk
527
+
528
+ if len(chunk) > self.chunk_size:
529
+ # Split oversized chunks
530
+ sub_chunks = self.text_splitter.split_text(chunk)
531
+ processed_chunks.extend(sub_chunks)
532
+ else:
533
+ processed_chunks.append(chunk)
534
+
535
+ return processed_chunks
536
+
537
+ async def process_document(self, file_path: Union[str, Path]) -> Dict:
538
+ """Process document with chunk overlapping"""
539
  file_path = Path(file_path)
540
 
541
  if not self._validate_file(file_path):
542
  raise ValueError(f"Invalid file: {file_path}")
543
 
544
  content = self._extract_content(file_path)
545
+ chunks = self._process_chunks(content)
 
 
 
 
 
 
 
 
546
 
547
  return {
548
  'content': content,
549
  'chunks': chunks,
550
+ 'metadata': self._generate_metadata(file_path, content)
 
 
551
  }
552
 
553
  def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
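
A self-contained sketch of the overlap behaviour that _process_chunks aims for, using a toy splitter so it runs without LangChain and omitting the re-split of oversized chunks (the sizes below are arbitrary):

def toy_split(text: str, chunk_size: int) -> list:
    # Stand-in for self.text_splitter.split_text
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

def overlap_chunks(text: str, chunk_size: int = 40, chunk_overlap: int = 10) -> list:
    chunks = toy_split(text, chunk_size)
    processed = []
    for i, chunk in enumerate(chunks):
        if i > 0:
            # Prepend the tail of the previous processed chunk, as the new method does
            tail = processed[-1][max(0, len(processed[-1]) - chunk_overlap):]
            chunk = tail + chunk
        processed.append(chunk)
    return processed

for c in overlap_chunks("lorem ipsum dolor sit amet " * 5):
    print(repr(c))  # every chunk after the first starts with the last 10 characters of its predecessor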
src/utils/drive_document_processor.py CHANGED
@@ -8,6 +8,8 @@ from src.utils.google_drive_service import GoogleDriveService
8
  from src.utils.document_processor import DocumentProcessor
9
  from src.vectorstores.chroma_vectorstore import ChromaVectorStore
10
  from src.utils.logger import logger
 
 
11
 
12
  class DriveDocumentProcessor:
13
  def __init__(
@@ -15,38 +17,41 @@ class DriveDocumentProcessor:
15
  google_service_account_path: str,
16
  folder_id: str,
17
  temp_dir: str,
18
- doc_processor: DocumentProcessor
 
19
  ):
20
  """
21
  Initialize Drive Document Processor
22
-
23
  Args:
24
  google_service_account_path (str): Path to Google service account credentials
25
  folder_id (str): Google Drive folder ID to process
26
  temp_dir (str): Directory for temporary files
27
  doc_processor (DocumentProcessor): Instance of DocumentProcessor
28
  """
29
- self.google_drive_service = GoogleDriveService(google_service_account_path)
 
30
  self.folder_id = folder_id
31
  self.temp_dir = Path(temp_dir)
32
  self.doc_processor = doc_processor
33
-
 
34
  # Create temp directory if it doesn't exist
35
  self.temp_dir.mkdir(exist_ok=True)
36
-
37
  # Define supported MIME types
38
  self.supported_mime_types = {
39
  # Google Docs
40
  'application/vnd.google-apps.document': '.docx',
41
-
42
  # Microsoft Word Documents
43
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
44
  'application/msword': '.doc',
45
-
46
  # Microsoft Excel Documents
47
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
48
  'application/vnd.ms-excel': '.xls',
49
-
50
  # Text Documents
51
  'text/plain': '.txt',
52
  'text/csv': '.csv',
@@ -55,7 +60,7 @@ class DriveDocumentProcessor:
55
  'text/xml': '.xml',
56
  'application/json': '.json',
57
  'application/rtf': '.rtf',
58
-
59
  # PDF Documents
60
  'application/pdf': '.pdf'
61
  }
@@ -64,18 +69,78 @@ class DriveDocumentProcessor:
64
  'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
65
  }
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  async def process_documents(
68
  self,
69
  vector_store: ChromaVectorStore,
70
- include_subfolders: bool = True # New parameter with default True for backward compatibility
 
71
  ) -> Dict[str, Any]:
72
  """
73
  Process all documents in the specified Drive folder
74
-
75
  Args:
76
  vector_store (ChromaVectorStore): Vector store instance
77
  include_subfolders (bool): Whether to process documents in subfolders
78
-
79
  Returns:
80
  Dict[str, Any]: Processing results
81
  """
@@ -85,32 +150,35 @@ class DriveDocumentProcessor:
85
  self.folder_id,
86
  include_subfolders=include_subfolders
87
  )
88
-
 
 
 
89
  processed_files = []
90
  skipped_files = []
91
  errors = []
92
-
93
  for file in files:
94
  # Skip if it's a folder
95
  if file.get('mimeType') == 'application/vnd.google-apps.folder':
96
  continue
97
-
98
  # Get file path (including folder structure if available)
99
  file_path = self._get_file_path(file)
100
  file['display_path'] = file_path
101
-
102
  result = await self._process_single_file(file, vector_store)
103
-
104
  if result['status'] == 'processed':
105
  processed_files.append(result['data'])
106
  elif result['status'] == 'skipped':
107
  skipped_files.append(result['data'])
108
  else: # status == 'error'
109
  errors.append(result['data'])
110
-
111
  # Clean up temporary directory if empty
112
  self._cleanup_temp_dir()
113
-
114
  return {
115
  "status": "completed",
116
  "processed_files": {
@@ -126,7 +194,7 @@ class DriveDocumentProcessor:
126
  "details": errors
127
  }
128
  }
129
-
130
  except Exception as e:
131
  logger.error(f"Error processing Drive documents: {str(e)}")
132
  raise HTTPException(
@@ -137,20 +205,20 @@ class DriveDocumentProcessor:
137
  def _get_file_path(self, file: Dict[str, Any]) -> str:
138
  """
139
  Get the full path for a file including its folder structure
140
-
141
  Args:
142
  file (Dict[str, Any]): File metadata
143
-
144
  Returns:
145
  str: Display path of the file
146
  """
147
  path_parts = [file['name']]
148
-
149
  # Add folder path if available (new structure)
150
  if folder_path := file.get('folder_path', []):
151
  for folder in reversed(folder_path):
152
  path_parts.insert(0, folder['name'])
153
-
154
  return '/'.join(path_parts)
155
 
156
  async def _process_single_file(
@@ -160,7 +228,7 @@ class DriveDocumentProcessor:
160
  ) -> Dict[str, Any]:
161
  """Process a single Drive file"""
162
  mime_type = file.get('mimeType', '')
163
-
164
  # Skip if mime type not supported
165
  if mime_type not in self.supported_mime_types:
166
  return {
@@ -171,11 +239,11 @@ class DriveDocumentProcessor:
171
  'reason': f'Unsupported mime type: {mime_type}'
172
  }
173
  }
174
-
175
  try:
176
  document_id = file['id']
177
  modified_time = file.get('modifiedTime', 'N/A')
178
-
179
  # Check if document should be processed
180
  if self.save_document(document_id, vector_store, modified_time):
181
  # Download and process file
@@ -183,13 +251,13 @@ class DriveDocumentProcessor:
183
  file['id'],
184
  mime_type
185
  )
186
-
187
  try:
188
  # Process document
189
  processed_doc = await self.doc_processor.process_document(
190
  str(temp_file_path)
191
  )
192
-
193
  # Add to vector store with path information
194
  self._add_to_vector_store(
195
  processed_doc['chunks'],
@@ -197,7 +265,17 @@ class DriveDocumentProcessor:
197
  mime_type,
198
  vector_store
199
  )
200
-
 
 
 
 
 
 
 
 
 
 
201
  return {
202
  'status': 'processed',
203
  'data': {
@@ -207,7 +285,7 @@ class DriveDocumentProcessor:
207
  'chunks_processed': len(processed_doc['chunks'])
208
  }
209
  }
210
-
211
  finally:
212
  # Clean up temporary file
213
  if temp_file_path.exists():
@@ -221,7 +299,7 @@ class DriveDocumentProcessor:
221
  'reason': 'Document already exists in the memory.'
222
  }
223
  }
224
-
225
  except Exception as e:
226
  logger.error(f"Error processing file {file['name']}: {str(e)}")
227
  return {
@@ -243,7 +321,7 @@ class DriveDocumentProcessor:
243
  """Add processed chunks to vector store with path information"""
244
  chunk_metadatas = []
245
  chunk_ids = []
246
-
247
  modified_time = file.get('modifiedTime', 'N/A')
248
  file_path = file.get('display_path', file['name'])
249
 
@@ -260,7 +338,7 @@ class DriveDocumentProcessor:
260
  "file_type": self.supported_mime_types[mime_type],
261
  "is_google_doc": mime_type.startswith('application/vnd.google-apps')
262
  })
263
-
264
  vector_store.add_documents(
265
  documents=chunks,
266
  metadatas=chunk_metadatas,
@@ -275,7 +353,7 @@ class DriveDocumentProcessor:
275
  """Download and save file to temporary location"""
276
  extension = self.supported_mime_types[mime_type]
277
  temp_file_path = self.temp_dir / f"{file_id}{extension}"
278
-
279
  if mime_type in self.google_docs_export_types:
280
  # Download Google Doc in the specified export format
281
  content = self.google_drive_service.export_file(
@@ -285,13 +363,13 @@ class DriveDocumentProcessor:
285
  else:
286
  # Download regular file
287
  content = self.google_drive_service.download_file(file_id)
288
-
289
  with open(temp_file_path, 'wb') as f:
290
  if isinstance(content, str):
291
  f.write(content.encode('utf-8'))
292
  else:
293
  f.write(content)
294
-
295
  return temp_file_path
296
 
297
  def save_document(
@@ -302,35 +380,36 @@ class DriveDocumentProcessor:
302
  ) -> bool:
303
  """
304
  Check if document needs to be processed based on modification date
305
-
306
  Args:
307
  document_id (str): ID of the document to check
308
  vector_store (ChromaVectorStore): Vector store instance
309
  modified_date (str): Modified date to compare against
310
-
311
  Returns:
312
  bool: True if document should be processed, False otherwise
313
  """
314
  try:
315
  # Retrieve all chunks for the given document_id
316
  chunks = vector_store.get_document_chunks(document_id)
317
-
318
  if not chunks:
319
  # Document doesn't exist in vector store
320
  return True
321
-
322
  # Check the modified_time of the first chunk
323
  first_chunk_metadata = chunks[0].get("metadata", {})
324
-
325
  if first_chunk_metadata.get("modified_time") != modified_date:
326
  # If modified_time doesn't match, delete existing chunks
327
  vector_store.delete_document(document_id)
328
- logger.info(f"Document {document_id} has been modified, will reprocess")
 
329
  return True
330
-
331
  logger.info(f"Document {document_id} is up to date, skipping")
332
  return False
333
-
334
  except Exception as e:
335
  logger.error(f"Error checking document status: {str(e)}")
336
  # In case of error, process the document to be safe
@@ -343,4 +422,4 @@ class DriveDocumentProcessor:
343
  self.temp_dir.rmdir()
344
  except Exception as e:
345
  logger.error(f"Error cleaning up temp directory: {str(e)}")
346
- # Don't raise the error as this is a cleanup operation
 
8
  from src.utils.document_processor import DocumentProcessor
9
  from src.vectorstores.chroma_vectorstore import ChromaVectorStore
10
  from src.utils.logger import logger
11
+ from src.db.mongodb_store import MongoDBStore
12
+
13
 
14
  class DriveDocumentProcessor:
15
  def __init__(
 
17
  google_service_account_path: str,
18
  folder_id: str,
19
  temp_dir: str,
20
+ doc_processor: DocumentProcessor,
21
+ mongodb: MongoDBStore # Add MongoDB
22
  ):
23
  """
24
  Initialize Drive Document Processor
25
+
26
  Args:
27
  google_service_account_path (str): Path to Google service account credentials
28
  folder_id (str): Google Drive folder ID to process
29
  temp_dir (str): Directory for temporary files
30
  doc_processor (DocumentProcessor): Instance of DocumentProcessor
31
  """
32
+ self.google_drive_service = GoogleDriveService(
33
+ google_service_account_path)
34
  self.folder_id = folder_id
35
  self.temp_dir = Path(temp_dir)
36
  self.doc_processor = doc_processor
37
+ self.mongodb = mongodb # Store MongoDB instance
38
+
39
  # Create temp directory if it doesn't exist
40
  self.temp_dir.mkdir(exist_ok=True)
41
+
42
  # Define supported MIME types
43
  self.supported_mime_types = {
44
  # Google Docs
45
  'application/vnd.google-apps.document': '.docx',
46
+
47
  # Microsoft Word Documents
48
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
49
  'application/msword': '.doc',
50
+
51
  # Microsoft Excel Documents
52
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
53
  'application/vnd.ms-excel': '.xls',
54
+
55
  # Text Documents
56
  'text/plain': '.txt',
57
  'text/csv': '.csv',
 
60
  'text/xml': '.xml',
61
  'application/json': '.json',
62
  'application/rtf': '.rtf',
63
+
64
  # PDF Documents
65
  'application/pdf': '.pdf'
66
  }
 
69
  'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
70
  }
71
 
72
+ async def _cleanup_orphaned_documents(
73
+ self,
74
+ drive_files: List[Dict[str, Any]],
75
+ vector_store: ChromaVectorStore
76
+ ) -> Dict[str, Any]:
77
+ """
78
+ Clean up documents that exist in MongoDB but not in Google Drive
79
+
80
+ Args:
81
+ drive_files (List[Dict[str, Any]]): List of files from Google Drive
82
+ vector_store (ChromaVectorStore): Vector store instance
83
+
84
+ Returns:
85
+ Dict[str, Any]: Cleanup statistics
86
+ """
87
+ try:
88
+ # Get all documents from MongoDB
89
+ mongo_docs = await self.mongodb.get_all_documents()
90
+
91
+ # Create set of Google Drive file IDs
92
+ drive_file_ids = {file['id'] for file in drive_files}
93
+
94
+ deleted_count = 0
95
+ failed_deletions = []
96
+
97
+ # Check each MongoDB document
98
+ for doc in mongo_docs:
99
+ # Only process Google Drive documents
100
+ if doc.get('source') != 'google_drive':
101
+ continue
102
+
103
+ doc_id = doc.get('document_id')
104
+ if not doc_id or doc_id not in drive_file_ids:
105
+ try:
106
+ # Delete from MongoDB
107
+ await self.mongodb.delete_document(doc_id)
108
+
109
+ # Delete from vector store
110
+ vector_store.delete_document(doc_id)
111
+
112
+ deleted_count += 1
113
+
114
+ except Exception as e:
115
+ logger.error(
116
+ f"Error deleting orphaned document {doc_id}: {str(e)}")
117
+ failed_deletions.append({
118
+ 'document_id': doc_id,
119
+ 'error': str(e)
120
+ })
121
+
122
+ return {
123
+ 'orphaned_documents_deleted': deleted_count,
124
+ 'failed_deletions': failed_deletions
125
+ }
126
+
127
+ except Exception as e:
128
+ logger.error(f"Error in cleanup_orphaned_documents: {str(e)}")
129
+ raise
130
+
131
  async def process_documents(
132
  self,
133
  vector_store: ChromaVectorStore,
134
+ # New parameter with default True for backward compatibility
135
+ include_subfolders: bool = True
136
  ) -> Dict[str, Any]:
137
  """
138
  Process all documents in the specified Drive folder
139
+
140
  Args:
141
  vector_store (ChromaVectorStore): Vector store instance
142
  include_subfolders (bool): Whether to process documents in subfolders
143
+
144
  Returns:
145
  Dict[str, Any]: Processing results
146
  """
 
150
  self.folder_id,
151
  include_subfolders=include_subfolders
152
  )
153
+
154
+ # Clean up orphaned documents first
155
+ cleanup_results = await self._cleanup_orphaned_documents(files, vector_store)
156
+
157
  processed_files = []
158
  skipped_files = []
159
  errors = []
160
+
161
  for file in files:
162
  # Skip if it's a folder
163
  if file.get('mimeType') == 'application/vnd.google-apps.folder':
164
  continue
165
+
166
  # Get file path (including folder structure if available)
167
  file_path = self._get_file_path(file)
168
  file['display_path'] = file_path
169
+
170
  result = await self._process_single_file(file, vector_store)
171
+
172
  if result['status'] == 'processed':
173
  processed_files.append(result['data'])
174
  elif result['status'] == 'skipped':
175
  skipped_files.append(result['data'])
176
  else: # status == 'error'
177
  errors.append(result['data'])
178
+
179
  # Clean up temporary directory if empty
180
  self._cleanup_temp_dir()
181
+
182
  return {
183
  "status": "completed",
184
  "processed_files": {
 
194
  "details": errors
195
  }
196
  }
197
+
198
  except Exception as e:
199
  logger.error(f"Error processing Drive documents: {str(e)}")
200
  raise HTTPException(
 
205
  def _get_file_path(self, file: Dict[str, Any]) -> str:
206
  """
207
  Get the full path for a file including its folder structure
208
+
209
  Args:
210
  file (Dict[str, Any]): File metadata
211
+
212
  Returns:
213
  str: Display path of the file
214
  """
215
  path_parts = [file['name']]
216
+
217
  # Add folder path if available (new structure)
218
  if folder_path := file.get('folder_path', []):
219
  for folder in reversed(folder_path):
220
  path_parts.insert(0, folder['name'])
221
+
222
  return '/'.join(path_parts)
223
 
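As a worked example of the path construction above, assuming folder_path lists folders from the top level down to the immediate parent and each entry is a dict with a 'name' key:

file = {
    'name': 'report.pdf',
    'folder_path': [{'name': 'Finance'}, {'name': 'Q3'}],  # illustrative values
}
# path_parts: ['report.pdf'] -> ['Q3', 'report.pdf'] -> ['Finance', 'Q3', 'report.pdf']
# display path: 'Finance/Q3/report.pdf'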
224
  async def _process_single_file(
 
228
  ) -> Dict[str, Any]:
229
  """Process a single Drive file"""
230
  mime_type = file.get('mimeType', '')
231
+
232
  # Skip if mime type not supported
233
  if mime_type not in self.supported_mime_types:
234
  return {
 
239
  'reason': f'Unsupported mime type: {mime_type}'
240
  }
241
  }
242
+
243
  try:
244
  document_id = file['id']
245
  modified_time = file.get('modifiedTime', 'N/A')
246
+
247
  # Check if document should be processed
248
  if self.save_document(document_id, vector_store, modified_time):
249
  # Download and process file
 
251
  file['id'],
252
  mime_type
253
  )
254
+
255
  try:
256
  # Process document
257
  processed_doc = await self.doc_processor.process_document(
258
  str(temp_file_path)
259
  )
260
+
261
  # Add to vector store with path information
262
  self._add_to_vector_store(
263
  processed_doc['chunks'],
 
265
  mime_type,
266
  vector_store
267
  )
268
+
269
+ # Record the document in MongoDB with its Google Drive view URL
270
+ await self.mongodb.store_document(
271
+ document_id=document_id,
272
+ filename=file['name'],
273
+ content_type=mime_type,
274
+ file_size=0, # Size is not tracked for Google Drive documents
275
+ url_path=f"https://drive.google.com/file/d/{document_id}/view",
276
+ source="google_drive"
277
+ )
278
+
279
  return {
280
  'status': 'processed',
281
  'data': {
 
285
  'chunks_processed': len(processed_doc['chunks'])
286
  }
287
  }
288
+
289
  finally:
290
  # Clean up temporary file
291
  if temp_file_path.exists():
 
299
  'reason': 'Document already exists in the memory.'
300
  }
301
  }
302
+
303
  except Exception as e:
304
  logger.error(f"Error processing file {file['name']}: {str(e)}")
305
  return {
 
321
  """Add processed chunks to vector store with path information"""
322
  chunk_metadatas = []
323
  chunk_ids = []
324
+
325
  modified_time = file.get('modifiedTime', 'N/A')
326
  file_path = file.get('display_path', file['name'])
327
 
 
338
  "file_type": self.supported_mime_types[mime_type],
339
  "is_google_doc": mime_type.startswith('application/vnd.google-apps')
340
  })
341
+
342
  vector_store.add_documents(
343
  documents=chunks,
344
  metadatas=chunk_metadatas,
 
353
  """Download and save file to temporary location"""
354
  extension = self.supported_mime_types[mime_type]
355
  temp_file_path = self.temp_dir / f"{file_id}{extension}"
356
+
357
  if mime_type in self.google_docs_export_types:
358
  # Download Google Doc in the specified export format
359
  content = self.google_drive_service.export_file(
 
363
  else:
364
  # Download regular file
365
  content = self.google_drive_service.download_file(file_id)
366
+
367
  with open(temp_file_path, 'wb') as f:
368
  if isinstance(content, str):
369
  f.write(content.encode('utf-8'))
370
  else:
371
  f.write(content)
372
+
373
  return temp_file_path
374
 
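The helper above distinguishes Google-native files, which cannot be downloaded directly and are exported to the mapped Office format, from regular binaries, which are downloaded as-is. A compact sketch of that branch with an illustrative one-entry mapping (the real mapping is the google_docs_export_types dict defined earlier):

GOOGLE_DOC_EXPORTS = {
    'application/vnd.google-apps.document':
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
}

def needs_export(mime_type: str) -> bool:
    """True when the file is Google-native and must be exported rather than downloaded."""
    return mime_type in GOOGLE_DOC_EXPORTS

Whatever the service returns is then normalised to bytes before writing, with str content encoded as UTF-8.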
375
  def save_document(
 
380
  ) -> bool:
381
  """
382
  Check if document needs to be processed based on modification date
383
+
384
  Args:
385
  document_id (str): ID of the document to check
386
  vector_store (ChromaVectorStore): Vector store instance
387
  modified_date (str): Modified date to compare against
388
+
389
  Returns:
390
  bool: True if document should be processed, False otherwise
391
  """
392
  try:
393
  # Retrieve all chunks for the given document_id
394
  chunks = vector_store.get_document_chunks(document_id)
395
+
396
  if not chunks:
397
  # Document doesn't exist in vector store
398
  return True
399
+
400
  # Check the modified_time of the first chunk
401
  first_chunk_metadata = chunks[0].get("metadata", {})
402
+
403
  if first_chunk_metadata.get("modified_time") != modified_date:
404
  # If modified_time doesn't match, delete existing chunks
405
  vector_store.delete_document(document_id)
406
+ logger.info(
407
+ f"Document {document_id} has been modified, will reprocess")
408
  return True
409
+
410
  logger.info(f"Document {document_id} is up to date, skipping")
411
  return False
412
+
413
  except Exception as e:
414
  logger.error(f"Error checking document status: {str(e)}")
415
  # In case of error, process the document to be safe
 
422
  self.temp_dir.rmdir()
423
  except Exception as e:
424
  logger.error(f"Error cleaning up temp directory: {str(e)}")
425
+ # Don't raise the error as this is a cleanup operation
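Taken together, save_document implements a simple freshness check: a document is (re)processed when it has no chunks in the vector store yet, or when the modified_time recorded on its first chunk differs from the modifiedTime reported by Drive; on a mismatch the stale chunks are deleted first so the re-chunked version fully replaces them. A minimal sketch of that decision in isolation (function name illustrative):

from typing import Optional

def should_reprocess(stored_modified_time: Optional[str],
                     drive_modified_time: str) -> bool:
    """Reprocess when the document is new or its Drive timestamp has changed."""
    if stored_modified_time is None:
        return True  # no chunks stored for this document yet
    return stored_modified_time != drive_modified_time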
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
 
src/vectorstores/chroma_vectorstore.py CHANGED
@@ -97,70 +97,83 @@ class ChromaVectorStore(BaseVectorStore):
97
  top_k: int = 3,
98
  **kwargs
99
  ) -> List[Dict[str, Any]]:
100
- """
101
- Perform similarity search with improved matching
102
- """
103
  try:
104
- # Increase n_results to get more potential matches
105
  results = self.collection.query(
106
  query_embeddings=[query_embedding],
107
- n_results=10, # Get more initial results
108
  include=['documents', 'metadatas', 'distances']
109
  )
110
 
111
- if not results or 'documents' not in results or not results['documents']:
112
- logging.warning("No results found in similarity search")
113
  return []
114
 
115
  formatted_results = []
116
- documents = results['documents'][0] # First query's results
117
- metadatas = results['metadatas'][0] if results.get('metadatas') else [
118
- None] * len(documents)
119
- distances = results['distances'][0] if results.get('distances') else [
120
- None] * len(documents)
121
 
122
- # Process all results
 
123
  for doc, meta, dist in zip(documents, metadatas, distances):
124
- # Convert distance to similarity score (1 is most similar, 0 is least)
125
- similarity_score = 1.0 - \
126
- (dist or 0.0) if dist is not None else None
 
 
 
 
 
 
 
127
 
128
- # More permissive threshold and include all results for filtering
129
- if similarity_score is not None and similarity_score > 0.2: # Lower threshold
130
  formatted_results.append({
131
- 'text': doc,
132
- 'metadata': meta or {},
133
- 'score': similarity_score
134
  })
135
 
136
- # Sort by score and get top_k results
137
- formatted_results.sort(key=lambda x: x['score'] or 0, reverse=True)
138
-
139
- # Check if results are from same document and get consecutive chunks
140
- if formatted_results:
141
- first_doc_id = formatted_results[0]['metadata'].get(
142
- 'document_id')
143
- all_chunks_same_doc = []
144
-
145
- # Get all chunks from the same document
146
- for result in formatted_results:
147
- if result['metadata'].get('document_id') == first_doc_id:
148
- all_chunks_same_doc.append(result)
149
-
150
- # Sort chunks by their index to maintain document flow
151
- all_chunks_same_doc.sort(
152
- key=lambda x: x['metadata'].get('chunk_index', 0)
153
- )
154
-
155
- # Return either all chunks from same document or top_k results
156
- if len(all_chunks_same_doc) > 0:
157
- return all_chunks_same_doc[:top_k]
158
-
159
  return formatted_results[:top_k]
160
 
161
  except Exception as e:
162
- logging.error(
163
- f"Error performing similarity search in ChromaDB: {str(e)}")
164
  raise
165
 
166
  def get_all_documents(
 
97
  top_k: int = 3,
98
  **kwargs
99
  ) -> List[Dict[str, Any]]:
100
+ """Perform similarity search with improved chunk handling"""
 
 
101
  try:
102
+ # Get more initial results to account for sequential chunks
103
  results = self.collection.query(
104
  query_embeddings=[query_embedding],
105
+ n_results=max(top_k * 2, 10),
106
  include=['documents', 'metadatas', 'distances']
107
  )
108
 
109
+ if not results or 'documents' not in results:
 
110
  return []
111
 
112
  formatted_results = []
113
+ documents = results['documents'][0]
114
+ metadatas = results['metadatas'][0]
115
+ distances = results['distances'][0]
 
 
116
 
117
+ # Group chunks by document_id
118
+ doc_chunks = {}
119
  for doc, meta, dist in zip(documents, metadatas, distances):
120
+ doc_id = meta.get('document_id')
121
+ chunk_index = meta.get('chunk_index', 0)
122
+
123
+ if doc_id not in doc_chunks:
124
+ doc_chunks[doc_id] = []
125
+
126
+ doc_chunks[doc_id].append({
127
+ 'text': doc,
128
+ 'metadata': meta,
129
+ 'score': 1.0 - dist,
130
+ 'chunk_index': chunk_index
131
+ })
132
+
133
+ # Process each document's chunks
134
+ for doc_id, chunks in doc_chunks.items():
135
+ # Sort chunks by index
136
+ chunks.sort(key=lambda x: x['chunk_index'])
137
+
138
+ # Find sequences of chunks with good scores
139
+ good_sequences = []
140
+ current_sequence = []
141
+
142
+ for chunk in chunks:
143
+ if chunk['score'] > 0.3: # Adjust threshold as needed
144
+ if not current_sequence or \
145
+ chunk['chunk_index'] == current_sequence[-1]['chunk_index'] + 1:
146
+ current_sequence.append(chunk)
147
+ else:
148
+ if current_sequence:
149
+ good_sequences.append(current_sequence)
150
+ current_sequence = [chunk]
151
+ else:
152
+ if current_sequence:
153
+ good_sequences.append(current_sequence)
154
+ current_sequence = []
155
+
156
+ if current_sequence:
157
+ good_sequences.append(current_sequence)
158
+
159
+ # Add best sequences to results
160
+ for sequence in good_sequences:
161
+ avg_score = sum(c['score']
162
+ for c in sequence) / len(sequence)
163
+ combined_text = ' '.join(c['text'] for c in sequence)
164
 
 
 
165
  formatted_results.append({
166
+ 'text': combined_text,
167
+ 'metadata': sequence[0]['metadata'],
168
+ 'score': avg_score
169
  })
170
 
171
+ # Sort by score and return top_k
172
+ formatted_results.sort(key=lambda x: x['score'], reverse=True)
 
 
 
 
 
 
173
  return formatted_results[:top_k]
174
 
175
  except Exception as e:
176
+ logging.error(f"Error in similarity search: {str(e)}")
 
177
  raise
178
 
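The core of the rewritten search is the grouping step: hits are bucketed by document_id, ordered by chunk_index, and runs of consecutive, sufficiently high-scoring chunks are merged into a single result whose text is the concatenated chunk text and whose score is the average of the member scores. A standalone sketch of the sequence-building step, assuming each hit is a dict with 'chunk_index' and 'score' keys and reusing the 0.3 threshold from above:

from typing import Any, Dict, List

def build_sequences(chunks: List[Dict[str, Any]],
                    threshold: float = 0.3) -> List[List[Dict[str, Any]]]:
    """Group consecutive chunk_index values whose score clears the threshold."""
    sequences, current = [], []
    for chunk in sorted(chunks, key=lambda c: c['chunk_index']):
        if chunk['score'] > threshold and (
                not current or chunk['chunk_index'] == current[-1]['chunk_index'] + 1):
            current.append(chunk)
        else:
            if current:
                sequences.append(current)
            current = [chunk] if chunk['score'] > threshold else []
    if current:
        sequences.append(current)
    return sequences

With overlapping chunks, as introduced in this commit, this tends to return one merged passage per relevant region of a document instead of several near-duplicate fragments.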
179
  def get_all_documents(