TalatMasood committed
Commit: be32fd8 · Parent: aee2bfd

Changes to be committed:


modified: chroma/5c23c332-257c-4409-8a58-767cdd7c3dea/length.bin
modified: chroma/chroma.sqlite3
modified: config/__pycache__/config.cpython-312.pyc
modified: src/__pycache__/main.cpython-312.pyc
modified: src/agents/__pycache__/rag_agent.cpython-312.pyc
modified: src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
modified: src/agents/rag_agent.py
modified: src/agents/system_instructions_rag.py
modified: src/utils/__pycache__/drive_document_processor.cpython-312.pyc
modified: src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc
modified: src/utils/__pycache__/google_drive_service.cpython-312.pyc
modified: src/utils/drive_document_processor.py
modified: src/utils/enhanced_excel_processor.py
modified: src/utils/google_drive_service.py
modified: src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc
modified: src/vectorstores/chroma_vectorstore.py
new file: temp_downloads/17he27jN4louYr1xOYASf4BP2e-tGTICt.xlsx
new file: temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx

Enhanced support for Excel sheets

config/__pycache__/config.cpython-312.pyc CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
 
src/__pycache__/main.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
 
src/agents/__pycache__/rag_agent.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
 
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
 
src/agents/rag_agent.py CHANGED
@@ -45,85 +45,156 @@ class RAGAgent(ExcelAwareRAGAgent):
     async def generate_response(
         self,
         query: str,
-        conversation_id: Optional[str] = None,
-        temperature: float = 0.7,
+        conversation_id: Optional[str],
+        temperature: float,
         max_tokens: Optional[int] = None,
         context_docs: Optional[List[str]] = None
     ) -> RAGResponse:
-        """Generate a response using RAG with conversation history"""
+        """Generate response with specific handling for different query types"""
         try:
-            # Create new conversation if no ID provided
-            if not conversation_id:
-                conversation_id = str(uuid.uuid4())
-                await self.mongodb.create_conversation(conversation_id)
-
-            # Get conversation history
-            history = await self.mongodb.get_recent_messages(
-                conversation_id,
-                limit=self.conversation_manager.max_messages
-            )
-
-            # Get relevant history within token limits
-            relevant_history = self.conversation_manager.get_relevant_history(
-                messages=history,
-                current_query=query
-            ) if history else []
+            # First, check if this is an introduction/welcome message query
+            is_introduction = (
+                "wants support" in query and
+                "This is Introduction" in query and
+                ("A new user with name:" in query or "An old user with name:" in query)
+            )
+
+            if is_introduction:
+                # Handle introduction message - no context needed
+                welcome_message = self._handle_contact_query(query)
+                return RAGResponse(
+                    response=welcome_message,
+                    context_docs=[],
+                    sources=[],
+                    scores=None
+                )
+
+            # Get conversation history if conversation_id exists
+            history = []
+            if conversation_id:
+                history = await self.mongodb.get_recent_messages(
+                    conversation_id,
+                    limit=self.conversation_manager.max_messages
+                )
+
+            # Get relevant history within token limits
+            history = self.conversation_manager.get_relevant_history(
+                messages=history,
+                current_query=query
+            )
 
             # Retrieve context if not provided
             if not context_docs:
                 context_docs, sources, scores = await self.retrieve_context(
-                    query,
-                    conversation_history=relevant_history
+                    query=query,
+                    conversation_history=history
                 )
             else:
                 sources = None
                 scores = None
 
-            # Check if this is an Excel-related query and enhance context if needed
-            has_excel_content = any('Sheet:' in doc for doc in (context_docs or []))
+            # Check if we have any relevant context
+            if not context_docs:
+                return RAGResponse(
+                    response="Information about this is not available, do you want to inquire about something else?",
+                    context_docs=[],
+                    sources=[],
+                    scores=None
+                )
+
+            # Check if this is an Excel-related query
+            has_excel_content = any('Sheet:' in doc for doc in context_docs)
             if has_excel_content:
                 try:
                     context_docs = self._process_excel_context(context_docs, query)
                 except Exception as e:
                     logger.warning(f"Error processing Excel context: {str(e)}")
-                    # Continue with original context if Excel processing fails
 
             # Generate prompt with context and history
             augmented_prompt = self.conversation_manager.generate_prompt_with_history(
                 current_query=query,
-                history=relevant_history,
+                history=history,
                 context_docs=context_docs
             )
 
-            # Generate initial response using LLM
+            # Generate initial response
             response = self.llm.generate(
-                augmented_prompt,
+                prompt=augmented_prompt,
                 temperature=temperature,
                 max_tokens=max_tokens
             )
 
-            # Enhance response for Excel queries if applicable
+            # Clean the response
+            cleaned_response = self._clean_response(response)
+
+            # For Excel queries, enhance the response
             if has_excel_content:
                 try:
-                    response = await self.enhance_excel_response(
+                    enhanced_response = await self.enhance_excel_response(
                         query=query,
-                        response=response,
+                        response=cleaned_response,
                         context_docs=context_docs
                     )
+                    if enhanced_response:
+                        cleaned_response = enhanced_response
                 except Exception as e:
                     logger.warning(f"Error enhancing Excel response: {str(e)}")
-                    # Continue with original response if enhancement fails
 
+            # Return the final response
             return RAGResponse(
-                response=response,
+                response=cleaned_response,
                 context_docs=context_docs,
                 sources=sources,
                 scores=scores
             )
 
         except Exception as e:
-            logger.error(f"Error generating response: {str(e)}")
+            logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
             raise
+
+    def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
+        """
+        Create prompt for generating response from context
+
+        Args:
+            query (str): User query
+            context_docs (List[str]): Retrieved context documents
+
+        Returns:
+            str: Formatted prompt for the LLM
+        """
+        if not context_docs:
+            return f"Query: {query}\nResponse: Information about this is not available, do you want to inquire about something else?"
+
+        # Format context documents
+        formatted_context = "\n\n".join(
+            f"Context {i+1}:\n{doc.strip()}"
+            for i, doc in enumerate(context_docs)
+            if doc and doc.strip()
+        )
+
+        # Build the prompt with detailed instructions
+        prompt = f"""You are a knowledgeable assistant. Use the following context to answer the query accurately and informatively.
+
+Context Information:
+{formatted_context}
+
+Query: {query}
+
+Instructions:
+1. Base your response ONLY on the information provided in the context above
+2. If the context contains numbers, statistics, or specific details, include them in your response
+3. Keep your response focused and relevant to the query
+4. Use clear and professional language
+5. If the context includes technical terms, explain them appropriately
+6. Do not make assumptions or add information not present in the context
+7. If specific sections of a report are mentioned, maintain their original structure
+8. Format the response in a clear, readable manner
+9. If the context includes chronological information, maintain the proper sequence
+
+Response:"""
+
+        return prompt
 
     async def retrieve_context(
         self,
@@ -133,15 +204,6 @@ class RAGAgent(ExcelAwareRAGAgent):
     ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
         """
         Retrieve context with conversation history enhancement
-
-        Args:
-            query (str): Current query
-            conversation_history (Optional[List[Dict]]): Recent conversation history
-            top_k (int): Number of documents to retrieve
-
-        Returns:
-            Tuple[List[str], List[Dict], Optional[List[float]]]:
-                Retrieved documents, sources, and scores
         """
         # Enhance query with conversation history
         if conversation_history:
@@ -153,8 +215,14 @@ class RAGAgent(ExcelAwareRAGAgent):
         else:
             enhanced_query = query
 
+        # Debug log the enhanced query
+        logger.info(f"Enhanced query: {enhanced_query}")
+
         # Embed the enhanced query
         query_embedding = self.embedding.embed_query(enhanced_query)
+
+        # Debug log embedding shape
+        logger.info(f"Query embedding shape: {len(query_embedding)}")
 
         # Retrieve similar documents
         results = self.vector_store.similarity_search(
@@ -162,6 +230,12 @@ class RAGAgent(ExcelAwareRAGAgent):
             top_k=top_k
         )
 
+        # Debug log search results
+        logger.info(f"Number of search results: {len(results)}")
+        for i, result in enumerate(results):
+            logger.info(f"Result {i} score: {result.get('score', 'N/A')}")
+            logger.info(f"Result {i} text preview: {result.get('text', '')[:100]}...")
+
         # Process results
         documents = [doc['text'] for doc in results]
         sources = [self._convert_metadata_to_strings(doc['metadata'])
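
Note: the introduction check added to generate_response is plain substring matching. A minimal standalone sketch of the same predicate (the matched substrings come from the diff; the sample queries are hypothetical):

    # Standalone sketch of the introduction-detection predicate above.
    def is_introduction(query: str) -> bool:
        return (
            "wants support" in query
            and "This is Introduction" in query
            and ("A new user with name:" in query or "An old user with name:" in query)
        )

    if __name__ == "__main__":
        samples = [
            "A new user with name: Ali wants support. This is Introduction",
            "What were Q3 sales?",
        ]
        for q in samples:
            print(q, "->", is_introduction(q))  # True, then False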
src/agents/system_instructions_rag.py CHANGED
@@ -103,19 +103,36 @@ class SystemInstructionsRAGAgent(RAGAgent):
         if not context_docs:
             return False
 
-        # Extract key terms from query
+        # Extract key terms from query (keeping important words)
         query_words = query.lower().split()
-        stop_words = {'share', 'me', 'a', 'about', 'information', 'what', 'is', 'are', 'the', 'in', 'how', 'why', 'when', 'where'}
+        stop_words = {'me', 'a', 'about', 'what', 'is', 'are', 'the', 'in', 'how', 'why', 'when', 'where'}
+
+        # Remove only basic stop words, keep important terms like "report", "share", etc.
         query_terms = {word for word in query_words if word not in stop_words}
 
+        # Add additional relevant terms that might appear in the content
+        related_terms = {
+            'comprehensive',
+            'report',
+            'overview',
+            'summary',
+            'details',
+            'information'
+        }
+        query_terms.update(word for word in query_words if word in related_terms)
+
         # Check each context document for relevance
         for doc in context_docs:
             if not doc:
                 continue
             doc_lower = doc.lower()
-            if any(term in doc_lower for term in query_terms):
-                # Found relevant content
+
+            # Consider document relevant if it contains any query terms
+            # or if it starts with common report headers
+            if any(term in doc_lower for term in query_terms) or \
+               any(header in doc_lower for header in ['overview', 'comprehensive report', 'summary']):
                 return True
+
         return False
 
     def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
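
Note: the relaxed relevance check above is a term-overlap heuristic. A minimal sketch of the same logic, runnable outside the agent class (the sample document is hypothetical):

    # Sketch of the relaxed relevance check introduced in this file.
    STOP_WORDS = {'me', 'a', 'about', 'what', 'is', 'are', 'the', 'in', 'how', 'why', 'when', 'where'}
    REPORT_HEADERS = ['overview', 'comprehensive report', 'summary']

    def has_relevant_context(query: str, context_docs: list) -> bool:
        query_terms = {w for w in query.lower().split() if w not in STOP_WORDS}
        for doc in context_docs:
            if not doc:
                continue
            doc_lower = doc.lower()
            if any(t in doc_lower for t in query_terms) or \
               any(h in doc_lower for h in REPORT_HEADERS):
                return True
        return False

    print(has_relevant_context("share the report", ["Comprehensive Report: 2024 results"]))  # True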
src/utils/__pycache__/drive_document_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/drive_document_processor.cpython-312.pyc and b/src/utils/__pycache__/drive_document_processor.cpython-312.pyc differ
 
src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc and b/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc differ
 
src/utils/__pycache__/google_drive_service.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/google_drive_service.cpython-312.pyc and b/src/utils/__pycache__/google_drive_service.cpython-312.pyc differ
 
src/utils/drive_document_processor.py CHANGED
@@ -37,7 +37,7 @@ class DriveDocumentProcessor:
         # Define supported MIME types
         self.supported_mime_types = {
             # Google Docs
-            'application/vnd.google-apps.document': '.docx',  # Export Google Docs as DOCX
+            'application/vnd.google-apps.document': '.docx',
 
             # Microsoft Word Documents
             'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
@@ -60,33 +60,45 @@
             'application/pdf': '.pdf'
         }
 
-        # Define export MIME types for Google Docs formats
         self.google_docs_export_types = {
             'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
         }
 
     async def process_documents(
         self,
-        vector_store: ChromaVectorStore
+        vector_store: ChromaVectorStore,
+        include_subfolders: bool = True  # New parameter with default True for backward compatibility
     ) -> Dict[str, Any]:
         """
         Process all documents in the specified Drive folder
 
         Args:
             vector_store (ChromaVectorStore): Vector store instance
+            include_subfolders (bool): Whether to process documents in subfolders
 
         Returns:
             Dict[str, Any]: Processing results
         """
         try:
             # Get documents from folder
-            files = self.google_drive_service.get_folder_contents(self.folder_id)
+            files = self.google_drive_service.get_folder_contents(
+                self.folder_id,
+                include_subfolders=include_subfolders
+            )
 
             processed_files = []
             skipped_files = []
             errors = []
 
             for file in files:
+                # Skip if it's a folder
+                if file.get('mimeType') == 'application/vnd.google-apps.folder':
+                    continue
+
+                # Get file path (including folder structure if available)
+                file_path = self._get_file_path(file)
+                file['display_path'] = file_path
+
                 result = await self._process_single_file(file, vector_store)
 
                 if result['status'] == 'processed':
@@ -122,13 +134,31 @@
                 detail=f"Failed to process drive documents: {str(e)}"
             )
 
+    def _get_file_path(self, file: Dict[str, Any]) -> str:
+        """
+        Get the full path for a file including its folder structure
+
+        Args:
+            file (Dict[str, Any]): File metadata
+
+        Returns:
+            str: Display path of the file
+        """
+        path_parts = [file['name']]
+
+        # Add folder path if available (new structure)
+        if folder_path := file.get('folder_path', []):
+            for folder in reversed(folder_path):
+                path_parts.insert(0, folder['name'])
+
+        return '/'.join(path_parts)
+
     async def _process_single_file(
         self,
         file: Dict[str, Any],
         vector_store: ChromaVectorStore
     ) -> Dict[str, Any]:
         """Process a single Drive file"""
-
         mime_type = file.get('mimeType', '')
 
         # Skip if mime type not supported
@@ -137,13 +167,14 @@
                 'status': 'skipped',
                 'data': {
                     'name': file['name'],
+                    'path': file.get('display_path', file['name']),
                     'reason': f'Unsupported mime type: {mime_type}'
                 }
             }
 
         try:
             document_id = file['id']
-            modified_time = file.get('modifiedTime', 'N/A')  # Get last modified time
+            modified_time = file.get('modifiedTime', 'N/A')
 
             # Check if document should be processed
             if self.save_document(document_id, vector_store, modified_time):
@@ -159,7 +190,7 @@
                     str(temp_file_path)
                 )
 
-                # Add to vector store
+                # Add to vector store with path information
                 self._add_to_vector_store(
                     processed_doc['chunks'],
                     file,
@@ -171,6 +202,7 @@
                     'status': 'processed',
                     'data': {
                         'name': file['name'],
+                        'path': file.get('display_path', file['name']),
                         'id': file['id'],
                         'chunks_processed': len(processed_doc['chunks'])
                     }
@@ -181,11 +213,11 @@
                 if temp_file_path.exists():
                     temp_file_path.unlink()
             else:
-                # Return skipped status if document already exists and is up to date
                 return {
                     'status': 'skipped',
                     'data': {
                         'name': file['name'],
+                        'path': file.get('display_path', file['name']),
                         'reason': 'Document already exists in the memory.'
                     }
                 }
@@ -196,46 +228,10 @@
                 'status': 'error',
                 'data': {
                     'file_name': file['name'],
+                    'path': file.get('display_path', file['name']),
                     'error': str(e)
                 }
             }
-
-        except Exception as e:
-            logger.error(f"Error processing file {file['name']}: {str(e)}")
-            return {
-                'status': 'error',
-                'data': {
-                    'file_name': file['name'],
-                    'error': str(e)
-                }
-            }
-
-    async def _download_and_save_file(
-        self,
-        file_id: str,
-        mime_type: str
-    ) -> Path:
-        """Download and save file to temporary location"""
-        extension = self.supported_mime_types[mime_type]
-        temp_file_path = self.temp_dir / f"{file_id}{extension}"
-
-        if mime_type in self.google_docs_export_types:
-            # Download Google Doc in the specified export format
-            content = self.google_drive_service.export_file(
-                file_id,
-                self.google_docs_export_types[mime_type]
-            )
-        else:
-            # Download regular file
-            content = self.google_drive_service.download_file(file_id)
-
-        with open(temp_file_path, 'wb') as f:
-            if isinstance(content, str):
-                f.write(content.encode('utf-8'))
-            else:
-                f.write(content)
-
-        return temp_file_path
 
     def _add_to_vector_store(
         self,
@@ -244,20 +240,18 @@
         mime_type: str,
         vector_store: ChromaVectorStore
     ) -> None:
-        """Add processed chunks to vector store"""
+        """Add processed chunks to vector store with path information"""
        chunk_metadatas = []
        chunk_ids = []
 
-        # document_id = file['id']
-        modified_time = file.get('modifiedTime', 'N/A')  # Get last modified time
-        # self.delete_updated_document(document_id, vector_store, modified_time)
-
+        modified_time = file.get('modifiedTime', 'N/A')
+        file_path = file.get('display_path', file['name'])
 
        for i, chunk in enumerate(chunks):
            chunk_id = f"{file['id']}-chunk-{i}"
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
-                "source": file['name'],
+                "source": file_path,  # Use full path instead of just name
                "document_id": file['id'],
                "chunk_index": i,
                "mime_type": mime_type,
@@ -272,44 +266,81 @@
             metadatas=chunk_metadatas,
             ids=chunk_ids
         )
-
-    def save_document(self, document_id: str, vector_store: ChromaVectorStore, modified_date: str) -> bool:
-        """
-        Deletes all chunks of a document if the modified_time does not match the given modified_date.
 
+    async def _download_and_save_file(
+        self,
+        file_id: str,
+        mime_type: str
+    ) -> Path:
+        """Download and save file to temporary location"""
+        extension = self.supported_mime_types[mime_type]
+        temp_file_path = self.temp_dir / f"{file_id}{extension}"
+
+        if mime_type in self.google_docs_export_types:
+            # Download Google Doc in the specified export format
+            content = self.google_drive_service.export_file(
+                file_id,
+                self.google_docs_export_types[mime_type]
+            )
+        else:
+            # Download regular file
+            content = self.google_drive_service.download_file(file_id)
+
+        with open(temp_file_path, 'wb') as f:
+            if isinstance(content, str):
+                f.write(content.encode('utf-8'))
+            else:
+                f.write(content)
+
+        return temp_file_path
+
+    def save_document(
+        self,
+        document_id: str,
+        vector_store: ChromaVectorStore,
+        modified_date: str
+    ) -> bool:
+        """
+        Check if document needs to be processed based on modification date
+
         Args:
-            document_id (str): The ID of the document.
-            vector_store (ChromaVectorStore): The Chroma vector store instance.
-            modified_date (str): The expected modification date.
+            document_id (str): ID of the document to check
+            vector_store (ChromaVectorStore): Vector store instance
+            modified_date (str): Modified date to compare against
+
+        Returns:
+            bool: True if document should be processed, False otherwise
         """
         try:
             # Retrieve all chunks for the given document_id
             chunks = vector_store.get_document_chunks(document_id)
 
             if not chunks:
-                logging.warning(f"No chunks found for document_id: {document_id}. Nothing to delete.")
+                # Document doesn't exist in vector store
                 return True
 
             # Check the modified_time of the first chunk
             first_chunk_metadata = chunks[0].get("metadata", {})
 
             if first_chunk_metadata.get("modified_time") != modified_date:
-                # If modified_time doesn't match, delete all chunks
+                # If modified_time doesn't match, delete existing chunks
                 vector_store.delete_document(document_id)
-                logging.info(f"Deleted all chunks for document_id: {document_id} due to modified_time mismatch.")
+                logger.info(f"Document {document_id} has been modified, will reprocess")
                 return True
-            else:
-                logging.info(f"No deletion needed for document_id: {document_id}, modified_time is unchanged.")
-                return False
-
+
+            logger.info(f"Document {document_id} is up to date, skipping")
+            return False
+
         except Exception as e:
-            logging.error(f"Error while deleting chunks for document_id {document_id}: {str(e)}")
+            logger.error(f"Error checking document status: {str(e)}")
+            # In case of error, process the document to be safe
             return True
 
-
-
     def _cleanup_temp_dir(self) -> None:
         """Clean up temporary directory if empty"""
-        if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
-            self.temp_dir.rmdir()
+        try:
+            if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
+                self.temp_dir.rmdir()
+        except Exception as e:
+            logger.error(f"Error cleaning up temp directory: {str(e)}")
+            # Don't raise the error as this is a cleanup operation
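
Note: the new _get_file_path relies on the folder_path list that get_folder_contents now attaches to each file (ordered root to leaf). A minimal sketch of the same path assembly, with a hypothetical file dict mirroring that structure:

    # Sketch of display-path assembly from folder_path metadata.
    def get_file_path(file: dict) -> str:
        path_parts = [file['name']]
        for folder in reversed(file.get('folder_path', [])):
            path_parts.insert(0, folder['name'])
        return '/'.join(path_parts)

    file = {
        'name': 'budget.xlsx',
        'folder_path': [
            {'id': '1', 'name': 'Finance'},
            {'id': '2', 'name': '2024'},
        ],
    }
    print(get_file_path(file))  # Finance/2024/budget.xlsx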
src/utils/enhanced_excel_processor.py CHANGED
@@ -2,7 +2,9 @@ from typing import Dict, List, Any, Optional
 import pandas as pd
 import numpy as np
 from pathlib import Path
-import json
+import logging
+from openpyxl import load_workbook
+from openpyxl.utils.cell import get_column_letter
 
 class EnhancedExcelProcessor:
     def __init__(self):
@@ -13,7 +15,7 @@ class EnhancedExcelProcessor:
 
     def process_excel(self, file_path: Path) -> str:
         """
-        Process Excel file with enhanced multi-sheet handling
+        Process Excel file with enhanced data extraction
 
         Args:
             file_path (Path): Path to Excel file
@@ -21,97 +23,146 @@ class EnhancedExcelProcessor:
         Returns:
             str: Structured text representation of Excel content
         """
-        # Read all sheets
+        # Read all sheets with improved handling
         excel_file = pd.ExcelFile(file_path)
         sheets_data = {}
 
+        # Load workbook for additional metadata
+        workbook = load_workbook(file_path, data_only=True)
+
         for sheet_name in excel_file.sheet_names:
-            df = pd.read_excel(excel_file, sheet_name=sheet_name)
+            # Read with pandas for data structure
+            df = pd.read_excel(
+                excel_file,
+                sheet_name=sheet_name,
+                header=None  # Read without assuming header to capture all data
+            )
+
+            # Clean column names
+            if df.iloc[0].notna().any():  # If first row has any data
+                df.columns = [f"Column_{i}" if pd.isna(x) else str(x).strip()
+                              for i, x in enumerate(df.iloc[0])]
+                df = df.iloc[1:]  # Remove header row from data
+
             sheets_data[sheet_name] = df
 
-            # Generate sheet summary
-            self.sheet_summaries[sheet_name] = self._generate_sheet_summary(df)
+            # Generate enhanced sheet summary
+            self.sheet_summaries[sheet_name] = self._generate_enhanced_sheet_summary(
+                df,
+                workbook[sheet_name]
+            )
 
-            # Extract sheet metadata
-            self.sheet_metadata[sheet_name] = {
-                'columns': list(df.columns),
-                'rows': len(df),
-                'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
-                'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
-                'categorical_columns': df.select_dtypes(include=['object']).columns.tolist()
-            }
+            # Extract enhanced sheet metadata
+            self.sheet_metadata[sheet_name] = self._extract_enhanced_metadata(
+                df,
+                workbook[sheet_name]
+            )
 
         # Detect relationships between sheets
         self.relationships = self._detect_relationships(sheets_data)
 
         # Generate structured text representation
-        return self._generate_structured_text(sheets_data)
+        return self._generate_enhanced_structured_text(sheets_data, workbook)
 
-    def _generate_sheet_summary(self, df: pd.DataFrame) -> Dict:
-        """Generate statistical summary for a sheet"""
+    def _generate_enhanced_sheet_summary(self, df: pd.DataFrame, ws) -> Dict:
+        """Generate comprehensive statistical summary for a sheet"""
         summary = {
             'total_rows': len(df),
             'total_columns': len(df.columns),
             'column_types': {},
             'numeric_summaries': {},
             'categorical_summaries': {},
-            'null_counts': df.isnull().sum().to_dict()
+            'null_counts': df.isnull().sum().to_dict(),
+            'merged_cells': self._get_merged_cells_info(ws),
+            'formulas': self._get_formulas_info(ws)
         }
 
-        # Process numeric columns
+        # Process numeric columns with enhanced detection
         numeric_cols = df.select_dtypes(include=[np.number]).columns
         for col in numeric_cols:
+            col_data = pd.to_numeric(df[col], errors='coerce')
             summary['numeric_summaries'][col] = {
-                'mean': float(df[col].mean()),
-                'median': float(df[col].median()),
-                'std': float(df[col].std()),
-                'min': float(df[col].min()),
-                'max': float(df[col].max())
+                'mean': float(col_data.mean()) if not col_data.empty else None,
+                'median': float(col_data.median()) if not col_data.empty else None,
+                'std': float(col_data.std()) if not col_data.empty else None,
+                'min': float(col_data.min()) if not col_data.empty else None,
+                'max': float(col_data.max()) if not col_data.empty else None,
+                'sum': float(col_data.sum()) if not col_data.empty else None
             }
             summary['column_types'][col] = 'numeric'
 
-        # Process categorical columns
+        # Process categorical and text columns with enhanced analysis
         categorical_cols = df.select_dtypes(include=['object']).columns
         for col in categorical_cols:
-            value_counts = df[col].value_counts()
-            summary['categorical_summaries'][col] = {
-                'unique_values': int(len(value_counts)),
-                'top_values': value_counts.head(5).to_dict()
-            }
+            # Clean and process values
+            values = df[col].astype(str).replace('nan', pd.NA).dropna()
+            if not values.empty:
+                value_counts = values.value_counts()
+                summary['categorical_summaries'][col] = {
+                    'unique_values': int(len(value_counts)),
+                    'top_values': value_counts.head(5).to_dict(),
+                    'contains_currency': self._detect_currency(values),
+                    'contains_dates': self._detect_dates(values)
+                }
             summary['column_types'][col] = 'categorical'
 
         return summary
 
-    def _detect_relationships(self, sheets_data: Dict[str, pd.DataFrame]) -> Dict:
-        """Detect potential relationships between sheets"""
-        relationships = {}
-        sheet_names = list(sheets_data.keys())
-
-        for i, sheet1 in enumerate(sheet_names):
-            for sheet2 in sheet_names[i+1:]:
-                common_cols = set(sheets_data[sheet1].columns) & set(sheets_data[sheet2].columns)
-                if common_cols:
-                    relationships[f"{sheet1}__{sheet2}"] = {
-                        'common_columns': list(common_cols),
-                        'type': 'potential_join'
-                    }
-
-                # Check for foreign key relationships
-                for col1 in sheets_data[sheet1].columns:
-                    for col2 in sheets_data[sheet2].columns:
-                        if (col1.lower().endswith('_id') or col2.lower().endswith('_id')):
-                            unique_vals1 = set(sheets_data[sheet1][col1].dropna())
-                            unique_vals2 = set(sheets_data[sheet2][col2].dropna())
-                            if unique_vals1 & unique_vals2:
-                                relationships[f"{sheet1}__{sheet2}__{col1}__{col2}"] = {
-                                    'type': 'foreign_key',
-                                    'columns': [col1, col2]
-                                }
-
-        return relationships
+    def _extract_enhanced_metadata(self, df: pd.DataFrame, ws) -> Dict:
+        """Extract comprehensive metadata including Excel-specific features"""
+        metadata = {
+            'columns': list(df.columns),
+            'rows': len(df),
+            'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
+            'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
+            'categorical_columns': df.select_dtypes(include=['object']).columns.tolist(),
+            'column_widths': {get_column_letter(i+1): ws.column_dimensions[get_column_letter(i+1)].width
+                              for i in range(len(df.columns))
+                              if get_column_letter(i+1) in ws.column_dimensions},
+            'hidden_rows': [idx for idx in range(1, ws.max_row + 1) if ws.row_dimensions[idx].hidden],
+            'hidden_columns': [get_column_letter(idx) for idx in range(1, ws.max_column + 1)
+                               if ws.column_dimensions[get_column_letter(idx)].hidden],
+            'has_charts': bool(ws._charts),
+            'has_images': bool(ws._images),
+            'frozen_panes': ws.freeze_panes is not None
+        }
+        return metadata
+
+    def _get_merged_cells_info(self, ws) -> List[Dict]:
+        """Extract information about merged cells"""
+        merged_cells = []
+        for merged_range in ws.merged_cells.ranges:
+            merged_cells.append({
+                'range': str(merged_range),
+                'start_cell': merged_range.start_cell.coordinate,
+                'end_cell': merged_range.end_cell.coordinate
+            })
+        return merged_cells
 
-    def _generate_structured_text(self, sheets_data: Dict[str, pd.DataFrame]) -> str:
-        """Generate structured text representation of Excel content"""
+    def _get_formulas_info(self, ws) -> Dict[str, str]:
+        """Extract formulas from the worksheet"""
+        formulas = {}
+        for row in ws.iter_rows():
+            for cell in row:
+                if cell.formula:
+                    formulas[cell.coordinate] = cell.formula
+        return formulas
+
+    def _detect_currency(self, series: pd.Series) -> bool:
+        """Detect if a series contains currency values"""
+        currency_patterns = ['$', '€', '£', '¥']
+        return any(series.astype(str).str.contains('|'.join(currency_patterns)).any())
+
+    def _detect_dates(self, series: pd.Series) -> bool:
+        """Detect if a series contains date values"""
+        try:
+            pd.to_datetime(series, errors='raise')
+            return True
+        except:
+            return False
+
+    def _generate_enhanced_structured_text(self, sheets_data: Dict[str, pd.DataFrame], workbook) -> str:
+        """Generate detailed structured text representation of Excel content"""
         output_parts = []
 
         # Overall summary
@@ -130,36 +181,59 @@ class EnhancedExcelProcessor:
             # Basic info
             output_parts.append(f"Rows: {metadata['rows']}")
             output_parts.append(f"Columns: {', '.join(metadata['columns'])}")
-            output_parts.append("")
 
-            # Column summaries
+            # Add information about hidden elements
+            if metadata['hidden_rows']:
+                output_parts.append(f"Hidden Rows: {len(metadata['hidden_rows'])}")
+            if metadata['hidden_columns']:
+                output_parts.append(f"Hidden Columns: {len(metadata['hidden_columns'])}")
+
+            # Add information about merged cells
+            if summary['merged_cells']:
+                output_parts.append("\nMerged Cells:")
+                for merge_info in summary['merged_cells'][:5]:  # Show first 5 merged ranges
+                    output_parts.append(f"  - Range: {merge_info['range']}")
+
+            # Numeric columns summary
             if metadata['numeric_columns']:
-                output_parts.append("Numeric Columns Summary:")
+                output_parts.append("\nNumeric Columns Summary:")
                 for col in metadata['numeric_columns']:
                     stats = summary['numeric_summaries'][col]
                     output_parts.append(f"  {col}:")
                     output_parts.append(f"    Range: {stats['min']} to {stats['max']}")
                     output_parts.append(f"    Average: {stats['mean']:.2f}")
-                output_parts.append("")
+                    output_parts.append(f"    Sum: {stats['sum']:.2f}")
 
+            # Categorical columns summary
             if metadata['categorical_columns']:
-                output_parts.append("Categorical Columns Summary:")
+                output_parts.append("\nCategorical Columns Summary:")
                 for col in metadata['categorical_columns']:
-                    cats = summary['categorical_summaries'][col]
-                    output_parts.append(f"  {col}:")
-                    output_parts.append(f"    Unique Values: {cats['unique_values']}")
-                    if cats['top_values']:
-                        output_parts.append("    Top Values: " +
-                            ", ".join(f"{k} ({v})" for k, v in
-                                list(cats['top_values'].items())[:3]))
-                output_parts.append("")
+                    if col in summary['categorical_summaries']:
+                        cats = summary['categorical_summaries'][col]
+                        output_parts.append(f"  {col}:")
+                        output_parts.append(f"    Unique Values: {cats['unique_values']}")
+                        if cats['top_values']:
+                            output_parts.append("    Top Values: " +
+                                ", ".join(f"{k} ({v})" for k, v in
+                                    list(cats['top_values'].items())[:3]))
+                        if cats['contains_currency']:
+                            output_parts.append("    Contains Currency Values")
+                        if cats['contains_dates']:
+                            output_parts.append("    Contains Date Values")
+
+            # Add formula information
+            if summary['formulas']:
+                output_parts.append("\nFormulas Present:")
+                for cell, formula in list(summary['formulas'].items())[:5]:  # Show first 5 formulas
+                    output_parts.append(f"  {cell}: {formula}")
 
-            # Sample data
-            output_parts.append("Sample Data:")
-            output_parts.append(df.head(3).to_string())
+            # Sample data with improved formatting
+            output_parts.append("\nSample Data:")
+            sample_data = df.head(5).fillna("").to_string(index=False)
+            output_parts.append(sample_data)
             output_parts.append("\n")
 
-            # Relationships
+            # Sheet relationships
             if self.relationships:
                 output_parts.append("Sheet Relationships:")
                 for rel_key, rel_info in self.relationships.items():
@@ -173,7 +247,7 @@ class EnhancedExcelProcessor:
                             f"{parts[0]}.{parts[2]} and {parts[1]}.{parts[3]}")
 
         return "\n".join(output_parts)
-
+
     def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
         """Get summary for a specific sheet"""
         return self.sheet_summaries.get(sheet_name)
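
Note: two of the new helpers rely on behavior worth flagging. openpyxl cells have no .formula attribute (formula text is exposed as a string cell value starting with '=' only when the workbook is loaded with data_only=False, whereas this commit loads with data_only=True), and str.contains('|'.join(['$', ...])) compiles '$' as a regex end-of-string anchor, so every non-empty value matches; the outer any() over a scalar boolean also raises a TypeError. Corrected sketches of both helpers, under those assumptions:

    import pandas as pd

    def get_formulas_info(ws) -> dict:
        # Requires load_workbook(path, data_only=False) so formula strings survive.
        return {
            cell.coordinate: cell.value
            for row in ws.iter_rows()
            for cell in row
            if isinstance(cell.value, str) and cell.value.startswith('=')
        }

    def detect_currency(series: pd.Series) -> bool:
        # Literal (non-regex) matching avoids '$' acting as a regex anchor.
        text = series.astype(str)
        return any(text.str.contains(sym, regex=False).any() for sym in ['$', '€', '£', '¥'])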
src/utils/google_drive_service.py CHANGED
@@ -3,7 +3,8 @@ from google.oauth2 import service_account
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaIoBaseDownload
 import io
-import os
+from typing import List, Dict, Any
+import logging
 
 class GoogleDriveService:
     def __init__(self, credentials_path: str):
@@ -19,24 +20,61 @@ class GoogleDriveService:
         )
         self.service = build('drive', 'v3', credentials=self.credentials)
 
-    def get_folder_contents(self, folder_id: str):
+    def get_folder_contents(self, folder_id: str, include_subfolders: bool = False) -> List[Dict[str, Any]]:
         """
-        Get contents of a Drive folder
+        Get contents of a Drive folder including subfolders if specified
 
         Args:
             folder_id (str): ID of the folder to process
+            include_subfolders (bool): Whether to include contents of subfolders (default: False)
 
         Returns:
             List[Dict]: List of file metadata
         """
-        query = f"'{folder_id}' in parents and trashed=false"
-        results = self.service.files().list(
-            q=query,
-            fields="files(id, name, mimeType, modifiedTime)",
-            supportsAllDrives=True,
-            includeItemsFromAllDrives=True
-        ).execute()
-        return results.get('files', [])
+        all_files = []
+        try:
+            # Get all items in the current folder
+            query = f"'{folder_id}' in parents and trashed=false"
+            results = self.service.files().list(
+                q=query,
+                fields="files(id, name, mimeType, modifiedTime, parents)",
+                supportsAllDrives=True,
+                includeItemsFromAllDrives=True
+            ).execute()
+
+            items = results.get('files', [])
+
+            for item in items:
+                if item['mimeType'] == 'application/vnd.google-apps.folder' and include_subfolders:
+                    # Recursively get contents of subfolder
+                    try:
+                        subfolder_files = self.get_folder_contents(
+                            item['id'],
+                            include_subfolders=True
+                        )
+                        # Add folder path information to each file
+                        for file in subfolder_files:
+                            if not file.get('folder_path'):
+                                file['folder_path'] = []
+                            file['folder_path'].insert(0, {
+                                'id': item['id'],
+                                'name': item['name']
+                            })
+                        all_files.extend(subfolder_files)
+                    except Exception as e:
+                        logging.error(f"Error processing subfolder {item['name']}: {str(e)}")
+                        continue
+                else:
+                    # For backward compatibility, maintain original structure
+                    # but add folder path information
+                    item['folder_path'] = []
+                    all_files.append(item)
+
+            return all_files
+
+        except Exception as e:
+            logging.error(f"Error getting folder contents for folder {folder_id}: {str(e)}")
+            return []  # Return empty list for backward compatibility
 
     def download_file(self, file_id: str) -> bytes:
         """
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
 
src/vectorstores/chroma_vectorstore.py CHANGED
@@ -93,42 +93,62 @@ class ChromaVectorStore(BaseVectorStore):
         **kwargs
     ) -> List[Dict[str, Any]]:
         """
-        Perform similarity search
-
-        Args:
-            query_embedding (List[float]): Embedding of the query
-            top_k (int): Number of top similar documents to retrieve
-            **kwargs: Additional search parameters
-
-        Returns:
-            List[Dict[str, Any]]: List of documents with their text, metadata, and scores
+        Perform similarity search with improved matching
         """
         try:
+            # Increase n_results to get more potential matches
             results = self.collection.query(
                 query_embeddings=[query_embedding],
-                n_results=top_k,
+                n_results=10,  # Get more initial results
                 include=['documents', 'metadatas', 'distances']
             )
 
-            # Handle the case where no results are found
             if not results or 'documents' not in results or not results['documents']:
+                logging.warning("No results found in similarity search")
                 return []
 
-            # Format results to include text, metadata, and scores
             formatted_results = []
             documents = results['documents'][0]  # First query's results
             metadatas = results['metadatas'][0] if results.get('metadatas') else [None] * len(documents)
             distances = results['distances'][0] if results.get('distances') else [None] * len(documents)
 
+            # Process all results
             for doc, meta, dist in zip(documents, metadatas, distances):
-                formatted_results.append({
-                    'text': doc,
-                    'metadata': meta or {},
-                    'score': 1.0 - (dist or 0.0) if dist is not None else None  # Convert distance to similarity score
-                })
+                # Convert distance to similarity score (1 is most similar, 0 is least)
+                similarity_score = 1.0 - (dist or 0.0) if dist is not None else None
+
+                # More permissive threshold and include all results for filtering
+                if similarity_score is not None and similarity_score > 0.2:  # Lower threshold
+                    formatted_results.append({
+                        'text': doc,
+                        'metadata': meta or {},
+                        'score': similarity_score
+                    })
+
+            # Sort by score and get top_k results
+            formatted_results.sort(key=lambda x: x['score'] or 0, reverse=True)
+
+            # Check if results are from same document and get consecutive chunks
+            if formatted_results:
+                first_doc_id = formatted_results[0]['metadata'].get('document_id')
+                all_chunks_same_doc = []
+
+                # Get all chunks from the same document
+                for result in formatted_results:
+                    if result['metadata'].get('document_id') == first_doc_id:
+                        all_chunks_same_doc.append(result)
+
+                # Sort chunks by their index to maintain document flow
+                all_chunks_same_doc.sort(
+                    key=lambda x: x['metadata'].get('chunk_index', 0)
+                )
+
+                # Return either all chunks from same document or top_k results
+                if len(all_chunks_same_doc) > 0:
+                    return all_chunks_same_doc[:top_k]
+
+            return formatted_results[:top_k]
 
-            return formatted_results
-
         except Exception as e:
             logging.error(f"Error performing similarity search in ChromaDB: {str(e)}")
             raise
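
Note: the post-filtering above caps candidates at a fixed n_results=10 (so a top_k above 10 is effectively ignored) and, when the best hit's document also supplies other hits, returns that document's chunks in chunk_index order instead of pure score order. A minimal sketch of the same re-ranking on mock results (scores and metadata invented for illustration):

    # Sketch of the threshold + same-document chunk grouping logic above.
    def rerank(results: list, top_k: int = 3, threshold: float = 0.2) -> list:
        kept = [r for r in results if r['score'] is not None and r['score'] > threshold]
        kept.sort(key=lambda r: r['score'] or 0, reverse=True)
        if kept:
            first_doc = kept[0]['metadata'].get('document_id')
            same_doc = [r for r in kept if r['metadata'].get('document_id') == first_doc]
            same_doc.sort(key=lambda r: r['metadata'].get('chunk_index', 0))
            if same_doc:
                return same_doc[:top_k]
        return kept[:top_k]

    mock = [
        {'text': 'chunk B', 'score': 0.9, 'metadata': {'document_id': 'doc1', 'chunk_index': 1}},
        {'text': 'chunk A', 'score': 0.7, 'metadata': {'document_id': 'doc1', 'chunk_index': 0}},
        {'text': 'other',   'score': 0.1, 'metadata': {'document_id': 'doc2', 'chunk_index': 0}},
    ]
    print([r['text'] for r in rerank(mock)])  # ['chunk A', 'chunk B']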
temp_downloads/17he27jN4louYr1xOYASf4BP2e-tGTICt.xlsx ADDED
Binary file (9.81 kB).
 
temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx ADDED
Binary file (30.4 kB).