Commit · acdfaa9
Parent(s): 1a54bda
Log Google Drive documents in MongoDB, add the source of each document, and make chunks overlap text.
- Install your driver.txt +19 -0
- config/__pycache__/config.cpython-312.pyc +0 -0
- config/config.py +2 -0
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/system_instructions_rag.cpython-312.pyc +0 -0
- src/agents/rag_agent.py +66 -32
- src/db/__pycache__/mongodb_store.cpython-312.pyc +0 -0
- src/db/mongodb_store.py +38 -33
- src/implementations/__pycache__/document_service.cpython-312.pyc +0 -0
- src/implementations/document_service.py +41 -33
- src/main.py +4 -2
- src/utils/__pycache__/document_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/drive_document_processor.cpython-312.pyc +0 -0
- src/utils/document_processor.py +26 -140
- src/utils/drive_document_processor.py +125 -46
- src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc +0 -0
- src/vectorstores/chroma_vectorstore.py +59 -46
Install your driver.txt
ADDED
@@ -0,0 +1,19 @@
+2. Install your driver
+Run the following on the command line
+Note: Use appropriate Python 3 executable
+python -m pip install "pymongo[srv]"==3.12
+
+
+View MongoDB Python Driver installation instructions.
+3. Add your connection string into your application code
+Use this connection string in your application
+
+
+View full code sample
+
+
+Show Password
+
+mongodb+srv://talat:[email protected]/?retryWrites=true&w=majority&appName=Chatbot
+
+The password for talat is included in the connection string for your first time setup. This password will not be available again after exiting this connect flow.
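For reference, a minimal sketch of using the driver installed above from application code; the URI below is a placeholder (substitute the Atlas connection string from the connect flow), and the ping check is only an illustration, not part of the committed file.

    from pymongo import MongoClient

    # Placeholder URI: substitute the real Atlas connection string shown above.
    MONGO_URI = "mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority&appName=Chatbot"

    client = MongoClient(MONGO_URI)
    client.admin.command("ping")  # Raises if the URI or credentials are invalid.
    print(client.list_database_names())
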
config/__pycache__/config.cpython-312.pyc
CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
config/config.py
CHANGED
@@ -22,6 +22,8 @@ class Settings:
     # Anthropic Configuration
     ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
 
+    # top number of chunks to retrieve.
+    TOP_CHUNKS = int(os.getenv('TOP_CHUNKS', '10'))
     # Environment Configuration
     ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
 
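As a quick illustration of the new setting, the snippet below reads TOP_CHUNKS the same way Settings does; the override value of 5 is invented for the example.

    import os

    os.environ["TOP_CHUNKS"] = "5"  # e.g. exported in the deployment environment
    TOP_CHUNKS = int(os.getenv('TOP_CHUNKS', '10'))  # same expression as in Settings
    print(TOP_CHUNKS)  # 5; without the override it falls back to 10
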
src/__pycache__/main.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
src/agents/__pycache__/rag_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
src/agents/rag_agent.py
CHANGED
@@ -10,6 +10,7 @@ from src.utils.conversation_manager import ConversationManager
 from src.db.mongodb_store import MongoDBStore
 from src.models.rag import RAGResponse
 from src.utils.logger import logger
+from config.config import settings
 
 
 class RAGAgent(ExcelAwareRAGAgent):
@@ -43,6 +44,50 @@ class RAGAgent(ExcelAwareRAGAgent):
             max_messages=max_history_messages
         )
 
+    def _extract_markdown_section(self, docs: List[str], section_header: str) -> str:
+        """Extract complete section content from markdown documents"""
+        combined_text = '\n'.join(docs)
+
+        section_start = combined_text.find(section_header)
+        if section_start == -1:
+            return ""
+
+        next_section = combined_text.find(
+            "\n\n**", section_start + len(section_header))
+        if next_section == -1:
+            section_content = combined_text[section_start:]
+        else:
+            section_content = combined_text[section_start:next_section]
+
+        return self._clean_markdown_content(section_content)
+
+    def _clean_markdown_content(self, content: str) -> str:
+        """Clean and format markdown content"""
+        lines = content.split('\n')
+        seen_lines = set()
+        cleaned_lines = []
+
+        for line in lines:
+            # Always keep headers and table formatting
+            if '| :----' in line or line.startswith('**'):
+                if line not in seen_lines:
+                    cleaned_lines.append(line)
+                    seen_lines.add(line)
+                continue
+
+            # Keep table rows and list items
+            if line.strip().startswith('|') or line.strip().startswith('-'):
+                cleaned_lines.append(line)
+                continue
+
+            # Remove duplicates for other content
+            stripped = line.strip()
+            if stripped and stripped not in seen_lines:
+                cleaned_lines.append(line)
+                seen_lines.add(stripped)
+
+        return '\n'.join(cleaned_lines)
+
     async def generate_response(
         self,
         query: str,
@@ -51,9 +96,9 @@ class RAGAgent(ExcelAwareRAGAgent):
         max_tokens: Optional[int] = None,
         context_docs: Optional[List[str]] = None
     ) -> RAGResponse:
-        """Generate response with …
+        """Generate response with improved markdown and conversation handling"""
         try:
-            # …
+            # Handle introduction/welcome message queries
             is_introduction = (
                 "wants support" in query and
                 "This is Introduction" in query and
@@ -61,7 +106,6 @@ class RAGAgent(ExcelAwareRAGAgent):
             )
 
             if is_introduction:
-                # Handle introduction message - no context needed
                 welcome_message = self._handle_contact_query(query)
                 return RAGResponse(
                     response=welcome_message,
@@ -77,8 +121,6 @@ class RAGAgent(ExcelAwareRAGAgent):
                 conversation_id,
                 limit=self.conversation_manager.max_messages
             )
-
-            # Get relevant history within token limits
             history = self.conversation_manager.get_relevant_history(
                 messages=history,
                 current_query=query
@@ -94,6 +136,21 @@ class RAGAgent(ExcelAwareRAGAgent):
             sources = None
             scores = None
 
+            # Special handling for markdown section queries
+            if "DISCUSSIONS AND ACTION ITEMS" in query.upper():
+                section_content = self._extract_markdown_section(
+                    context_docs,
+                    "**DISCUSSIONS AND ACTION ITEMS**"
+                )
+
+                if section_content:
+                    return RAGResponse(
+                        response=section_content.strip(),
+                        context_docs=context_docs,
+                        sources=sources,
+                        scores=scores
+                    )
+
             # Check if we have any relevant context
             if not context_docs:
                 return RAGResponse(
@@ -103,15 +160,6 @@ class RAGAgent(ExcelAwareRAGAgent):
                     scores=None
                 )
 
-            # Check if this is an Excel-related query
-            has_excel_content = any('Sheet:' in doc for doc in context_docs)
-            if has_excel_content:
-                try:
-                    context_docs = self._process_excel_context(
-                        context_docs, query)
-                except Exception as e:
-                    logger.warning(f"Error processing Excel context: {str(e)}")
-
             # Generate prompt with context and history
             augmented_prompt = self.conversation_manager.generate_prompt_with_history(
                 current_query=query,
@@ -119,7 +167,7 @@ class RAGAgent(ExcelAwareRAGAgent):
                 context_docs=context_docs
             )
 
-            # Generate …
+            # Generate response
             response = self.llm.generate(
                 prompt=augmented_prompt,
                 temperature=temperature,
@@ -129,19 +177,6 @@ class RAGAgent(ExcelAwareRAGAgent):
             # Clean the response
             cleaned_response = self._clean_response(response)
 
-            # For Excel queries, enhance the response
-            if has_excel_content:
-                try:
-                    enhanced_response = await self.enhance_excel_response(
-                        query=query,
-                        response=cleaned_response,
-                        context_docs=context_docs
-                    )
-                    if enhanced_response:
-                        cleaned_response = enhanced_response
-                except Exception as e:
-                    logger.warning(f"Error enhancing Excel response: {str(e)}")
-
             # Return the final response
             return RAGResponse(
                 response=cleaned_response,
@@ -151,7 +186,7 @@ class RAGAgent(ExcelAwareRAGAgent):
             )
 
         except Exception as e:
-            logger.error(f"Error in …
+            logger.error(f"Error in RAGAgent: {str(e)}")
             raise
 
     def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
@@ -201,8 +236,7 @@ class RAGAgent(ExcelAwareRAGAgent):
     async def retrieve_context(
         self,
         query: str,
-        conversation_history: Optional[List[Dict]] = None
-        top_k: int = 3
+        conversation_history: Optional[List[Dict]] = None
     ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
         """
         Retrieve context with conversation history enhancement
@@ -229,7 +263,7 @@ class RAGAgent(ExcelAwareRAGAgent):
             # Retrieve similar documents
             results = self.vector_store.similarity_search(
                 query_embedding,
-                top_k=…
+                top_k=settings.TOP_CHUNKS
             )
 
             # Debug log search results
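A self-contained sketch of the new section-extraction behavior on toy input; it mirrors _extract_markdown_section from the hunk above but leaves out the _clean_markdown_content de-duplication step, and the sample document is invented.

    from typing import List

    def extract_markdown_section(docs: List[str], section_header: str) -> str:
        """Standalone copy of the section-lookup logic added to RAGAgent."""
        combined_text = '\n'.join(docs)
        section_start = combined_text.find(section_header)
        if section_start == -1:
            return ""
        next_section = combined_text.find("\n\n**", section_start + len(section_header))
        return combined_text[section_start:] if next_section == -1 else combined_text[section_start:next_section]

    docs = [
        "**AGENDA**\n- Budget review\n\n**DISCUSSIONS AND ACTION ITEMS**\n| No | Item |\n| :---- | :---- |\n| 1 | Ship v2 |",
    ]
    print(extract_markdown_section(docs, "**DISCUSSIONS AND ACTION ITEMS**"))
    # Prints the header plus its table, stopping before the next "**" section (none here).
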
src/db/__pycache__/mongodb_store.cpython-312.pyc
CHANGED
Binary files a/src/db/__pycache__/mongodb_store.cpython-312.pyc and b/src/db/__pycache__/mongodb_store.cpython-312.pyc differ
src/db/mongodb_store.py
CHANGED
@@ -4,6 +4,7 @@ from datetime import datetime
 from typing import List, Dict, Optional, Any
 from bson import ObjectId
 
+
 class MongoDBStore:
     def __init__(self, mongo_uri: str = "mongodb://localhost:27017"):
         """Initialize MongoDB connection"""
@@ -20,7 +21,8 @@ class MongoDBStore:
         filename: str,
         content_type: str,
         file_size: int,
-        url_path: str
+        url_path: str,
+        source: str
     ) -> str:
         """Store document metadata in MongoDB"""
         document = {
@@ -29,9 +31,10 @@ class MongoDBStore:
             "content_type": content_type,
             "file_size": file_size,
             "url_path": url_path,
+            "source": source,
             "upload_timestamp": datetime.now()
         }
 
         await self.documents.insert_one(document)
         return document_id
@@ -53,7 +56,8 @@ class MongoDBStore:
                 "content_type": 1,
                 "file_size": 1,
                 "url_path": 1,
-                "upload_timestamp": 1
+                "upload_timestamp": 1,
+                "source": 1
             }
         )
         return await cursor.to_list(length=None)
@@ -200,30 +204,31 @@ class MongoDBStore:
             "rating": None
         }
         result = await self.chat_history.insert_one(assistant_message)
 
         # Update conversation metadata
         await self.conversations.update_one(
             {"conversation_id": conversation_id},
             {
                 "$set": {"last_updated": datetime.now()},
-                …
+                # Increment by 2 since we store both messages
+                "$inc": {"message_count": 2}
             },
             upsert=True
         )
 
         return str(result.inserted_id)
 
     async def get_conversation_history(self, conversation_id: str) -> List[Dict]:
         """Retrieve complete conversation history"""
         cursor = self.chat_history.find(
             {"conversation_id": conversation_id}
         ).sort("timestamp", 1)
 
         history = []
         async for document in cursor:
             document["_id"] = str(document["_id"])
             history.append(document)
 
         return history
@@ -234,14 +239,15 @@ class MongoDBStore:
         """Get most recent messages from conversation"""
         cursor = self.chat_history.find(
             {"conversation_id": conversation_id}
-        …
-        …
+        # Multiply limit by 2 to account for user-assistant pairs
+        ).sort("timestamp", -1).limit(limit * 2)
 
         messages = []
         async for doc in cursor:
             messages.append(self._format_message(doc))
 
         return list(reversed(messages))

(The remaining hunks in src/db/mongodb_store.py only normalize blank lines and trailing whitespace around the other methods.)
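A hedged usage sketch for the updated store_document signature; the document_id parameter name and all example values are assumptions, since the method's leading parameters fall outside the hunks shown above, and a local MongoDB instance is assumed to be running.

    import asyncio
    from src.db.mongodb_store import MongoDBStore  # class shown in the diff above

    async def main():
        store = MongoDBStore("mongodb://localhost:27017")
        doc_id = await store.store_document(
            document_id="drive-file-123",  # assumed parameter; not visible in the hunk
            filename="meeting_notes.docx",
            content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            file_size=24576,
            url_path="/docs/drive-file-123_meeting_notes.docx",
            source="google_drive",  # new field introduced by this commit
        )
        print(doc_id)

    asyncio.run(main())
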
src/implementations/__pycache__/document_service.cpython-312.pyc
CHANGED
Binary files a/src/implementations/__pycache__/document_service.cpython-312.pyc and b/src/implementations/__pycache__/document_service.cpython-312.pyc differ
src/implementations/document_service.py
CHANGED
@@ -13,9 +13,10 @@ from src.models import DocumentResponse, DocumentInfo, BatchUploadResponse
 from src.utils.logger import logger
 from src.db.mongodb_store import MongoDBStore
 
+
 class DocumentService:
     def __init__(
         self,
         doc_processor: DocumentProcessor,
         mongodb: MongoDBStore
     ):
@@ -78,22 +79,23 @@ class DocumentService:
 
                 if not self._is_supported_format(file.filename):
                     failed_files.append(self._create_failed_file_entry(
                         file.filename,
                         "Unsupported file format"
                     ))
                     continue
 
                 document_response = await self._process_single_file(
                     file,
                     vector_store,
                     background_tasks
                 )
                 processed_files.append(document_response)
 
             except Exception as e:
                 logger.error(
                     f"Error processing file {file.filename}: {str(e)}")
                 failed_files.append(self._create_failed_file_entry(
                     file.filename,
                     str(e)
                 ))
@@ -138,7 +140,8 @@ class DocumentService:
                 filename=file.filename,
                 content_type=file.content_type,
                 file_size=os.path.getsize(file_path),
-                url_path=url_path
+                url_path=url_path,
+                source="user_upload"
             )
 
             # Process for vector store in background
@@ -161,21 +164,23 @@ class DocumentService:
                     url_path=url_path
                 )
             )
 
         except Exception as e:
             # Clean up file if it was created
             if file_path.exists():
                 try:
                     file_path.unlink()
                 except Exception as cleanup_error:
                     logger.error(
                         f"Error cleaning up file {file_path}: {str(cleanup_error)}")
 
             # Clean up from MongoDB if document was created
             try:
                 await self.mongodb.delete_document(document_id)
             except Exception as db_cleanup_error:
                 logger.error(
                     f"Error cleaning up MongoDB document {document_id}: {str(db_cleanup_error)}")
 
             logger.error(f"Error processing file {file.filename}: {str(e)}")
             raise
@@ -189,11 +194,12 @@ class DocumentService:
         """Process document content for vector store"""
         try:
             # Generate chunk IDs using document_id
             chunk_ids = [
                 f"{document_id}-chunk-{i}" for i in range(len(chunks))]
 
             # Get embeddings
             embeddings = vector_store.embedding_function(chunks)
 
             # Prepare metadata for each chunk
             metadatas = [{
                 'document_id': document_id,
@@ -209,17 +215,19 @@ class DocumentService:
                 metadatas=metadatas,
                 ids=chunk_ids
             )
 
             logger.info(
                 f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
 
         except Exception as e:
             logger.error(
                 f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
             raise
 
     def _is_supported_format(self, filename: str) -> bool:
         """Check if file format is supported"""
         return any(filename.lower().endswith(ext)
                    for ext in self.doc_processor.supported_formats)

(The remaining hunks in src/implementations/document_service.py only re-wrap long call arguments and logger lines and adjust blank lines.)
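To make the chunk bookkeeping concrete, here is a standalone sketch of how chunk IDs and per-chunk metadata are assembled before the vector-store call; the chunk text is invented and the 'filename' key is an assumption, since one metadata line is cut off in the hunk above.

    document_id = "example-doc-1"
    filename = "meeting_notes.docx"
    chunks = ["first chunk of text", "second chunk of text"]

    # Mirrors the ID scheme used when processing a document for the vector store.
    chunk_ids = [f"{document_id}-chunk-{i}" for i in range(len(chunks))]

    metadatas = [{
        'document_id': document_id,
        'filename': filename,  # assumed key
        'chunk_index': i,
        'total_chunks': len(chunks),
    } for i in range(len(chunks))]

    print(chunk_ids)     # ['example-doc-1-chunk-0', 'example-doc-1-chunk-1']
    print(metadatas[1])
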
src/main.py
CHANGED
@@ -123,7 +123,8 @@ async def get_all_documents():
             "content_type": doc.get("content_type"),
             "file_size": doc.get("file_size"),
             "url_path": doc.get("url_path"),
-            "upload_timestamp": doc.get("upload_timestamp")
+            "upload_timestamp": doc.get("upload_timestamp"),
+            "source": doc.get("source")
         }
         formatted_documents.append(formatted_doc)
     except Exception as e:
@@ -334,7 +335,8 @@ async def process_drive_documents():
         google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
         folder_id=settings.GOOGLE_DRIVE_FOLDER_ID,
         temp_dir=settings.TEMP_DOWNLOAD_DIR,
-        doc_processor=doc_processor
+        doc_processor=doc_processor,
+        mongodb=mongodb  # Add MongoDB instance
     )
 
     # Process documents
src/utils/__pycache__/document_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
src/utils/__pycache__/drive_document_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/drive_document_processor.cpython-312.pyc and b/src/utils/__pycache__/drive_document_processor.cpython-312.pyc differ
src/utils/document_processor.py
CHANGED
@@ -116,124 +116,6 @@ class DocumentProcessor:
             strip_whitespace=False  # Keep whitespace to maintain markdown formatting
         )
 
-    def split_text(self, text: str) -> List[str]:
-        """Split text with enforced overlap while preserving structure"""
-        try:
-            # Get initial split using RecursiveCharacterTextSplitter
-            initial_chunks = self.text_splitter.split_text(text)
-            if len(initial_chunks) <= 1:
-                return initial_chunks
-
-            # Process chunks with enforced overlap
-            final_chunks = []
-
-            for i, current_chunk in enumerate(initial_chunks):
-                if i == 0:
-                    final_chunks.append(current_chunk)
-                    continue
-
-                prev_chunk = final_chunks[-1]
-
-                # Get the last part of previous chunk for overlap
-                overlap_size = min(self.chunk_overlap, len(prev_chunk))
-                overlap_text = prev_chunk[-overlap_size:]
-
-                # For tables, include the header row
-                if '|' in current_chunk and '\n' in current_chunk:
-                    table_lines = current_chunk.split('\n')
-                    header_lines = []
-                    for line in table_lines:
-                        if line.strip().startswith('|'):
-                            header_lines.append(line)
-                        else:
-                            break
-                    if header_lines:
-                        header_text = '\n'.join(header_lines) + '\n'
-                        overlap_text = header_text + overlap_text
-
-                # Create new chunk with overlap
-                new_chunk = overlap_text + current_chunk
-
-                # Ensure we don't have duplicate content at the overlap point
-                if current_chunk.startswith(overlap_text):
-                    new_chunk = current_chunk
-
-                # Add context from previous chunk when needed
-                if not any(marker in new_chunk for marker in ['**AGENDA**', '**DISCUSSIONS**', '| No |']):
-                    context_markers = ['**AGENDA**',
-                                       '**DISCUSSIONS**', '| No |']
-                    for marker in context_markers:
-                        if marker in prev_chunk and marker not in new_chunk:
-                            new_chunk = marker + "\n" + new_chunk
-                            break
-
-                final_chunks.append(new_chunk)
-
-            # Validate and log overlaps
-            for i in range(len(final_chunks)-1):
-                actual_overlap = self._find_actual_overlap(
-                    final_chunks[i], final_chunks[i+1])
-                logging.debug(
-                    f"Overlap between chunks {i} and {i+1}: {len(actual_overlap)} characters")
-                if len(actual_overlap) < self.chunk_overlap:
-                    logging.warning(
-                        f"Insufficient overlap between chunks {i} and {i+1}")
-
-            return final_chunks
-
-            for start, end in table_sections:
-                # Process text before table if exists
-                if start > current_position:
-                    non_table_text = text[current_position:start]
-                    if non_table_text.strip():
-                        text_chunks = self.text_splitter.split_text(
-                            non_table_text)
-                        if chunks and text_chunks:
-                            # Ensure overlap with previous chunk
-                            prev_chunk = chunks[-1]
-                            overlap = self._get_overlap_text(prev_chunk)
-                            text_chunks[0] = overlap + text_chunks[0]
-                        chunks.extend(text_chunks)
-
-                # Process table as a single chunk with overlap
-                table_text = text[start:end]
-                if chunks:
-                    prev_chunk = chunks[-1]
-                    overlap = self._get_overlap_text(prev_chunk)
-                    table_text = overlap + table_text
-                chunks.append(table_text)
-                current_position = end
-
-            # Process remaining text after last table
-            if current_position < len(text):
-                remaining_text = text[current_position:]
-                if remaining_text.strip():
-                    text_chunks = self.text_splitter.split_text(remaining_text)
-                    if chunks and text_chunks:
-                        # Ensure overlap with previous chunk
-                        prev_chunk = chunks[-1]
-                        overlap = self._get_overlap_text(prev_chunk)
-                        text_chunks[0] = overlap + text_chunks[0]
-                    chunks.extend(text_chunks)
-
-            # Validate and adjust overlaps
-            chunks = self._ensure_minimum_overlap(chunks)
-
-            # Log chunk details for debugging
-            for i in range(len(chunks)-1):
-                overlap = self._find_actual_overlap(chunks[i], chunks[i+1])
-                logging.debug(
-                    f"Overlap between chunks {i} and {i+1}: {len(overlap)} characters")
-                logging.debug(f"End of chunk {i}: {chunks[i][-50:]}")
-                logging.debug(f"Start of chunk {i+1}: {chunks[i+1][:50]}")
-
-            return chunks
-
-        except Exception as e:
-            logging.error(f"Error in split_text: {str(e)}")
-            # Fallback to original text splitter
-            return self.text_splitter.split_text(text)
-
     def _find_break_point(self, text: str, prev_chunk: str) -> int:
         """
         Find suitable breaking point that maintains document structure
@@ -630,38 +512,42 @@ class DocumentProcessor:
         """Calculate SHA-256 hash of text"""
         return hashlib.sha256(text.encode()).hexdigest()
 
+    def _process_chunks(self, text: str) -> List[str]:
+        """Process text into chunks with proper overlap"""
+        chunks = self.text_splitter.split_text(text)
+
+        # Ensure minimum chunk size and handle overlaps
+        processed_chunks = []
+        for i, chunk in enumerate(chunks):
+            if i > 0:
+                # Add overlap from previous chunk
+                overlap_start = max(
+                    0, len(processed_chunks[-1]) - self.chunk_overlap)
+                chunk = processed_chunks[-1][overlap_start:] + chunk
+
+            if len(chunk) > self.chunk_size:
+                # Split oversized chunks
+                sub_chunks = self.text_splitter.split_text(chunk)
+                processed_chunks.extend(sub_chunks)
+            else:
+                processed_chunks.append(chunk)
+
+        return processed_chunks
+
+    async def process_document(self, file_path: Union[str, Path]) -> Dict:
+        """Process document with chunk overlapping"""
-        …
-        """Process …
         file_path = Path(file_path)
 
         if not self._validate_file(file_path):
             raise ValueError(f"Invalid file: {file_path}")
 
         content = self._extract_content(file_path)
-
-        # Try enhanced splitting with validation
-        chunks = self.split_text(content)
-        if not self._validate_chunks(content, chunks):
-            logging.warning(
-                "Enhanced splitting failed validation, falling back to original splitter")
-            chunks = self.text_splitter.split_text(content)
-
-        # Add logging to verify chunk overlap
-        for i in range(len(chunks)-1):
-            logging.debug(f"Chunk {i} ends with: {chunks[i][-50:]}")
-            logging.debug(f"Chunk {i+1} starts with: {chunks[i+1][:50]}")
-            logging.debug(
-                f"Overlap size: {self._calculate_overlap_size(chunks[i], chunks[i+1])} characters")
-
-        chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
+        chunks = self._process_chunks(content)
 
         return {
             'content': content,
             'chunks': chunks,
-            '…
-            'metadata': doc_metadata,
-            'statistics': self._generate_statistics(content, chunks)
+            'metadata': self._generate_metadata(file_path, content)
         }
 
     def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
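A standalone sketch of the overlap step introduced by _process_chunks: each chunk is prefixed with the tail of the previous processed chunk. It reproduces only the prefix-overlap logic on toy strings and omits the re-splitting of oversized chunks.

    from typing import List

    def overlap_chunks(chunks: List[str], chunk_overlap: int) -> List[str]:
        """Prefix each chunk with the last chunk_overlap characters of its predecessor."""
        processed: List[str] = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                overlap_start = max(0, len(processed[-1]) - chunk_overlap)
                chunk = processed[-1][overlap_start:] + chunk
            processed.append(chunk)
        return processed

    print(overlap_chunks(["abcdef", "ghijkl", "mnopqr"], chunk_overlap=3))
    # ['abcdef', 'defghijkl', 'jklmnopqr']
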
src/utils/drive_document_processor.py
CHANGED
@@ -8,6 +8,8 @@ from src.utils.google_drive_service import GoogleDriveService
|
|
8 |
from src.utils.document_processor import DocumentProcessor
|
9 |
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
|
10 |
from src.utils.logger import logger
|
|
|
|
|
11 |
|
12 |
class DriveDocumentProcessor:
|
13 |
def __init__(
|
@@ -15,38 +17,41 @@ class DriveDocumentProcessor:
|
|
15 |
google_service_account_path: str,
|
16 |
folder_id: str,
|
17 |
temp_dir: str,
|
18 |
-
doc_processor: DocumentProcessor
|
|
|
19 |
):
|
20 |
"""
|
21 |
Initialize Drive Document Processor
|
22 |
-
|
23 |
Args:
|
24 |
google_service_account_path (str): Path to Google service account credentials
|
25 |
folder_id (str): Google Drive folder ID to process
|
26 |
temp_dir (str): Directory for temporary files
|
27 |
doc_processor (DocumentProcessor): Instance of DocumentProcessor
|
28 |
"""
|
29 |
-
self.google_drive_service = GoogleDriveService(
|
|
|
30 |
self.folder_id = folder_id
|
31 |
self.temp_dir = Path(temp_dir)
|
32 |
self.doc_processor = doc_processor
|
33 |
-
|
|
|
34 |
# Create temp directory if it doesn't exist
|
35 |
self.temp_dir.mkdir(exist_ok=True)
|
36 |
-
|
37 |
# Define supported MIME types
|
38 |
self.supported_mime_types = {
|
39 |
# Google Docs
|
40 |
'application/vnd.google-apps.document': '.docx',
|
41 |
-
|
42 |
# Microsoft Word Documents
|
43 |
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
44 |
'application/msword': '.doc',
|
45 |
-
|
46 |
# Microsoft Excel Documents
|
47 |
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
|
48 |
'application/vnd.ms-excel': '.xls',
|
49 |
-
|
50 |
# Text Documents
|
51 |
'text/plain': '.txt',
|
52 |
'text/csv': '.csv',
|
@@ -55,7 +60,7 @@ class DriveDocumentProcessor:
|
|
55 |
'text/xml': '.xml',
|
56 |
'application/json': '.json',
|
57 |
'application/rtf': '.rtf',
|
58 |
-
|
59 |
# PDF Documents
|
60 |
'application/pdf': '.pdf'
|
61 |
}
|
@@ -64,18 +69,78 @@ class DriveDocumentProcessor:
|
|
64 |
'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
65 |
}
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
async def process_documents(
|
68 |
self,
|
69 |
vector_store: ChromaVectorStore,
|
70 |
-
|
|
|
71 |
) -> Dict[str, Any]:
|
72 |
"""
|
73 |
Process all documents in the specified Drive folder
|
74 |
-
|
75 |
Args:
|
76 |
vector_store (ChromaVectorStore): Vector store instance
|
77 |
include_subfolders (bool): Whether to process documents in subfolders
|
78 |
-
|
79 |
Returns:
|
80 |
Dict[str, Any]: Processing results
|
81 |
"""
|
@@ -85,32 +150,35 @@ class DriveDocumentProcessor:
|
|
85 |
self.folder_id,
|
86 |
include_subfolders=include_subfolders
|
87 |
)
|
88 |
-
|
|
|
|
|
|
|
89 |
processed_files = []
|
90 |
skipped_files = []
|
91 |
errors = []
|
92 |
-
|
93 |
for file in files:
|
94 |
# Skip if it's a folder
|
95 |
if file.get('mimeType') == 'application/vnd.google-apps.folder':
|
96 |
continue
|
97 |
-
|
98 |
# Get file path (including folder structure if available)
|
99 |
file_path = self._get_file_path(file)
|
100 |
file['display_path'] = file_path
|
101 |
-
|
102 |
result = await self._process_single_file(file, vector_store)
|
103 |
-
|
104 |
if result['status'] == 'processed':
|
105 |
processed_files.append(result['data'])
|
106 |
elif result['status'] == 'skipped':
|
107 |
skipped_files.append(result['data'])
|
108 |
else: # status == 'error'
|
109 |
errors.append(result['data'])
|
110 |
-
|
111 |
# Clean up temporary directory if empty
|
112 |
self._cleanup_temp_dir()
|
113 |
-
|
114 |
return {
|
115 |
"status": "completed",
|
116 |
"processed_files": {
|
@@ -126,7 +194,7 @@ class DriveDocumentProcessor:
|
|
126 |
"details": errors
|
127 |
}
|
128 |
}
|
129 |
-
|
130 |
except Exception as e:
|
131 |
logger.error(f"Error processing Drive documents: {str(e)}")
|
132 |
raise HTTPException(
|
@@ -137,20 +205,20 @@ class DriveDocumentProcessor:
|
|
137 |
def _get_file_path(self, file: Dict[str, Any]) -> str:
|
138 |
"""
|
139 |
Get the full path for a file including its folder structure
|
140 |
-
|
141 |
Args:
|
142 |
file (Dict[str, Any]): File metadata
|
143 |
-
|
144 |
Returns:
|
145 |
str: Display path of the file
|
146 |
"""
|
147 |
path_parts = [file['name']]
|
148 |
-
|
149 |
# Add folder path if available (new structure)
|
150 |
if folder_path := file.get('folder_path', []):
|
151 |
for folder in reversed(folder_path):
|
152 |
path_parts.insert(0, folder['name'])
|
153 |
-
|
154 |
return '/'.join(path_parts)
|
155 |
|
156 |
async def _process_single_file(
|
@@ -160,7 +228,7 @@ class DriveDocumentProcessor:
|
|
160 |
) -> Dict[str, Any]:
|
161 |
"""Process a single Drive file"""
|
162 |
mime_type = file.get('mimeType', '')
|
163 |
-
|
164 |
# Skip if mime type not supported
|
165 |
if mime_type not in self.supported_mime_types:
|
166 |
return {
|
@@ -171,11 +239,11 @@ class DriveDocumentProcessor:
|
|
171 |
'reason': f'Unsupported mime type: {mime_type}'
|
172 |
}
|
173 |
}
|
174 |
-
|
175 |
try:
|
176 |
document_id = file['id']
|
177 |
modified_time = file.get('modifiedTime', 'N/A')
|
178 |
-
|
179 |
# Check if document should be processed
|
180 |
if self.save_document(document_id, vector_store, modified_time):
|
181 |
# Download and process file
|
@@ -183,13 +251,13 @@ class DriveDocumentProcessor:
|
|
183 |
file['id'],
|
184 |
mime_type
|
185 |
)
|
186 |
-
|
187 |
try:
|
188 |
# Process document
|
189 |
processed_doc = await self.doc_processor.process_document(
|
190 |
str(temp_file_path)
|
191 |
)
|
192 |
-
|
193 |
# Add to vector store with path information
|
194 |
self._add_to_vector_store(
|
195 |
processed_doc['chunks'],
|
@@ -197,7 +265,17 @@ class DriveDocumentProcessor:
|
|
197 |
mime_type,
|
198 |
vector_store
|
199 |
)
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
return {
|
202 |
'status': 'processed',
|
203 |
'data': {
|
@@ -207,7 +285,7 @@ class DriveDocumentProcessor:
|
|
207 |
'chunks_processed': len(processed_doc['chunks'])
|
208 |
}
|
209 |
}
|
210 |
-
|
211 |
finally:
|
212 |
# Clean up temporary file
|
213 |
if temp_file_path.exists():
|
@@ -221,7 +299,7 @@ class DriveDocumentProcessor:
|
|
221 |
'reason': 'Document already exists in the memory.'
|
222 |
}
|
223 |
}
|
224 |
-
|
225 |
except Exception as e:
|
226 |
logger.error(f"Error processing file {file['name']}: {str(e)}")
|
227 |
return {
|
@@ -243,7 +321,7 @@ class DriveDocumentProcessor:
|
|
243 |
"""Add processed chunks to vector store with path information"""
|
244 |
chunk_metadatas = []
|
245 |
chunk_ids = []
|
246 |
-
|
247 |
modified_time = file.get('modifiedTime', 'N/A')
|
248 |
file_path = file.get('display_path', file['name'])
|
249 |
|
@@ -260,7 +338,7 @@ class DriveDocumentProcessor:
|
|
260 |
"file_type": self.supported_mime_types[mime_type],
|
261 |
"is_google_doc": mime_type.startswith('application/vnd.google-apps')
|
262 |
})
|
263 |
-
|
264 |
vector_store.add_documents(
|
265 |
documents=chunks,
|
266 |
metadatas=chunk_metadatas,
|
@@ -275,7 +353,7 @@ class DriveDocumentProcessor:
|
|
275 |
"""Download and save file to temporary location"""
|
276 |
extension = self.supported_mime_types[mime_type]
|
277 |
temp_file_path = self.temp_dir / f"{file_id}{extension}"
|
278 |
-
|
279 |
if mime_type in self.google_docs_export_types:
|
280 |
# Download Google Doc in the specified export format
|
281 |
content = self.google_drive_service.export_file(
|
@@ -285,13 +363,13 @@ class DriveDocumentProcessor:
|
|
285 |
else:
|
286 |
# Download regular file
|
287 |
content = self.google_drive_service.download_file(file_id)
|
288 |
-
|
289 |
with open(temp_file_path, 'wb') as f:
|
290 |
if isinstance(content, str):
|
291 |
f.write(content.encode('utf-8'))
|
292 |
else:
|
293 |
f.write(content)
|
294 |
-
|
295 |
return temp_file_path
|
296 |
|
297 |
def save_document(
|
@@ -302,35 +380,36 @@ class DriveDocumentProcessor:
|
|
302 |
) -> bool:
|
303 |
"""
|
304 |
Check if document needs to be processed based on modification date
|
305 |
-
|
306 |
Args:
|
307 |
document_id (str): ID of the document to check
|
308 |
vector_store (ChromaVectorStore): Vector store instance
|
309 |
modified_date (str): Modified date to compare against
|
310 |
-
|
311 |
Returns:
|
312 |
bool: True if document should be processed, False otherwise
|
313 |
"""
|
314 |
try:
|
315 |
# Retrieve all chunks for the given document_id
|
316 |
chunks = vector_store.get_document_chunks(document_id)
|
317 |
-
|
318 |
if not chunks:
|
319 |
# Document doesn't exist in vector store
|
320 |
return True
|
321 |
-
|
322 |
# Check the modified_time of the first chunk
|
323 |
first_chunk_metadata = chunks[0].get("metadata", {})
|
324 |
-
|
325 |
if first_chunk_metadata.get("modified_time") != modified_date:
|
326 |
# If modified_time doesn't match, delete existing chunks
|
327 |
vector_store.delete_document(document_id)
|
328 |
-
logger.info(
|
|
|
329 |
return True
|
330 |
-
|
331 |
logger.info(f"Document {document_id} is up to date, skipping")
|
332 |
return False
|
333 |
-
|
334 |
except Exception as e:
|
335 |
logger.error(f"Error checking document status: {str(e)}")
|
336 |
# In case of error, process the document to be safe
|
@@ -343,4 +422,4 @@ class DriveDocumentProcessor:
|
|
343 |
self.temp_dir.rmdir()
|
344 |
except Exception as e:
|
345 |
logger.error(f"Error cleaning up temp directory: {str(e)}")
|
346 |
-
# Don't raise the error as this is a cleanup operation
|
|
|
(updated src/utils/drive_document_processor.py; lines marked + were added)

  8   from src.utils.document_processor import DocumentProcessor
  9   from src.vectorstores.chroma_vectorstore import ChromaVectorStore
 10   from src.utils.logger import logger
 11 + from src.db.mongodb_store import MongoDBStore
 12 +
 13
 14   class DriveDocumentProcessor:
 15       def __init__(
 17           google_service_account_path: str,
 18           folder_id: str,
 19           temp_dir: str,
 20 +         doc_processor: DocumentProcessor,
 21 +         mongodb: MongoDBStore  # Add MongoDB
 22       ):
 23           """
 24           Initialize Drive Document Processor
 25 +
 26           Args:
 27               google_service_account_path (str): Path to Google service account credentials
 28               folder_id (str): Google Drive folder ID to process
 29               temp_dir (str): Directory for temporary files
 30               doc_processor (DocumentProcessor): Instance of DocumentProcessor
 31           """
 32 +         self.google_drive_service = GoogleDriveService(
 33 +             google_service_account_path)
 34           self.folder_id = folder_id
 35           self.temp_dir = Path(temp_dir)
 36           self.doc_processor = doc_processor
 37 +         self.mongodb = mongodb  # Store MongoDB instance
 38 +
 39           # Create temp directory if it doesn't exist
 40           self.temp_dir.mkdir(exist_ok=True)
 41 +
 42           # Define supported MIME types
 43           self.supported_mime_types = {
 44               # Google Docs
 45               'application/vnd.google-apps.document': '.docx',
 46 +
 47               # Microsoft Word Documents
 48               'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
 49               'application/msword': '.doc',
 50 +
 51               # Microsoft Excel Documents
 52               'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
 53               'application/vnd.ms-excel': '.xls',
 54 +
 55               # Text Documents
 56               'text/plain': '.txt',
 57               'text/csv': '.csv',
 60               'text/xml': '.xml',
 61               'application/json': '.json',
 62               'application/rtf': '.rtf',
 63 +
 64               # PDF Documents
 65               'application/pdf': '.pdf'
 66           }
 69               'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
 70           }
 71
 72 +     async def _cleanup_orphaned_documents(
 73 +         self,
 74 +         drive_files: List[Dict[str, Any]],
 75 +         vector_store: ChromaVectorStore
 76 +     ) -> Dict[str, Any]:
 77 +         """
 78 +         Clean up documents that exist in MongoDB but not in Google Drive
 79 +
 80 +         Args:
 81 +             drive_files (List[Dict[str, Any]]): List of files from Google Drive
 82 +             vector_store (ChromaVectorStore): Vector store instance
 83 +
 84 +         Returns:
 85 +             Dict[str, Any]: Cleanup statistics
 86 +         """
 87 +         try:
 88 +             # Get all documents from MongoDB
 89 +             mongo_docs = await self.mongodb.get_all_documents()
 90 +
 91 +             # Create set of Google Drive file IDs
 92 +             drive_file_ids = {file['id'] for file in drive_files}
 93 +
 94 +             deleted_count = 0
 95 +             failed_deletions = []
 96 +
 97 +             # Check each MongoDB document
 98 +             for doc in mongo_docs:
 99 +                 # Only process Google Drive documents
100 +                 if doc.get('source') != 'google_drive':
101 +                     continue
102 +
103 +                 doc_id = doc.get('document_id')
104 +                 if not doc_id or doc_id not in drive_file_ids:
105 +                     try:
106 +                         # Delete from MongoDB
107 +                         await self.mongodb.delete_document(doc_id)
108 +
109 +                         # Delete from vector store
110 +                         vector_store.delete_document(doc_id)
111 +
112 +                         deleted_count += 1
113 +
114 +                     except Exception as e:
115 +                         logger.error(
116 +                             f"Error deleting orphaned document {doc_id}: {str(e)}")
117 +                         failed_deletions.append({
118 +                             'document_id': doc_id,
119 +                             'error': str(e)
120 +                         })
121 +
122 +             return {
123 +                 'orphaned_documents_deleted': deleted_count,
124 +                 'failed_deletions': failed_deletions
125 +             }
126 +
127 +         except Exception as e:
128 +             logger.error(f"Error in cleanup_orphaned_documents: {str(e)}")
129 +             raise
130 +
131       async def process_documents(
132           self,
133           vector_store: ChromaVectorStore,
134 +         # New parameter with default True for backward compatibility
135 +         include_subfolders: bool = True
136       ) -> Dict[str, Any]:
137           """
138           Process all documents in the specified Drive folder
139 +
140           Args:
141               vector_store (ChromaVectorStore): Vector store instance
142               include_subfolders (bool): Whether to process documents in subfolders
143 +
144           Returns:
145               Dict[str, Any]: Processing results
146           """
150               self.folder_id,
151               include_subfolders=include_subfolders
152           )
153 +
154 +         # Clean up orphaned documents first
155 +         cleanup_results = await self._cleanup_orphaned_documents(files, vector_store)
156 +
157           processed_files = []
158           skipped_files = []
159           errors = []
160 +
161           for file in files:
162               # Skip if it's a folder
163               if file.get('mimeType') == 'application/vnd.google-apps.folder':
164                   continue
165 +
166               # Get file path (including folder structure if available)
167               file_path = self._get_file_path(file)
168               file['display_path'] = file_path
169 +
170               result = await self._process_single_file(file, vector_store)
171 +
172               if result['status'] == 'processed':
173                   processed_files.append(result['data'])
174               elif result['status'] == 'skipped':
175                   skipped_files.append(result['data'])
176               else:  # status == 'error'
177                   errors.append(result['data'])
178 +
179           # Clean up temporary directory if empty
180           self._cleanup_temp_dir()
181 +
182           return {
183               "status": "completed",
184               "processed_files": {
194                   "details": errors
195               }
196           }
197 +
198       except Exception as e:
199           logger.error(f"Error processing Drive documents: {str(e)}")
200           raise HTTPException(
205       def _get_file_path(self, file: Dict[str, Any]) -> str:
206           """
207           Get the full path for a file including its folder structure
208 +
209           Args:
210               file (Dict[str, Any]): File metadata
211 +
212           Returns:
213               str: Display path of the file
214           """
215           path_parts = [file['name']]
216 +
217           # Add folder path if available (new structure)
218           if folder_path := file.get('folder_path', []):
219               for folder in reversed(folder_path):
220                   path_parts.insert(0, folder['name'])
221 +
222           return '/'.join(path_parts)
223
224       async def _process_single_file(
228       ) -> Dict[str, Any]:
229           """Process a single Drive file"""
230           mime_type = file.get('mimeType', '')
231 +
232           # Skip if mime type not supported
233           if mime_type not in self.supported_mime_types:
234               return {
239                       'reason': f'Unsupported mime type: {mime_type}'
240                   }
241               }
242 +
243           try:
244               document_id = file['id']
245               modified_time = file.get('modifiedTime', 'N/A')
246 +
247               # Check if document should be processed
248               if self.save_document(document_id, vector_store, modified_time):
249                   # Download and process file
251                       file['id'],
252                       mime_type
253                   )
254 +
255                   try:
256                       # Process document
257                       processed_doc = await self.doc_processor.process_document(
258                           str(temp_file_path)
259                       )
260 +
261                       # Add to vector store with path information
262                       self._add_to_vector_store(
263                           processed_doc['chunks'],
265                           mime_type,
266                           vector_store
267                       )
268 +
269 +                     # Add MongoDB storage - Store Google Drive URL
270 +                     await self.mongodb.store_document(
271 +                         document_id=document_id,
272 +                         filename=file['name'],
273 +                         content_type=mime_type,
274 +                         file_size=0,  # Not needed for drive documents
275 +                         url_path=f"https://drive.google.com/file/d/{document_id}/view",
276 +                         source="google_drive"
277 +                     )
278 +
279                       return {
280                           'status': 'processed',
281                           'data': {
285                               'chunks_processed': len(processed_doc['chunks'])
286                           }
287                       }
288 +
289                   finally:
290                       # Clean up temporary file
291                       if temp_file_path.exists():
299                       'reason': 'Document already exists in the memory.'
300                   }
301               }
302 +
303           except Exception as e:
304               logger.error(f"Error processing file {file['name']}: {str(e)}")
305               return {
321           """Add processed chunks to vector store with path information"""
322           chunk_metadatas = []
323           chunk_ids = []
324 +
325           modified_time = file.get('modifiedTime', 'N/A')
326           file_path = file.get('display_path', file['name'])
327
338                   "file_type": self.supported_mime_types[mime_type],
339                   "is_google_doc": mime_type.startswith('application/vnd.google-apps')
340               })
341 +
342           vector_store.add_documents(
343               documents=chunks,
344               metadatas=chunk_metadatas,
353           """Download and save file to temporary location"""
354           extension = self.supported_mime_types[mime_type]
355           temp_file_path = self.temp_dir / f"{file_id}{extension}"
356 +
357           if mime_type in self.google_docs_export_types:
358               # Download Google Doc in the specified export format
359               content = self.google_drive_service.export_file(
363           else:
364               # Download regular file
365               content = self.google_drive_service.download_file(file_id)
366 +
367           with open(temp_file_path, 'wb') as f:
368               if isinstance(content, str):
369                   f.write(content.encode('utf-8'))
370               else:
371                   f.write(content)
372 +
373           return temp_file_path
374
375       def save_document(
380       ) -> bool:
381           """
382           Check if document needs to be processed based on modification date
383 +
384           Args:
385               document_id (str): ID of the document to check
386               vector_store (ChromaVectorStore): Vector store instance
387               modified_date (str): Modified date to compare against
388 +
389           Returns:
390               bool: True if document should be processed, False otherwise
391           """
392           try:
393               # Retrieve all chunks for the given document_id
394               chunks = vector_store.get_document_chunks(document_id)
395 +
396           if not chunks:
397               # Document doesn't exist in vector store
398               return True
399 +
400           # Check the modified_time of the first chunk
401           first_chunk_metadata = chunks[0].get("metadata", {})
402 +
403           if first_chunk_metadata.get("modified_time") != modified_date:
404               # If modified_time doesn't match, delete existing chunks
405               vector_store.delete_document(document_id)
406 +             logger.info(
407 +                 f"Document {document_id} has been modified, will reprocess")
408               return True
409 +
410           logger.info(f"Document {document_id} is up to date, skipping")
411           return False
412 +
413       except Exception as e:
414           logger.error(f"Error checking document status: {str(e)}")
415           # In case of error, process the document to be safe
422               self.temp_dir.rmdir()
423           except Exception as e:
424               logger.error(f"Error cleaning up temp directory: {str(e)}")
425 +             # Don't raise the error as this is a cleanup operation
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc
CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
|
src/vectorstores/chroma_vectorstore.py
CHANGED
@@ -97,70 +97,83 @@ class ChromaVectorStore(BaseVectorStore):
(previous version; lines marked - were removed or changed)

 97         top_k: int = 3,
 98         **kwargs
 99     ) -> List[Dict[str, Any]]:
100 -       """
101 -       Perform similarity search with improved matching
102 -       """
103         try:
104 -           #
105             results = self.collection.query(
106                 query_embeddings=[query_embedding],
107 -               n_results=
108                 include=['documents', 'metadatas', 'distances']
109             )
110
111 -           if not results or 'documents' not in results:
112 -               logging.warning("No results found in similarity search")
113                 return []
114
115             formatted_results = []
116 -           documents = results['documents'][0]
117 -           metadatas = results['metadatas'][0]
118 -
119 -           distances = results['distances'][0] if results.get('distances') else [
120 -               None] * len(documents)
121
122 -           #
123             for doc, meta, dist in zip(documents, metadatas, distances):
124 -
125 -
126 -
127
128 -               # More permissive threshold and include all results for filtering
129 -               if similarity_score is not None and similarity_score > 0.2:  # Lower threshold
130                     formatted_results.append({
131 -                       'text':
132 -                       'metadata':
133 -                       'score':
134                     })
135
136 -           # Sort by score and
137 -           formatted_results.sort(key=lambda x: x['score']
138 -
139 -           # Check if results are from same document and get consecutive chunks
140 -           if formatted_results:
141 -               first_doc_id = formatted_results[0]['metadata'].get(
142 -                   'document_id')
143 -               all_chunks_same_doc = []
144 -
145 -               # Get all chunks from the same document
146 -               for result in formatted_results:
147 -                   if result['metadata'].get('document_id') == first_doc_id:
148 -                       all_chunks_same_doc.append(result)
149 -
150 -               # Sort chunks by their index to maintain document flow
151 -               all_chunks_same_doc.sort(
152 -                   key=lambda x: x['metadata'].get('chunk_index', 0)
153 -               )
154 -
155 -               # Return either all chunks from same document or top_k results
156 -               if len(all_chunks_same_doc) > 0:
157 -                   return all_chunks_same_doc[:top_k]
158 -
159             return formatted_results[:top_k]
160
161         except Exception as e:
162 -           logging.error(
163 -               f"Error performing similarity search in ChromaDB: {str(e)}")
164             raise
165
166     def get_all_documents(

(updated version; lines marked + were added)

 97         top_k: int = 3,
 98         **kwargs
 99     ) -> List[Dict[str, Any]]:
100 +       """Perform similarity search with improved chunk handling"""
101         try:
102 +           # Get more initial results to account for sequential chunks
103             results = self.collection.query(
104                 query_embeddings=[query_embedding],
105 +               n_results=max(top_k * 2, 10),
106                 include=['documents', 'metadatas', 'distances']
107             )
108
109 +           if not results or 'documents' not in results:
110                 return []
111
112             formatted_results = []
113 +           documents = results['documents'][0]
114 +           metadatas = results['metadatas'][0]
115 +           distances = results['distances'][0]
116
117 +           # Group chunks by document_id
118 +           doc_chunks = {}
119             for doc, meta, dist in zip(documents, metadatas, distances):
120 +               doc_id = meta.get('document_id')
121 +               chunk_index = meta.get('chunk_index', 0)
122 +
123 +               if doc_id not in doc_chunks:
124 +                   doc_chunks[doc_id] = []
125 +
126 +               doc_chunks[doc_id].append({
127 +                   'text': doc,
128 +                   'metadata': meta,
129 +                   'score': 1.0 - dist,
130 +                   'chunk_index': chunk_index
131 +               })
132 +
133 +           # Process each document's chunks
134 +           for doc_id, chunks in doc_chunks.items():
135 +               # Sort chunks by index
136 +               chunks.sort(key=lambda x: x['chunk_index'])
137 +
138 +               # Find sequences of chunks with good scores
139 +               good_sequences = []
140 +               current_sequence = []
141 +
142 +               for chunk in chunks:
143 +                   if chunk['score'] > 0.3:  # Adjust threshold as needed
144 +                       if not current_sequence or \
145 +                          chunk['chunk_index'] == current_sequence[-1]['chunk_index'] + 1:
146 +                           current_sequence.append(chunk)
147 +                       else:
148 +                           if current_sequence:
149 +                               good_sequences.append(current_sequence)
150 +                           current_sequence = [chunk]
151 +                   else:
152 +                       if current_sequence:
153 +                           good_sequences.append(current_sequence)
154 +                       current_sequence = []
155 +
156 +               if current_sequence:
157 +                   good_sequences.append(current_sequence)
158 +
159 +               # Add best sequences to results
160 +               for sequence in good_sequences:
161 +                   avg_score = sum(c['score']
162 +                                   for c in sequence) / len(sequence)
163 +                   combined_text = ' '.join(c['text'] for c in sequence)
164
165                     formatted_results.append({
166 +                       'text': combined_text,
167 +                       'metadata': sequence[0]['metadata'],
168 +                       'score': avg_score
169                     })
170
171 +           # Sort by score and return top_k
172 +           formatted_results.sort(key=lambda x: x['score'], reverse=True)
173             return formatted_results[:top_k]
174
175         except Exception as e:
176 +           logging.error(f"Error in similarity search: {str(e)}")
177             raise
178
179     def get_all_documents(
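
The heart of the new similarity_search is the sequence-merging step: hits are grouped by document_id, ordered by chunk_index, and runs of consecutive chunks scoring above the 0.3 threshold are combined into a single result whose score is the run's average. Below is a standalone sketch of that merge step for a single document's hits, using the same field names; the sample data is made up for illustration.

# Standalone sketch of the consecutive-chunk merge step; not the class method itself.
from typing import Any, Dict, List


def merge_consecutive_chunks(
    chunks: List[Dict[str, Any]], threshold: float = 0.3
) -> List[Dict[str, Any]]:
    """Merge runs of consecutive, high-scoring chunks into single results."""
    chunks = sorted(chunks, key=lambda c: c['chunk_index'])
    sequences, current = [], []

    for chunk in chunks:
        if chunk['score'] > threshold:
            # Extend the run only if this chunk directly follows the previous one
            if not current or chunk['chunk_index'] == current[-1]['chunk_index'] + 1:
                current.append(chunk)
            else:
                sequences.append(current)
                current = [chunk]
        else:
            if current:
                sequences.append(current)
            current = []
    if current:
        sequences.append(current)

    return [{
        'text': ' '.join(c['text'] for c in seq),
        'metadata': seq[0]['metadata'],
        'score': sum(c['score'] for c in seq) / len(seq),
    } for seq in sequences]


# Chunks 4 and 5 are adjacent and above threshold, so they merge into one result;
# the low-scoring chunk 9 is dropped.
hits = [
    {'text': 'first part of a passage', 'metadata': {'document_id': 'doc1'}, 'score': 0.82, 'chunk_index': 4},
    {'text': 'second part of the same passage', 'metadata': {'document_id': 'doc1'}, 'score': 0.74, 'chunk_index': 5},
    {'text': 'unrelated text', 'metadata': {'document_id': 'doc1'}, 'score': 0.10, 'chunk_index': 9},
]
print(merge_consecutive_chunks(hits))

Merging adjacent chunks this way returns longer, more coherent passages than ranking isolated chunks in a fixed window.
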
|