# src/utils/drive_document_processor.py
from pathlib import Path
from typing import Dict, List, Any

from fastapi import HTTPException

from src.utils.google_drive_service import GoogleDriveService
from src.utils.document_processor import DocumentProcessor
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.logger import logger
from src.db.mongodb_store import MongoDBStore


class DriveDocumentProcessor:
    def __init__(
        self,
        google_service_account_path: str,
        folder_id: str,
        temp_dir: str,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore
    ):
        """
        Initialize the Drive Document Processor.

        Args:
            google_service_account_path (str): Path to Google service account credentials
            folder_id (str): Google Drive folder ID to process
            temp_dir (str): Directory for temporary files
            doc_processor (DocumentProcessor): Instance of DocumentProcessor
            mongodb (MongoDBStore): Store used to track processed documents
        """
        self.google_drive_service = GoogleDriveService(
            google_service_account_path)
        self.folder_id = folder_id
        self.temp_dir = Path(temp_dir)
        self.doc_processor = doc_processor
        self.mongodb = mongodb

        # Create the temp directory if it doesn't exist
        self.temp_dir.mkdir(exist_ok=True)

        # Map supported MIME types to file extensions
        self.supported_mime_types = {
            # Google Docs
            'application/vnd.google-apps.document': '.docx',

            # Microsoft Word documents
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',

            # Microsoft Excel documents
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',

            # Text documents
            'text/plain': '.txt',
            'text/csv': '.csv',
            'text/markdown': '.md',
            'text/html': '.html',
            'text/xml': '.xml',
            'application/json': '.json',
            'application/rtf': '.rtf',

            # PDF documents
            'application/pdf': '.pdf'
        }

        # Export formats for native Google Docs files
        self.google_docs_export_types = {
            'application/vnd.google-apps.document':
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        }

    async def _cleanup_orphaned_documents(
        self,
        drive_files: List[Dict[str, Any]],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """
        Clean up documents that exist in MongoDB but not in Google Drive.

        Args:
            drive_files (List[Dict[str, Any]]): List of files from Google Drive
            vector_store (ChromaVectorStore): Vector store instance

        Returns:
            Dict[str, Any]: Cleanup statistics
        """
        try:
            # Get all documents from MongoDB
            mongo_docs = await self.mongodb.get_all_documents()

            # Build a set of Drive file IDs for fast membership checks
            drive_file_ids = {file['id'] for file in drive_files}

            deleted_count = 0
            failed_deletions = []

            for doc in mongo_docs:
                # Only consider documents that came from Google Drive
                if doc.get('source') != 'google_drive':
                    continue

                doc_id = doc.get('document_id')
                # Skip records without an ID; they cannot be deleted by ID
                if not doc_id:
                    continue

                if doc_id not in drive_file_ids:
                    try:
                        # Delete from MongoDB, then from the vector store
                        await self.mongodb.delete_document(doc_id)
                        vector_store.delete_document(doc_id)
                        deleted_count += 1
                    except Exception as e:
                        logger.error(
                            f"Error deleting orphaned document {doc_id}: {str(e)}")
                        failed_deletions.append({
                            'document_id': doc_id,
                            'error': str(e)
                        })

            return {
                'orphaned_documents_deleted': deleted_count,
                'failed_deletions': failed_deletions
            }

        except Exception as e:
            logger.error(f"Error in _cleanup_orphaned_documents: {str(e)}")
            raise
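
    # Example cleanup result returned by _cleanup_orphaned_documents
    # (illustrative values only):
    # {
    #     'orphaned_documents_deleted': 2,
    #     'failed_deletions': [
    #         {'document_id': '1AbC...', 'error': 'connection timeout'}
    #     ]
    # }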

    async def process_documents(
        self,
        vector_store: ChromaVectorStore,
        include_subfolders: bool = True  # defaults to True for backward compatibility
    ) -> Dict[str, Any]:
        """
        Process all documents in the specified Drive folder.

        Args:
            vector_store (ChromaVectorStore): Vector store instance
            include_subfolders (bool): Whether to process documents in subfolders

        Returns:
            Dict[str, Any]: Processing results
        """
        try:
            # Get documents from the folder
            files = self.google_drive_service.get_folder_contents(
                self.folder_id,
                include_subfolders=include_subfolders
            )

            # Clean up orphaned documents first
            cleanup_results = await self._cleanup_orphaned_documents(
                files, vector_store)

            processed_files = []
            skipped_files = []
            errors = []

            for file in files:
                # Skip folders
                if file.get('mimeType') == 'application/vnd.google-apps.folder':
                    continue

                # Attach the display path (including folder structure if available)
                file['display_path'] = self._get_file_path(file)

                result = await self._process_single_file(file, vector_store)

                if result['status'] == 'processed':
                    processed_files.append(result['data'])
                elif result['status'] == 'skipped':
                    skipped_files.append(result['data'])
                else:  # status == 'error'
                    errors.append(result['data'])

            # Clean up the temporary directory if it is empty
            self._cleanup_temp_dir()

            return {
                "status": "completed",
                "cleanup": cleanup_results,  # surface orphan-cleanup statistics
                "processed_files": {
                    "count": len(processed_files),
                    "details": processed_files
                },
                "skipped_files": {
                    "count": len(skipped_files),
                    "details": skipped_files
                },
                "errors": {
                    "count": len(errors),
                    "details": errors
                }
            }

        except Exception as e:
            logger.error(f"Error processing Drive documents: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to process drive documents: {str(e)}"
            )

    def _get_file_path(self, file: Dict[str, Any]) -> str:
        """
        Get the full display path for a file, including its folder structure.

        Args:
            file (Dict[str, Any]): File metadata

        Returns:
            str: Display path of the file
        """
        path_parts = [file['name']]

        # Prepend folder names when folder_path metadata is available
        if folder_path := file.get('folder_path', []):
            for folder in reversed(folder_path):
                path_parts.insert(0, folder['name'])

        return '/'.join(path_parts)

    async def _process_single_file(
        self,
        file: Dict[str, Any],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """Process a single Drive file."""
        mime_type = file.get('mimeType', '')

        # Skip unsupported MIME types
        if mime_type not in self.supported_mime_types:
            return {
                'status': 'skipped',
                'data': {
                    'name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'reason': f'Unsupported mime type: {mime_type}'
                }
            }

        try:
            document_id = file['id']
            modified_time = file.get('modifiedTime', 'N/A')

            # Only process documents that are new or have changed
            if self._should_process_document(document_id, vector_store, modified_time):
                # Download the file to a temporary location
                temp_file_path = await self._download_and_save_file(
                    file['id'],
                    mime_type
                )

                try:
                    # Split the document into chunks
                    processed_doc = await self.doc_processor.process_document(
                        str(temp_file_path)
                    )

                    # Add the chunks to the vector store with path information
                    self._add_to_vector_store(
                        processed_doc['chunks'],
                        file,
                        mime_type,
                        vector_store
                    )

                    # Record the document in MongoDB with its Google Drive URL
                    await self.mongodb.store_document(
                        document_id=document_id,
                        filename=file['name'],
                        content_type=mime_type,
                        file_size=0,  # size is not tracked for Drive documents
                        url_path=f"https://drive.google.com/file/d/{document_id}/view",
                        source="google_drive"
                    )

                    return {
                        'status': 'processed',
                        'data': {
                            'name': file['name'],
                            'path': file.get('display_path', file['name']),
                            'id': file['id'],
                            'chunks_processed': len(processed_doc['chunks'])
                        }
                    }
                finally:
                    # Clean up the temporary file
                    if temp_file_path.exists():
                        temp_file_path.unlink()
            else:
                return {
                    'status': 'skipped',
                    'data': {
                        'name': file['name'],
                        'path': file.get('display_path', file['name']),
                        'reason': 'Document already exists and is up to date.'
                    }
                }

        except Exception as e:
            logger.error(f"Error processing file {file['name']}: {str(e)}")
            return {
                'status': 'error',
                'data': {
                    'name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'error': str(e)
                }
            }
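
    # Illustrative metadata attached to a single chunk by _add_to_vector_store
    # below (values are examples, not real data):
    # {
    #     "source": "Reports/Q1/summary.pdf",
    #     "document_id": "1AbC...",
    #     "chunk_index": 0,
    #     "mime_type": "application/pdf",
    #     "modified_time": "2024-01-01T00:00:00.000Z",
    #     "total_chunks": 12,
    #     "file_type": ".pdf",
    #     "is_google_doc": False
    # }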

    def _add_to_vector_store(
        self,
        chunks: List[str],
        file: Dict[str, Any],
        mime_type: str,
        vector_store: ChromaVectorStore
    ) -> None:
        """Add processed chunks to the vector store with path information."""
        chunk_metadatas = []
        chunk_ids = []

        modified_time = file.get('modifiedTime', 'N/A')
        file_path = file.get('display_path', file['name'])

        for i, chunk in enumerate(chunks):
            chunk_id = f"{file['id']}-chunk-{i}"
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                "source": file_path,  # full path rather than just the file name
                "document_id": file['id'],
                "chunk_index": i,
                "mime_type": mime_type,
                "modified_time": modified_time,
                "total_chunks": len(chunks),
                "file_type": self.supported_mime_types[mime_type],
                "is_google_doc": mime_type.startswith('application/vnd.google-apps')
            })

        vector_store.add_documents(
            documents=chunks,
            metadatas=chunk_metadatas,
            ids=chunk_ids
        )

    async def _download_and_save_file(
        self,
        file_id: str,
        mime_type: str
    ) -> Path:
        """Download a file and save it to a temporary location."""
        extension = self.supported_mime_types[mime_type]
        temp_file_path = self.temp_dir / f"{file_id}{extension}"

        if mime_type in self.google_docs_export_types:
            # Export native Google Docs files in the configured format
            content = self.google_drive_service.export_file(
                file_id,
                self.google_docs_export_types[mime_type]
            )
        else:
            # Download regular files directly
            content = self.google_drive_service.download_file(file_id)

        with open(temp_file_path, 'wb') as f:
            if isinstance(content, str):
                f.write(content.encode('utf-8'))
            else:
                f.write(content)

        return temp_file_path

    def _should_process_document(
        self,
        document_id: str,
        vector_store: ChromaVectorStore,
        modified_date: str
    ) -> bool:
        """
        Check whether a document needs processing based on its modification date.

        Args:
            document_id (str): ID of the document to check
            vector_store (ChromaVectorStore): Vector store instance
            modified_date (str): Modified date to compare against

        Returns:
            bool: True if the document should be processed, False otherwise
        """
        try:
            # Retrieve all stored chunks for the given document_id
            chunks = vector_store.get_document_chunks(document_id)

            if not chunks:
                # Document doesn't exist in the vector store yet
                return True

            # Compare against the modified_time stored on the first chunk
            first_chunk_metadata = chunks[0].get("metadata", {})
            if first_chunk_metadata.get("modified_time") != modified_date:
                # The file changed in Drive; delete stale chunks and reprocess
                vector_store.delete_document(document_id)
                logger.info(
                    f"Document {document_id} has been modified, will reprocess")
                return True

            logger.info(f"Document {document_id} is up to date, skipping")
            return False

        except Exception as e:
            logger.error(f"Error checking document status: {str(e)}")
            # On error, process the document to be safe
            return True

    def _cleanup_temp_dir(self) -> None:
        """Remove the temporary directory if it is empty."""
        try:
            if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
                self.temp_dir.rmdir()
        except Exception as e:
            # Best-effort cleanup: log but don't raise
            logger.error(f"Error cleaning up temp directory: {str(e)}")
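
# Usage sketch (illustrative, kept commented out so nothing runs on import):
# wiring the processor into an async entrypoint. The constructors of
# DocumentProcessor, MongoDBStore, and ChromaVectorStore shown here are
# assumptions; adjust them to match the real classes in this repo.
#
# import asyncio
#
# async def sync_drive_folder():
#     doc_processor = DocumentProcessor()                    # assumed no-arg constructor
#     mongodb = MongoDBStore("mongodb://localhost:27017")    # assumed URI argument
#     vector_store = ChromaVectorStore()                     # assumed no-arg constructor
#
#     processor = DriveDocumentProcessor(
#         google_service_account_path="service_account.json",
#         folder_id="<drive-folder-id>",
#         temp_dir="./tmp_drive_files",
#         doc_processor=doc_processor,
#         mongodb=mongodb,
#     )
#     results = await processor.process_documents(vector_store)
#     print(f"Processed {results['processed_files']['count']} file(s)")
#
# asyncio.run(sync_drive_folder())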