Log Google Drive documents in MongoDB, record each document's source, and make chunks overlap.
acdfaa9
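The overlap mentioned in the commit message is applied during chunking, which lives in DocumentProcessor rather than in the file below. A minimal sketch of overlapping chunking follows; the function name and the chunk_size/chunk_overlap parameters are illustrative assumptions, not the repository's actual DocumentProcessor API.

# Hedged sketch of overlapping chunking -- names and parameters are
# illustrative, not the repository's actual implementation.
from typing import List

def chunk_with_overlap(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    """Split text into fixed-size chunks where each chunk repeats the
    trailing chunk_overlap characters of its predecessor, so content cut
    at a boundary still appears intact in the next chunk."""
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    step = chunk_size - chunk_overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
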
# src/utils/drive_document_processor.py
from pathlib import Path
from typing import Dict, List, Any

from fastapi import HTTPException

from src.utils.google_drive_service import GoogleDriveService
from src.utils.document_processor import DocumentProcessor
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.logger import logger
from src.db.mongodb_store import MongoDBStore

class DriveDocumentProcessor:
    def __init__(
        self,
        google_service_account_path: str,
        folder_id: str,
        temp_dir: str,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore
    ):
        """
        Initialize Drive Document Processor

        Args:
            google_service_account_path (str): Path to Google service account credentials
            folder_id (str): Google Drive folder ID to process
            temp_dir (str): Directory for temporary files
            doc_processor (DocumentProcessor): Instance of DocumentProcessor
            mongodb (MongoDBStore): MongoDB store used to record processed documents
        """
        self.google_drive_service = GoogleDriveService(google_service_account_path)
        self.folder_id = folder_id
        self.temp_dir = Path(temp_dir)
        self.doc_processor = doc_processor
        self.mongodb = mongodb

        # Create the temp directory if it doesn't exist
        self.temp_dir.mkdir(exist_ok=True)

        # Define supported MIME types
        self.supported_mime_types = {
            # Google Docs
            'application/vnd.google-apps.document': '.docx',
            # Microsoft Word documents
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',
            # Microsoft Excel documents
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',
            # Text documents
            'text/plain': '.txt',
            'text/csv': '.csv',
            'text/markdown': '.md',
            'text/html': '.html',
            'text/xml': '.xml',
            'application/json': '.json',
            'application/rtf': '.rtf',
            # PDF documents
            'application/pdf': '.pdf'
        }

        # Google-native types must be exported to an Office format before download
        self.google_docs_export_types = {
            'application/vnd.google-apps.document':
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        }
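        # Illustrative note (not part of the original commit): to support
        # another format, add its MIME type to supported_mime_types above
        # and, for Google-native types, a matching export entry, e.g.
        #   'application/vnd.google-apps.spreadsheet':
        #       'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'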

    async def _cleanup_orphaned_documents(
        self,
        drive_files: List[Dict[str, Any]],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """
        Clean up documents that exist in MongoDB but no longer exist in Google Drive

        Args:
            drive_files (List[Dict[str, Any]]): List of files from Google Drive
            vector_store (ChromaVectorStore): Vector store instance

        Returns:
            Dict[str, Any]: Cleanup statistics
        """
        try:
            # Get all documents from MongoDB
            mongo_docs = await self.mongodb.get_all_documents()

            # Create a set of Google Drive file IDs for fast membership tests
            drive_file_ids = {file['id'] for file in drive_files}

            deleted_count = 0
            failed_deletions = []

            # Check each MongoDB document
            for doc in mongo_docs:
                # Only process Google Drive documents
                if doc.get('source') != 'google_drive':
                    continue

                doc_id = doc.get('document_id')
                if not doc_id:
                    # Records without an ID cannot be matched or deleted safely
                    logger.warning(f"Skipping Google Drive record without a document_id: {doc}")
                    continue

                if doc_id not in drive_file_ids:
                    try:
                        # Delete from MongoDB
                        await self.mongodb.delete_document(doc_id)
                        # Delete from the vector store
                        vector_store.delete_document(doc_id)
                        deleted_count += 1
                    except Exception as e:
                        logger.error(f"Error deleting orphaned document {doc_id}: {str(e)}")
                        failed_deletions.append({
                            'document_id': doc_id,
                            'error': str(e)
                        })

            return {
                'orphaned_documents_deleted': deleted_count,
                'failed_deletions': failed_deletions
            }
        except Exception as e:
            logger.error(f"Error in _cleanup_orphaned_documents: {str(e)}")
            raise

    async def process_documents(
        self,
        vector_store: ChromaVectorStore,
        # New parameter; defaults to True for backward compatibility
        include_subfolders: bool = True
    ) -> Dict[str, Any]:
        """
        Process all documents in the specified Drive folder

        Args:
            vector_store (ChromaVectorStore): Vector store instance
            include_subfolders (bool): Whether to process documents in subfolders

        Returns:
            Dict[str, Any]: Processing results
        """
        try:
            # Get documents from the folder
            files = self.google_drive_service.get_folder_contents(
                self.folder_id,
                include_subfolders=include_subfolders
            )

            # Clean up orphaned documents first, and surface the statistics
            cleanup_results = await self._cleanup_orphaned_documents(files, vector_store)

            processed_files = []
            skipped_files = []
            errors = []

            for file in files:
                # Skip folders
                if file.get('mimeType') == 'application/vnd.google-apps.folder':
                    continue

                # Get the file path (including folder structure if available)
                file['display_path'] = self._get_file_path(file)

                result = await self._process_single_file(file, vector_store)
                if result['status'] == 'processed':
                    processed_files.append(result['data'])
                elif result['status'] == 'skipped':
                    skipped_files.append(result['data'])
                else:  # status == 'error'
                    errors.append(result['data'])

            # Clean up the temporary directory if it is empty
            self._cleanup_temp_dir()

            return {
                "status": "completed",
                "cleanup": cleanup_results,
                "processed_files": {
                    "count": len(processed_files),
                    "details": processed_files
                },
                "skipped_files": {
                    "count": len(skipped_files),
                    "details": skipped_files
                },
                "errors": {
                    "count": len(errors),
                    "details": errors
                }
            }
        except Exception as e:
            logger.error(f"Error processing Drive documents: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to process Drive documents: {str(e)}"
            )

    def _get_file_path(self, file: Dict[str, Any]) -> str:
        """
        Get the full display path for a file, including its folder structure

        Args:
            file (Dict[str, Any]): File metadata

        Returns:
            str: Display path of the file
        """
        path_parts = [file['name']]

        # Prepend folder names if a folder path is available (new structure)
        if folder_path := file.get('folder_path', []):
            for folder in reversed(folder_path):
                path_parts.insert(0, folder['name'])

        return '/'.join(path_parts)

    async def _process_single_file(
        self,
        file: Dict[str, Any],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """Process a single Drive file"""
        mime_type = file.get('mimeType', '')

        # Skip files with unsupported MIME types
        if mime_type not in self.supported_mime_types:
            return {
                'status': 'skipped',
                'data': {
                    'name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'reason': f'Unsupported MIME type: {mime_type}'
                }
            }

        try:
            document_id = file['id']
            modified_time = file.get('modifiedTime', 'N/A')

            # Check whether the document is new or has been modified
            if self._should_process_document(document_id, vector_store, modified_time):
                # Download the file to a temporary location
                temp_file_path = await self._download_and_save_file(
                    document_id,
                    mime_type
                )

                try:
                    # Extract and chunk the document text
                    processed_doc = await self.doc_processor.process_document(
                        str(temp_file_path)
                    )

                    # Add chunks to the vector store with path information
                    self._add_to_vector_store(
                        processed_doc['chunks'],
                        file,
                        mime_type,
                        vector_store
                    )

                    # Record the document in MongoDB with its Google Drive URL
                    await self.mongodb.store_document(
                        document_id=document_id,
                        filename=file['name'],
                        content_type=mime_type,
                        file_size=0,  # Not needed for Drive documents
                        url_path=f"https://drive.google.com/file/d/{document_id}/view",
                        source="google_drive"
                    )

                    return {
                        'status': 'processed',
                        'data': {
                            'name': file['name'],
                            'path': file.get('display_path', file['name']),
                            'id': document_id,
                            'chunks_processed': len(processed_doc['chunks'])
                        }
                    }
                finally:
                    # Clean up the temporary file
                    if temp_file_path.exists():
                        temp_file_path.unlink()
            else:
                return {
                    'status': 'skipped',
                    'data': {
                        'name': file['name'],
                        'path': file.get('display_path', file['name']),
                        'reason': 'Document is already indexed and up to date.'
                    }
                }
        except Exception as e:
            logger.error(f"Error processing file {file['name']}: {str(e)}")
            return {
                'status': 'error',
                'data': {
                    'file_name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'error': str(e)
                }
            }

    def _add_to_vector_store(
        self,
        chunks: List[str],
        file: Dict[str, Any],
        mime_type: str,
        vector_store: ChromaVectorStore
    ) -> None:
        """Add processed chunks to the vector store with path information"""
        chunk_metadatas = []
        chunk_ids = []
        modified_time = file.get('modifiedTime', 'N/A')
        file_path = file.get('display_path', file['name'])

        for i, chunk in enumerate(chunks):
            chunk_id = f"{file['id']}-chunk-{i}"
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                "source": file_path,  # Full path instead of just the name
                "document_id": file['id'],
                "chunk_index": i,
                "mime_type": mime_type,
                "modified_time": modified_time,
                "total_chunks": len(chunks),
                "file_type": self.supported_mime_types[mime_type],
                "is_google_doc": mime_type.startswith('application/vnd.google-apps')
            })

        vector_store.add_documents(
            documents=chunks,
            metadatas=chunk_metadatas,
            ids=chunk_ids
        )

    async def _download_and_save_file(
        self,
        file_id: str,
        mime_type: str
    ) -> Path:
        """Download a file and save it to a temporary location"""
        extension = self.supported_mime_types[mime_type]
        temp_file_path = self.temp_dir / f"{file_id}{extension}"

        if mime_type in self.google_docs_export_types:
            # Export Google-native documents in the configured Office format
            content = self.google_drive_service.export_file(
                file_id,
                self.google_docs_export_types[mime_type]
            )
        else:
            # Download regular files as-is
            content = self.google_drive_service.download_file(file_id)

        with open(temp_file_path, 'wb') as f:
            if isinstance(content, str):
                f.write(content.encode('utf-8'))
            else:
                f.write(content)

        return temp_file_path

    def _should_process_document(
        self,
        document_id: str,
        vector_store: ChromaVectorStore,
        modified_date: str
    ) -> bool:
        """
        Check whether a document needs processing based on its modification date

        Args:
            document_id (str): ID of the document to check
            vector_store (ChromaVectorStore): Vector store instance
            modified_date (str): Modified date to compare against

        Returns:
            bool: True if the document should be processed, False otherwise
        """
        try:
            # Retrieve all chunks stored for the given document_id
            chunks = vector_store.get_document_chunks(document_id)

            if not chunks:
                # Document doesn't exist in the vector store yet
                return True

            # Compare against the modified_time of the first chunk
            first_chunk_metadata = chunks[0].get("metadata", {})
            if first_chunk_metadata.get("modified_time") != modified_date:
                # The document changed; delete the stale chunks and reprocess
                vector_store.delete_document(document_id)
                logger.info(f"Document {document_id} has been modified, will reprocess")
                return True

            logger.info(f"Document {document_id} is up to date, skipping")
            return False
        except Exception as e:
            logger.error(f"Error checking document status: {str(e)}")
            # In case of error, process the document to be safe
            return True

    def _cleanup_temp_dir(self) -> None:
        """Clean up the temporary directory if it is empty"""
        try:
            if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
                self.temp_dir.rmdir()
        except Exception as e:
            logger.error(f"Error cleaning up temp directory: {str(e)}")
            # Don't raise: this is a best-effort cleanup operation
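
For context, here is a hedged usage sketch showing how DriveDocumentProcessor might be wired into an async entrypoint. The zero-argument constructors for DocumentProcessor, MongoDBStore, and ChromaVectorStore are assumptions made for illustration; the repository's actual signatures may differ.

# Hypothetical wiring example -- the zero-argument constructors below are
# assumptions, not the repository's actual signatures.
import asyncio

from src.utils.document_processor import DocumentProcessor
from src.db.mongodb_store import MongoDBStore
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.drive_document_processor import DriveDocumentProcessor

async def sync_drive_folder() -> None:
    mongodb = MongoDBStore()              # assumed default constructor
    doc_processor = DocumentProcessor()   # assumed default constructor
    vector_store = ChromaVectorStore()    # assumed default constructor

    processor = DriveDocumentProcessor(
        google_service_account_path="service_account.json",
        folder_id="<drive-folder-id>",
        temp_dir="temp_downloads",
        doc_processor=doc_processor,
        mongodb=mongodb,
    )

    # Sync the folder: prune orphans, then index new/modified documents
    results = await processor.process_documents(vector_store, include_subfolders=True)
    print(results["status"], results["processed_files"]["count"])

if __name__ == "__main__":
    asyncio.run(sync_drive_folder())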