chatbot-backend / src /utils /drive_document_processor.py
TalatMasood's picture
Log google drive documents in the mongodb, add source of the document and made chunks to overlap text.
acdfaa9
raw
history blame
15 kB
# src/utils/drive_document_processor.py
from pathlib import Path
from typing import Dict, List, Any, Tuple
import logging
from fastapi import HTTPException
from src.utils.google_drive_service import GoogleDriveService
from src.utils.document_processor import DocumentProcessor
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.logger import logger
from src.db.mongodb_store import MongoDBStore
class DriveDocumentProcessor:
    """Synchronize Google Drive documents into a vector store and MongoDB.

    Downloads every supported file from a configured Drive folder, chunks it
    with ``DocumentProcessor``, indexes the chunks in a ``ChromaVectorStore``,
    and logs document metadata (including the Drive URL) in MongoDB.
    Documents deleted from Drive are removed from both stores on the next
    sync, and documents whose ``modifiedTime`` is unchanged are skipped.
    """

    def __init__(
        self,
        google_service_account_path: str,
        folder_id: str,
        temp_dir: str,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore  # MongoDB store used for document bookkeeping
    ):
        """
        Initialize Drive Document Processor.

        Args:
            google_service_account_path (str): Path to Google service account credentials
            folder_id (str): Google Drive folder ID to process
            temp_dir (str): Directory for temporary files
            doc_processor (DocumentProcessor): Instance of DocumentProcessor
            mongodb (MongoDBStore): Store used to log processed documents
        """
        self.google_drive_service = GoogleDriveService(
            google_service_account_path)
        self.folder_id = folder_id
        self.temp_dir = Path(temp_dir)
        self.doc_processor = doc_processor
        self.mongodb = mongodb

        # Create the temp directory if it doesn't exist. parents=True makes
        # this robust when the configured path is nested (the original
        # mkdir(exist_ok=True) raised FileNotFoundError in that case).
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        # MIME types we can process, mapped to the local file extension used
        # when the downloaded content is saved to the temp directory.
        self.supported_mime_types = {
            # Google Docs
            'application/vnd.google-apps.document': '.docx',

            # Microsoft Word Documents
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',

            # Microsoft Excel Documents
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',

            # Text Documents
            'text/plain': '.txt',
            'text/csv': '.csv',
            'text/markdown': '.md',
            'text/html': '.html',
            'text/xml': '.xml',
            'application/json': '.json',
            'application/rtf': '.rtf',

            # PDF Documents
            'application/pdf': '.pdf'
        }

        # Google-native formats cannot be downloaded directly; map each to
        # the concrete MIME type requested from the Drive export endpoint.
        self.google_docs_export_types = {
            'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        }

    async def _cleanup_orphaned_documents(
        self,
        drive_files: List[Dict[str, Any]],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """
        Clean up documents that exist in MongoDB but not in Google Drive.

        Args:
            drive_files (List[Dict[str, Any]]): List of files from Google Drive
            vector_store (ChromaVectorStore): Vector store instance

        Returns:
            Dict[str, Any]: Cleanup statistics with keys
                ``orphaned_documents_deleted`` and ``failed_deletions``

        Raises:
            Exception: Propagates any failure while listing MongoDB documents.
        """
        try:
            # Get all documents from MongoDB.
            mongo_docs = await self.mongodb.get_all_documents()

            # IDs currently present in Drive; any Drive-sourced MongoDB
            # record whose id is not in this set is orphaned.
            drive_file_ids = {file['id'] for file in drive_files}

            deleted_count = 0
            failed_deletions = []

            for doc in mongo_docs:
                # Only reconcile documents that originated from Google Drive.
                if doc.get('source') != 'google_drive':
                    continue

                doc_id = doc.get('document_id')

                if not doc_id:
                    # Bug fix: the original code attempted deletion with a
                    # missing/empty id (delete_document(None)), which can
                    # never target a real record. Record the broken entry
                    # instead of issuing a bogus delete.
                    logger.warning(
                        "Skipping Drive document without a document_id during cleanup")
                    failed_deletions.append({
                        'document_id': doc_id,
                        'error': 'Missing document_id in MongoDB record'
                    })
                    continue

                if doc_id in drive_file_ids:
                    # Still present in Drive; nothing to clean up.
                    continue

                try:
                    # Delete from MongoDB, then from the vector store.
                    await self.mongodb.delete_document(doc_id)
                    vector_store.delete_document(doc_id)
                    deleted_count += 1
                except Exception as e:
                    logger.error(
                        f"Error deleting orphaned document {doc_id}: {str(e)}")
                    failed_deletions.append({
                        'document_id': doc_id,
                        'error': str(e)
                    })

            return {
                'orphaned_documents_deleted': deleted_count,
                'failed_deletions': failed_deletions
            }

        except Exception as e:
            logger.error(f"Error in cleanup_orphaned_documents: {str(e)}")
            raise

    async def process_documents(
        self,
        vector_store: ChromaVectorStore,
        # New parameter with default True for backward compatibility
        include_subfolders: bool = True
    ) -> Dict[str, Any]:
        """
        Process all documents in the specified Drive folder.

        Orphaned records are cleaned up first, then every non-folder file is
        processed individually; per-file failures are collected rather than
        aborting the whole run.

        Args:
            vector_store (ChromaVectorStore): Vector store instance
            include_subfolders (bool): Whether to process documents in subfolders

        Returns:
            Dict[str, Any]: Processing results with per-category counts and
                details for processed, skipped and errored files.

        Raises:
            HTTPException: 500 if listing or processing fails at the top level.
        """
        try:
            # Get documents from the configured folder.
            files = self.google_drive_service.get_folder_contents(
                self.folder_id,
                include_subfolders=include_subfolders
            )

            # Clean up orphaned documents before ingesting new content.
            cleanup_results = await self._cleanup_orphaned_documents(
                files, vector_store)

            processed_files = []
            skipped_files = []
            errors = []

            for file in files:
                # Folders are containers, not documents; skip them.
                if file.get('mimeType') == 'application/vnd.google-apps.folder':
                    continue

                # Attach the display path (folder structure included) so the
                # per-file processor can record it in metadata.
                file_path = self._get_file_path(file)
                file['display_path'] = file_path

                result = await self._process_single_file(file, vector_store)

                if result['status'] == 'processed':
                    processed_files.append(result['data'])
                elif result['status'] == 'skipped':
                    skipped_files.append(result['data'])
                else:  # status == 'error'
                    errors.append(result['data'])

            # Remove the temp directory if it ended up empty.
            self._cleanup_temp_dir()

            return {
                "status": "completed",
                "processed_files": {
                    "count": len(processed_files),
                    "details": processed_files
                },
                "skipped_files": {
                    "count": len(skipped_files),
                    "details": skipped_files
                },
                "errors": {
                    "count": len(errors),
                    "details": errors
                }
            }

        except Exception as e:
            logger.error(f"Error processing Drive documents: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to process drive documents: {str(e)}"
            )

    def _get_file_path(self, file: Dict[str, Any]) -> str:
        """
        Get the full path for a file including its folder structure.

        Args:
            file (Dict[str, Any]): File metadata; may carry a ``folder_path``
                list of ancestor folders (nearest ancestor first).

        Returns:
            str: Display path of the file, e.g. ``root/sub/file.docx``
        """
        path_parts = [file['name']]

        # Prepend ancestor folder names when the folder structure is known.
        # folder_path is ordered nearest-ancestor-first, hence reversed().
        if folder_path := file.get('folder_path', []):
            for folder in reversed(folder_path):
                path_parts.insert(0, folder['name'])

        return '/'.join(path_parts)

    async def _process_single_file(
        self,
        file: Dict[str, Any],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """Process a single Drive file.

        Returns a dict with ``status`` ('processed' | 'skipped' | 'error')
        and a ``data`` payload describing the outcome; never raises.
        """
        mime_type = file.get('mimeType', '')

        # Skip anything we have no handler for.
        if mime_type not in self.supported_mime_types:
            return {
                'status': 'skipped',
                'data': {
                    'name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'reason': f'Unsupported mime type: {mime_type}'
                }
            }

        try:
            document_id = file['id']
            modified_time = file.get('modifiedTime', 'N/A')

            # Only download/ingest when the document is new or has changed.
            if self.save_document(document_id, vector_store, modified_time):
                # Download the file content to a temporary location.
                temp_file_path = await self._download_and_save_file(
                    file['id'],
                    mime_type
                )

                try:
                    # Chunk the document.
                    processed_doc = await self.doc_processor.process_document(
                        str(temp_file_path)
                    )

                    # Index chunks with path/metadata information.
                    self._add_to_vector_store(
                        processed_doc['chunks'],
                        file,
                        mime_type,
                        vector_store
                    )

                    # Log the document in MongoDB with its Drive URL.
                    await self.mongodb.store_document(
                        document_id=document_id,
                        filename=file['name'],
                        content_type=mime_type,
                        # Size is not fetched from Drive metadata; stored as 0.
                        file_size=0,
                        url_path=f"https://drive.google.com/file/d/{document_id}/view",
                        source="google_drive"
                    )

                    return {
                        'status': 'processed',
                        'data': {
                            'name': file['name'],
                            'path': file.get('display_path', file['name']),
                            'id': file['id'],
                            'chunks_processed': len(processed_doc['chunks'])
                        }
                    }
                finally:
                    # Always remove the temporary download, even on failure.
                    if temp_file_path.exists():
                        temp_file_path.unlink()
            else:
                return {
                    'status': 'skipped',
                    'data': {
                        'name': file['name'],
                        'path': file.get('display_path', file['name']),
                        'reason': 'Document already exists in the memory.'
                    }
                }

        except Exception as e:
            logger.error(f"Error processing file {file['name']}: {str(e)}")
            return {
                'status': 'error',
                'data': {
                    'file_name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'error': str(e)
                }
            }

    def _add_to_vector_store(
        self,
        chunks: List[str],
        file: Dict[str, Any],
        mime_type: str,
        vector_store: ChromaVectorStore
    ) -> None:
        """Add processed chunks to the vector store with path information.

        Each chunk is stored under the deterministic id
        ``"<file_id>-chunk-<index>"`` so the whole document can later be
        located and deleted by its Drive file id.
        """
        chunk_metadatas = []
        chunk_ids = []
        modified_time = file.get('modifiedTime', 'N/A')
        file_path = file.get('display_path', file['name'])

        for i, chunk in enumerate(chunks):
            chunk_id = f"{file['id']}-chunk-{i}"
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                "source": file_path,  # Full display path, not just the name
                "document_id": file['id'],
                "chunk_index": i,
                "mime_type": mime_type,
                # modified_time is compared by save_document() to decide
                # whether a document needs reprocessing.
                "modified_time": modified_time,
                "total_chunks": len(chunks),
                "file_type": self.supported_mime_types[mime_type],
                "is_google_doc": mime_type.startswith('application/vnd.google-apps')
            })

        vector_store.add_documents(
            documents=chunks,
            metadatas=chunk_metadatas,
            ids=chunk_ids
        )

    async def _download_and_save_file(
        self,
        file_id: str,
        mime_type: str
    ) -> Path:
        """Download a Drive file and save it to a temporary location.

        Google-native documents are exported to the mapped Office format;
        everything else is downloaded as-is.

        Returns:
            Path: Location of the saved temporary file.
        """
        extension = self.supported_mime_types[mime_type]
        temp_file_path = self.temp_dir / f"{file_id}{extension}"

        if mime_type in self.google_docs_export_types:
            # Google Doc: must be exported in the configured format.
            content = self.google_drive_service.export_file(
                file_id,
                self.google_docs_export_types[mime_type]
            )
        else:
            # Regular binary/text file: direct download.
            content = self.google_drive_service.download_file(file_id)

        with open(temp_file_path, 'wb') as f:
            # export_file may return text; normalize to bytes.
            if isinstance(content, str):
                f.write(content.encode('utf-8'))
            else:
                f.write(content)

        return temp_file_path

    def save_document(
        self,
        document_id: str,
        vector_store: ChromaVectorStore,
        modified_date: str
    ) -> bool:
        """
        Check if a document needs to be processed based on modification date.

        NOTE: despite the name, this method does not save anything — it is a
        should-process check. As a side effect it deletes stale chunks from
        the vector store when the document has been modified.

        Args:
            document_id (str): ID of the document to check
            vector_store (ChromaVectorStore): Vector store instance
            modified_date (str): Modified date to compare against

        Returns:
            bool: True if document should be processed, False otherwise
        """
        try:
            # Retrieve all chunks currently stored for this document.
            chunks = vector_store.get_document_chunks(document_id)

            if not chunks:
                # Not yet ingested: process it.
                return True

            # All chunks share the same modified_time, so the first suffices.
            first_chunk_metadata = chunks[0].get("metadata", {})

            if first_chunk_metadata.get("modified_time") != modified_date:
                # Document changed: drop stale chunks so they are replaced.
                vector_store.delete_document(document_id)
                logger.info(
                    f"Document {document_id} has been modified, will reprocess")
                return True

            logger.info(f"Document {document_id} is up to date, skipping")
            return False

        except Exception as e:
            logger.error(f"Error checking document status: {str(e)}")
            # On error, err on the side of reprocessing the document.
            return True

    def _cleanup_temp_dir(self) -> None:
        """Remove the temporary directory if it exists and is empty."""
        try:
            if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
                self.temp_dir.rmdir()
        except Exception as e:
            logger.error(f"Error cleaning up temp directory: {str(e)}")
            # Best-effort cleanup: never propagate from here.