chatbot-backend / src /utils /drive_document_processor.py
TalatMasood's picture
Log google drive documents in the mongodb, add source of the document and made chunks to overlap text.
acdfaa9
raw
history blame
15 kB
# src/utils/drive_document_processor.py
from pathlib import Path
from typing import Dict, List, Any, Tuple
import logging
from fastapi import HTTPException
from src.utils.google_drive_service import GoogleDriveService
from src.utils.document_processor import DocumentProcessor
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.logger import logger
from src.db.mongodb_store import MongoDBStore
class DriveDocumentProcessor:
    """Synchronize Google Drive documents into a vector store and MongoDB.

    Downloads every supported file from a configured Drive folder, chunks it
    with ``DocumentProcessor``, indexes the chunks in a ``ChromaVectorStore``,
    and logs document metadata (including the Drive URL) in MongoDB.
    Documents deleted from Drive are removed from both stores on the next
    sync, and documents whose ``modifiedTime`` is unchanged are skipped.
    """

    def __init__(
        self,
        google_service_account_path: str,
        folder_id: str,
        temp_dir: str,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore  # MongoDB store used for document bookkeeping
    ):
        """
        Initialize Drive Document Processor.

        Args:
            google_service_account_path (str): Path to Google service account credentials
            folder_id (str): Google Drive folder ID to process
            temp_dir (str): Directory for temporary files
            doc_processor (DocumentProcessor): Instance of DocumentProcessor
            mongodb (MongoDBStore): Store used to log processed documents
        """
        self.google_drive_service = GoogleDriveService(
            google_service_account_path)
        self.folder_id = folder_id
        self.temp_dir = Path(temp_dir)
        self.doc_processor = doc_processor
        self.mongodb = mongodb

        # Create the temp directory if it doesn't exist. parents=True makes
        # this robust when the configured path is nested (the original
        # mkdir(exist_ok=True) raised FileNotFoundError in that case).
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        # MIME types we can process, mapped to the local file extension used
        # when the downloaded content is saved to the temp directory.
        self.supported_mime_types = {
            # Google Docs
            'application/vnd.google-apps.document': '.docx',

            # Microsoft Word Documents
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',

            # Microsoft Excel Documents
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',

            # Text Documents
            'text/plain': '.txt',
            'text/csv': '.csv',
            'text/markdown': '.md',
            'text/html': '.html',
            'text/xml': '.xml',
            'application/json': '.json',
            'application/rtf': '.rtf',

            # PDF Documents
            'application/pdf': '.pdf'
        }

        # Google-native formats cannot be downloaded directly; map each to
        # the concrete MIME type requested from the Drive export endpoint.
        self.google_docs_export_types = {
            'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        }

    async def _cleanup_orphaned_documents(
        self,
        drive_files: List[Dict[str, Any]],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """
        Clean up documents that exist in MongoDB but not in Google Drive.

        Args:
            drive_files (List[Dict[str, Any]]): List of files from Google Drive
            vector_store (ChromaVectorStore): Vector store instance

        Returns:
            Dict[str, Any]: Cleanup statistics with keys
                ``orphaned_documents_deleted`` and ``failed_deletions``

        Raises:
            Exception: Propagates any failure while listing MongoDB documents.
        """
        try:
            # Get all documents from MongoDB.
            mongo_docs = await self.mongodb.get_all_documents()

            # IDs currently present in Drive; any Drive-sourced MongoDB
            # record whose id is not in this set is orphaned.
            drive_file_ids = {file['id'] for file in drive_files}

            deleted_count = 0
            failed_deletions = []

            for doc in mongo_docs:
                # Only reconcile documents that originated from Google Drive.
                if doc.get('source') != 'google_drive':
                    continue

                doc_id = doc.get('document_id')

                if not doc_id:
                    # Bug fix: the original code attempted deletion with a
                    # missing/empty id (delete_document(None)), which can
                    # never target a real record. Record the broken entry
                    # instead of issuing a bogus delete.
                    logger.warning(
                        "Skipping Drive document without a document_id during cleanup")
                    failed_deletions.append({
                        'document_id': doc_id,
                        'error': 'Missing document_id in MongoDB record'
                    })
                    continue

                if doc_id in drive_file_ids:
                    # Still present in Drive; nothing to clean up.
                    continue

                try:
                    # Delete from MongoDB, then from the vector store.
                    await self.mongodb.delete_document(doc_id)
                    vector_store.delete_document(doc_id)
                    deleted_count += 1
                except Exception as e:
                    logger.error(
                        f"Error deleting orphaned document {doc_id}: {str(e)}")
                    failed_deletions.append({
                        'document_id': doc_id,
                        'error': str(e)
                    })

            return {
                'orphaned_documents_deleted': deleted_count,
                'failed_deletions': failed_deletions
            }

        except Exception as e:
            logger.error(f"Error in cleanup_orphaned_documents: {str(e)}")
            raise

    async def process_documents(
        self,
        vector_store: ChromaVectorStore,
        # New parameter with default True for backward compatibility
        include_subfolders: bool = True
    ) -> Dict[str, Any]:
        """
        Process all documents in the specified Drive folder.

        Orphaned records are cleaned up first, then every non-folder file is
        processed individually; per-file failures are collected rather than
        aborting the whole run.

        Args:
            vector_store (ChromaVectorStore): Vector store instance
            include_subfolders (bool): Whether to process documents in subfolders

        Returns:
            Dict[str, Any]: Processing results with per-category counts and
                details for processed, skipped and errored files.

        Raises:
            HTTPException: 500 if listing or processing fails at the top level.
        """
        try:
            # Get documents from the configured folder.
            files = self.google_drive_service.get_folder_contents(
                self.folder_id,
                include_subfolders=include_subfolders
            )

            # Clean up orphaned documents before ingesting new content.
            cleanup_results = await self._cleanup_orphaned_documents(
                files, vector_store)

            processed_files = []
            skipped_files = []
            errors = []

            for file in files:
                # Folders are containers, not documents; skip them.
                if file.get('mimeType') == 'application/vnd.google-apps.folder':
                    continue

                # Attach the display path (folder structure included) so the
                # per-file processor can record it in metadata.
                file_path = self._get_file_path(file)
                file['display_path'] = file_path

                result = await self._process_single_file(file, vector_store)

                if result['status'] == 'processed':
                    processed_files.append(result['data'])
                elif result['status'] == 'skipped':
                    skipped_files.append(result['data'])
                else:  # status == 'error'
                    errors.append(result['data'])

            # Remove the temp directory if it ended up empty.
            self._cleanup_temp_dir()

            return {
                "status": "completed",
                "processed_files": {
                    "count": len(processed_files),
                    "details": processed_files
                },
                "skipped_files": {
                    "count": len(skipped_files),
                    "details": skipped_files
                },
                "errors": {
                    "count": len(errors),
                    "details": errors
                }
            }

        except Exception as e:
            logger.error(f"Error processing Drive documents: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to process drive documents: {str(e)}"
            )

    def _get_file_path(self, file: Dict[str, Any]) -> str:
        """
        Get the full path for a file including its folder structure.

        Args:
            file (Dict[str, Any]): File metadata; may carry a ``folder_path``
                list of ancestor folders (nearest ancestor first).

        Returns:
            str: Display path of the file, e.g. ``root/sub/file.docx``
        """
        path_parts = [file['name']]

        # Prepend ancestor folder names when the folder structure is known.
        # folder_path is ordered nearest-ancestor-first, hence reversed().
        if folder_path := file.get('folder_path', []):
            for folder in reversed(folder_path):
                path_parts.insert(0, folder['name'])

        return '/'.join(path_parts)

    async def _process_single_file(
        self,
        file: Dict[str, Any],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """Process a single Drive file.

        Returns a dict with ``status`` ('processed' | 'skipped' | 'error')
        and a ``data`` payload describing the outcome; never raises.
        """
        mime_type = file.get('mimeType', '')

        # Skip anything we have no handler for.
        if mime_type not in self.supported_mime_types:
            return {
                'status': 'skipped',
                'data': {
                    'name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'reason': f'Unsupported mime type: {mime_type}'
                }
            }

        try:
            document_id = file['id']
            modified_time = file.get('modifiedTime', 'N/A')

            # Only download/ingest when the document is new or has changed.
            if self.save_document(document_id, vector_store, modified_time):
                # Download the file content to a temporary location.
                temp_file_path = await self._download_and_save_file(
                    file['id'],
                    mime_type
                )

                try:
                    # Chunk the document.
                    processed_doc = await self.doc_processor.process_document(
                        str(temp_file_path)
                    )

                    # Index chunks with path/metadata information.
                    self._add_to_vector_store(
                        processed_doc['chunks'],
                        file,
                        mime_type,
                        vector_store
                    )

                    # Log the document in MongoDB with its Drive URL.
                    await self.mongodb.store_document(
                        document_id=document_id,
                        filename=file['name'],
                        content_type=mime_type,
                        # Size is not fetched from Drive metadata; stored as 0.
                        file_size=0,
                        url_path=f"https://drive.google.com/file/d/{document_id}/view",
                        source="google_drive"
                    )

                    return {
                        'status': 'processed',
                        'data': {
                            'name': file['name'],
                            'path': file.get('display_path', file['name']),
                            'id': file['id'],
                            'chunks_processed': len(processed_doc['chunks'])
                        }
                    }
                finally:
                    # Always remove the temporary download, even on failure.
                    if temp_file_path.exists():
                        temp_file_path.unlink()
            else:
                return {
                    'status': 'skipped',
                    'data': {
                        'name': file['name'],
                        'path': file.get('display_path', file['name']),
                        'reason': 'Document already exists in the memory.'
                    }
                }

        except Exception as e:
            logger.error(f"Error processing file {file['name']}: {str(e)}")
            return {
                'status': 'error',
                'data': {
                    'file_name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'error': str(e)
                }
            }

    def _add_to_vector_store(
        self,
        chunks: List[str],
        file: Dict[str, Any],
        mime_type: str,
        vector_store: ChromaVectorStore
    ) -> None:
        """Add processed chunks to the vector store with path information.

        Each chunk is stored under the deterministic id
        ``"<file_id>-chunk-<index>"`` so the whole document can later be
        located and deleted by its Drive file id.
        """
        chunk_metadatas = []
        chunk_ids = []
        modified_time = file.get('modifiedTime', 'N/A')
        file_path = file.get('display_path', file['name'])

        for i, chunk in enumerate(chunks):
            chunk_id = f"{file['id']}-chunk-{i}"
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                "source": file_path,  # Full display path, not just the name
                "document_id": file['id'],
                "chunk_index": i,
                "mime_type": mime_type,
                # modified_time is compared by save_document() to decide
                # whether a document needs reprocessing.
                "modified_time": modified_time,
                "total_chunks": len(chunks),
                "file_type": self.supported_mime_types[mime_type],
                "is_google_doc": mime_type.startswith('application/vnd.google-apps')
            })

        vector_store.add_documents(
            documents=chunks,
            metadatas=chunk_metadatas,
            ids=chunk_ids
        )

    async def _download_and_save_file(
        self,
        file_id: str,
        mime_type: str
    ) -> Path:
        """Download a Drive file and save it to a temporary location.

        Google-native documents are exported to the mapped Office format;
        everything else is downloaded as-is.

        Returns:
            Path: Location of the saved temporary file.
        """
        extension = self.supported_mime_types[mime_type]
        temp_file_path = self.temp_dir / f"{file_id}{extension}"

        if mime_type in self.google_docs_export_types:
            # Google Doc: must be exported in the configured format.
            content = self.google_drive_service.export_file(
                file_id,
                self.google_docs_export_types[mime_type]
            )
        else:
            # Regular binary/text file: direct download.
            content = self.google_drive_service.download_file(file_id)

        with open(temp_file_path, 'wb') as f:
            # export_file may return text; normalize to bytes.
            if isinstance(content, str):
                f.write(content.encode('utf-8'))
            else:
                f.write(content)

        return temp_file_path

    def save_document(
        self,
        document_id: str,
        vector_store: ChromaVectorStore,
        modified_date: str
    ) -> bool:
        """
        Check if a document needs to be processed based on modification date.

        NOTE: despite the name, this method does not save anything — it is a
        should-process check. As a side effect it deletes stale chunks from
        the vector store when the document has been modified.

        Args:
            document_id (str): ID of the document to check
            vector_store (ChromaVectorStore): Vector store instance
            modified_date (str): Modified date to compare against

        Returns:
            bool: True if document should be processed, False otherwise
        """
        try:
            # Retrieve all chunks currently stored for this document.
            chunks = vector_store.get_document_chunks(document_id)

            if not chunks:
                # Not yet ingested: process it.
                return True

            # All chunks share the same modified_time, so the first suffices.
            first_chunk_metadata = chunks[0].get("metadata", {})

            if first_chunk_metadata.get("modified_time") != modified_date:
                # Document changed: drop stale chunks so they are replaced.
                vector_store.delete_document(document_id)
                logger.info(
                    f"Document {document_id} has been modified, will reprocess")
                return True

            logger.info(f"Document {document_id} is up to date, skipping")
            return False

        except Exception as e:
            logger.error(f"Error checking document status: {str(e)}")
            # On error, err on the side of reprocessing the document.
            return True

    def _cleanup_temp_dir(self) -> None:
        """Remove the temporary directory if it exists and is empty."""
        try:
            if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
                self.temp_dir.rmdir()
        except Exception as e:
            logger.error(f"Error cleaning up temp directory: {str(e)}")
            # Best-effort cleanup: never propagate from here.