# src/utils/drive_document_processor.py
#
# TODO (from commit note): add a function that deletes stored chunks when a
# document is removed from the Google Drive folder (a hedged sketch is
# included at the end of this file).

from pathlib import Path
from typing import Dict, List, Any
from fastapi import HTTPException
from src.utils.google_drive_service import GoogleDriveService
from src.utils.document_processor import DocumentProcessor
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.logger import logger
class DriveDocumentProcessor:
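    """
    Syncs a Google Drive folder into a Chroma vector store: downloads or
    exports supported files, chunks them with DocumentProcessor, and skips
    or refreshes each document based on its Drive modifiedTime.
    """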
def __init__(
self,
google_service_account_path: str,
folder_id: str,
temp_dir: str,
doc_processor: DocumentProcessor
):
"""
Initialize Drive Document Processor
Args:
google_service_account_path (str): Path to Google service account credentials
folder_id (str): Google Drive folder ID to process
temp_dir (str): Directory for temporary files
doc_processor (DocumentProcessor): Instance of DocumentProcessor
"""
self.google_drive_service = GoogleDriveService(google_service_account_path)
self.folder_id = folder_id
self.temp_dir = Path(temp_dir)
self.doc_processor = doc_processor
        # Create the temp directory (and any missing parents) if it doesn't exist
        self.temp_dir.mkdir(parents=True, exist_ok=True)
# Define supported MIME types
self.supported_mime_types = {
# Google Docs
'application/vnd.google-apps.document': '.docx', # Export Google Docs as DOCX
# Microsoft Word Documents
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
'application/msword': '.doc',
# Microsoft Excel Documents
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
'application/vnd.ms-excel': '.xls',
# Text Documents
'text/plain': '.txt',
'text/csv': '.csv',
'text/markdown': '.md',
'text/html': '.html',
'text/xml': '.xml',
'application/json': '.json',
'application/rtf': '.rtf',
# PDF Documents
'application/pdf': '.pdf'
}
# Define export MIME types for Google Docs formats
self.google_docs_export_types = {
'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
}
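        # Google-native files cannot be downloaded directly; they are exported
        # to the mapped Office format in _download_and_save_file.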
async def process_documents(
self,
vector_store: ChromaVectorStore
) -> Dict[str, Any]:
"""
Process all documents in the specified Drive folder
Args:
vector_store (ChromaVectorStore): Vector store instance
Returns:
Dict[str, Any]: Processing results
"""
try:
# Get documents from folder
files = self.google_drive_service.get_folder_contents(self.folder_id)
processed_files = []
skipped_files = []
errors = []
for file in files:
result = await self._process_single_file(file, vector_store)
if result['status'] == 'processed':
processed_files.append(result['data'])
elif result['status'] == 'skipped':
skipped_files.append(result['data'])
else: # status == 'error'
errors.append(result['data'])
# Clean up temporary directory if empty
self._cleanup_temp_dir()
return {
"status": "completed",
"processed_files": {
"count": len(processed_files),
"details": processed_files
},
"skipped_files": {
"count": len(skipped_files),
"details": skipped_files
},
"errors": {
"count": len(errors),
"details": errors
}
}
except Exception as e:
logger.error(f"Error processing Drive documents: {str(e)}")
raise HTTPException(
status_code=500,
detail=f"Failed to process drive documents: {str(e)}"
)
async def _process_single_file(
self,
file: Dict[str, Any],
vector_store: ChromaVectorStore
) -> Dict[str, Any]:
"""Process a single Drive file"""
mime_type = file.get('mimeType', '')
# Skip if mime type not supported
if mime_type not in self.supported_mime_types:
return {
'status': 'skipped',
'data': {
'name': file['name'],
'reason': f'Unsupported mime type: {mime_type}'
}
}
try:
document_id = file['id']
modified_time = file.get('modifiedTime', 'N/A') # Get last modified time
# Check if document should be processed
if self.save_document(document_id, vector_store, modified_time):
# Download and process file
temp_file_path = await self._download_and_save_file(
file['id'],
mime_type
)
try:
# Process document
processed_doc = await self.doc_processor.process_document(
str(temp_file_path)
)
# Add to vector store
self._add_to_vector_store(
processed_doc['chunks'],
file,
mime_type,
vector_store
)
return {
'status': 'processed',
'data': {
'name': file['name'],
'id': file['id'],
'chunks_processed': len(processed_doc['chunks'])
}
}
finally:
# Clean up temporary file
if temp_file_path.exists():
temp_file_path.unlink()
else:
# Return skipped status if document already exists and is up to date
return {
'status': 'skipped',
'data': {
'name': file['name'],
                        'reason': 'Document is already up to date in the vector store.'
}
}
except Exception as e:
logger.error(f"Error processing file {file['name']}: {str(e)}")
return {
'status': 'error',
'data': {
'file_name': file['name'],
'error': str(e)
}
}
async def _download_and_save_file(
self,
file_id: str,
mime_type: str
) -> Path:
"""Download and save file to temporary location"""
extension = self.supported_mime_types[mime_type]
temp_file_path = self.temp_dir / f"{file_id}{extension}"
if mime_type in self.google_docs_export_types:
# Download Google Doc in the specified export format
content = self.google_drive_service.export_file(
file_id,
self.google_docs_export_types[mime_type]
)
else:
# Download regular file
content = self.google_drive_service.download_file(file_id)
with open(temp_file_path, 'wb') as f:
if isinstance(content, str):
f.write(content.encode('utf-8'))
else:
f.write(content)
return temp_file_path
def _add_to_vector_store(
self,
chunks: List[str],
file: Dict[str, Any],
mime_type: str,
vector_store: ChromaVectorStore
) -> None:
"""Add processed chunks to vector store"""
chunk_metadatas = []
chunk_ids = []
        modified_time = file.get('modifiedTime', 'N/A')  # Last modified time, stored with each chunk
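        # Chunk IDs follow the "<file_id>-chunk-<index>" convention so all
        # chunks of a document can be found and deleted by its Drive file ID.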
for i, chunk in enumerate(chunks):
chunk_id = f"{file['id']}-chunk-{i}"
chunk_ids.append(chunk_id)
chunk_metadatas.append({
"source": file['name'],
"document_id": file['id'],
"chunk_index": i,
"mime_type": mime_type,
"modified_time": modified_time,
"total_chunks": len(chunks),
"file_type": self.supported_mime_types[mime_type],
"is_google_doc": mime_type.startswith('application/vnd.google-apps')
})
vector_store.add_documents(
documents=chunks,
metadatas=chunk_metadatas,
ids=chunk_ids
)
    def save_document(self, document_id: str, vector_store: ChromaVectorStore, modified_date: str) -> bool:
        """
        Decide whether a document needs (re)processing, deleting stale chunks.

        If chunks are stored for the document but their modified_time does not
        match the given modified_date, all of its chunks are deleted so the
        document can be re-ingested.

        Args:
            document_id (str): The ID of the document.
            vector_store (ChromaVectorStore): The Chroma vector store instance.
            modified_date (str): The document's current modification date.

        Returns:
            bool: True if the document should be processed (nothing stored,
                stale chunks deleted, or the check failed); False if the
                stored chunks are already up to date.
        """
        try:
            # Retrieve all chunks for the given document_id
            chunks = vector_store.get_document_chunks(document_id)
            if not chunks:
                logger.warning(f"No chunks found for document_id: {document_id}. Nothing to delete.")
                return True
            # Compare the stored modified_time (taken from the first chunk) with the current one
            first_chunk_metadata = chunks[0].get("metadata", {})
            if first_chunk_metadata.get("modified_time") != modified_date:
                # Stale chunks: delete them all so the document is re-processed
                vector_store.delete_document(document_id)
                logger.info(f"Deleted all chunks for document_id: {document_id} due to modified_time mismatch.")
                return True
            logger.info(f"No deletion needed for document_id: {document_id}, modified_time is unchanged.")
            return False
        except Exception as e:
            # Fail open: re-process the document rather than silently skipping it
            logger.error(f"Error while checking chunks for document_id {document_id}: {str(e)}")
            return True
def _cleanup_temp_dir(self) -> None:
"""Clean up temporary directory if empty"""
if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
self.temp_dir.rmdir()
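
    def delete_removed_documents(self, vector_store: ChromaVectorStore) -> int:
        """
        Sketch of the chunk-deletion helper flagged in the module TODO: remove
        stored chunks for documents that no longer exist in the Drive folder.

        NOTE: This is a hedged sketch, not a tested implementation. It assumes
        ChromaVectorStore exposes a get_all_document_ids() helper; if it does
        not, the stored IDs can be collected from the chunk metadata instead.
        """
        # IDs of every file currently present in the Drive folder
        drive_ids = {
            f['id']
            for f in self.google_drive_service.get_folder_contents(self.folder_id)
        }
        removed = 0
        # Assumed helper: returns the distinct document_ids held in the store
        for document_id in vector_store.get_all_document_ids():
            if document_id not in drive_ids:
                vector_store.delete_document(document_id)
                logger.info(f"Deleted chunks for document removed from Drive: {document_id}")
                removed += 1
        return removed


# Example usage (a minimal sketch; the paths and folder ID below are
# illustrative placeholders, and process_documents must be awaited from an
# async context such as a FastAPI route):
#
#   processor = DriveDocumentProcessor(
#       google_service_account_path="credentials/service_account.json",
#       folder_id="<drive-folder-id>",
#       temp_dir="./temp_drive_files",
#       doc_processor=DocumentProcessor(),
#   )
#   results = await processor.process_documents(vector_store)
#   processor.delete_removed_documents(vector_store)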