Spaces:

TalatMasud
/

chatbot-backend

Running

App Files Community

chatbot-backend / src / implementations / document_service.py

TalatMasood

Refactoring code

e9d730a 3 months ago

raw

history blame

5.07 kB

	# src/implementations/document_service.py
	from pathlib import Path
	import shutil
	import os
	import uuid
	from typing import List, Tuple
	from fastapi import UploadFile, BackgroundTasks
	from ..vectorstores.chroma_vectorstore import ChromaVectorStore
	from ..utils.document_processor import DocumentProcessor
	from ..models import DocumentResponse, DocumentInfo, BatchUploadResponse
	from ..utils.logger import logger

	class DocumentService:
	    """Save uploaded documents to a temp directory and queue their ingestion.

	    Each supported upload is written to ``upload_dir`` under a unique name and
	    a background task is registered that processes the file with
	    ``doc_processor``, stores the resulting chunks in the given vector store,
	    and finally deletes the temporary file.
	    """

	    def __init__(self, doc_processor: DocumentProcessor):
	        """Store the processor and make sure the upload directory exists.

	        Args:
	            doc_processor: Object providing ``supported_formats`` and the
	                awaitable ``process_document(path)``.
	        """
	        self.doc_processor = doc_processor
	        self.upload_dir = Path("temp_uploads")
	        self.upload_dir.mkdir(exist_ok=True)

	    async def process_documents(
	        self,
	        files: List[UploadFile],
	        vector_store: ChromaVectorStore,
	        background_tasks: BackgroundTasks
	    ) -> BatchUploadResponse:
	        """Process multiple document uploads.

	        Args:
	            files: Uploaded files to handle.
	            vector_store: Store that will receive the document chunks.
	            background_tasks: Task queue used to defer the actual processing.

	        Returns:
	            A summary listing the queued documents and the per-file failures.
	        """
	        processed_files, failed_files = await self._handle_file_uploads(
	            files,
	            vector_store,
	            background_tasks
	        )

	        return BatchUploadResponse(
	            message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
	            processed_files=processed_files,
	            failed_files=failed_files
	        )

	    async def _handle_file_uploads(
	        self,
	        files: List[UploadFile],
	        vector_store: ChromaVectorStore,
	        background_tasks: BackgroundTasks
	    ) -> Tuple[List[DocumentResponse], List[dict]]:
	        """Handle individual file uploads and processing.

	        A failure on one file never aborts the batch: it is logged and
	        recorded in the failed list, and the loop moves on.

	        Returns:
	            ``(processed_files, failed_files)`` where the second list holds
	            ``{"filename": ..., "error": ...}`` entries.
	        """
	        processed_files = []
	        failed_files = []

	        for file in files:
	            try:
	                if not self._is_supported_format(file.filename):
	                    failed_files.append(self._create_failed_file_entry(
	                        file.filename,
	                        "Unsupported file format"
	                    ))
	                    continue

	                document_response = await self._process_single_file(
	                    file,
	                    vector_store,
	                    background_tasks
	                )
	                processed_files.append(document_response)

	            except Exception as e:
	                # Per-file boundary: report the error instead of failing the batch.
	                logger.error(f"Error processing file {file.filename}: {str(e)}")
	                failed_files.append(self._create_failed_file_entry(
	                    file.filename,
	                    str(e)
	                ))

	        return processed_files, failed_files

	    def _is_supported_format(self, filename: str) -> bool:
	        """Check if file format is supported.

	        The comparison is done on the lower-cased filename. A missing or
	        empty filename is reported as unsupported rather than raising.
	        """
	        if not filename:
	            return False
	        lowered = filename.lower()
	        return any(lowered.endswith(ext)
	                   for ext in self.doc_processor.supported_formats)

	    def _temp_path_for(self, document_id: str, filename: str) -> Path:
	        """Build the temp-file path for an upload.

	        The filename comes from the client, so only its final path component
	        is kept (POSIX and Windows-style separators are both honoured). This
	        keeps the file inside ``upload_dir`` whatever the client sent.
	        """
	        safe_name = os.path.basename((filename or "").replace("\\", "/"))
	        return self.upload_dir / f"{document_id}_{safe_name}"

	    @staticmethod
	    def _remove_temp_file(file_path: Path) -> None:
	        """Delete a temp file, tolerating the case where it is already gone."""
	        try:
	            file_path.unlink()
	        except FileNotFoundError:
	            pass

	    async def _process_single_file(
	        self,
	        file: UploadFile,
	        vector_store: ChromaVectorStore,
	        background_tasks: BackgroundTasks
	    ) -> DocumentResponse:
	        """Process a single file upload.

	        Saves the upload to a uniquely named temp file, builds the response,
	        and only then queues the background processing task. If any of these
	        steps fails the temp file is removed and the error is re-raised.

	        Returns:
	            A response with status ``"processing"`` and the new document id.
	        """
	        document_id = str(uuid.uuid4())
	        temp_path = self._temp_path_for(document_id, file.filename)

	        try:
	            # Save file
	            # NOTE: copyfileobj is a blocking call inside an async method.
	            with open(temp_path, "wb") as buffer:
	                shutil.copyfileobj(file.file, buffer)

	            response = DocumentResponse(
	                message="Document queued for processing",
	                document_id=document_id,
	                status="processing",
	                document_info=DocumentInfo(
	                    original_filename=file.filename,
	                    size=os.path.getsize(temp_path),
	                    content_type=file.content_type
	                )
	            )

	            # Queue the task last so a file reported as failed is never
	            # processed in the background anyway.
	            background_tasks.add_task(
	                self._process_and_store_document,
	                temp_path,
	                vector_store,
	                document_id
	            )
	        except Exception:
	            # No task owns the temp file yet, so clean it up here.
	            self._remove_temp_file(temp_path)
	            raise

	        return response

	    async def _process_and_store_document(
	        self,
	        file_path: Path,
	        vector_store: ChromaVectorStore,
	        document_id: str
	    ):
	        """Process document and store in vector database.

	        Each chunk is stored with id ``"<document_id>_chunk_<i>"`` and
	        metadata holding the document id, chunk index, temp-file name and
	        the processor's document metadata. The temp file is always removed,
	        whether processing succeeds or fails.

	        Returns:
	            The value returned by ``doc_processor.process_document``.

	        Raises:
	            Exception: Any processing or storage error, after it is logged.
	        """
	        try:
	            processed_doc = await self.doc_processor.process_document(file_path)

	            chunks = processed_doc['chunks']
	            chunk_indices = range(len(chunks))
	            vector_store.add_documents(
	                documents=chunks,
	                metadatas=[{
	                    'document_id': document_id,
	                    'chunk_id': i,
	                    'source': str(file_path.name),
	                    'metadata': processed_doc['metadata']
	                } for i in chunk_indices],
	                ids=[f"{document_id}_chunk_{i}" for i in chunk_indices]
	            )

	            return processed_doc
	        except Exception as e:
	            # This runs as a background task, so the client never sees the
	            # failure; log it with the document id before propagating.
	            logger.error(f"Error processing document {document_id}: {str(e)}")
	            raise
	        finally:
	            self._remove_temp_file(file_path)

	    def _create_failed_file_entry(self, filename: str, error: str) -> dict:
	        """Create a failed file entry."""
	        return {
	            "filename": filename,
	            "error": error
	        }

	    def cleanup(self):
	        """Clean up upload directory.

	        The directory is removed only when it exists and is empty.
	        """
	        if self.upload_dir.exists() and not any(self.upload_dir.iterdir()):
	            self.upload_dir.rmdir()