# chatbot-backend/src/implementations/document_service.py
# Author: TalatMasood — commit e9d730a ("Refactoring code"), 5.07 kB
# src/implementations/document_service.py
from pathlib import Path
import shutil
import os
import uuid
from typing import List, Tuple
from fastapi import UploadFile, BackgroundTasks
from ..vectorstores.chroma_vectorstore import ChromaVectorStore
from ..utils.document_processor import DocumentProcessor
from ..models import DocumentResponse, DocumentInfo, BatchUploadResponse
from ..utils.logger import logger
class DocumentService:
    """Coordinate document uploads for the chatbot backend.

    Saves uploaded files to a local staging directory, queues per-file
    background processing, and stores the resulting chunks in a Chroma
    vector store. Temp files are always removed after processing.
    """

    def __init__(self, doc_processor: DocumentProcessor):
        """
        Args:
            doc_processor: Chunking/extraction helper. Must expose
                ``supported_formats`` (iterable of extensions such as
                ``'.pdf'``) and an async ``process_document(path)`` that
                returns a dict with ``'chunks'`` and ``'metadata'`` keys.
        """
        self.doc_processor = doc_processor
        # Staging area for uploads awaiting background processing.
        self.upload_dir = Path("temp_uploads")
        # parents=True: don't crash if an intermediate directory is missing.
        self.upload_dir.mkdir(parents=True, exist_ok=True)

    async def process_documents(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> BatchUploadResponse:
        """Process multiple document uploads.

        Args:
            files: Incoming multipart uploads.
            vector_store: Destination store for document chunks.
            background_tasks: FastAPI task queue; heavy processing is
                deferred so the request returns quickly.

        Returns:
            BatchUploadResponse summarizing queued and failed files.
        """
        processed_files, failed_files = await self._handle_file_uploads(
            files,
            vector_store,
            background_tasks
        )

        return BatchUploadResponse(
            message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
            processed_files=processed_files,
            failed_files=failed_files
        )

    async def _handle_file_uploads(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> Tuple[List[DocumentResponse], List[dict]]:
        """Handle individual file uploads and processing.

        One failing file never aborts the batch: its error is recorded
        and the loop continues with the next file.

        Returns:
            (successfully queued responses, failed-file entries).
        """
        processed_files = []
        failed_files = []

        for file in files:
            try:
                if not self._is_supported_format(file.filename):
                    failed_files.append(self._create_failed_file_entry(
                        file.filename,
                        "Unsupported file format"
                    ))
                    continue

                document_response = await self._process_single_file(
                    file,
                    vector_store,
                    background_tasks
                )
                processed_files.append(document_response)
            except Exception as e:
                logger.error(f"Error processing file {file.filename}: {str(e)}")
                failed_files.append(self._create_failed_file_entry(
                    file.filename,
                    str(e)
                ))

        return processed_files, failed_files

    def _is_supported_format(self, filename: str) -> bool:
        """Check if the file format is supported (case-insensitive).

        A missing filename (FastAPI allows ``UploadFile.filename`` to be
        ``None``) is treated as unsupported rather than raising.
        """
        if not filename:
            return False
        # str.endswith accepts a tuple of suffixes — one C-level call.
        return filename.lower().endswith(tuple(self.doc_processor.supported_formats))

    async def _process_single_file(
        self,
        file: UploadFile,
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> DocumentResponse:
        """Stage a single upload on disk and queue it for processing.

        Returns immediately with status ``"processing"``; the actual
        chunking/indexing happens in a background task.
        """
        document_id = str(uuid.uuid4())
        # Prefix with the UUID so concurrent uploads of the same name
        # cannot collide in the staging directory.
        temp_path = self.upload_dir / f"{document_id}_{file.filename}"

        # Stream the upload to disk without loading it all into memory.
        with open(temp_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Defer the expensive work until after the response is sent.
        background_tasks.add_task(
            self._process_and_store_document,
            temp_path,
            vector_store,
            document_id
        )

        return DocumentResponse(
            message="Document queued for processing",
            document_id=document_id,
            status="processing",
            document_info=DocumentInfo(
                original_filename=file.filename,
                size=temp_path.stat().st_size,
                content_type=file.content_type
            )
        )

    async def _process_and_store_document(
        self,
        file_path: Path,
        vector_store: ChromaVectorStore,
        document_id: str
    ):
        """Process a document and store its chunks in the vector database.

        Runs as a background task, so failures are logged here (they
        would otherwise surface only inside the server's task machinery)
        and then re-raised. The staged temp file is always removed.
        """
        try:
            processed_doc = await self.doc_processor.process_document(file_path)
            chunks = processed_doc['chunks']
            vector_store.add_documents(
                documents=chunks,
                # NOTE(review): the nested 'metadata' dict may be rejected by
                # Chroma, which expects scalar metadata values — confirm.
                metadatas=[{
                    'document_id': document_id,
                    'chunk_id': i,
                    'source': str(file_path.name),
                    'metadata': processed_doc['metadata']
                } for i in range(len(chunks))],
                ids=[f"{document_id}_chunk_{i}" for i in range(len(chunks))]
            )
            return processed_doc
        except Exception as e:
            # Make background failures visible in application logs.
            logger.error(f"Background processing failed for {file_path.name}: {str(e)}")
            raise
        finally:
            # Always clean up the staged file, even on failure.
            if file_path.exists():
                file_path.unlink()

    def _create_failed_file_entry(self, filename: str, error: str) -> dict:
        """Create a failed-file entry for the batch response."""
        return {
            "filename": filename,
            "error": error
        }

    def cleanup(self):
        """Remove the upload directory, but only if it is empty.

        Non-empty means background tasks are still processing files, so
        the directory is deliberately left alone.
        """
        if self.upload_dir.exists() and not any(self.upload_dir.iterdir()):
            self.upload_dir.rmdir()