# src/implementations/document_service.py
from pathlib import Path
import shutil
import os
from uuid import uuid4
from typing import List, Tuple

from fastapi import UploadFile, BackgroundTasks

from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.document_processor import DocumentProcessor
from src.models import DocumentResponse, DocumentInfo, BatchUploadResponse
from src.utils.logger import logger
from src.db.mongodb_store import MongoDBStore


class DocumentService:
    def __init__(
        self,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore
    ):
        self.doc_processor = doc_processor
        self.mongodb = mongodb
        self.upload_dir = Path("temp_uploads")
        self.upload_dir.mkdir(exist_ok=True)

    async def process_documents(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> BatchUploadResponse:
        """Process multiple document uploads"""
        processed_files, failed_files = await self._handle_file_uploads(
            files, vector_store, background_tasks
        )

        return BatchUploadResponse(
            message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
            processed_files=processed_files,
            failed_files=failed_files
        )

    async def _handle_file_uploads(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> Tuple[List[DocumentResponse], List[dict]]:
        """Handle individual file uploads and processing"""
        processed_files = []
        failed_files = []

        for file in files:
            try:
                if not self._is_supported_format(file.filename):
                    failed_files.append(self._create_failed_file_entry(
                        file.filename, "Unsupported file format"
                    ))
                    continue

                document_response = await self._process_single_file(
                    file, vector_store, background_tasks
                )
                processed_files.append(document_response)

            except Exception as e:
                logger.error(f"Error processing file {file.filename}: {str(e)}")
                failed_files.append(self._create_failed_file_entry(
                    file.filename, str(e)
                ))

        return processed_files, failed_files

    async def _process_single_file(
        self,
        file: UploadFile,
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> DocumentResponse:
        """Process a single file upload"""
        # Generate a UUID for the document
        document_id = str(uuid4())
        temp_path = self.upload_dir / f"{document_id}_{file.filename}"

        try:
            # Save the upload to a temporary file
            with open(temp_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

            # Process the document to get content and chunks
            processed_doc = await self.doc_processor.process_document(temp_path)
            content = processed_doc['content']
            file_size = os.path.getsize(temp_path)

            # First, store the full document in MongoDB
            await self.mongodb.store_document(
                document_id=document_id,
                filename=file.filename,
                content=content,
                content_type=file.content_type,
                file_size=file_size
            )

            # Then index the document for the vector store in the background
            background_tasks.add_task(
                self._process_for_vector_store,
                processed_doc['chunks'],  # Use the chunks from the processed document
                vector_store,
                document_id,
                file.filename
            )

            return DocumentResponse(
                message="Document uploaded successfully",
                document_id=document_id,
                status="processing",
                document_info=DocumentInfo(
                    original_filename=file.filename,
                    size=file_size,
                    content_type=file.content_type
                )
            )
        finally:
            # Clean up the temporary file
            if temp_path.exists():
                temp_path.unlink()
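
    # NOTE: `DocumentProcessor.process_document` is assumed (from its usage in
    # `_process_single_file` above) to return a mapping shaped roughly like
    #     {"content": "<full extracted text>", "chunks": ["<chunk 1>", ...]}
    # The authoritative contract lives in src/utils/document_processor.py.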
[f"{document_id}-chunk-{i}" for i in range(len(chunks))] # Get embeddings embeddings = vector_store.embedding_function(chunks) # Prepare metadata for each chunk metadatas = [{ 'document_id': document_id, # MongoDB document ID 'source_file': filename, 'chunk_index': i, 'total_chunks': len(chunks) } for i in range(len(chunks))] # Store in vector store vector_store.add_documents( documents=chunks, embeddings=embeddings, metadatas=metadatas, ids=chunk_ids ) logger.info(f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks") except Exception as e: logger.error(f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}") raise def _is_supported_format(self, filename: str) -> bool: """Check if file format is supported""" return any(filename.lower().endswith(ext) for ext in self.doc_processor.supported_formats) def _create_failed_file_entry(self, filename: str, error: str) -> dict: """Create a failed file entry""" return { "filename": filename, "error": error } def cleanup(self): """Clean up upload directory""" if self.upload_dir.exists() and not any(self.upload_dir.iterdir()): self.upload_dir.rmdir()