# src/implementations/document_service.py
import gc
import os
import shutil
from pathlib import Path
from typing import List, Tuple
from uuid import uuid4

from fastapi import UploadFile, BackgroundTasks

from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.document_processor import DocumentProcessor
from src.models import DocumentResponse, DocumentInfo, BatchUploadResponse
from src.utils.logger import logger
from src.db.mongodb_store import MongoDBStore


class DocumentService:
    def __init__(
        self,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore
    ):
        self.doc_processor = doc_processor
        self.mongodb = mongodb
        self.permanent_dir = Path("uploads")
        self.permanent_dir.mkdir(exist_ok=True)

    async def check_duplicate_filename(self, filename: str) -> bool:
        """
        Check if a file with the same name already exists.

        Args:
            filename (str): Original filename to check

        Returns:
            bool: True if a duplicate exists, False otherwise
        """
        documents = await self.mongodb.get_all_documents()
        return any(doc.get('filename') == filename for doc in documents)

    async def process_documents(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> BatchUploadResponse:
        """Process multiple document uploads."""
        processed_files, failed_files = await self._handle_file_uploads(
            files, vector_store, background_tasks
        )

        return BatchUploadResponse(
            message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
            processed_files=processed_files,
            failed_files=failed_files
        )
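    # Illustrative only: the batch response produced above has this shape
    # (values invented; the field definitions live in src.models):
    #
    #   BatchUploadResponse(
    #       message="Processed 2 documents with 1 failures",
    #       processed_files=[DocumentResponse(...), DocumentResponse(...)],
    #       failed_files=[{"filename": "report.pdf", "error": "Unsupported file format"}],
    #   )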
    async def _handle_file_uploads(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> Tuple[List[DocumentResponse], List[dict]]:
        """Handle individual file uploads and processing."""
        processed_files = []
        failed_files = []

        for file in files:
            try:
                # Reject duplicate filenames before doing any work
                if await self.check_duplicate_filename(file.filename):
                    failed_files.append(self._create_failed_file_entry(
                        file.filename,
                        "A document with this name already exists. Please upload another document."
                    ))
                    continue

                if not self._is_supported_format(file.filename):
                    failed_files.append(self._create_failed_file_entry(
                        file.filename,
                        "Unsupported file format"
                    ))
                    continue

                document_response = await self._process_single_file(
                    file, vector_store, background_tasks
                )
                processed_files.append(document_response)

            except Exception as e:
                logger.error(
                    f"Error processing file {file.filename}: {str(e)}")
                failed_files.append(self._create_failed_file_entry(
                    file.filename, str(e)
                ))

        return processed_files, failed_files

    async def _process_single_file(
        self,
        file: UploadFile,
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> DocumentResponse:
        """Process a single file upload with proper handle closure."""
        document_id = str(uuid4())
        filename = f"{document_id}_{file.filename}"
        file_path = self.permanent_dir / filename
        url_path = f"/docs/{filename}"

        try:
            # Save the upload to its permanent location using a context manager
            with open(file_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

            # Close the uploaded file explicitly
            await file.close()

            # Process the document, with extra cleanup for Excel files
            try:
                processed_doc = await self.doc_processor.process_document(file_path)

                # For Excel files, nudge the GC to release lingering pandas
                # file handles
                if file_path.suffix.lower() in ['.xlsx', '.xls']:
                    gc.collect()
            except Exception as proc_error:
                logger.error(f"Error processing document: {str(proc_error)}")
                raise

            # Store metadata in MongoDB, including the public url_path
            await self.mongodb.store_document(
                document_id=document_id,
                filename=file.filename,
                content_type=file.content_type,
                file_size=os.path.getsize(file_path),
                url_path=url_path,
                source="user_upload"
            )

            # Index the chunks into the vector store in the background
            background_tasks.add_task(
                self._process_for_vector_store,
                processed_doc['chunks'],
                vector_store,
                document_id,
                file.filename
            )

            return DocumentResponse(
                message="Document uploaded successfully",
                document_id=document_id,
                status="processing",
                document_info=DocumentInfo(
                    original_filename=file.filename,
                    size=os.path.getsize(file_path),
                    content_type=file.content_type,
                    url_path=url_path
                )
            )

        except Exception as e:
            # Clean up the file if it was created
            if file_path.exists():
                try:
                    file_path.unlink()
                except Exception as cleanup_error:
                    logger.error(
                        f"Error cleaning up file {file_path}: {str(cleanup_error)}")

            # Clean up the MongoDB record if it was created
            try:
                await self.mongodb.delete_document(document_id)
            except Exception as db_cleanup_error:
                logger.error(
                    f"Error cleaning up MongoDB document {document_id}: {str(db_cleanup_error)}")

            logger.error(f"Error processing file {file.filename}: {str(e)}")
            raise

    async def _process_for_vector_store(
        self,
        chunks: List[str],
        vector_store: ChromaVectorStore,
        document_id: str,
        filename: str
    ):
        """Process document content for the vector store."""
        try:
            # Derive deterministic chunk IDs from the document ID
            chunk_ids = [
                f"{document_id}-chunk-{i}" for i in range(len(chunks))]

            # Get embeddings for every chunk
            embeddings = vector_store.embedding_function(chunks)

            # Attach provenance metadata to each chunk
            metadatas = [{
                'document_id': document_id,
                'source_file': filename,
                'chunk_index': i,
                'total_chunks': len(chunks)
            } for i in range(len(chunks))]

            # Store everything in the vector store
            vector_store.add_documents(
                documents=chunks,
                embeddings=embeddings,
                metadatas=metadatas,
                ids=chunk_ids
            )

            logger.info(
                f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
        except Exception as e:
            logger.error(
                f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
            raise
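    # Illustrative only: a single record as written to Chroma by the method
    # above (IDs and text invented; the real ID prefix is the document's uuid4):
    #
    #   id:       "9b1e...-chunk-0"
    #   document: "First chunk of extracted text ..."
    #   metadata: {"document_id": "9b1e...", "source_file": "report.pdf",
    #              "chunk_index": 0, "total_chunks": 12}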
    def _is_supported_format(self, filename: str) -> bool:
        """Check if the file format is supported."""
        return any(filename.lower().endswith(ext)
                   for ext in self.doc_processor.supported_formats)

    def _create_failed_file_entry(self, filename: str, error: str) -> dict:
        """Create a failed-file entry for the batch response."""
        return {
            "filename": filename,
            "error": error
        }

    async def delete_document(self, document_id: str) -> bool:
        """Delete a document from storage and MongoDB."""
        try:
            # Get document details from MongoDB
            doc = await self.mongodb.get_document(document_id)
            if doc:
                # Recover the on-disk filename from the url_path
                filename = doc['url_path'].split('/')[-1]
                file_path = self.permanent_dir / filename

                # Delete the physical file if it exists
                if file_path.exists():
                    file_path.unlink()

                # Delete the MongoDB record
                return await self.mongodb.delete_document(document_id)
            return False
        except Exception as e:
            logger.error(f"Error deleting document: {str(e)}")
            raise

    def cleanup(self):
        """Clean up the permanent directory if it is empty."""
        if self.permanent_dir.exists() and not any(self.permanent_dir.iterdir()):
            self.permanent_dir.rmdir()
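# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of this module): one plausible way to
# wire DocumentService into a FastAPI upload route. The dependency providers
# `get_doc_processor`, `get_mongodb`, and `get_vector_store` and the route
# path are assumptions for the example, not existing project code.
#
#   from fastapi import APIRouter, Depends, File
#
#   router = APIRouter()
#
#   @router.post("/documents/upload", response_model=BatchUploadResponse)
#   async def upload_documents(
#       background_tasks: BackgroundTasks,
#       files: List[UploadFile] = File(...),
#       doc_processor: DocumentProcessor = Depends(get_doc_processor),
#       mongodb: MongoDBStore = Depends(get_mongodb),
#       vector_store: ChromaVectorStore = Depends(get_vector_store),
#   ):
#       service = DocumentService(doc_processor, mongodb)
#       return await service.process_documents(files, vector_store, background_tasks)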