# Spaces: TalatMasud / chatbot-backend (Running) - File size: 6,545 Bytes

# src/implementations/document_service.py
from pathlib import Path
import shutil
import os
from uuid import uuid4
from typing import List, Tuple, Dict
from fastapi import UploadFile, BackgroundTasks

from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.document_processor import DocumentProcessor
from src.models import DocumentResponse, DocumentInfo, BatchUploadResponse
from src.utils.logger import logger
from src.db.mongodb_store import MongoDBStore

class DocumentService:
    """Handle document uploads end to end.

    For each uploaded file the service writes a temporary copy under
    ``upload_dir``, runs it through the document processor, stores the
    extracted content in MongoDB and schedules a background task that adds
    the document's chunks to the vector store.
    """

    def __init__(
        self,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore
    ):
        """Store the collaborators and make sure the upload directory exists.

        Args:
            doc_processor: Used for ``process_document`` and
                ``supported_formats``.
            mongodb: Used for ``store_document``.
        """
        self.doc_processor = doc_processor
        self.mongodb = mongodb
        # Relative path: resolved against the process's current directory.
        self.upload_dir = Path("temp_uploads")
        self.upload_dir.mkdir(exist_ok=True)

    async def process_documents(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> BatchUploadResponse:
        """Process a batch of uploads and summarise the outcome.

        Args:
            files: Uploaded files to process.
            vector_store: Vector store that will receive the chunks.
            background_tasks: Task queue used to defer vector indexing.

        Returns:
            A ``BatchUploadResponse`` listing processed and failed files.
        """
        processed_files, failed_files = await self._handle_file_uploads(
            files,
            vector_store,
            background_tasks
        )

        return BatchUploadResponse(
            message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
            processed_files=processed_files,
            failed_files=failed_files
        )

    async def _handle_file_uploads(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> Tuple[List[DocumentResponse], List[dict]]:
        """Process each file in turn, collecting successes and failures.

        Files with an unsupported (or missing) filename are recorded as
        failures without being processed. Any exception raised while
        processing a file is logged and recorded as a failure so the
        remaining files are still handled.

        Returns:
            A ``(processed_files, failed_files)`` tuple.
        """
        processed_files = []
        failed_files = []

        for file in files:
            try:
                if not self._is_supported_format(file.filename):
                    failed_files.append(self._create_failed_file_entry(
                        file.filename,
                        "Unsupported file format"
                    ))
                    continue

                document_response = await self._process_single_file(
                    file,
                    vector_store,
                    background_tasks
                )
                processed_files.append(document_response)

            except Exception as e:
                # Per-file boundary: one bad upload must not abort the batch.
                logger.error(f"Error processing file {file.filename}: {str(e)}")
                failed_files.append(self._create_failed_file_entry(
                    file.filename,
                    str(e)
                ))

        return processed_files, failed_files

    @staticmethod
    def _safe_temp_name(filename: str) -> str:
        """Return only the final path component of a client-supplied name.

        Both ``/`` and ``\\`` are treated as separators so the result never
        contains a directory part. ``None`` or an empty name yields ``""``.
        """
        return Path((filename or "").replace("\\", "/")).name

    async def _process_single_file(
        self,
        file: UploadFile,
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> DocumentResponse:
        """Save, parse and persist one upload, then schedule vector indexing.

        Args:
            file: The uploaded file.
            vector_store: Vector store that will receive the chunks.
            background_tasks: Task queue used to defer vector indexing.

        Returns:
            A ``DocumentResponse`` with status ``"processing"``.

        Raises:
            Exception: Whatever the file write, document processor or
                MongoDB store raises; the temp file is removed regardless.
        """
        # Generate UUID for document
        document_id = str(uuid4())
        # The filename comes from the client, so only its basename is used
        # on disk; a name with path separators must not steer the temp path
        # into another directory (or a non-existent one).
        temp_path = self.upload_dir / f"{document_id}_{self._safe_temp_name(file.filename)}"

        try:
            # Save file temporarily.
            # NOTE: shutil.copyfileobj is a synchronous copy.
            with open(temp_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

            # Process the document to get content and chunks
            processed_doc = await self.doc_processor.process_document(temp_path)
            content = processed_doc['content']

            # Stat the temp file once; the value is stored and returned.
            file_size = os.path.getsize(temp_path)

            # First, store in MongoDB (with the original, unsanitised name)
            await self.mongodb.store_document(
                document_id=document_id,
                filename=file.filename,
                content=content,
                content_type=file.content_type,
                file_size=file_size
            )

            # Then process for vector store in background
            background_tasks.add_task(
                self._process_for_vector_store,
                processed_doc['chunks'],  # Use the chunks from processed document
                vector_store,
                document_id,
                file.filename
            )

            return DocumentResponse(
                message="Document uploaded successfully",
                document_id=document_id,
                status="processing",
                document_info=DocumentInfo(
                    original_filename=file.filename,
                    size=file_size,
                    content_type=file.content_type
                )
            )
        finally:
            # Clean up temporary file
            if temp_path.exists():
                temp_path.unlink()

    async def _process_for_vector_store(
        self,
        chunks: List[str],  # Now accepting pre-processed chunks
        vector_store: ChromaVectorStore,
        document_id: str,
        filename: str
    ):
        """Embed the document's chunks and add them to the vector store.

        Each chunk gets the id ``"<document_id>-chunk-<index>"`` and metadata
        holding the document id, source filename, chunk index and total
        chunk count. Nothing is embedded or stored when ``chunks`` is empty.

        Raises:
            Exception: Re-raised after logging if embedding or storing fails.
        """
        try:
            if not chunks:
                # Nothing to index; avoid handing empty lists to the store.
                logger.info(f"No chunks to index for document {filename} (ID: {document_id})")
                return

            total_chunks = len(chunks)

            # Generate chunk IDs using document_id
            chunk_ids = [f"{document_id}-chunk-{i}" for i in range(total_chunks)]

            # Get embeddings
            embeddings = vector_store.embedding_function(chunks)

            # Prepare metadata for each chunk
            metadatas = [{
                'document_id': document_id,  # MongoDB document ID
                'source_file': filename,
                'chunk_index': i,
                'total_chunks': total_chunks
            } for i, _ in enumerate(chunks)]

            # Store in vector store
            vector_store.add_documents(
                documents=chunks,
                embeddings=embeddings,
                metadatas=metadatas,
                ids=chunk_ids
            )

            logger.info(f"Successfully processed document {filename} (ID: {document_id}) into {total_chunks} chunks")

        except Exception as e:
            logger.error(f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
            raise

    def _is_supported_format(self, filename: str) -> bool:
        """Return True if ``filename`` ends with a supported extension.

        The comparison is case-insensitive on the filename. A missing
        (``None``) or empty filename is treated as unsupported.
        """
        if not filename:
            return False
        # str.endswith accepts a tuple of suffixes: one call, no Python loop.
        return filename.lower().endswith(tuple(self.doc_processor.supported_formats))

    def _create_failed_file_entry(self, filename: str, error: str) -> dict:
        """Build the dict that describes a failed upload."""
        return {
            "filename": filename,
            "error": error
        }

    def cleanup(self):
        """Remove the upload directory if it exists and is empty."""
        if self.upload_dir.exists() and not any(self.upload_dir.iterdir()):
            self.upload_dir.rmdir()