# Spaces:
# Running
# Running
# src/implementations/document_service.py
import os
import shutil
from pathlib import Path
from typing import Dict, List, Tuple
from uuid import uuid4

from fastapi import BackgroundTasks, UploadFile

from src.db.mongodb_store import MongoDBStore
from src.models import BatchUploadResponse, DocumentInfo, DocumentResponse
from src.utils.document_processor import DocumentProcessor
from src.utils.logger import logger
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
class DocumentService:
    """Handle document uploads: persist originals in MongoDB, index chunks in Chroma.

    Uploads are staged in a local temp directory, parsed by the document
    processor, stored in MongoDB, and then chunked/embedded into the vector
    store via a FastAPI background task so the request returns quickly.
    """

    def __init__(
        self,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore
    ):
        """
        Args:
            doc_processor: Parses raw files into text content and chunks.
            mongodb: Persistent store for full document content and metadata.
        """
        self.doc_processor = doc_processor
        self.mongodb = mongodb
        # Staging area for uploads; each staged file is removed after processing.
        self.upload_dir = Path("temp_uploads")
        # parents=True so a missing parent directory never aborts startup.
        self.upload_dir.mkdir(parents=True, exist_ok=True)

    async def process_documents(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> BatchUploadResponse:
        """Process multiple document uploads.

        Args:
            files: Uploaded files to ingest.
            vector_store: Destination vector store for embedded chunks.
            background_tasks: FastAPI task queue for deferred embedding work.

        Returns:
            BatchUploadResponse summarizing per-file successes and failures.
        """
        processed_files, failed_files = await self._handle_file_uploads(
            files,
            vector_store,
            background_tasks
        )

        return BatchUploadResponse(
            message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
            processed_files=processed_files,
            failed_files=failed_files
        )

    async def _handle_file_uploads(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> Tuple[List[DocumentResponse], List[dict]]:
        """Handle individual file uploads and processing.

        A failure in one file is recorded and does not abort the batch.

        Returns:
            Tuple of (successful DocumentResponses, failed-file dicts with
            ``filename`` and ``error`` keys).
        """
        processed_files = []
        failed_files = []

        for file in files:
            try:
                # Reject unsupported extensions up front, before any disk I/O.
                if not self._is_supported_format(file.filename):
                    failed_files.append(self._create_failed_file_entry(
                        file.filename,
                        "Unsupported file format"
                    ))
                    continue

                document_response = await self._process_single_file(
                    file,
                    vector_store,
                    background_tasks
                )
                processed_files.append(document_response)

            except Exception as e:
                # Record the failure and keep going with the rest of the batch.
                logger.error(f"Error processing file {file.filename}: {str(e)}")
                failed_files.append(self._create_failed_file_entry(
                    file.filename,
                    str(e)
                ))

        return processed_files, failed_files

    async def _process_single_file(
        self,
        file: UploadFile,
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> DocumentResponse:
        """Process a single file upload.

        Stages the file on disk under a UUID-prefixed name, parses it,
        stores the full content in MongoDB, and schedules chunk embedding
        as a background task. The staged file is always removed.

        Raises:
            Exception: Propagates any parsing/storage error to the caller,
                which records it as a per-file failure.
        """
        # Generate UUID for document; also used to namespace chunk IDs later.
        document_id = str(uuid4())
        temp_path = self.upload_dir / f"{document_id}_{file.filename}"

        try:
            # Save file temporarily
            with open(temp_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

            # Process the document to get content and chunks while the
            # staged file still exists on disk.
            processed_doc = await self.doc_processor.process_document(temp_path)
            content = processed_doc['content']

            # Stat once; the size is needed for both MongoDB and the response.
            file_size = os.path.getsize(temp_path)

            # First, store in MongoDB
            await self.mongodb.store_document(
                document_id=document_id,
                filename=file.filename,
                content=content,
                content_type=file.content_type,
                file_size=file_size
            )

            # Then embed for the vector store in the background so the
            # request does not wait on the embedding function.
            background_tasks.add_task(
                self._process_for_vector_store,
                processed_doc['chunks'],  # Use the chunks from processed document
                vector_store,
                document_id,
                file.filename
            )

            return DocumentResponse(
                message="Document uploaded successfully",
                document_id=document_id,
                status="processing",
                document_info=DocumentInfo(
                    original_filename=file.filename,
                    size=file_size,
                    content_type=file.content_type
                )
            )
        finally:
            # Clean up the staged file whether or not processing succeeded.
            if temp_path.exists():
                temp_path.unlink()

    async def _process_for_vector_store(
        self,
        chunks: List[str],  # Pre-processed chunks from the document processor
        vector_store: ChromaVectorStore,
        document_id: str,
        filename: str
    ):
        """Embed pre-chunked document content and store it in the vector store.

        Runs as a background task. Each chunk gets a deterministic ID of the
        form ``{document_id}-chunk-{i}`` plus metadata linking it back to the
        MongoDB document.
        """
        try:
            # Generate chunk IDs using document_id so chunks can be traced
            # (and deleted) per source document.
            chunk_ids = [f"{document_id}-chunk-{i}" for i in range(len(chunks))]

            # Get embeddings for all chunks in one call
            embeddings = vector_store.embedding_function(chunks)

            # Prepare metadata for each chunk
            metadatas = [{
                'document_id': document_id,  # MongoDB document ID
                'source_file': filename,
                'chunk_index': i,
                'total_chunks': len(chunks)
            } for i in range(len(chunks))]

            # Store in vector store
            vector_store.add_documents(
                documents=chunks,
                embeddings=embeddings,
                metadatas=metadatas,
                ids=chunk_ids
            )

            # Fixed: log messages previously contained the literal "(unknown)"
            # where the filename belonged.
            logger.info(f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
        except Exception as e:
            logger.error(f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
            raise

    def _is_supported_format(self, filename: str) -> bool:
        """Return True if the filename's extension is supported (case-insensitive)."""
        # UploadFile.filename may be None or empty; treat as unsupported
        # instead of raising AttributeError on .lower().
        if not filename:
            return False
        return any(filename.lower().endswith(ext)
                   for ext in self.doc_processor.supported_formats)

    def _create_failed_file_entry(self, filename: str, error: str) -> dict:
        """Create a failed file entry with ``filename`` and ``error`` keys."""
        return {
            "filename": filename,
            "error": error
        }

    def cleanup(self):
        """Remove the upload directory, but only if it exists and is empty."""
        if self.upload_dir.exists() and not any(self.upload_dir.iterdir()):
            self.upload_dir.rmdir()