# src/implementations/document_service.py
import os
import shutil
from pathlib import Path
from typing import List, Tuple
from uuid import uuid4

from fastapi import UploadFile, BackgroundTasks

from src.db.mongodb_store import MongoDBStore
from src.models import DocumentResponse, DocumentInfo, BatchUploadResponse
from src.utils.document_processor import DocumentProcessor
from src.utils.logger import logger
from src.vectorstores.chroma_vectorstore import ChromaVectorStore


class DocumentService:
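    """Coordinate document uploads: persist each file to MongoDB, then index
    its chunk embeddings into the Chroma vector store as a background task.
    """
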
def __init__(
self,
doc_processor: DocumentProcessor,
mongodb: MongoDBStore
):
self.doc_processor = doc_processor
self.mongodb = mongodb
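        # Scratch directory for uploads; each temp file is deleted after processing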
self.upload_dir = Path("temp_uploads")
self.upload_dir.mkdir(exist_ok=True)

    async def process_documents(
self,
files: List[UploadFile],
vector_store: ChromaVectorStore,
background_tasks: BackgroundTasks
) -> BatchUploadResponse:
"""Process multiple document uploads"""
processed_files, failed_files = await self._handle_file_uploads(
files,
vector_store,
background_tasks
)
return BatchUploadResponse(
message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
processed_files=processed_files,
failed_files=failed_files
)

    async def _handle_file_uploads(
self,
files: List[UploadFile],
vector_store: ChromaVectorStore,
background_tasks: BackgroundTasks
) -> Tuple[List[DocumentResponse], List[dict]]:
"""Handle individual file uploads and processing"""
processed_files = []
failed_files = []
for file in files:
try:
if not self._is_supported_format(file.filename):
failed_files.append(self._create_failed_file_entry(
file.filename,
"Unsupported file format"
))
continue
document_response = await self._process_single_file(
file,
vector_store,
background_tasks
)
processed_files.append(document_response)
except Exception as e:
logger.error(f"Error processing file {file.filename}: {str(e)}")
failed_files.append(self._create_failed_file_entry(
file.filename,
str(e)
))
return processed_files, failed_files

    async def _process_single_file(
self,
file: UploadFile,
vector_store: ChromaVectorStore,
background_tasks: BackgroundTasks
) -> DocumentResponse:
"""Process a single file upload"""
# Generate UUID for document
document_id = str(uuid4())
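        # Temp path is e.g. "temp_uploads/<uuid4>_report.pdf" ("report.pdf" illustrative)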
temp_path = self.upload_dir / f"{document_id}_{file.filename}"
try:
# Save file temporarily
with open(temp_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
# Process the document to get content and metadata
processed_doc = await self.doc_processor.process_document(temp_path)
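            # process_document is expected to return a dict with 'content' and 'chunks'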
content = processed_doc['content']
            # Store the full document in MongoDB before scheduling vector processing
            file_size = os.path.getsize(temp_path)
            await self.mongodb.store_document(
                document_id=document_id,
                filename=file.filename,
                content=content,
                content_type=file.content_type,
                file_size=file_size
            )
            # Schedule chunk embedding and vector-store insertion in the background
background_tasks.add_task(
self._process_for_vector_store,
processed_doc['chunks'], # Use the chunks from processed document
vector_store,
document_id,
file.filename
)
return DocumentResponse(
message="Document uploaded successfully",
document_id=document_id,
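                # "processing" signals that vector indexing is still running in the background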
status="processing",
document_info=DocumentInfo(
original_filename=file.filename,
                    size=file_size,
content_type=file.content_type
)
)
finally:
# Clean up temporary file
if temp_path.exists():
temp_path.unlink()

    async def _process_for_vector_store(
self,
        chunks: List[str],  # pre-chunked text produced by DocumentProcessor
vector_store: ChromaVectorStore,
document_id: str,
filename: str
):
"""Process document content for vector store"""
try:
# Generate chunk IDs using document_id
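            # e.g. "<document_id>-chunk-0", "<document_id>-chunk-1", ...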
chunk_ids = [f"{document_id}-chunk-{i}" for i in range(len(chunks))]
# Get embeddings
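            # embedding_function is assumed to map a list of chunk strings to
            # one embedding vector per chunk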
embeddings = vector_store.embedding_function(chunks)
# Prepare metadata for each chunk
metadatas = [{
'document_id': document_id, # MongoDB document ID
'source_file': filename,
'chunk_index': i,
'total_chunks': len(chunks)
} for i in range(len(chunks))]
# Store in vector store
vector_store.add_documents(
documents=chunks,
embeddings=embeddings,
metadatas=metadatas,
ids=chunk_ids
)
logger.info(f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
except Exception as e:
logger.error(f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
raise

    def _is_supported_format(self, filename: str) -> bool:
        """Check whether the filename has a supported extension (case-insensitive)."""
return any(filename.lower().endswith(ext)
for ext in self.doc_processor.supported_formats)

    def _create_failed_file_entry(self, filename: str, error: str) -> dict:
"""Create a failed file entry"""
return {
"filename": filename,
"error": error
}

    def cleanup(self):
        """Remove the temporary upload directory if it is empty."""
        if self.upload_dir.exists() and not any(self.upload_dir.iterdir()):
            self.upload_dir.rmdir()
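

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not used by the application): one way to
# wire DocumentService into a FastAPI upload route. The zero-argument
# constructors for DocumentProcessor, MongoDBStore, and ChromaVectorStore are
# assumptions here; check those modules for their real signatures.
# ---------------------------------------------------------------------------
def build_example_app():
    """Build a minimal FastAPI app exposing the batch upload endpoint (sketch)."""
    from fastapi import FastAPI, File

    app = FastAPI()
    doc_service = DocumentService(
        doc_processor=DocumentProcessor(),  # assumed default constructor
        mongodb=MongoDBStore(),             # assumed default constructor
    )
    vector_store = ChromaVectorStore()      # assumed default constructor

    @app.post("/documents/upload")
    async def upload_documents(
        background_tasks: BackgroundTasks,
        files: List[UploadFile] = File(...),
    ):
        return await doc_service.process_documents(
            files, vector_store, background_tasks
        )

    return app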