# src/implementations/document_service.py
import gc
import os
import shutil
from pathlib import Path
from typing import List, Tuple
from uuid import uuid4

from fastapi import BackgroundTasks, UploadFile

from src.db.mongodb_store import MongoDBStore
from src.models import BatchUploadResponse, DocumentInfo, DocumentResponse
from src.utils.document_processor import DocumentProcessor
from src.utils.logger import logger
from src.vectorstores.chroma_vectorstore import ChromaVectorStore

class DocumentService:
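    """Service for managing uploaded documents: persists files to disk,
    records metadata in MongoDB, and indexes chunked content into a
    Chroma vector store."""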
def __init__(
self,
doc_processor: DocumentProcessor,
mongodb: MongoDBStore
):
self.doc_processor = doc_processor
self.mongodb = mongodb
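        # Files are stored in ./uploads, relative to the process working
        # directory; the directory is created here if it does not exist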
self.permanent_dir = Path("uploads")
self.permanent_dir.mkdir(exist_ok=True)
async def check_duplicate_filename(self, filename: str) -> bool:
"""
Check if a file with the same name exists
Args:
filename (str): Original filename to check
Returns:
bool: True if duplicate exists, False otherwise
"""
documents = await self.mongodb.get_all_documents()
return any(doc.get('filename') == filename for doc in documents)
async def process_documents(
self,
files: List[UploadFile],
vector_store: ChromaVectorStore,
background_tasks: BackgroundTasks
) -> BatchUploadResponse:
"""Process multiple document uploads"""
processed_files, failed_files = await self._handle_file_uploads(
files,
vector_store,
background_tasks
)
return BatchUploadResponse(
message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
processed_files=processed_files,
failed_files=failed_files
)
async def _handle_file_uploads(
self,
files: List[UploadFile],
vector_store: ChromaVectorStore,
background_tasks: BackgroundTasks
) -> Tuple[List[DocumentResponse], List[dict]]:
"""Handle individual file uploads and processing"""
processed_files = []
failed_files = []
for file in files:
try:
# Check for duplicate filename
if await self.check_duplicate_filename(file.filename):
failed_files.append(self._create_failed_file_entry(
file.filename,
"A document with this name already exists. Please upload another document."
))
continue
if not self._is_supported_format(file.filename):
failed_files.append(self._create_failed_file_entry(
file.filename,
"Unsupported file format"
))
continue
document_response = await self._process_single_file(
file,
vector_store,
background_tasks
)
processed_files.append(document_response)
except Exception as e:
logger.error(
f"Error processing file {file.filename}: {str(e)}")
failed_files.append(self._create_failed_file_entry(
file.filename,
str(e)
))
return processed_files, failed_files
async def _process_single_file(
self,
file: UploadFile,
vector_store: ChromaVectorStore,
background_tasks: BackgroundTasks
) -> DocumentResponse:
"""Process a single file upload with proper handle closure"""
document_id = str(uuid4())
filename = f"{document_id}_{file.filename}"
file_path = self.permanent_dir / filename
url_path = f"/docs/{filename}"
try:
# Save file to permanent location using a context manager
with open(file_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
# Close the uploaded file explicitly
await file.close()
            # Process the document, with extra cleanup for Excel files
            try:
                processed_doc = await self.doc_processor.process_document(file_path)
                # For Excel files, prompt garbage collection to help release
                # any file handles pandas may still be holding
                if file_path.suffix.lower() in ('.xlsx', '.xls'):
                    gc.collect()
except Exception as proc_error:
logger.error(f"Error processing document: {str(proc_error)}")
raise
            # Store document metadata in MongoDB, including the public URL path
            file_size = os.path.getsize(file_path)
            await self.mongodb.store_document(
                document_id=document_id,
                filename=file.filename,
                content_type=file.content_type,
                file_size=file_size,
                url_path=url_path,
                source="user_upload"
            )
            # Index the extracted chunks into the vector store in the background
            background_tasks.add_task(
                self._process_for_vector_store,
                processed_doc['chunks'],
                vector_store,
                document_id,
                file.filename
            )
            return DocumentResponse(
                message="Document uploaded successfully",
                document_id=document_id,
                status="processing",
                document_info=DocumentInfo(
                    original_filename=file.filename,
                    size=file_size,
                    content_type=file.content_type,
                    url_path=url_path
                )
            )
except Exception as e:
# Clean up file if it was created
if file_path.exists():
try:
file_path.unlink()
except Exception as cleanup_error:
logger.error(
f"Error cleaning up file {file_path}: {str(cleanup_error)}")
# Clean up from MongoDB if document was created
try:
await self.mongodb.delete_document(document_id)
except Exception as db_cleanup_error:
logger.error(
f"Error cleaning up MongoDB document {document_id}: {str(db_cleanup_error)}")
logger.error(f"Error processing file {file.filename}: {str(e)}")
raise
async def _process_for_vector_store(
self,
chunks: List[str],
vector_store: ChromaVectorStore,
document_id: str,
filename: str
):
"""Process document content for vector store"""
try:
# Generate chunk IDs using document_id
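            # (deterministic "{document_id}-chunk-{i}" IDs make a document's
            # chunks easy to locate or remove from the vector store later)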
chunk_ids = [
f"{document_id}-chunk-{i}" for i in range(len(chunks))]
# Get embeddings
embeddings = vector_store.embedding_function(chunks)
# Prepare metadata for each chunk
metadatas = [{
'document_id': document_id,
'source_file': filename,
'chunk_index': i,
'total_chunks': len(chunks)
} for i in range(len(chunks))]
# Store in vector store
vector_store.add_documents(
documents=chunks,
embeddings=embeddings,
metadatas=metadatas,
ids=chunk_ids
)
logger.info(
f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
except Exception as e:
logger.error(
f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
raise
def _is_supported_format(self, filename: str) -> bool:
"""Check if file format is supported"""
return any(filename.lower().endswith(ext)
for ext in self.doc_processor.supported_formats)
def _create_failed_file_entry(self, filename: str, error: str) -> dict:
"""Create a failed file entry"""
return {
"filename": filename,
"error": error
}
async def delete_document(self, document_id: str) -> bool:
"""Delete document from storage and MongoDB"""
try:
# Get document details from MongoDB
doc = await self.mongodb.get_document(document_id)
if doc:
# Get filename from url_path
filename = doc['url_path'].split('/')[-1]
file_path = self.permanent_dir / filename
# Delete physical file if it exists
if file_path.exists():
file_path.unlink()
# Delete from MongoDB
return await self.mongodb.delete_document(document_id)
return False
except Exception as e:
logger.error(f"Error deleting document: {str(e)}")
raise
def cleanup(self):
"""Clean up permanent directory if empty"""
if self.permanent_dir.exists() and not any(self.permanent_dir.iterdir()):
self.permanent_dir.rmdir()
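
# --- Illustrative wiring sketch (commented out; not part of the service) ---
# A minimal FastAPI route using this service might look like the following.
# The doc_processor, mongodb and vector_store instances are assumed to be
# constructed elsewhere during application startup; their names here are
# hypothetical.
#
#   from fastapi import FastAPI, UploadFile, BackgroundTasks
#   from typing import List
#
#   app = FastAPI()
#   service = DocumentService(doc_processor, mongodb)
#
#   @app.post("/documents/upload")
#   async def upload_documents(
#       background_tasks: BackgroundTasks,
#       files: List[UploadFile],
#   ) -> BatchUploadResponse:
#       return await service.process_documents(files, vector_store, background_tasks)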